From db76e5513fef4a2df6a3ebb37376316bab43d12e Mon Sep 17 00:00:00 2001 From: Bart Turczynski <142225707+bart-turczynski@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:07:56 +0200 Subject: [PATCH] docs: demote URL surface to best-effort host extraction (PUNY-gfjkdjol) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-document url_encode/url_decode/parse_url as best-effort host extraction and rewriting in URL-shaped strings — explicitly NOT RFC 3986 / WHATWG URL parsing or canonicalization. State the non-goals (percent encoding/decoding, scheme/port/path semantics, full IPv6 incl. zone IDs / RFC 6874, serialization) and note the surface is slated for eventual removal in favour of a dedicated URL package (rurl) consuming punycoder's host functions. Updated roxygen, DESCRIPTION, and README (.Rmd + rendered .md). Phase 1 only; deprecation/removal is a separate follow-up ticket. No behaviour change. Co-Authored-By: Claude Opus 4.8 --- DESCRIPTION | 5 ++++- R/url-utils.R | 54 ++++++++++++++++++++++++++++++++++------------- README.Rmd | 17 +++++++++++---- README.md | 23 ++++++++++++++------ man/parse_url.Rd | 20 +++++++++++++----- man/url_decode.Rd | 16 +++++++++----- man/url_encode.Rd | 21 +++++++++++++----- 7 files changed, 115 insertions(+), 41 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5db3402..ea323ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,10 @@ Description: High-performance Unicode and Punycode processing for apply Unicode IDNA normalization. 'host_normalize()' is the Unicode Technical Standard #46 host-normalization entry point, mapping a host name to a canonical lowercase ASCII comparison form (non-transitional - profile, pinned Unicode version). Aimed at URL processing and data + profile, pinned Unicode version). 
The 'url_encode()' / 'url_decode()' / + 'parse_url()' helpers do best-effort host extraction and rewriting in + URL-shaped strings and are deliberately not RFC 3986 / WHATWG URL + parsers or canonicalizers. Aimed at host normalization and data analysis workflows. Depends: R (>= 3.5.0) Imports: Rcpp (>= 1.0.0) diff --git a/R/url-utils.R b/R/url-utils.R index bdf5715..fc8048a 100644 --- a/R/url-utils.R +++ b/R/url-utils.R @@ -1,10 +1,20 @@ -#' Encode URLs with Unicode domains to ASCII +#' Best-effort host rewriting in a URL-shaped string (Unicode host to ASCII) #' -#' Converts URLs containing Unicode domain names to their ASCII representation -#' while preserving the rest of the URL structure. This function is essential -#' for preparing URLs for systems that require ASCII-only domain names. +#' Locates the host portion of a URL-shaped string with a hand-rolled +#' splitter, ASCII-encodes that host, and substitutes it back, leaving the +#' rest of the string untouched. #' -#' @param url Character vector of URLs with potential Unicode domains +#' This is \strong{best-effort host extraction and rewriting, not URL parsing or +#' canonicalization.} It is deliberately \emph{not} RFC 3986 / WHATWG URL +#' conformant. Non-goals (handled upstack, e.g. by \code{rurl}): percent +#' encoding/decoding, scheme validation, port/path/query semantics, full +#' IPv6 (including zone IDs / RFC 6874), and URL serialization. Pass only the +#' host to \code{\link{host_normalize}()} / \code{\link{puny_encode}()} when you control the parse; +#' use this helper only for quick host rewriting in an already-trusted +#' URL-shaped string. +#' +#' @param url Character vector of URL-shaped strings with potential Unicode +#' hosts #' @param strict Logical; whether to apply strict validation. Defaults to #' `getOption("punycoder.strict", TRUE)`.
#' @return A character vector the same length as \code{url}, with each element @@ -34,13 +44,18 @@ url_encode <- function(url, strict = getOption("punycoder.strict", TRUE)) { .call_with_validation(url, strict, url_encode_cpp, "url") } -#' Decode URLs with ASCII punycode domains to Unicode +#' Best-effort host rewriting in a URL-shaped string (ASCII punycode to Unicode) +#' +#' Locates the host portion of a URL-shaped string with a hand-rolled +#' splitter, decodes that host from ASCII punycode to Unicode, and +#' substitutes it back, leaving the rest of the string untouched. #' -#' Converts URLs containing ASCII punycode domain names back to their Unicode -#' representation for display purposes. This function makes internationalized -#' URLs human-readable. +#' Like \code{\link{url_encode}()}, this is \strong{best-effort host extraction and rewriting, +#' not URL parsing or canonicalization}, and is not RFC 3986 / WHATWG URL +#' conformant (no percent encoding/decoding, scheme/port/path semantics, full +#' IPv6, or serialization). Those concerns live upstack in \code{rurl}. #' -#' @param url Character vector of URLs with ASCII punycode domains +#' @param url Character vector of URL-shaped strings with ASCII punycode hosts #' @param strict Logical; whether to apply strict validation. Defaults to #' `getOption("punycoder.strict", TRUE)`. #' @return A character vector the same length as \code{url}, with each element @@ -69,13 +84,22 @@ url_decode <- function(url, strict = getOption("punycoder.strict", TRUE)) { .call_with_validation(url, strict, url_decode_cpp, "url") } -#' Parse URLs with internationalized domain name handling +#' Best-effort host extraction from a URL-shaped string +#' +#' Splits a URL-shaped string into coarse components with a hand-rolled +#' splitter, primarily to extract the host for internationalized-domain-name +#' handling, optionally ASCII-encoding it. #' -#' Parses URLs and returns a structured list with proper handling of -#' internationalized domain names.
This function provides both Unicode -#' and ASCII representations of domain components. +#' This is \strong{best-effort host extraction, not a conformant URL parser.} It is +#' \emph{not} RFC 3986 / WHATWG URL compliant: there is no percent encoding/decoding, +#' no scheme validation, no robust port/path/query semantics, no full IPv6 +#' (zone IDs / RFC 6874 are unhandled), and no serialization guarantees. The +#' non-host components are returned as a convenience only; for real URL parsing +#' and canonicalization use a dedicated URL package (e.g. \code{rurl}). This surface +#' is slated for eventual removal in favour of \code{rurl} consuming punycoder's host +#' functions. #' -#' @param url Character vector of URLs to parse +#' @param url Character vector of URL-shaped strings to split #' @param encode_domains Logical flag; encode parsed host names to ASCII. #' @return An object of class \code{"punycoder_parsed_url"} (a named list) #' with components: diff --git a/README.Rmd b/README.Rmd index a01e864..fc7cad6 100644 --- a/README.Rmd +++ b/README.Rmd @@ -99,7 +99,7 @@ validate_domain("test.com") ## Key Features - **Reliable Encoding/Decoding**: RFC 3492 compliant punycode conversion -- **URL-Aware Processing**: Handle complete URLs with international domains +- **Best-effort host rewriting**: Swap the host of a URL-shaped string in place (not a full URL parser; see below) - **High Performance**: Vectorized operations for processing large datasets - **Comprehensive Validation**: Robust error handling with informative messages - **Flexible Backend**: Automatically uses `libidn2` when available, with a built-in fallback backend @@ -116,10 +116,19 @@ international_urls <- c( "https://北京.中国/info" ) -# Convert for HTTP requests +# Convert for HTTP requests (best-effort host rewriting only) ascii_urls <- url_encode(international_urls) ``` +> `url_encode()`, `url_decode()`, and `parse_url()` do **best-effort host +> extraction and rewriting**, not RFC 3986 / WHATWG URL parsing or +>
canonicalization. They have no percent encoding/decoding, scheme validation, +> robust port/path/query semantics, full IPv6 (zone IDs / RFC 6874), or +> serialization guarantees, and are slated for eventual removal in favour of a +> dedicated URL package consuming punycoder's host functions. Use +> `host_normalize()` / `puny_encode()` directly when you control the host +> parse. + ### Data Analysis Clean and standardize URL datasets: @@ -136,11 +145,11 @@ validate_domain(c("valid.com", "invalid..domain")) `punycoder` currently provides: - Domain encoding/decoding: `puny_encode()`, `puny_decode()` -- URL host processing: `url_encode()`, `url_decode()`, `parse_url()` +- Best-effort URL host rewriting/extraction (not URL parsing/canonicalization): `url_encode()`, `url_decode()`, `parse_url()` - Domain validation utilities: `is_punycode()`, `is_idn()`, `validate_domain()` - Vectorized operations and strict/non-strict handling for malformed input - Build-time backend selection (`libidn2` when present, built-in fallback otherwise) -- Structured URL parsing where invalid inputs are returned as missing components +- Best-effort structured host extraction where invalid inputs are returned as missing components ## Acknowledgments diff --git a/README.md b/README.md index 54a1616..3d4f91c 100644 --- a/README.md +++ b/README.md @@ -102,8 +102,8 @@ validate_domain("test.com") ## Key Features - **Reliable Encoding/Decoding**: RFC 3492 compliant punycode conversion -- **URL-Aware Processing**: Handle complete URLs with international - domains +- **Best-effort host rewriting**: Swap the host of a URL-shaped string + in place (not a full URL parser; see below) - **High Performance**: Vectorized operations for processing large datasets - **Comprehensive Validation**: Robust error handling with informative @@ -124,10 +124,19 @@ international_urls <- c( "https://北京.中国/info" ) -# Convert for HTTP requests +# Convert for HTTP requests (best-effort host rewriting only) ascii_urls <- 
url_encode(international_urls) ``` +> `url_encode()`, `url_decode()`, and `parse_url()` do **best-effort +> host extraction and rewriting**, not RFC 3986 / WHATWG URL parsing or +> canonicalization. They have no percent encoding/decoding, scheme +> validation, robust port/path/query semantics, full IPv6 (zone IDs / +> RFC 6874), or serialization guarantees, and are slated for eventual +> removal in favour of a dedicated URL package consuming punycoder’s +> host functions. Use `host_normalize()` / `puny_encode()` directly when +> you control the host parse. + ### Data Analysis Clean and standardize URL datasets: @@ -145,15 +154,17 @@ validate_domain(c("valid.com", "invalid..domain")) `punycoder` currently provides: - Domain encoding/decoding: `puny_encode()`, `puny_decode()` -- URL host processing: `url_encode()`, `url_decode()`, `parse_url()` +- Best-effort URL host rewriting/extraction (not URL + parsing/canonicalization): `url_encode()`, `url_decode()`, + `parse_url()` - Domain validation utilities: `is_punycode()`, `is_idn()`, `validate_domain()` - Vectorized operations and strict/non-strict handling for malformed input - Build-time backend selection (`libidn2` when present, built-in fallback otherwise) -- Structured URL parsing where invalid inputs are returned as missing - components +- Best-effort structured host extraction where invalid inputs are + returned as missing components ## Acknowledgments diff --git a/man/parse_url.Rd b/man/parse_url.Rd index ccfd7ec..83de400 100644 --- a/man/parse_url.Rd +++ b/man/parse_url.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/url-utils.R \name{parse_url} \alias{parse_url} -\title{Parse URLs with internationalized domain name handling} +\title{Best-effort host extraction from a URL-shaped string} \usage{ parse_url(url, encode_domains = FALSE) } \arguments{ -\item{url}{Character vector of URLs to parse} +\item{url}{Character vector of URL-shaped strings to split} \item{encode_domains}{Logical flag; encode parsed 
host names to ASCII.} } @@ -27,9 +27,19 @@ An object of class \code{"punycoder_parsed_url"} (a named list) \code{path} is returned as \code{""}. } \description{ -Parses URLs and returns a structured list with proper handling of -internationalized domain names. This function provides both Unicode -and ASCII representations of domain components. +Splits a URL-shaped string into coarse components with a hand-rolled +splitter, primarily to extract the host for internationalized-domain-name +handling, optionally ASCII-encoding it. +} +\details{ +This is \strong{best-effort host extraction, not a conformant URL parser.} It is +\emph{not} RFC 3986 / WHATWG URL compliant: there is no percent encoding/decoding, +no scheme validation, no robust port/path/query semantics, no full IPv6 +(zone IDs / RFC 6874 are unhandled), and no serialization guarantees. The +non-host components are returned as a convenience only; for real URL parsing +and canonicalization use a dedicated URL package (e.g. \code{rurl}). This surface +is slated for eventual removal in favour of \code{rurl} consuming punycoder's host +functions. } \examples{ \donttest{ diff --git a/man/url_decode.Rd b/man/url_decode.Rd index bfe9695..283c0cf 100644 --- a/man/url_decode.Rd +++ b/man/url_decode.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/url-utils.R \name{url_decode} \alias{url_decode} -\title{Decode URLs with ASCII punycode domains to Unicode} +\title{Best-effort host rewriting in a URL-shaped string (ASCII punycode to Unicode)} \usage{ url_decode(url, strict = getOption("punycoder.strict", TRUE)) } \arguments{ -\item{url}{Character vector of URLs with ASCII punycode domains} +\item{url}{Character vector of URL-shaped strings with ASCII punycode hosts} \item{strict}{Logical; whether to apply strict validation. Defaults to `getOption("punycoder.strict", TRUE)`.} @@ -20,9 +20,15 @@ A character vector the same length as \code{url}, with each element \code{NA_character_}.
} \description{ -Converts URLs containing ASCII punycode domain names back to their Unicode -representation for display purposes. This function makes internationalized -URLs human-readable. +Locates the host portion of a URL-shaped string with a hand-rolled +splitter, decodes that host from ASCII punycode to Unicode, and +substitutes it back, leaving the rest of the string untouched. +} +\details{ +Like \code{\link{url_encode}()}, this is \strong{best-effort host extraction and rewriting, +not URL parsing or canonicalization}, and is not RFC 3986 / WHATWG URL +conformant (no percent encoding/decoding, scheme/port/path semantics, full +IPv6, or serialization). Those concerns live upstack in \code{rurl}. } \examples{ \donttest{ diff --git a/man/url_encode.Rd b/man/url_encode.Rd index 6626aac..0ceaff2 100644 --- a/man/url_encode.Rd +++ b/man/url_encode.Rd @@ -2,12 +2,13 @@ % Please edit documentation in R/url-utils.R \name{url_encode} \alias{url_encode} -\title{Encode URLs with Unicode domains to ASCII} +\title{Best-effort host rewriting in a URL-shaped string (Unicode host to ASCII)} \usage{ url_encode(url, strict = getOption("punycoder.strict", TRUE)) } \arguments{ -\item{url}{Character vector of URLs with potential Unicode domains} +\item{url}{Character vector of URL-shaped strings with potential Unicode +hosts} \item{strict}{Logical; whether to apply strict validation. Defaults to `getOption("punycoder.strict", TRUE)`.} @@ -19,9 +20,19 @@ A character vector the same length as \code{url}, with each element Elements corresponding to \code{NA} inputs are \code{NA_character_}. } \description{ -Converts URLs containing Unicode domain names to their ASCII representation -while preserving the rest of the URL structure. This function is essential -for preparing URLs for systems that require ASCII-only domain names. +Locates the host portion of a URL-shaped string with a hand-rolled +splitter, ASCII-encodes that host, and substitutes it back, leaving the +rest of the string untouched.
+} +\details{ +This is \strong{best-effort host extraction and rewriting, not URL parsing or +canonicalization.} It is deliberately \emph{not} RFC 3986 / WHATWG URL +conformant. Non-goals (handled upstack, e.g. by \code{rurl}): percent +encoding/decoding, scheme validation, port/path/query semantics, full +IPv6 (including zone IDs / RFC 6874), and URL serialization. Pass only the +host to \code{\link{host_normalize}()} / \code{\link{puny_encode}()} when you control the parse; +use this helper only for quick host rewriting in an already-trusted +URL-shaped string. } \examples{ \donttest{