From db76e5513fef4a2df6a3ebb37376316bab43d12e Mon Sep 17 00:00:00 2001
From: Bart Turczynski <142225707+bart-turczynski@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:07:56 +0200
Subject: [PATCH] docs: demote URL surface to best-effort host extraction
 (PUNY-gfjkdjol)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-document url_encode/url_decode/parse_url as best-effort host extraction
and rewriting in URL-shaped strings — explicitly NOT RFC 3986 / WHATWG URL
parsing or canonicalization. State the non-goals (percent encoding/decoding,
scheme/port/path semantics, full IPv6 incl. zone IDs / RFC 6874,
serialization) and note the surface is slated for eventual removal in favour
of a dedicated URL package (rurl) consuming punycoder's host functions.

Updated roxygen, DESCRIPTION, and README (.Rmd + rendered .md). Phase 1 only;
deprecation/removal is a separate follow-up ticket. No behaviour change.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 DESCRIPTION       |  5 ++++-
 R/url-utils.R     | 54 ++++++++++++++++++++++++++++++++++-------------
 README.Rmd        | 17 +++++++++++----
 README.md         | 23 ++++++++++++++------
 man/parse_url.Rd  | 20 +++++++++++++-----
 man/url_decode.Rd | 16 +++++++++-----
 man/url_encode.Rd | 21 +++++++++++++-----
 7 files changed, 115 insertions(+), 41 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5db3402..ea323ef 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -12,7 +12,10 @@ Description: High-performance Unicode and Punycode processing for
     apply Unicode IDNA normalization. 'host_normalize()' is the Unicode
     Technical Standard #46 host-normalization entry point, mapping a host
     name to a canonical lowercase ASCII comparison form (non-transitional
-    profile, pinned Unicode version). Aimed at URL processing and data
+    profile, pinned Unicode version). The 'url_encode()' / 'url_decode()' /
+    'parse_url()' helpers do best-effort host extraction and rewriting in
+    URL-shaped strings and are deliberately not RFC 3986 / WHATWG URL
+    parsers or canonicalizers. Aimed at host normalization and data
     analysis workflows.
 Depends: R (>= 3.5.0)
 Imports: Rcpp (>= 1.0.0)
diff --git a/R/url-utils.R b/R/url-utils.R
index bdf5715..fc8048a 100644
--- a/R/url-utils.R
+++ b/R/url-utils.R
@@ -1,10 +1,20 @@
-#' Encode URLs with Unicode domains to ASCII
+#' Best-effort host rewriting in a URL-shaped string (Unicode host to ASCII)
 #'
-#' Converts URLs containing Unicode domain names to their ASCII representation
-#' while preserving the rest of the URL structure. This function is essential
-#' for preparing URLs for systems that require ASCII-only domain names.
+#' Locates the host portion of a URL-shaped string with a hand-rolled
+#' splitter, ASCII-encodes that host, and substitutes it back, leaving the
+#' rest of the string untouched.
 #'
-#' @param url Character vector of URLs with potential Unicode domains
+#' This is \strong{best-effort host extraction and rewriting, not URL parsing
+#' or canonicalization.} It is deliberately \emph{not} RFC 3986 / WHATWG URL
+#' conformant. Non-goals (left to a dedicated URL package, e.g. \pkg{rurl}):
+#' percent encoding/decoding, scheme validation, port/path/query semantics,
+#' full IPv6 (including zone IDs / RFC 6874), and URL serialization. Pass only
+#' the host to \code{\link{host_normalize}()} / \code{\link{puny_encode}()}
+#' when you control the parse; use this helper only for quick host rewriting
+#' in an already-trusted URL-shaped string.
+#'
+#' @param url Character vector of URL-shaped strings with potential Unicode
+#'   hosts
 #' @param strict Logical; whether to apply strict validation. Defaults to
 #'   `getOption("punycoder.strict", TRUE)`.
 #' @return A character vector the same length as \code{url}, with each element
@@ -34,13 +44,18 @@ url_encode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
   .call_with_validation(url, strict, url_encode_cpp, "url")
 }
 
-#' Decode URLs with ASCII punycode domains to Unicode
+#' Best-effort host rewriting in a URL-shaped string (ASCII punycode to Unicode)
+#'
+#' Locates the host portion of a URL-shaped string with a hand-rolled
+#' splitter, decodes that host from ASCII punycode to Unicode, and
+#' substitutes it back, leaving the rest of the string untouched.
 #'
-#' Converts URLs containing ASCII punycode domain names back to their Unicode
-#' representation for display purposes. This function makes internationalized
-#' URLs human-readable.
+#' Like \code{\link{url_encode}()}, this is \strong{best-effort host extraction
+#' and rewriting, not URL parsing or canonicalization}, and is not RFC 3986 /
+#' WHATWG URL conformant (no percent encoding/decoding, scheme/port/path
+#' semantics, full IPv6, or serialization). Those are left to \pkg{rurl}.
 #'
-#' @param url Character vector of URLs with ASCII punycode domains
+#' @param url Character vector of URL-shaped strings with ASCII punycode hosts
 #' @param strict Logical; whether to apply strict validation. Defaults to
 #'   `getOption("punycoder.strict", TRUE)`.
 #' @return A character vector the same length as \code{url}, with each element
@@ -69,13 +84,22 @@ url_decode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
   .call_with_validation(url, strict, url_decode_cpp, "url")
 }
 
-#' Parse URLs with internationalized domain name handling
+#' Best-effort host extraction from a URL-shaped string
+#'
+#' Splits a URL-shaped string into coarse components with a hand-rolled
+#' splitter, primarily to extract the host for internationalized-domain-name
+#' handling, optionally ASCII-encoding it.
 #'
-#' Parses URLs and returns a structured list with proper handling of
-#' internationalized domain names. This function provides both Unicode
-#' and ASCII representations of domain components.
+#' This is \strong{best-effort host extraction, not a conformant URL parser.}
+#' It is \emph{not} RFC 3986 / WHATWG URL compliant: there is no percent
+#' encoding/decoding, no scheme validation, no robust port/path/query
+#' semantics, no full IPv6 (zone IDs / RFC 6874 are unhandled), and no
+#' serialization guarantees. The non-host components are returned as a
+#' convenience only; for real URL parsing and canonicalization use a dedicated
+#' URL package (e.g. \pkg{rurl}). This surface is slated for eventual removal
+#' in favour of \pkg{rurl} consuming punycoder's host functions.
 #'
-#' @param url Character vector of URLs to parse
+#' @param url Character vector of URL-shaped strings to split
 #' @param encode_domains Logical flag; encode parsed host names to ASCII.
 #' @return An object of class \code{"punycoder_parsed_url"} (a named list)
 #'   with components:
diff --git a/README.Rmd b/README.Rmd
index a01e864..fc7cad6 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -99,7 +99,7 @@ validate_domain("test.com")
 ## Key Features
 
 - **Reliable Encoding/Decoding**: RFC 3492 compliant punycode conversion
-- **URL-Aware Processing**: Handle complete URLs with international domains  
+- **Best-effort host rewriting**: Swap the host of a URL-shaped string in place (not a full URL parser; see below)
 - **High Performance**: Vectorized operations for processing large datasets
 - **Comprehensive Validation**: Robust error handling with informative messages
 - **Flexible Backend**: Automatically uses `libidn2` when available, with a built-in fallback backend
@@ -116,10 +116,19 @@ international_urls <- c(
   "https://北京.中国/info"
 )
 
-# Convert for HTTP requests
+# Convert for HTTP requests (best-effort host rewriting only)
 ascii_urls <- url_encode(international_urls)
 ```
 
+> `url_encode()`, `url_decode()`, and `parse_url()` do **best-effort host
+> extraction and rewriting**, not RFC 3986 / WHATWG URL parsing or
+> canonicalization. They have no percent encoding/decoding, scheme validation,
+> robust port/path/query semantics, full IPv6 (zone IDs / RFC 6874), or
+> serialization guarantees, and are slated for eventual removal in favour of a
+> dedicated URL package consuming punycoder's host functions. Use
+> `host_normalize()` / `puny_encode()` directly when you control the host
+> parse.
+
 ### Data Analysis
 Clean and standardize URL datasets:
 
@@ -136,11 +145,11 @@ validate_domain(c("valid.com", "invalid..domain"))
 `punycoder` currently provides:
 
 - Domain encoding/decoding: `puny_encode()`, `puny_decode()`
-- URL host processing: `url_encode()`, `url_decode()`, `parse_url()`
+- Best-effort URL host rewriting/extraction (not URL parsing/canonicalization): `url_encode()`, `url_decode()`, `parse_url()`
 - Domain validation utilities: `is_punycode()`, `is_idn()`, `validate_domain()`
 - Vectorized operations and strict/non-strict handling for malformed input
 - Build-time backend selection (`libidn2` when present, built-in fallback otherwise)
-- Structured URL parsing where invalid inputs are returned as missing components
+- Best-effort structured host extraction where invalid inputs are returned as missing components
 
 ## Acknowledgments
 
diff --git a/README.md b/README.md
index 54a1616..3d4f91c 100644
--- a/README.md
+++ b/README.md
@@ -102,8 +102,8 @@ validate_domain("test.com")
 ## Key Features
 
 - **Reliable Encoding/Decoding**: RFC 3492 compliant punycode conversion
-- **URL-Aware Processing**: Handle complete URLs with international
-  domains  
+- **Best-effort host rewriting**: Swap the host of a URL-shaped string
+  in place (not a full URL parser; see below)
 - **High Performance**: Vectorized operations for processing large
   datasets
 - **Comprehensive Validation**: Robust error handling with informative
@@ -124,10 +124,19 @@ international_urls <- c(
   "https://北京.中国/info"
 )
 
-# Convert for HTTP requests
+# Convert for HTTP requests (best-effort host rewriting only)
 ascii_urls <- url_encode(international_urls)
 ```
 
+> `url_encode()`, `url_decode()`, and `parse_url()` do **best-effort
+> host extraction and rewriting**, not RFC 3986 / WHATWG URL parsing or
+> canonicalization. They have no percent encoding/decoding, scheme
+> validation, robust port/path/query semantics, full IPv6 (zone IDs /
+> RFC 6874), or serialization guarantees, and are slated for eventual
+> removal in favour of a dedicated URL package consuming punycoder’s
+> host functions. Use `host_normalize()` / `puny_encode()` directly when
+> you control the host parse.
+
 ### Data Analysis
 
 Clean and standardize URL datasets:
@@ -145,15 +154,17 @@ validate_domain(c("valid.com", "invalid..domain"))
 `punycoder` currently provides:
 
 - Domain encoding/decoding: `puny_encode()`, `puny_decode()`
-- URL host processing: `url_encode()`, `url_decode()`, `parse_url()`
+- Best-effort URL host rewriting/extraction (not URL
+  parsing/canonicalization): `url_encode()`, `url_decode()`,
+  `parse_url()`
 - Domain validation utilities: `is_punycode()`, `is_idn()`,
   `validate_domain()`
 - Vectorized operations and strict/non-strict handling for malformed
   input
 - Build-time backend selection (`libidn2` when present, built-in
   fallback otherwise)
-- Structured URL parsing where invalid inputs are returned as missing
-  components
+- Best-effort structured host extraction where invalid inputs are
+  returned as missing components
 
 ## Acknowledgments
 
diff --git a/man/parse_url.Rd b/man/parse_url.Rd
index ccfd7ec..83de400 100644
--- a/man/parse_url.Rd
+++ b/man/parse_url.Rd
@@ -2,12 +2,12 @@
 % Please edit documentation in R/url-utils.R
 \name{parse_url}
 \alias{parse_url}
-\title{Parse URLs with internationalized domain name handling}
+\title{Best-effort host extraction from a URL-shaped string}
 \usage{
 parse_url(url, encode_domains = FALSE)
 }
 \arguments{
-\item{url}{Character vector of URLs to parse}
+\item{url}{Character vector of URL-shaped strings to split}
 
 \item{encode_domains}{Logical flag; encode parsed host names to ASCII.}
 }
@@ -27,9 +27,19 @@ An object of class \code{"punycoder_parsed_url"} (a named list)
   \code{path} is returned as \code{""}.
 }
 \description{
-Parses URLs and returns a structured list with proper handling of
-internationalized domain names. This function provides both Unicode
-and ASCII representations of domain components.
+Splits a URL-shaped string into coarse components with a hand-rolled
+splitter, primarily to extract the host for internationalized-domain-name
+handling, optionally ASCII-encoding it.
+}
+\details{
+This is \strong{best-effort host extraction, not a conformant URL parser.}
+It is \emph{not} RFC 3986 / WHATWG URL compliant: there is no percent
+encoding/decoding, no scheme validation, no robust port/path/query
+semantics, no full IPv6 (zone IDs / RFC 6874 are unhandled), and no
+serialization guarantees. The non-host components are returned as a
+convenience only; for real URL parsing and canonicalization use a dedicated
+URL package (e.g. \pkg{rurl}). This surface is slated for eventual removal
+in favour of \pkg{rurl} consuming punycoder's host functions.
 }
 \examples{
 \donttest{
diff --git a/man/url_decode.Rd b/man/url_decode.Rd
index bfe9695..283c0cf 100644
--- a/man/url_decode.Rd
+++ b/man/url_decode.Rd
@@ -2,12 +2,12 @@
 % Please edit documentation in R/url-utils.R
 \name{url_decode}
 \alias{url_decode}
-\title{Decode URLs with ASCII punycode domains to Unicode}
+\title{Best-effort host rewriting in a URL-shaped string (ASCII punycode to Unicode)}
 \usage{
 url_decode(url, strict = getOption("punycoder.strict", TRUE))
 }
 \arguments{
-\item{url}{Character vector of URLs with ASCII punycode domains}
+\item{url}{Character vector of URL-shaped strings with ASCII punycode hosts}
 
 \item{strict}{Logical; whether to apply strict validation. Defaults to
 `getOption("punycoder.strict", TRUE)`.}
@@ -20,9 +20,15 @@ A character vector the same length as \code{url}, with each element
   \code{NA_character_}.
 }
 \description{
-Converts URLs containing ASCII punycode domain names back to their Unicode
-representation for display purposes. This function makes internationalized
-URLs human-readable.
+Locates the host portion of a URL-shaped string with a hand-rolled
+splitter, decodes that host from ASCII punycode to Unicode, and
+substitutes it back, leaving the rest of the string untouched.
+}
+\details{
+Like \code{\link{url_encode}()}, this is \strong{best-effort host extraction
+and rewriting, not URL parsing or canonicalization}, and is not RFC 3986 /
+WHATWG URL conformant (no percent encoding/decoding, scheme/port/path
+semantics, full IPv6, or serialization). Those are left to \pkg{rurl}.
 }
 \examples{
 \donttest{
diff --git a/man/url_encode.Rd b/man/url_encode.Rd
index 6626aac..0ceaff2 100644
--- a/man/url_encode.Rd
+++ b/man/url_encode.Rd
@@ -2,12 +2,13 @@
 % Please edit documentation in R/url-utils.R
 \name{url_encode}
 \alias{url_encode}
-\title{Encode URLs with Unicode domains to ASCII}
+\title{Best-effort host rewriting in a URL-shaped string (Unicode host to ASCII)}
 \usage{
 url_encode(url, strict = getOption("punycoder.strict", TRUE))
 }
 \arguments{
-\item{url}{Character vector of URLs with potential Unicode domains}
+\item{url}{Character vector of URL-shaped strings with potential Unicode
+hosts}
 
 \item{strict}{Logical; whether to apply strict validation. Defaults to
 `getOption("punycoder.strict", TRUE)`.}
@@ -19,9 +20,19 @@ A character vector the same length as \code{url}, with each element
   Elements corresponding to \code{NA} inputs are \code{NA_character_}.
 }
 \description{
-Converts URLs containing Unicode domain names to their ASCII representation
-while preserving the rest of the URL structure. This function is essential
-for preparing URLs for systems that require ASCII-only domain names.
+Locates the host portion of a URL-shaped string with a hand-rolled
+splitter, ASCII-encodes that host, and substitutes it back, leaving the
+rest of the string untouched.
+}
+\details{
+This is \strong{best-effort host extraction and rewriting, not URL parsing
+or canonicalization.} It is deliberately \emph{not} RFC 3986 / WHATWG URL
+conformant. Non-goals (left to a dedicated URL package, e.g. \pkg{rurl}):
+percent encoding/decoding, scheme validation, port/path/query semantics,
+full IPv6 (including zone IDs / RFC 6874), and URL serialization. Pass only
+the host to \code{\link{host_normalize}()} / \code{\link{puny_encode}()}
+when you control the parse; use this helper only for quick host rewriting
+in an already-trusted URL-shaped string.
 }
 \examples{
 \donttest{