Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# punycoder 1.1.0.9000 (development version)
# punycoder (development version)

## New features

Expand All @@ -10,6 +10,16 @@
the same flag values to `normalization_profile_info()` for the matching
profile identity.

## Deprecated

* `url_encode()`, `url_decode()`, and `parse_url()` are deprecated and now emit
a `.Deprecated()` warning on use. They remain exported and fully functional
for this release and are scheduled for removal in the next one. These were
always best-effort host extraction/rewriting, not RFC 3986 / WHATWG URL
parsing; use the `rurl` package for URL parsing and canonicalization, or pass
the host alone to `host_normalize()` / `puny_encode()` / `puny_decode()` for
host-only needs.

## Breaking changes

* `host_normalize()` no longer takes a `strict` argument. It was inert (always
Expand Down
11 changes: 6 additions & 5 deletions R/normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,12 @@ host_normalize <- function(x, check_hyphens = TRUE, use_std3 = TRUE,

# Derive the coarse `profile` cache token from a flag set. The default profile
# (all checks on) yields the byte-stable historical token; any deviation appends
# a deterministic, fixed-order tag so a token minted under one flag set can never
# `identical()`-match one minted under another. The token is a COARSE cache key
# only: the precise identity lives in the per-parameter columns, which downstream
# keys on (PUNY-nblrvplp). check_bidi / check_joiners / transitional are not
# knobs (fixed by the profile), so they never enter the token.
# a deterministic, fixed-order tag so a token minted under one flag set can
# never `identical()`-match one minted under another. The token is a COARSE
# cache key only: the precise identity lives in the per-parameter columns,
# which downstream keys on (PUNY-nblrvplp). check_bidi / check_joiners /
# transitional are not knobs (fixed by the profile), so they never enter the
# token.
.normalization_profile_token <- function(check_hyphens, use_std3,
verify_dns_length) {
base <- "uts46-nontransitional-std3-v1"
Expand Down
46 changes: 46 additions & 0 deletions R/url-utils.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
# Shared deprecation signal for the URL helpers (url_encode(), url_decode(),
# parse_url()). Raises a base `.Deprecated()` warning whose text names the
# deprecated function, announces its removal in a future release, points to
# `rurl` for URL parsing/canonicalization, and suggests the host-only
# replacement that fits the caller.
#
# `old` is the bare name of the deprecated function, e.g. "url_decode".
.deprecate_url_surface <- function(old) {
  # url_decode() is steered towards the decoding helpers; every other caller
  # (url_encode(), parse_url()) falls through to the encoding helpers.
  replacement <- switch(
    old,
    url_decode = "host_normalize() / puny_decode() for host-only decoding",
    "host_normalize() / puny_encode() for host-only encoding"
  )
  template <- paste0(
    "'%s()' is deprecated and will be removed in a future release.\n",
    "Use the 'rurl' package for URL parsing/canonicalization, or %s."
  )
  # `old` is passed explicitly: the immediate caller of .Deprecated() is this
  # helper, so the default would record the helper's name instead.
  .Deprecated(msg = sprintf(template, old, replacement), old = old)
}

#' Best-effort host rewriting in a URL-shaped string (Unicode host to ASCII)
#'
#' Locates the host portion of a URL-shaped string with a hand-rolled
Expand All @@ -13,6 +36,12 @@
#' use this helper only for quick host rewriting in an already-trusted
#' URL-shaped string.
#'
#' @section Deprecated:
#' This function is deprecated and slated for removal in a future release. For
#' URL parsing and canonicalization use a dedicated URL package (e.g. `rurl`);
#' for host-only encoding pass the host alone to [host_normalize()] or
#' [puny_encode()].
#'
#' @param url Character vector of URL-shaped strings with potential Unicode
#' hosts
#' @param strict Logical; whether to apply strict validation. Defaults to
Expand All @@ -39,8 +68,10 @@
#' )
#' url_encode(urls)
#' }
#' @keywords internal
#' @export
url_encode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
# Signal the deprecation first so the warning is raised on every call, even
# one that goes on to error during the delegated work below.
.deprecate_url_surface("url_encode")
# NOTE(review): .call_with_validation() is defined elsewhere; presumably it
# validates `url`, then applies url_encode_cpp honouring `strict`, with "url"
# naming the input in error messages -- confirm against the helper.
.call_with_validation(url, strict, url_encode_cpp, "url")
}

Expand All @@ -55,6 +86,11 @@ url_encode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
#' conformant (no percent encoding/decoding, scheme/port/path semantics, full
#' IPv6, or serialization). Those concerns live upstack in `rurl`.
#'
#' @section Deprecated:
#' This function is deprecated and slated for removal in a future release. For
#' URL parsing and canonicalization use a dedicated URL package (e.g. `rurl`);
#' for host-only decoding pass the host alone to [puny_decode()].
#'
#' @param url Character vector of URL-shaped strings with ASCII punycode hosts
#' @param strict Logical; whether to apply strict validation. Defaults to
#' `getOption("punycoder.strict", TRUE)`.
Expand All @@ -79,8 +115,10 @@ url_encode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
#' )
#' url_decode(ascii_urls)
#' }
#' @keywords internal
#' @export
url_decode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
# Signal the deprecation first so the warning is raised on every call, even
# one that goes on to error during the delegated work below.
.deprecate_url_surface("url_decode")
# NOTE(review): .call_with_validation() is defined elsewhere; presumably it
# validates `url`, then applies url_decode_cpp honouring `strict`, with "url"
# naming the input in error messages -- confirm against the helper.
.call_with_validation(url, strict, url_decode_cpp, "url")
}

Expand All @@ -99,6 +137,12 @@ url_decode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
#' is slated for eventual removal in favour of `rurl` consuming punycoder's host
#' functions.
#'
#' @section Deprecated:
#' This function is deprecated and slated for removal in a future release. For
#' URL parsing and canonicalization use a dedicated URL package (e.g. `rurl`);
#' for host-only encoding pass the host alone to [host_normalize()] or
#' [puny_encode()].
#'
#' @param url Character vector of URL-shaped strings to split
#' @param encode_domains Logical flag; encode parsed host names to ASCII.
#' @return An object of class \code{"punycoder_parsed_url"} (a named list)
Expand Down Expand Up @@ -130,8 +174,10 @@ url_decode <- function(url, strict = getOption("punycoder.strict", TRUE)) {
#' )
#' parse_url(urls)
#' }
#' @keywords internal
#' @export
parse_url <- function(url, encode_domains = FALSE) {
.deprecate_url_surface("parse_url")
.assert_character(url)
.assert_flag(encode_domains, "encode_domains")
.warn_if_na(url)
Expand Down
9 changes: 9 additions & 0 deletions man/parse_url.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions man/url_decode.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions man/url_encode.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions tests/testthat/helper-validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,16 @@ expect_rejects_non_character <- function(fn, ...) {
testthat::expect_error(fn(TRUE, ...), "character vector")
testthat::expect_error(fn(list("test"), ...), "character vector")
}

# Test helper: evaluate `code` with deprecation warnings silenced.
#
# url_encode(), url_decode() and parse_url() are deprecated (PUNY-vpegoytz) and
# warn via .Deprecated() on every call. test-urls.R holds the tests that assert
# that warning; behavioural tests wrap their bodies in this helper so the
# deprecation warning is neither noise nor mistaken for the warning under test.
# Only conditions of class "deprecatedWarning" are muffled -- any other warning
# (e.g. the NA-input warnings) still reaches the expectations that look for it.
suppress_url_deprecation <- function(code) {
  muffle_deprecation <- function(cnd) invokeRestart("muffleWarning")
  withCallingHandlers(code, deprecatedWarning = muffle_deprecation)
}
42 changes: 24 additions & 18 deletions tests/testthat/test-contracts.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
test_that("parse_url preserves object attributes and empty-path contract", {
parsed <- parse_url("https://example.com", encode_domains = TRUE)
test_that(
"parse_url preserves object attributes and empty-path contract",
suppress_url_deprecation({
parsed <- parse_url("https://example.com", encode_domains = TRUE)

expect_s3_class(parsed, "punycoder_parsed_url")
expect_identical(attr(parsed, "encode_domains"), TRUE)
expect_identical(parsed$path[[1]], "")
expect_identical(parsed$domain[[1]], "example.com")
})
expect_s3_class(parsed, "punycoder_parsed_url")
expect_identical(attr(parsed, "encode_domains"), TRUE)
expect_identical(parsed$path[[1]], "")
expect_identical(parsed$domain[[1]], "example.com")
})
)

test_that("parse_url invalid inputs return missing components", {
parsed <- parse_url("")
test_that(
"parse_url invalid inputs return missing components",
suppress_url_deprecation({
parsed <- parse_url("")

expect_true(is.na(parsed$scheme[[1]]))
expect_true(is.na(parsed$domain[[1]]))
expect_true(is.na(parsed$port[[1]]))
expect_true(is.na(parsed$path[[1]]))
expect_true(is.na(parsed$query[[1]]))
expect_true(is.na(parsed$fragment[[1]]))
})
expect_true(is.na(parsed$scheme[[1]]))
expect_true(is.na(parsed$domain[[1]]))
expect_true(is.na(parsed$port[[1]]))
expect_true(is.na(parsed$path[[1]]))
expect_true(is.na(parsed$query[[1]]))
expect_true(is.na(parsed$fragment[[1]]))
})
)

test_that("validate_domain preserves result attributes", {
result <- validate_domain("example.com", strict = FALSE)
Expand All @@ -36,11 +42,11 @@ test_that("strict wrappers preserve user-facing error prefixes", {
"^Error decoding domain:"
)
expect_error(
url_encode("", strict = TRUE),
suppress_url_deprecation(url_encode("", strict = TRUE)),
"^Error encoding URL:"
)
expect_error(
url_decode("", strict = TRUE),
suppress_url_deprecation(url_decode("", strict = TRUE)),
"^Error decoding URL:"
)
})
Expand Down
9 changes: 7 additions & 2 deletions tests/testthat/test-encoding.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,16 @@ test_that("strict defaults follow global punycoder.strict option", {
on.exit(options(old), add = TRUE)

expect_true(is.na(puny_encode("invalid..domain")))
expect_true(is.na(url_decode("https://xn--.example.com")))
expect_true(
is.na(suppress_url_deprecation(url_decode("https://xn--.example.com")))
)

options(punycoder.strict = TRUE)
expect_error(puny_encode("invalid..domain"), "Error encoding domain")
expect_error(url_decode("https://xn--.example.com"), "Error decoding URL")
expect_error(
suppress_url_deprecation(url_decode("https://xn--.example.com")),
"Error decoding URL"
)
})

test_that("punycode handles uppercase and trailing dots", {
Expand Down
6 changes: 5 additions & 1 deletion tests/testthat/test-idna-conformance.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ test_that("relaxing a UTS-46 flag stays bounded against IdnaTestV2", {
sort(.idna_known_divergence[[flag]] %||% character(0)),
info = flag
)
expect_identical(got[newly[bounded]], df$to_ascii[newly[bounded]], info = flag)
expect_identical(
got[newly[bounded]],
df$to_ascii[newly[bounded]],
info = flag
)
}
})
17 changes: 12 additions & 5 deletions tests/testthat/test-normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ test_that("host_normalize relaxes exactly the named UTS #46 flag", {

# check_hyphens: leading/trailing hyphen and "--" in 3rd/4th positions.
expect_identical(host_normalize("-lead.com"), NA_character_)
expect_identical(host_normalize("-lead.com", check_hyphens = FALSE), "-lead.com")
expect_identical(
host_normalize("-lead.com", check_hyphens = FALSE), "-lead.com"
)
expect_identical(host_normalize("trail-.com"), NA_character_)
expect_identical(
host_normalize("ab--cd.com", check_hyphens = FALSE), "ab--cd.com"
Expand All @@ -120,7 +122,9 @@ test_that("host_normalize relaxes exactly the named UTS #46 flag", {
)

# Each flag is independent: relaxing one does not relax the others.
expect_identical(host_normalize("a_b.com", check_hyphens = FALSE), NA_character_)
expect_identical(
host_normalize("a_b.com", check_hyphens = FALSE), NA_character_
)
expect_identical(host_normalize("-lead.com", use_std3 = FALSE), NA_character_)
})

Expand Down Expand Up @@ -153,7 +157,7 @@ test_that("normalization_profile_info reports the ratified profile identity", {
expect_true(info$verify_dns_length)
})

test_that("normalization_profile_info reports identity for a specific flag set", {
test_that("normalization_profile_info reports identity for a flag set", {
# Each knob is reflected in its own column.
expect_false(normalization_profile_info(check_hyphens = FALSE)$check_hyphens)
expect_false(normalization_profile_info(use_std3 = FALSE)$use_std3)
Expand All @@ -170,7 +174,7 @@ test_that("normalization_profile_info reports identity for a specific flag set",
expect_true(relaxed$check_joiners)
})

test_that("profile token is byte-stable for defaults and distinct per flag set", {
test_that("profile token is byte-stable for defaults, distinct per flag set", {
# The default call is byte-identical to the historical token, so a zero-arg
# downstream reader (e.g. pslr) sees no change.
expect_identical(
Expand All @@ -194,7 +198,10 @@ test_that("profile token is byte-stable for defaults and distinct per flag set",
normalization_profile_info(
check_hyphens = FALSE, use_std3 = FALSE, verify_dns_length = FALSE
)$profile,
"uts46-nontransitional-std3-v1+no-check-hyphens+no-std3+no-verify-dns-length"
paste0(
"uts46-nontransitional-std3-v1",
"+no-check-hyphens+no-std3+no-verify-dns-length"
)
)

# Distinct flag sets never collide on the token.
Expand Down
33 changes: 18 additions & 15 deletions tests/testthat/test-performance.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,26 @@ test_that("Unicode domain throughput stays high for encode and decode", {
expect_rate_at_least(puny_decode, ascii_domains, 10000)
})

test_that("mixed URL throughput stays high for encode and decode", {
skip_on_cran()
test_that(
"mixed URL throughput stays high for encode and decode",
suppress_url_deprecation({
skip_on_cran()

unicode_urls <- rep(
c(
"https://café.example.com/path?query=value",
"https://user:pass@παράδειγμα.ελ:8443/path#frag",
"http://127.0.0.1/path",
"http://[2001:db8::1]/path"
),
4000
)
ascii_urls <- url_encode(unicode_urls)
unicode_urls <- rep(
c(
"https://café.example.com/path?query=value",
"https://user:pass@παράδειγμα.ελ:8443/path#frag",
"http://127.0.0.1/path",
"http://[2001:db8::1]/path"
),
4000
)
ascii_urls <- url_encode(unicode_urls)

expect_rate_at_least(url_encode, unicode_urls, 5000)
expect_rate_at_least(url_decode, ascii_urls, 5000)
})
expect_rate_at_least(url_encode, unicode_urls, 5000)
expect_rate_at_least(url_decode, ascii_urls, 5000)
})
)

test_that("large vector workloads remain scalable for encode and decode", {
skip_on_cran()
Expand Down
Loading
Loading