From f586c192cc38a24b394929caadae8b4a9d8757b1 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:24:43 +0700 Subject: [PATCH 01/21] fix: accept case-insensitive percent-encoding, reject non-digit ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per RFC-3986 §2.1/§6.2.2.1, HEXDIG in a percent-encoding is case-insensitive (%3a is equivalent to %3A). The validator rejected lowercase a-f, refusing valid input; it now accepts both cases. Per RFC-3986 §3.2.3, port = *DIGIT. Ports such as 0x1F or 1e3 were coerced by Number() and accepted; they are now kept raw on parse and rejected as URI_INVALID_PORT by checkURI, encodeURIString and decodeURIString. Adds an internal isPort guard and RFC-cited tests. --- src/checkers/chars.ts | 6 ++++- src/checkers/index.ts | 9 ++++--- src/decoders/index.ts | 9 ++++--- src/encoders/index.ts | 9 ++++--- src/helpers/cast.ts | 16 +++++++++++- src/parser/index.ts | 15 ++++++++--- tests/checkers-chars.test.ts | 11 +++++++++ tests/checkers.test.ts | 48 +++++++++++++++++++++++++----------- tests/fixtures/chars.ts | 7 ++++-- tests/fixtures/uris.ts | 24 +++++++++--------- tests/helpers-cast.test.ts | 27 +++++++++++++++++++- 11 files changed, 137 insertions(+), 44 deletions(-) diff --git a/src/checkers/chars.ts b/src/checkers/chars.ts index de8a037..ef590d8 100644 --- a/src/checkers/chars.ts +++ b/src/checkers/chars.ts @@ -315,8 +315,12 @@ const isSitemapQueryOrFragmentChar = function isSitemapQueryOrFragmentChar( * * Check percent encoding legal ascii codes according to RFC-3986 https://tools.ietf.org/html/rfc3986#section-2.1. * + * HEXDIG is case-insensitive: %3a and %3A are equivalent + * (RFC-3986 https://tools.ietf.org/html/rfc3986#section-6.2.2.1). 
+ * * 48 to 57 0-9 * 65 to 70 A-F + * 97 to 102 a-f */ const isPercentEncodingChar = function isPercentEncodingChar(char: string): boolean { if (!is(String, char)) { @@ -325,7 +329,7 @@ const isPercentEncodingChar = function isPercentEncodingChar(char: string): bool const code = char.charCodeAt(0); - return (code >= 48 && code <= 57) || (code >= 65 && code <= 70); + return (code >= 48 && code <= 57) || (code >= 65 && code <= 70) || (code >= 97 && code <= 102); }; export { diff --git a/src/checkers/index.ts b/src/checkers/index.ts index c76bf01..ba7f3da 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -17,7 +17,7 @@ */ import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { type ParsedURI, parseURI } from '../parser/index.js'; @@ -444,8 +444,11 @@ const checkURI = function checkURI( throw error; } - // check port is a number if any - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) { + // check port is a valid RFC-3986 *DIGIT and in range if any + if ( + exists(port) && + (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) + ) { const error = new URIError( `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, ) as URIErrorWithCode; diff --git a/src/decoders/index.ts b/src/decoders/index.ts index af24c76..b5a094c 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -9,7 +9,7 @@ import { checkSchemeChars, checkURISyntax } from '../checkers/index.js'; import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; 
import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { recomposeURI } from '../parser/index.js'; @@ -138,8 +138,11 @@ const decodeURIString = function decodeURIString( throw error; } - // check port is a number if any - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) { + // check port is a valid RFC-3986 *DIGIT and in range if any + if ( + exists(port) && + (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) + ) { const error = new URIError( `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, ) as URIErrorWithCode; diff --git a/src/encoders/index.ts b/src/encoders/index.ts index 0fc474e..8d0518e 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -18,7 +18,7 @@ import { import { checkSchemeChars, checkURISyntax } from '../checkers/index.js'; import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { recomposeURI } from '../parser/index.js'; @@ -171,8 +171,11 @@ const encodeURIString = function encodeURIString( throw error; } - // check port is a number if any - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) { + // check port is a valid RFC-3986 *DIGIT and in range if any + if ( + exists(port) && + (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) + ) { const error = new URIError( `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, ) as URIError & { code: string }; diff --git a/src/helpers/cast.ts b/src/helpers/cast.ts index 66d7341..cff35ea 100644 --- a/src/helpers/cast.ts +++ b/src/helpers/cast.ts @@ -122,4 +122,18 @@ const int = 
function int(thing: unknown, { ge, le }: Range = {}): number | undef return castInt; }; -export { int, integer, num, number }; +/** + * @func isPort + * + * RFC-3986 §3.2.3: port = *DIGIT. True if the value is absent + * (null/undefined) or a possibly empty string of ASCII digits. + * The numeric range is validated separately by `int`. + * + * Rejects JS Number coercion artefacts (hex `0x1F`, scientific `1e3`, + * whitespace) that `Number()` would otherwise accept. + */ +const isPort = function isPort(thing: unknown): boolean { + return thing === null || thing === undefined || /^[0-9]*$/.test(String(thing)); +}; + +export { int, integer, isPort, num, number }; diff --git a/src/parser/index.ts b/src/parser/index.ts index 480ee8d..1061cde 100644 --- a/src/parser/index.ts +++ b/src/parser/index.ts @@ -6,7 +6,7 @@ * - parseURI(uri) -> Object */ import { maxPortInteger, minPortInteger } from '../config/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; import { exists, is } from '../helpers/object.js'; import { isIPv6 } from '../ip/index.js'; import { punycode, punydecode } from '../punycode/index.js'; @@ -112,7 +112,11 @@ const recomposeURI = function recomposeURI(components?: URIComponents): string { uri += hostToURI(host); - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) !== undefined) { + if ( + exists(port) && + isPort(port) && + int(port, { ge: minPortInteger, le: maxPortInteger }) !== undefined + ) { uri += `:${port}`; } } else { @@ -250,7 +254,12 @@ const parseURI = function parseURI(uri: string): ParsedURI { // necessary to handle possible port errors when checking uri // port is a valid integer or we keep its initial value to be aware of the error // here we also don't check wrong range for the same reason - port = int(portToCast) || portToCast; + // RFC-3986 §3.2.3: a non-digit port (0x1F, 1e3, ...) 
is kept raw, not + // coerced by Number(), so checkURI can flag it as URI_INVALID_PORT + port = + is(String, portToCast) && portToCast.length > 0 && !isPort(portToCast) + ? portToCast + : int(portToCast) || portToCast; // recompose authority with punycode ASCII and Unicode serialization of the host // userinfo@host:port diff --git a/tests/checkers-chars.test.ts b/tests/checkers-chars.test.ts index 9efed92..30b3fc0 100644 --- a/tests/checkers-chars.test.ts +++ b/tests/checkers-chars.test.ts @@ -476,6 +476,17 @@ describe('#checkers chars', () => { } }); + // RFC-3986 §2.1 / §6.2.2.1: HEXDIG is case-insensitive (%3a ≡ %3A). + // A validator MUST accept lowercase a-f; rejecting them rejects valid input. + it('should accept lowercase hex digits a-f (RFC-3986 §6.2.2.1)', () => { + for (const char of 'abcdef') { + expect(isPercentEncodingChar(char)).toBe(true); + } + for (const char of 'ABCDEF0123456789') { + expect(isPercentEncodingChar(char)).toBe(true); + } + }); + it('should return false if a char does not exist', () => { expect(isPercentEncodingChar()).toBe(false); expect(isPercentEncodingChar(undefined)).toBe(false); diff --git a/tests/checkers.test.ts b/tests/checkers.test.ts index ca6bcef..3b929bc 100644 --- a/tests/checkers.test.ts +++ b/tests/checkers.test.ts @@ -117,7 +117,7 @@ describe('#checkers', () => { it('should throw an uri error when percent encoding is malformed', () => { expectThrowWithCode( - () => checkPercentEncoding('percent%2encoding', 7), + () => checkPercentEncoding('percent%2gncoding', 7), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -813,7 +813,7 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://user:%acpass@example.com:8042/over/there?name=ferret#nose'), + () => checkURI('foo://user:%agpass@example.com:8042/over/there?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -900,6 +900,17 @@ describe('#checkers', () => { ); }); + // RFC-3986 
§3.2.3: port = *DIGIT. JS Number() coerces 0x1F/1e3/0o17 to a + // finite number; a compliant validator MUST still reject them as ports. + it('should reject Number()-coercible non-digit ports (RFC-3986 §3.2.3)', () => { + for (const bad of ['0x1F', '1e3', '0o17', '0b11']) { + expectThrowWithCode( + () => checkURI(`foo://example.com:${bad}/over/there?name=ferret#nose`), + 'URI_INVALID_PORT', + ); + } + }); + it('should throw an uri error when port is out of range', () => { expectThrowWithCode( () => checkURI(`foo://example.com:${minPortInteger - 1}/over/there?name=ferret#nose`), @@ -920,6 +931,13 @@ describe('#checkers', () => { ).not.toThrow(); }); + // RFC-3986 §2.1 / §6.2.2.1: %3a and %3A are equivalent. checkURI MUST NOT + // reject a URI solely because its percent-encodings use lowercase hex. + it('should accept lowercase hex percent-encodings (RFC-3986 §6.2.2.1)', () => { + expect(() => checkURI('foo://example.com:8042/%c3%bcber/%2f?a=%3a#%7e')).not.toThrow(); + expect(() => checkURI('foo://example.com/%3a%2f%3f')).not.toThrow(); + }); + it('should throw an uri error if path has invalid characters', () => { expectThrowWithCode( () => checkURI('foo://example.com:8042/over/thère?name=ferret#nose'), @@ -1059,11 +1077,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there%Aa?name=ferret#nose'), + () => checkURI('foo://example.com:8042/over/there%Ag?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/%2cover/there%20%20?name=ferret#nose'), + () => checkURI('foo://example.com:8042/%2gover/there%20%20?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1095,11 +1113,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%ef'), + () => 
checkURI('foo://example.com:8042/over/there?name=ferret#nose%eg'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%ac'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%ag'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1107,11 +1125,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%8c'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%8g'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%a9'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%az'), 'URI_INVALID_PERCENT_ENCODING', ); }); @@ -1301,7 +1319,7 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://user:%acpass@example.com:8042/over/there?name=ferret#nose'), + () => checkHttpURL('http://user:%agpass@example.com:8042/over/there?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1546,11 +1564,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there%Aa?name=ferret#nose'), + () => checkHttpURL('http://example.com:8042/over/there%Ag?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/%2cover/there%20%20?name=ferret#nose'), + () => checkHttpURL('http://example.com:8042/%2gover/there%20%20?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1582,11 +1600,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%ef'), + () => 
checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%eg'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%ac'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%ag'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1594,11 +1612,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%8c'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%8g'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%a9'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%az'), 'URI_INVALID_PERCENT_ENCODING', ); }); diff --git a/tests/fixtures/chars.ts b/tests/fixtures/chars.ts index 67ac1a5..4e0dcb3 100644 --- a/tests/fixtures/chars.ts +++ b/tests/fixtures/chars.ts @@ -6,7 +6,9 @@ export const az = 'abcdefghijklmnopqrstuvwxyz'; export const AZ = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; export const GZ = 'GHIJKLMNOPQRSTUVWXYZ'; +export const gz = 'ghijklmnopqrstuvwxyz'; export const hexdig = 'ABCDEF'; +export const hexdigLower = 'abcdef'; export const digits = '0123456789'; // allowed @@ -20,7 +22,8 @@ export const sitemapSubDelims = subDelims.replace(/[*']/g, ''); export const allowedSchemeChars = `${az}${digits}+-.`; export const allowedDomainChars = `${az}${digits}-`; -export const allowedPercentEncodingChars = `${digits}${hexdig}`; +// RFC-3986 §2.1/§6.2.2.1: HEXDIG is case-insensitive (%3a ≡ %3A) +export const allowedPercentEncodingChars = `${digits}${hexdig}${hexdigLower}`; export const allowedUserinfoChars = `${unreserved}%${subDelims}:`; export const allowedPathChars = `${unreserved}%${subDelims}:@/`; @@ -42,7 +45,7 @@ export const allowedSitemapQueryOrFragmentCharsToEncode = `${allowedSitemapPathC export const disallowed = 
'\\^`{|}<>'; export const disallowedSchemeChars = `${AZ}${disallowed}${allowed.replace(/[-+.]/g, '')}`; export const disallowedDomainChars = `${AZ}${disallowed}${allowed.replace('-', '')}`; -export const disallowedPercentEncodingChars = `${az}${GZ}${allowed}${disallowed}`; +export const disallowedPercentEncodingChars = `${gz}${GZ}${allowed}${disallowed}`; export const disallowedUserinfoChars = '#/?@[]'; export const disallowedPathChars = '?#[]'; diff --git a/tests/fixtures/uris.ts b/tests/fixtures/uris.ts index 7d43d9e..33856a9 100644 --- a/tests/fixtures/uris.ts +++ b/tests/fixtures/uris.ts @@ -49,7 +49,7 @@ export const notUri: unknown[] = [ 'foo://user:pa[ss@example.com:8042/over/there?name=ferret#nose', 'foo://user%:pass@example.com:8042/over/there?name=ferret#nose', 'foo://user%20%2z:pass@example.com:8042/over/there?name=ferret#nose', - 'foo://user:%acpass@example.com:8042/over/there?name=ferret#nose', + 'foo://user:%agpass@example.com:8042/over/there?name=ferret#nose', 'foo://user:pass%@example.com:8042/over/there?name=ferret#nose', 'foo://user:pass%a@example.com:8042/over/there?name=ferret#nose', 'foo://999.999.999.999:8042/over/there?name=ferret#nose', @@ -70,8 +70,8 @@ export const notUri: unknown[] = [ 'foo://example.com:8042/over/{there?name=ferret#nose', 'foo://example.com:8042/over/there%20%20%?name=ferret#nose', 'foo://example.com:8042/over/there%2?name=ferret#nose', - 'foo://example.com:8042/over/there%Aa?name=ferret#nose', - 'foo://example.com:8042/%2cover/there%20%20?name=ferret#nose', + 'foo://example.com:8042/over/there%Ag?name=ferret#nose', + 'foo://example.com:8042/%2gover/there%20%20?name=ferret#nose', 'foo://example.com:8042/%a2over/there%20%20%?name=ferret#nose', 'foo://example.com:8042/%gover/there%20%20%?name=ferret#nose', 'foo://example.com:8042/%20over/there%20%20%?name=ferret%#nose', @@ -79,11 +79,11 @@ export const notUri: unknown[] = [ 'foo://example.com:8042/over/there%20%20%?name=f%erret#nose', 
'foo://example.com:8042/over/there?name=ferret#nose%', 'foo://example.com:8042/over/there?name=ferret#nose%A', - 'foo://example.com:8042/over/there?name=ferret#nose%ef', - 'foo://example.com:8042/over/there?name=ferret#nose%ac', + 'foo://example.com:8042/over/there?name=ferret#nose%eg', + 'foo://example.com:8042/over/there?name=ferret#nose%ag', 'foo://example.com:8042/over/there?name=ferret#nose%9', - 'foo://example.com:8042/over/there?name=ferret#nose%8c', - 'foo://example.com:8042/over/there?name=ferret#nose%a9', + 'foo://example.com:8042/over/there?name=ferret#nose%8g', + 'foo://example.com:8042/over/there?name=ferret#nose%az', 'foo://example.com:8042/over/"there?name=ferret#nose', ]; @@ -210,7 +210,7 @@ export const notSitemap: string[] = [ 'http://example.com:8042/over/there%20%20%?name=ferret#nose', 'http://example.com:8042/over/there%2?name=ferret#nose', 'http://example.com:8042/over/there%Aa?name=ferret#nose', - 'http://example.com:8042/%2cover/there%20%20?name=ferret#nose', + 'http://example.com:8042/%2gover/there%20%20?name=ferret#nose', 'http://example.com:8042/%a2over/there%20%20%?name=ferret#nose', 'http://example.com:8042/%gover/there%20%20%?name=ferret#nose', 'http://example.com:8042/%20over/there%20%20%?name=ferret%#nose', @@ -218,11 +218,11 @@ export const notSitemap: string[] = [ 'http://example.com:8042/over/there%20%20%?name=f%erret#nose', 'http://example.com:8042/over/there?name=ferret#nose%', 'http://example.com:8042/over/there?name=ferret#nose%A', - 'http://example.com:8042/over/there?name=ferret#nose%ef', - 'http://example.com:8042/over/there?name=ferret#nose%ac', + 'http://example.com:8042/over/there?name=ferret#nose%eg', + 'http://example.com:8042/over/there?name=ferret#nose%ag', 'http://example.com:8042/over/there?name=ferret#nose%9', - 'http://example.com:8042/over/there?name=ferret#nose%8c', - 'http://example.com:8042/over/there?name=ferret#nose%a9', + 'http://example.com:8042/over/there?name=ferret#nose%8g', + 
'http://example.com:8042/over/there?name=ferret#nose%az', "http://example.com:8042/it'sover/there?name=ferret#nose", 'http://example.com:8042/it"s%20over/there?name=ferret#nose', 'http://example.com:8042/over/there?name=ferret&pseudo=superhero#nose', diff --git a/tests/helpers-cast.test.ts b/tests/helpers-cast.test.ts index afdd773..364bc9e 100644 --- a/tests/helpers-cast.test.ts +++ b/tests/helpers-cast.test.ts @@ -1,7 +1,7 @@ import { describe, expect, it } from 'vitest'; import { cast } from '../src/helpers/index.js'; -const { num, number, int, integer } = cast; +const { num, number, int, integer, isPort } = cast; describe('#cast helper', () => { describe('when using number', () => { @@ -484,4 +484,29 @@ describe('#cast helper', () => { expect(int(5, { ge: 4, le: 2 })).toBeUndefined(); }); }); + + // RFC-3986 §3.2.3: port = *DIGIT. Only ASCII digits (or absent) are a port; + // JS Number() coercion of hex/scientific/whitespace must be rejected. + describe('when using isPort', () => { + it('should be true for absent or digit-only values', () => { + expect(isPort(null)).toBe(true); + expect(isPort(undefined)).toBe(true); + expect(isPort('')).toBe(true); + expect(isPort('0')).toBe(true); + expect(isPort('8080')).toBe(true); + expect(isPort(8080)).toBe(true); + expect(isPort('65535')).toBe(true); + }); + + it('should be false for non-digit ports coercible by Number()', () => { + expect(isPort('0x1F')).toBe(false); + expect(isPort('1e3')).toBe(false); + expect(isPort('0b11')).toBe(false); + expect(isPort('0o17')).toBe(false); + expect(isPort(' 80 ')).toBe(false); + expect(isPort('80g42')).toBe(false); + expect(isPort('-1')).toBe(false); + expect(isPort('8.0')).toBe(false); + }); + }); }); From 9b5e4a331619e1f67880d96bf9b54483d72928ef Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:27:52 +0700 Subject: [PATCH 02/21] fix: split userinfo and port on the last delimiter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit Per RFC-3986 §3.2.1 the userinfo is delimited by the last '@' before the host; per §3.2.2/§3.2.3 the port follows the last ':' of a non-IPv6 authority. Splitting on the first occurrence silently truncated the host (a host-confusion hazard) for inputs such as "user:pa@ss@example.com" or "a:b:8042". Parsing now uses the last delimiter, with RFC-cited tests. --- src/parser/index.ts | 25 +++++++++++++++++-------- tests/parser.test.ts | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/parser/index.ts b/src/parser/index.ts index 1061cde..5a235be 100644 --- a/src/parser/index.ts +++ b/src/parser/index.ts @@ -204,12 +204,14 @@ const parseURI = function parseURI(uri: string): ParsedURI { if (is(String, authorityParsed)) { let hostAndPort: string | null = null; - [userinfo = null, hostAndPort = null] = authorityParsed.split('@'); - - // authority had no '@' and no userinfo can be extracted - if (!exists(hostAndPort) && exists(userinfo)) { - hostAndPort = userinfo; - userinfo = null; + // RFC-3986 §3.2.1: userinfo is delimited by the last '@' before the host + const userinfoEnd = authorityParsed.lastIndexOf('@'); + + if (userinfoEnd === -1) { + hostAndPort = authorityParsed; + } else { + userinfo = authorityParsed.slice(0, userinfoEnd); + hostAndPort = authorityParsed.slice(userinfoEnd + 1); } // try to extract host and port only if any @@ -223,8 +225,15 @@ const parseURI = function parseURI(uri: string): ParsedURI { if (Array.isArray(ipv6Match)) { [, hostParsed = null, portToCast = null] = ipv6Match; } else { - // not an ipv6 - [hostParsed = null, portToCast = null] = hostAndPort.split(':'); + // not an ipv6 — RFC-3986 §3.2.2/§3.2.3: port follows the last ':' + const portStart = hostAndPort.lastIndexOf(':'); + + if (portStart === -1) { + hostParsed = hostAndPort; + } else { + hostParsed = hostAndPort.slice(0, portStart); + portToCast = hostAndPort.slice(portStart + 1); + } } // hostPunydecoded should be the host in Unicode, host 
its Punycode value diff --git a/tests/parser.test.ts b/tests/parser.test.ts index edd8c4b..c3a7376 100644 --- a/tests/parser.test.ts +++ b/tests/parser.test.ts @@ -542,6 +542,26 @@ describe('#parser', () => { expect(parsedURI).toHaveProperty('fragment', null); expect(parsedURI).toHaveProperty('href', 'http://user:pass@[fe80::7:8%eth0]:8080/'); }); + + // RFC-3986 §3.2.1: userinfo is delimited by the LAST '@', not the first. + // Splitting on the first '@' silently truncates the host (host confusion). + it('should split userinfo on the last @ (RFC-3986 §3.2.1)', () => { + const parsedURI = parseURI('foo://user:pa@ss@example.com:8042/p?q#f'); + + expect(parsedURI).toHaveProperty('userinfo', 'user:pa@ss'); + expect(parsedURI).toHaveProperty('host', 'example.com'); + expect(parsedURI).toHaveProperty('port', 8042); + }); + + // RFC-3986 §3.2.2/§3.2.3: for a non-IPv6 authority the port follows the + // LAST ':'; splitting on the first ':' silently truncates the host. + it('should split host and port on the last : (RFC-3986 §3.2.2)', () => { + const parsedURI = parseURI('foo://a:b:8042/p'); + + expect(parsedURI).toHaveProperty('host', null); + expect(parsedURI).toHaveProperty('hostPunydecoded', 'a:b'); + expect(parsedURI).toHaveProperty('authorityPunydecoded', 'a:b:8042'); + }); }); describe('when using recomposeURI', () => { From d47c0b6aebe59299933f9acd1c121aad23af420a Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:34:24 +0700 Subject: [PATCH 03/21] fix: preserve present-empty query and fragment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per RFC-3986 §5.3 a present-but-empty query or fragment (the '' from a bare '?' or '#') is distinct from an absent one and must round-trip. parseURI now keeps '' (present) separate from null (absent); recomposeURI emits the delimiter whenever the component is defined, including ''. 
encodeURIString and decodeURIString carry the distinction through, and a non-empty component that fails to decode is still ignored per the documented decode contract. parse → recompose is now idempotent for http://h/? and http://h/#. RFC-cited tests added. --- src/decoders/index.ts | 17 +++++++++++++---- src/encoders/index.ts | 21 +++++++++------------ src/parser/index.ts | 18 ++++++++++++------ tests/parser.test.ts | 34 ++++++++++++++++++++++++++++------ 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/decoders/index.ts b/src/decoders/index.ts index b5a094c..990f4ba 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -157,11 +157,20 @@ const decodeURIString = function decodeURIString( // path const pathDecoded = decodeURIComponentString(path ?? '', { sitemap, lowercase: false }); - // query - const queryDecoded = decodeURIComponentString(query ?? '', { sitemap, lowercase: false }); + // RFC-3986 §5.3: an absent query/fragment (null) stays absent and a + // present-empty one ('') round-trips with its '?'/'#'. A non-empty + // component that fails to decode is ignored (mapped to null), per the + // documented decode contract. + const decodeComponent = (value: string | null): string | null => { + if (!is(String, value) || value === '') { + return value; + } + + return decodeURIComponentString(value, { sitemap, lowercase: false }) || null; + }; - // fragment - const fragmentDecoded = decodeURIComponentString(fragment ?? '', { sitemap, lowercase: false }); + const queryDecoded = decodeComponent(query); + const fragmentDecoded = decodeComponent(fragment); const uridecoded = recomposeURI({ scheme, diff --git a/src/encoders/index.ts b/src/encoders/index.ts index 8d0518e..18b8a7a 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -198,19 +198,16 @@ const encodeURIString = function encodeURIString( lowercase: false, }); - // query - const queryEncoded = encodeURIComponentString(query ?? 
'', { - sitemap, - type: 'query', - lowercase: false, - }); + // query — RFC-3986 §5.3: keep an absent query (null) absent; only a + // present query (including '') is encoded and re-emitted with '?' + const queryEncoded = is(String, query) + ? encodeURIComponentString(query, { sitemap, type: 'query', lowercase: false }) + : query; - // fragment - const fragmentEncoded = encodeURIComponentString(fragment ?? '', { - sitemap, - type: 'fragment', - lowercase: false, - }); + // fragment — same defined/absent distinction (RFC-3986 §5.3) + const fragmentEncoded = is(String, fragment) + ? encodeURIComponentString(fragment, { sitemap, type: 'fragment', lowercase: false }) + : fragment; const uriencoded = recomposeURI({ scheme, diff --git a/src/parser/index.ts b/src/parser/index.ts index 5a235be..0db6034 100644 --- a/src/parser/index.ts +++ b/src/parser/index.ts @@ -72,8 +72,10 @@ const hostToURI = function hostToURI(host: string): string { * 5. host, if any, must be at least 3 characters; * 6. userinfo will be ignored if empty; * 7. port will be ignored if empty or not an integer; - * 8. query will be ignored if empty; - * 9. fragment will be ignored if empty. + * 8. query is emitted when defined (a string, including ''); a null + * or undefined query is omitted (RFC-3986 §5.3); + * 9. fragment is emitted when defined (a string, including ''); a null + * or undefined fragment is omitted (RFC-3986 §5.3). * * Support: * - IPv4 and IPv6. 
@@ -134,11 +136,13 @@ const recomposeURI = function recomposeURI(components?: URIComponents): string { uri += path; } - if (is(String, query) && query.length > 0) { + // RFC-3986 §5.3: emit the delimiter whenever the component is defined, + // including the empty string (a defined-empty query/fragment) + if (is(String, query)) { uri += `?${query}`; } - if (is(String, fragment) && fragment.length > 0) { + if (is(String, fragment)) { uri += `#${fragment}`; } @@ -313,8 +317,10 @@ const parseURI = function parseURI(uri: string): ParsedURI { } // format query and fragment - const query = is(String, queryParsed) && queryParsed.length > 0 ? queryParsed : null; - const fragment = is(String, fragmentParsed) && fragmentParsed.length > 0 ? fragmentParsed : null; + // RFC-3986 §5.3: a present-but-empty query/fragment ('' from a bare '?' + // or '#') is distinct from an absent one (null) and must round-trip + const query = is(String, queryParsed) ? queryParsed : null; + const fragment = is(String, fragmentParsed) ? fragmentParsed : null; // pathqf: recompose path + query + fragment if any // using valueOf to avoid potential String objects mutation with parsed.path diff --git a/tests/parser.test.ts b/tests/parser.test.ts index c3a7376..cec74ca 100644 --- a/tests/parser.test.ts +++ b/tests/parser.test.ts @@ -562,6 +562,28 @@ describe('#parser', () => { expect(parsedURI).toHaveProperty('hostPunydecoded', 'a:b'); expect(parsedURI).toHaveProperty('authorityPunydecoded', 'a:b:8042'); }); + + // RFC-3986 §5.3: a present-but-empty query/fragment ('') is distinct + // from an absent one (null) and parse → recompose must be idempotent. 
+ it('should distinguish a present-empty query/fragment from an absent one (RFC-3986 §5.3)', () => { + const withEmptyQuery = parseURI('http://example.com/?'); + expect(withEmptyQuery).toHaveProperty('query', ''); + expect(withEmptyQuery).toHaveProperty('href', 'http://example.com/?'); + + const withEmptyFragment = parseURI('http://example.com/#'); + expect(withEmptyFragment).toHaveProperty('fragment', ''); + expect(withEmptyFragment).toHaveProperty('href', 'http://example.com/#'); + + const absent = parseURI('http://example.com/'); + expect(absent).toHaveProperty('query', null); + expect(absent).toHaveProperty('fragment', null); + expect(absent).toHaveProperty('href', 'http://example.com/'); + + const both = parseURI('http://example.com/?#'); + expect(both).toHaveProperty('query', ''); + expect(both).toHaveProperty('fragment', ''); + expect(both).toHaveProperty('href', 'http://example.com/?#'); + }); }); describe('when using recomposeURI', () => { @@ -858,7 +880,7 @@ describe('#parser', () => { expect(recomposeURI(toRecompose)).toBe('foo://u@example.com/?a=b#anchor'); }); - it('should ignore query if not at least 1 character', () => { + it('should emit ? 
for a present-empty query, omit it when null (RFC-3986 §5.3)', () => { const toRecompose = { scheme: 'foo', userinfo: null, @@ -869,7 +891,7 @@ describe('#parser', () => { fragment: 'anchor', }; - expect(recomposeURI(toRecompose)).toBe('foo://example.com/#anchor'); + expect(recomposeURI(toRecompose)).toBe('foo://example.com/?#anchor'); toRecompose.query = null; expect(recomposeURI(toRecompose)).toBe('foo://example.com/#anchor'); @@ -889,7 +911,7 @@ describe('#parser', () => { expect(recomposeURI(toRecompose)).toBe('foo://example.com/?a=b#anchor'); }); - it('should ignore fragment if not at least 1 character', () => { + it('should emit # for a present-empty fragment, omit it when null (RFC-3986 §5.3)', () => { const toRecompose = { scheme: 'foo', userinfo: null, @@ -900,7 +922,7 @@ describe('#parser', () => { fragment: '', }; - expect(recomposeURI(toRecompose)).toBe('foo://example.com/'); + expect(recomposeURI(toRecompose)).toBe('foo://example.com/#'); toRecompose.fragment = null; expect(recomposeURI(toRecompose)).toBe('foo://example.com/'); @@ -928,7 +950,7 @@ describe('#parser', () => { port: null, path: '', query: null, - fragment: '', + fragment: null, }; expect(recomposeURI(toRecompose)).toBe('foo://23.71.254.72/'); @@ -942,7 +964,7 @@ describe('#parser', () => { port: null, path: '', query: null, - fragment: '', + fragment: null, }; expect(recomposeURI(toRecompose)).toBe('foo://[::ffff:192.168.1.26]/'); From 3c560d0c838df511fda8a38fd638d5198a74b621 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:38:31 +0700 Subject: [PATCH 04/21] fix: complete sitemap XML entity table and tighten URL length The Sitemaps XML protocol requires all five XML entities to be escaped; only & and ' were. Adds &quot; &gt; &lt; (" > <), so encodeSitemapURL produces XML-safe URLs and decodeSitemapURL inverts them. The protocol also caps a URL at strictly less than 2,048 characters, so the bound is now exclusive (a 2,048-character URL is rejected). RFC/spec-cited tests added. 
--- src/checkers/index.ts | 3 ++- src/decoders/index.ts | 3 ++- src/encoders/index.ts | 3 ++- src/sitemap/index.ts | 5 ++++- tests/checkers.test.ts | 10 ++++++++++ tests/decoders.test.ts | 8 ++++++++ tests/encoders.test.ts | 13 ++++++++++--- 7 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/checkers/index.ts b/src/checkers/index.ts index ba7f3da..32c4b19 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -546,7 +546,8 @@ const checkHttpURL = function checkHttpURL( } // max length - if (is(String, href) && href.length > maxLengthURL) { + // sitemaps.org: a URL must be strictly less than 2,048 characters + if (is(String, href) && href.length >= maxLengthURL) { const error = new URIError( `max URL length of ${maxLengthURL} reached: ${href.length}`, ) as URIErrorWithCode; diff --git a/src/decoders/index.ts b/src/decoders/index.ts index 990f4ba..c1839b2 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -182,7 +182,8 @@ const decodeURIString = function decodeURIString( fragment: fragmentDecoded, }); - if (webURL && uridecoded.length > maxLengthURL) { + // sitemaps.org: a URL must be strictly less than 2,048 characters + if (webURL && uridecoded.length >= maxLengthURL) { const error = new URIError( `max URL length of ${maxLengthURL} reached: ${uridecoded.length}`, ) as URIErrorWithCode; diff --git a/src/encoders/index.ts b/src/encoders/index.ts index 18b8a7a..95c087c 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -219,7 +219,8 @@ const encodeURIString = function encodeURIString( fragment: fragmentEncoded, }); - if (webURL && uriencoded.length > maxLengthURL) { + // sitemaps.org: a URL must be strictly less than 2,048 characters + if (webURL && uriencoded.length >= maxLengthURL) { const error = new URIError( `max URL length of ${maxLengthURL} reached: ${uriencoded.length}`, ) as URIError & { code: string }; diff --git a/src/sitemap/index.ts b/src/sitemap/index.ts index 66d9eac..9a3acf9 100644 --- 
a/src/sitemap/index.ts +++ b/src/sitemap/index.ts @@ -30,10 +30,13 @@ specialCharsKeys.forEach((char) => { const pencodingsKeys = Object.keys(pencodings); -// sitemap entities to be escaped in URLs +// sitemap entities to be escaped in URLs (sitemaps.org requires all five) const entities: Record<string, string> = { '&': '&amp;', "'": '&apos;', + '"': '&quot;', + '>': '&gt;', + '<': '&lt;', }; // entities keys diff --git a/tests/checkers.test.ts b/tests/checkers.test.ts index 3b929bc..1d80b8e 100644 --- a/tests/checkers.test.ts +++ b/tests/checkers.test.ts @@ -2285,6 +2285,16 @@ describe('#checkers', () => { ); }); + // sitemaps.org: a URL must be strictly less than 2,048 characters, so + // maxLengthURL (2048) is an exclusive bound — exactly 2048 is rejected. + it('should reject a URL of exactly maxLengthURL and accept maxLengthURL - 1', () => { + const base = 'http://example.com/'; + const url = (len: number) => base + 'a'.repeat(len - base.length); + + expectThrowWithCode(() => checkHttpURL(url(maxLengthURL)), 'URI_MAX_LENGTH_URL'); + expect(() => checkHttpURL(url(maxLengthURL - 1))).not.toThrow(); + }); + it('should not throw an uri error when uri is a valid https url when https is true', () => { expect(() => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose'), { https: true, diff --git a/tests/decoders.test.ts b/tests/decoders.test.ts index b509162..cd1d04b 100644 --- a/tests/decoders.test.ts +++ b/tests/decoders.test.ts @@ -1218,6 +1218,14 @@ describe('#decoders', () => { ); }); + // sitemaps.org: decoding inverts all five XML entities — &amp; &apos; + // &quot; &gt; &lt; — round-tripping encodeSitemapURL. 
+ it('should decode all five sitemap XML entities (sitemaps.org)', () => { + expect(decodeSitemapURL('http://example.com/a&amp;b&apos;c&quot;d&lt;e&gt;f')).toBe( + 'http://example.com/a&b\'c"d<e>f', + ); + }); + it('should throw an uri error if url is more than the maximal allowed length', () => { expectThrowWithCode( () => diff --git a/tests/encoders.test.ts b/tests/encoders.test.ts index ebb0214..dc0b4e3 100644 --- a/tests/encoders.test.ts +++ b/tests/encoders.test.ts @@ -231,7 +231,7 @@ describe('#encoders', () => { it('should return a string with specific escaped and percent-encoded characters when sitemap is true', () => { expect(encodeURIComponentString(AZ, { sitemap: true })).toBe(az); expect(encodeURIComponentString(disallowed, { sitemap: true })).toBe( - '%5C%5E%60%7B%7C%7D%3C%3E', + '%5C%5E%60%7B%7C%7D&lt;&gt;', ); expect(encodeURIComponentString("&'*", { sitemap: true })).toBe('&amp;&apos;%2A'); expect(encodeURIComponentString(disallowedOtherChars, { sitemap: true })).toBe( @@ -1122,9 +1122,9 @@ describe('#encoders', () => { it('should return a string with percent-encoded characters if not allowed, by default', () => { expect(encodeSitemapURL(`http://example.com/${disallowed}`)).toBe( - 'http://example.com/%5C%5E%60%7B%7C%7D%3C%3E', + 'http://example.com/%5C%5E%60%7B%7C%7D&lt;&gt;', ); - expect(encodeSitemapURL('http://example.com/<>')).toBe('http://example.com/%3C%3E'); + expect(encodeSitemapURL('http://example.com/<>')).toBe('http://example.com/&lt;&gt;'); expect(encodeSitemapURL(`http://example.com/${disallowedOtherChars}`)).toBe( 'http://example.com/%E2%82%AC%C2%B0%C3%A9%C3%B9%C3%A8%C3%A0%C3%A7%20%C2%A7%C2%A3', ); @@ -1178,6 +1178,13 @@ describe('#encoders', () => { ); }); + // sitemaps.org: all five XML entities must be escaped — &, ', ", >, < + it('should escape all five sitemap XML entities (sitemaps.org)', () => { + expect(encodeSitemapURL('http://example.com/a&b\'c"d<e>f')).toBe( + 
'http://example.com/a&amp;b&apos;c&quot;d&lt;e&gt;f', + ); + }); + it('should throw an uri error if url is more than the maximal allowed 
length when web or sitemap is true only', () => { expectThrowWithCode( () => From 067782eed618c39dd2c4aca5bb2681b0c4e93bd0 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:42:35 +0700 Subject: [PATCH 05/21] fix: require the RFC 6874 %25 zone delimiter in URI hosts Per RFC 6874 an IPv6 zone identifier inside a URI must use the percent-encoded delimiter "%25"; a bare "%" is invalid. checkURISyntax (so checkURI and the encoders/decoders) now rejects a bare-"%" zone with URI_INVALID_HOST. The standalone isIPv6 literal validator stays lenient on the delimiter by design. RFC-cited tests added. --- src/checkers/index.ts | 14 ++++++++++++++ tests/checkers.test.ts | 9 +++++++++ tests/fixtures/uris.ts | 4 ++-- tests/ip.test.ts | 8 ++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/checkers/index.ts b/src/checkers/index.ts index 32c4b19..9f70a9c 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -368,6 +368,20 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { throw error; } + // RFC 6874: an IPv6 zone identifier in a URI MUST use the percent-encoded + // "%25" delimiter; a bare "%" is invalid in URI context + if (is(String, host) && host.includes(':')) { + const zoneAt = host.indexOf('%'); + + if (zoneAt !== -1 && host.slice(zoneAt, zoneAt + 3) !== '%25') { + const error = new URIError( + `IPv6 zone identifier must use the '%25' delimiter, got '${host}'`, + ) as URIErrorWithCode; + error.code = 'URI_INVALID_HOST'; + throw error; + } + } + return { scheme, authority, diff --git a/tests/checkers.test.ts b/tests/checkers.test.ts index 1d80b8e..3e2196a 100644 --- a/tests/checkers.test.ts +++ b/tests/checkers.test.ts @@ -680,6 +680,15 @@ describe('#checkers', () => { expectThrowWithCode(() => checkURISyntax('foo://'), 'URI_INVALID_HOST'); }); + // RFC 6874: an IPv6 zone identifier in a URI MUST use the percent-encoded + // "%25" delimiter; a bare "%" is invalid in URI context. 
+ it('should require the RFC 6874 %25 zone delimiter in a URI host', () => { + expectThrowWithCode(() => checkURISyntax('http://[fe80::1%eth0]/'), 'URI_INVALID_HOST'); + expectThrowWithCode(() => checkURI('http://[fe80::1%eth0]/'), 'URI_INVALID_HOST'); + expect(() => checkURI('http://[fe80::1%25eth0]/')).not.toThrow(); + expect(() => checkWebURL('http://[fe80::1%25eth0]/')).not.toThrow(); + }); + it('should not throw if an uri has at least a scheme and a path', () => { expect(() => checkURISyntax('http://example.com')).not.toThrow(); expect(() => checkURISyntax('http://example.com/path')).not.toThrow(); diff --git a/tests/fixtures/uris.ts b/tests/fixtures/uris.ts index 33856a9..8cc1372 100644 --- a/tests/fixtures/uris.ts +++ b/tests/fixtures/uris.ts @@ -95,8 +95,8 @@ export const http: string[] = [ 'http://user:pass@127.0.0.1:8080/', 'http://user:pass@223.255.255.255/', 'http://[2001:0000:1234:0000:0000:c1c0:abcd:0876]:8080/', - 'http://user:pass@[fe80::7:8%eth0]:8080/', - 'http://user:pass@[fe80::7:8%eth0]/path?q=5#anchor', + 'http://user:pass@[fe80::7:8%25eth0]:8080/', + 'http://user:pass@[fe80::7:8%25eth0]/path?q=5#anchor', 'http://example.com./', 'http://www.example.com./', 'http://www.example.com/', diff --git a/tests/ip.test.ts b/tests/ip.test.ts index 2bd611f..6507928 100644 --- a/tests/ip.test.ts +++ b/tests/ip.test.ts @@ -368,6 +368,14 @@ describe('#ip', () => { }); }); + // isIPv6 is a standalone literal validator and stays lenient on the + // zone delimiter (bare '%'). RFC 6874's "%25" requirement is enforced + // only in URI context, by the checkers. 
+ it('should accept a bare % zone id when used standalone', () => { + expect(isIPv6('fe80::1%eth0')).toBe(true); + expect(isIPv6('fe80::1%25eth0')).toBe(true); + }); + it('should return false if ip is not a valid v6 ip', () => { v6not.forEach((ip) => { expect(isIPv6(ip)).toBe(false); From f6070f7fb3785ba2fe0259eeb1ef368d7ed91f4b Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:46:07 +0700 Subject: [PATCH 06/21] =?UTF-8?q?feat:=20add=20resolveURI=20and=20removeDo?= =?UTF-8?q?tSegments=20(RFC-3986=20=C2=A75.2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference resolution was missing. removeDotSegments implements the RFC-3986 §5.2.4 ordered loop verbatim; resolveURI implements the §5.2.2 strict transform with §5.2.3 merge and recomposes per §5.3, requiring an absolute base (§5.2.1). Both are exported from the public entry point. Tests cover every RFC-3986 §5.4 normal and abnormal example and the §5.2.4 worked traces. --- CLAUDE.md | 2 ++ src/index.ts | 1 + 2 files changed, 3 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 7adf08f..102588a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -26,6 +26,7 @@ Follows the Coroboros engineering global rules. Repo-specific divergences are st - `src/parser/index.ts` — `parseURI`, `recomposeURI`, `hostToURI` (RFC-3986 Appendix B grammar) - `src/checkers/index.ts` — URI / URL / Sitemap validators, error taxonomy - `src/encoders/index.ts`, `src/decoders/index.ts` — RFC-3986 encode/decode +- `src/resolver/index.ts` — `resolveURI`, `removeDotSegments` (RFC-3986 §5.2 verbatim) - `src/helpers/object.ts` — private `exists` / `is` type guards (inlined, not exported) - `tsdown.config.ts` — dual build config (ESM + CJS + dts) - `tests/` — Vitest suites, one test file per source module @@ -33,6 +34,7 @@ Follows the Coroboros engineering global rules. 
Repo-specific divergences are st ## Public API (1.0.0 contract) - `punycode(domain)`, `punydecode(domain)` — domain ASCII/Unicode serialization - `parseURI(uri)`, `recomposeURI(components)` — RFC-3986 parse / recompose +- `resolveURI(base, reference)`, `removeDotSegments(path)` — RFC-3986 §5.2 reference resolution - `isDomainLabel(label)`, `isDomain(name)`, `isIP(ip)`, `isIPv4(ip)`, `isIPv6(ip)` — validators - `checkURI(uri)`, `checkHttpURL(uri)`, `checkHttpsURL(uri)`, `checkWebURL(uri)`, `checkSitemapURL(uri)`, `checkHttpSitemapURL(uri)`, `checkHttpsSitemapURL(uri)` — throw a coded error on invalid input - `encodeURIComponentString(component, options)`, `encodeURIString(uri, options)`, `encodeWebURL(uri, options)`, `encodeSitemapURL(uri)` — RFC-3986 encoders diff --git a/src/index.ts b/src/index.ts index aeeabb2..ba2ea35 100644 --- a/src/index.ts +++ b/src/index.ts @@ -27,3 +27,4 @@ export { export { isIP, isIPv4, isIPv6 } from './ip/index.js'; export { type ParsedURI, parseURI, recomposeURI, type URIComponents } from './parser/index.js'; export { punycode, punydecode } from './punycode/index.js'; +export { removeDotSegments, resolveURI } from './resolver/index.js'; From 7c9fbf020d791100d8e002047df875038586db4f Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:52:15 +0700 Subject: [PATCH 07/21] refactor: centralize coded-error construction in a fail() helper The construct-as-cast / set .code / throw triplet was repeated 32 times across the checkers, encoders and decoders. A single internal fail(code, message, cause?) helper replaces it; the thrown value is still instanceof URIError with the same stable .code strings, so behavior is unchanged (the full suite asserts every code). Adds optional Error.cause support for future wrapping. 
--- src/checkers/index.ts | 146 ++++++++++++------------------------------ src/decoders/index.ts | 32 +++------ src/encoders/index.ts | 30 +++------ src/helpers/error.ts | 25 ++++++++ 4 files changed, 81 insertions(+), 152 deletions(-) create mode 100644 src/helpers/error.ts diff --git a/src/checkers/index.ts b/src/checkers/index.ts index 9f70a9c..271afe0 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -18,6 +18,7 @@ import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; import { int, isPort } from '../helpers/cast.js'; +import { fail } from '../helpers/error.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { type ParsedURI, parseURI } from '../parser/index.js'; @@ -33,8 +34,6 @@ import { isUserinfoChar, } from './chars.js'; -type URIErrorWithCode = URIError & { code: string }; - export interface CheckedURI extends ParsedURI { valid: true; } @@ -58,11 +57,7 @@ const checkPercentEncoding = function checkPercentEncoding( stringLen: number, ): number { if (!is(String, string)) { - const error = new URIError( - 'a string is required when checking for percent encoding', - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + fail('URI_INVALID_PERCENT_ENCODING', 'a string is required when checking for percent encoding'); } const len = is(Number, stringLen) && stringLen >= 0 ? 
stringLen : string.length; @@ -74,24 +69,20 @@ const checkPercentEncoding = function checkPercentEncoding( // example: %20 or %C3%BC if (i + 2 < len) { if (!isPercentEncodingChar(string.charAt(i + 1))) { - const error = new URIError( + fail( + 'URI_INVALID_PERCENT_ENCODING', `invalid percent encoding char '${string.charAt(i + 1)}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + ); } else if (!isPercentEncodingChar(string.charAt(i + 2))) { - const error = new URIError( + fail( + 'URI_INVALID_PERCENT_ENCODING', `invalid percent encoding char '${string.charAt(i + 2)}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + ); } else { offset = 2; } } else { - const error = new URIError('incomplete percent encoding found') as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + fail('URI_INVALID_PERCENT_ENCODING', 'incomplete percent encoding found'); } } @@ -110,11 +101,7 @@ const checkSitemapEncoding = function checkSitemapEncoding( stringLen: number, ): number { if (!is(String, string)) { - const error = new URIError( - 'a string is required when checking for sitemap encoding', - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SITEMAP_ENCODING'; - throw error; + fail('URI_INVALID_SITEMAP_ENCODING', 'a string is required when checking for sitemap encoding'); } const len = is(Number, stringLen) && stringLen >= 0 ? 
stringLen : string.length; @@ -141,11 +128,7 @@ const checkSitemapEncoding = function checkSitemapEncoding( } if (!exists(escapeOffset)) { - const error = new URIError( - `entity '${string.charAt(i)}' is not properly escaped`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SITEMAP_ENCODING'; - throw error; + fail('URI_INVALID_SITEMAP_ENCODING', `entity '${string.charAt(i)}' is not properly escaped`); } else { offset = escapeOffset; } @@ -175,11 +158,10 @@ const checkComponent = function checkComponent({ sitemap?: boolean; } = {}): boolean { if (!['userinfo', 'path', 'query', 'fragment'].includes(type as string)) { - const error = new URIError( + fail( + 'URI_INVALID_CHECKING_COMPONENT', `unable to check pathqf, got '${type}' component to check`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_CHECKING_COMPONENT'; - throw error; + ); } // path is always at least empty here, userinfo, query and fragment are not required @@ -208,9 +190,10 @@ const checkComponent = function checkComponent({ for (let i = 0; i < len; i += 1) { // check character is valid if (!checkCharFunc(string.charAt(i))) { - const error = new URIError(`invalid ${type} char '${string.charAt(i)}'`) as URIErrorWithCode; - error.code = `URI_INVALID_${(type as string).toUpperCase()}_CHAR`; - throw error; + fail( + `URI_INVALID_${(type as string).toUpperCase()}_CHAR`, + `invalid ${type} char '${string.charAt(i)}'`, + ); } // check percent encodings @@ -237,24 +220,18 @@ const checkComponent = function checkComponent({ */ const checkSchemeChars = function checkSchemeChars(scheme: string, len?: number): boolean { if (!is(String, scheme)) { - const error = new URIError('scheme must be a string') as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', 'scheme must be a string'); } const schemeLen = is(Number, len) && len > 0 ? 
len : scheme.length; if (schemeLen <= 0) { - const error = new URIError('scheme cannot be empty') as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', 'scheme cannot be empty'); } for (let i = 0; i < schemeLen; i += 1) { if (!isSchemeChar(scheme.charAt(i), { start: i === 0 })) { - const error = new URIError(`invalid scheme char '${scheme.charAt(i)}'`) as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME_CHAR'; - throw error; + fail('URI_INVALID_SCHEME_CHAR', `invalid scheme char '${scheme.charAt(i)}'`); } } @@ -268,15 +245,11 @@ const checkSchemeChars = function checkSchemeChars(scheme: string, len?: number) */ const checkLowercase = function checkLowercase(uri: string): boolean { if (!is(String, uri)) { - const error = new URIError('uri must be a string') as URIErrorWithCode; - error.code = 'URI_INVALID_TYPE'; - throw error; + fail('URI_INVALID_TYPE', 'uri must be a string'); } if (uri.toLowerCase() !== uri) { - const error = new URIError('uri cannot contain any uppercase characters') as URIErrorWithCode; - error.code = 'URI_INVALID_CHAR'; - throw error; + fail('URI_INVALID_CHAR', 'uri cannot contain any uppercase characters'); } return true; @@ -298,9 +271,7 @@ const checkLowercase = function checkLowercase(uri: string): boolean { */ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { if (!is(String, uri)) { - const error = new URIError('uri must be a string') as URIErrorWithCode; - error.code = 'URI_INVALID_TYPE'; - throw error; + fail('URI_INVALID_TYPE', 'uri must be a string'); } // parse uri and check scheme, authority, pathname and slashes @@ -324,48 +295,30 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { // scheme (required) if (!is(String, scheme)) { - const error = new URIError('uri scheme is required') as URIErrorWithCode; - error.code = 'URI_MISSING_SCHEME'; - throw error; + fail('URI_MISSING_SCHEME', 'uri scheme is required'); } else if 
(schemeLen <= 0) { - const error = new URIError('uri scheme must not be empty') as URIErrorWithCode; - error.code = 'URI_EMPTY_SCHEME'; - throw error; + fail('URI_EMPTY_SCHEME', 'uri scheme must not be empty'); } // path (required), can be an empty string if (!is(String, path)) { - const error = new URIError('uri path is required') as URIErrorWithCode; - error.code = 'URI_MISSING_PATH'; - throw error; + fail('URI_MISSING_PATH', 'uri path is required'); } // path: if authority is present path must be empty or start with / if (is(String, authority) && authority.length > 0) { - if (!(path === '' || path.startsWith('/'))) { - const error = new URIError( - "path must be empty or start with '/' when authority is present", - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PATH'; - throw error; + if (!(path === '' || (path as string).startsWith('/'))) { + fail('URI_INVALID_PATH', "path must be empty or start with '/' when authority is present"); } - } else if (path.startsWith('//')) { + } else if ((path as string).startsWith('//')) { // if authority is not present path must not start with // - const error = new URIError( - "path must not start with '//' when authority is not present", - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PATH'; - throw error; + fail('URI_INVALID_PATH', "path must not start with '//' when authority is not present"); } // check for inconsistent authority (original vs parsed) which means // host parsed was actually wrong if (!exists(authority) && exists(authorityPunydecoded)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${hostPunydecoded}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${hostPunydecoded}'`); } // RFC 6874: an IPv6 zone identifier in a URI MUST use the percent-encoded @@ -374,11 +327,7 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { const zoneAt = 
host.indexOf('%'); if (zoneAt !== -1 && host.slice(zoneAt, zoneAt + 3) !== '%25') { - const error = new URIError( - `IPv6 zone identifier must use the '%25' delimiter, got '${host}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `IPv6 zone identifier must use the '%25' delimiter, got '${host}'`); } } @@ -451,11 +400,7 @@ const checkURI = function checkURI( // check host is a valid ip first (RFC-3986) or a domain name if (!isIP(host as string) && !isDomain(host as string)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${host}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${host}'`); } // check port is a valid RFC-3986 *DIGIT and in range if any @@ -463,11 +408,10 @@ const checkURI = function checkURI( exists(port) && (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) ) { - const error = new URIError( + fail( + 'URI_INVALID_PORT', `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PORT'; - throw error; + ); } } @@ -545,28 +489,18 @@ const checkHttpURL = function checkHttpURL( // scheme if (!schemesToCheck.includes(scheme as string)) { - const error = new URIError( - `scheme must be ${schemesToCheck.join(' or ')}, got '${scheme}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', `scheme must be ${schemesToCheck.join(' or ')}, got '${scheme}'`); } // authority if (!is(String, authority)) { - const error = new URIError('authority is required') as URIErrorWithCode; - error.code = 'URI_MISSING_AUTHORITY'; - throw error; + fail('URI_MISSING_AUTHORITY', 'authority is required'); } // max length // sitemaps.org: a URL must be strictly less than 2,048 characters if (is(String, href) && href.length >= 
maxLengthURL) { - const error = new URIError( - `max URL length of ${maxLengthURL} reached: ${href.length}`, - ) as URIErrorWithCode; - error.code = 'URI_MAX_LENGTH_URL'; - throw error; + fail('URI_MAX_LENGTH_URL', `max URL length of ${maxLengthURL} reached: ${href.length}`); } return { diff --git a/src/decoders/index.ts b/src/decoders/index.ts index c1839b2..8ed190e 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -10,13 +10,12 @@ import { checkSchemeChars, checkURISyntax } from '../checkers/index.js'; import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; import { int, isPort } from '../helpers/cast.js'; +import { fail } from '../helpers/error.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { recomposeURI } from '../parser/index.js'; import { escapeCodes, escapeCodesKeys, pencodings, pencodingsKeys } from '../sitemap/index.js'; -type URIErrorWithCode = URIError & { code: string }; - /** * @func decodeURIComponentString * @@ -111,11 +110,7 @@ const decodeURIString = function decodeURIString( // scheme must be http or https for web/sitemap or with valid chars, always in lowercase if (webURL) { if (scheme !== 'http' && scheme !== 'https') { - const error = new URIError( - `scheme must be http or https, got '${scheme}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', `scheme must be http or https, got '${scheme}'`); } } else { // check scheme characters, it is not intended to decode a scheme @@ -124,18 +119,12 @@ const decodeURIString = function decodeURIString( // authority is required and must be a valid host for web/sitemap if (webURL && !is(String, authority)) { - const error = new URIError('authority is required') as URIErrorWithCode; - error.code = 'URI_MISSING_AUTHORITY'; - throw error; + fail('URI_MISSING_AUTHORITY', 'authority is required'); } // check host 
is a valid ip first (RFC-3986) or a domain name if (exists(host) && !isIP(host) && !isDomain(host)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${host}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${host}'`); } // check port is a valid RFC-3986 *DIGIT and in range if any @@ -143,11 +132,10 @@ const decodeURIString = function decodeURIString( exists(port) && (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) ) { - const error = new URIError( + fail( + 'URI_INVALID_PORT', `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PORT'; - throw error; + ); } // userinfo @@ -184,11 +172,7 @@ const decodeURIString = function decodeURIString( // sitemaps.org: a URL must be strictly less than 2,048 characters if (webURL && uridecoded.length >= maxLengthURL) { - const error = new URIError( - `max URL length of ${maxLengthURL} reached: ${uridecoded.length}`, - ) as URIErrorWithCode; - error.code = 'URI_MAX_LENGTH_URL'; - throw error; + fail('URI_MAX_LENGTH_URL', `max URL length of ${maxLengthURL} reached: ${uridecoded.length}`); } return uridecoded; diff --git a/src/encoders/index.ts b/src/encoders/index.ts index 95c087c..99dc9c8 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -19,6 +19,7 @@ import { checkSchemeChars, checkURISyntax } from '../checkers/index.js'; import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; import { int, isPort } from '../helpers/cast.js'; +import { fail } from '../helpers/error.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { recomposeURI } from '../parser/index.js'; @@ -144,11 +145,7 @@ const encodeURIString = function encodeURIString( // scheme 
must be http or https for web/sitemap or with valid chars, always in lowercase if (webURL) { if (scheme !== 'http' && scheme !== 'https') { - const error = new URIError(`scheme must be http or https, got '${scheme}'`) as URIError & { - code: string; - }; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', `scheme must be http or https, got '${scheme}'`); } } else { // check scheme characters, it is not intended to encode a scheme @@ -157,18 +154,12 @@ const encodeURIString = function encodeURIString( // authority is required and must be a valid host for web/sitemap if (webURL && !is(String, authority)) { - const error = new URIError('authority is required') as URIError & { code: string }; - error.code = 'URI_MISSING_AUTHORITY'; - throw error; + fail('URI_MISSING_AUTHORITY', 'authority is required'); } // check host is a valid ip first (RFC-3986) or a domain name if (exists(host) && !isIP(host) && !isDomain(host)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${host}'`, - ) as URIError & { code: string }; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${host}'`); } // check port is a valid RFC-3986 *DIGIT and in range if any @@ -176,11 +167,10 @@ const encodeURIString = function encodeURIString( exists(port) && (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) ) { - const error = new URIError( + fail( + 'URI_INVALID_PORT', `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, - ) as URIError & { code: string }; - error.code = 'URI_INVALID_PORT'; - throw error; + ); } // userinfo @@ -221,11 +211,7 @@ const encodeURIString = function encodeURIString( // sitemaps.org: a URL must be strictly less than 2,048 characters if (webURL && uriencoded.length >= maxLengthURL) { - const error = new URIError( - `max URL length of ${maxLengthURL} reached: ${uriencoded.length}`, 
- ) as URIError & { code: string }; - error.code = 'URI_MAX_LENGTH_URL'; - throw error; + fail('URI_MAX_LENGTH_URL', `max URL length of ${maxLengthURL} reached: ${uriencoded.length}`); } return uriencoded; diff --git a/src/helpers/error.ts b/src/helpers/error.ts new file mode 100644 index 0000000..bb011cb --- /dev/null +++ b/src/helpers/error.ts @@ -0,0 +1,25 @@ +/** + * error helper + * + * - fail(code, message, cause?) -> never (throws a coded URIError) + */ + +type URIErrorWithCode = URIError & { code: string }; + +/** + * @func fail + * + * Throw a URIError carrying a stable `code` string (and an optional + * `Error.cause`). The thrown value is always `instanceof URIError`. + */ +const fail = function fail(code: string, message: string, cause?: unknown): never { + const error = ( + cause === undefined ? new URIError(message) : new URIError(message, { cause }) + ) as URIErrorWithCode; + + error.code = code; + + throw error; +}; + +export { fail, type URIErrorWithCode }; From 11174bb3a07e1b0043f3cf55fef79f1bfe1f6a01 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 20:53:49 +0700 Subject: [PATCH 08/21] perf: hoist per-call regular expressions to module scope isIP, isIPv4 and isIPv6 rebuilt their RegExp on every call, and the sitemap decoder rebuilt its alternation regexp on every sitemap decode. All four are now compiled once at module load. The IP patterns are stateless and the decoder regexp is used only through String.prototype.replace (which resets lastIndex), so reuse is safe and behavior is unchanged. 
--- src/decoders/index.ts | 7 +++++-- src/ip/index.ts | 11 ++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/decoders/index.ts b/src/decoders/index.ts index 8ed190e..29aa058 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -16,6 +16,10 @@ import { isIP } from '../ip/index.js'; import { recomposeURI } from '../parser/index.js'; import { escapeCodes, escapeCodesKeys, pencodings, pencodingsKeys } from '../sitemap/index.js'; +// compiled once at module load — used only via String.prototype.replace, +// which resets lastIndex per spec, so reusing the global regexp is safe +const sitemapDecodeRegexp = new RegExp(escapeCodesKeys.concat(pencodingsKeys).join('|'), 'g'); + /** * @func decodeURIComponentString * @@ -39,9 +43,8 @@ const decodeURIComponentString = function decodeURIComponentString( const componentToDecode = lowercase === true ? component.toLowerCase() : component; if (sitemap === true) { - const regexp = new RegExp(escapeCodesKeys.concat(pencodingsKeys).join('|'), 'g'); const uriToDecode = componentToDecode.replace( - regexp, + sitemapDecodeRegexp, (match) => escapeCodes[match] || pencodings[match] || '', ); diff --git a/src/ip/index.ts b/src/ip/index.ts index f4de4ca..ca05295 100644 --- a/src/ip/index.ts +++ b/src/ip/index.ts @@ -27,6 +27,11 @@ const v6 = ` .replace(/\n/g, '') .trim(); +// compiled once at module load — these patterns are stateless (no `g` flag) +const ipv4Regexp = new RegExp(`^${v4}$`); +const ipv6Regexp = new RegExp(`^${v6}$`); +const ipRegexp = new RegExp(`(?:^${v4}$)|(?:^${v6}$)`); + /** * @func isIP * @@ -37,7 +42,7 @@ const isIP = function isIP(ip: string): boolean { return false; } - return new RegExp(`(?:^${v4}$)|(?:^${v6}$)`).test(ip); + return ipRegexp.test(ip); }; /** @@ -50,7 +55,7 @@ const isIPv4 = function isIPv4(ip: string): boolean { return false; } - return new RegExp(`^${v4}$`).test(ip); + return ipv4Regexp.test(ip); }; /** @@ -63,7 +68,7 @@ const isIPv6 = function isIPv6(ip: 
string): boolean { return false; } - return new RegExp(`^${v6}$`).test(ip); + return ipv6Regexp.test(ip); }; export { isIP, isIPv4, isIPv6 }; From 34fa2ff668f5fbbfbfc5ef4d2e794b9b31803253 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 21:02:13 +0700 Subject: [PATCH 09/21] feat: add the resolver module source and tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the resolveURI / removeDotSegments feature: the previous commit wired the public export and docs but the implementation file and its test were not staged. src/resolver/index.ts implements the RFC-3986 §5.2 transform verbatim; tests/resolver.test.ts covers every §5.4 example. --- src/resolver/index.ts | 211 +++++++++++++++++++++++++++++++++++++++++ tests/resolver.test.ts | 128 +++++++++++++++++++++++++ 2 files changed, 339 insertions(+) create mode 100644 src/resolver/index.ts create mode 100644 tests/resolver.test.ts diff --git a/src/resolver/index.ts b/src/resolver/index.ts new file mode 100644 index 0000000..749d444 --- /dev/null +++ b/src/resolver/index.ts @@ -0,0 +1,211 @@ +/** + * reference resolution + * + * - removeDotSegments(path) -> String + * - resolveURI(base, reference) -> String + * + * Based on: + * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5. + */ +import { is } from '../helpers/object.js'; + +// RFC-3986 Appendix B reference-parsing regexp. Unlike parseURI this keeps +// relative references (no scheme / no authority) so §5.2.2 can resolve them. +const referenceRegexp = /^(?:([^:/?#]+):)?(?:\/\/([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?/; + +interface Reference { + scheme: string | null; + authority: string | null; + path: string; + query: string | null; + fragment: string | null; +} + +/** + * @func parseReference + * + * Split a URI-reference into its five RFC-3986 components. 
A component is + * null when the delimiter is absent and '' when present but empty, so the + * defined/undefined distinction §5.2.2 relies on is preserved. + */ +const parseReference = function parseReference(reference: string): Reference { + const [, scheme, authority, path, query, fragment] = reference.match(referenceRegexp) ?? []; + + return { + scheme: scheme ?? null, + authority: authority ?? null, + path: path ?? '', + query: query ?? null, + fragment: fragment ?? null, + }; +}; + +/** + * @func removeDotSegments + * + * Remove the special "." and ".." complete path segments from a path, + * implementing the RFC-3986 §5.2.4 ordered loop verbatim. + * + * Based on: + * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5.2.4. + */ +const removeDotSegments = function removeDotSegments(path: string): string { + if (!is(String, path)) { + return ''; + } + + let input = path; + let output = ''; + + while (input.length > 0) { + // 2A + if (input.startsWith('../')) { + input = input.slice(3); + } else if (input.startsWith('./')) { + input = input.slice(2); + // 2B + } else if (input.startsWith('/./')) { + input = `/${input.slice(3)}`; + } else if (input === '/.') { + input = '/'; + // 2C + } else if (input.startsWith('/../')) { + input = `/${input.slice(4)}`; + output = output.slice(0, Math.max(0, output.lastIndexOf('/'))); + } else if (input === '/..') { + input = '/'; + output = output.slice(0, Math.max(0, output.lastIndexOf('/'))); + // 2D + } else if (input === '.' || input === '..') { + input = ''; + // 2E + } else { + const start = input.startsWith('/') ? 1 : 0; + const next = input.indexOf('/', start); + + if (next === -1) { + output += input; + input = ''; + } else { + output += input.slice(0, next); + input = input.slice(next); + } + } + } + + return output; +}; + +/** + * @func merge + * + * Merge a relative reference's path with the base path, per RFC-3986 §5.2.3. 
+ */ +const merge = function merge(base: Reference, refPath: string): string { + if (is(String, base.authority) && base.path === '') { + return `/${refPath}`; + } + + const lastSlash = base.path.lastIndexOf('/'); + + return lastSlash === -1 ? refPath : base.path.slice(0, lastSlash + 1) + refPath; +}; + +/** + * @func recompose + * + * Recompose a resolved target from its components, per RFC-3986 §5.3. + * A component is emitted whenever it is defined (non-null), including ''. + */ +const recompose = function recompose(target: Reference): string { + let result = ''; + + if (is(String, target.scheme)) { + result += `${target.scheme}:`; + } + + if (is(String, target.authority)) { + result += `//${target.authority}`; + } + + result += target.path; + + if (is(String, target.query)) { + result += `?${target.query}`; + } + + if (is(String, target.fragment)) { + result += `#${target.fragment}`; + } + + return result; +}; + +/** + * @func resolveURI + * + * Resolve a URI reference against a base URI, implementing the RFC-3986 + * §5.2.2 strict transform (with §5.2.3 merge and §5.2.4 remove_dot_segments) + * and recomposing per §5.3. + * + * The base must be an absolute URI (a scheme is required, RFC-3986 §5.2.1); + * the empty string is returned if base or reference is invalid. + * + * Based on: + * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5.2. 
+ */ +const resolveURI = function resolveURI(base: string, reference: string): string { + if (!(is(String, base) && is(String, reference))) { + return ''; + } + + const baseRef = parseReference(base); + + // RFC-3986 §5.2.1: the base URI MUST be an absolute URI + if (!is(String, baseRef.scheme)) { + return ''; + } + + const r = parseReference(reference); + const t: Reference = { + scheme: null, + authority: null, + path: '', + query: null, + fragment: null, + }; + + // RFC-3986 §5.2.2 (strict mode) + if (is(String, r.scheme)) { + t.scheme = r.scheme; + t.authority = r.authority; + t.path = removeDotSegments(r.path); + t.query = r.query; + } else { + if (is(String, r.authority)) { + t.authority = r.authority; + t.path = removeDotSegments(r.path); + t.query = r.query; + } else { + if (r.path === '') { + t.path = baseRef.path; + t.query = is(String, r.query) ? r.query : baseRef.query; + } else { + t.path = r.path.startsWith('/') + ? removeDotSegments(r.path) + : removeDotSegments(merge(baseRef, r.path)); + t.query = r.query; + } + + t.authority = baseRef.authority; + } + + t.scheme = baseRef.scheme; + } + + t.fragment = r.fragment; + + return recompose(t); +}; + +export { removeDotSegments, resolveURI }; diff --git a/tests/resolver.test.ts b/tests/resolver.test.ts new file mode 100644 index 0000000..87e067b --- /dev/null +++ b/tests/resolver.test.ts @@ -0,0 +1,128 @@ +import { describe, expect, it } from 'vitest'; +import { removeDotSegments, resolveURI } from '../src/resolver/index.js'; + +describe('#resolver', () => { + describe('when using removeDotSegments', () => { + // RFC-3986 §5.2.4 worked examples (verbatim from the specification) + it('should match the RFC-3986 §5.2.4 worked examples', () => { + expect(removeDotSegments('/a/b/c/./../../g')).toBe('/a/g'); + expect(removeDotSegments('mid/content=5/../6')).toBe('mid/6'); + }); + + it('should handle empty, root and trailing-dot paths', () => { + expect(removeDotSegments('')).toBe(''); + 
expect(removeDotSegments('/')).toBe('/'); + expect(removeDotSegments('/a/b/')).toBe('/a/b/'); + expect(removeDotSegments('a/./b')).toBe('a/b'); + expect(removeDotSegments('/.')).toBe('/'); + expect(removeDotSegments('/..')).toBe('/'); + expect(removeDotSegments('.')).toBe(''); + expect(removeDotSegments('..')).toBe(''); + expect(removeDotSegments('/a/.')).toBe('/a/'); + expect(removeDotSegments('/a/..')).toBe('/'); + }); + + it('should not treat .foo or g. as dot segments', () => { + expect(removeDotSegments('/.foo')).toBe('/.foo'); + expect(removeDotSegments('/b/c/g.')).toBe('/b/c/g.'); + expect(removeDotSegments('/b/c/..g')).toBe('/b/c/..g'); + }); + + it('should return the empty string for a non-string input', () => { + // @ts-expect-error runtime guard for non-string input + expect(removeDotSegments(null)).toBe(''); + // @ts-expect-error runtime guard for non-string input + expect(removeDotSegments(undefined)).toBe(''); + }); + }); + + describe('when using resolveURI', () => { + const base = 'http://a/b/c/d;p?q'; + + // RFC-3986 §5.4.1 — normal examples (verbatim) + const normal: [string, string][] = [ + ['g:h', 'g:h'], + ['g', 'http://a/b/c/g'], + ['./g', 'http://a/b/c/g'], + ['g/', 'http://a/b/c/g/'], + ['/g', 'http://a/g'], + ['//g', 'http://g'], + ['?y', 'http://a/b/c/d;p?y'], + ['g?y', 'http://a/b/c/g?y'], + ['#s', 'http://a/b/c/d;p?q#s'], + ['g#s', 'http://a/b/c/g#s'], + ['g?y#s', 'http://a/b/c/g?y#s'], + [';x', 'http://a/b/c/;x'], + ['g;x', 'http://a/b/c/g;x'], + ['g;x?y#s', 'http://a/b/c/g;x?y#s'], + ['', 'http://a/b/c/d;p?q'], + ['.', 'http://a/b/c/'], + ['./', 'http://a/b/c/'], + ['..', 'http://a/b/'], + ['../', 'http://a/b/'], + ['../g', 'http://a/b/g'], + ['../..', 'http://a/'], + ['../../', 'http://a/'], + ['../../g', 'http://a/g'], + ]; + + // RFC-3986 §5.4.2 — abnormal examples (verbatim, strict mode) + const abnormal: [string, string][] = [ + ['../../../g', 'http://a/g'], + ['../../../../g', 'http://a/g'], + ['/./g', 'http://a/g'], + ['/../g', 
'http://a/g'], + ['g.', 'http://a/b/c/g.'], + ['.g', 'http://a/b/c/.g'], + ['g..', 'http://a/b/c/g..'], + ['..g', 'http://a/b/c/..g'], + ['./../g', 'http://a/b/g'], + ['./g/.', 'http://a/b/c/g/'], + ['g/./h', 'http://a/b/c/g/h'], + ['g/../h', 'http://a/b/c/h'], + ['g;x=1/./y', 'http://a/b/c/g;x=1/y'], + ['g;x=1/../y', 'http://a/b/c/y'], + ['g?y/./x', 'http://a/b/c/g?y/./x'], + ['g?y/../x', 'http://a/b/c/g?y/../x'], + ['g#s/./x', 'http://a/b/c/g#s/./x'], + ['g#s/../x', 'http://a/b/c/g#s/../x'], + ['http:g', 'http:g'], + ]; + + it('should resolve every RFC-3986 §5.4.1 normal example', () => { + for (const [reference, expected] of normal) { + expect(resolveURI(base, reference)).toBe(expected); + } + }); + + it('should resolve every RFC-3986 §5.4.2 abnormal example (strict)', () => { + for (const [reference, expected] of abnormal) { + expect(resolveURI(base, reference)).toBe(expected); + } + }); + + it('should preserve a present-empty query/fragment per RFC-3986 §5.3', () => { + expect(resolveURI(base, 'g?')).toBe('http://a/b/c/g?'); + expect(resolveURI(base, 'g#')).toBe('http://a/b/c/g#'); + expect(resolveURI('http://a/b?x', '')).toBe('http://a/b?x'); + }); + + it('should merge against a base with an authority and empty path', () => { + expect(resolveURI('http://a', 'g')).toBe('http://a/g'); + expect(resolveURI('http://a', './g')).toBe('http://a/g'); + }); + + it('should return the empty string when the base is not absolute', () => { + expect(resolveURI('/b/c', 'g')).toBe(''); + expect(resolveURI('//host/path', 'g')).toBe(''); + expect(resolveURI('', 'g')).toBe(''); + }); + + it('should return the empty string for a non-string argument', () => { + // @ts-expect-error runtime guard for non-string input + expect(resolveURI(null, 'g')).toBe(''); + // @ts-expect-error runtime guard for non-string input + expect(resolveURI(base, 42)).toBe(''); + }); + }); +}); From 12d98ab79175f88fdfc493bf175252ea65ba281b Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 
21:02:31 +0700 Subject: [PATCH 10/21] chore: tighten TypeScript strictness and bundler configuration Enable exactOptionalPropertyTypes, erasableSyntaxOnly and isolatedDeclarations on top of strict. Option-bag optional properties now read `?: T | undefined` so callers can forward possibly-undefined values (non-breaking, exactOptional-friendly), and the computed sitemap constants carry explicit annotations for isolatedDeclarations. tsdown pins `platform: 'node'` and explicit tree-shaking. The dual ESM/CJS types matrix is correct by construction (separate .d.mts and .d.cts, types-first conditions). No runtime behavior change. --- src/checkers/index.ts | 14 +++++++++----- src/decoders/index.ts | 16 ++++++++++++---- src/encoders/index.ts | 22 +++++++++++++++++++--- src/sitemap/index.ts | 10 +++++----- tsconfig.json | 3 +++ tsdown.config.ts | 4 +++- 6 files changed, 51 insertions(+), 18 deletions(-) diff --git a/src/checkers/index.ts b/src/checkers/index.ts index 271afe0..9b02ca0 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -153,9 +153,9 @@ const checkComponent = function checkComponent({ string, sitemap, }: { - type?: string; - string?: string | null; - sitemap?: boolean; + type?: string | undefined; + string?: string | null | undefined; + sitemap?: boolean | undefined; } = {}): boolean { if (!['userinfo', 'path', 'query', 'fragment'].includes(type as string)) { fail( @@ -371,7 +371,7 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { */ const checkURI = function checkURI( uri: string, - { sitemap }: { sitemap?: boolean } = {}, + { sitemap }: { sitemap?: boolean | undefined } = {}, ): CheckedURI { // check uri type and syntax const { @@ -455,7 +455,11 @@ const checkURI = function checkURI( */ const checkHttpURL = function checkHttpURL( uri: string, - { https, web, sitemap }: { https?: boolean; web?: boolean; sitemap?: boolean } = {}, + { + https, + web, + sitemap, + }: { https?: boolean | undefined; web?: boolean | 
undefined; sitemap?: boolean | undefined } = {}, ): CheckedURI { // precheck case for sitemap only if (sitemap === true) { diff --git a/src/decoders/index.ts b/src/decoders/index.ts index 29aa058..7668416 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -34,7 +34,7 @@ const sitemapDecodeRegexp = new RegExp(escapeCodesKeys.concat(pencodingsKeys).jo */ const decodeURIComponentString = function decodeURIComponentString( component: string, - { sitemap, lowercase }: { sitemap?: boolean; lowercase?: boolean } = {}, + { sitemap, lowercase }: { sitemap?: boolean | undefined; lowercase?: boolean | undefined } = {}, ): string { if (!is(String, component)) { return ''; @@ -91,7 +91,15 @@ const decodeURIComponentString = function decodeURIComponentString( */ const decodeURIString = function decodeURIString( uri: string, - { web, sitemap, lowercase }: { web?: boolean; sitemap?: boolean; lowercase?: boolean } = {}, + { + web, + sitemap, + lowercase, + }: { + web?: boolean | undefined; + sitemap?: boolean | undefined; + lowercase?: boolean | undefined; + } = {}, ): string { const uriToDecode = is(String, uri) && lowercase === true ? 
uri.toLowerCase() : uri; const webURL = web === true || sitemap === true; @@ -209,7 +217,7 @@ const decodeURIString = function decodeURIString( */ const decodeWebURL = function decodeWebURL( uri: string, - { lowercase }: { lowercase?: boolean } = {}, + { lowercase }: { lowercase?: boolean | undefined } = {}, ): string { return decodeURIString(uri, { lowercase, web: true }); }; @@ -245,7 +253,7 @@ const decodeWebURL = function decodeWebURL( */ const decodeSitemapURL = function decodeSitemapURL( uri: string, - { lowercase }: { lowercase?: boolean } = {}, + { lowercase }: { lowercase?: boolean | undefined } = {}, ): string { return decodeURIString(uri, { lowercase, sitemap: true }); }; diff --git a/src/encoders/index.ts b/src/encoders/index.ts index 99dc9c8..c3777be 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -46,7 +46,15 @@ import { entities, specialChars } from '../sitemap/index.js'; */ const encodeURIComponentString = function encodeURIComponentString( component: string, - { type, sitemap, lowercase }: { type?: string; sitemap?: boolean; lowercase?: boolean } = {}, + { + type, + sitemap, + lowercase, + }: { + type?: string | undefined; + sitemap?: boolean | undefined; + lowercase?: boolean | undefined; + } = {}, ): string { if (!is(String, component)) { return ''; @@ -133,7 +141,15 @@ const encodeURIComponentString = function encodeURIComponentString( */ const encodeURIString = function encodeURIString( uri: string, - { web, sitemap, lowercase }: { web?: boolean; sitemap?: boolean; lowercase?: boolean } = {}, + { + web, + sitemap, + lowercase, + }: { + web?: boolean | undefined; + sitemap?: boolean | undefined; + lowercase?: boolean | undefined; + } = {}, ): string { const uriToEncode = is(String, uri) && lowercase === true ? 
uri.toLowerCase() : uri; const webURL = web === true || sitemap === true; @@ -249,7 +265,7 @@ const encodeURIString = function encodeURIString( */ const encodeWebURL = function encodeWebURL( uri: string, - { lowercase }: { lowercase?: boolean } = {}, + { lowercase }: { lowercase?: boolean | undefined } = {}, ): string { return encodeURIString(uri, { lowercase, web: true }); }; diff --git a/src/sitemap/index.ts b/src/sitemap/index.ts index 9a3acf9..c4d9ba1 100644 --- a/src/sitemap/index.ts +++ b/src/sitemap/index.ts @@ -20,7 +20,7 @@ const specialChars: Record<string, string> = { }; // special chars keys -const specialCharsKeys = Object.keys(specialChars); +const specialCharsKeys: string[] = Object.keys(specialChars); // inversed special chars (percent encodings) const pencodings: Record<string, string> = {}; @@ -28,7 +28,7 @@ specialCharsKeys.forEach((char) => { pencodings[specialChars[char] as string] = char; }); -const pencodingsKeys = Object.keys(pencodings); +const pencodingsKeys: string[] = Object.keys(pencodings); // sitemap entities to be escaped in URLs (sitemaps.org requires all five) const entities: Record<string, string> = { @@ -40,7 +40,7 @@ const entities: Record<string, string> = { }; // entities keys -const entitiesKeys = Object.keys(entities); +const entitiesKeys: string[] = Object.keys(entities); // inversed entities keys (escape codes) const escapeCodes: Record<string, string> = {}; @@ -49,8 +49,8 @@ entitiesKeys.forEach((entity) => { }); // escape codes keys and length -const escapeCodesKeys = Object.keys(escapeCodes); -const escapeCodesKeysLen = escapeCodesKeys.length; +const escapeCodesKeys: string[] = Object.keys(escapeCodes); +const escapeCodesKeysLen: number = escapeCodesKeys.length; export { entities, diff --git a/tsconfig.json b/tsconfig.json index 6057c0a..8559e16 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -7,6 +7,9 @@ "types": ["node"], "strict": true, "noUncheckedIndexedAccess": true, + "exactOptionalPropertyTypes": true, + "erasableSyntaxOnly": true, + "isolatedDeclarations": true, "noImplicitOverride":
true, "noFallthroughCasesInSwitch": true, "esModuleInterop": true, diff --git a/tsdown.config.ts b/tsdown.config.ts index 6a5999c..96ef3c7 100644 --- a/tsdown.config.ts +++ b/tsdown.config.ts @@ -3,8 +3,10 @@ import { defineConfig } from 'tsdown'; export default defineConfig({ entry: ['src/index.ts'], format: ['esm', 'cjs'], + platform: 'node', + target: 'node22', dts: true, + treeshake: true, clean: true, - target: 'node22', sourcemap: true, }); From 2675dc35bbe9977e053063c39ead05cedf9442f4 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 21:19:02 +0700 Subject: [PATCH 11/21] test: add property-based tests and enforce 100% coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds tests/uri.property.test.ts (fast-check): parseURI totality, parse → recompose idempotence, removeDotSegments idempotence and no-dot-segment invariant, resolveURI empty-reference and totality, component encode/decode round-trip — 1000 runs each. The vitest coverage threshold is now 100% on every metric. To make the gate honest, fail() drops its unused cause parameter, and the handful of guards that are unreachable by construction (indices bounded by their array length, the Appendix-B regexp always capturing a string, a resolved target always having a scheme) are marked with explained v8 ignore comments rather than fabricated tests. biome excludes the generated coverage directory. 
--- biome.json | 1 + package.json | 1 + pnpm-lock.yaml | 16 +++++ src/checkers/index.ts | 6 ++ src/decoders/index.ts | 2 + src/encoders/index.ts | 1 + src/helpers/cast.ts | 1 + src/helpers/error.ts | 12 ++-- src/parser/index.ts | 7 +++ src/resolver/index.ts | 3 + tests/decoders.test.ts | 6 ++ tests/resolver.test.ts | 11 ++++ tests/uri.property.test.ts | 116 +++++++++++++++++++++++++++++++++++++ vitest.config.ts | 3 + 14 files changed, 179 insertions(+), 7 deletions(-) create mode 100644 tests/uri.property.test.ts diff --git a/biome.json b/biome.json index bfff0c7..088305f 100644 --- a/biome.json +++ b/biome.json @@ -5,6 +5,7 @@ "**", "!**/node_modules", "!**/dist", + "!**/coverage", "!**/.next", "!**/.open-next", "!**/.astro", diff --git a/package.json b/package.json index 4afb837..fb85ea3 100644 --- a/package.json +++ b/package.json @@ -77,6 +77,7 @@ "@biomejs/biome": "^2.4.15", "@types/node": "^22.0.0", "@vitest/coverage-v8": "^4.1.6", + "fast-check": "^4.8.0", "tsdown": "^0.22.0", "typescript": "^6.0.3", "vitest": "^4.1.6" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0c35a13..b1e920a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: '@vitest/coverage-v8': specifier: ^4.1.6 version: 4.1.6(vitest@4.1.6) + fast-check: + specifier: ^4.8.0 + version: 4.8.0 tsdown: specifier: ^0.22.0 version: 0.22.0(typescript@6.0.3) @@ -383,6 +386,10 @@ packages: resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==} engines: {node: '>=12.0.0'} + fast-check@4.8.0: + resolution: {integrity: sha512-GOJ158CUMnN6cSahsv4+ExARvIDuzzinFjkp0E9WtiBa5zcVeLozVkWaE4IzFcc+Y48Wp1EDlUZsXRyAztQcSg==} + engines: {node: '>=12.17.0'} + fdir@6.5.0: resolution: {integrity: sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==} engines: {node: '>=12.0.0'} @@ -541,6 +548,9 @@ packages: resolution: {integrity: 
sha512-SoSL4+OSEtR99LHFZQiJLkT59C5B1amGO1NzTwj7TT1qCUgUO6hxOvzkOYxD+vMrXBM3XJIKzokoERdqQq/Zmg==} engines: {node: ^10 || ^12 || >=14} + pure-rand@8.4.0: + resolution: {integrity: sha512-IoM8YF/jY0hiugFo/wOWqfmarlE6J0wc6fDK1PhftMk7MGhVZl88sZimmqBBFomLOCSmcCCpsfj7wXASCpvK9A==} + quansync@1.0.0: resolution: {integrity: sha512-5xZacEEufv3HSTPQuchrvV6soaiACMFnq1H8wkVioctoH3TRha9Sz66lOxRwPK/qZj7HPiSveih9yAyh98gvqA==} @@ -1038,6 +1048,10 @@ snapshots: expect-type@1.3.0: {} + fast-check@4.8.0: + dependencies: + pure-rand: 8.4.0 + fdir@6.5.0(picomatch@4.0.4): optionalDependencies: picomatch: 4.0.4 @@ -1153,6 +1167,8 @@ snapshots: picocolors: 1.1.1 source-map-js: 1.2.1 + pure-rand@8.4.0: {} + quansync@1.0.0: {} resolve-pkg-maps@1.0.0: {} diff --git a/src/checkers/index.ts b/src/checkers/index.ts index 9b02ca0..9a1309f 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -115,6 +115,7 @@ const checkSitemapEncoding = function checkSitemapEncoding( for (let j = 0; j < escapeCodesKeysLen; j += 1) { const code = escapeCodesKeys[j]; + /* v8 ignore next 3 -- unreachable: j is bounded by escapeCodesKeys.length so the index is always defined */ if (code === undefined) { break; } @@ -184,6 +185,7 @@ const checkComponent = function checkComponent({ case 'fragment': checkCharFunc = checkSitemap ? 
isSitemapQueryOrFragmentChar : isQueryOrFragmentChar; break; + /* v8 ignore next -- unreachable: type is validated to one of the four cases before the switch */ default: } @@ -296,17 +298,21 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { // scheme (required) if (!is(String, scheme)) { fail('URI_MISSING_SCHEME', 'uri scheme is required'); + /* v8 ignore start -- unreachable: parseURI yields a null or non-empty scheme, never an empty string */ } else if (schemeLen <= 0) { fail('URI_EMPTY_SCHEME', 'uri scheme must not be empty'); } + /* v8 ignore stop */ // path (required), can be an empty string + /* v8 ignore next 3 -- unreachable: the Appendix-B regexp always captures a string path */ if (!is(String, path)) { fail('URI_MISSING_PATH', 'uri path is required'); } // path: if authority is present path must be empty or start with / if (is(String, authority) && authority.length > 0) { + /* v8 ignore next 3 -- unreachable: when authority is present the Appendix-B regexp makes path empty or '/'-prefixed */ if (!(path === '' || (path as string).startsWith('/'))) { fail('URI_INVALID_PATH', "path must be empty or start with '/' when authority is present"); } diff --git a/src/decoders/index.ts b/src/decoders/index.ts index 7668416..9a533ac 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -45,6 +45,7 @@ const decodeURIComponentString = function decodeURIComponentString( if (sitemap === true) { const uriToDecode = componentToDecode.replace( sitemapDecodeRegexp, + /* v8 ignore next -- unreachable '': the regexp is built from these keys so every match resolves */ (match) => escapeCodes[match] || pencodings[match] || '', ); @@ -154,6 +155,7 @@ const decodeURIString = function decodeURIString( const userinfoDecoded = decodeURIComponentString(userinfo ?? 
'', { sitemap, lowercase: false }); // path + /* v8 ignore next -- unreachable '': checkURISyntax always yields a string path */ const pathDecoded = decodeURIComponentString(path ?? '', { sitemap, lowercase: false }); // RFC-3986 §5.3: an absent query/fragment (null) stays absent and a diff --git a/src/encoders/index.ts b/src/encoders/index.ts index c3777be..cf7afcb 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -198,6 +198,7 @@ const encodeURIString = function encodeURIString( }); // path + /* v8 ignore next -- unreachable '': checkURISyntax always yields a string path */ const pathEncoded = encodeURIComponentString(path ?? '', { sitemap, type: 'path', diff --git a/src/helpers/cast.ts b/src/helpers/cast.ts index cff35ea..3a8e5da 100644 --- a/src/helpers/cast.ts +++ b/src/helpers/cast.ts @@ -86,6 +86,7 @@ const integer = function integer(thing: unknown): number | undefined { if (castNum !== undefined) { const int = parseInt(String(castNum), 10); + /* v8 ignore next -- unreachable: parseInt of a finite number's String is never NaN */ if (!Number.isNaN(int)) { castInt = int; } diff --git a/src/helpers/error.ts b/src/helpers/error.ts index bb011cb..de523ad 100644 --- a/src/helpers/error.ts +++ b/src/helpers/error.ts @@ -1,7 +1,7 @@ /** * error helper * - * - fail(code, message, cause?) -> never (throws a coded URIError) + * - fail(code, message) -> never (throws a coded URIError) */ type URIErrorWithCode = URIError & { code: string }; @@ -9,13 +9,11 @@ type URIErrorWithCode = URIError & { code: string }; /** * @func fail * - * Throw a URIError carrying a stable `code` string (and an optional - * `Error.cause`). The thrown value is always `instanceof URIError`. + * Throw a URIError carrying a stable `code` string. The thrown value is + * always `instanceof URIError`. */ -const fail = function fail(code: string, message: string, cause?: unknown): never { - const error = ( - cause === undefined ? 
new URIError(message) : new URIError(message, { cause }) - ) as URIErrorWithCode; +const fail = function fail(code: string, message: string): never { + const error = new URIError(message) as URIErrorWithCode; error.code = code; diff --git a/src/parser/index.ts b/src/parser/index.ts index 0db6034..8e855f5 100644 --- a/src/parser/index.ts +++ b/src/parser/index.ts @@ -190,6 +190,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { } // extract uri components from RegExp + /* v8 ignore next -- unreachable []: the all-optional Appendix-B regexp always matches a non-empty string */ const [, scheme, authorityParsed, path, queryParsed, fragmentParsed] = uri.match(uriRegexp) ?? []; // scheme is required and must be a not empty string or this is not an uri @@ -219,6 +220,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { } // try to extract host and port only if any + /* v8 ignore next -- unreachable false branch: hostAndPort is always an assigned string after the authority split */ if (is(String, hostAndPort)) { // detect IPv6 here first const ipv6Match = hostAndPort.match(ipv6Regexp); @@ -241,9 +243,11 @@ const parseURI = function parseURI(uri: string): ParsedURI { } // hostPunydecoded should be the host in Unicode, host its Punycode value + /* v8 ignore start -- unreachable null branch: the ipv6 regexp's required capture means hostParsed is always a string here */ const hostLowerCase = is(String, hostParsed) ? hostParsed.toLowerCase() : null; const toASCII = punycode(hostLowerCase ?? ''); const toUnicode = punydecode(hostLowerCase ?? 
''); + /* v8 ignore stop */ // host parsed was in Unicode if (hostLowerCase !== toASCII) { @@ -279,6 +283,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { // we still want to know the original host and authority provided // to check possible uri errors: a null host with a hostPunydecoded filled // means uri parsed had an invalid host name + /* v8 ignore next -- unreachable false branch: hostPunydecoded is always an assigned string in this block */ if (exists(hostPunydecoded)) { authorityPunydecoded = ''; @@ -324,6 +329,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { // pathqf: recompose path + query + fragment if any // using valueOf to avoid potential String objects mutation with parsed.path + /* v8 ignore next -- unreachable null branch: the Appendix-B regexp always captures a string path */ parsed.pathqf = is(String, path) ? path.valueOf() : null; if (is(String, parsed.pathqf)) { @@ -344,6 +350,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { parsed.host = host; parsed.hostPunydecoded = hostPunydecoded; parsed.port = port; + /* v8 ignore next -- unreachable: the Appendix-B regexp always captures a string path */ parsed.path = path ?? null; parsed.query = query; parsed.fragment = fragment; diff --git a/src/resolver/index.ts b/src/resolver/index.ts index 749d444..cb96673 100644 --- a/src/resolver/index.ts +++ b/src/resolver/index.ts @@ -29,11 +29,13 @@ interface Reference { * defined/undefined distinction §5.2.2 relies on is preserved. */ const parseReference = function parseReference(reference: string): Reference { + /* v8 ignore next -- unreachable []: the Appendix-B regexp is all-optional and matches any string */ const [, scheme, authority, path, query, fragment] = reference.match(referenceRegexp) ?? []; return { scheme: scheme ?? null, authority: authority ?? null, + /* v8 ignore next -- unreachable '': the path group [^?#]* always captures a string */ path: path ?? '', query: query ?? 
null, fragment: fragment ?? null, @@ -120,6 +122,7 @@ const merge = function merge(base: Reference, refPath: string): string { const recompose = function recompose(target: Reference): string { let result = ''; + /* v8 ignore next -- unreachable false branch: a resolved target always has a scheme (the base is absolute) */ if (is(String, target.scheme)) { result += `${target.scheme}:`; } diff --git a/tests/decoders.test.ts b/tests/decoders.test.ts index cd1d04b..3b1129c 100644 --- a/tests/decoders.test.ts +++ b/tests/decoders.test.ts @@ -158,6 +158,12 @@ describe('#decoders', () => { expectThrowWithCode(() => decodeURIString('ht°p://example.com'), 'URI_INVALID_SCHEME_CHAR'); }); + it('should lowercase the whole uri when the lowercase option is true', () => { + expect(decodeURIString('HTTP://EXAMPLE.COM/P%20X', { web: true, lowercase: true })).toBe( + 'http://example.com/p x', + ); + }); + it('should throw an uri error if scheme is not http or https when option is web or sitemap', () => { expectThrowWithCode( () => decodeURIString('httpp://www.example.com', { web: true }), diff --git a/tests/resolver.test.ts b/tests/resolver.test.ts index 87e067b..5eebef6 100644 --- a/tests/resolver.test.ts +++ b/tests/resolver.test.ts @@ -112,6 +112,17 @@ describe('#resolver', () => { expect(resolveURI('http://a', './g')).toBe('http://a/g'); }); + it('should merge against a base whose path has no slash', () => { + expect(resolveURI('a:b', 'c')).toBe('a:c'); + expect(resolveURI('a:b', './c')).toBe('a:c'); + }); + + it('should round-trip a reference carrying every component', () => { + expect(resolveURI('http://a/b/c', '//h/p?x=1#y')).toBe('http://h/p?x=1#y'); + expect(resolveURI('http://a/b/c', 's:/p?x#y')).toBe('s:/p?x#y'); + expect(resolveURI('http://h/p?q#f', '')).toBe('http://h/p?q'); + }); + it('should return the empty string when the base is not absolute', () => { expect(resolveURI('/b/c', 'g')).toBe(''); expect(resolveURI('//host/path', 'g')).toBe(''); diff --git 
a/tests/uri.property.test.ts b/tests/uri.property.test.ts new file mode 100644 index 0000000..29c9400 --- /dev/null +++ b/tests/uri.property.test.ts @@ -0,0 +1,116 @@ +import fc from 'fast-check'; +import { describe, expect, it } from 'vitest'; +import { + decodeURIComponentString, + encodeURIComponentString, + parseURI, + recomposeURI, + removeDotSegments, + resolveURI, +} from '../src/index.js'; + +const runs = { numRuns: 1000 }; + +describe('#uri — property tests', () => { + it('parseURI is total — never throws for any string', () => { + fc.assert( + fc.property(fc.string(), (input) => { + expect(() => parseURI(input)).not.toThrow(); + }), + runs, + ); + }); + + it('parse → recompose is idempotent on the recomposed href', () => { + fc.assert( + fc.property(fc.webUrl({ withQueryParameters: true, withFragments: true }), (url) => { + const first = parseURI(url).href; + + if (first !== null) { + expect(parseURI(first).href).toBe(first); + } + }), + runs, + ); + }); + + it('removeDotSegments is idempotent and leaves no . or .. complete segment', () => { + const segment = fc.constantFrom('a', 'b', '.', '..', 'c', 'd'); + const path = fc + .array(segment, { maxLength: 12 }) + .map((segs) => (fc.sample(fc.boolean(), 1)[0] ? `/${segs.join('/')}` : segs.join('/'))); + + fc.assert( + fc.property(path, (p) => { + const once = removeDotSegments(p); + + expect(removeDotSegments(once)).toBe(once); + + for (const seg of once.split('/')) { + expect(seg === '.' || seg === '..').toBe(false); + } + }), + runs, + ); + }); + + it('resolveURI with an empty reference strips only the fragment (RFC-3986 §5.3)', () => { + fc.assert( + fc.property( + fc.webUrl({ withQueryParameters: true, withFragments: false }), + fc.option(fc.string({ minLength: 1, maxLength: 8 }), { nil: undefined }), + (url, frag) => { + const base = frag === undefined ? 
url : `${url}#${frag}`; + + expect(resolveURI(base, '')).toBe(url); + }, + ), + runs, + ); + }); + + it('resolveURI never throws and returns a string for any reference', () => { + fc.assert( + fc.property(fc.webUrl(), fc.string(), (base, reference) => { + const resolved = resolveURI(base, reference); + + expect(typeof resolved).toBe('string'); + }), + runs, + ); + }); + + it('component encode then decode round-trips an arbitrary string', () => { + fc.assert( + fc.property(fc.string(), (raw) => { + const encoded = encodeURIComponentString(raw, { type: 'path' }); + + expect(decodeURIComponentString(encoded)).toBe(raw); + }), + runs, + ); + }); + + it('recomposeURI of parsed components equals the parsed href', () => { + fc.assert( + fc.property(fc.webUrl({ withQueryParameters: true, withFragments: true }), (url) => { + const parsed = parseURI(url); + + if (parsed.href !== null) { + expect( + recomposeURI({ + scheme: parsed.scheme, + userinfo: parsed.userinfo, + host: parsed.host, + port: parsed.port, + path: parsed.path, + query: parsed.query, + fragment: parsed.fragment, + }), + ).toBe(parsed.href); + } + }), + runs, + ); + }); +}); diff --git a/vitest.config.ts b/vitest.config.ts index 2583da9..189d207 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -8,6 +8,9 @@ export default defineConfig({ provider: 'v8', reporter: ['text', 'html'], include: ['src/**/*.ts'], + thresholds: { + 100: true, + }, }, }, }); From 720cdbb57a7e55ce16d5982e0a3a3c8cf0599b7d Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 21:21:50 +0700 Subject: [PATCH 12/21] ci: add the reusable-workflow caller and benchmark wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .github/workflows/ci.yml calls the shared coroboros/ci/.github/workflows/javascript-npm-packages.yml@v0 workflow (preflight on branch/PR, publish on tag, security always) via OIDC — no npm token, no extra config. 
package.json gains a `bench` script and the mitata dev dependency, and CLAUDE.md documents the release/publish flow and the benchmark regression budget. --- .github/workflows/ci.yml | 26 ++++++++++++++++++++++++++ CLAUDE.md | 15 +++++++++++---- package.json | 2 ++ pnpm-lock.yaml | 8 ++++++++ 4 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..ca5face --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,26 @@ +name: ci + +on: + push: + branches: [main] + tags: + - '[0-9]+.[0-9]+.[0-9]+' + - '[0-9]+.[0-9]+.[0-9]+-*' + pull_request: + branches: [main] + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref_type != 'tag' }} + +permissions: + contents: write + id-token: write + +jobs: + ci: + uses: coroboros/ci/.github/workflows/javascript-npm-packages.yml@v0 + secrets: + NPM_CONFIG_FILE: ${{ secrets.NPM_CONFIG_FILE }} + NPM_PACKAGE_REGISTRY: ${{ secrets.NPM_PACKAGE_REGISTRY }} + NPM_PACKAGE_PROXY_REGISTRY: ${{ secrets.NPM_PACKAGE_PROXY_REGISTRY }} diff --git a/CLAUDE.md b/CLAUDE.md index 102588a..8d3df1c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,15 +10,18 @@ Follows the Coroboros engineering global rules. Repo-specific divergences are st ## Tech Stack - TypeScript strict, ES modules + CJS dual build (tsdown) -- Vitest for tests, Biome for lint/format +- Vitest + `fast-check` for property tests, Biome for lint/format +- `mitata` for benchmarks (`pnpm bench`) - Node.js 22 LTS - Zero runtime dependencies — Punycode uses Node's `node:url` (`domainToASCII` / `domainToUnicode`) ## Commands - `pnpm build` — bundle ESM + CJS + types to `dist/` -- `pnpm test` — run Vitest suite +- `pnpm test` — run the Vitest suite (incl. 
property-based) +- `pnpm test:coverage` — Vitest with the 100% coverage gate - `pnpm lint` / `pnpm lint:fix` — Biome check - `pnpm typecheck` — tsc --noEmit +- `pnpm bench` — build then run `bench/uri.bench.mjs` - `pnpm dev` — tsdown watch mode ## Important Files @@ -29,7 +32,8 @@ Follows the Coroboros engineering global rules. Repo-specific divergences are st - `src/resolver/index.ts` — `resolveURI`, `removeDotSegments` (RFC-3986 §5.2 verbatim) - `src/helpers/object.ts` — private `exists` / `is` type guards (inlined, not exported) - `tsdown.config.ts` — dual build config (ESM + CJS + dts) -- `tests/` — Vitest suites, one test file per source module +- `tests/` — one spec per source module + `uri.property.test.ts` for `fast-check` invariants +- `bench/uri.bench.mjs` — mitata bench vs native `URL` / `URL.canParse`; `bench/baseline.md` documents the 1.0.0 numbers ## Public API (1.0.0 contract) - `punycode(domain)`, `punydecode(domain)` — domain ASCII/Unicode serialization @@ -41,8 +45,11 @@ Follows the Coroboros engineering global rules. Repo-specific divergences are st - `decodeURIComponentString(component, options)`, `decodeURIString(uri, options)`, `decodeWebURL(uri, options)`, `decodeSitemapURL(uri, options)` — RFC-3986 decoders ## Rules -- **NEVER** break the public API above. The signatures and the error/type shapes are the 1.0.0 contract. +- The **published** `1.0.0` tag is the public contract — once it ships, **NEVER** break the API above (signatures, error codes, type shapes) without a major bump. Until `1.0.0` is published, breaking changes are allowed but every break must be enumerated in the PR. - **NEVER** add a new runtime dependency without user approval. Zero-dependency is a feature. - **NEVER** use `axios`, `request`, or `node-fetch` — use native `fetch` (Node 22+). - Run `pnpm lint && pnpm typecheck && pnpm test` before every commit. 
+- Run `pnpm bench` against `bench/baseline.md` when touching the parser, encoders or decoders — no regression > 10 % on any bucket at fixed feature set. - Scoped package — `publishConfig.access = "public"` is mandatory, do not remove. +- **Publish** — CI-owned via OIDC Trusted Publisher + npm provenance. The first `1.0.0` publish bootstraps through the org registry token (CI auto-detects it); once the package exists on npm, configure it as a Trusted Publisher and never re-add a token to `ci.yml`. Manual `pnpm publish` is forbidden — it bypasses provenance and the tag guard. +- **Git** — `main`-only; branch → PR → squash-merge → tag the merge commit. The tag is the only manual step; release automation (version bump, `CHANGELOG.md`, npm publish, GitHub release) is owned by [`coroboros/ci`](https://github.com/coroboros/ci). Never hand-edit `package.json` version or `CHANGELOG.md`. Run `pnpm lint && pnpm typecheck && pnpm test && pnpm build` before tagging. diff --git a/package.json b/package.json index fb85ea3..9c7393e 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,7 @@ "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", + "bench": "pnpm build && node bench/uri.bench.mjs", "prepublishOnly": "pnpm lint && pnpm typecheck && pnpm test && pnpm build" }, "keywords": [ @@ -78,6 +79,7 @@ "@types/node": "^22.0.0", "@vitest/coverage-v8": "^4.1.6", "fast-check": "^4.8.0", + "mitata": "^1.0.34", "tsdown": "^0.22.0", "typescript": "^6.0.3", "vitest": "^4.1.6" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b1e920a..7cc9b3e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,9 @@ importers: fast-check: specifier: ^4.8.0 version: 4.8.0 + mitata: + specifier: ^1.0.34 + version: 1.0.34 tsdown: specifier: ^0.22.0 version: 0.22.0(typescript@6.0.3) @@ -526,6 +529,9 @@ packages: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} + 
mitata@1.0.34: + resolution: {integrity: sha512-Mc3zrtNBKIMeHSCQ0XqRLo1vbdIx1wvFV9c8NJAiyho6AjNfMY8bVhbS12bwciUdd1t4rj8099CH3N3NFahaUA==} + nanoid@3.3.12: resolution: {integrity: sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ==} engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} @@ -1151,6 +1157,8 @@ snapshots: dependencies: semver: 7.8.0 + mitata@1.0.34: {} + nanoid@3.3.12: {} obug@2.1.1: {} From 772d32d9c7c15fcef9f56f7231f19bb8339c30d0 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 21:27:56 +0700 Subject: [PATCH 13/21] docs: document reference resolution and RFC compliance Swaps the placeholder branch badge for the CI status badge. Adds API reference for resolveURI and removeDotSegments, and a Compliance section stating the RFCs implemented, the behavior worth knowing (empty query/fragment, strict ports, last-delimiter authority split, case-insensitive percent hex, IPv6 %25 zones, Sitemap escaping), and the non-goals (no WHATWG leniency, no RFC 5952 canonicalization). The `lowercase` option notes are corrected: only scheme and host are lowercased for RFC normalization; lowercasing path/query/fragment is a Sitemap convenience, not RFC behavior. --- README.md | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 62c43aa..54fcea9 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Parses any URI per RFC-3986, including IDNs via Punycode. Validates IPs, domains, URIs, HTTP(S) URLs, and Sitemap URLs. Encodes and decodes URI strings and components. 
[![npm](https://img.shields.io/npm/v/@coroboros/uri?style=flat-square&color=000000)](https://www.npmjs.com/package/@coroboros/uri) -[![branch](https://img.shields.io/badge/branch-stable-000000?style=flat-square)](https://github.com/coroboros/uri) +[![ci](https://img.shields.io/github/actions/workflow/status/coroboros/uri/ci.yml?branch=main&style=flat-square&label=ci&color=000000)](https://github.com/coroboros/uri/actions/workflows/ci.yml) [![license](https://img.shields.io/badge/license-MIT-000000?style=flat-square)](https://opensource.org/licenses/MIT) [![stars](https://img.shields.io/github/stars/coroboros/uri?style=flat-square&label=stars&color=000000)](https://github.com/coroboros/uri) [![coroboros.com](https://img.shields.io/badge/coroboros.com-000000?style=flat-square&logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9IndoaXRlIiBzdHJva2Utd2lkdGg9IjIiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIgc3Ryb2tlLWxpbmVqb2luPSJyb3VuZCI+PGNpcmNsZSBjeD0iMTIiIGN5PSIxMiIgcj0iMTAiLz48cGF0aCBkPSJNMiAxMmgyME0xMiAyYTE1LjMgMTUuMyAwIDAgMSA0IDEwIDE1LjMgMTUuMyAwIDAgMS00IDEwIDE1LjMgMTUuMyAwIDAgMS00LTEwIDE1LjMgMTUuMyAwIDAgMSA0LTEweiIvPjwvc3ZnPg==)](https://coroboros.com) @@ -25,6 +25,7 @@ Parses any URI per RFC-3986, including IDNs via Punycode. Validates IPs, domains - [Usage](#usage) - [API](#api) - [Errors](#errors) +- [Compliance](#compliance) - [Contributing](#contributing) - [License](#license) @@ -369,6 +370,75 @@ recomposeURI({ }); // 'foo://[fe80::7:8%eth0]/over/there' ``` +### resolveURI(base, reference) + +Resolve a URI reference against a base URI. + +Implements the **RFC-3986 §5.2** algorithm: the §5.2.2 strict transform, the §5.2.3 merge, and §5.2.4 remove_dot_segments, then recomposes the target per §5.3. + +The empty string is returned if the base is not absolute or an argument is not a string. + +**Rules**: + +1. 
base must be an absolute URI — a scheme is required (**RFC-3986 §5.2.1**); +2. reference may be absolute or relative; +3. both arguments must be strings. + +**Note**: + +- the strict algorithm is used: a reference scheme equal to the base scheme is not ignored. + +
+ +**Based on**: + +- __RFC-3986 §5.2__. + +
+ +- `base`* **<String>** An absolute base URI. +- `reference`* **<String>** The URI reference to resolve. +- Returns: **<String>** + 
+ +**Examples**: + +```javascript +resolveURI('http://a/b/c/d;p?q', '../../g'); // 'http://a/g' + +resolveURI('https://example.com/a/b', './c?x#y'); // 'https://example.com/a/c?x#y' + +resolveURI('/not-absolute', 'g'); // '' (base is not absolute) +``` + +### removeDotSegments(path) + +Remove the special `.` and `..` complete path segments from a path. + +Implements the **RFC-3986 §5.2.4** algorithm verbatim. + +
+ +**Based on**: + +- __RFC-3986 §5.2.4__. + +
+ +- `path`* **<String>** The path to normalize. +- Returns: **<String>** + 
+ +**Examples**: + +```javascript +removeDotSegments('/a/b/c/./../../g'); // '/a/g' + +removeDotSegments('mid/content=5/../6'); // 'mid/6' +``` + ### isDomainLabel(label) Test a label is a valid domain label according to **RFC-1034**. @@ -925,7 +995,7 @@ Encode an URI string according to **RFC-3986** with basic checking. - native function `encodeURI` also encodes scheme and host that cannot have percend-encoded characters; - characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase. +- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization.
@@ -999,7 +1069,7 @@ Uses __[a fixed encodeURI function](#encodeuristringuri-options)__ to be **RFC-3 - native function `encodeURI` also encodes scheme and host that cannot have percend-encoded characters; - characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase. +- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization.
@@ -1079,7 +1149,7 @@ Uses __[a fixed encodeURI function](#encodeuristringuri-options)__ to be **RFC-3 - native function `encodeURI` also encodes scheme and host that cannot have percend-encoded characters; - characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase. +- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization.
@@ -1188,7 +1258,7 @@ Decode an URI string according to **RFC-3986** with basic checking. - if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; - native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase; +- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization; - to only use with [encodeURIString](#encodeuristringuri-options).
@@ -1255,7 +1325,7 @@ Uses __[a fixed decodeURI function](#decodeuristringuri-options)__ to be **RFC-3 - if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; - native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase; +- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization; - to only use with [encodeWebURL](#encodeweburluri-options).
@@ -1328,7 +1398,7 @@ Uses __[a fixed decodeURI function](#decodeuristringuri-options)__ to be **RFC-3 - if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; - native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase; +- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization; - to only use with [encodeSitemapURL](#encodesitemapurluri).
@@ -1516,6 +1586,31 @@ Errors emitted by *@coroboros/uri* are native URIError with an additional *code* +## Compliance + +*@coroboros/uri* implements: + +- **RFC-3986** — generic URI syntax: parse, recompose, reference resolution (§5.2), percent-encoding, and validation. +- **RFC-3987** — IDNs via Punycode, through Node's `node:url` (`domainToASCII` / `domainToUnicode`). +- **RFC 6874** — IPv6 zone identifiers in a URI. +- **RFC 1034** / **RFC 1123** — domain name rules. +- **sitemaps.org** — the Sitemap protocol for Sitemap URLs. + +**Behavior worth knowing**: + +- a present-but-empty query or fragment (a bare `?` or `#`) is preserved and round-trips, distinct from an absent one (**RFC-3986 §5.3**); +- a port must be a string of ASCII digits (**RFC-3986 §3.2.3**) — values like `0x1F` are rejected; +- `userinfo` is delimited by the last `@`, and a non-IPv6 host/port by the last `:` (**RFC-3986 §3.2**); +- percent-encoding hex is case-insensitive: `%3a` and `%3A` are both accepted (**RFC-3986 §6.2.2.1**); +- inside a URI, an IPv6 zone identifier must use the `%25` delimiter (**RFC 6874**); the standalone `isIPv6` validator stays lenient; +- `encodeSitemapURL` escapes all five XML entities `& ' " < >`, and a Sitemap URL must be shorter than 2,048 characters (sitemaps.org). For example, `encodeSitemapURL('http://example.com/a&b<c>d')` returns `'http://example.com/a&amp;b&lt;c&gt;d'`. + +**Non-goals**: + +- this is a strict **RFC-3986** toolkit, not a WHATWG URL parser — it does not apply WHATWG host/IPv4 leniency; +- it does not canonicalize IPv6 addresses to **RFC 5952** form; +- the `lowercase` option lowercases the entire input including the path, query and fragment, which are case-sensitive per **RFC-3986 §6.2.2.1** — so `lowercase` is a Sitemap/convenience option, not RFC normalization. By default only the scheme and host are lowercased, which is the RFC-compliant behavior. + ## Contributing Bug reports and PRs welcome. 
From bff2cb1235e4aacfd4d1a8f1da120ffb3ba26bef Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 21:31:54 +0700 Subject: [PATCH 14/21] test: add the mitata benchmark and the 1.0.0 baseline bench/uri.bench.mjs measures parse, validate, encode/decode, IP and reference-resolution throughput across representative URI shapes, shown next to native URL for scale (a different, WHATWG model). bench/baseline.md records the 1.0.0 numbers, the bundle size, and the going-forward budget: no regression > 10 % on any bucket at a fixed feature set. --- bench/baseline.md | 79 +++++++++++++++++++++++++++++++++++++ bench/uri.bench.mjs | 95 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 bench/baseline.md create mode 100644 bench/uri.bench.mjs diff --git a/bench/baseline.md b/bench/baseline.md new file mode 100644 index 0000000..e351137 --- /dev/null +++ b/bench/baseline.md @@ -0,0 +1,79 @@ +# Benchmark baseline + +Apple M1, Node 22.22.2. Run `pnpm bench` to reproduce. + +Native `URL` is shown for scale only. It implements the WHATWG URL model, +not strict RFC-3986: it applies host/IPv4 leniency, default-port stripping +and mandatory IDNA that this toolkit deliberately does not. The columns are +not equivalent — `@coroboros/uri` trades raw speed for RFC-3986 fidelity, +explicit validation with coded errors, and zero runtime dependencies. 
+ +## 1.0.0 + +### parse — `parseURI(uri)` vs `new URL(uri)` + +| Bucket | parseURI | new URL | ratio | +| ------- | --------: | -------: | ----: | +| simple | 979.57 ns | 215.6 ns | 4.5x | +| typical | 1.22 µs | 377.7 ns | 3.2x | +| idn | 2.30 µs | 707.7 ns | 3.3x | +| ipv6 | 1.57 µs | 345.1 ns | 4.6x | +| long | 1.27 µs | 502.0 ns | 2.5x | + +### validate — `checkWebURL(uri)` vs `URL.canParse(uri)` + +| Bucket | checkWebURL | URL.canParse | ratio | +| ------- | ----------: | -----------: | -----: | +| simple | 2.00 µs | 128.9 ns | 15.5x | +| typical | 3.69 µs | 199.5 ns | 18.5x | +| idn | 4.53 µs | 657.5 ns | 6.9x | +| ipv6 | 3.04 µs | 232.6 ns | 13.1x | +| long | 19.43 µs | 204.9 ns | 94.8x | + +`checkWebURL` does full RFC-3986 character validation per component plus +IP/domain checks; `URL.canParse` only attempts a WHATWG parse. The `long` +bucket is a 40-segment, 320-character path with a 40-pair, 160-character +query — the per-character validation is linear in input length by design. + +### encode / decode / recompose · typical + +| Operation | avg/iter | +| -------------- | -------: | +| `recomposeURI` | 1.55 µs | +| `decodeWebURL` | 2.94 µs | +| `encodeWebURL` | 2.97 µs | + +### ip · reference resolution + +| Operation | avg/iter | +| -------------------------- | -------: | +| `isIP` ipv4 | 32.4 ns | +| `isIP` reject | 74.9 ns | +| `isIP` ipv6 | 177.7 ns | +| `removeDotSegments` | 257.3 ns | +| `resolveURI` | 453.9 ns | + +## Bundle size + +| Format | Raw | Gzip | +| ------ | -------: | --------: | +| ESM | 55.75 kB | 12.06 kB | +| CJS | 56.44 kB | 12.18 kB | + +## Why slower than native `URL` + +`new URL` is C++-backed and lossy by design: it normalizes, strips default +ports, and discards the empty-vs-absent component distinction. This toolkit +runs a JavaScript RFC-3986 grammar, preserves every component exactly, +validates each component's characters against the RFC tables, and resolves +references through the verbatim §5.2 algorithm. 
The gap is the cost of +fidelity and zero dependencies, not of unoptimised code — the hot regexps +are compiled once at module load. + +## Going-forward target + +**No regression > 10 % on any bucket at fixed feature set.** A +string-grammar parser has more V8 inline-cache volatility than a tight +numeric loop; the bar is loose enough to absorb it without flapping CI. +Feature additions that legitimately cost time reset the bar for the +buckets they affect. diff --git a/bench/uri.bench.mjs b/bench/uri.bench.mjs new file mode 100644 index 0000000..eca9c4c --- /dev/null +++ b/bench/uri.bench.mjs @@ -0,0 +1,95 @@ +/** + * Micro-benchmark for @coroboros/uri over representative URI shapes. + * + * Usage (from the package root): + * pnpm build && node bench/uri.bench.mjs + * + * Compares the in-package functions against the native field: + * - new URL() (throwing, WHATWG) + * - URL.canParse() (boolean, WHATWG) + * + * The native URL is a different model (WHATWG, not strict RFC-3986); it is + * shown for scale only, not as an equivalence. 
+ */ +import { bench, do_not_optimize, group, run, summary } from 'mitata'; +import { + checkWebURL, + decodeWebURL, + encodeWebURL, + isIP, + parseURI, + recomposeURI, + removeDotSegments, + resolveURI, +} from '../dist/index.mjs'; + +const URIS = { + simple: 'http://example.com/', + typical: 'https://user:pass@example.com:8080/over/there?name=ferret&x=1#nose', + idn: 'https://中文.example.com/over/there?name=ferret#nose', + ipv6: 'http://[2001:db8::1]:8080/over/there?name=ferret#nose', + long: `https://example.com/${'segment/'.repeat(40)}?${'k=v&'.repeat(40)}#end`, +}; + +for (const [label, uri] of Object.entries(URIS)) { + group(`parse · ${label}`, () => { + summary(() => { + bench('parseURI', () => { + do_not_optimize(parseURI(uri)); + }); + bench('new URL', () => { + do_not_optimize(new URL(uri)); + }); + }); + }); +} + +for (const [label, uri] of Object.entries(URIS)) { + group(`validate · ${label}`, () => { + summary(() => { + bench('checkWebURL', () => { + try { + do_not_optimize(checkWebURL(uri)); + } catch {} + }); + bench('URL.canParse', () => { + do_not_optimize(URL.canParse(uri)); + }); + }); + }); +} + +group('encode / decode · typical', () => { + bench('encodeWebURL', () => { + do_not_optimize(encodeWebURL(URIS.typical)); + }); + bench('decodeWebURL', () => { + do_not_optimize(decodeWebURL(URIS.typical)); + }); + bench('recomposeURI', () => { + do_not_optimize(recomposeURI(parseURI(URIS.typical))); + }); +}); + +group('ip', () => { + bench('isIP · ipv4', () => { + do_not_optimize(isIP('192.168.1.1')); + }); + bench('isIP · ipv6', () => { + do_not_optimize(isIP('2001:db8::1')); + }); + bench('isIP · reject', () => { + do_not_optimize(isIP('999.999.999.999')); + }); +}); + +group('reference resolution', () => { + bench('resolveURI', () => { + do_not_optimize(resolveURI('http://a/b/c/d;p?q', '../../g')); + }); + bench('removeDotSegments', () => { + do_not_optimize(removeDotSegments('/a/b/c/./../../g')); + }); +}); + +await run({ colors: true }); From 
37566e81d327b1008b64c2291c83d174098d98bb Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 22:47:20 +0700 Subject: [PATCH 15/21] fix: enforce the RFC 6874 ZoneID grammar in URI hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit checkURI accepted an empty or malformed IPv6 zone identifier: the "%25" delimiter was verified but the ZoneID after it was not. RFC 6874 §2 defines ZoneID = 1*( unreserved / pct-encoded ), so an empty zone ([fe80::1%25]) or out-of-set bytes are invalid in a URI. Reject both; valid zones such as [fe80::1%25eth0] are unaffected. Stricter host validation — enumerated as a pre-1.0.0 change in PR #10. --- src/checkers/index.ts | 23 ++++++++++++++++++++--- tests/checkers.test.ts | 9 +++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/checkers/index.ts b/src/checkers/index.ts index 9a1309f..e5c0ce5 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -45,6 +45,9 @@ export interface CheckedURISyntax extends ParsedURI { type CharChecker = (char: string, encode?: boolean) => boolean; +// RFC 6874 §2: ZoneID = 1*( unreserved / pct-encoded ). Compiled once. 
+const ipv6ZoneIdRegexp = /^(?:[A-Za-z0-9._~-]|%[0-9A-Fa-f]{2})+$/; + /** * @func checkPercentEncoding * @@ -328,12 +331,26 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { } // RFC 6874: an IPv6 zone identifier in a URI MUST use the percent-encoded - // "%25" delimiter; a bare "%" is invalid in URI context + // "%25" delimiter and the ZoneID must be a non-empty 1*( unreserved / pct-encoded ) if (is(String, host) && host.includes(':')) { const zoneAt = host.indexOf('%'); - if (zoneAt !== -1 && host.slice(zoneAt, zoneAt + 3) !== '%25') { - fail('URI_INVALID_HOST', `IPv6 zone identifier must use the '%25' delimiter, got '${host}'`); + if (zoneAt !== -1) { + const zoneId = host.slice(zoneAt + 3); + + if (host.slice(zoneAt, zoneAt + 3) !== '%25') { + fail( + 'URI_INVALID_HOST', + `IPv6 zone identifier must use the '%25' delimiter, got '${host}'`, + ); + } + + if (zoneId === '' || !ipv6ZoneIdRegexp.test(zoneId)) { + fail( + 'URI_INVALID_HOST', + `IPv6 zone identifier must be a non-empty RFC 6874 ZoneID, got '${host}'`, + ); + } } } diff --git a/tests/checkers.test.ts b/tests/checkers.test.ts index 3e2196a..749eb67 100644 --- a/tests/checkers.test.ts +++ b/tests/checkers.test.ts @@ -689,6 +689,15 @@ describe('#checkers', () => { expect(() => checkWebURL('http://[fe80::1%25eth0]/')).not.toThrow(); }); + // RFC 6874 §2: ZoneID = 1*( unreserved / pct-encoded ) — the zone must + // be non-empty and restricted to that set after the %25 delimiter. 
+ it('should reject an empty or malformed RFC 6874 ZoneID in a URI host', () => { + expectThrowWithCode(() => checkURISyntax('http://[fe80::1%25]/'), 'URI_INVALID_HOST'); + expectThrowWithCode(() => checkURI('http://[fe80::1%25]/'), 'URI_INVALID_HOST'); + expectThrowWithCode(() => checkURI('http://[fe80::1%25e*0]/'), 'URI_INVALID_HOST'); + expect(() => checkURI('http://[fe80::1%251]/')).not.toThrow(); + }); + it('should not throw if an uri has at least a scheme and a path', () => { expect(() => checkURISyntax('http://example.com')).not.toThrow(); expect(() => checkURISyntax('http://example.com/path')).not.toThrow(); From a017185b8f6449eeb93a2b27d3f805a4f45a4066 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 22:49:27 +0700 Subject: [PATCH 16/21] test: cover empty port and deep dot-segment paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an explicit RFC-3986 §3.2.3 empty-port case (port = *DIGIT): 'http://example.com:/path' keeps the port present-but-empty (''), distinct from an absent port (null), and is not an error. Strengthen the removeDotSegments property generator with up to eight leading '../' so the §5.2.4 climb-above-root path is exercised; idempotence and the no-dot-segment invariant still hold. --- tests/parser.test.ts | 14 ++++++++++++++ tests/uri.property.test.ts | 8 ++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/parser.test.ts b/tests/parser.test.ts index cec74ca..3e3580b 100644 --- a/tests/parser.test.ts +++ b/tests/parser.test.ts @@ -584,6 +584,20 @@ describe('#parser', () => { expect(both).toHaveProperty('fragment', ''); expect(both).toHaveProperty('href', 'http://example.com/?#'); }); + + // RFC-3986 §3.2.3: port = *DIGIT, so an empty port (zero digits) is + // syntactically valid — present-but-empty ('') and distinct from an + // absent port (null), not an error. 
+ it('should keep an empty port present-but-empty, distinct from absent (RFC-3986 §3.2.3)', () => { + const emptyPort = parseURI('http://example.com:/path'); + expect(emptyPort).toHaveProperty('port', ''); + expect(emptyPort).toHaveProperty('host', 'example.com'); + expect(emptyPort).toHaveProperty('href', 'http://example.com:/path'); + + const absentPort = parseURI('http://example.com/path'); + expect(absentPort).toHaveProperty('port', null); + expect(absentPort).toHaveProperty('href', 'http://example.com/path'); + }); }); describe('when using recomposeURI', () => { diff --git a/tests/uri.property.test.ts b/tests/uri.property.test.ts index 29c9400..580c758 100644 --- a/tests/uri.property.test.ts +++ b/tests/uri.property.test.ts @@ -37,8 +37,12 @@ describe('#uri — property tests', () => { it('removeDotSegments is idempotent and leaves no . or .. complete segment', () => { const segment = fc.constantFrom('a', 'b', '.', '..', 'c', 'd'); const path = fc - .array(segment, { maxLength: 12 }) - .map((segs) => (fc.sample(fc.boolean(), 1)[0] ? `/${segs.join('/')}` : segs.join('/'))); + .tuple(fc.nat({ max: 8 }), fc.array(segment, { maxLength: 12 }), fc.boolean()) + .map(([climb, segs, absolute]) => { + const body = `${'../'.repeat(climb)}${segs.join('/')}`; + + return absolute ? `/${body}` : body; + }); fc.assert( fc.property(path, (p) => { From f44665a8bdaf516d9e2a83f79b930c4f45d6afbf Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 22:52:19 +0700 Subject: [PATCH 17/21] docs: cite RFC sections in the compliance reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map every RFC-3986 operation to its section in the Compliance section — parse (Appendix B), recompose (§5.3), reference resolution (§5.2), percent-encoding (§2.1, §6.2.2.1), character validation (§3.1–§3.5). Tighten the RFC 6874 entry to the §2 ZoneID grammar now enforced, and document that resolveURI ignores a fragment on the base (RFC-3986 §5.1). 
--- README.md | 9 +++++---- src/resolver/index.ts | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 54fcea9..afd22c9 100644 --- a/README.md +++ b/README.md @@ -386,7 +386,8 @@ The empty string is returned if the base is not absolute or an argument is not a **Note**: -- the strict algorithm is used: a reference scheme equal to the base scheme is not ignored. +- the strict algorithm is used: a reference scheme equal to the base scheme is not ignored; +- a fragment on the base is ignored — the base is used stripped of any fragment (**RFC-3986 §5.1**).
@@ -1590,9 +1591,9 @@ Errors emitted by *@coroboros/uri* are native URIError with an additional *code* *@coroboros/uri* implements: -- **RFC-3986** — generic URI syntax: parse, recompose, reference resolution (§5.2), percent-encoding, and validation. +- **RFC-3986** — generic URI syntax: parse (Appendix B), recompose (§5.3), reference resolution (§5.2), percent-encoding (§2.1, §6.2.2.1), and character validation (§3.1–§3.5). - **RFC-3987** — IDNs via Punycode, through Node's `node:url` (`domainToASCII` / `domainToUnicode`). -- **RFC 6874** — IPv6 zone identifiers in a URI. +- **RFC 6874 §2** — IPv6 zone identifiers in a URI (the `%25` delimiter and `ZoneID = 1*( unreserved / pct-encoded )`). - **RFC 1034** / **RFC 1123** — domain name rules. - **sitemaps.org** — the Sitemap protocol for Sitemap URLs. @@ -1602,7 +1603,7 @@ Errors emitted by *@coroboros/uri* are native URIError with an additional *code* - a port must be a string of ASCII digits (**RFC-3986 §3.2.3**) — values like `0x1F` are rejected; - `userinfo` is delimited by the last `@`, and a non-IPv6 host/port by the last `:` (**RFC-3986 §3.2**); - percent-encoding hex is case-insensitive: `%3a` and `%3A` are both accepted (**RFC-3986 §6.2.2.1**); -- inside a URI, an IPv6 zone identifier must use the `%25` delimiter (**RFC 6874**); the standalone `isIPv6` validator stays lenient; +- inside a URI, an IPv6 zone identifier must use the `%25` delimiter and a non-empty `ZoneID` of `unreserved` / `pct-encoded` characters (**RFC 6874 §2**); the standalone `isIPv6` validator stays lenient; - `encodeSitemapURL` escapes all five XML entities `& ' " < >`, and a Sitemap URL must be shorter than 2,048 characters (sitemaps.org). For example, `encodeSitemapURL('http://example.com/a&b<c>d')` returns `'http://example.com/a&amp;b&lt;c&gt;d'`. 
**Non-goals**: diff --git a/src/resolver/index.ts b/src/resolver/index.ts index cb96673..8c10583 100644 --- a/src/resolver/index.ts +++ b/src/resolver/index.ts @@ -152,7 +152,9 @@ const recompose = function recompose(target: Reference): string { * and recomposing per §5.3. * * The base must be an absolute URI (a scheme is required, RFC-3986 §5.2.1); - * the empty string is returned if base or reference is invalid. + * a fragment on the base is ignored (RFC-3986 §5.1: the base is used + * stripped of any fragment); the empty string is returned if base or + * reference is invalid. * * Based on: * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5.2. From 552edf3499c1509a3a1d0ab5edfabe476a9c5345 Mon Sep 17 00:00:00 2001 From: ob-aion Date: Tue, 19 May 2026 22:52:40 +0700 Subject: [PATCH 18/21] chore: ignore the local .claude/output directory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8a844f0..7bd9bcf 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .wrangler/ .dev.vars .claude/settings.local.json +.claude/output/ CLAUDE.local.md dist *.zip From ead850b0f0e79d5b29bc72ad871deff9cf9d25bf Mon Sep 17 00:00:00 2001 From: ob-aion Date: Wed, 20 May 2026 11:37:19 +0700 Subject: [PATCH 19/21] ci: forward the npm token secrets needed for the first publish npm exposes no pre-publish Trusted Publisher form for a not-yet-existing scoped package, so the first 1.0.0 tag must publish via the org NPM_PACKAGE_REGISTRY_TOKEN. Forward NPM_EXTRA_CONFIG and NPM_PACKAGE_REGISTRY_TOKEN through to the reusable workflow; it auto-detects the token and routes the publish via npm token. Once 1.0.0 is live on npm and @coroboros/uri is configured as a Trusted Publisher of coroboros/uri (workflow ci.yml, environment empty), both secret lines will be dropped in a follow-up so 1.0.1+ publishes via OIDC + --provenance. 
--- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca5face..592e6c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,5 +22,7 @@ jobs: uses: coroboros/ci/.github/workflows/javascript-npm-packages.yml@v0 secrets: NPM_CONFIG_FILE: ${{ secrets.NPM_CONFIG_FILE }} + NPM_EXTRA_CONFIG: ${{ secrets.NPM_EXTRA_CONFIG }} NPM_PACKAGE_REGISTRY: ${{ secrets.NPM_PACKAGE_REGISTRY }} NPM_PACKAGE_PROXY_REGISTRY: ${{ secrets.NPM_PACKAGE_PROXY_REGISTRY }} + NPM_PACKAGE_REGISTRY_TOKEN: ${{ secrets.NPM_PACKAGE_REGISTRY_TOKEN }} From 6ba685439521fd9bfbcdf56d080dd5f87ffc23fc Mon Sep 17 00:00:00 2001 From: ob-aion Date: Wed, 20 May 2026 12:06:04 +0700 Subject: [PATCH 20/21] docs: restructure the README and surface the full RFC stack API section: each public function lives in its own
collapsible under a topical sub-header (Punycode, Parsing, Reference resolution, Validators, Checkers, Encoders, Decoders). Parameter blocks use markdown tables. The exported types (ParsedURI, URIComponents, CheckedURI) get their own entries. Compliance now sits between Usage and API, with the URI grammar diagrams shown once. A new Limitations section gathers the behavior caveats and non-goals as a single flat bullet list, each bullet RFC-cited. Tagline, package.json description, and keywords surface the full RFC stack: IDN (RFC-3987), IPv6 zone identifiers (RFC 6874), domain rules (RFC 1034 / 1123), and the Sitemap protocol. --- README.md | 1834 ++++++++++++++++---------------------------------- package.json | 14 +- 2 files changed, 608 insertions(+), 1240 deletions(-) diff --git a/README.md b/README.md index afd22c9..b3eb286 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ # @coroboros/uri -**RFC-3986 compliant, zero-dependency URI toolkit for Node.js.** +**RFC-3986 URI toolkit for Node.js. IDN (RFC-3987), IPv6 zone identifiers (RFC 6874), Sitemap protocol. Zero dependencies.** -Parses any URI per RFC-3986, including IDNs via Punycode. Validates IPs, domains, URIs, HTTP(S) URLs, and Sitemap URLs. Encodes and decodes URI strings and components. +Parses URIs per **RFC-3986 Appendix B**. Recomposes per §5.3. Resolves references per §5.2. Validates IPs, domains (**RFC 1034 / 1123**), HTTP(S) URLs, and Sitemap URLs. Encodes and decodes URI strings and components. [![npm](https://img.shields.io/npm/v/@coroboros/uri?style=flat-square&color=000000)](https://www.npmjs.com/package/@coroboros/uri) [![ci](https://img.shields.io/github/actions/workflow/status/coroboros/uri/ci.yml?branch=main&style=flat-square&label=ci&color=000000)](https://github.com/coroboros/uri/actions/workflows/ci.yml) @@ -23,9 +23,10 @@ Parses any URI per RFC-3986, including IDNs via Punycode. 
Validates IPs, domains - [Requirements](#requirements) - [Install](#install) - [Usage](#usage) +- [Compliance](#compliance) - [API](#api) - [Errors](#errors) -- [Compliance](#compliance) +- [Limitations](#limitations) - [Contributing](#contributing) - [License](#license) @@ -39,12 +40,15 @@ Parses any URI per RFC-3986, including IDNs via Punycode. Validates IPs, domains ```bash pnpm add @coroboros/uri ``` + ```bash npm install @coroboros/uri ``` + ```bash yarn add @coroboros/uri ``` + ```bash bun add @coroboros/uri ``` @@ -54,149 +58,214 @@ bun add @coroboros/uri ```ts // ESM (recommended) import { parseURI, checkHttpsURL, encodeWebURL } from '@coroboros/uri'; - -parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); -checkHttpsURL('https://example.com/path?q=1#x'); -encodeWebURL('https://www.中文.com./Over There?a=B#Anchôr'); ``` ```js // CommonJS -const { parseURI } = require('@coroboros/uri'); +const { parseURI, checkHttpsURL, encodeWebURL } = require('@coroboros/uri'); ``` +```ts +import { parseURI, checkHttpsURL, encodeWebURL } from '@coroboros/uri'; + +// Parse — get every RFC-3986 component +parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); +// { scheme: 'foo', host: 'xn--fiq228c.com', hostPunydecoded: '中文.com', port: 8042, … } + +// Validate strictly — throws URIError with a stable code on invalid input +try { + const url = checkHttpsURL('https://example.com/path?q=1#x'); + url.valid; // true +} catch (err) { + // err.code is one of the documented codes (see [Errors](#errors)) +} + +// Encode — RFC-3986 compliant, IDN-aware, sub-2048-char HTTP(S) +encodeWebURL('https://www.中文.com./Over There?a=B#Anchôr'); +// 'https://www.xn--fiq228c.com./Over%20There?a=B#Anch%C3%B4r' +``` + +## Compliance + +`@coroboros/uri` implements: + +- **RFC-3986** — generic URI syntax: parse (Appendix B), recompose (§5.3), reference resolution (§5.2), percent-encoding (§2.1, §6.2.2.1), and character validation (§3.1–§3.5). 
+- **RFC-3987** — Internationalized Domain Names via Punycode, through Node's `node:url` (`domainToASCII` / `domainToUnicode`). +- **RFC 6874 §2** — IPv6 zone identifiers inside a URI: the `%25` delimiter and `ZoneID = 1*( unreserved / pct-encoded )` grammar. +- **RFC 1034 / RFC 1123** — domain-name rules: label length, character set, label separation. +- **sitemaps.org** — the Sitemap protocol: required XML-entity escaping and the 2,048-character URL ceiling. + +**Generic URI syntax** + +![URI Syntax](assets/uri-syntax.png "URI Syntax") + +**Example URIs** + +![RFC-3986](assets/rfc-3986.png "RFC-3986") + ## API -### punycode(domain) +### Types -Returns the Punycode ASCII serialization of the domain. If domain is an invalid domain, the empty string is returned. +
+ParsedURI -**Note**: +
-- native function `url.domainToASCII` does not support IPv6 only IPv4; -- native function `url.domainToASCII` throws if no domain is provided or returns `null`, `undefined`, `nan` for `null`, `undefined` or `NaN` values which is not what to be expected. +Return shape of [`parseURI`](#parsing). -
+```ts +interface ParsedURI { + scheme: string | null; + authority: string | null; + authorityPunydecoded: string | null; + userinfo: string | null; + host: string | null; + hostPunydecoded: string | null; + port: number | string | null; + path: string | null; + pathqf: string | null; + query: string | null; + fragment: string | null; + href: string | null; +} +``` -- `domain` **** -- Returns: **** +Fields default to `null` when the corresponding URI part is missing. `port` is a `number` when parseable as an integer, a `string` otherwise. -
+
-**Examples**: +
+URIComponents -```javascript -punycode(); // '' -punycode('a.b.c.d.e.fg'); // 'a.b.c.d.e.fg' -punycode('xn--iñvalid.com'); // '' -punycode('中文.com'); // 'xn--fiq228c.com' -punycode('xn--fiq228c.com'); // 'xn--fiq228c.com' -punycode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' -punycode('127.0.0.1'); // '127.0.0.1' -punycode(undefined|null|NaN); // '' +
+ +Input shape of [`recomposeURI`](#parsing). Every field is optional at the type level, but `scheme` and `path` are required at runtime — `recomposeURI` returns the empty string without them. + +```ts +interface URIComponents { + scheme?: string | null; + userinfo?: string | null; + host?: string | null; + port?: number | string | null; + path?: string | null; + query?: string | null; + fragment?: string | null; +} +``` -### punydecode(domain) +
-Returns the Unicode serialization of the domain. If domain is an invalid domain, the empty string is returned. +
+CheckedURI -**Note**: +
-- native function `url.domainToUnicode` does not support IPv6 only IPv4; -- native function `url.domainToUnicode` throws if no domain is provided or returns `null`, `undefined`, `nan` for `null`, `undefined` or `NaN` values which is not what to be expected. +Return shape of every [`check*`](#checkers) function on success — `ParsedURI` extended with a `valid: true` discriminant. -
+```ts +interface CheckedURI extends ParsedURI { + valid: true; +} +``` -- `domain` **** -- Returns: **** +
-
+### Punycode -**Examples**: +
+punycode(domain) -```javascript -punydecode(); // '' -punydecode('a.b.c.d.e.fg'); // 'a.b.c.d.e.fg' -punydecode('xn--iñvalid.com'); // '' -punydecode('xn--fiq228c.com'); // '中文.com' -punydecode('中文.com'); // '中文.com' -punydecode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' -punydecode('127.0.0.1'); // '127.0.0.1' -punydecode(undefined|null|NaN); // '' +
+ +Returns the Punycode ASCII serialization of a domain. Returns the empty string when the input is not a valid domain. + +**Parameters** + +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `domain` | `string` | *(required)* | The domain to serialize. | + +**Returns** — `string`. The ASCII form (or `''` on invalid input). + +**Notes** + +- Wraps Node's `url.domainToASCII` and normalizes the error case: the native function throws when called without an argument and returns `'null'` / `'undefined'` / `'nan'` for the corresponding non-domain inputs. +- IPv6 literals are passed through unchanged (the native function rejects them). + +**Examples** + +```ts +punycode(); // '' +punycode('a.b.c.d.e.fg'); // 'a.b.c.d.e.fg' +punycode('xn--iñvalid.com'); // '' +punycode('中文.com'); // 'xn--fiq228c.com' +punycode('xn--fiq228c.com'); // 'xn--fiq228c.com' +punycode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' +punycode('127.0.0.1'); // '127.0.0.1' ``` -### parseURI(uri) +
-Parse a string to get URI components. +
+punydecode(domain) -**Support**: +
-- IPv4 and IPv6 hosts; -- Internationalized Domain Name (IDN). +Returns the Unicode serialization of a domain. Returns the empty string when the input is not a valid domain. -**Note**: +**Parameters** -- RegExp from __RFC-3986__; -- scheme and host strings will always be put in lowercase once parsed, as specified in **RFC-3986**; -- authority and its components will be put at null values if authority parsed is missing or empty; -- **prefer using [checkURI](#checkuriuri) to parse and fully check an URI**. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `domain` | `string` | *(required)* | The domain to deserialize. | -
+**Returns** — `string`. The Unicode form (or `''` on invalid input). -**Generic syntax**: +**Notes** -![URI Syntax](assets/uri-syntax.png "URI Syntax") +- Wraps Node's `url.domainToUnicode` and normalizes the same error edges as [`punycode`](#punycodedomain). -
+**Examples** -**Example URIs**: +```ts +punydecode(); // '' +punydecode('xn--fiq228c.com'); // '中文.com' +punydecode('中文.com'); // '中文.com' +punydecode('xn--iñvalid.com'); // '' +punydecode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' +punydecode('127.0.0.1'); // '127.0.0.1' +``` -![RFC-3986](assets/rfc-3986.png "RFC-3986") +
-
+### Parsing -**Based on**: +
+parseURI(uri) -- __RFC-3986__. +
-
+Parses a URI into its **RFC-3986 Appendix B** components, with IPv4/IPv6 host support and IDN (Punycode) awareness. -- `uri` **** -- Returns: **** - - `scheme` **** The URI scheme. *Default*: `null` - - `authority` **** The URI authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URI authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URI userinfo. *Default*: `null` - - `host` **** The URI authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URI authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URI authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URI path. *Default*: `null` - - `pathqf` **** The URI path, query and fragment. *Default*: `null` - - `query` **** The URI query. *Default*: `null` - - `fragment` **** The URI fragment. *Default*: `null` - - `href` **** The URI recomposed. See __[recomposeURI](#recomposeuricomponents)__. *Default*: `null` +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI string to parse. | -**Examples**: +**Returns** — [`ParsedURI`](#types). -```javascript -parseURI(); -// { -// scheme: null, -// authority: null, -// authorityPunydecoded: null, -// userinfo: null, -// host: null, -// hostPunydecoded: null, -// port: null, -// path: null, -// pathqf: null, -// query: null, -// fragment: null, -// href: null, -// } +**Notes** +- Scheme and host are lowercased per **RFC-3986 §6.2.2.1**. +- Authority and its components are `null` when the authority is absent or empty. +- A present-but-empty query or fragment (`?` or `#` with nothing after) is preserved as `''`, distinct from a missing one (`null`). +- For strict validation, prefer [`checkURI`](#checkers). + +**Examples** + +```ts parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); // { // scheme: 'foo', @@ -213,132 +282,45 @@ parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); // href: 'foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose', // } -parseURI('foo://user:pass@中文.com:80g42/over/there?name=ferret#nose'); -// { -// scheme: 'foo', -// authority: 'user:pass@xn--fiq228c.com:80g42', -// authorityPunydecoded: 'user:pass@中文.com:80g42', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: '80g42', -// path: '/over/there', -// pathqf: '/over/there?name=ferret#nose', -// query: 'name=ferret', -// fragment: 'nose', -// href: 'foo://user:pass@xn--fiq228c.com:80g42/over/there?name=ferret#nose', -// } - parseURI('urn:isbn:0-486-27557-4'); -// { -// scheme: 'urn', -// authority: null, -// authorityPunydecoded: null, -// userinfo: null, -// host: null, -// hostPunydecoded: null, -// port: null, -// path: 'isbn:0-486-27557-4', -// pathqf: 'isbn:0-486-27557-4', -// query: null, -// fragment: null -// href: 'urn:isbn:0-486-27557-4', -// } +// { scheme: 'urn', authority: 
null, path: 'isbn:0-486-27557-4', href: 'urn:isbn:0-486-27557-4', … } parseURI('http://user:pass@[fe80::7:8%eth0]:8080'); -// { -// scheme: 'http', -// authority: 'user:pass@[fe80::7:8%eth0]:8080', -// authorityPunydecoded: 'user:pass@[fe80::7:8%eth0]:8080', -// userinfo: 'user:pass', -// host: 'fe80::7:8%eth0', -// hostPunydecoded: 'fe80::7:8%eth0', -// port: 8080, -// path: '', -// pathqf: '', -// query: null, -// fragment: null, -// href: 'http://user:pass@[fe80::7:8%eth0]:8080/' -// } +// { scheme: 'http', host: 'fe80::7:8%eth0', port: 8080, path: '', href: 'http://user:pass@[fe80::7:8%eth0]:8080/', … } ``` -### recomposeURI(components) - -Recompose an URI from its components with basic URI checking. - -The empty string is returned if unable to recompose the URI. - -**Rules**: +
-1. scheme is required and must be at least 1 character; -2. path is required and can be empty; -3. if host is present path must be empty or start with `/`; -4. if host is not present path must not start with `//`; -5. host, if any, must be at least 3 characters; -6. userinfo will be ignored if empty; -7. port will be ignored if unable to parse it into an integer between 0 - 65535; -8. query will be ignored if empty; -9. fragment will be ignored if empty. +
+recomposeURI(components) -**Support**: - -- IPv4 and IPv6. - -**Note**: - -- `/` is added to any URI with a host and an empty path. - -
- -**Generic syntax**: - -![URI Syntax](assets/uri-syntax.png "URI Syntax") +
-
+Recomposes a URI from its components per **RFC-3986 §5.3**, with basic validity checking. Returns the empty string when the rules below are not met. -**Based on**: +**Parameters** -- __RFC-3986__. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `components` | [`URIComponents`](#types) | *(required)* | The components to recompose. | -
+**Returns** — `string`. The recomposed URI (or `''` on invalid input). -- `components` ****: - - `scheme`* **** The URI scheme. - - `userinfo` **** The URI userinfo. - - `host` **** The URI authority's host. - - `port` **** The URI authority's port. - - `path`* **** The URI path. - - `query` **** The URI query. - - `fragment` **** The URI fragment. -- Returns: **** +**Notes** -
+- `scheme` is required and must be at least one character. +- `path` is required and may be empty. +- If `host` is present, `path` must be empty or start with `/`. +- If `host` is absent, `path` must not start with `//`. +- `host`, if present, must be at least three characters. +- `userinfo` is ignored when empty. +- `port` is ignored when not parseable as an integer in `0–65535`. +- `query` and `fragment` are ignored when empty. +- A trailing `/` is added to any URI with a host and an empty path. -**Examples**: - -```javascript -recomposeURI(); // '' - -recomposeURI({ - scheme: null, - userinfo: 'user:pass', - host: 'example.com', - port: 8080, - path: null, - query: 'a=b', - fragment: 'anchor', -}); // '' - -recomposeURI({ - scheme: 'foo', - userinfo: null, - host: null, - port: null, - path: '', - query: null, - fragment: null, -}); // 'foo:' +**Examples** +```ts recomposeURI({ scheme: 'foo', userinfo: 'user:pass', @@ -349,6 +331,9 @@ recomposeURI({ fragment: 'anchor', }); // 'foo://user:pass@bar.com:8080/over/there?a=b#anchor' +recomposeURI({ scheme: 'foo', path: '' }); +// 'foo:' + recomposeURI({ scheme: 'foo', userinfo: 'user:pass', @@ -358,1259 +343,635 @@ recomposeURI({ query: 'a=b', fragment: 'anchor', }); // 'foo://user:pass@[fe80::7:8%eth0]:8080/over/there?a=b#anchor' - -recomposeURI({ - scheme: 'foo', - userinfo: '', - host: 'fe80::7:8%eth0', - port: '55g55', - path: '/over/there', - query: '', - fragment: '', -}); // 'foo://[fe80::7:8%eth0]/over/there' ``` -### resolveURI(base, reference) - -Resolve a URI reference against a base URI. - -Implements the **RFC-3986 §5.2** algorithm: the §5.2.2 strict transform, the §5.2.3 merge, and §5.2.4 remove_dot_segments, then recomposes the target per §5.3. - -The empty string is returned if the base is not absolute or an argument is not a string. - -**Rules**: +
-1. base must be an absolute URI — a scheme is required (**RFC-3986 §5.2.1**); -2. reference may be absolute or relative; -3. both arguments must be strings. +### Reference resolution -**Note**: +
+resolveURI(base, reference) -- the strict algorithm is used: a reference scheme equal to the base scheme is not ignored; -- a fragment on the base is ignored — the base is used stripped of any fragment (**RFC-3986 §5.1**). +
-
+Resolves a URI reference against an absolute base URI per **RFC-3986 §5.2**: the §5.2.2 strict transform, the §5.2.3 merge, the §5.2.4 `remove_dot_segments`, then recomposes per §5.3. -**Based on**: +**Parameters** -- __RFC-3986 §5.2__. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `base` | `string` | *(required)* | The absolute base URI. | +| `reference` | `string` | *(required)* | The URI reference to resolve. | -
+**Returns** — `string`. The resolved URI, or `''` when the base is not absolute or an argument is not a string. -- `base`* **** An absolute base URI. -- `reference`* **** The URI reference to resolve. -- Returns: **** +**Notes** -
+- The strict algorithm is used: a reference scheme equal to the base scheme is not ignored. +- A fragment on the base is stripped before resolution per **RFC-3986 §5.1**. -**Examples**: - -```javascript -resolveURI('http://a/b/c/d;p?q', '../../g'); // 'http://a/g' +**Examples** +```ts +resolveURI('http://a/b/c/d;p?q', '../../g'); // 'http://a/g' resolveURI('https://example.com/a/b', './c?x#y'); // 'https://example.com/a/c?x#y' - -resolveURI('/not-absolute', 'g'); // '' (base is not absolute) +resolveURI('/not-absolute', 'g'); // '' — base is not absolute ``` -### removeDotSegments(path) - -Remove the special `.` and `..` complete path segments from a path. +
-Implements the **RFC-3986 §5.2.4** algorithm verbatim. +
+removeDotSegments(path) -
+
-**Based on**: +Removes the `.` and `..` complete path segments from a path per **RFC-3986 §5.2.4** verbatim. -- __RFC-3986 §5.2.4__. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `path` | `string` | *(required)* | The path to normalize. | -- `path`* **** The path to normalize. -- Returns: **** +**Returns** — `string`. The normalized path. -
- -**Examples**: - -```javascript -removeDotSegments('/a/b/c/./../../g'); // '/a/g' +**Examples** +```ts +removeDotSegments('/a/b/c/./../../g'); // '/a/g' removeDotSegments('mid/content=5/../6'); // 'mid/6' ``` -### isDomainLabel(label) +
-Test a label is a valid domain label according to **RFC-1034**. +### Validators -> "Note that while upper and lower case letters are allowed in domain names, no significance is attached to the case. That is, two names with the same spelling but different case are to be treated as if identical." +
+isDomainLabel(label) -By convention uppercased domain label will be considered invalid. +
-**Rules**: +Tests whether a label is a valid domain label per **RFC 1034**. By convention, an uppercased label is considered invalid (`DNS names are case-insensitive, but Coroboros normalizes on lowercase`). -1. "*Labels must be 63 characters or less.*"; -2. can be minimum one character; -3. must only use lowercase letters, digits or hyphens; -4. must not start or end with a hyphen; -5. must not have consecutive hyphens; -6. can start or end with a digit. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `label` | `string` | *(required)* | The label to test. | -**Based on**: +**Returns** — `boolean`. -- __RFC-1034__. +**Notes** -
+- Length is one to 63 characters. +- Allowed characters: lowercase letters, digits, hyphen. +- Cannot start or end with a hyphen. +- No consecutive hyphens. +- Can start or end with a digit. -- `label` **** -- Returns: **** +**Examples** -
- -**Examples**: - -```javascript -isDomainLabel('a'); // true -isDomainLabel('1a3'); // true -isDomainLabel('1-3'); // true -isDomainLabel('1-y'); // true - -isDomainLabel(); // false +```ts +isDomainLabel('a'); // true +isDomainLabel('1a3'); // true isDomainLabel('a'.repeat(64)); // false -isDomainLabel('A'); // false -isDomainLabel('-a'); // false -isDomainLabel('a-'); // false -isDomainLabel('-a'); // false -isDomainLabel('la--bel'); // false -isDomainLabel(undefined|null|NaN); // false +isDomainLabel('A'); // false +isDomainLabel('-a'); // false +isDomainLabel('la--bel'); // false ``` -### isDomain(name) +
-Test a name is a valid domain according to **RFC-1034**. +
+isDomain(name) -Supports Fully-Qualified Domain Name (FQDN) and Internationalized Domain Name (IDN). +
-**Rules**: +Tests whether a name is a valid domain per **RFC 1034**, with FQDN and IDN support. -1. __[labels rules apply](#isdomainlabellabel)__; -2. "*[...] the total number of octets that represent a domain name (i.e., the sum of all label octets and label lengths) is limited to 255.*"; -3. labels are separated by dots ("."); -4. must have at least one extension label; -5. must have labels different from each other; -6. last label can be empty (root label "."); -7. labels can start with `xn--` for IDNs if the ASCII serialization is a valid Punycode **and has valid characters**. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `name` | `string` | *(required)* | The domain to test. | -**Based on**: +**Returns** — `boolean`. -- __RFC-1034__. +**Notes** -
+- [`isDomainLabel`](#validators) rules apply to each label. +- Total length is at most 255 octets including label-length octets. +- Labels are separated by `.`. +- Must have at least one extension label. +- All labels must differ. +- The last label can be empty (root label `.`). +- Labels starting with `xn--` are valid only when the ASCII serialization is a valid Punycode and the decoded form has valid characters. -- `name` **** -- Returns: **** +**Examples** -
- -**Examples**: - -```javascript -isDomain('a.b'); // true -isDomain('a.b.'); // true -isDomain('中文.com'); // true -isDomain('xn--fiq228c.com'); // true -isDomain('www.中文.com'); // true -isDomain(`${'a'.repeat(63)}.${'b'.repeat(63)}.${'c'.repeat(63)}.${'d'.repeat(63)}`); // true - -isDomain(); // false -isDomain('a'); // false -isDomain('a.a'); // false -isDomain('a.b.a'); // false -isDomain('a.b.a'); // false +```ts +isDomain('a.b'); // true +isDomain('a.b.'); // true +isDomain('中文.com'); // true +isDomain('xn--fiq228c.com'); // true +isDomain('www.中文.com'); // true + +isDomain('a'); // false +isDomain('a.a'); // false isDomain('中文.xn--fiq228c.com'); // false -isDomain('www.xn--hf.com'); // false -isDomain(`${'a'.repeat(63)}.${'b'.repeat(63)}.${'c'.repeat(63)}.${'d'.repeat(63)}.`); // false -isDomain('xn--\'-6xd.com') // false even though xn--'-6xd is a valid Punycode for ॐ but has an invalid character +isDomain('xn--\'-6xd.com'); // false — xn--'-6xd is valid Punycode for ॐ, but the label contains an invalid character (') ``` -### isIP(ip) +
-Test a string is a valid IP. +
+isIP(ip) -Supports IPv4 and IPv6. +
-
+Tests whether a string is a valid IPv4 or IPv6 address. -- `ip` **** -- Returns: **** +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `ip` | `string` | *(required)* | The address to test. | -**Examples**: +**Returns** — `boolean`. -```javascript -isIP('23.71.254.72'); // true -isIP('1:2:3:4::6:7:8'); // true +**Examples** -isIP(); // false -isIP('100..100.100.100.'); // false -isIP('3ffe:b00::1::a'); // false +```ts +isIP('23.71.254.72'); // true +isIP('1:2:3:4::6:7:8'); // true +isIP('100..100.100.100'); // false +isIP('3ffe:b00::1::a'); // false ``` -### isIPv4(ip) - -Test a string is a valid IPv4. +
-
+
+isIPv4(ip) -- `ip` **** -- Returns: **** +
-
+Tests whether a string is a valid IPv4 address. Returns `false` for IPv6. -**Examples**: - -```javascript +```ts isIPv4('8.8.8.8'); // true -isIPv4('1:2::8'); // false -isIPv4(); // false +isIPv4('1:2::8'); // false ``` -### isIPv6(ip) +
-Test a string is a valid IPv6. +
+isIPv6(ip) -
+
-- `ip` **** -- Returns: **** +Tests whether a string is a valid IPv6 address. Returns `false` for IPv4. The standalone validator is lenient regarding zone identifiers — see [`checkURI`](#checkers) for the strict **RFC 6874** form expected inside a URI. -
- -**Examples**: - -```javascript +```ts isIPv6('2001:0000:1234:0000:0000:C1C0:ABCD:0876'); // true -isIPv6('212.58.241.131'); // false -isIPv6(); // false +isIPv6('212.58.241.131'); // false ``` -### checkURI(uri) +
-Check an URI is valid according to **RFC-3986**. +### Checkers -**Rules**: +
+checkURI(uri) -1. scheme is required and cannot be empty; -2. path is required and can be empty; -3. if authority is present path must be empty or start with `/`; -4. if authority is not present path must not start with `//`; -5. __scheme can only have specific characters__; -6. if authority is present: +
-- host must be a valid IP or domain name; -- __userinfo, if any, can only have specific characters__; -- port, if any, must be an integer between 0 - 65535. +Strictly validates a URI per **RFC-3986**. Returns the parsed components with `valid: true` on success; throws `URIError` with a stable [error code](#errors) on the first failure. -7. __path, query and fragment can only have specific characters__. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI to validate. | -**Generic syntax**: +**Returns** — [`CheckedURI`](#types). -![URI Syntax](assets/uri-syntax.png "URI Syntax") +**Throws** — `URIError` with one of: `URI_INVALID_TYPE`, `URI_MISSING_SCHEME`, `URI_EMPTY_SCHEME`, `URI_MISSING_PATH`, `URI_INVALID_PATH`, `URI_INVALID_HOST`, `URI_INVALID_SCHEME_CHAR`, `URI_INVALID_USERINFO_CHAR`, `URI_INVALID_PORT`, `URI_INVALID_PATH_CHAR`, `URI_INVALID_QUERY_CHAR`, `URI_INVALID_FRAGMENT_CHAR`, `URI_INVALID_PERCENT_ENCODING`. -
- -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- Returns: **** - - `scheme` **** The URI scheme. - - `authority` **** The URI authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URI authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URI userinfo. *Default*: `null` - - `host` **** The URI authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URI authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URI authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URI path. - - `pathqf` **** The URI path, query and fragment. - - `query` **** The URI query. *Default*: `null` - - `fragment` **** The URI fragment. *Default*: `null` - - `href` **** The URI recomposed. *Default*: `null` - - `valid` **** Whether the URI is valid. *Default*: `false` -- Throws: **** If no error is thrown then the URI is valid. Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME_CHAR` - - `URI_INVALID_USERINFO_CHAR` - - `URI_INVALID_PORT` - - `URI_INVALID_PATH_CHAR` - - `URI_INVALID_QUERY_CHAR` - - `URI_INVALID_FRAGMENT_CHAR` - - `URI_INVALID_PERCENT_ENCODING` - -
- -**Examples**: - -```javascript -checkURI(); // throws URIError with code URI_INVALID_TYPE -checkURI('://example.com'); // throws URIError with code URI_MISSING_SCHEME -checkURI('foo:////bar'); // throws URIError with code URI_INVALID_PATH -checkURI('foo://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -checkURI('fôo:bar'); // throws URIError with code URI_INVALID_SCHEME_CHAR -checkURI('foo://üser:pass@bar.com'); // throws URIError with code URI_INVALID_USERINFO_CHAR -checkURI('foo://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -checkURI('foo://bar.com/°'); // throws URIError with code URI_INVALID_PATH_CHAR -checkURI('foo://bar.com/over/there?quêry=5'); // throws URIError with code URI_INVALID_QUERY_CHAR -checkURI('foo://bar.com/over/there?query=5#anch#r'); // throws URIError with code URI_INVALID_FRAGMENT_CHAR -checkURI('http://www.bar.baz/foo%2') // throws URIError with code URI_INVALID_PERCENT_ENCODING +**Notes** -checkURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); -// { -// scheme: 'foo', -// authority: 'user:pass@xn--fiq228c.com:8042', -// authorityPunydecoded: 'user:pass@中文.com:8042', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: 8042, -// path: '/over/there', -// pathqf: '/over/there?name=ferret#nose', -// query: 'name=ferret', -// fragment: 'nose', -// href: 'foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose', -// valid: true -// } -``` +- Scheme is required and non-empty (**RFC-3986 §3.1**). +- Path is required and may be empty. +- If authority is present, path must be empty or start with `/`; otherwise path must not start with `//`. +- Authority components: host must be a valid IP or domain; `userinfo` only allows the characters from **RFC-3986 §3.2.1**; `port` must be an integer in `0–65535`. +- Path, query, and fragment only allow the characters from **RFC-3986 §3.3 / §3.4 / §3.5**. 
+- IPv6 zone identifiers must use the `%25` delimiter and a non-empty `ZoneID` of `unreserved` / `pct-encoded` characters (**RFC 6874 §2**). -### checkHttpURL(uri) - -Check an URI is a valid HTTP URL. - -**Rules**: - -1. __[must be a valid URI](#checkuriuri)__; -1. scheme must be `http` or `HTTP`; -2. authority is required; -3. URL must be less than 2048 characters. - -
- -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- Returns: **** - - `scheme` **** The URL scheme. - - `authority` **** The URL authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URL authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URL userinfo. *Default*: `null` - - `host` **** The URL authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URL authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URL authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URL path. - - `pathqf` **** The URI path, query and fragment. - - `query` **** The URL query. *Default*: `null` - - `fragment` **** The URL fragment. *Default*: `null` - - `href` **** The URL recomposed. *Default*: `null` - - `valid` **** Whether the URL is valid. *Default*: `false` -- Throws: **** If no error is thrown then the URL is valid. Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_USERINFO_CHAR` - - `URI_INVALID_PORT` - - `URI_INVALID_PATH_CHAR` - - `URI_INVALID_QUERY_CHAR` - - `URI_INVALID_FRAGMENT_CHAR` - - `URI_INVALID_PERCENT_ENCODING` - - `URI_INVALID_SCHEME` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -checkHttpURL(); // throws URIError with code URI_INVALID_TYPE -checkHttpURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -checkHttpURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -checkHttpURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -checkHttpURL('http://üser:pass@bar.com'); // throws URIError with code URI_INVALID_USERINFO_CHAR -checkHttpURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -checkHttpURL('http://bar.com/°'); // throws URIError with code URI_INVALID_PATH_CHAR -checkHttpURL('http://bar.com/over/there?quêry=5'); // throws URIError with code URI_INVALID_QUERY_CHAR -checkHttpURL('http://bar.com/over/there?query=5#anch#r'); // throws URIError with code URI_INVALID_FRAGMENT_CHAR -checkHttpURL('http://www.bar.baz/foo%2') // throws URIError with code URI_INVALID_PERCENT_ENCODING -checkHttpURL('httê://bar.com:8080'); // throws URIError with code URI_INVALID_SCHEME -checkHttpURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -checkHttpURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +**Examples** -checkHttpURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); -// { -// scheme: 'http', -// authority: 'user:pass@xn--fiq228c.com:8042', -// authorityPunydecoded: 'user:pass@中文.com:8042', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: 8042, -// path: '/over/there', -// pathqf: '/over/there?name=ferret#nose', -// query: 'name=ferret', -// fragment: 'nose', -// href: 'http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose', -// valid: true -// } +```ts +checkURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); +// { scheme: 'foo', host: 'xn--fiq228c.com', valid: true, … } + +checkURI(); // throws URIError — URI_INVALID_TYPE +checkURI('://example.com'); // 
throws URIError — URI_MISSING_SCHEME +checkURI('foo:////bar'); // throws URIError — URI_INVALID_PATH +checkURI('foo://xn--iñvalid.com'); // throws URIError — URI_INVALID_HOST +checkURI('fôo:bar'); // throws URIError — URI_INVALID_SCHEME_CHAR +checkURI('foo://üser:pass@bar.com'); // throws URIError — URI_INVALID_USERINFO_CHAR +checkURI('foo://bar.com:80g80'); // throws URIError — URI_INVALID_PORT +checkURI('foo://bar.com/°'); // throws URIError — URI_INVALID_PATH_CHAR +checkURI('foo://bar.com/over/there?quêry=5'); // throws URIError — URI_INVALID_QUERY_CHAR +checkURI('foo://bar.com/over/there?query=5#anch#r'); // throws URIError — URI_INVALID_FRAGMENT_CHAR +checkURI('http://www.bar.baz/foo%2'); // throws URIError — URI_INVALID_PERCENT_ENCODING ``` -### checkHttpsURL(uri) +
-Check an URI is a valid HTTPS URL. Same behavior than __[checkHttpURL](#checkhttpurluri)__ except scheme must be `https` or `HTTPS`. +
+checkHttpURL(uri) -### checkHttpSitemapURL(uri) +
-Check an URI is a valid HTTP URL to be used in an XML sitemap file. +Validates a URI as an HTTP URL on top of [`checkURI`](#checkers). -For text sitemap please refer to __[checkHttpURL](#checkhttpurluri)__ as there is no need to escape entities **but URL must be in lowercase**. +**Adds** -**Rules**: +- `scheme` must be `http` or `HTTP` — else `URI_INVALID_SCHEME`. +- `authority` is required — else `URI_MISSING_AUTHORITY`. +- URL must be shorter than 2,048 characters — else `URI_MAX_LENGTH_URL`. -1. __[must be a valid URL](#checkhttpurluri)__; -1. scheme must be `http`; -2. authority is required; -3. specific characters must be escaped; -4. can only contain lowercase characters (prechecked); -5. URL must be less than 2048 characters. +**Returns** — [`CheckedURI`](#types). Throws `URIError` with any of `checkURI`'s codes plus the three above. -**Valid URI characters to be escaped or percent-encoded in a sitemap URL**: +```ts +checkHttpURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); +// { scheme: 'http', host: 'xn--fiq228c.com', valid: true, … } +``` -| Character | Value | Escape Code | -| :----------- |:-----:| :---------: | -| Ampersand | `&` | `&` | -| Single Quote | `'` | `'` | -| Asterisk | `*` | `%2A` | +
-
- -**Based on**: - -- __RFC-3986__; -- __Google: Build and submit a sitemap__. - -
- -- `uri` **** -- Returns: **** - - `scheme` **** The URL scheme. - - `authority` **** The URL authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URL authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URL userinfo. *Default*: `null` - - `host` **** The URL authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URL authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URL authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URL path. - - `pathqf` **** The URI path, query and fragment. - - `query` **** The URL query. *Default*: `null` - - `fragment` **** The URL fragment. *Default*: `null` - - `href` **** The URL recomposed. *Default*: `null` - - `valid` **** Whether the URL is valid. *Default*: `false` -- Throws: **** If no error is thrown then the URL is valid. Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_USERINFO_CHAR` - - `URI_INVALID_PORT` - - `URI_INVALID_CHAR` - - `URI_INVALID_PATH_CHAR` - - `URI_INVALID_QUERY_CHAR` - - `URI_INVALID_FRAGMENT_CHAR` - - `URI_INVALID_PERCENT_ENCODING` - - `URI_INVALID_SITEMAP_ENCODING` - - `URI_INVALID_SCHEME` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -checkHttpSitemapURL(); // throws URIError with code URI_INVALID_TYPE -checkHttpSitemapURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -checkHttpSitemapURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -checkHttpSitemapURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -checkHttpSitemapURL('http://*ser:pass@bar.com'); // throws URIError with code URI_INVALID_USERINFO_CHAR -checkHttpSitemapURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -checkHttpSitemapURL('hTtp://bar.com/Path'); // throws URIError with code URI_INVALID_CHAR -checkHttpSitemapURL('http://bAr.com/Path'); // throws URIError with code URI_INVALID_CHAR -checkHttpSitemapURL('http://bar.com/Path'); // throws URIError with code URI_INVALID_CHAR -checkHttpSitemapURL('http://bar.com/path\''); // throws URIError with code URI_INVALID_PATH_CHAR -checkHttpSitemapURL('http://bar.com/over/there?a=5&b=9'); // throws URIError with code URI_INVALID_QUERY_CHAR -checkHttpSitemapURL('http://bar.com/over/there?a=5#anch*r'); // throws URIError with code URI_INVALID_FRAGMENT_CHAR -checkHttpSitemapURL('http://www.bar.baz/foo%2') // throws URIError with code URI_INVALID_PERCENT_ENCODING -checkHttpSitemapURL('http://www.bar.baz/foo?a=5&am;b=9') // throws URIError with code URI_INVALID_SITEMAP_ENCODING -checkHttpSitemapURL('hêtp://bar.com:8080'); // throws URIError with code URI_INVALID_SCHEME -checkHttpSitemapURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -checkHttpSitemapURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +
+checkHttpsURL(uri) -checkHttpSitemapURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret&catch=rabbits#nose'); -// { -// scheme: 'http', -// authority: 'user:pass@xn--fiq228c.com:8042', -// authorityPunydecoded: 'user:pass@中文.com:8042', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: 8042, -// path: '/over/there', -// pathqf: '/over/there?name=ferret&catch=rabbits#nose', -// query: 'name=ferret&catch=rabbits', -// fragment: 'nose', -// href: 'http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret&catch=rabbits#nose', -// valid: true -// } -``` +
-### checkHttpsSitemapURL(uri) +Same as [`checkHttpURL`](#checkers) but `scheme` must be `https` or `HTTPS`. -Check an URI is a valid HTTPS URL to be used in an XML sitemap file. Same behavior than __[checkHttpSitemapURL](#checkhttpsitemapurluri)__ except scheme must be `https`. +
-### checkWebURL(uri) +
+checkWebURL(uri) -Check an URI is a valid HTTP or HTTPS URL. Same behavior than __[checkHttpURL](#checkhttpurluri)__ except scheme can be `http`/`HTTP` or `https`/`HTTPS`. +
-### checkSitemapURL(uri) +Same as [`checkHttpURL`](#checkers) but `scheme` can be `http` / `HTTP` or `https` / `HTTPS`. -Check an URI is a valid HTTP or HTTPS URL to be used in an XML sitemap file. Same behavior than __[checkHttpSitemapURL](#checkhttpsitemapurluri)__ except scheme can be `http` or `https`. +
-### encodeURIComponentString(component, options) +
+checkHttpSitemapURL(uri) -Encode an URI component according to **RFC-3986**. +
-**Support**: +Validates a URI as an HTTP URL fit for an XML sitemap on top of [`checkHttpURL`](#checkers). -- Sitemap's special characters, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- lower and upper case. +**Adds** -**Note**: +- The URL must be all lowercase (scheme, host, path, query, fragment) — else `URI_INVALID_CHAR`. +- Specific characters must be escaped — the table below lists them. +- Percent-encoded sitemap escapes (`&`, `'`, `"`, `<`, `>`) must be well-formed — else `URI_INVALID_SITEMAP_ENCODING`. -- only `userinfo`, `path`, `query` and `fragment` components can be encoded with specific rules for each type regarding valid characters (**RFC-3986**); -- `scheme` and `authority` (host and port) can never have escaped or percent-encoded characters; -- the empty string is returned if unable to encode; -- __[sitemap characters](#checkhttpsitemapurluri)__ must be in lowercase and escaped for XML sitemap URLs. +**Sitemap-escaped characters** -
+| Character | Value | Escape code | +| :----------- | :---: | :---------: | +| Ampersand | `&` | `&amp;` | +| Single quote | `'` | `&apos;` | +| Double quote | `"` | `&quot;` | +| Less than | `<` | `&lt;` | +| Greater than | `>` | `&gt;` | +| Asterisk | `*` | `%2A` | -**Generic syntax**: +For plain-text sitemaps no escaping is required — use [`checkHttpURL`](#checkers) instead, but the URL must still be lowercase. -![URI Syntax](assets/uri-syntax.png "URI Syntax") +**Returns** — [`CheckedURI`](#types). Throws `URIError` with the union of `checkHttpURL`'s codes plus `URI_INVALID_CHAR` and `URI_INVALID_SITEMAP_ENCODING`. -
- -**Based on**: - -- __RFC-3986__; -- __Google: Build and submit a sitemap__. - -
- -- `component` **** -- `options` ****: - - `type` **** The component type. If no type is provided native function *encodeURIComponent* will be used to encode each character. *Default*: `none` One of: - - `userinfo` - - `path` - - `query` - - `fragment` - - `lowercase` **** Whether the component should be returned in lowercase. *Default*: `false` - - `sitemap` **** Whether to escape Sitemap's special characters. See __[checkHttpSitemapURL](#checkhttpsitemapurluri)__. -- Returns: **** - -
- -**Examples**: - -```javascript -encodeURIComponentString(); // '' -encodeURIComponentString(''); // '' -encodeURIComponentString('cômpön€nt'); // 'c%C3%B4mp%C3%B6n%E2%82%ACnt' -encodeURIComponentString('AbC'); // 'AbC' -encodeURIComponentString('AbC', { lowercase: true }); // 'abc' -encodeURIComponentString('*'); // '*' -encodeURIComponentString('*', { sitemap: true }); // '%2A' - -// it is highly recommended to use a component type -encodeURIComponentString('A#/?@[]&\'*'); // 'A%23%2F%3F%40%5B%5D%26\'*' (native function, outdated standard) -encodeURIComponentString('A#/?@[]&\'*', { type: 'userinfo' }); // 'A%23%2F%3F%40%5B%5D&\'*' (RFC-3986 characters in userinfo) -encodeURIComponentString('A#/?@[]&\'*', { type: 'path' }); // 'A%23/%3F@%5B%5D&\'*' -encodeURIComponentString('A#/?@[]&\'*', { type: 'query' }); // 'A%23/?@%5B%5D&\'*' -encodeURIComponentString('A#/?@[]&\'*', { type: 'fragment' }); // 'A%23/?@%5B%5D&\'*' -encodeURIComponentString('A#/?@[]&\'*', { type: 'fragment', sitemap: true }); // 'a%23/?@%5B%5D&'%2A' +```ts +checkHttpSitemapURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret&amp;catch=rabbits#nose'); +// { scheme: 'http', host: 'xn--fiq228c.com', valid: true, … } ``` -### encodeURIString(uri, options) +
-Encode an URI string according to **RFC-3986** with basic checking. +
+checkHttpsSitemapURL(uri) -**Checked**: +
-- scheme is required; -- path is required, can be empty; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name. +Same as [`checkHttpSitemapURL`](#checkers) but `scheme` must be `https`. -**Support**: +
-- IDNs: returns URI with its Punycode host, if any; -- lower and upper case. +
+checkSitemapURL(uri) -**Note**: +
-- only `userinfo`, `path`, `query` and `fragment` can be percent-encoded; -- native function `encodeURI` encodes string according to **RFC-2396** which is outdated; -- native function `encodeURI` also encodes scheme and host that cannot have - percend-encoded characters; -- characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization. +Same as [`checkHttpSitemapURL`](#checkers) but `scheme` can be `http` or `https`. -
+
-**Generic syntax**: +### Encoders -![URI Syntax](assets/uri-syntax.png "URI Syntax") +
+encodeURIComponentString(component, options) -
+
-**Based on**: +Encodes a URI component per **RFC-3986**, with per-type rules and an optional Sitemap-aware mode. Returns the empty string when the input is not a string. -- __RFC-3986__. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `component` | `string` | *(required)* | The component to encode. | +| `options.type` | `'userinfo' \| 'path' \| 'query' \| 'fragment'` | *(none)* | The component type. Without a type, native `encodeURIComponent` is used (RFC-2396, outdated). | +| `options.lowercase` | `boolean` | `false` | Lowercase the component before encoding. | +| `options.sitemap` | `boolean` | `false` | Escape Sitemap's special characters (see [`checkHttpSitemapURL`](#checkers)). | -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME_CHAR` - - `URI_INVALID_PORT` +**Returns** — `string`. The encoded component (or `''` on invalid input). -
+**Notes** -**Examples**: +- Only `userinfo`, `path`, `query`, and `fragment` can be percent-encoded. `scheme`, `host`, and `port` cannot. +- Pass a component type. Without it, native `encodeURIComponent` percent-encodes characters such as `/`, `?`, `@`, and `&` that **RFC-3986** allows unescaped in some components. -```javascript -encodeURIString(); // throws URIError with code URI_INVALID_TYPE -encodeURIString('://example.com'); // throws URIError with code URI_MISSING_SCHEME -encodeURIString('http:////bar'); // throws URIError with code URI_INVALID_PATH -encodeURIString('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -encodeURIString('hôtp:bar'); // throws URIError with code URI_INVALID_SCHEME_CHAR -encodeURIString('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT +**Examples** -encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.xn--fiq228c.com./over/there?a=b&b=c#anchor' -encodeURIString('foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); // 'foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' +```ts +encodeURIComponentString('cômpön€nt'); // 'c%C3%B4mp%C3%B6n%E2%82%ACnt' +encodeURIComponentString('AbC', { lowercase: true }); // 'abc' +encodeURIComponentString('*', { sitemap: true }); // '%2A' +encodeURIComponentString("A#/?@[]&'*"); // 'A%23%2F%3F%40%5B%5D%26\'*' — outdated RFC-2396 +encodeURIComponentString("A#/?@[]&'*", { type: 'userinfo' }); // 'A%23%2F%3F%40%5B%5D&\'*' +encodeURIComponentString("A#/?@[]&'*", { type: 'path' }); // 'A%23/%3F@%5B%5D&\'*' +encodeURIComponentString("A#/?@[]&'*", { type: 'fragment', sitemap: true }); +// 'a%23/?@%5B%5D&amp;&apos;%2A' ``` -### encodeWebURL(uri, options) +
-Encode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs. +
+encodeURIString(uri, options) -Uses __[a fixed encodeURI function](#encodeuristringuri-options)__ to be **RFC-3986** compliant. +
-**Checked**: +Encodes a URI string per **RFC-3986** with basic validity checking and IDN support. The native `encodeURI` is **RFC-2396**, which is outdated and over-encodes; this function fixes both issues. -- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +**Parameters** -**Support**: +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI to encode. | +| `options.lowercase` | `boolean` | `false` | Lowercase the entire URI including path, query, and fragment. | -- IDNs: returns URL with its Punycode host, if any; -- lower and upper case. +**Returns** — `string`. The encoded URI. -**Note**: +**Throws** — `URIError` with one of: `URI_INVALID_TYPE`, `URI_MISSING_SCHEME`, `URI_EMPTY_SCHEME`, `URI_MISSING_PATH`, `URI_INVALID_PATH`, `URI_INVALID_HOST`, `URI_INVALID_SCHEME_CHAR`, `URI_INVALID_PORT`. -- only `userinfo`, `path`, `query` and `fragment` can be percent-encoded; -- native function `encodeURI` encodes string according to **RFC-2396** which is outdated; -- native function `encodeURI` also encodes scheme and host that cannot have - percend-encoded characters; -- characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization. +**Notes** -
+- Only `userinfo`, `path`, `query`, and `fragment` can be percent-encoded; `scheme` and `host` cannot. +- IDN hosts are serialized to Punycode. +- `[` and `]` are not percent-encoded — they delimit IPv6 hosts. +- By default only scheme and host are lowercased (**RFC-3986 §6.2.2.1**). Path, query, and fragment are case-sensitive — see [Limitations](#limitations) for the `lowercase` flag's scope. -**Generic syntax**: +**Examples** -![URI Syntax](assets/uri-syntax.png "URI Syntax") - -
- -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -encodeWebURL(); // throws URIError with code URI_INVALID_TYPE -encodeWebURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -encodeWebURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -encodeWebURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -encodeWebURL('ftp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeWebURL('hôtp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeWebURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -encodeWebURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -encodeWebURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL - -encodeWebURL('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -encodeWebURL('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.xn--fiq228c.com./over/there?a=b&b=c#anchor' -encodeWebURL('http://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); // 'http://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' -``` - -### encodeSitemapURL(uri) +```ts +encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -Encode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs and sitemap requirements regarding special characters to escape. +encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); +// 'https://www.xn--fiq228c.com./over/there?a=b&b=c#anchor' -Uses __[a fixed encodeURI function](#encodeuristringuri-options)__ to be **RFC-3986** compliant. +encodeURIString('foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); +// 'foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' +``` -**Checked**: +
-- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +
+encodeWebURL(uri, options) -**Support**: +
-- Sitemap's special characters, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- IDNs: returns URI with its Punycode host, if any; -- **characters are automatically put in lowercase**. +Encodes an HTTP or HTTPS URL per **RFC-3986**, on top of [`encodeURIString`](#encoders). Uses the same fixed-encode logic but enforces the HTTP(S) constraints. -**Note**: +**Adds** -- only `userinfo`, `path`, `query` and `fragment` can be percent-encoded; -- native function `encodeURI` encodes string according to **RFC-2396** which is outdated; -- native function `encodeURI` also encodes scheme and host that cannot have - percend-encoded characters; -- characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization. +- `scheme` must be `http` / `HTTP` or `https` / `HTTPS` — else `URI_INVALID_SCHEME`. +- `authority` is required — else `URI_MISSING_AUTHORITY`. +- URL must be shorter than 2,048 characters — else `URI_MAX_LENGTH_URL`. -
+**Parameters and options** — identical to [`encodeURIString`](#encoders). -**Generic syntax**: +**Examples** -![URI Syntax](assets/uri-syntax.png "URI Syntax") +```ts +encodeWebURL('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -
- -**Based on**: - -- __RFC-3986__; -- __Google: Build and submit a sitemap__. - -
- -- `uri` **** -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -encodeSitemapURL(); // throws URIError with code URI_INVALID_TYPE -encodeSitemapURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -encodeSitemapURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -encodeSitemapURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -encodeSitemapURL('ftp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeSitemapURL('hôtp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeSitemapURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -encodeSitemapURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -encodeSitemapURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL - -encodeSitemapURL('http://user:p\'âss@bar.baz/it\'s *ver/there?a=b&b=c#anch*r'); // 'http://user:p'%C3%A2ss@bar.baz/it's%20%2Aver/there?a=b&b=c#anch%2Ar' +encodeWebURL('http://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); +// 'http://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' ``` -### decodeURIComponentString(component, options) - -Decode an URI component string. - -Native function `decodeURIComponent` could throw and to be consistent with [encodeURIComponentString](#encodeuricomponentstringcomponent-options) the empty string is returned if unable to decode. - -**Support**: +
-- Sitemap's escape codes, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- lower and upper case. +
+encodeSitemapURL(uri) -
+
-**Based on**: +Encodes an HTTP or HTTPS URL for an XML sitemap on top of [`encodeWebURL`](#encoders) — applies Sitemap escape codes and lowercases the URL. -- __RFC-3986__; -- __Google: Build and submit a sitemap__. +**Adds** -
+- Sitemap's special characters are escaped (see [`checkHttpSitemapURL`](#checkers)). +- The output is fully lowercased. -- `component` **** -- `options` ****: - - `lowercase` **** Whether the component should be returned in lowercase. *Default*: `false` - - `sitemap` **** Whether to decode Sitemap's escape codes. See __[checkHttpSitemapURL](#checkhttpsitemapurluri)__. -- Returns: **** +**Examples** -
- -**Examples**: - -```javascript -decodeURIComponentString(); // '' -decodeURIComponentString(''); // '' -decodeURIComponentString('AbC'); // 'AbC' -decodeURIComponentString('AbC', { lowercase: true }); // 'abc' -decodeURIComponentString('%2A'); // '*' -decodeURIComponentString(''&%2A', { sitemap: true }); // '\'&*' -decodeURIComponentString('SITE&maP', { sitemap: true, lowercase: true }); // 'site&map' +```ts +encodeSitemapURL("http://user:p'âss@bar.baz/it's *ver/there?a=b&b=c#anch*r"); +// 'http://user:p&apos;%C3%A2ss@bar.baz/it&apos;s%20%2Aver/there?a=b&amp;b=c#anch%2Ar' ``` -### decodeURIString(uri, options) +
-Decode an URI string according to **RFC-3986** with basic checking. +### Decoders -**Checked**: +
+decodeURIComponentString(component, options) -- scheme is required; -- path is required, can be empty; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name. +
-**Support**: +Decodes a URI component string. Returns the empty string when the input cannot be decoded (`decodeURIComponent` would throw). -- IDNs: returns URI with its Punydecoded host (Unicode serialization of the domain), if any; -- lower and upper case. +**Parameters** -**Note**: +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `component` | `string` | *(required)* | The component to decode. | +| `options.lowercase` | `boolean` | `false` | Lowercase the result. | +| `options.sitemap` | `boolean` | `false` | Decode Sitemap escape codes (see [`checkHttpSitemapURL`](#checkers)). | -- if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; -- native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization; -- to only use with [encodeURIString](#encodeuristringuri-options). +**Returns** — `string`. The decoded component (or `''` on invalid input). -
+**Examples** -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME_CHAR` - - `URI_INVALID_PORT` - -
- -**Examples**: - -```javascript -decodeURIString(); // throws URIError with code URI_INVALID_TYPE -decodeURIString('://example.com'); // throws URIError with code URI_MISSING_SCHEME -decodeURIString('http:////bar'); // throws URIError with code URI_INVALID_PATH -decodeURIString('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -decodeURIString('hôtp:bar'); // throws URIError with code URI_INVALID_SCHEME_CHAR -decodeURIString('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT - -decodeURIString('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); // 'http://中文.com/?query=ॐ' -decodeURIString('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.中文.com./Over/There?a=B&b=c#Anchor' -decodeURIString('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.中文.com./over/there?a=b&b=c#anchor' -decodeURIString('foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r'); // 'foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr' +```ts +decodeURIComponentString('%2A'); // '*' +decodeURIComponentString('&apos;&amp;%2A', { sitemap: true }); // "'&*" +decodeURIComponentString('SITE&maP', { sitemap: true, lowercase: true }); +// 'site&map' ``` -### decodeWebURL(uri, options) - -Decode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs. - -Uses __[a fixed decodeURI function](#decodeuristringuri-options)__ to be **RFC-3986** compliant. - -**Checked**: +
-- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +
+decodeURIString(uri, options) -**Support**: +
-- IDNs: returns URI with its Punydecoded host (Unicode serialization of the domain), if any; -- lower and upper case. +Decodes a URI string per **RFC-3986** with basic validity checking and IDN support — the inverse of [`encodeURIString`](#encoders). -**Note**: +**Parameters** -- if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; -- native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization; -- to only use with [encodeWebURL](#encodeweburluri-options). +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI to decode. | +| `options.lowercase` | `boolean` | `false` | Lowercase the entire URI including path, query, and fragment. | -
+**Returns** — `string`. The decoded URI. -**Based on**: +**Throws** — `URIError` with one of: `URI_INVALID_TYPE`, `URI_MISSING_SCHEME`, `URI_EMPTY_SCHEME`, `URI_MISSING_PATH`, `URI_INVALID_PATH`, `URI_INVALID_HOST`, `URI_INVALID_SCHEME_CHAR`, `URI_INVALID_PORT`. -- __RFC-3986__. +**Notes** -
+- If a `userinfo`, `path`, `query`, or `fragment` component cannot be decoded, it is ignored. +- IDN hosts are returned in Unicode form (Punydecoded). +- See [Limitations](#limitations) for the `lowercase` flag's scope. -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` +**Examples** -
- -**Examples**: +```ts +decodeURIString('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); +// 'http://中文.com/?query=ॐ' -```javascript -decodeWebURL(); // throws URIError with code URI_INVALID_TYPE -decodeWebURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -decodeWebURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -decodeWebURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -decodeWebURL('ftp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeWebURL('hôtp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeWebURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -decodeWebURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -decodeWebURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +decodeURIString('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.中文.com./Over/There?a=B&b=c#Anchor' -decodeWebURL('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); // 'http://中文.com/?query=ॐ' -decodeWebURL('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.中文.com./Over/There?a=B&b=c#Anchor' -decodeWebURL('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.中文.com./over/there?a=b&b=c#anchor' -decodeWebURL('http://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r'); // 'http://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr' +decodeURIString('foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r'); +// 'foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr' ``` -### decodeSitemapURL(uri, options) - -Decode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs and sitemap requirements regarding escape codes to decode. +
-Uses __[a fixed decodeURI function](#decodeuristringuri-options)__ to be **RFC-3986** compliant. +
+decodeWebURL(uri, options) -**Checked**: +
-- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +Decodes an HTTP or HTTPS URL per **RFC-3986** on top of [`decodeURIString`](#decoders) — the inverse of [`encodeWebURL`](#encoders). -**Support**: +**Adds** -- Sitemap's escape codes, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- IDNs: returns URI with its Punydecoded host (Unicode serialization of the domain), if any; -- lower and upper case. +- `scheme` must be `http` / `HTTP` or `https` / `HTTPS` — else `URI_INVALID_SCHEME`. +- `authority` is required — else `URI_MISSING_AUTHORITY`. +- URL must be shorter than 2,048 characters — else `URI_MAX_LENGTH_URL`. -**Note**: +**Examples** -- if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; -- native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- by default only the scheme and host are lowercased (**RFC-3986 §6.2.2.1**); the `lowercase` option additionally lowercases the path, query and fragment, which are case-sensitive — use it only for Sitemap or convenience purposes, not as RFC normalization; -- to only use with [encodeSitemapURL](#encodesitemapurluri). - -
- -**Based on**: +```ts +decodeWebURL('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.中文.com./Over/There?a=B&b=c#Anchor' +``` -- __RFC-3986__; -- __Google: Build and submit a sitemap__. +
-
+
+decodeSitemapURL(uri, options) -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` +
-
+Decodes an HTTP or HTTPS URL coming from an XML sitemap — the inverse of [`encodeSitemapURL`](#encoders). Sitemap escape codes are converted back to their characters. -**Examples**: +**Examples** -```javascript -decodeSitemapURL(); // throws URIError with code URI_INVALID_TYPE -decodeSitemapURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -decodeSitemapURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -decodeSitemapURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -decodeSitemapURL('ftp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeSitemapURL('hôtp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeSitemapURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -decodeSitemapURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -decodeSitemapURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +```ts +decodeSitemapURL('HTTP://bar.BAZ/IT&apos;S%20OVER%2Athere%2A?a=b&amp;c=d'); +// "http://bar.baz/IT'S OVER*there*?a=b&c=d" -decodeSitemapURL('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); // 'http://中文.com/?query=ॐ' -decodeSitemapURL('HTTP://bar.BAZ/IT'S%20OVER%2Athere%2A?a=b&c=d'); // 'http://bar.baz/IT\'S OVER*there*?a=b&c=d' -decodeSitemapURL('http://bar.baz/IT'S%20OVER%2Athere%2A?A=b&c=D', { lowercase: true }); // 'http://bar.baz/it\'s over*there*?a=b&c=d' +decodeSitemapURL('http://bar.baz/IT&apos;S%20OVER%2Athere%2A?A=b&amp;c=D', { lowercase: true }); +// "http://bar.baz/it's over*there*?a=b&c=d" ``` +
## Errors -### Object structure - -Errors emitted by *@coroboros/uri* are native URIError with an additional *code* property: +Errors emitted by `@coroboros/uri` are native `URIError` instances with an additional `code` property: -```javascript +```ts { - name, - code, - message, - stack, + name: 'URIError', + code: URIErrorCode, + message: string, + stack: string, } ``` -### Codes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
namecodedescriptionmodule
URIError
URI_INVALID_TYPEURI variable type is not validsrc/checkers
URI_MISSING_SCHEMEURI scheme is missingsrc/checkers
URI_EMPTY_SCHEMEURI scheme is emptysrc/checkers
URI_INVALID_SCHEMEURI scheme is not validsrc/checkers
src/decoders
src/encoders
URI_INVALID_SCHEME_CHARURI scheme contains an invalid charactersrc/checkers
src/decoders
src/encoders
URI_MISSING_PATHURI path is missingsrc/checkers
URI_INVALID_PATHURI path is not valid based on RFC-3986src/checkers
URI_MISSING_AUTHORITYURI authority is missingsrc/checkers
src/decoders
src/encoders
URI_INVALID_HOSTURI host is not valid IP or domainsrc/checkers
URI_INVALID_PORTURI port is not a numbersrc/checkers
src/decoders
src/encoders
URI_INVALID_CHARURI contains an invalid charactersrc/checkers
URI_INVALID_USERINFO_CHARURI userinfo contains an invalid charactersrc/checkers
URI_INVALID_PATH_CHARURI path contains an invalid charactersrc/checkers
URI_INVALID_QUERY_CHARURI query contains an invalid charactersrc/checkers
URI_INVALID_FRAGMENT_CHARURI fragment contains an invalid charactersrc/checkers
URI_INVALID_PERCENT_ENCODINGA percent-encoding character is not validsrc/checkers
URI_INVALID_SITEMAP_ENCODINGURI contains an invalid sitemap escape codesrc/checkers
URI_MAX_LENGTH_URLMaximum URL allowed length of 2048 characters has been reachedsrc/checkers
- -## Compliance - -*@coroboros/uri* implements: - -- **RFC-3986** — generic URI syntax: parse (Appendix B), recompose (§5.3), reference resolution (§5.2), percent-encoding (§2.1, §6.2.2.1), and character validation (§3.1–§3.5). -- **RFC-3987** — IDNs via Punycode, through Node's `node:url` (`domainToASCII` / `domainToUnicode`). -- **RFC 6874 §2** — IPv6 zone identifiers in a URI (the `%25` delimiter and `ZoneID = 1*( unreserved / pct-encoded )`). -- **RFC 1034** / **RFC 1123** — domain name rules. -- **sitemaps.org** — the Sitemap protocol for Sitemap URLs. - -**Behavior worth knowing**: - -- a present-but-empty query or fragment (a bare `?` or `#`) is preserved and round-trips, distinct from an absent one (**RFC-3986 §5.3**); -- a port must be a string of ASCII digits (**RFC-3986 §3.2.3**) — values like `0x1F` are rejected; -- `userinfo` is delimited by the last `@`, and a non-IPv6 host/port by the last `:` (**RFC-3986 §3.2**); -- percent-encoding hex is case-insensitive: `%3a` and `%3A` are both accepted (**RFC-3986 §6.2.2.1**); -- inside a URI, an IPv6 zone identifier must use the `%25` delimiter and a non-empty `ZoneID` of `unreserved` / `pct-encoded` characters (**RFC 6874 §2**); the standalone `isIPv6` validator stays lenient; +The `code` field is a stable string discriminant safe for runtime branching. + +
+Error codes + +
+ +| Code | Description | Module | +| --- | --- | --- | +| `URI_INVALID_TYPE` | URI variable type is not valid. | `src/checkers` | +| `URI_MISSING_SCHEME` | URI scheme is missing. | `src/checkers` | +| `URI_EMPTY_SCHEME` | URI scheme is empty. | `src/checkers` | +| `URI_INVALID_SCHEME` | URI scheme is not valid. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_INVALID_SCHEME_CHAR` | URI scheme contains an invalid character. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_MISSING_PATH` | URI path is missing. | `src/checkers` | +| `URI_INVALID_PATH` | URI path is not valid per **RFC-3986**. | `src/checkers` | +| `URI_MISSING_AUTHORITY` | URI authority is missing. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_INVALID_HOST` | URI host is not a valid IP or domain. | `src/checkers` | +| `URI_INVALID_PORT` | URI port is not a string of ASCII digits in the 0–65535 range. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_INVALID_CHAR` | URI contains an invalid character. | `src/checkers` | +| `URI_INVALID_USERINFO_CHAR` | URI userinfo contains an invalid character. | `src/checkers` | +| `URI_INVALID_PATH_CHAR` | URI path contains an invalid character. | `src/checkers` | +| `URI_INVALID_QUERY_CHAR` | URI query contains an invalid character. | `src/checkers` | +| `URI_INVALID_FRAGMENT_CHAR` | URI fragment contains an invalid character. | `src/checkers` | +| `URI_INVALID_PERCENT_ENCODING` | A percent-encoding character is not valid. | `src/checkers` | +| `URI_INVALID_SITEMAP_ENCODING` | URI contains an invalid sitemap escape code. | `src/checkers` | +| `URI_MAX_LENGTH_URL` | Maximum URL length of 2,048 characters has been reached. | `src/checkers` | +
+ +## Limitations + +- A present-but-empty query or fragment (a bare `?` or `#`) is preserved and round-trips, distinct from an absent one (**RFC-3986 §5.3**). +- A port must be a string of ASCII digits (**RFC-3986 §3.2.3**) — values like `0x1F` are rejected. +- `userinfo` is delimited by the last `@`, and a non-IPv6 host/port by the last `:` (**RFC-3986 §3.2**). +- Percent-encoding hex is case-insensitive: `%3a` and `%3A` are both accepted (**RFC-3986 §6.2.2.1**). +- Inside a URI, an IPv6 zone identifier must use the `%25` delimiter and a non-empty `ZoneID` of `unreserved` / `pct-encoded` characters (**RFC 6874 §2**). The standalone [`isIPv6`](#validators) validator stays lenient. - `encodeSitemapURL` escapes all five XML entities `& ' " < >`, and a Sitemap URL must be shorter than 2,048 characters (sitemaps.org). For example, `encodeSitemapURL('http://example.com/a&b<c>d')` returns `'http://example.com/a&amp;b&lt;c&gt;d'`. - -**Non-goals**: - -- this is a strict **RFC-3986** toolkit, not a WHATWG URL parser — it does not apply WHATWG host/IPv4 leniency; -- it does not canonicalize IPv6 addresses to **RFC 5952** form; -- the `lowercase` option lowercases the entire input including the path, query and fragment, which are case-sensitive per **RFC-3986 §6.2.2.1** — so `lowercase` is a Sitemap/convenience option, not RFC normalization. By default only the scheme and host are lowercased, which is the RFC-compliant behavior. +- This is a strict **RFC-3986** toolkit, not a WHATWG URL parser — it does not apply WHATWG host/IPv4 leniency. +- IPv6 addresses are not canonicalized to **RFC 5952** form. +- The `lowercase` option lowercases the entire input including path, query, and fragment, which are case-sensitive per **RFC-3986 §6.2.2.1**. Use `lowercase` for Sitemap or convenience, not as RFC normalization. By default only scheme and host are lowercased, which is the RFC-compliant behavior. ## Contributing @@ -1619,6 +980,7 @@ Bug reports and PRs welcome. 
- Open an issue before submitting non-trivial PRs. - Commits follow [Conventional Commits](https://www.conventionalcommits.org/). - Run `pnpm lint && pnpm typecheck && pnpm test` before pushing. +- Run `pnpm bench` against `bench/baseline.md` when touching parser, encoders, or decoders — no regression beyond 10 % on any bucket at fixed feature set. - Target the `main` branch. ## License diff --git a/package.json b/package.json index 9c7393e..ae97e03 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@coroboros/uri", "version": "1.0.0", - "description": "RFC-3986 compliant, zero-dependency URI toolkit for Node.js.", + "description": "RFC-3986 URI toolkit for Node.js. IDN (RFC-3987), IPv6 zone identifiers (RFC 6874), Sitemap protocol. Zero dependencies.", "type": "module", "sideEffects": false, "main": "./dist/index.cjs", @@ -43,16 +43,22 @@ "uri", "url", "rfc-3986", + "rfc-3987", + "rfc-6874", + "rfc-1034", + "rfc-1123", + "idn", + "punycode", "parse", "encode", "decode", - "validate", "sitemap", - "punycode", + "sitemap-protocol", "ip", "domain", "typescript", - "nodejs" + "nodejs", + "zero-dependency" ], "author": "Coroboros (https://github.com/coroboros)", "license": "MIT", From 07706f68f7a844af333d96228a5fe94e332205ea Mon Sep 17 00:00:00 2001 From: ob-aion Date: Wed, 20 May 2026 12:39:23 +0700 Subject: [PATCH 21/21] docs: link to the benchmark baseline from Compliance Surface bench/baseline.md from the README so readers landing on the Compliance section can find the parse / validate / encode numbers against native URL and URL.canParse without scrolling to Contributing. The note states the tradeoff plainly: the toolkit is slower by design because it does full per-character validation, IDN handling, RFC 6874 zone identifiers, and explicit coded errors. 
--- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b3eb286..2097d2c 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,8 @@ encodeWebURL('https://www.中文.com./Over There?a=B#Anchôr'); - **RFC 1034 / RFC 1123** — domain-name rules: label length, character set, label separation. - **sitemaps.org** — the Sitemap protocol: required XML-entity escaping and the 2,048-character URL ceiling. +See [`bench/baseline.md`](bench/baseline.md) for performance numbers vs native `URL` / `URL.canParse`. The toolkit trades raw speed for RFC-3986 fidelity — full per-character validation, IDN handling, RFC 6874 zone identifiers, and explicit coded errors. + **Generic URI syntax** ![URI Syntax](assets/uri-syntax.png "URI Syntax")