diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..592e6c7 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: ci + +on: + push: + branches: [main] + tags: + - '[0-9]+.[0-9]+.[0-9]+' + - '[0-9]+.[0-9]+.[0-9]+-*' + pull_request: + branches: [main] + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref_type != 'tag' }} + +permissions: + contents: write + id-token: write + +jobs: + ci: + uses: coroboros/ci/.github/workflows/javascript-npm-packages.yml@v0 + secrets: + NPM_CONFIG_FILE: ${{ secrets.NPM_CONFIG_FILE }} + NPM_EXTRA_CONFIG: ${{ secrets.NPM_EXTRA_CONFIG }} + NPM_PACKAGE_REGISTRY: ${{ secrets.NPM_PACKAGE_REGISTRY }} + NPM_PACKAGE_PROXY_REGISTRY: ${{ secrets.NPM_PACKAGE_PROXY_REGISTRY }} + NPM_PACKAGE_REGISTRY_TOKEN: ${{ secrets.NPM_PACKAGE_REGISTRY_TOKEN }} diff --git a/.gitignore b/.gitignore index 8a844f0..7bd9bcf 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .wrangler/ .dev.vars .claude/settings.local.json +.claude/output/ CLAUDE.local.md dist *.zip diff --git a/CLAUDE.md b/CLAUDE.md index 7adf08f..8d3df1c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,15 +10,18 @@ Follows the Coroboros engineering global rules. Repo-specific divergences are st ## Tech Stack - TypeScript strict, ES modules + CJS dual build (tsdown) -- Vitest for tests, Biome for lint/format +- Vitest + `fast-check` for property tests, Biome for lint/format +- `mitata` for benchmarks (`pnpm bench`) - Node.js 22 LTS - Zero runtime dependencies — Punycode uses Node's `node:url` (`domainToASCII` / `domainToUnicode`) ## Commands - `pnpm build` — bundle ESM + CJS + types to `dist/` -- `pnpm test` — run Vitest suite +- `pnpm test` — run the Vitest suite (incl. 
property-based) +- `pnpm test:coverage` — Vitest with the 100% coverage gate - `pnpm lint` / `pnpm lint:fix` — Biome check - `pnpm typecheck` — tsc --noEmit +- `pnpm bench` — build then run `bench/uri.bench.mjs` - `pnpm dev` — tsdown watch mode ## Important Files @@ -26,21 +29,27 @@ Follows the Coroboros engineering global rules. Repo-specific divergences are st - `src/parser/index.ts` — `parseURI`, `recomposeURI`, `hostToURI` (RFC-3986 Appendix B grammar) - `src/checkers/index.ts` — URI / URL / Sitemap validators, error taxonomy - `src/encoders/index.ts`, `src/decoders/index.ts` — RFC-3986 encode/decode +- `src/resolver/index.ts` — `resolveURI`, `removeDotSegments` (RFC-3986 §5.2 verbatim) - `src/helpers/object.ts` — private `exists` / `is` type guards (inlined, not exported) - `tsdown.config.ts` — dual build config (ESM + CJS + dts) -- `tests/` — Vitest suites, one test file per source module +- `tests/` — one spec per source module + `uri.property.test.ts` for `fast-check` invariants +- `bench/uri.bench.mjs` — mitata bench vs native `URL` / `URL.canParse`; `bench/baseline.md` documents the 1.0.0 numbers ## Public API (1.0.0 contract) - `punycode(domain)`, `punydecode(domain)` — domain ASCII/Unicode serialization - `parseURI(uri)`, `recomposeURI(components)` — RFC-3986 parse / recompose +- `resolveURI(base, reference)`, `removeDotSegments(path)` — RFC-3986 §5.2 reference resolution - `isDomainLabel(label)`, `isDomain(name)`, `isIP(ip)`, `isIPv4(ip)`, `isIPv6(ip)` — validators - `checkURI(uri)`, `checkHttpURL(uri)`, `checkHttpsURL(uri)`, `checkWebURL(uri)`, `checkSitemapURL(uri)`, `checkHttpSitemapURL(uri)`, `checkHttpsSitemapURL(uri)` — throw a coded error on invalid input - `encodeURIComponentString(component, options)`, `encodeURIString(uri, options)`, `encodeWebURL(uri, options)`, `encodeSitemapURL(uri)` — RFC-3986 encoders - `decodeURIComponentString(component, options)`, `decodeURIString(uri, options)`, `decodeWebURL(uri, options)`, `decodeSitemapURL(uri, 
options)` — RFC-3986 decoders ## Rules -- **NEVER** break the public API above. The signatures and the error/type shapes are the 1.0.0 contract. +- The **published** `1.0.0` tag is the public contract — once it ships, **NEVER** break the API above (signatures, error codes, type shapes) without a major bump. Until `1.0.0` is published, breaking changes are allowed but every break must be enumerated in the PR. - **NEVER** add a new runtime dependency without user approval. Zero-dependency is a feature. - **NEVER** use `axios`, `request`, or `node-fetch` — use native `fetch` (Node 22+). - Run `pnpm lint && pnpm typecheck && pnpm test` before every commit. +- Run `pnpm bench` against `bench/baseline.md` when touching the parser, encoders or decoders — no regression > 10 % on any bucket at fixed feature set. - Scoped package — `publishConfig.access = "public"` is mandatory, do not remove. +- **Publish** — CI-owned via OIDC Trusted Publisher + npm provenance. The first `1.0.0` publish bootstraps through the org registry token (CI auto-detects it); once the package exists on npm, configure it as a Trusted Publisher and never re-add a token to `ci.yml`. Manual `pnpm publish` is forbidden — it bypasses provenance and the tag guard. +- **Git** — `main`-only; branch → PR → squash-merge → tag the merge commit. The tag is the only manual step; release automation (version bump, `CHANGELOG.md`, npm publish, GitHub release) is owned by [`coroboros/ci`](https://github.com/coroboros/ci). Never hand-edit `package.json` version or `CHANGELOG.md`. Run `pnpm lint && pnpm typecheck && pnpm test && pnpm build` before tagging. diff --git a/README.md b/README.md index 62c43aa..2097d2c 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,12 @@ # @coroboros/uri -**RFC-3986 compliant, zero-dependency URI toolkit for Node.js.** +**RFC-3986 URI toolkit for Node.js. IDN (RFC-3987), IPv6 zone identifiers (RFC 6874), Sitemap protocol. 
Zero dependencies.** -Parses any URI per RFC-3986, including IDNs via Punycode. Validates IPs, domains, URIs, HTTP(S) URLs, and Sitemap URLs. Encodes and decodes URI strings and components. +Parses URIs per **RFC-3986 Appendix B**. Recomposes per §5.3. Resolves references per §5.2. Validates IPs, domains (**RFC 1034 / 1123**), HTTP(S) URLs, and Sitemap URLs. Encodes and decodes URI strings and components. [![npm](https://img.shields.io/npm/v/@coroboros/uri?style=flat-square&color=000000)](https://www.npmjs.com/package/@coroboros/uri) -[![branch](https://img.shields.io/badge/branch-stable-000000?style=flat-square)](https://github.com/coroboros/uri) +[![ci](https://img.shields.io/github/actions/workflow/status/coroboros/uri/ci.yml?branch=main&style=flat-square&label=ci&color=000000)](https://github.com/coroboros/uri/actions/workflows/ci.yml) [![license](https://img.shields.io/badge/license-MIT-000000?style=flat-square)](https://opensource.org/licenses/MIT) [![stars](https://img.shields.io/github/stars/coroboros/uri?style=flat-square&label=stars&color=000000)](https://github.com/coroboros/uri) [![coroboros.com](https://img.shields.io/badge/coroboros.com-000000?style=flat-square&logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9IndoaXRlIiBzdHJva2Utd2lkdGg9IjIiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIgc3Ryb2tlLWxpbmVqb2luPSJyb3VuZCI+PGNpcmNsZSBjeD0iMTIiIGN5PSIxMiIgcj0iMTAiLz48cGF0aCBkPSJNMiAxMmgyME0xMiAyYTE1LjMgMTUuMyAwIDAgMSA0IDEwIDE1LjMgMTUuMyAwIDAgMS00IDEwIDE1LjMgMTUuMyAwIDAgMS00LTEwIDE1LjMgMTUuMyAwIDAgMSA0LTEweiIvPjwvc3ZnPg==)](https://coroboros.com) @@ -23,8 +23,10 @@ Parses any URI per RFC-3986, including IDNs via Punycode. 
Validates IPs, domains - [Requirements](#requirements) - [Install](#install) - [Usage](#usage) +- [Compliance](#compliance) - [API](#api) - [Errors](#errors) +- [Limitations](#limitations) - [Contributing](#contributing) - [License](#license) @@ -38,12 +40,15 @@ Parses any URI per RFC-3986, including IDNs via Punycode. Validates IPs, domains ```bash pnpm add @coroboros/uri ``` + ```bash npm install @coroboros/uri ``` + ```bash yarn add @coroboros/uri ``` + ```bash bun add @coroboros/uri ``` @@ -53,149 +58,216 @@ bun add @coroboros/uri ```ts // ESM (recommended) import { parseURI, checkHttpsURL, encodeWebURL } from '@coroboros/uri'; - -parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); -checkHttpsURL('https://example.com/path?q=1#x'); -encodeWebURL('https://www.中文.com./Over There?a=B#Anchôr'); ``` ```js // CommonJS -const { parseURI } = require('@coroboros/uri'); +const { parseURI, checkHttpsURL, encodeWebURL } = require('@coroboros/uri'); ``` +```ts +import { parseURI, checkHttpsURL, encodeWebURL } from '@coroboros/uri'; + +// Parse — get every RFC-3986 component +parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); +// { scheme: 'foo', host: 'xn--fiq228c.com', hostPunydecoded: '中文.com', port: 8042, … } + +// Validate strictly — throws URIError with a stable code on invalid input +try { + const url = checkHttpsURL('https://example.com/path?q=1#x'); + url.valid; // true +} catch (err) { + // err.code is one of the documented codes (see [Errors](#errors)) +} + +// Encode — RFC-3986 compliant, IDN-aware, sub-2048-char HTTP(S) +encodeWebURL('https://www.中文.com./Over There?a=B#Anchôr'); +// 'https://www.xn--fiq228c.com./Over%20There?a=B#Anch%C3%B4r' +``` + +## Compliance + +`@coroboros/uri` implements: + +- **RFC-3986** — generic URI syntax: parse (Appendix B), recompose (§5.3), reference resolution (§5.2), percent-encoding (§2.1, §6.2.2.1), and character validation (§3.1–§3.5). 
+- **RFC-3987** — Internationalized Domain Names via Punycode, through Node's `node:url` (`domainToASCII` / `domainToUnicode`). +- **RFC 6874 §2** — IPv6 zone identifiers inside a URI: the `%25` delimiter and `ZoneID = 1*( unreserved / pct-encoded )` grammar. +- **RFC 1034 / RFC 1123** — domain-name rules: label length, character set, label separation. +- **sitemaps.org** — the Sitemap protocol: required XML-entity escaping and the 2,048-character URL ceiling. + +See [`bench/baseline.md`](bench/baseline.md) for performance numbers vs native `URL` / `URL.canParse`. The toolkit trades raw speed for RFC-3986 fidelity — full per-character validation, IDN handling, RFC 6874 zone identifiers, and explicit coded errors. + +**Generic URI syntax** + +![URI Syntax](assets/uri-syntax.png "URI Syntax") + +**Example URIs** + +![RFC-3986](assets/rfc-3986.png "RFC-3986") + ## API -### punycode(domain) +### Types -Returns the Punycode ASCII serialization of the domain. If domain is an invalid domain, the empty string is returned. +
+ParsedURI -**Note**: +
-- native function `url.domainToASCII` does not support IPv6 only IPv4; -- native function `url.domainToASCII` throws if no domain is provided or returns `null`, `undefined`, `nan` for `null`, `undefined` or `NaN` values which is not what to be expected. +Return shape of [`parseURI`](#parsing). -
+```ts +interface ParsedURI { + scheme: string | null; + authority: string | null; + authorityPunydecoded: string | null; + userinfo: string | null; + host: string | null; + hostPunydecoded: string | null; + port: number | string | null; + path: string | null; + pathqf: string | null; + query: string | null; + fragment: string | null; + href: string | null; +} +``` -- `domain` **** -- Returns: **** +Fields default to `null` when the corresponding URI part is missing. `port` is a `number` when parseable as an integer, a `string` otherwise. -
+
-**Examples**: +
+URIComponents -```javascript -punycode(); // '' -punycode('a.b.c.d.e.fg'); // 'a.b.c.d.e.fg' -punycode('xn--iñvalid.com'); // '' -punycode('中文.com'); // 'xn--fiq228c.com' -punycode('xn--fiq228c.com'); // 'xn--fiq228c.com' -punycode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' -punycode('127.0.0.1'); // '127.0.0.1' -punycode(undefined|null|NaN); // '' +
+ +Input shape of [`recomposeURI`](#parsing). Every field is optional at the type level, but `scheme` and `path` are required at runtime. + +```ts +interface URIComponents { + scheme?: string | null; + userinfo?: string | null; + host?: string | null; + port?: number | string | null; + path?: string | null; + query?: string | null; + fragment?: string | null; +} ``` -### punydecode(domain) +
-Returns the Unicode serialization of the domain. If domain is an invalid domain, the empty string is returned. +
+CheckedURI -**Note**: +
-- native function `url.domainToUnicode` does not support IPv6 only IPv4; -- native function `url.domainToUnicode` throws if no domain is provided or returns `null`, `undefined`, `nan` for `null`, `undefined` or `NaN` values which is not what to be expected. +Return shape of every [`check*`](#checkers) function on success — `ParsedURI` extended with a `valid: true` discriminant. -
+```ts +interface CheckedURI extends ParsedURI { + valid: true; +} +``` -- `domain` **** -- Returns: **** +
-
+### Punycode -**Examples**: +
+punycode(domain) -```javascript -punydecode(); // '' -punydecode('a.b.c.d.e.fg'); // 'a.b.c.d.e.fg' -punydecode('xn--iñvalid.com'); // '' -punydecode('xn--fiq228c.com'); // '中文.com' -punydecode('中文.com'); // '中文.com' -punydecode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' -punydecode('127.0.0.1'); // '127.0.0.1' -punydecode(undefined|null|NaN); // '' +
+ +Returns the Punycode ASCII serialization of a domain. Returns the empty string when the input is not a valid domain. + +**Parameters** + +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `domain` | `string` | *(required)* | The domain to serialize. | + +**Returns** — `string`. The ASCII form (or `''` on invalid input). + +**Notes** + +- Wraps Node's `url.domainToASCII` and normalizes the error case: the native function throws when called without an argument and returns `'null'` / `'undefined'` / `'nan'` for the corresponding non-domain inputs. +- IPv6 literals are passed through unchanged (the native function rejects them). + +**Examples** + +```ts +punycode(); // '' +punycode('a.b.c.d.e.fg'); // 'a.b.c.d.e.fg' +punycode('xn--iñvalid.com'); // '' +punycode('中文.com'); // 'xn--fiq228c.com' +punycode('xn--fiq228c.com'); // 'xn--fiq228c.com' +punycode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' +punycode('127.0.0.1'); // '127.0.0.1' ``` -### parseURI(uri) +
-Parse a string to get URI components. +
+punydecode(domain) -**Support**: +
-- IPv4 and IPv6 hosts; -- Internationalized Domain Name (IDN). +Returns the Unicode serialization of a domain. Returns the empty string when the input is not a valid domain. -**Note**: +**Parameters** -- RegExp from __RFC-3986__; -- scheme and host strings will always be put in lowercase once parsed, as specified in **RFC-3986**; -- authority and its components will be put at null values if authority parsed is missing or empty; -- **prefer using [checkURI](#checkuriuri) to parse and fully check an URI**. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `domain` | `string` | *(required)* | The domain to deserialize. | -
+**Returns** — `string`. The Unicode form (or `''` on invalid input). -**Generic syntax**: +**Notes** -![URI Syntax](assets/uri-syntax.png "URI Syntax") +- Wraps Node's `url.domainToUnicode` and normalizes the same error edges as [`punycode`](#punycodedomain). -
+**Examples** -**Example URIs**: +```ts +punydecode(); // '' +punydecode('xn--fiq228c.com'); // '中文.com' +punydecode('中文.com'); // '中文.com' +punydecode('xn--iñvalid.com'); // '' +punydecode('2001:db8:85a3:8d3:1319:8a2e:370:7348'); // '2001:db8:85a3:8d3:1319:8a2e:370:7348' +punydecode('127.0.0.1'); // '127.0.0.1' +``` -![RFC-3986](assets/rfc-3986.png "RFC-3986") +
-
+### Parsing -**Based on**: +
+parseURI(uri) -- __RFC-3986__. +
-
+Parses a URI into its **RFC-3986 Appendix B** components, with IPv4/IPv6 host support and IDN (Punycode) awareness. -- `uri` **** -- Returns: **** - - `scheme` **** The URI scheme. *Default*: `null` - - `authority` **** The URI authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URI authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URI userinfo. *Default*: `null` - - `host` **** The URI authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URI authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URI authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URI path. *Default*: `null` - - `pathqf` **** The URI path, query and fragment. *Default*: `null` - - `query` **** The URI query. *Default*: `null` - - `fragment` **** The URI fragment. *Default*: `null` - - `href` **** The URI recomposed. See __[recomposeURI](#recomposeuricomponents)__. *Default*: `null` +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI string to parse. | -**Examples**: +**Returns** — [`ParsedURI`](#types). -```javascript -parseURI(); -// { -// scheme: null, -// authority: null, -// authorityPunydecoded: null, -// userinfo: null, -// host: null, -// hostPunydecoded: null, -// port: null, -// path: null, -// pathqf: null, -// query: null, -// fragment: null, -// href: null, -// } +**Notes** + +- Scheme and host are lowercased per **RFC-3986 §6.2.2.1**. +- Authority and its components are `null` when the authority is absent or empty. +- A present-but-empty query or fragment (`?` or `#` with nothing after) is preserved as `''`, distinct from a missing one (`null`). +- For strict validation, prefer [`checkURI`](#checkers). +**Examples** + +```ts parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); // { // scheme: 'foo', @@ -212,132 +284,45 @@ parseURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); // href: 'foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose', // } -parseURI('foo://user:pass@中文.com:80g42/over/there?name=ferret#nose'); -// { -// scheme: 'foo', -// authority: 'user:pass@xn--fiq228c.com:80g42', -// authorityPunydecoded: 'user:pass@中文.com:80g42', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: '80g42', -// path: '/over/there', -// pathqf: '/over/there?name=ferret#nose', -// query: 'name=ferret', -// fragment: 'nose', -// href: 'foo://user:pass@xn--fiq228c.com:80g42/over/there?name=ferret#nose', -// } - parseURI('urn:isbn:0-486-27557-4'); -// { -// scheme: 'urn', -// authority: null, -// authorityPunydecoded: null, -// userinfo: null, -// host: null, -// hostPunydecoded: null, -// port: null, -// path: 'isbn:0-486-27557-4', -// pathqf: 'isbn:0-486-27557-4', -// query: null, -// fragment: null -// href: 'urn:isbn:0-486-27557-4', -// } +// { scheme: 'urn', authority: 
null, path: 'isbn:0-486-27557-4', href: 'urn:isbn:0-486-27557-4', … } parseURI('http://user:pass@[fe80::7:8%eth0]:8080'); -// { -// scheme: 'http', -// authority: 'user:pass@[fe80::7:8%eth0]:8080', -// authorityPunydecoded: 'user:pass@[fe80::7:8%eth0]:8080', -// userinfo: 'user:pass', -// host: 'fe80::7:8%eth0', -// hostPunydecoded: 'fe80::7:8%eth0', -// port: 8080, -// path: '', -// pathqf: '', -// query: null, -// fragment: null, -// href: 'http://user:pass@[fe80::7:8%eth0]:8080/' -// } +// { scheme: 'http', host: 'fe80::7:8%eth0', port: 8080, path: '', href: 'http://user:pass@[fe80::7:8%eth0]:8080/', … } ``` -### recomposeURI(components) - -Recompose an URI from its components with basic URI checking. +
-The empty string is returned if unable to recompose the URI. +
+recomposeURI(components) -**Rules**: +
-1. scheme is required and must be at least 1 character; -2. path is required and can be empty; -3. if host is present path must be empty or start with `/`; -4. if host is not present path must not start with `//`; -5. host, if any, must be at least 3 characters; -6. userinfo will be ignored if empty; -7. port will be ignored if unable to parse it into an integer between 0 - 65535; -8. query will be ignored if empty; -9. fragment will be ignored if empty. +Recomposes a URI from its components per **RFC-3986 §5.3**, with basic validity checking. Returns the empty string when the rules below are not met. -**Support**: +**Parameters** -- IPv4 and IPv6. - -**Note**: - -- `/` is added to any URI with a host and an empty path. - -
- -**Generic syntax**: - -![URI Syntax](assets/uri-syntax.png "URI Syntax") +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `components` | [`URIComponents`](#types) | *(required)* | The components to recompose. | -
+**Returns** — `string`. The recomposed URI (or `''` on invalid input). -**Based on**: +**Notes** -- __RFC-3986__. +- `scheme` is required and must be at least one character. +- `path` is required and may be empty. +- If `host` is present, `path` must be empty or start with `/`. +- If `host` is absent, `path` must not start with `//`. +- `host`, if present, must be at least three characters. +- `userinfo` is ignored when empty. +- `port` is ignored when not parseable as an integer in `0–65535`. +- `query` and `fragment` are ignored when empty. +- A trailing `/` is added to any URI with a host and an empty path. -
- -- `components` ****: - - `scheme`* **** The URI scheme. - - `userinfo` **** The URI userinfo. - - `host` **** The URI authority's host. - - `port` **** The URI authority's port. - - `path`* **** The URI path. - - `query` **** The URI query. - - `fragment` **** The URI fragment. -- Returns: **** - -
- -**Examples**: - -```javascript -recomposeURI(); // '' - -recomposeURI({ - scheme: null, - userinfo: 'user:pass', - host: 'example.com', - port: 8080, - path: null, - query: 'a=b', - fragment: 'anchor', -}); // '' - -recomposeURI({ - scheme: 'foo', - userinfo: null, - host: null, - port: null, - path: '', - query: null, - fragment: null, -}); // 'foo:' +**Examples** +```ts recomposeURI({ scheme: 'foo', userinfo: 'user:pass', @@ -348,6 +333,9 @@ recomposeURI({ fragment: 'anchor', }); // 'foo://user:pass@bar.com:8080/over/there?a=b#anchor' +recomposeURI({ scheme: 'foo', path: '' }); +// 'foo:' + recomposeURI({ scheme: 'foo', userinfo: 'user:pass', @@ -357,1164 +345,635 @@ recomposeURI({ query: 'a=b', fragment: 'anchor', }); // 'foo://user:pass@[fe80::7:8%eth0]:8080/over/there?a=b#anchor' - -recomposeURI({ - scheme: 'foo', - userinfo: '', - host: 'fe80::7:8%eth0', - port: '55g55', - path: '/over/there', - query: '', - fragment: '', -}); // 'foo://[fe80::7:8%eth0]/over/there' ``` -### isDomainLabel(label) +
-Test a label is a valid domain label according to **RFC-1034**. +### Reference resolution -> "Note that while upper and lower case letters are allowed in domain names, no significance is attached to the case. That is, two names with the same spelling but different case are to be treated as if identical." +
+resolveURI(base, reference) -By convention uppercased domain label will be considered invalid. +
-**Rules**: +Resolves a URI reference against an absolute base URI per **RFC-3986 §5.2**, applying the §5.2.2 strict transform, the §5.2.3 merge, and the §5.2.4 `remove_dot_segments` step, then recomposing the result per §5.3. -1. "*Labels must be 63 characters or less.*"; -2. can be minimum one character; -3. must only use lowercase letters, digits or hyphens; -4. must not start or end with a hyphen; -5. must not have consecutive hyphens; -6. can start or end with a digit. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `base` | `string` | *(required)* | The absolute base URI. | +| `reference` | `string` | *(required)* | The URI reference to resolve. | -**Based on**: +**Returns** — `string`. The resolved URI, or `''` when the base is not absolute or an argument is not a string. -- __RFC-1034__. +**Notes** -
+- The strict algorithm is used: a reference scheme equal to the base scheme is not ignored. +- A fragment on the base is stripped before resolution per **RFC-3986 §5.1**. -- `label` **** -- Returns: **** +**Examples** -
- -**Examples**: - -```javascript -isDomainLabel('a'); // true -isDomainLabel('1a3'); // true -isDomainLabel('1-3'); // true -isDomainLabel('1-y'); // true - -isDomainLabel(); // false -isDomainLabel('a'.repeat(64)); // false -isDomainLabel('A'); // false -isDomainLabel('-a'); // false -isDomainLabel('a-'); // false -isDomainLabel('-a'); // false -isDomainLabel('la--bel'); // false -isDomainLabel(undefined|null|NaN); // false +```ts +resolveURI('http://a/b/c/d;p?q', '../../g'); // 'http://a/g' +resolveURI('https://example.com/a/b', './c?x#y'); // 'https://example.com/a/c?x#y' +resolveURI('/not-absolute', 'g'); // '' — base is not absolute ``` -### isDomain(name) - -Test a name is a valid domain according to **RFC-1034**. +
-Supports Fully-Qualified Domain Name (FQDN) and Internationalized Domain Name (IDN). +
+removeDotSegments(path) -**Rules**: +
-1. __[labels rules apply](#isdomainlabellabel)__; -2. "*[...] the total number of octets that represent a domain name (i.e., the sum of all label octets and label lengths) is limited to 255.*"; -3. labels are separated by dots ("."); -4. must have at least one extension label; -5. must have labels different from each other; -6. last label can be empty (root label "."); -7. labels can start with `xn--` for IDNs if the ASCII serialization is a valid Punycode **and has valid characters**. +Removes complete `.` and `..` path segments from a path, following the **RFC-3986 §5.2.4** algorithm exactly. -
+**Parameters** -**Based on**: +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `path` | `string` | *(required)* | The path to normalize. | -- __RFC-1034__. +**Returns** — `string`. The normalized path. -
+**Examples** -- `name` **** -- Returns: **** - -
- -**Examples**: - -```javascript -isDomain('a.b'); // true -isDomain('a.b.'); // true -isDomain('中文.com'); // true -isDomain('xn--fiq228c.com'); // true -isDomain('www.中文.com'); // true -isDomain(`${'a'.repeat(63)}.${'b'.repeat(63)}.${'c'.repeat(63)}.${'d'.repeat(63)}`); // true - -isDomain(); // false -isDomain('a'); // false -isDomain('a.a'); // false -isDomain('a.b.a'); // false -isDomain('a.b.a'); // false -isDomain('中文.xn--fiq228c.com'); // false -isDomain('www.xn--hf.com'); // false -isDomain(`${'a'.repeat(63)}.${'b'.repeat(63)}.${'c'.repeat(63)}.${'d'.repeat(63)}.`); // false -isDomain('xn--\'-6xd.com') // false even though xn--'-6xd is a valid Punycode for ॐ but has an invalid character +```ts +removeDotSegments('/a/b/c/./../../g'); // '/a/g' +removeDotSegments('mid/content=5/../6'); // 'mid/6' ``` -### isIP(ip) +
-Test a string is a valid IP. +### Validators -Supports IPv4 and IPv6. +
+isDomainLabel(label) -
+
-- `ip` **** -- Returns: **** +Tests whether a label is a valid domain label per **RFC 1034**. By convention, an uppercase label is considered invalid: DNS names are case-insensitive, but Coroboros normalizes on lowercase. -
+**Parameters** -**Examples**: +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `label` | `string` | *(required)* | The label to test. | -```javascript -isIP('23.71.254.72'); // true -isIP('1:2:3:4::6:7:8'); // true +**Returns** — `boolean`. -isIP(); // false -isIP('100..100.100.100.'); // false -isIP('3ffe:b00::1::a'); // false -``` +**Notes** -### isIPv4(ip) +- Length is one to 63 characters. +- Allowed characters: lowercase letters, digits, hyphen. +- Cannot start or end with a hyphen. +- No consecutive hyphens. +- Can start or end with a digit. -Test a string is a valid IPv4. +**Examples** -
+```ts +isDomainLabel('a'); // true +isDomainLabel('1a3'); // true +isDomainLabel('a'.repeat(64)); // false +isDomainLabel('A'); // false +isDomainLabel('-a'); // false +isDomainLabel('la--bel'); // false +``` -- `ip` **** -- Returns: **** +
-
+
+isDomain(name) -**Examples**: +
-```javascript -isIPv4('8.8.8.8'); // true -isIPv4('1:2::8'); // false -isIPv4(); // false -``` +Tests whether a name is a valid domain per **RFC 1034**, with FQDN and IDN support. -### isIPv6(ip) +**Parameters** -Test a string is a valid IPv6. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `name` | `string` | *(required)* | The domain to test. | -
+**Returns** — `boolean`. -- `ip` **** -- Returns: **** +**Notes** -
+- [`isDomainLabel`](#validators) rules apply to each label. +- Total length is at most 255 octets including label-length octets. +- Labels are separated by `.`. +- Must have at least one extension label. +- All labels must differ. +- The last label can be empty (root label `.`). +- Labels starting with `xn--` are valid only when the ASCII serialization is a valid Punycode and the decoded form has valid characters. -**Examples**: +**Examples** -```javascript -isIPv6('2001:0000:1234:0000:0000:C1C0:ABCD:0876'); // true -isIPv6('212.58.241.131'); // false -isIPv6(); // false +```ts +isDomain('a.b'); // true +isDomain('a.b.'); // true +isDomain('中文.com'); // true +isDomain('xn--fiq228c.com'); // true +isDomain('www.中文.com'); // true + +isDomain('a'); // false +isDomain('a.a'); // false +isDomain('中文.xn--fiq228c.com'); // false +isDomain('xn--\'-6xd.com'); // false — valid Punycode for ॐ, but ॐ is not a valid character ``` -### checkURI(uri) +
-Check an URI is valid according to **RFC-3986**. +
+isIP(ip) -**Rules**: +
-1. scheme is required and cannot be empty; -2. path is required and can be empty; -3. if authority is present path must be empty or start with `/`; -4. if authority is not present path must not start with `//`; -5. __scheme can only have specific characters__; -6. if authority is present: +Tests whether a string is a valid IPv4 or IPv6 address. -- host must be a valid IP or domain name; -- __userinfo, if any, can only have specific characters__; -- port, if any, must be an integer between 0 - 65535. +**Parameters** -7. __path, query and fragment can only have specific characters__. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `ip` | `string` | *(required)* | The address to test. | -
+**Returns** — `boolean`. -**Generic syntax**: +**Examples** -![URI Syntax](assets/uri-syntax.png "URI Syntax") +```ts +isIP('23.71.254.72'); // true +isIP('1:2:3:4::6:7:8'); // true +isIP('100..100.100.100'); // false +isIP('3ffe:b00::1::a'); // false +``` -
- -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- Returns: **** - - `scheme` **** The URI scheme. - - `authority` **** The URI authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URI authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URI userinfo. *Default*: `null` - - `host` **** The URI authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URI authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URI authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URI path. - - `pathqf` **** The URI path, query and fragment. - - `query` **** The URI query. *Default*: `null` - - `fragment` **** The URI fragment. *Default*: `null` - - `href` **** The URI recomposed. *Default*: `null` - - `valid` **** Whether the URI is valid. *Default*: `false` -- Throws: **** If no error is thrown then the URI is valid. Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME_CHAR` - - `URI_INVALID_USERINFO_CHAR` - - `URI_INVALID_PORT` - - `URI_INVALID_PATH_CHAR` - - `URI_INVALID_QUERY_CHAR` - - `URI_INVALID_FRAGMENT_CHAR` - - `URI_INVALID_PERCENT_ENCODING` - -
- -**Examples**: - -```javascript -checkURI(); // throws URIError with code URI_INVALID_TYPE -checkURI('://example.com'); // throws URIError with code URI_MISSING_SCHEME -checkURI('foo:////bar'); // throws URIError with code URI_INVALID_PATH -checkURI('foo://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -checkURI('fôo:bar'); // throws URIError with code URI_INVALID_SCHEME_CHAR -checkURI('foo://üser:pass@bar.com'); // throws URIError with code URI_INVALID_USERINFO_CHAR -checkURI('foo://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -checkURI('foo://bar.com/°'); // throws URIError with code URI_INVALID_PATH_CHAR -checkURI('foo://bar.com/over/there?quêry=5'); // throws URIError with code URI_INVALID_QUERY_CHAR -checkURI('foo://bar.com/over/there?query=5#anch#r'); // throws URIError with code URI_INVALID_FRAGMENT_CHAR -checkURI('http://www.bar.baz/foo%2') // throws URIError with code URI_INVALID_PERCENT_ENCODING +
-checkURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); -// { -// scheme: 'foo', -// authority: 'user:pass@xn--fiq228c.com:8042', -// authorityPunydecoded: 'user:pass@中文.com:8042', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: 8042, -// path: '/over/there', -// pathqf: '/over/there?name=ferret#nose', -// query: 'name=ferret', -// fragment: 'nose', -// href: 'foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose', -// valid: true -// } -``` +
+isIPv4(ip) -### checkHttpURL(uri) - -Check an URI is a valid HTTP URL. - -**Rules**: - -1. __[must be a valid URI](#checkuriuri)__; -1. scheme must be `http` or `HTTP`; -2. authority is required; -3. URL must be less than 2048 characters. - -
- -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- Returns: **** - - `scheme` **** The URL scheme. - - `authority` **** The URL authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URL authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URL userinfo. *Default*: `null` - - `host` **** The URL authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URL authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URL authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URL path. - - `pathqf` **** The URI path, query and fragment. - - `query` **** The URL query. *Default*: `null` - - `fragment` **** The URL fragment. *Default*: `null` - - `href` **** The URL recomposed. *Default*: `null` - - `valid` **** Whether the URL is valid. *Default*: `false` -- Throws: **** If no error is thrown then the URL is valid. Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_USERINFO_CHAR` - - `URI_INVALID_PORT` - - `URI_INVALID_PATH_CHAR` - - `URI_INVALID_QUERY_CHAR` - - `URI_INVALID_FRAGMENT_CHAR` - - `URI_INVALID_PERCENT_ENCODING` - - `URI_INVALID_SCHEME` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -checkHttpURL(); // throws URIError with code URI_INVALID_TYPE -checkHttpURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -checkHttpURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -checkHttpURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -checkHttpURL('http://üser:pass@bar.com'); // throws URIError with code URI_INVALID_USERINFO_CHAR -checkHttpURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -checkHttpURL('http://bar.com/°'); // throws URIError with code URI_INVALID_PATH_CHAR -checkHttpURL('http://bar.com/over/there?quêry=5'); // throws URIError with code URI_INVALID_QUERY_CHAR -checkHttpURL('http://bar.com/over/there?query=5#anch#r'); // throws URIError with code URI_INVALID_FRAGMENT_CHAR -checkHttpURL('http://www.bar.baz/foo%2') // throws URIError with code URI_INVALID_PERCENT_ENCODING -checkHttpURL('httê://bar.com:8080'); // throws URIError with code URI_INVALID_SCHEME -checkHttpURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -checkHttpURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +
-checkHttpURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); -// { -// scheme: 'http', -// authority: 'user:pass@xn--fiq228c.com:8042', -// authorityPunydecoded: 'user:pass@中文.com:8042', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: 8042, -// path: '/over/there', -// pathqf: '/over/there?name=ferret#nose', -// query: 'name=ferret', -// fragment: 'nose', -// href: 'http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose', -// valid: true -// } -``` +Tests whether a string is a valid IPv4 address. Returns `false` for IPv6. -### checkHttpsURL(uri) +```ts +isIPv4('8.8.8.8'); // true +isIPv4('1:2::8'); // false +``` -Check an URI is a valid HTTPS URL. Same behavior than __[checkHttpURL](#checkhttpurluri)__ except scheme must be `https` or `HTTPS`. +
-### checkHttpSitemapURL(uri) +
+isIPv6(ip) -Check an URI is a valid HTTP URL to be used in an XML sitemap file. +
-For text sitemap please refer to __[checkHttpURL](#checkhttpurluri)__ as there is no need to escape entities **but URL must be in lowercase**. +Tests whether a string is a valid IPv6 address. Returns `false` for IPv4. The standalone validator is lenient regarding zone identifiers — see [`checkURI`](#checkers) for the strict **RFC 6874** form expected inside a URI. -**Rules**: +```ts +isIPv6('2001:0000:1234:0000:0000:C1C0:ABCD:0876'); // true +isIPv6('212.58.241.131'); // false +``` -1. __[must be a valid URL](#checkhttpurluri)__; -1. scheme must be `http`; -2. authority is required; -3. specific characters must be escaped; -4. can only contain lowercase characters (prechecked); -5. URL must be less than 2048 characters. +
-**Valid URI characters to be escaped or percent-encoded in a sitemap URL**: +### Checkers -| Character | Value | Escape Code | -| :----------- |:-----:| :---------: | -| Ampersand | `&` | `&` | -| Single Quote | `'` | `'` | -| Asterisk | `*` | `%2A` | +
+checkURI(uri) -
- -**Based on**: - -- __RFC-3986__; -- __Google: Build and submit a sitemap__. - -
- -- `uri` **** -- Returns: **** - - `scheme` **** The URL scheme. - - `authority` **** The URL authority with the Punycode ASCII serialization of the domain. *Default*: `null` - - `authorityPunydecoded` **** The URL authority with the Unicode serialization of the domain. *Default*: `null` - - `userinfo` **** The URL userinfo. *Default*: `null` - - `host` **** The URL authority's host with the Punycode ASCII serialization of the domain. *Default*: `null` - - `hostPunydecoded` **** The URL authority's host with the Unicode serialization of the domain. *Default*: `null` - - `port` **** || **** The URL authority's port. A string if not able to be parsed in an integer. *Default*: `null` - - `path` **** The URL path. - - `pathqf` **** The URI path, query and fragment. - - `query` **** The URL query. *Default*: `null` - - `fragment` **** The URL fragment. *Default*: `null` - - `href` **** The URL recomposed. *Default*: `null` - - `valid` **** Whether the URL is valid. *Default*: `false` -- Throws: **** If no error is thrown then the URL is valid. Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_USERINFO_CHAR` - - `URI_INVALID_PORT` - - `URI_INVALID_CHAR` - - `URI_INVALID_PATH_CHAR` - - `URI_INVALID_QUERY_CHAR` - - `URI_INVALID_FRAGMENT_CHAR` - - `URI_INVALID_PERCENT_ENCODING` - - `URI_INVALID_SITEMAP_ENCODING` - - `URI_INVALID_SCHEME` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -checkHttpSitemapURL(); // throws URIError with code URI_INVALID_TYPE -checkHttpSitemapURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -checkHttpSitemapURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -checkHttpSitemapURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -checkHttpSitemapURL('http://*ser:pass@bar.com'); // throws URIError with code URI_INVALID_USERINFO_CHAR -checkHttpSitemapURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -checkHttpSitemapURL('hTtp://bar.com/Path'); // throws URIError with code URI_INVALID_CHAR -checkHttpSitemapURL('http://bAr.com/Path'); // throws URIError with code URI_INVALID_CHAR -checkHttpSitemapURL('http://bar.com/Path'); // throws URIError with code URI_INVALID_CHAR -checkHttpSitemapURL('http://bar.com/path\''); // throws URIError with code URI_INVALID_PATH_CHAR -checkHttpSitemapURL('http://bar.com/over/there?a=5&b=9'); // throws URIError with code URI_INVALID_QUERY_CHAR -checkHttpSitemapURL('http://bar.com/over/there?a=5#anch*r'); // throws URIError with code URI_INVALID_FRAGMENT_CHAR -checkHttpSitemapURL('http://www.bar.baz/foo%2') // throws URIError with code URI_INVALID_PERCENT_ENCODING -checkHttpSitemapURL('http://www.bar.baz/foo?a=5&am;b=9') // throws URIError with code URI_INVALID_SITEMAP_ENCODING -checkHttpSitemapURL('hêtp://bar.com:8080'); // throws URIError with code URI_INVALID_SCHEME -checkHttpSitemapURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -checkHttpSitemapURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +
-checkHttpSitemapURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret&catch=rabbits#nose'); -// { -// scheme: 'http', -// authority: 'user:pass@xn--fiq228c.com:8042', -// authorityPunydecoded: 'user:pass@中文.com:8042', -// userinfo: 'user:pass', -// host: 'xn--fiq228c.com', -// hostPunydecoded: '中文.com', -// port: 8042, -// path: '/over/there', -// pathqf: '/over/there?name=ferret&catch=rabbits#nose', -// query: 'name=ferret&catch=rabbits', -// fragment: 'nose', -// href: 'http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret&catch=rabbits#nose', -// valid: true -// } -``` +Strictly validates a URI per **RFC-3986**. Returns the parsed components with `valid: true` on success; throws `URIError` with a stable [error code](#errors) on the first failure. -### checkHttpsSitemapURL(uri) +**Parameters** -Check an URI is a valid HTTPS URL to be used in an XML sitemap file. Same behavior than __[checkHttpSitemapURL](#checkhttpsitemapurluri)__ except scheme must be `https`. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI to validate. | -### checkWebURL(uri) +**Returns** — [`CheckedURI`](#types). -Check an URI is a valid HTTP or HTTPS URL. Same behavior than __[checkHttpURL](#checkhttpurluri)__ except scheme can be `http`/`HTTP` or `https`/`HTTPS`. +**Throws** — `URIError` with one of: `URI_INVALID_TYPE`, `URI_MISSING_SCHEME`, `URI_EMPTY_SCHEME`, `URI_MISSING_PATH`, `URI_INVALID_PATH`, `URI_INVALID_HOST`, `URI_INVALID_SCHEME_CHAR`, `URI_INVALID_USERINFO_CHAR`, `URI_INVALID_PORT`, `URI_INVALID_PATH_CHAR`, `URI_INVALID_QUERY_CHAR`, `URI_INVALID_FRAGMENT_CHAR`, `URI_INVALID_PERCENT_ENCODING`. -### checkSitemapURL(uri) +**Notes** -Check an URI is a valid HTTP or HTTPS URL to be used in an XML sitemap file. Same behavior than __[checkHttpSitemapURL](#checkhttpsitemapurluri)__ except scheme can be `http` or `https`. +- Scheme is required and non-empty (**RFC-3986 §3.1**). 
+- Path is required and may be empty. +- If authority is present, path must be empty or start with `/`; otherwise path must not start with `//`. +- Authority components: host must be a valid IP or domain; `userinfo` only allows the characters from **RFC-3986 §3.2.1**; `port` must be an integer in `0–65535`. +- Path, query, and fragment only allow the characters from **RFC-3986 §3.3 / §3.4 / §3.5**. +- IPv6 zone identifiers must use the `%25` delimiter and a non-empty `ZoneID` of `unreserved` / `pct-encoded` characters (**RFC 6874 §2**). -### encodeURIComponentString(component, options) +**Examples** -Encode an URI component according to **RFC-3986**. +```ts +checkURI('foo://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); +// { scheme: 'foo', host: 'xn--fiq228c.com', valid: true, … } + +checkURI(); // throws URIError — URI_INVALID_TYPE +checkURI('://example.com'); // throws URIError — URI_MISSING_SCHEME +checkURI('foo:////bar'); // throws URIError — URI_INVALID_PATH +checkURI('foo://xn--iñvalid.com'); // throws URIError — URI_INVALID_HOST +checkURI('fôo:bar'); // throws URIError — URI_INVALID_SCHEME_CHAR +checkURI('foo://üser:pass@bar.com'); // throws URIError — URI_INVALID_USERINFO_CHAR +checkURI('foo://bar.com:80g80'); // throws URIError — URI_INVALID_PORT +checkURI('foo://bar.com/°'); // throws URIError — URI_INVALID_PATH_CHAR +checkURI('foo://bar.com/over/there?quêry=5'); // throws URIError — URI_INVALID_QUERY_CHAR +checkURI('foo://bar.com/over/there?query=5#anch#r'); // throws URIError — URI_INVALID_FRAGMENT_CHAR +checkURI('http://www.bar.baz/foo%2'); // throws URIError — URI_INVALID_PERCENT_ENCODING +``` -**Support**: +
-- Sitemap's special characters, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- lower and upper case. +
+checkHttpURL(uri) -**Note**: +
-- only `userinfo`, `path`, `query` and `fragment` components can be encoded with specific rules for each type regarding valid characters (**RFC-3986**); -- `scheme` and `authority` (host and port) can never have escaped or percent-encoded characters; -- the empty string is returned if unable to encode; -- __[sitemap characters](#checkhttpsitemapurluri)__ must be in lowercase and escaped for XML sitemap URLs. +Validates a URI as an HTTP URL on top of [`checkURI`](#checkers). -
+**Adds** -**Generic syntax**: +- `scheme` must be `http` or `HTTP` — else `URI_INVALID_SCHEME`. +- `authority` is required — else `URI_MISSING_AUTHORITY`. +- URL must be shorter than 2,048 characters — else `URI_MAX_LENGTH_URL`. -![URI Syntax](assets/uri-syntax.png "URI Syntax") +**Returns** — [`CheckedURI`](#types). Throws `URIError` with any of `checkURI`'s codes plus the three above. -
- -**Based on**: - -- __RFC-3986__; -- __Google: Build and submit a sitemap__. - -
- -- `component` **** -- `options` ****: - - `type` **** The component type. If no type is provided native function *encodeURIComponent* will be used to encode each character. *Default*: `none` One of: - - `userinfo` - - `path` - - `query` - - `fragment` - - `lowercase` **** Whether the component should be returned in lowercase. *Default*: `false` - - `sitemap` **** Whether to escape Sitemap's special characters. See __[checkHttpSitemapURL](#checkhttpsitemapurluri)__. -- Returns: **** - -
- -**Examples**: - -```javascript -encodeURIComponentString(); // '' -encodeURIComponentString(''); // '' -encodeURIComponentString('cômpön€nt'); // 'c%C3%B4mp%C3%B6n%E2%82%ACnt' -encodeURIComponentString('AbC'); // 'AbC' -encodeURIComponentString('AbC', { lowercase: true }); // 'abc' -encodeURIComponentString('*'); // '*' -encodeURIComponentString('*', { sitemap: true }); // '%2A' - -// it is highly recommended to use a component type -encodeURIComponentString('A#/?@[]&\'*'); // 'A%23%2F%3F%40%5B%5D%26\'*' (native function, outdated standard) -encodeURIComponentString('A#/?@[]&\'*', { type: 'userinfo' }); // 'A%23%2F%3F%40%5B%5D&\'*' (RFC-3986 characters in userinfo) -encodeURIComponentString('A#/?@[]&\'*', { type: 'path' }); // 'A%23/%3F@%5B%5D&\'*' -encodeURIComponentString('A#/?@[]&\'*', { type: 'query' }); // 'A%23/?@%5B%5D&\'*' -encodeURIComponentString('A#/?@[]&\'*', { type: 'fragment' }); // 'A%23/?@%5B%5D&\'*' -encodeURIComponentString('A#/?@[]&\'*', { type: 'fragment', sitemap: true }); // 'a%23/?@%5B%5D&'%2A' +```ts +checkHttpURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret#nose'); +// { scheme: 'http', host: 'xn--fiq228c.com', valid: true, … } ``` -### encodeURIString(uri, options) - -Encode an URI string according to **RFC-3986** with basic checking. +
-**Checked**: +
+checkHttpsURL(uri) -- scheme is required; -- path is required, can be empty; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name. +
-**Support**: +Same as [`checkHttpURL`](#checkers) but `scheme` must be `https` or `HTTPS`. -- IDNs: returns URI with its Punycode host, if any; -- lower and upper case. +
-**Note**: +
+checkWebURL(uri) -- only `userinfo`, `path`, `query` and `fragment` can be percent-encoded; -- native function `encodeURI` encodes string according to **RFC-2396** which is outdated; -- native function `encodeURI` also encodes scheme and host that cannot have - percend-encoded characters; -- characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase. +
-
+Same as [`checkHttpURL`](#checkers) but `scheme` can be `http` / `HTTP` or `https` / `HTTPS`. -**Generic syntax**: +
-![URI Syntax](assets/uri-syntax.png "URI Syntax") +
+checkHttpSitemapURL(uri) -
+
-**Based on**: +Validates a URI as an HTTP URL fit for an XML sitemap on top of [`checkHttpURL`](#checkers). -- __RFC-3986__. +**Adds** -
+- The URL must be all lowercase (scheme, host, path, query, fragment) — else `URI_INVALID_CHAR`. +- Specific characters must be escaped — the table below lists them. +- XML entity escapes (`&amp;`, `&apos;`, `&quot;`, `&lt;`, `&gt;`) must be well-formed — else `URI_INVALID_SITEMAP_ENCODING`. -- `uri` **** -- `options` ****: - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - `URI_INVALID_TYPE` - `URI_MISSING_SCHEME` - `URI_EMPTY_SCHEME` - `URI_MISSING_PATH` - `URI_INVALID_PATH` - `URI_INVALID_HOST` - `URI_INVALID_SCHEME_CHAR` - `URI_INVALID_PORT` +**Sitemap-escaped characters** -
+| Character | Value | Escape code | +| :----------- | :---: | :---------: | +| Ampersand | `&` | `&amp;` | +| Single quote | `'` | `&apos;` | +| Double quote | `"` | `&quot;` | +| Less than | `<` | `&lt;` | +| Greater than | `>` | `&gt;` | +| Asterisk | `*` | `%2A` | -**Examples**: +For plain-text sitemaps no escaping is required — use [`checkHttpURL`](#checkers) instead, but the URL must still be lowercase. -```javascript -encodeURIString(); // throws URIError with code URI_INVALID_TYPE -encodeURIString('://example.com'); // throws URIError with code URI_MISSING_SCHEME -encodeURIString('http:////bar'); // throws URIError with code URI_INVALID_PATH -encodeURIString('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -encodeURIString('hôtp:bar'); // throws URIError with code URI_INVALID_SCHEME_CHAR -encodeURIString('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT +**Returns** — [`CheckedURI`](#types). Throws `URIError` with the union of `checkHttpURL`'s codes plus `URI_INVALID_CHAR`, `URI_INVALID_SITEMAP_ENCODING`. -encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.xn--fiq228c.com./over/there?a=b&b=c#anchor' -encodeURIString('foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); // 'foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' +```ts +checkHttpSitemapURL('http://user:pass@xn--fiq228c.com:8042/over/there?name=ferret&amp;catch=rabbits#nose'); +// { scheme: 'http', host: 'xn--fiq228c.com', valid: true, … } ``` -### encodeWebURL(uri, options) +
-Encode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs. +
+checkHttpsSitemapURL(uri) -Uses __[a fixed encodeURI function](#encodeuristringuri-options)__ to be **RFC-3986** compliant. +
-**Checked**: +Same as [`checkHttpSitemapURL`](#checkers) but `scheme` must be `https`. -- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +
-**Support**: +
+checkSitemapURL(uri) -- IDNs: returns URL with its Punycode host, if any; -- lower and upper case. +
-**Note**: +Same as [`checkHttpSitemapURL`](#checkers) but `scheme` can be `http` or `https`. -- only `userinfo`, `path`, `query` and `fragment` can be percent-encoded; -- native function `encodeURI` encodes string according to **RFC-2396** which is outdated; -- native function `encodeURI` also encodes scheme and host that cannot have - percend-encoded characters; -- characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase. +
-
+### Encoders -**Generic syntax**: +
+encodeURIComponentString(component, options) -![URI Syntax](assets/uri-syntax.png "URI Syntax") +
-
- -**Based on**: - -- __RFC-3986__. - -
- -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -encodeWebURL(); // throws URIError with code URI_INVALID_TYPE -encodeWebURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -encodeWebURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -encodeWebURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -encodeWebURL('ftp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeWebURL('hôtp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeWebURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -encodeWebURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -encodeWebURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL - -encodeWebURL('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -encodeWebURL('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.xn--fiq228c.com./over/there?a=b&b=c#anchor' -encodeWebURL('http://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); // 'http://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' -``` +Encodes a URI component per **RFC-3986**, with per-type rules and an optional Sitemap-aware mode. Returns the empty string when the input is not a string. -### encodeSitemapURL(uri) +**Parameters** -Encode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs and sitemap requirements regarding special characters to escape. +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `component` | `string` | *(required)* | The component to encode. | +| `options.type` | `'userinfo' \| 'path' \| 'query' \| 'fragment'` | *(none)* | The component type. Without a type, native `encodeURIComponent` is used (RFC-2396, outdated). | +| `options.lowercase` | `boolean` | `false` | Lowercase the component before encoding. 
| +| `options.sitemap` | `boolean` | `false` | Escape Sitemap's special characters (see [`checkHttpSitemapURL`](#checkers)). | -Uses __[a fixed encodeURI function](#encodeuristringuri-options)__ to be **RFC-3986** compliant. +**Returns** — `string`. The encoded component (or `''` on invalid input). -**Checked**: +**Notes** -- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +- Only `userinfo`, `path`, `query`, and `fragment` can be percent-encoded. `scheme` and `authority` (host and port) cannot. +- Pass a component type. Without it, native `encodeURIComponent` percent-encodes `/`, `?`, `@`, and `&` even in components where **RFC-3986** allows them unescaped (for example `/` and `@` in a path). -**Support**: +**Examples** -- Sitemap's special characters, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- IDNs: returns URI with its Punycode host, if any; -- **characters are automatically put in lowercase**. +```ts +encodeURIComponentString('cômpön€nt'); // 'c%C3%B4mp%C3%B6n%E2%82%ACnt' +encodeURIComponentString('AbC', { lowercase: true }); // 'abc' +encodeURIComponentString('*', { sitemap: true }); // '%2A' +encodeURIComponentString("A#/?@[]&'*"); // 'A%23%2F%3F%40%5B%5D%26\'*' — outdated RFC-2396 +encodeURIComponentString("A#/?@[]&'*", { type: 'userinfo' }); // 'A%23%2F%3F%40%5B%5D&\'*' +encodeURIComponentString("A#/?@[]&'*", { type: 'path' }); // 'A%23/%3F@%5B%5D&\'*' +encodeURIComponentString("A#/?@[]&'*", { type: 'fragment', sitemap: true }); +// 'a%23/?@%5B%5D&amp;&apos;%2A' +``` -**Note**: +
-- only `userinfo`, `path`, `query` and `fragment` can be percent-encoded; -- native function `encodeURI` encodes string according to **RFC-2396** which is outdated; -- native function `encodeURI` also encodes scheme and host that cannot have - percend-encoded characters; -- characters that should not be percent-encoded in **RFC-3986** are `[]` to represent IPv6 host; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase. +
+encodeURIString(uri, options) -
+
-**Generic syntax**: +Encodes a URI string per **RFC-3986** with basic validity checking and IDN support. The native `encodeURI` follows **RFC-2396**, which is outdated and over-encodes; this function fixes both issues. -![URI Syntax](assets/uri-syntax.png "URI Syntax") +**Parameters** -
- -**Based on**: - -- __RFC-3986__; -- __Google: Build and submit a sitemap__. - -
- -- `uri` **** -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` - -
- -**Examples**: - -```javascript -encodeSitemapURL(); // throws URIError with code URI_INVALID_TYPE -encodeSitemapURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -encodeSitemapURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -encodeSitemapURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -encodeSitemapURL('ftp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeSitemapURL('hôtp://bar.baz'); // throws URIError with code URI_INVALID_SCHEME -encodeSitemapURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -encodeSitemapURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -encodeSitemapURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL - -encodeSitemapURL('http://user:p\'âss@bar.baz/it\'s *ver/there?a=b&b=c#anch*r'); // 'http://user:p'%C3%A2ss@bar.baz/it's%20%2Aver/there?a=b&b=c#anch%2Ar' -``` +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI to encode. | +| `options.lowercase` | `boolean` | `false` | Lowercase the entire URI including path, query, and fragment. | -### decodeURIComponentString(component, options) +**Returns** — `string`. The encoded URI. -Decode an URI component string. +**Throws** — `URIError` with one of: `URI_INVALID_TYPE`, `URI_MISSING_SCHEME`, `URI_EMPTY_SCHEME`, `URI_MISSING_PATH`, `URI_INVALID_PATH`, `URI_INVALID_HOST`, `URI_INVALID_SCHEME_CHAR`, `URI_INVALID_PORT`. -Native function `decodeURIComponent` could throw and to be consistent with [encodeURIComponentString](#encodeuricomponentstringcomponent-options) the empty string is returned if unable to decode. +**Notes** -**Support**: +- Only `userinfo`, `path`, `query`, and `fragment` can be percent-encoded; `scheme` and `host` cannot. +- IDN hosts are serialized to Punycode. +- `[` and `]` are not percent-encoded — they delimit IPv6 hosts. 
+- By default only scheme and host are lowercased (**RFC-3986 §6.2.2.1**). Path, query, and fragment are case-sensitive — see [Limitations](#limitations) for the `lowercase` flag's scope. -- Sitemap's escape codes, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- lower and upper case. +**Examples** -
- -**Based on**: +```ts +encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -- __RFC-3986__; -- __Google: Build and submit a sitemap__. +encodeURIString('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); +// 'https://www.xn--fiq228c.com./over/there?a=b&b=c#anchor' -
+encodeURIString('foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); +// 'foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' +``` -- `component` **** -- `options` ****: - - `lowercase` **** Whether the component should be returned in lowercase. *Default*: `false` - - `sitemap` **** Whether to decode Sitemap's escape codes. See __[checkHttpSitemapURL](#checkhttpsitemapurluri)__. -- Returns: **** +
-
+
+encodeWebURL(uri, options) -**Examples**: +
-```javascript -decodeURIComponentString(); // '' -decodeURIComponentString(''); // '' -decodeURIComponentString('AbC'); // 'AbC' -decodeURIComponentString('AbC', { lowercase: true }); // 'abc' -decodeURIComponentString('%2A'); // '*' -decodeURIComponentString(''&%2A', { sitemap: true }); // '\'&*' -decodeURIComponentString('SITE&maP', { sitemap: true, lowercase: true }); // 'site&map' -``` +Encodes an HTTP or HTTPS URL per **RFC-3986**, on top of [`encodeURIString`](#encoders). Uses the same fixed-encode logic but enforces the HTTP(S) constraints. -### decodeURIString(uri, options) +**Adds** -Decode an URI string according to **RFC-3986** with basic checking. +- `scheme` must be `http` / `HTTP` or `https` / `HTTPS` — else `URI_INVALID_SCHEME`. +- `authority` is required — else `URI_MISSING_AUTHORITY`. +- URL must be shorter than 2,048 characters — else `URI_MAX_LENGTH_URL`. -**Checked**: +**Parameters and options** — identical to [`encodeURIString`](#encoders). -- scheme is required; -- path is required, can be empty; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name. +**Examples** -**Support**: +```ts +encodeWebURL('HTTPS://WWW.中文.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.xn--fiq228c.com./Over/There?a=B&b=c#Anchor' -- IDNs: returns URI with its Punydecoded host (Unicode serialization of the domain), if any; -- lower and upper case. +encodeWebURL('http://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr'); +// 'http://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r' +``` -**Note**: +
-- if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; -- native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase; -- to only use with [encodeURIString](#encodeuristringuri-options). +
+encodeSitemapURL(uri) -
+
-**Based on**: +Encodes an HTTP or HTTPS URL for an XML sitemap on top of [`encodeWebURL`](#encoders) — applies Sitemap escape codes and lowercases the URL. -- __RFC-3986__. +**Adds** -
+- Sitemap's special characters are escaped (see [`checkHttpSitemapURL`](#checkers)). +- The output is fully lowercased. -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME_CHAR` - - `URI_INVALID_PORT` +**Examples** -
+```ts +encodeSitemapURL("http://user:p'âss@bar.baz/it's *ver/there?a=b&b=c#anch*r"); +// 'http://user:p&apos;%C3%A2ss@bar.baz/it&apos;s%20%2Aver/there?a=b&amp;b=c#anch%2Ar' +``` -**Examples**: +
-```javascript -decodeURIString(); // throws URIError with code URI_INVALID_TYPE -decodeURIString('://example.com'); // throws URIError with code URI_MISSING_SCHEME -decodeURIString('http:////bar'); // throws URIError with code URI_INVALID_PATH -decodeURIString('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -decodeURIString('hôtp:bar'); // throws URIError with code URI_INVALID_SCHEME_CHAR -decodeURIString('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT +### Decoders -decodeURIString('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); // 'http://中文.com/?query=ॐ' -decodeURIString('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.中文.com./Over/There?a=B&b=c#Anchor' -decodeURIString('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.中文.com./over/there?a=b&b=c#anchor' -decodeURIString('foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r'); // 'foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr' -``` +
+decodeURIComponentString(component, options) -### decodeWebURL(uri, options) +
-Decode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs. +Decodes a URI component string. Returns the empty string when the input cannot be decoded (`decodeURIComponent` would throw). -Uses __[a fixed decodeURI function](#decodeuristringuri-options)__ to be **RFC-3986** compliant. +**Parameters** -**Checked**: +| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `component` | `string` | *(required)* | The component to decode. | +| `options.lowercase` | `boolean` | `false` | Lowercase the result. | +| `options.sitemap` | `boolean` | `false` | Decode Sitemap escape codes (see [`checkHttpSitemapURL`](#checkers)). | -- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +**Returns** — `string`. The decoded component (or `''` on invalid input). -**Support**: +**Examples** -- IDNs: returns URI with its Punydecoded host (Unicode serialization of the domain), if any; -- lower and upper case. +```ts +decodeURIComponentString('%2A'); // '*' +decodeURIComponentString('&apos;&amp;%2A', { sitemap: true }); // "'&*" +decodeURIComponentString('SITE&amp;maP', { sitemap: true, lowercase: true }); +// 'site&map' +``` -**Note**: +
-- if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; -- native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase; -- to only use with [encodeWebURL](#encodeweburluri-options). +
+decodeURIString(uri, options) -
+
-**Based on**: +Decodes a URI string per **RFC-3986** with basic validity checking and IDN support — the inverse of [`encodeURIString`](#encoders). -- __RFC-3986__. +**Parameters** -
+| Option | Type | Default | Description | +| --- | --- | --- | --- | +| `uri` | `string` | *(required)* | The URI to decode. | +| `options.lowercase` | `boolean` | `false` | Lowercase the entire URI including path, query, and fragment. | -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` +**Returns** — `string`. The decoded URI. -
+**Throws** — `URIError` with one of: `URI_INVALID_TYPE`, `URI_MISSING_SCHEME`, `URI_EMPTY_SCHEME`, `URI_MISSING_PATH`, `URI_INVALID_PATH`, `URI_INVALID_HOST`, `URI_INVALID_SCHEME_CHAR`, `URI_INVALID_PORT`. -**Examples**: +**Notes** -```javascript -decodeWebURL(); // throws URIError with code URI_INVALID_TYPE -decodeWebURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -decodeWebURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -decodeWebURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -decodeWebURL('ftp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeWebURL('hôtp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeWebURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -decodeWebURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -decodeWebURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +- A component that cannot be decoded is silently passed through (preserves the encoded form). +- IDN hosts are returned in Unicode form (Punydecoded). +- See [Limitations](#limitations) for the `lowercase` flag's scope. 
-decodeWebURL('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); // 'http://中文.com/?query=ॐ' -decodeWebURL('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); // 'https://www.中文.com./Over/There?a=B&b=c#Anchor' -decodeWebURL('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor', { lowercase: true }); // 'https://www.中文.com./over/there?a=b&b=c#anchor' -decodeWebURL('http://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r'); // 'http://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr' -``` +**Examples** -### decodeSitemapURL(uri, options) +```ts +decodeURIString('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); +// 'http://中文.com/?query=ॐ' -Decode an URI string with basic checking based on **RFC-3986** standard applied to HTTP and HTTPS URLs and sitemap requirements regarding escape codes to decode. +decodeURIString('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.中文.com./Over/There?a=B&b=c#Anchor' -Uses __[a fixed decodeURI function](#decodeuristringuri-options)__ to be **RFC-3986** compliant. +decodeURIString('foo://us%C3%ABr:p%C3%A2ss@bar.baz:8080/Ov%C3%ABr%20There?%C3%B9=B&b=c#Anch%C3%B4r'); +// 'foo://usër:pâss@bar.baz:8080/Ovër There?ù=B&b=c#Anchôr' +``` -**Checked**: +
-- scheme must be `http`/`HTTP` or `https`/`HTTPS`; -- path is required, can be empty; -- authority is required; -- port, if any, must be an integer between 0 - 65535; -- host must be a valid IP or domain name; -- URL must be less than 2048 characters. +
+decodeWebURL(uri, options) -**Support**: +
-- Sitemap's escape codes, see __[checkHttpSitemapURL](#checkhttpsitemapurluri)__; -- IDNs: returns URI with its Punydecoded host (Unicode serialization of the domain), if any; -- lower and upper case. +Decodes an HTTP or HTTPS URL per **RFC-3986** on top of [`decodeURIString`](#decoders) — the inverse of [`encodeWebURL`](#encoders). -**Note**: +**Adds** -- if one of `userinfo`, `path`, `query` or `fragment` component cannot be decoded, it will be ignored; -- native function `decodeURI` does not support IDNs and cannot properly work with `encodeURI` since the function is based on an outdated standard; -- to stay fully **RFC-3986** compliant, scheme and host are put in lowercase; -- to only use with [encodeSitemapURL](#encodesitemapurluri). +- `scheme` must be `http` / `HTTP` or `https` / `HTTPS` — else `URI_INVALID_SCHEME`. +- `authority` is required — else `URI_MISSING_AUTHORITY`. +- URL must be shorter than 2,048 characters — else `URI_MAX_LENGTH_URL`. -
+**Examples** -**Based on**: +```ts +decodeWebURL('HTTPS://WWW.xn--fiq228c.COM./Over/There?a=B&b=c#Anchor'); +// 'https://www.中文.com./Over/There?a=B&b=c#Anchor' +``` -- __RFC-3986__; -- __Google: Build and submit a sitemap__. +
-
+
+decodeSitemapURL(uri, options) -- `uri` **** -- `options` ****: - - `lowercase` **** Whether the uri should be returned in lowercase. *Default*: `false` -- Returns: **** -- Throws: **** Error codes: - - `URI_INVALID_TYPE` - - `URI_MISSING_SCHEME` - - `URI_EMPTY_SCHEME` - - `URI_MISSING_PATH` - - `URI_INVALID_PATH` - - `URI_INVALID_HOST` - - `URI_INVALID_SCHEME` - - `URI_INVALID_PORT` - - `URI_MISSING_AUTHORITY` - - `URI_MAX_LENGTH_URL` +
-
+Decodes an HTTP or HTTPS URL coming from an XML sitemap — the inverse of [`encodeSitemapURL`](#encoders). Sitemap escape codes are converted back to their characters. -**Examples**: +**Examples** -```javascript -decodeSitemapURL(); // throws URIError with code URI_INVALID_TYPE -decodeSitemapURL('://example.com'); // throws URIError with code URI_MISSING_SCHEME -decodeSitemapURL('http:////bar'); // throws URIError with code URI_INVALID_PATH -decodeSitemapURL('http://xn--iñvalid.com'); // throws URIError with code URI_INVALID_HOST -decodeSitemapURL('ftp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeSitemapURL('hôtp://bar.com'); // throws URIError with code URI_INVALID_SCHEME -decodeSitemapURL('http://bar.com:80g80'); // throws URIError with code URI_INVALID_PORT -decodeSitemapURL('http:isbn:0-486-27557-4'); // throws URIError with code URI_MISSING_AUTHORITY -decodeSitemapURL(`http://example.com/${'path'.repeat(2040)}`); // throws URIError with code URI_MAX_LENGTH_URL +```ts +decodeSitemapURL('HTTP://bar.BAZ/IT&apos;S%20OVER%2Athere%2A?a=b&amp;c=d'); +// "http://bar.baz/IT'S OVER*there*?a=b&c=d" -decodeSitemapURL('http://user%:pass@xn--fiq228c.com/%?query=%E0%A5%90#anch#or'); // 'http://中文.com/?query=ॐ' -decodeSitemapURL('HTTP://bar.BAZ/IT&apos;S%20OVER%2Athere%2A?a=b&amp;c=d'); // 'http://bar.baz/IT\'S OVER*there*?a=b&c=d' -decodeSitemapURL('http://bar.baz/IT&apos;S%20OVER%2Athere%2A?A=b&amp;c=D', { lowercase: true }); // 'http://bar.baz/it\'s over*there*?a=b&c=d' +decodeSitemapURL('http://bar.baz/IT&apos;S%20OVER%2Athere%2A?A=b&amp;c=D', { lowercase: true }); +// "http://bar.baz/it's over*there*?a=b&c=d" ``` +
## Errors -### Object structure - -Errors emitted by *@coroboros/uri* are native URIError with an additional *code* property: +Errors emitted by `@coroboros/uri` are native `URIError` instances with an additional `code` property: -```javascript +```ts { - name, - code, - message, - stack, + name: 'URIError', + code: URIErrorCode, + message: string, + stack: string, } ``` -### Codes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
namecodedescriptionmodule
URIError
URI_INVALID_TYPEURI variable type is not validsrc/checkers
URI_MISSING_SCHEMEURI scheme is missingsrc/checkers
URI_EMPTY_SCHEMEURI scheme is emptysrc/checkers
URI_INVALID_SCHEMEURI scheme is not validsrc/checkers
src/decoders
src/encoders
URI_INVALID_SCHEME_CHARURI scheme contains an invalid charactersrc/checkers
src/decoders
src/encoders
URI_MISSING_PATHURI path is missingsrc/checkers
URI_INVALID_PATHURI path is not valid based on RFC-3986src/checkers
URI_MISSING_AUTHORITYURI authority is missingsrc/checkers
src/decoders
src/encoders
URI_INVALID_HOSTURI host is not valid IP or domainsrc/checkers
URI_INVALID_PORTURI port is not a numbersrc/checkers
src/decoders
src/encoders
URI_INVALID_CHARURI contains an invalid charactersrc/checkers
URI_INVALID_USERINFO_CHARURI userinfo contains an invalid charactersrc/checkers
URI_INVALID_PATH_CHARURI path contains an invalid charactersrc/checkers
URI_INVALID_QUERY_CHARURI query contains an invalid charactersrc/checkers
URI_INVALID_FRAGMENT_CHARURI fragment contains an invalid charactersrc/checkers
URI_INVALID_PERCENT_ENCODINGA percent-encoding character is not validsrc/checkers
URI_INVALID_SITEMAP_ENCODINGURI contains an invalid sitemap escape codesrc/checkers
URI_MAX_LENGTH_URLMaximum URL allowed length of 2048 characters has been reachedsrc/checkers
+The `code` field is a stable string discriminant safe for runtime branching. + +
+Error codes + +
+ +| Code | Description | Module | +| --- | --- | --- | +| `URI_INVALID_TYPE` | URI variable type is not valid. | `src/checkers` | +| `URI_MISSING_SCHEME` | URI scheme is missing. | `src/checkers` | +| `URI_EMPTY_SCHEME` | URI scheme is empty. | `src/checkers` | +| `URI_INVALID_SCHEME` | URI scheme is not valid. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_INVALID_SCHEME_CHAR` | URI scheme contains an invalid character. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_MISSING_PATH` | URI path is missing. | `src/checkers` | +| `URI_INVALID_PATH` | URI path is not valid per **RFC-3986**. | `src/checkers` | +| `URI_MISSING_AUTHORITY` | URI authority is missing. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_INVALID_HOST` | URI host is not a valid IP or domain. | `src/checkers` | +| `URI_INVALID_PORT` | URI port is not a number. | `src/checkers`, `src/decoders`, `src/encoders` | +| `URI_INVALID_CHAR` | URI contains an invalid character. | `src/checkers` | +| `URI_INVALID_USERINFO_CHAR` | URI userinfo contains an invalid character. | `src/checkers` | +| `URI_INVALID_PATH_CHAR` | URI path contains an invalid character. | `src/checkers` | +| `URI_INVALID_QUERY_CHAR` | URI query contains an invalid character. | `src/checkers` | +| `URI_INVALID_FRAGMENT_CHAR` | URI fragment contains an invalid character. | `src/checkers` | +| `URI_INVALID_PERCENT_ENCODING` | A percent-encoding character is not valid. | `src/checkers` | +| `URI_INVALID_SITEMAP_ENCODING` | URI contains an invalid sitemap escape code. | `src/checkers` | +| `URI_MAX_LENGTH_URL` | Maximum URL length of 2,048 characters has been reached. | `src/checkers` | + +
+ +## Limitations + +- A present-but-empty query or fragment (a bare `?` or `#`) is preserved and round-trips, distinct from an absent one (**RFC-3986 §5.3**). +- A port must be a string of ASCII digits (**RFC-3986 §3.2.3**) — values like `0x1F` are rejected. +- `userinfo` is delimited by the last `@`, and a non-IPv6 host/port by the last `:` (**RFC-3986 §3.2**). +- Percent-encoding hex is case-insensitive: `%3a` and `%3A` are both accepted (**RFC-3986 §6.2.2.1**). +- Inside a URI, an IPv6 zone identifier must use the `%25` delimiter and a non-empty `ZoneID` of `unreserved` / `pct-encoded` characters (**RFC 6874 §2**). The standalone [`isIPv6`](#validators) validator stays lenient. +- `encodeSitemapURL` escapes all five XML entities `& ' " < >`, and a Sitemap URL must be shorter than 2,048 characters (sitemaps.org). For example, `encodeSitemapURL('http://example.com/a&b<c>d')` returns `'http://example.com/a&amp;b&lt;c&gt;d'`. +- This is a strict **RFC-3986** toolkit, not a WHATWG URL parser — it does not apply WHATWG host/IPv4 leniency. +- IPv6 addresses are not canonicalized to **RFC 5952** form. +- The `lowercase` option lowercases the entire input including path, query, and fragment, which are case-sensitive per **RFC-3986 §6.2.2.1**. Use `lowercase` for Sitemap or convenience, not as RFC normalization. By default only scheme and host are lowercased, which is the RFC-compliant behavior. ## Contributing @@ -1523,6 +982,7 @@ Bug reports and PRs welcome. - Open an issue before submitting non-trivial PRs. - Commits follow [Conventional Commits](https://www.conventionalcommits.org/). - Run `pnpm lint && pnpm typecheck && pnpm test` before pushing. +- Run `pnpm bench` against `bench/baseline.md` when touching parser, encoders, or decoders — no regression beyond 10 % on any bucket at fixed feature set. - Target the `main` branch. 
## License diff --git a/bench/baseline.md b/bench/baseline.md new file mode 100644 index 0000000..e351137 --- /dev/null +++ b/bench/baseline.md @@ -0,0 +1,79 @@ +# Benchmark baseline + +Apple M1, Node 22.22.2. Run `pnpm bench` to reproduce. + +Native `URL` is shown for scale only. It implements the WHATWG URL model, +not strict RFC-3986: it applies host/IPv4 leniency, default-port stripping +and mandatory IDNA that this toolkit deliberately does not. The columns are +not equivalent — `@coroboros/uri` trades raw speed for RFC-3986 fidelity, +explicit validation with coded errors, and zero runtime dependencies. + +## 1.0.0 + +### parse — `parseURI(uri)` vs `new URL(uri)` + +| Bucket | parseURI | new URL | ratio | +| ------- | --------: | -------: | ----: | +| simple | 979.57 ns | 215.6 ns | 4.5x | +| typical | 1.22 µs | 377.7 ns | 3.2x | +| idn | 2.30 µs | 707.7 ns | 3.3x | +| ipv6 | 1.57 µs | 345.1 ns | 4.6x | +| long | 1.27 µs | 502.0 ns | 2.5x | + +### validate — `checkWebURL(uri)` vs `URL.canParse(uri)` + +| Bucket | checkWebURL | URL.canParse | ratio | +| ------- | ----------: | -----------: | -----: | +| simple | 2.00 µs | 128.9 ns | 15.5x | +| typical | 3.69 µs | 199.5 ns | 18.5x | +| idn | 4.53 µs | 657.5 ns | 6.9x | +| ipv6 | 3.04 µs | 232.6 ns | 13.1x | +| long | 19.43 µs | 204.9 ns | 94.8x | + +`checkWebURL` does full RFC-3986 character validation per component plus +IP/domain checks; `URL.canParse` only attempts a WHATWG parse. The `long` +bucket is a 40-segment path with a 40-pair query — the per-character +validation is linear in input length by design. 
+ +### encode / decode / recompose · typical + +| Operation | avg/iter | +| -------------- | -------: | +| `recomposeURI` | 1.55 µs | +| `decodeWebURL` | 2.94 µs | +| `encodeWebURL` | 2.97 µs | + +### ip · reference resolution + +| Operation | avg/iter | +| -------------------------- | -------: | +| `isIP` ipv4 | 32.4 ns | +| `isIP` reject | 74.9 ns | +| `isIP` ipv6 | 177.7 ns | +| `removeDotSegments` | 257.3 ns | +| `resolveURI` | 453.9 ns | + +## Bundle size + +| Format | Raw | Gzip | +| ------ | -------: | --------: | +| ESM | 55.75 kB | 12.06 kB | +| CJS | 56.44 kB | 12.18 kB | + +## Why slower than native `URL` + +`new URL` is C++-backed and lossy by design: it normalizes, strips default +ports, and discards the empty-vs-absent component distinction. This toolkit +runs a JavaScript RFC-3986 grammar, preserves every component exactly, +validates each component's characters against the RFC tables, and resolves +references through the verbatim §5.2 algorithm. The gap is the cost of +fidelity and zero dependencies, not of unoptimised code — the hot regexps +are compiled once at module load. + +## Going-forward target + +**No regression > 10 % on any bucket at fixed feature set.** A +string-grammar parser has more V8 inline-cache volatility than a tight +numeric loop; the bar is loose enough to absorb it without flapping CI. +Feature additions that legitimately cost time reset the bar for the +buckets they affect. diff --git a/bench/uri.bench.mjs b/bench/uri.bench.mjs new file mode 100644 index 0000000..eca9c4c --- /dev/null +++ b/bench/uri.bench.mjs @@ -0,0 +1,95 @@ +/** + * Micro-benchmark for @coroboros/uri over representative URI shapes. 
+ * + * Usage (from the package root): + * pnpm build && node bench/uri.bench.mjs + * + * Compares the in-package functions against the native field: + * - new URL() (throwing, WHATWG) + * - URL.canParse() (boolean, WHATWG) + * + * The native URL is a different model (WHATWG, not strict RFC-3986); it is + * shown for scale only, not as an equivalence. + */ +import { bench, do_not_optimize, group, run, summary } from 'mitata'; +import { + checkWebURL, + decodeWebURL, + encodeWebURL, + isIP, + parseURI, + recomposeURI, + removeDotSegments, + resolveURI, +} from '../dist/index.mjs'; + +const URIS = { + simple: 'http://example.com/', + typical: 'https://user:pass@example.com:8080/over/there?name=ferret&x=1#nose', + idn: 'https://中文.example.com/over/there?name=ferret#nose', + ipv6: 'http://[2001:db8::1]:8080/over/there?name=ferret#nose', + long: `https://example.com/${'segment/'.repeat(40)}?${'k=v&'.repeat(40)}#end`, +}; + +for (const [label, uri] of Object.entries(URIS)) { + group(`parse · ${label}`, () => { + summary(() => { + bench('parseURI', () => { + do_not_optimize(parseURI(uri)); + }); + bench('new URL', () => { + do_not_optimize(new URL(uri)); + }); + }); + }); +} + +for (const [label, uri] of Object.entries(URIS)) { + group(`validate · ${label}`, () => { + summary(() => { + bench('checkWebURL', () => { + try { + do_not_optimize(checkWebURL(uri)); + } catch {} + }); + bench('URL.canParse', () => { + do_not_optimize(URL.canParse(uri)); + }); + }); + }); +} + +group('encode / decode · typical', () => { + bench('encodeWebURL', () => { + do_not_optimize(encodeWebURL(URIS.typical)); + }); + bench('decodeWebURL', () => { + do_not_optimize(decodeWebURL(URIS.typical)); + }); + bench('recomposeURI', () => { + do_not_optimize(recomposeURI(parseURI(URIS.typical))); + }); +}); + +group('ip', () => { + bench('isIP · ipv4', () => { + do_not_optimize(isIP('192.168.1.1')); + }); + bench('isIP · ipv6', () => { + do_not_optimize(isIP('2001:db8::1')); + }); + bench('isIP · 
reject', () => { + do_not_optimize(isIP('999.999.999.999')); + }); +}); + +group('reference resolution', () => { + bench('resolveURI', () => { + do_not_optimize(resolveURI('http://a/b/c/d;p?q', '../../g')); + }); + bench('removeDotSegments', () => { + do_not_optimize(removeDotSegments('/a/b/c/./../../g')); + }); +}); + +await run({ colors: true }); diff --git a/biome.json b/biome.json index bfff0c7..088305f 100644 --- a/biome.json +++ b/biome.json @@ -5,6 +5,7 @@ "**", "!**/node_modules", "!**/dist", + "!**/coverage", "!**/.next", "!**/.open-next", "!**/.astro", diff --git a/package.json b/package.json index 4afb837..ae97e03 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@coroboros/uri", "version": "1.0.0", - "description": "RFC-3986 compliant, zero-dependency URI toolkit for Node.js.", + "description": "RFC-3986 URI toolkit for Node.js. IDN (RFC-3987), IPv6 zone identifiers (RFC 6874), Sitemap protocol. Zero dependencies.", "type": "module", "sideEffects": false, "main": "./dist/index.cjs", @@ -35,6 +35,7 @@ "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", + "bench": "pnpm build && node bench/uri.bench.mjs", "prepublishOnly": "pnpm lint && pnpm typecheck && pnpm test && pnpm build" }, "keywords": [ @@ -42,16 +43,22 @@ "uri", "url", "rfc-3986", + "rfc-3987", + "rfc-6874", + "rfc-1034", + "rfc-1123", + "idn", + "punycode", "parse", "encode", "decode", - "validate", "sitemap", - "punycode", + "sitemap-protocol", "ip", "domain", "typescript", - "nodejs" + "nodejs", + "zero-dependency" ], "author": "Coroboros (https://github.com/coroboros)", "license": "MIT", @@ -77,6 +84,8 @@ "@biomejs/biome": "^2.4.15", "@types/node": "^22.0.0", "@vitest/coverage-v8": "^4.1.6", + "fast-check": "^4.8.0", + "mitata": "^1.0.34", "tsdown": "^0.22.0", "typescript": "^6.0.3", "vitest": "^4.1.6" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0c35a13..7cc9b3e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,12 
@@ importers: '@vitest/coverage-v8': specifier: ^4.1.6 version: 4.1.6(vitest@4.1.6) + fast-check: + specifier: ^4.8.0 + version: 4.8.0 + mitata: + specifier: ^1.0.34 + version: 1.0.34 tsdown: specifier: ^0.22.0 version: 0.22.0(typescript@6.0.3) @@ -383,6 +389,10 @@ packages: resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==} engines: {node: '>=12.0.0'} + fast-check@4.8.0: + resolution: {integrity: sha512-GOJ158CUMnN6cSahsv4+ExARvIDuzzinFjkp0E9WtiBa5zcVeLozVkWaE4IzFcc+Y48Wp1EDlUZsXRyAztQcSg==} + engines: {node: '>=12.17.0'} + fdir@6.5.0: resolution: {integrity: sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==} engines: {node: '>=12.0.0'} @@ -519,6 +529,9 @@ packages: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} + mitata@1.0.34: + resolution: {integrity: sha512-Mc3zrtNBKIMeHSCQ0XqRLo1vbdIx1wvFV9c8NJAiyho6AjNfMY8bVhbS12bwciUdd1t4rj8099CH3N3NFahaUA==} + nanoid@3.3.12: resolution: {integrity: sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ==} engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} @@ -541,6 +554,9 @@ packages: resolution: {integrity: sha512-SoSL4+OSEtR99LHFZQiJLkT59C5B1amGO1NzTwj7TT1qCUgUO6hxOvzkOYxD+vMrXBM3XJIKzokoERdqQq/Zmg==} engines: {node: ^10 || ^12 || >=14} + pure-rand@8.4.0: + resolution: {integrity: sha512-IoM8YF/jY0hiugFo/wOWqfmarlE6J0wc6fDK1PhftMk7MGhVZl88sZimmqBBFomLOCSmcCCpsfj7wXASCpvK9A==} + quansync@1.0.0: resolution: {integrity: sha512-5xZacEEufv3HSTPQuchrvV6soaiACMFnq1H8wkVioctoH3TRha9Sz66lOxRwPK/qZj7HPiSveih9yAyh98gvqA==} @@ -1038,6 +1054,10 @@ snapshots: expect-type@1.3.0: {} + fast-check@4.8.0: + dependencies: + pure-rand: 8.4.0 + fdir@6.5.0(picomatch@4.0.4): optionalDependencies: picomatch: 4.0.4 @@ -1137,6 +1157,8 @@ snapshots: dependencies: semver: 7.8.0 + mitata@1.0.34: 
{} + nanoid@3.3.12: {} obug@2.1.1: {} @@ -1153,6 +1175,8 @@ snapshots: picocolors: 1.1.1 source-map-js: 1.2.1 + pure-rand@8.4.0: {} + quansync@1.0.0: {} resolve-pkg-maps@1.0.0: {} diff --git a/src/checkers/chars.ts b/src/checkers/chars.ts index de8a037..ef590d8 100644 --- a/src/checkers/chars.ts +++ b/src/checkers/chars.ts @@ -315,8 +315,12 @@ const isSitemapQueryOrFragmentChar = function isSitemapQueryOrFragmentChar( * * Check percent encoding legal ascii codes according to RFC-3986 https://tools.ietf.org/html/rfc3986#section-2.1. * + * HEXDIG is case-insensitive: %3a and %3A are equivalent + * (RFC-3986 https://tools.ietf.org/html/rfc3986#section-6.2.2.1). + * * 48 to 57 0-9 * 65 to 70 A-F + * 97 to 102 a-f */ const isPercentEncodingChar = function isPercentEncodingChar(char: string): boolean { if (!is(String, char)) { @@ -325,7 +329,7 @@ const isPercentEncodingChar = function isPercentEncodingChar(char: string): bool const code = char.charCodeAt(0); - return (code >= 48 && code <= 57) || (code >= 65 && code <= 70); + return (code >= 48 && code <= 57) || (code >= 65 && code <= 70) || (code >= 97 && code <= 102); }; export { diff --git a/src/checkers/index.ts b/src/checkers/index.ts index c76bf01..e5c0ce5 100644 --- a/src/checkers/index.ts +++ b/src/checkers/index.ts @@ -17,7 +17,8 @@ */ import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; +import { fail } from '../helpers/error.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { type ParsedURI, parseURI } from '../parser/index.js'; @@ -33,8 +34,6 @@ import { isUserinfoChar, } from './chars.js'; -type URIErrorWithCode = URIError & { code: string }; - export interface CheckedURI extends ParsedURI { valid: true; } @@ -46,6 +45,9 @@ export interface CheckedURISyntax extends ParsedURI { type 
CharChecker = (char: string, encode?: boolean) => boolean; +// RFC 6874 §2: ZoneID = 1*( unreserved / pct-encoded ). Compiled once. +const ipv6ZoneIdRegexp = /^(?:[A-Za-z0-9._~-]|%[0-9A-Fa-f]{2})+$/; + /** * @func checkPercentEncoding * @@ -58,11 +60,7 @@ const checkPercentEncoding = function checkPercentEncoding( stringLen: number, ): number { if (!is(String, string)) { - const error = new URIError( - 'a string is required when checking for percent encoding', - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + fail('URI_INVALID_PERCENT_ENCODING', 'a string is required when checking for percent encoding'); } const len = is(Number, stringLen) && stringLen >= 0 ? stringLen : string.length; @@ -74,24 +72,20 @@ const checkPercentEncoding = function checkPercentEncoding( // example: %20 or %C3%BC if (i + 2 < len) { if (!isPercentEncodingChar(string.charAt(i + 1))) { - const error = new URIError( + fail( + 'URI_INVALID_PERCENT_ENCODING', `invalid percent encoding char '${string.charAt(i + 1)}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + ); } else if (!isPercentEncodingChar(string.charAt(i + 2))) { - const error = new URIError( + fail( + 'URI_INVALID_PERCENT_ENCODING', `invalid percent encoding char '${string.charAt(i + 2)}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + ); } else { offset = 2; } } else { - const error = new URIError('incomplete percent encoding found') as URIErrorWithCode; - error.code = 'URI_INVALID_PERCENT_ENCODING'; - throw error; + fail('URI_INVALID_PERCENT_ENCODING', 'incomplete percent encoding found'); } } @@ -110,11 +104,7 @@ const checkSitemapEncoding = function checkSitemapEncoding( stringLen: number, ): number { if (!is(String, string)) { - const error = new URIError( - 'a string is required when checking for sitemap encoding', - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SITEMAP_ENCODING'; - throw error; + 
fail('URI_INVALID_SITEMAP_ENCODING', 'a string is required when checking for sitemap encoding'); } const len = is(Number, stringLen) && stringLen >= 0 ? stringLen : string.length; @@ -128,6 +118,7 @@ const checkSitemapEncoding = function checkSitemapEncoding( for (let j = 0; j < escapeCodesKeysLen; j += 1) { const code = escapeCodesKeys[j]; + /* v8 ignore next 3 -- unreachable: j is bounded by escapeCodesKeys.length so the index is always defined */ if (code === undefined) { break; } @@ -141,11 +132,7 @@ const checkSitemapEncoding = function checkSitemapEncoding( } if (!exists(escapeOffset)) { - const error = new URIError( - `entity '${string.charAt(i)}' is not properly escaped`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SITEMAP_ENCODING'; - throw error; + fail('URI_INVALID_SITEMAP_ENCODING', `entity '${string.charAt(i)}' is not properly escaped`); } else { offset = escapeOffset; } @@ -170,16 +157,15 @@ const checkComponent = function checkComponent({ string, sitemap, }: { - type?: string; - string?: string | null; - sitemap?: boolean; + type?: string | undefined; + string?: string | null | undefined; + sitemap?: boolean | undefined; } = {}): boolean { if (!['userinfo', 'path', 'query', 'fragment'].includes(type as string)) { - const error = new URIError( + fail( + 'URI_INVALID_CHECKING_COMPONENT', `unable to check pathqf, got '${type}' component to check`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_CHECKING_COMPONENT'; - throw error; + ); } // path is always at least empty here, userinfo, query and fragment are not required @@ -202,15 +188,17 @@ const checkComponent = function checkComponent({ case 'fragment': checkCharFunc = checkSitemap ? 
isSitemapQueryOrFragmentChar : isQueryOrFragmentChar; break; + /* v8 ignore next -- unreachable: type is validated to one of the four cases before the switch */ default: } for (let i = 0; i < len; i += 1) { // check character is valid if (!checkCharFunc(string.charAt(i))) { - const error = new URIError(`invalid ${type} char '${string.charAt(i)}'`) as URIErrorWithCode; - error.code = `URI_INVALID_${(type as string).toUpperCase()}_CHAR`; - throw error; + fail( + `URI_INVALID_${(type as string).toUpperCase()}_CHAR`, + `invalid ${type} char '${string.charAt(i)}'`, + ); } // check percent encodings @@ -237,24 +225,18 @@ const checkComponent = function checkComponent({ */ const checkSchemeChars = function checkSchemeChars(scheme: string, len?: number): boolean { if (!is(String, scheme)) { - const error = new URIError('scheme must be a string') as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', 'scheme must be a string'); } const schemeLen = is(Number, len) && len > 0 ? 
len : scheme.length; if (schemeLen <= 0) { - const error = new URIError('scheme cannot be empty') as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', 'scheme cannot be empty'); } for (let i = 0; i < schemeLen; i += 1) { if (!isSchemeChar(scheme.charAt(i), { start: i === 0 })) { - const error = new URIError(`invalid scheme char '${scheme.charAt(i)}'`) as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME_CHAR'; - throw error; + fail('URI_INVALID_SCHEME_CHAR', `invalid scheme char '${scheme.charAt(i)}'`); } } @@ -268,15 +250,11 @@ const checkSchemeChars = function checkSchemeChars(scheme: string, len?: number) */ const checkLowercase = function checkLowercase(uri: string): boolean { if (!is(String, uri)) { - const error = new URIError('uri must be a string') as URIErrorWithCode; - error.code = 'URI_INVALID_TYPE'; - throw error; + fail('URI_INVALID_TYPE', 'uri must be a string'); } if (uri.toLowerCase() !== uri) { - const error = new URIError('uri cannot contain any uppercase characters') as URIErrorWithCode; - error.code = 'URI_INVALID_CHAR'; - throw error; + fail('URI_INVALID_CHAR', 'uri cannot contain any uppercase characters'); } return true; @@ -298,9 +276,7 @@ const checkLowercase = function checkLowercase(uri: string): boolean { */ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { if (!is(String, uri)) { - const error = new URIError('uri must be a string') as URIErrorWithCode; - error.code = 'URI_INVALID_TYPE'; - throw error; + fail('URI_INVALID_TYPE', 'uri must be a string'); } // parse uri and check scheme, authority, pathname and slashes @@ -324,48 +300,58 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { // scheme (required) if (!is(String, scheme)) { - const error = new URIError('uri scheme is required') as URIErrorWithCode; - error.code = 'URI_MISSING_SCHEME'; - throw error; + fail('URI_MISSING_SCHEME', 'uri scheme is required'); + /* v8 
ignore start -- unreachable: parseURI yields a null or non-empty scheme, never an empty string */ } else if (schemeLen <= 0) { - const error = new URIError('uri scheme must not be empty') as URIErrorWithCode; - error.code = 'URI_EMPTY_SCHEME'; - throw error; + fail('URI_EMPTY_SCHEME', 'uri scheme must not be empty'); } + /* v8 ignore stop */ // path (required), can be an empty string + /* v8 ignore next 3 -- unreachable: the Appendix-B regexp always captures a string path */ if (!is(String, path)) { - const error = new URIError('uri path is required') as URIErrorWithCode; - error.code = 'URI_MISSING_PATH'; - throw error; + fail('URI_MISSING_PATH', 'uri path is required'); } // path: if authority is present path must be empty or start with / if (is(String, authority) && authority.length > 0) { - if (!(path === '' || path.startsWith('/'))) { - const error = new URIError( - "path must be empty or start with '/' when authority is present", - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PATH'; - throw error; + /* v8 ignore next 3 -- unreachable: when authority is present the Appendix-B regexp makes path empty or '/'-prefixed */ + if (!(path === '' || (path as string).startsWith('/'))) { + fail('URI_INVALID_PATH', "path must be empty or start with '/' when authority is present"); } - } else if (path.startsWith('//')) { + } else if ((path as string).startsWith('//')) { // if authority is not present path must not start with // - const error = new URIError( - "path must not start with '//' when authority is not present", - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PATH'; - throw error; + fail('URI_INVALID_PATH', "path must not start with '//' when authority is not present"); } // check for inconsistent authority (original vs parsed) which means // host parsed was actually wrong if (!exists(authority) && exists(authorityPunydecoded)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${hostPunydecoded}'`, - ) as URIErrorWithCode; 
- error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${hostPunydecoded}'`); + } + + // RFC 6874: an IPv6 zone identifier in a URI MUST use the percent-encoded + // "%25" delimiter and the ZoneID must be a non-empty 1*( unreserved / pct-encoded ) + if (is(String, host) && host.includes(':')) { + const zoneAt = host.indexOf('%'); + + if (zoneAt !== -1) { + const zoneId = host.slice(zoneAt + 3); + + if (host.slice(zoneAt, zoneAt + 3) !== '%25') { + fail( + 'URI_INVALID_HOST', + `IPv6 zone identifier must use the '%25' delimiter, got '${host}'`, + ); + } + + if (zoneId === '' || !ipv6ZoneIdRegexp.test(zoneId)) { + fail( + 'URI_INVALID_HOST', + `IPv6 zone identifier must be a non-empty RFC 6874 ZoneID, got '${host}'`, + ); + } + } } return { @@ -408,7 +394,7 @@ const checkURISyntax = function checkURISyntax(uri: string): CheckedURISyntax { */ const checkURI = function checkURI( uri: string, - { sitemap }: { sitemap?: boolean } = {}, + { sitemap }: { sitemap?: boolean | undefined } = {}, ): CheckedURI { // check uri type and syntax const { @@ -437,20 +423,18 @@ const checkURI = function checkURI( // check host is a valid ip first (RFC-3986) or a domain name if (!isIP(host as string) && !isDomain(host as string)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${host}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${host}'`); } - // check port is a number if any - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) { - const error = new URIError( + // check port is a valid RFC-3986 *DIGIT and in range if any + if ( + exists(port) && + (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) + ) { + fail( + 'URI_INVALID_PORT', `port must be an integer between ${minPortInteger}-${maxPortInteger}, got 
'${port}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PORT'; - throw error; + ); } } @@ -494,7 +478,11 @@ const checkURI = function checkURI( */ const checkHttpURL = function checkHttpURL( uri: string, - { https, web, sitemap }: { https?: boolean; web?: boolean; sitemap?: boolean } = {}, + { + https, + web, + sitemap, + }: { https?: boolean | undefined; web?: boolean | undefined; sitemap?: boolean | undefined } = {}, ): CheckedURI { // precheck case for sitemap only if (sitemap === true) { @@ -528,27 +516,18 @@ const checkHttpURL = function checkHttpURL( // scheme if (!schemesToCheck.includes(scheme as string)) { - const error = new URIError( - `scheme must be ${schemesToCheck.join(' or ')}, got '${scheme}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', `scheme must be ${schemesToCheck.join(' or ')}, got '${scheme}'`); } // authority if (!is(String, authority)) { - const error = new URIError('authority is required') as URIErrorWithCode; - error.code = 'URI_MISSING_AUTHORITY'; - throw error; + fail('URI_MISSING_AUTHORITY', 'authority is required'); } // max length - if (is(String, href) && href.length > maxLengthURL) { - const error = new URIError( - `max URL length of ${maxLengthURL} reached: ${href.length}`, - ) as URIErrorWithCode; - error.code = 'URI_MAX_LENGTH_URL'; - throw error; + // sitemaps.org: a URL must be strictly less than 2,048 characters + if (is(String, href) && href.length >= maxLengthURL) { + fail('URI_MAX_LENGTH_URL', `max URL length of ${maxLengthURL} reached: ${href.length}`); } return { diff --git a/src/decoders/index.ts b/src/decoders/index.ts index af24c76..9a533ac 100644 --- a/src/decoders/index.ts +++ b/src/decoders/index.ts @@ -9,13 +9,16 @@ import { checkSchemeChars, checkURISyntax } from '../checkers/index.js'; import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; -import { int } from 
'../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; +import { fail } from '../helpers/error.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { recomposeURI } from '../parser/index.js'; import { escapeCodes, escapeCodesKeys, pencodings, pencodingsKeys } from '../sitemap/index.js'; -type URIErrorWithCode = URIError & { code: string }; +// compiled once at module load — used only via String.prototype.replace, +// which resets lastIndex per spec, so reusing the global regexp is safe +const sitemapDecodeRegexp = new RegExp(escapeCodesKeys.concat(pencodingsKeys).join('|'), 'g'); /** * @func decodeURIComponentString @@ -31,7 +34,7 @@ type URIErrorWithCode = URIError & { code: string }; */ const decodeURIComponentString = function decodeURIComponentString( component: string, - { sitemap, lowercase }: { sitemap?: boolean; lowercase?: boolean } = {}, + { sitemap, lowercase }: { sitemap?: boolean | undefined; lowercase?: boolean | undefined } = {}, ): string { if (!is(String, component)) { return ''; @@ -40,9 +43,9 @@ const decodeURIComponentString = function decodeURIComponentString( const componentToDecode = lowercase === true ? 
component.toLowerCase() : component; if (sitemap === true) { - const regexp = new RegExp(escapeCodesKeys.concat(pencodingsKeys).join('|'), 'g'); const uriToDecode = componentToDecode.replace( - regexp, + sitemapDecodeRegexp, + /* v8 ignore next -- unreachable '': the regexp is built from these keys so every match resolves */ (match) => escapeCodes[match] || pencodings[match] || '', ); @@ -89,7 +92,15 @@ const decodeURIComponentString = function decodeURIComponentString( */ const decodeURIString = function decodeURIString( uri: string, - { web, sitemap, lowercase }: { web?: boolean; sitemap?: boolean; lowercase?: boolean } = {}, + { + web, + sitemap, + lowercase, + }: { + web?: boolean | undefined; + sitemap?: boolean | undefined; + lowercase?: boolean | undefined; + } = {}, ): string { const uriToDecode = is(String, uri) && lowercase === true ? uri.toLowerCase() : uri; const webURL = web === true || sitemap === true; @@ -111,11 +122,7 @@ const decodeURIString = function decodeURIString( // scheme must be http or https for web/sitemap or with valid chars, always in lowercase if (webURL) { if (scheme !== 'http' && scheme !== 'https') { - const error = new URIError( - `scheme must be http or https, got '${scheme}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', `scheme must be http or https, got '${scheme}'`); } } else { // check scheme characters, it is not intended to decode a scheme @@ -124,27 +131,23 @@ const decodeURIString = function decodeURIString( // authority is required and must be a valid host for web/sitemap if (webURL && !is(String, authority)) { - const error = new URIError('authority is required') as URIErrorWithCode; - error.code = 'URI_MISSING_AUTHORITY'; - throw error; + fail('URI_MISSING_AUTHORITY', 'authority is required'); } // check host is a valid ip first (RFC-3986) or a domain name if (exists(host) && !isIP(host) && !isDomain(host)) { - const error = new URIError( - `host must be a 
valid ip or domain name, got '${host}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${host}'`); } - // check port is a number if any - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) { - const error = new URIError( + // check port is a valid RFC-3986 *DIGIT and in range if any + if ( + exists(port) && + (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) + ) { + fail( + 'URI_INVALID_PORT', `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, - ) as URIErrorWithCode; - error.code = 'URI_INVALID_PORT'; - throw error; + ); } // userinfo @@ -152,13 +155,23 @@ const decodeURIString = function decodeURIString( const userinfoDecoded = decodeURIComponentString(userinfo ?? '', { sitemap, lowercase: false }); // path + /* v8 ignore next -- unreachable '': checkURISyntax always yields a string path */ const pathDecoded = decodeURIComponentString(path ?? '', { sitemap, lowercase: false }); - // query - const queryDecoded = decodeURIComponentString(query ?? '', { sitemap, lowercase: false }); + // RFC-3986 §5.3: an absent query/fragment (null) stays absent and a + // present-empty one ('') round-trips with its '?'/'#'. A non-empty + // component that fails to decode is ignored (mapped to null), per the + // documented decode contract. + const decodeComponent = (value: string | null): string | null => { + if (!is(String, value) || value === '') { + return value; + } + + return decodeURIComponentString(value, { sitemap, lowercase: false }) || null; + }; - // fragment - const fragmentDecoded = decodeURIComponentString(fragment ?? 
'', { sitemap, lowercase: false }); + const queryDecoded = decodeComponent(query); + const fragmentDecoded = decodeComponent(fragment); const uridecoded = recomposeURI({ scheme, @@ -170,12 +183,9 @@ const decodeURIString = function decodeURIString( fragment: fragmentDecoded, }); - if (webURL && uridecoded.length > maxLengthURL) { - const error = new URIError( - `max URL length of ${maxLengthURL} reached: ${uridecoded.length}`, - ) as URIErrorWithCode; - error.code = 'URI_MAX_LENGTH_URL'; - throw error; + // sitemaps.org: a URL must be strictly less than 2,048 characters + if (webURL && uridecoded.length >= maxLengthURL) { + fail('URI_MAX_LENGTH_URL', `max URL length of ${maxLengthURL} reached: ${uridecoded.length}`); } return uridecoded; @@ -209,7 +219,7 @@ const decodeURIString = function decodeURIString( */ const decodeWebURL = function decodeWebURL( uri: string, - { lowercase }: { lowercase?: boolean } = {}, + { lowercase }: { lowercase?: boolean | undefined } = {}, ): string { return decodeURIString(uri, { lowercase, web: true }); }; @@ -245,7 +255,7 @@ const decodeWebURL = function decodeWebURL( */ const decodeSitemapURL = function decodeSitemapURL( uri: string, - { lowercase }: { lowercase?: boolean } = {}, + { lowercase }: { lowercase?: boolean | undefined } = {}, ): string { return decodeURIString(uri, { lowercase, sitemap: true }); }; diff --git a/src/encoders/index.ts b/src/encoders/index.ts index 0fc474e..cf7afcb 100644 --- a/src/encoders/index.ts +++ b/src/encoders/index.ts @@ -18,7 +18,8 @@ import { import { checkSchemeChars, checkURISyntax } from '../checkers/index.js'; import { maxLengthURL, maxPortInteger, minPortInteger } from '../config/index.js'; import { isDomain } from '../domain/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; +import { fail } from '../helpers/error.js'; import { exists, is } from '../helpers/object.js'; import { isIP } from '../ip/index.js'; import { recomposeURI } from 
'../parser/index.js'; @@ -45,7 +46,15 @@ import { entities, specialChars } from '../sitemap/index.js'; */ const encodeURIComponentString = function encodeURIComponentString( component: string, - { type, sitemap, lowercase }: { type?: string; sitemap?: boolean; lowercase?: boolean } = {}, + { + type, + sitemap, + lowercase, + }: { + type?: string | undefined; + sitemap?: boolean | undefined; + lowercase?: boolean | undefined; + } = {}, ): string { if (!is(String, component)) { return ''; @@ -132,7 +141,15 @@ const encodeURIComponentString = function encodeURIComponentString( */ const encodeURIString = function encodeURIString( uri: string, - { web, sitemap, lowercase }: { web?: boolean; sitemap?: boolean; lowercase?: boolean } = {}, + { + web, + sitemap, + lowercase, + }: { + web?: boolean | undefined; + sitemap?: boolean | undefined; + lowercase?: boolean | undefined; + } = {}, ): string { const uriToEncode = is(String, uri) && lowercase === true ? uri.toLowerCase() : uri; const webURL = web === true || sitemap === true; @@ -144,11 +161,7 @@ const encodeURIString = function encodeURIString( // scheme must be http or https for web/sitemap or with valid chars, always in lowercase if (webURL) { if (scheme !== 'http' && scheme !== 'https') { - const error = new URIError(`scheme must be http or https, got '${scheme}'`) as URIError & { - code: string; - }; - error.code = 'URI_INVALID_SCHEME'; - throw error; + fail('URI_INVALID_SCHEME', `scheme must be http or https, got '${scheme}'`); } } else { // check scheme characters, it is not intended to encode a scheme @@ -157,27 +170,23 @@ const encodeURIString = function encodeURIString( // authority is required and must be a valid host for web/sitemap if (webURL && !is(String, authority)) { - const error = new URIError('authority is required') as URIError & { code: string }; - error.code = 'URI_MISSING_AUTHORITY'; - throw error; + fail('URI_MISSING_AUTHORITY', 'authority is required'); } // check host is a valid ip first 
(RFC-3986) or a domain name if (exists(host) && !isIP(host) && !isDomain(host)) { - const error = new URIError( - `host must be a valid ip or domain name, got '${host}'`, - ) as URIError & { code: string }; - error.code = 'URI_INVALID_HOST'; - throw error; + fail('URI_INVALID_HOST', `host must be a valid ip or domain name, got '${host}'`); } - // check port is a number if any - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) { - const error = new URIError( + // check port is a valid RFC-3986 *DIGIT and in range if any + if ( + exists(port) && + (!isPort(port) || int(port, { ge: minPortInteger, le: maxPortInteger }) === undefined) + ) { + fail( + 'URI_INVALID_PORT', `port must be an integer between ${minPortInteger}-${maxPortInteger}, got '${port}'`, - ) as URIError & { code: string }; - error.code = 'URI_INVALID_PORT'; - throw error; + ); } // userinfo @@ -189,25 +198,23 @@ const encodeURIString = function encodeURIString( }); // path + /* v8 ignore next -- unreachable '': checkURISyntax always yields a string path */ const pathEncoded = encodeURIComponentString(path ?? '', { sitemap, type: 'path', lowercase: false, }); - // query - const queryEncoded = encodeURIComponentString(query ?? '', { - sitemap, - type: 'query', - lowercase: false, - }); + // query — RFC-3986 §5.3: keep an absent query (null) absent; only a + // present query (including '') is encoded and re-emitted with '?' + const queryEncoded = is(String, query) + ? encodeURIComponentString(query, { sitemap, type: 'query', lowercase: false }) + : query; - // fragment - const fragmentEncoded = encodeURIComponentString(fragment ?? '', { - sitemap, - type: 'fragment', - lowercase: false, - }); + // fragment — same defined/absent distinction (RFC-3986 §5.3) + const fragmentEncoded = is(String, fragment) + ? 
encodeURIComponentString(fragment, { sitemap, type: 'fragment', lowercase: false }) + : fragment; const uriencoded = recomposeURI({ scheme, @@ -219,12 +226,9 @@ const encodeURIString = function encodeURIString( fragment: fragmentEncoded, }); - if (webURL && uriencoded.length > maxLengthURL) { - const error = new URIError( - `max URL length of ${maxLengthURL} reached: ${uriencoded.length}`, - ) as URIError & { code: string }; - error.code = 'URI_MAX_LENGTH_URL'; - throw error; + // sitemaps.org: a URL must be strictly less than 2,048 characters + if (webURL && uriencoded.length >= maxLengthURL) { + fail('URI_MAX_LENGTH_URL', `max URL length of ${maxLengthURL} reached: ${uriencoded.length}`); } return uriencoded; @@ -262,7 +266,7 @@ const encodeURIString = function encodeURIString( */ const encodeWebURL = function encodeWebURL( uri: string, - { lowercase }: { lowercase?: boolean } = {}, + { lowercase }: { lowercase?: boolean | undefined } = {}, ): string { return encodeURIString(uri, { lowercase, web: true }); }; diff --git a/src/helpers/cast.ts b/src/helpers/cast.ts index 66d7341..3a8e5da 100644 --- a/src/helpers/cast.ts +++ b/src/helpers/cast.ts @@ -86,6 +86,7 @@ const integer = function integer(thing: unknown): number | undefined { if (castNum !== undefined) { const int = parseInt(String(castNum), 10); + /* v8 ignore next -- unreachable: parseInt of a finite number's String is never NaN */ if (!Number.isNaN(int)) { castInt = int; } @@ -122,4 +123,18 @@ const int = function int(thing: unknown, { ge, le }: Range = {}): number | undef return castInt; }; -export { int, integer, num, number }; +/** + * @func isPort + * + * RFC-3986 §3.2.3: port = *DIGIT. True if the value is absent + * (null/undefined) or a possibly empty string of ASCII digits. + * The numeric range is validated separately by `int`. + * + * Rejects JS Number coercion artefacts (hex `0x1F`, scientific `1e3`, + * whitespace) that `Number()` would otherwise accept. 
+ */ +const isPort = function isPort(thing: unknown): boolean { + return thing === null || thing === undefined || /^[0-9]*$/.test(String(thing)); +}; + +export { int, integer, isPort, num, number }; diff --git a/src/helpers/error.ts b/src/helpers/error.ts new file mode 100644 index 0000000..de523ad --- /dev/null +++ b/src/helpers/error.ts @@ -0,0 +1,23 @@ +/** + * error helper + * + * - fail(code, message) -> never (throws a coded URIError) + */ + +type URIErrorWithCode = URIError & { code: string }; + +/** + * @func fail + * + * Throw a URIError carrying a stable `code` string. The thrown value is + * always `instanceof URIError`. + */ +const fail = function fail(code: string, message: string): never { + const error = new URIError(message) as URIErrorWithCode; + + error.code = code; + + throw error; +}; + +export { fail, type URIErrorWithCode }; diff --git a/src/index.ts b/src/index.ts index aeeabb2..ba2ea35 100644 --- a/src/index.ts +++ b/src/index.ts @@ -27,3 +27,4 @@ export { export { isIP, isIPv4, isIPv6 } from './ip/index.js'; export { type ParsedURI, parseURI, recomposeURI, type URIComponents } from './parser/index.js'; export { punycode, punydecode } from './punycode/index.js'; +export { removeDotSegments, resolveURI } from './resolver/index.js'; diff --git a/src/ip/index.ts b/src/ip/index.ts index f4de4ca..ca05295 100644 --- a/src/ip/index.ts +++ b/src/ip/index.ts @@ -27,6 +27,11 @@ const v6 = ` .replace(/\n/g, '') .trim(); +// compiled once at module load — these patterns are stateless (no `g` flag) +const ipv4Regexp = new RegExp(`^${v4}$`); +const ipv6Regexp = new RegExp(`^${v6}$`); +const ipRegexp = new RegExp(`(?:^${v4}$)|(?:^${v6}$)`); + /** * @func isIP * @@ -37,7 +42,7 @@ const isIP = function isIP(ip: string): boolean { return false; } - return new RegExp(`(?:^${v4}$)|(?:^${v6}$)`).test(ip); + return ipRegexp.test(ip); }; /** @@ -50,7 +55,7 @@ const isIPv4 = function isIPv4(ip: string): boolean { return false; } - return new 
RegExp(`^${v4}$`).test(ip); + return ipv4Regexp.test(ip); }; /** @@ -63,7 +68,7 @@ const isIPv6 = function isIPv6(ip: string): boolean { return false; } - return new RegExp(`^${v6}$`).test(ip); + return ipv6Regexp.test(ip); }; export { isIP, isIPv4, isIPv6 }; diff --git a/src/parser/index.ts b/src/parser/index.ts index 480ee8d..8e855f5 100644 --- a/src/parser/index.ts +++ b/src/parser/index.ts @@ -6,7 +6,7 @@ * - parseURI(uri) -> Object */ import { maxPortInteger, minPortInteger } from '../config/index.js'; -import { int } from '../helpers/cast.js'; +import { int, isPort } from '../helpers/cast.js'; import { exists, is } from '../helpers/object.js'; import { isIPv6 } from '../ip/index.js'; import { punycode, punydecode } from '../punycode/index.js'; @@ -72,8 +72,10 @@ const hostToURI = function hostToURI(host: string): string { * 5. host, if any, must be at least 3 characters; * 6. userinfo will be ignored if empty; * 7. port will be ignored if empty or not an integer; - * 8. query will be ignored if empty; - * 9. fragment will be ignored if empty. + * 8. query is emitted when defined (a string, including ''); a null + * or undefined query is omitted (RFC-3986 §5.3); + * 9. fragment is emitted when defined (a string, including ''); a null + * or undefined fragment is omitted (RFC-3986 §5.3). * * Support: * - IPv4 and IPv6. 
@@ -112,7 +114,11 @@ const recomposeURI = function recomposeURI(components?: URIComponents): string { uri += hostToURI(host); - if (exists(port) && int(port, { ge: minPortInteger, le: maxPortInteger }) !== undefined) { + if ( + exists(port) && + isPort(port) && + int(port, { ge: minPortInteger, le: maxPortInteger }) !== undefined + ) { uri += `:${port}`; } } else { @@ -130,11 +136,13 @@ const recomposeURI = function recomposeURI(components?: URIComponents): string { uri += path; } - if (is(String, query) && query.length > 0) { + // RFC-3986 §5.3: emit the delimiter whenever the component is defined, + // including the empty string (a defined-empty query/fragment) + if (is(String, query)) { uri += `?${query}`; } - if (is(String, fragment) && fragment.length > 0) { + if (is(String, fragment)) { uri += `#${fragment}`; } @@ -182,6 +190,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { } // extract uri components from RegExp + /* v8 ignore next -- unreachable []: the all-optional Appendix-B regexp always matches a non-empty string */ const [, scheme, authorityParsed, path, queryParsed, fragmentParsed] = uri.match(uriRegexp) ?? 
[]; // scheme is required and must be a not empty string or this is not an uri @@ -200,15 +209,18 @@ const parseURI = function parseURI(uri: string): ParsedURI { if (is(String, authorityParsed)) { let hostAndPort: string | null = null; - [userinfo = null, hostAndPort = null] = authorityParsed.split('@'); + // RFC-3986 §3.2.1: userinfo is delimited by the last '@' before the host + const userinfoEnd = authorityParsed.lastIndexOf('@'); - // authority had no '@' and no userinfo can be extracted - if (!exists(hostAndPort) && exists(userinfo)) { - hostAndPort = userinfo; - userinfo = null; + if (userinfoEnd === -1) { + hostAndPort = authorityParsed; + } else { + userinfo = authorityParsed.slice(0, userinfoEnd); + hostAndPort = authorityParsed.slice(userinfoEnd + 1); } // try to extract host and port only if any + /* v8 ignore next -- unreachable false branch: hostAndPort is always an assigned string after the authority split */ if (is(String, hostAndPort)) { // detect IPv6 here first const ipv6Match = hostAndPort.match(ipv6Regexp); @@ -219,14 +231,23 @@ const parseURI = function parseURI(uri: string): ParsedURI { if (Array.isArray(ipv6Match)) { [, hostParsed = null, portToCast = null] = ipv6Match; } else { - // not an ipv6 - [hostParsed = null, portToCast = null] = hostAndPort.split(':'); + // not an ipv6 — RFC-3986 §3.2.2/§3.2.3: port follows the last ':' + const portStart = hostAndPort.lastIndexOf(':'); + + if (portStart === -1) { + hostParsed = hostAndPort; + } else { + hostParsed = hostAndPort.slice(0, portStart); + portToCast = hostAndPort.slice(portStart + 1); + } } // hostPunydecoded should be the host in Unicode, host its Punycode value + /* v8 ignore start -- unreachable null branch: the ipv6 regexp's required capture means hostParsed is always a string here */ const hostLowerCase = is(String, hostParsed) ? hostParsed.toLowerCase() : null; const toASCII = punycode(hostLowerCase ?? ''); const toUnicode = punydecode(hostLowerCase ?? 
''); + /* v8 ignore stop */ // host parsed was in Unicode if (hostLowerCase !== toASCII) { @@ -250,13 +271,19 @@ const parseURI = function parseURI(uri: string): ParsedURI { // necessary to handle possible port errors when checking uri // port is a valid integer or we keep its initial value to be aware of the error // here we also don't check wrong range for the same reason - port = int(portToCast) || portToCast; + // RFC-3986 §3.2.3: a non-digit port (0x1F, 1e3, ...) is kept raw, not + // coerced by Number(), so checkURI can flag it as URI_INVALID_PORT + port = + is(String, portToCast) && portToCast.length > 0 && !isPort(portToCast) + ? portToCast + : int(portToCast) || portToCast; // recompose authority with punycode ASCII and Unicode serialization of the host // userinfo@host:port // we still want to know the original host and authority provided // to check possible uri errors: a null host with a hostPunydecoded filled // means uri parsed had an invalid host name + /* v8 ignore next -- unreachable false branch: hostPunydecoded is always an assigned string in this block */ if (exists(hostPunydecoded)) { authorityPunydecoded = ''; @@ -295,11 +322,14 @@ const parseURI = function parseURI(uri: string): ParsedURI { } // format query and fragment - const query = is(String, queryParsed) && queryParsed.length > 0 ? queryParsed : null; - const fragment = is(String, fragmentParsed) && fragmentParsed.length > 0 ? fragmentParsed : null; + // RFC-3986 §5.3: a present-but-empty query/fragment ('' from a bare '?' + // or '#') is distinct from an absent one (null) and must round-trip + const query = is(String, queryParsed) ? queryParsed : null; + const fragment = is(String, fragmentParsed) ? fragmentParsed : null; // pathqf: recompose path + query + fragment if any // using valueOf to avoid potential String objects mutation with parsed.path + /* v8 ignore next -- unreachable null branch: the Appendix-B regexp always captures a string path */ parsed.pathqf = is(String, path) ? 
path.valueOf() : null; if (is(String, parsed.pathqf)) { @@ -320,6 +350,7 @@ const parseURI = function parseURI(uri: string): ParsedURI { parsed.host = host; parsed.hostPunydecoded = hostPunydecoded; parsed.port = port; + /* v8 ignore next -- unreachable: the Appendix-B regexp always captures a string path */ parsed.path = path ?? null; parsed.query = query; parsed.fragment = fragment; diff --git a/src/resolver/index.ts b/src/resolver/index.ts new file mode 100644 index 0000000..8c10583 --- /dev/null +++ b/src/resolver/index.ts @@ -0,0 +1,216 @@ +/** + * reference resolution + * + * - removeDotSegments(path) -> String + * - resolveURI(base, reference) -> String + * + * Based on: + * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5. + */ +import { is } from '../helpers/object.js'; + +// RFC-3986 Appendix B reference-parsing regexp. Unlike parseURI this keeps +// relative references (no scheme / no authority) so §5.2.2 can resolve them. +const referenceRegexp = /^(?:([^:/?#]+):)?(?:\/\/([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?/; + +interface Reference { + scheme: string | null; + authority: string | null; + path: string; + query: string | null; + fragment: string | null; +} + +/** + * @func parseReference + * + * Split a URI-reference into its five RFC-3986 components. A component is + * null when the delimiter is absent and '' when present but empty, so the + * defined/undefined distinction §5.2.2 relies on is preserved. + */ +const parseReference = function parseReference(reference: string): Reference { + /* v8 ignore next -- unreachable []: the Appendix-B regexp is all-optional and matches any string */ + const [, scheme, authority, path, query, fragment] = reference.match(referenceRegexp) ?? []; + + return { + scheme: scheme ?? null, + authority: authority ?? null, + /* v8 ignore next -- unreachable '': the path group [^?#]* always captures a string */ + path: path ?? '', + query: query ?? null, + fragment: fragment ?? 
null, + }; +}; + +/** + * @func removeDotSegments + * + * Remove the special "." and ".." complete path segments from a path, + * implementing the RFC-3986 §5.2.4 ordered loop verbatim. + * + * Based on: + * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5.2.4. + */ +const removeDotSegments = function removeDotSegments(path: string): string { + if (!is(String, path)) { + return ''; + } + + let input = path; + let output = ''; + + while (input.length > 0) { + // 2A + if (input.startsWith('../')) { + input = input.slice(3); + } else if (input.startsWith('./')) { + input = input.slice(2); + // 2B + } else if (input.startsWith('/./')) { + input = `/${input.slice(3)}`; + } else if (input === '/.') { + input = '/'; + // 2C + } else if (input.startsWith('/../')) { + input = `/${input.slice(4)}`; + output = output.slice(0, Math.max(0, output.lastIndexOf('/'))); + } else if (input === '/..') { + input = '/'; + output = output.slice(0, Math.max(0, output.lastIndexOf('/'))); + // 2D + } else if (input === '.' || input === '..') { + input = ''; + // 2E + } else { + const start = input.startsWith('/') ? 1 : 0; + const next = input.indexOf('/', start); + + if (next === -1) { + output += input; + input = ''; + } else { + output += input.slice(0, next); + input = input.slice(next); + } + } + } + + return output; +}; + +/** + * @func merge + * + * Merge a relative reference's path with the base path, per RFC-3986 §5.2.3. + */ +const merge = function merge(base: Reference, refPath: string): string { + if (is(String, base.authority) && base.path === '') { + return `/${refPath}`; + } + + const lastSlash = base.path.lastIndexOf('/'); + + return lastSlash === -1 ? refPath : base.path.slice(0, lastSlash + 1) + refPath; +}; + +/** + * @func recompose + * + * Recompose a resolved target from its components, per RFC-3986 §5.3. + * A component is emitted whenever it is defined (non-null), including ''. 
+ */ +const recompose = function recompose(target: Reference): string { + let result = ''; + + /* v8 ignore next -- unreachable false branch: a resolved target always has a scheme (the base is absolute) */ + if (is(String, target.scheme)) { + result += `${target.scheme}:`; + } + + if (is(String, target.authority)) { + result += `//${target.authority}`; + } + + result += target.path; + + if (is(String, target.query)) { + result += `?${target.query}`; + } + + if (is(String, target.fragment)) { + result += `#${target.fragment}`; + } + + return result; +}; + +/** + * @func resolveURI + * + * Resolve a URI reference against a base URI, implementing the RFC-3986 + * §5.2.2 strict transform (with §5.2.3 merge and §5.2.4 remove_dot_segments) + * and recomposing per §5.3. + * + * The base must be an absolute URI (a scheme is required, RFC-3986 §5.2.1); + * a fragment on the base is ignored (RFC-3986 §5.1: the base is used + * stripped of any fragment); the empty string is returned if base or + * reference is invalid. + * + * Based on: + * - RFC-3986 https://tools.ietf.org/html/rfc3986#section-5.2. + */ +const resolveURI = function resolveURI(base: string, reference: string): string { + if (!(is(String, base) && is(String, reference))) { + return ''; + } + + const baseRef = parseReference(base); + + // RFC-3986 §5.2.1: the base URI MUST be an absolute URI + if (!is(String, baseRef.scheme)) { + return ''; + } + + const r = parseReference(reference); + const t: Reference = { + scheme: null, + authority: null, + path: '', + query: null, + fragment: null, + }; + + // RFC-3986 §5.2.2 (strict mode) + if (is(String, r.scheme)) { + t.scheme = r.scheme; + t.authority = r.authority; + t.path = removeDotSegments(r.path); + t.query = r.query; + } else { + if (is(String, r.authority)) { + t.authority = r.authority; + t.path = removeDotSegments(r.path); + t.query = r.query; + } else { + if (r.path === '') { + t.path = baseRef.path; + t.query = is(String, r.query) ? 
r.query : baseRef.query; + } else { + t.path = r.path.startsWith('/') + ? removeDotSegments(r.path) + : removeDotSegments(merge(baseRef, r.path)); + t.query = r.query; + } + + t.authority = baseRef.authority; + } + + t.scheme = baseRef.scheme; + } + + t.fragment = r.fragment; + + return recompose(t); +}; + +export { removeDotSegments, resolveURI }; diff --git a/src/sitemap/index.ts b/src/sitemap/index.ts index 66d9eac..c4d9ba1 100644 --- a/src/sitemap/index.ts +++ b/src/sitemap/index.ts @@ -20,7 +20,7 @@ const specialChars: Record = { }; // special chars keys -const specialCharsKeys = Object.keys(specialChars); +const specialCharsKeys: string[] = Object.keys(specialChars); // inversed special chars (percent encodings) const pencodings: Record = {}; @@ -28,16 +28,19 @@ specialCharsKeys.forEach((char) => { pencodings[specialChars[char] as string] = char; }); -const pencodingsKeys = Object.keys(pencodings); +const pencodingsKeys: string[] = Object.keys(pencodings); -// sitemap entities to be escaped in URLs +// sitemap entities to be escaped in URLs (sitemaps.org requires all five) const entities: Record = { '&': '&', "'": ''', + '"': '"', + '>': '>', + '<': '<', }; // entities keys -const entitiesKeys = Object.keys(entities); +const entitiesKeys: string[] = Object.keys(entities); // inversed entities keys (escape codes) const escapeCodes: Record = {}; @@ -46,8 +49,8 @@ entitiesKeys.forEach((entity) => { }); // escape codes keys and length -const escapeCodesKeys = Object.keys(escapeCodes); -const escapeCodesKeysLen = escapeCodesKeys.length; +const escapeCodesKeys: string[] = Object.keys(escapeCodes); +const escapeCodesKeysLen: number = escapeCodesKeys.length; export { entities, diff --git a/tests/checkers-chars.test.ts b/tests/checkers-chars.test.ts index 9efed92..30b3fc0 100644 --- a/tests/checkers-chars.test.ts +++ b/tests/checkers-chars.test.ts @@ -476,6 +476,17 @@ describe('#checkers chars', () => { } }); + // RFC-3986 §2.1 / §6.2.2.1: HEXDIG is case-insensitive 
(%3a ≡ %3A). + // A validator MUST accept lowercase a-f; rejecting them rejects valid input. + it('should accept lowercase hex digits a-f (RFC-3986 §6.2.2.1)', () => { + for (const char of 'abcdef') { + expect(isPercentEncodingChar(char)).toBe(true); + } + for (const char of 'ABCDEF0123456789') { + expect(isPercentEncodingChar(char)).toBe(true); + } + }); + it('should return false if a char does not exist', () => { expect(isPercentEncodingChar()).toBe(false); expect(isPercentEncodingChar(undefined)).toBe(false); diff --git a/tests/checkers.test.ts b/tests/checkers.test.ts index ca6bcef..749eb67 100644 --- a/tests/checkers.test.ts +++ b/tests/checkers.test.ts @@ -117,7 +117,7 @@ describe('#checkers', () => { it('should throw an uri error when percent encoding is malformed', () => { expectThrowWithCode( - () => checkPercentEncoding('percent%2encoding', 7), + () => checkPercentEncoding('percent%2gncoding', 7), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -680,6 +680,24 @@ describe('#checkers', () => { expectThrowWithCode(() => checkURISyntax('foo://'), 'URI_INVALID_HOST'); }); + // RFC 6874: an IPv6 zone identifier in a URI MUST use the percent-encoded + // "%25" delimiter; a bare "%" is invalid in URI context. + it('should require the RFC 6874 %25 zone delimiter in a URI host', () => { + expectThrowWithCode(() => checkURISyntax('http://[fe80::1%eth0]/'), 'URI_INVALID_HOST'); + expectThrowWithCode(() => checkURI('http://[fe80::1%eth0]/'), 'URI_INVALID_HOST'); + expect(() => checkURI('http://[fe80::1%25eth0]/')).not.toThrow(); + expect(() => checkWebURL('http://[fe80::1%25eth0]/')).not.toThrow(); + }); + + // RFC 6874 §2: ZoneID = 1*( unreserved / pct-encoded ) — the zone must + // be non-empty and restricted to that set after the %25 delimiter. 
+ it('should reject an empty or malformed RFC 6874 ZoneID in a URI host', () => { + expectThrowWithCode(() => checkURISyntax('http://[fe80::1%25]/'), 'URI_INVALID_HOST'); + expectThrowWithCode(() => checkURI('http://[fe80::1%25]/'), 'URI_INVALID_HOST'); + expectThrowWithCode(() => checkURI('http://[fe80::1%25e*0]/'), 'URI_INVALID_HOST'); + expect(() => checkURI('http://[fe80::1%251]/')).not.toThrow(); + }); + it('should not throw if an uri has at least a scheme and a path', () => { expect(() => checkURISyntax('http://example.com')).not.toThrow(); expect(() => checkURISyntax('http://example.com/path')).not.toThrow(); @@ -813,7 +831,7 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://user:%acpass@example.com:8042/over/there?name=ferret#nose'), + () => checkURI('foo://user:%agpass@example.com:8042/over/there?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -900,6 +918,17 @@ describe('#checkers', () => { ); }); + // RFC-3986 §3.2.3: port = *DIGIT. JS Number() coerces 0x1F/1e3/0o17 to a + // finite number; a compliant validator MUST still reject them as ports. + it('should reject Number()-coercible non-digit ports (RFC-3986 §3.2.3)', () => { + for (const bad of ['0x1F', '1e3', '0o17', '0b11']) { + expectThrowWithCode( + () => checkURI(`foo://example.com:${bad}/over/there?name=ferret#nose`), + 'URI_INVALID_PORT', + ); + } + }); + it('should throw an uri error when port is out of range', () => { expectThrowWithCode( () => checkURI(`foo://example.com:${minPortInteger - 1}/over/there?name=ferret#nose`), @@ -920,6 +949,13 @@ describe('#checkers', () => { ).not.toThrow(); }); + // RFC-3986 §2.1 / §6.2.2.1: %3a and %3A are equivalent. checkURI MUST NOT + // reject a URI solely because its percent-encodings use lowercase hex. 
+ it('should accept lowercase hex percent-encodings (RFC-3986 §6.2.2.1)', () => { + expect(() => checkURI('foo://example.com:8042/%c3%bcber/%2f?a=%3a#%7e')).not.toThrow(); + expect(() => checkURI('foo://example.com/%3a%2f%3f')).not.toThrow(); + }); + it('should throw an uri error if path has invalid characters', () => { expectThrowWithCode( () => checkURI('foo://example.com:8042/over/thère?name=ferret#nose'), @@ -1059,11 +1095,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there%Aa?name=ferret#nose'), + () => checkURI('foo://example.com:8042/over/there%Ag?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/%2cover/there%20%20?name=ferret#nose'), + () => checkURI('foo://example.com:8042/%2gover/there%20%20?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1095,11 +1131,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%ef'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%eg'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%ac'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%ag'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1107,11 +1143,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%8c'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%8g'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%a9'), + () => checkURI('foo://example.com:8042/over/there?name=ferret#nose%az'), 'URI_INVALID_PERCENT_ENCODING', ); }); @@ -1301,7 +1337,7 @@ 
describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://user:%acpass@example.com:8042/over/there?name=ferret#nose'), + () => checkHttpURL('http://user:%agpass@example.com:8042/over/there?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1546,11 +1582,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there%Aa?name=ferret#nose'), + () => checkHttpURL('http://example.com:8042/over/there%Ag?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/%2cover/there%20%20?name=ferret#nose'), + () => checkHttpURL('http://example.com:8042/%2gover/there%20%20?name=ferret#nose'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1582,11 +1618,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%ef'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%eg'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%ac'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%ag'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( @@ -1594,11 +1630,11 @@ describe('#checkers', () => { 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%8c'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%8g'), 'URI_INVALID_PERCENT_ENCODING', ); expectThrowWithCode( - () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%a9'), + () => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose%az'), 'URI_INVALID_PERCENT_ENCODING', ); }); @@ -2267,6 +2303,16 @@ describe('#checkers', () => { ); 
}); + // sitemaps.org: a URL must be strictly less than 2,048 characters, so + // maxLengthURL (2048) is an exclusive bound — exactly 2048 is rejected. + it('should reject a URL of exactly maxLengthURL and accept maxLengthURL - 1', () => { + const base = 'http://example.com/'; + const url = (len: number) => base + 'a'.repeat(len - base.length); + + expectThrowWithCode(() => checkHttpURL(url(maxLengthURL)), 'URI_MAX_LENGTH_URL'); + expect(() => checkHttpURL(url(maxLengthURL - 1))).not.toThrow(); + }); + it('should not throw an uri error when uri is a valid https url when https is true', () => { expect(() => checkHttpURL('http://example.com:8042/over/there?name=ferret#nose'), { https: true, diff --git a/tests/decoders.test.ts b/tests/decoders.test.ts index b509162..3b1129c 100644 --- a/tests/decoders.test.ts +++ b/tests/decoders.test.ts @@ -158,6 +158,12 @@ describe('#decoders', () => { expectThrowWithCode(() => decodeURIString('ht°p://example.com'), 'URI_INVALID_SCHEME_CHAR'); }); + it('should lowercase the whole uri when the lowercase option is true', () => { + expect(decodeURIString('HTTP://EXAMPLE.COM/P%20X', { web: true, lowercase: true })).toBe( + 'http://example.com/p x', + ); + }); + it('should throw an uri error if scheme is not http or https when option is web or sitemap', () => { expectThrowWithCode( () => decodeURIString('httpp://www.example.com', { web: true }), @@ -1218,6 +1224,14 @@ describe('#decoders', () => { ); }); + // sitemaps.org: decoding inverts all five XML entities — & ' + // " > < — round-tripping encodeSitemapURL. 
+ it('should decode all five sitemap XML entities (sitemaps.org)', () => { + expect(decodeSitemapURL('http://example.com/a&b'c"d<e>f')).toBe( + 'http://example.com/a&b\'c"df', + ); + }); + it('should throw an uri error if url is more than the maximal allowed length', () => { expectThrowWithCode( () => diff --git a/tests/encoders.test.ts b/tests/encoders.test.ts index ebb0214..dc0b4e3 100644 --- a/tests/encoders.test.ts +++ b/tests/encoders.test.ts @@ -231,7 +231,7 @@ describe('#encoders', () => { it('should return a string with specific escaped and percent-encoded characters when sitemap is true', () => { expect(encodeURIComponentString(AZ, { sitemap: true })).toBe(az); expect(encodeURIComponentString(disallowed, { sitemap: true })).toBe( - '%5C%5E%60%7B%7C%7D%3C%3E', + '%5C%5E%60%7B%7C%7D<>', ); expect(encodeURIComponentString("&'*", { sitemap: true })).toBe('&'%2A'); expect(encodeURIComponentString(disallowedOtherChars, { sitemap: true })).toBe( @@ -1122,9 +1122,9 @@ describe('#encoders', () => { it('should return a string with percent-encoded characters if not allowed, by default', () => { expect(encodeSitemapURL(`http://example.com/${disallowed}`)).toBe( - 'http://example.com/%5C%5E%60%7B%7C%7D%3C%3E', + 'http://example.com/%5C%5E%60%7B%7C%7D<>', ); - expect(encodeSitemapURL('http://example.com/<>')).toBe('http://example.com/%3C%3E'); + expect(encodeSitemapURL('http://example.com/<>')).toBe('http://example.com/<>'); expect(encodeSitemapURL(`http://example.com/${disallowedOtherChars}`)).toBe( 'http://example.com/%E2%82%AC%C2%B0%C3%A9%C3%B9%C3%A8%C3%A0%C3%A7%20%C2%A7%C2%A3', ); @@ -1178,6 +1178,13 @@ describe('#encoders', () => { ); }); + // sitemaps.org: all five XML entities must be escaped — &, ', ", >, < + it('should escape all five sitemap XML entities (sitemaps.org)', () => { + expect(encodeSitemapURL('http://example.com/a&b\'c"df')).toBe( + 'http://example.com/a&b'c"d<e>f', + ); + }); + it('should throw an uri error if url is more than the maximal allowed 
length when web or sitemap is true only', () => { expectThrowWithCode( () => diff --git a/tests/fixtures/chars.ts b/tests/fixtures/chars.ts index 67ac1a5..4e0dcb3 100644 --- a/tests/fixtures/chars.ts +++ b/tests/fixtures/chars.ts @@ -6,7 +6,9 @@ export const az = 'abcdefghijklmnopqrstuvwxyz'; export const AZ = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; export const GZ = 'GHIJKLMNOPQRSTUVWXYZ'; +export const gz = 'ghijklmnopqrstuvwxyz'; export const hexdig = 'ABCDEF'; +export const hexdigLower = 'abcdef'; export const digits = '0123456789'; // allowed @@ -20,7 +22,8 @@ export const sitemapSubDelims = subDelims.replace(/[*']/g, ''); export const allowedSchemeChars = `${az}${digits}+-.`; export const allowedDomainChars = `${az}${digits}-`; -export const allowedPercentEncodingChars = `${digits}${hexdig}`; +// RFC-3986 §2.1/§6.2.2.1: HEXDIG is case-insensitive (%3a ≡ %3A) +export const allowedPercentEncodingChars = `${digits}${hexdig}${hexdigLower}`; export const allowedUserinfoChars = `${unreserved}%${subDelims}:`; export const allowedPathChars = `${unreserved}%${subDelims}:@/`; @@ -42,7 +45,7 @@ export const allowedSitemapQueryOrFragmentCharsToEncode = `${allowedSitemapPathC export const disallowed = '\\^`{|}<>'; export const disallowedSchemeChars = `${AZ}${disallowed}${allowed.replace(/[-+.]/g, '')}`; export const disallowedDomainChars = `${AZ}${disallowed}${allowed.replace('-', '')}`; -export const disallowedPercentEncodingChars = `${az}${GZ}${allowed}${disallowed}`; +export const disallowedPercentEncodingChars = `${gz}${GZ}${allowed}${disallowed}`; export const disallowedUserinfoChars = '#/?@[]'; export const disallowedPathChars = '?#[]'; diff --git a/tests/fixtures/uris.ts b/tests/fixtures/uris.ts index 7d43d9e..8cc1372 100644 --- a/tests/fixtures/uris.ts +++ b/tests/fixtures/uris.ts @@ -49,7 +49,7 @@ export const notUri: unknown[] = [ 'foo://user:pa[ss@example.com:8042/over/there?name=ferret#nose', 'foo://user%:pass@example.com:8042/over/there?name=ferret#nose', 
'foo://user%20%2z:pass@example.com:8042/over/there?name=ferret#nose', - 'foo://user:%acpass@example.com:8042/over/there?name=ferret#nose', + 'foo://user:%agpass@example.com:8042/over/there?name=ferret#nose', 'foo://user:pass%@example.com:8042/over/there?name=ferret#nose', 'foo://user:pass%a@example.com:8042/over/there?name=ferret#nose', 'foo://999.999.999.999:8042/over/there?name=ferret#nose', @@ -70,8 +70,8 @@ export const notUri: unknown[] = [ 'foo://example.com:8042/over/{there?name=ferret#nose', 'foo://example.com:8042/over/there%20%20%?name=ferret#nose', 'foo://example.com:8042/over/there%2?name=ferret#nose', - 'foo://example.com:8042/over/there%Aa?name=ferret#nose', - 'foo://example.com:8042/%2cover/there%20%20?name=ferret#nose', + 'foo://example.com:8042/over/there%Ag?name=ferret#nose', + 'foo://example.com:8042/%2gover/there%20%20?name=ferret#nose', 'foo://example.com:8042/%a2over/there%20%20%?name=ferret#nose', 'foo://example.com:8042/%gover/there%20%20%?name=ferret#nose', 'foo://example.com:8042/%20over/there%20%20%?name=ferret%#nose', @@ -79,11 +79,11 @@ export const notUri: unknown[] = [ 'foo://example.com:8042/over/there%20%20%?name=f%erret#nose', 'foo://example.com:8042/over/there?name=ferret#nose%', 'foo://example.com:8042/over/there?name=ferret#nose%A', - 'foo://example.com:8042/over/there?name=ferret#nose%ef', - 'foo://example.com:8042/over/there?name=ferret#nose%ac', + 'foo://example.com:8042/over/there?name=ferret#nose%eg', + 'foo://example.com:8042/over/there?name=ferret#nose%ag', 'foo://example.com:8042/over/there?name=ferret#nose%9', - 'foo://example.com:8042/over/there?name=ferret#nose%8c', - 'foo://example.com:8042/over/there?name=ferret#nose%a9', + 'foo://example.com:8042/over/there?name=ferret#nose%8g', + 'foo://example.com:8042/over/there?name=ferret#nose%az', 'foo://example.com:8042/over/"there?name=ferret#nose', ]; @@ -95,8 +95,8 @@ export const http: string[] = [ 'http://user:pass@127.0.0.1:8080/', 'http://user:pass@223.255.255.255/', 
'http://[2001:0000:1234:0000:0000:c1c0:abcd:0876]:8080/', - 'http://user:pass@[fe80::7:8%eth0]:8080/', - 'http://user:pass@[fe80::7:8%eth0]/path?q=5#anchor', + 'http://user:pass@[fe80::7:8%25eth0]:8080/', + 'http://user:pass@[fe80::7:8%25eth0]/path?q=5#anchor', 'http://example.com./', 'http://www.example.com./', 'http://www.example.com/', @@ -210,7 +210,7 @@ export const notSitemap: string[] = [ 'http://example.com:8042/over/there%20%20%?name=ferret#nose', 'http://example.com:8042/over/there%2?name=ferret#nose', 'http://example.com:8042/over/there%Aa?name=ferret#nose', - 'http://example.com:8042/%2cover/there%20%20?name=ferret#nose', + 'http://example.com:8042/%2gover/there%20%20?name=ferret#nose', 'http://example.com:8042/%a2over/there%20%20%?name=ferret#nose', 'http://example.com:8042/%gover/there%20%20%?name=ferret#nose', 'http://example.com:8042/%20over/there%20%20%?name=ferret%#nose', @@ -218,11 +218,11 @@ export const notSitemap: string[] = [ 'http://example.com:8042/over/there%20%20%?name=f%erret#nose', 'http://example.com:8042/over/there?name=ferret#nose%', 'http://example.com:8042/over/there?name=ferret#nose%A', - 'http://example.com:8042/over/there?name=ferret#nose%ef', - 'http://example.com:8042/over/there?name=ferret#nose%ac', + 'http://example.com:8042/over/there?name=ferret#nose%eg', + 'http://example.com:8042/over/there?name=ferret#nose%ag', 'http://example.com:8042/over/there?name=ferret#nose%9', - 'http://example.com:8042/over/there?name=ferret#nose%8c', - 'http://example.com:8042/over/there?name=ferret#nose%a9', + 'http://example.com:8042/over/there?name=ferret#nose%8g', + 'http://example.com:8042/over/there?name=ferret#nose%az', "http://example.com:8042/it'sover/there?name=ferret#nose", 'http://example.com:8042/it"s%20over/there?name=ferret#nose', 'http://example.com:8042/over/there?name=ferret&pseudo=superhero#nose', diff --git a/tests/helpers-cast.test.ts b/tests/helpers-cast.test.ts index afdd773..364bc9e 100644 --- 
a/tests/helpers-cast.test.ts +++ b/tests/helpers-cast.test.ts @@ -1,7 +1,7 @@ import { describe, expect, it } from 'vitest'; import { cast } from '../src/helpers/index.js'; -const { num, number, int, integer } = cast; +const { num, number, int, integer, isPort } = cast; describe('#cast helper', () => { describe('when using number', () => { @@ -484,4 +484,29 @@ describe('#cast helper', () => { expect(int(5, { ge: 4, le: 2 })).toBeUndefined(); }); }); + + // RFC-3986 §3.2.3: port = *DIGIT. Only ASCII digits (or absent) are a port; + // JS Number() coercion of hex/scientific/whitespace must be rejected. + describe('when using isPort', () => { + it('should be true for absent or digit-only values', () => { + expect(isPort(null)).toBe(true); + expect(isPort(undefined)).toBe(true); + expect(isPort('')).toBe(true); + expect(isPort('0')).toBe(true); + expect(isPort('8080')).toBe(true); + expect(isPort(8080)).toBe(true); + expect(isPort('65535')).toBe(true); + }); + + it('should be false for non-digit ports coercible by Number()', () => { + expect(isPort('0x1F')).toBe(false); + expect(isPort('1e3')).toBe(false); + expect(isPort('0b11')).toBe(false); + expect(isPort('0o17')).toBe(false); + expect(isPort(' 80 ')).toBe(false); + expect(isPort('80g42')).toBe(false); + expect(isPort('-1')).toBe(false); + expect(isPort('8.0')).toBe(false); + }); + }); }); diff --git a/tests/ip.test.ts b/tests/ip.test.ts index 2bd611f..6507928 100644 --- a/tests/ip.test.ts +++ b/tests/ip.test.ts @@ -368,6 +368,14 @@ describe('#ip', () => { }); }); + // isIPv6 is a standalone literal validator and stays lenient on the + // zone delimiter (bare '%'). RFC 6874's "%25" requirement is enforced + // only in URI context, by the checkers. 
+ it('should accept a bare % zone id when used standalone', () => { + expect(isIPv6('fe80::1%eth0')).toBe(true); + expect(isIPv6('fe80::1%25eth0')).toBe(true); + }); + it('should return false if ip is not a valid v6 ip', () => { v6not.forEach((ip) => { expect(isIPv6(ip)).toBe(false); diff --git a/tests/parser.test.ts b/tests/parser.test.ts index edd8c4b..3e3580b 100644 --- a/tests/parser.test.ts +++ b/tests/parser.test.ts @@ -542,6 +542,62 @@ describe('#parser', () => { expect(parsedURI).toHaveProperty('fragment', null); expect(parsedURI).toHaveProperty('href', 'http://user:pass@[fe80::7:8%eth0]:8080/'); }); + + // RFC-3986 §3.2.1: userinfo is delimited by the LAST '@', not the first. + // Splitting on the first '@' silently truncates the host (host confusion). + it('should split userinfo on the last @ (RFC-3986 §3.2.1)', () => { + const parsedURI = parseURI('foo://user:pa@ss@example.com:8042/p?q#f'); + + expect(parsedURI).toHaveProperty('userinfo', 'user:pa@ss'); + expect(parsedURI).toHaveProperty('host', 'example.com'); + expect(parsedURI).toHaveProperty('port', 8042); + }); + + // RFC-3986 §3.2.2/§3.2.3: for a non-IPv6 authority the port follows the + // LAST ':'; splitting on the first ':' silently truncates the host. + it('should split host and port on the last : (RFC-3986 §3.2.2)', () => { + const parsedURI = parseURI('foo://a:b:8042/p'); + + expect(parsedURI).toHaveProperty('host', null); + expect(parsedURI).toHaveProperty('hostPunydecoded', 'a:b'); + expect(parsedURI).toHaveProperty('authorityPunydecoded', 'a:b:8042'); + }); + + // RFC-3986 §5.3: a present-but-empty query/fragment ('') is distinct + // from an absent one (null) and parse → recompose must be idempotent. 
+ it('should distinguish a present-empty query/fragment from an absent one (RFC-3986 §5.3)', () => { + const withEmptyQuery = parseURI('http://example.com/?'); + expect(withEmptyQuery).toHaveProperty('query', ''); + expect(withEmptyQuery).toHaveProperty('href', 'http://example.com/?'); + + const withEmptyFragment = parseURI('http://example.com/#'); + expect(withEmptyFragment).toHaveProperty('fragment', ''); + expect(withEmptyFragment).toHaveProperty('href', 'http://example.com/#'); + + const absent = parseURI('http://example.com/'); + expect(absent).toHaveProperty('query', null); + expect(absent).toHaveProperty('fragment', null); + expect(absent).toHaveProperty('href', 'http://example.com/'); + + const both = parseURI('http://example.com/?#'); + expect(both).toHaveProperty('query', ''); + expect(both).toHaveProperty('fragment', ''); + expect(both).toHaveProperty('href', 'http://example.com/?#'); + }); + + // RFC-3986 §3.2.3: port = *DIGIT, so an empty port (zero digits) is + // syntactically valid — present-but-empty ('') and distinct from an + // absent port (null), not an error. + it('should keep an empty port present-but-empty, distinct from absent (RFC-3986 §3.2.3)', () => { + const emptyPort = parseURI('http://example.com:/path'); + expect(emptyPort).toHaveProperty('port', ''); + expect(emptyPort).toHaveProperty('host', 'example.com'); + expect(emptyPort).toHaveProperty('href', 'http://example.com:/path'); + + const absentPort = parseURI('http://example.com/path'); + expect(absentPort).toHaveProperty('port', null); + expect(absentPort).toHaveProperty('href', 'http://example.com/path'); + }); }); describe('when using recomposeURI', () => { @@ -838,7 +894,7 @@ describe('#parser', () => { expect(recomposeURI(toRecompose)).toBe('foo://u@example.com/?a=b#anchor'); }); - it('should ignore query if not at least 1 character', () => { + it('should emit ? 
for a present-empty query, omit it when null (RFC-3986 §5.3)', () => { const toRecompose = { scheme: 'foo', userinfo: null, @@ -849,7 +905,7 @@ describe('#parser', () => { fragment: 'anchor', }; - expect(recomposeURI(toRecompose)).toBe('foo://example.com/#anchor'); + expect(recomposeURI(toRecompose)).toBe('foo://example.com/?#anchor'); toRecompose.query = null; expect(recomposeURI(toRecompose)).toBe('foo://example.com/#anchor'); @@ -869,7 +925,7 @@ describe('#parser', () => { expect(recomposeURI(toRecompose)).toBe('foo://example.com/?a=b#anchor'); }); - it('should ignore fragment if not at least 1 character', () => { + it('should emit # for a present-empty fragment, omit it when null (RFC-3986 §5.3)', () => { const toRecompose = { scheme: 'foo', userinfo: null, @@ -880,7 +936,7 @@ describe('#parser', () => { fragment: '', }; - expect(recomposeURI(toRecompose)).toBe('foo://example.com/'); + expect(recomposeURI(toRecompose)).toBe('foo://example.com/#'); toRecompose.fragment = null; expect(recomposeURI(toRecompose)).toBe('foo://example.com/'); @@ -908,7 +964,7 @@ describe('#parser', () => { port: null, path: '', query: null, - fragment: '', + fragment: null, }; expect(recomposeURI(toRecompose)).toBe('foo://23.71.254.72/'); @@ -922,7 +978,7 @@ describe('#parser', () => { port: null, path: '', query: null, - fragment: '', + fragment: null, }; expect(recomposeURI(toRecompose)).toBe('foo://[::ffff:192.168.1.26]/'); diff --git a/tests/resolver.test.ts b/tests/resolver.test.ts new file mode 100644 index 0000000..5eebef6 --- /dev/null +++ b/tests/resolver.test.ts @@ -0,0 +1,139 @@ +import { describe, expect, it } from 'vitest'; +import { removeDotSegments, resolveURI } from '../src/resolver/index.js'; + +describe('#resolver', () => { + describe('when using removeDotSegments', () => { + // RFC-3986 §5.2.4 worked examples (verbatim from the specification) + it('should match the RFC-3986 §5.2.4 worked examples', () => { + 
expect(removeDotSegments('/a/b/c/./../../g')).toBe('/a/g'); + expect(removeDotSegments('mid/content=5/../6')).toBe('mid/6'); + }); + + it('should handle empty, root and trailing-dot paths', () => { + expect(removeDotSegments('')).toBe(''); + expect(removeDotSegments('/')).toBe('/'); + expect(removeDotSegments('/a/b/')).toBe('/a/b/'); + expect(removeDotSegments('a/./b')).toBe('a/b'); + expect(removeDotSegments('/.')).toBe('/'); + expect(removeDotSegments('/..')).toBe('/'); + expect(removeDotSegments('.')).toBe(''); + expect(removeDotSegments('..')).toBe(''); + expect(removeDotSegments('/a/.')).toBe('/a/'); + expect(removeDotSegments('/a/..')).toBe('/'); + }); + + it('should not treat .foo or g. as dot segments', () => { + expect(removeDotSegments('/.foo')).toBe('/.foo'); + expect(removeDotSegments('/b/c/g.')).toBe('/b/c/g.'); + expect(removeDotSegments('/b/c/..g')).toBe('/b/c/..g'); + }); + + it('should return the empty string for a non-string input', () => { + // @ts-expect-error runtime guard for non-string input + expect(removeDotSegments(null)).toBe(''); + // @ts-expect-error runtime guard for non-string input + expect(removeDotSegments(undefined)).toBe(''); + }); + }); + + describe('when using resolveURI', () => { + const base = 'http://a/b/c/d;p?q'; + + // RFC-3986 §5.4.1 — normal examples (verbatim) + const normal: [string, string][] = [ + ['g:h', 'g:h'], + ['g', 'http://a/b/c/g'], + ['./g', 'http://a/b/c/g'], + ['g/', 'http://a/b/c/g/'], + ['/g', 'http://a/g'], + ['//g', 'http://g'], + ['?y', 'http://a/b/c/d;p?y'], + ['g?y', 'http://a/b/c/g?y'], + ['#s', 'http://a/b/c/d;p?q#s'], + ['g#s', 'http://a/b/c/g#s'], + ['g?y#s', 'http://a/b/c/g?y#s'], + [';x', 'http://a/b/c/;x'], + ['g;x', 'http://a/b/c/g;x'], + ['g;x?y#s', 'http://a/b/c/g;x?y#s'], + ['', 'http://a/b/c/d;p?q'], + ['.', 'http://a/b/c/'], + ['./', 'http://a/b/c/'], + ['..', 'http://a/b/'], + ['../', 'http://a/b/'], + ['../g', 'http://a/b/g'], + ['../..', 'http://a/'], + ['../../', 'http://a/'], + 
['../../g', 'http://a/g'], + ]; + + // RFC-3986 §5.4.2 — abnormal examples (verbatim, strict mode) + const abnormal: [string, string][] = [ + ['../../../g', 'http://a/g'], + ['../../../../g', 'http://a/g'], + ['/./g', 'http://a/g'], + ['/../g', 'http://a/g'], + ['g.', 'http://a/b/c/g.'], + ['.g', 'http://a/b/c/.g'], + ['g..', 'http://a/b/c/g..'], + ['..g', 'http://a/b/c/..g'], + ['./../g', 'http://a/b/g'], + ['./g/.', 'http://a/b/c/g/'], + ['g/./h', 'http://a/b/c/g/h'], + ['g/../h', 'http://a/b/c/h'], + ['g;x=1/./y', 'http://a/b/c/g;x=1/y'], + ['g;x=1/../y', 'http://a/b/c/y'], + ['g?y/./x', 'http://a/b/c/g?y/./x'], + ['g?y/../x', 'http://a/b/c/g?y/../x'], + ['g#s/./x', 'http://a/b/c/g#s/./x'], + ['g#s/../x', 'http://a/b/c/g#s/../x'], + ['http:g', 'http:g'], + ]; + + it('should resolve every RFC-3986 §5.4.1 normal example', () => { + for (const [reference, expected] of normal) { + expect(resolveURI(base, reference)).toBe(expected); + } + }); + + it('should resolve every RFC-3986 §5.4.2 abnormal example (strict)', () => { + for (const [reference, expected] of abnormal) { + expect(resolveURI(base, reference)).toBe(expected); + } + }); + + it('should preserve a present-empty query/fragment per RFC-3986 §5.3', () => { + expect(resolveURI(base, 'g?')).toBe('http://a/b/c/g?'); + expect(resolveURI(base, 'g#')).toBe('http://a/b/c/g#'); + expect(resolveURI('http://a/b?x', '')).toBe('http://a/b?x'); + }); + + it('should merge against a base with an authority and empty path', () => { + expect(resolveURI('http://a', 'g')).toBe('http://a/g'); + expect(resolveURI('http://a', './g')).toBe('http://a/g'); + }); + + it('should merge against a base whose path has no slash', () => { + expect(resolveURI('a:b', 'c')).toBe('a:c'); + expect(resolveURI('a:b', './c')).toBe('a:c'); + }); + + it('should round-trip a reference carrying every component', () => { + expect(resolveURI('http://a/b/c', '//h/p?x=1#y')).toBe('http://h/p?x=1#y'); + expect(resolveURI('http://a/b/c', 
's:/p?x#y')).toBe('s:/p?x#y'); + expect(resolveURI('http://h/p?q#f', '')).toBe('http://h/p?q'); + }); + + it('should return the empty string when the base is not absolute', () => { + expect(resolveURI('/b/c', 'g')).toBe(''); + expect(resolveURI('//host/path', 'g')).toBe(''); + expect(resolveURI('', 'g')).toBe(''); + }); + + it('should return the empty string for a non-string argument', () => { + // @ts-expect-error runtime guard for non-string input + expect(resolveURI(null, 'g')).toBe(''); + // @ts-expect-error runtime guard for non-string input + expect(resolveURI(base, 42)).toBe(''); + }); + }); +}); diff --git a/tests/uri.property.test.ts b/tests/uri.property.test.ts new file mode 100644 index 0000000..580c758 --- /dev/null +++ b/tests/uri.property.test.ts @@ -0,0 +1,120 @@ +import fc from 'fast-check'; +import { describe, expect, it } from 'vitest'; +import { + decodeURIComponentString, + encodeURIComponentString, + parseURI, + recomposeURI, + removeDotSegments, + resolveURI, +} from '../src/index.js'; + +const runs = { numRuns: 1000 }; + +describe('#uri — property tests', () => { + it('parseURI is total — never throws for any string', () => { + fc.assert( + fc.property(fc.string(), (input) => { + expect(() => parseURI(input)).not.toThrow(); + }), + runs, + ); + }); + + it('parse → recompose is idempotent on the recomposed href', () => { + fc.assert( + fc.property(fc.webUrl({ withQueryParameters: true, withFragments: true }), (url) => { + const first = parseURI(url).href; + + if (first !== null) { + expect(parseURI(first).href).toBe(first); + } + }), + runs, + ); + }); + + it('removeDotSegments is idempotent and leaves no . or .. complete segment', () => { + const segment = fc.constantFrom('a', 'b', '.', '..', 'c', 'd'); + const path = fc + .tuple(fc.nat({ max: 8 }), fc.array(segment, { maxLength: 12 }), fc.boolean()) + .map(([climb, segs, absolute]) => { + const body = `${'../'.repeat(climb)}${segs.join('/')}`; + + return absolute ? 
`/${body}` : body; + }); + + fc.assert( + fc.property(path, (p) => { + const once = removeDotSegments(p); + + expect(removeDotSegments(once)).toBe(once); + + for (const seg of once.split('/')) { + expect(seg === '.' || seg === '..').toBe(false); + } + }), + runs, + ); + }); + + it('resolveURI with an empty reference strips only the fragment (RFC-3986 §5.3)', () => { + fc.assert( + fc.property( + fc.webUrl({ withQueryParameters: true, withFragments: false }), + fc.option(fc.string({ minLength: 1, maxLength: 8 }), { nil: undefined }), + (url, frag) => { + const base = frag === undefined ? url : `${url}#${frag}`; + + expect(resolveURI(base, '')).toBe(url); + }, + ), + runs, + ); + }); + + it('resolveURI never throws and returns a string for any reference', () => { + fc.assert( + fc.property(fc.webUrl(), fc.string(), (base, reference) => { + const resolved = resolveURI(base, reference); + + expect(typeof resolved).toBe('string'); + }), + runs, + ); + }); + + it('component encode then decode round-trips an arbitrary string', () => { + fc.assert( + fc.property(fc.string(), (raw) => { + const encoded = encodeURIComponentString(raw, { type: 'path' }); + + expect(decodeURIComponentString(encoded)).toBe(raw); + }), + runs, + ); + }); + + it('recomposeURI of parsed components equals the parsed href', () => { + fc.assert( + fc.property(fc.webUrl({ withQueryParameters: true, withFragments: true }), (url) => { + const parsed = parseURI(url); + + if (parsed.href !== null) { + expect( + recomposeURI({ + scheme: parsed.scheme, + userinfo: parsed.userinfo, + host: parsed.host, + port: parsed.port, + path: parsed.path, + query: parsed.query, + fragment: parsed.fragment, + }), + ).toBe(parsed.href); + } + }), + runs, + ); + }); +}); diff --git a/tsconfig.json b/tsconfig.json index 6057c0a..8559e16 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -7,6 +7,9 @@ "types": ["node"], "strict": true, "noUncheckedIndexedAccess": true, + "exactOptionalPropertyTypes": true, + 
"erasableSyntaxOnly": true, + "isolatedDeclarations": true, "noImplicitOverride": true, "noFallthroughCasesInSwitch": true, "esModuleInterop": true, diff --git a/tsdown.config.ts b/tsdown.config.ts index 6a5999c..96ef3c7 100644 --- a/tsdown.config.ts +++ b/tsdown.config.ts @@ -3,8 +3,10 @@ import { defineConfig } from 'tsdown'; export default defineConfig({ entry: ['src/index.ts'], format: ['esm', 'cjs'], + platform: 'node', + target: 'node22', dts: true, + treeshake: true, clean: true, - target: 'node22', sourcemap: true, }); diff --git a/vitest.config.ts b/vitest.config.ts index 2583da9..189d207 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -8,6 +8,9 @@ export default defineConfig({ provider: 'v8', reporter: ['text', 'html'], include: ['src/**/*.ts'], + thresholds: { + 100: true, + }, }, }, });