From c8763a2242b2f1cdb5449f48904b2046b9b5da69 Mon Sep 17 00:00:00 2001 From: Dion Whitehead <{ID} {username}@users.noreply.github.com> Date: Thu, 5 Feb 2026 10:26:06 -0800 Subject: [PATCH] readme and working with tests v2 of datarefs --- .claude/settings.local.json | 10 - .github/workflows/publish.yml | 2 + .gitignore | 5 +- README.md | 330 ++++++++++++++++----- justfile | 29 +- package-lock.json | 4 +- package.json | 3 +- src/index.ts | 1 + src/test/convertLargeObjects.test.ts | 426 +++++++++++++++++++++++++++ src/test/v2.test.ts | 24 ++ src/v2/dataref.ts | 114 +++++++ 11 files changed, 861 insertions(+), 87 deletions(-) delete mode 100644 .claude/settings.local.json create mode 100644 src/test/convertLargeObjects.test.ts diff --git a/.claude/settings.local.json b/.claude/settings.local.json deleted file mode 100644 index 24d34cc..0000000 --- a/.claude/settings.local.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "permissions": { - "allow": [ - "Bash(npm install:*)", - "Bash(npm test:*)", - "Bash(node -e:*)", - "Bash(just check:*)" - ] - } -} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index cfce02c..504c45f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -6,6 +6,8 @@ on: push: branches: - "main" + tags: + - "v*" permissions: id-token: write # Required for OIDC diff --git a/.gitignore b/.gitignore index 551fd47..5ae410e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ node_modules docs # Generated by vitest -__snapshots__ \ No newline at end of file +__snapshots__ + +# AI +.claude/settings.local.json \ No newline at end of file diff --git a/README.md b/README.md index daba187..f4353fe 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,84 @@ # @metapages/dataref -**Encode any JavaScript type including TypedArrays into data URL strings for embedding in URL parameters, JSON, and more.** +**Encode any JavaScript type including TypedArrays into data URL strings for embedding in URL parameters, 
shrinking JSON, and more.** -Moving around large blobs of data is hard and complicated. Datarefs solve this by encoding complex binary types into compact, unambiguous string references that can be easily passed around your network, database, and URLs. +## The Problem -## Overview +You need to pass binary data (TypedArrays, ArrayBuffers) through JSON, URLs, or databases, but it either doesn't serialize correctly or is too large to inline. -This library uses **data URL strings** (e.g., `data:text/plain,hello`) to encode any JavaScript type including TypedArrays. Data URLs are unambiguous, URL-safe, and standards-based (RFC 2397). +```typescript +// This doesn't work: +JSON.stringify({ readings: new Float32Array([1.1, 2.2, 3.3]) }); +// => '{"readings":{"0":1.100000023841858,"1":2.200000047683716,"2":3.299999952316284}}' 😞 (a plain object, not a Float32Array) +``` -**Note:** v1 (JSON object format) is maintained internally for backwards compatibility but is not exported. All public APIs use the modern v2 data URL format. +And when your data gets large, you can't just inline it everywhere. A 50MB sensor dataset embedded in JSON clogs up your database, message queues, and API responses. -## Why Data URLs? +## The Solution + +Dataref solves both problems: + +**1. Inline encoding** for small data—encodes complex types into data URL strings that serialize cleanly: + +```typescript +import { typedArrayToDataUrl, dereferenceDataRefs } from "@metapages/dataref"; + +// Encode complex data into a JSON-safe structure +const packet = { + metadata: { version: 2, format: "sensor" }, + readings: typedArrayToDataUrl(new Float32Array([1.1, 2.2, 3.3]), "Float32Array"), +}; + +// Safe to serialize, store, transmit as JSON +const json = JSON.stringify(packet); // Works! ✓ + +// Later, decode everything at once +const restored = await dereferenceDataRefs(JSON.parse(json)); +// restored.readings => Float32Array [1.1, 2.2, 3.3] ✓ +``` + +**2. 
Cloud upload** for large data—automatically uploads big objects and arrays to your storage, keeping JSON small: + +```typescript +import { convertLargeObjectsToDataRefs, dereferenceDataRefs } from "@metapages/dataref"; + +const data = { + metadata: { version: 2 }, + hugeDataset: new Array(100000).fill({ x: 1, y: 2 }), // ~1.4MB of JSON +}; + +// Upload large values to cloud, replace with URL references +const compact = await convertLargeObjectsToDataRefs(data, 10240, async (serialized, type) => { + const { url } = await fetch("/api/upload", { method: "POST", body: serialized }).then(r => r.json()); + return url; +}); +// compact.hugeDataset => "data:text/x-uri;type=array;charset=utf-8,https%3A%2F%2Fstorage.example.com%2Fabc123" + +// JSON is now tiny—safe for databases, message queues, API responses +await db.save(compact); -Data URL strings have several key advantages: +// Later, fetch and decode everything automatically +const restored = await dereferenceDataRefs(await db.load()); +// restored.hugeDataset => the full 100,000 element array +``` + +| Input | Encoded Data URL | Decoded Output | +|-------|------------------|----------------| +| `"Hello"` | `data:text/plain;charset=utf-8,Hello` | `"Hello"` | +| `{ key: "value" }` | `data:application/json;charset=utf-8,...` | `{ key: "value" }` | +| `new Uint8Array([1,2,3])` | `data:application/octet-stream;base64,AQID` | `ArrayBuffer` | +| `new Float32Array([1.1])` | `data:...;type=Float32Array;base64,...` | `Float32Array [1.1]` | +| Large object/array (uploaded) | `data:text/x-uri;type=...;charset=utf-8,https%3A%2F%2F...` | Original data (fetched) | + +## Why Data URLs? 1. **Unambiguous**: A string starting with `data:` is clearly a dataref, not confused with regular data -2. **URL-safe**: Can be embedded directly in URL parameters without special handling -3. **JSON-safe**: When serialized to JSON, remains a simple string that's clearly identifiable -4. **Standards-based**: Uses the existing data URL standard (RFC 2397) -5. 
**Type preservation**: Supports all JavaScript types including TypedArrays with full type information +2. **JSON-safe**: Serializes as a simple string, deserializes perfectly +3. **URL-safe**: Can be embedded directly in URL parameters +4. **Type preservation**: TypedArrays decode to the correct type, not just ArrayBuffer +5. **Standards-based**: Uses the existing data URL standard (RFC 2397) + +**Note:** v1 (JSON object format) is maintained internally for backwards compatibility but is not exported. All public APIs use the modern v2 data URL format. ## Installation @@ -28,6 +88,41 @@ npm install @metapages/dataref ## Quick Start +### The Core Workflow: Encode → Serialize → Decode + +The most common use case is encoding complex data for JSON serialization, then decoding it later: + +```typescript +import { + jsonToDataUrl, + typedArrayToDataUrl, + dereferenceDataRefs, +} from "@metapages/dataref"; + +// 1. Encode complex data into a JSON-safe structure +const packet = { + metadata: jsonToDataUrl({ version: 2, format: "sensor" }), + readings: typedArrayToDataUrl(new Float32Array([1.1, 2.2, 3.3]), "Float32Array"), + label: "regular string stays as-is" +}; + +// 2. Safe to serialize, store, transmit as JSON +const json = JSON.stringify(packet); // Works! + +// 3. Later, decode everything at once with dereferenceDataRefs +const loaded = JSON.parse(json); +const restored = await dereferenceDataRefs(loaded); +// restored.metadata → { version: 2, format: "sensor" } +// restored.readings → Float32Array [1.1, 2.2, 3.3] +// restored.label → "regular string stays as-is" +``` + +`dereferenceDataRefs()` recursively traverses your JSON and decodes all data URLs in a single call. Non-dataref values pass through unchanged. 
+ +### Individual Encode/Decode Functions + +For encoding and decoding individual values: + ```typescript import { textToDataUrl, @@ -38,38 +133,23 @@ import { dataUrlToJson, dataUrlToBuffer, dataUrlToTypedArray, - dereferenceDataRefs, } from "@metapages/dataref"; -// Encode text to data URL +// Text const textDataUrl = textToDataUrl("Hello, World!"); -// => "data:text/plain;charset=utf-8,Hello%2C%20World!" +const text = await dataUrlToText(textDataUrl); // "Hello, World!" -// Decode back to text -const text = await dataUrlToText(textDataUrl); -// => "Hello, World!" - -// Encode JSON to data URL +// JSON const jsonDataUrl = jsonToDataUrl({ name: "John", age: 30 }); -// => "data:application/json;charset=utf-8,%7B%22name%22%3A%22John%22%2C%22age%22%3A30%7D" - -// Decode back to JSON -const data = await dataUrlToJson(jsonDataUrl); -// => { name: "John", age: 30 } +const data = await dataUrlToJson(jsonDataUrl); // { name: "John", age: 30 } -// Encode binary data -const buffer = new Uint8Array([1, 2, 3, 4, 5]); -const bufferDataUrl = bufferToDataUrl(buffer); -// => "data:application/octet-stream;base64,AQIDBAU=" +// Binary (ArrayBuffer/Uint8Array) +const bufferDataUrl = bufferToDataUrl(new Uint8Array([1, 2, 3, 4, 5])); +const buffer = await dataUrlToBuffer(bufferDataUrl); // ArrayBuffer -// Encode TypedArrays with type preservation -const floatArray = new Float32Array([1.1, 2.2, 3.3]); -const arrayDataUrl = typedArrayToDataUrl(floatArray, "Float32Array"); -// => "data:application/octet-stream;type=Float32Array;base64,..." 
- -// Decode back to Float32Array -const decodedArray = await dataUrlToTypedArray(arrayDataUrl); -// => Float32Array [1.1, 2.2, 3.3] +// TypedArrays with type preservation +const arrayDataUrl = typedArrayToDataUrl(new Float32Array([1.1, 2.2]), "Float32Array"); +const array = await dataUrlToTypedArray(arrayDataUrl); // Float32Array [1.1, 2.2] ``` ## Core Concepts @@ -110,53 +190,97 @@ The library supports all JavaScript data types: ## Advanced Usage -### Dereferencing DataRefs in JSON +### Converting Large Objects to DataRefs -The `dereferenceDataRefs()` function traverses a JSON object and automatically converts all data URL strings into their actual values: +The `convertLargeObjectsToDataRefs()` function is the inverse of `dereferenceDataRefs()`. It traverses a JSON object and uploads large objects and arrays to a storage service, replacing them with URL-based datarefs: ```typescript -import { dereferenceDataRefs, textToDataUrl, jsonToDataUrl, typedArrayToDataUrl } from "@metapages/dataref"; - -// Create a complex object with embedded datarefs -const obj = { - title: textToDataUrl("My Document"), - metadata: jsonToDataUrl({ author: "Jane", version: 2 }), - data: { - values: typedArrayToDataUrl(new Float32Array([1.1, 2.2]), "Float32Array"), - count: 42, +import { convertLargeObjectsToDataRefs } from "@metapages/dataref"; + +// Mock upload function that returns a URL for stored data +async function uploadToStorage(data: string, originalType: string): Promise<string> { + // Upload data to your storage service (S3, Cloud Storage, etc.) 
+ const response = await fetch("https://api.example.com/upload", { + method: "POST", + body: data, + headers: { "Content-Type": "application/json" } + }); + const { url } = await response.json(); + return url; +} + +const largeData = { + metadata: { version: 1, created: new Date() }, + hugeDataset: { + description: "Large dataset", + values: Array.from({ length: 10000 }, (_, i) => ({ id: i, value: Math.random() })) }, - items: [ - "regular string", - textToDataUrl("encoded text"), - { nested: jsonToDataUrl({ deep: "value" }) } - ] + smallValue: "This stays inline" }; -// Dereference all datarefs at once -const resolved = await dereferenceDataRefs(obj); +// Convert objects larger than 10KB to URL-based datarefs +const optimized = await convertLargeObjectsToDataRefs( + largeData, + 10240, // 10KB threshold + uploadToStorage +); // Result: // { -// title: "My Document", -// metadata: { author: "Jane", version: 2 }, -// data: { -// values: Float32Array [1.1, 2.2], -// count: 42 -// }, -// items: [ -// "regular string", -// "encoded text", -// { nested: { deep: "value" } } -// ] +// metadata: { version: 1, created: ... }, +// hugeDataset: "data:text/x-uri;type=object;charset=utf-8,https%3A%2F%2F...", +// smallValue: "This stays inline" // } ``` +**How it works:** +1. Traverses the JSON object recursively +2. Calculates the serialized size of each value +3. If a value exceeds `maxSizeBytes`, calls your upload function +4. Replaces the large value with a URL-based dataref that preserves type information +5. 
Small values remain unchanged for efficiency + **Key features:** -- Recursively traverses objects and arrays -- Preserves non-dataref values unchanged -- Handles all data types (text, JSON, TypedArrays, ArrayBuffers) -- Processes multiple datarefs in parallel for performance -- Returns a new immutable object (uses `mutative` library) +- Uploads large objects in parallel for performance +- Preserves original type information in the dataref +- Uses SHA-256 hashing for content-addressable storage +- Returns a new immutable object +- Works with any storage backend (S3, Azure, Google Cloud, custom API) + +**Round-trip example:** + +```typescript +// Step 1: Convert large objects to refs +const withRefs = await convertLargeObjectsToDataRefs( + originalData, + 5000, // 5KB threshold + uploadToS3 +); + +// Store or transmit the optimized data (much smaller) +await database.save(withRefs); + +// Step 2: Later, retrieve and dereference +const retrieved = await database.load(); + +// Create a custom fetch function for dereferencing +const customFetch = async (url: string) => { + const response = await fetch(url); + return response.arrayBuffer(); +}; + +const restored = await dereferenceDataRefs(retrieved, { + // Custom fetch can add auth headers, handle errors, etc. 
+}); +// restored now contains the original large objects +``` + +**Use cases:** +- Storing large datasets in databases without hitting size limits +- Optimizing API payloads by offloading large objects to CDN +- Implementing client-side caching with overflow to IndexedDB/localStorage +- Building content-addressable storage systems +- Reducing memory usage when working with large JSON structures ### URL References @@ -275,11 +399,38 @@ const file = await dataUrlToFile(dataUrl, "document.txt"); ### Utility Functions +#### `convertLargeObjectsToDataRefs(json: T, maxSizeBytes: number, uploadFn: (data: string, originalType: string) => Promise): Promise` +Traverses a JSON object and converts large values to URL-based datarefs. + +**Parameters:** +- `json`: The JSON object to process +- `maxSizeBytes`: Size threshold in bytes (objects larger than this are uploaded) +- `uploadFn`: Async function that receives serialized data and type, returns a URL + +**Returns:** New JSON object with large values replaced by URL-based datarefs + +```typescript +const result = await convertLargeObjectsToDataRefs( + myData, + 10240, // 10KB threshold + async (data, type) => { + const response = await fetch("/upload", { method: "POST", body: data }); + return (await response.json()).url; + } +); +``` + #### `dereferenceDataRefs(json: T, fetchOptions?: RequestInit): Promise` -Traverses a JSON object and dereferences all v2 data URLs. +Traverses a JSON object and dereferences all data URLs to their actual values. + +**Parameters:** +- `json`: The JSON object containing datarefs +- `fetchOptions`: Optional fetch options for URL-based datarefs + +**Returns:** New JSON object with all datarefs resolved #### `isDataUrl(value: unknown): boolean` -Checks if a value is a v2 data URL string. +Checks if a value is a data URL string. #### `isUrlDataUrl(dataUrl: DataUrl): boolean` Checks if a data URL is a URL reference. @@ -519,6 +670,41 @@ See LICENSE file. Contributions welcome! 
Please ensure all tests pass and add tests for new features. +### Development + +This project uses [just](https://github.com/casey/just) as a command runner. Available commands: + +```bash +just build # Build the library +just test # Run tests +just dev # Watch mode for development +just check # TypeScript type checking +``` + +### Publishing + +Publishing to npm is automated via GitHub Actions: + +1. **Bump the version:** + ```bash + just version patch # For bug fixes (2.0.0 -> 2.0.1) + just version minor # For new features (2.0.0 -> 2.1.0) + just version major # For breaking changes (2.0.0 -> 3.0.0) + ``` + +2. **Push to trigger publish:** + ```bash + just push-version # Push commits and tags to GitHub + ``` + +The GitHub Actions workflow will automatically: +- Build the package +- Run tests +- Publish to npm (if version doesn't already exist) +- Use npm provenance for supply chain security + +**Note:** This project uses npm's Trusted Publishing with OIDC - no secrets required! + ## LLM/AI Model Usage Guide **For AI models processing this library:** diff --git a/justfile b/justfile index 7844ed7..223d408 100644 --- a/justfile +++ b/justfile @@ -78,6 +78,33 @@ dev: _ensure_node_modules watch @list: npm view {{NPM_MODULE}} versions --json +# Bump version (patch|minor|major), commit, and tag locally; run `just push-version` afterwards to push and trigger publish +version bump="patch": + #!/usr/bin/env bash + set -euo pipefail + # Ensure we're on a clean working tree + if ! git diff-index --quiet HEAD --; then + echo "❌ Working directory is not clean. Commit or stash changes first." 
+ exit 1 + fi + # Bump version in package.json (npm also rewrites package-lock.json) + npm version {{bump}} --no-git-tag-version + VERSION=$(jq -r '.version' package.json) + # Commit and tag; stage the lockfile too so the tree stays clean and the tag is consistent + git add package.json package-lock.json + git commit -m "Bump version to $VERSION" + git tag "v$VERSION" + echo "✅ Version bumped to $VERSION" + echo "👉 Push with: git push && git push --tags" + echo " Or: just push-version" + +# Push version tag to trigger publish workflow +push-version: + #!/usr/bin/env bash + set -euo pipefail + git push && git push --tags + echo "✅ Pushed version tag. Check GitHub Actions for publish status." + # If the version does not exist, publish the packages (metaframe+metapage) publish: _ensure_node_modules #!/usr/bin/env bash @@ -96,7 +123,7 @@ publish: _ensure_node_modules just build rm -rf dist/test echo "PUBLISHING npm version $VERSION" - npm publish --access public . + npm publish --access public --provenance . # git tag $VERSION # git push origin $VERSION diff --git a/package-lock.json b/package-lock.json index f33fdaf..2e13034 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@metapages/dataref", - "version": "0.6.2", + "version": "2.0.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@metapages/dataref", - "version": "0.6.2", + "version": "2.0.0", "dependencies": { "base64-arraybuffer": "^1.0.2", "mutative": "^1.3.0" diff --git a/package.json b/package.json index 47459dd..771db5e 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,8 @@ "scripts": { "note": "echo 'use just. 
See README.md'", "test": "vitest --run", - "dev": "vitest" + "dev": "vitest", + "build": "vite build" }, "devDependencies": { "@rollup/plugin-typescript": "^12.1.1", diff --git a/src/index.ts b/src/index.ts index fbbfdad..73bffe7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -20,6 +20,7 @@ export { // Utility functions dereferenceDataRefs, + convertLargeObjectsToDataRefs, isDataUrl, isUrlDataUrl, getMimeType, diff --git a/src/test/convertLargeObjects.test.ts b/src/test/convertLargeObjects.test.ts new file mode 100644 index 0000000..ff16650 --- /dev/null +++ b/src/test/convertLargeObjects.test.ts @@ -0,0 +1,426 @@ +import { describe, it, expect } from "vitest"; +import { + convertLargeObjectsToDataRefs, + dereferenceDataRefs, + isDataUrl, + dataUrlToUrl, +} from "../index"; + +// In-memory mock upload server: stores each payload under its SHA-256 hex digest and returns a URL ending in that digest +class MockUploadServer { + private storage = new Map<string, string>(); + + async upload(data: string, originalType: string): Promise<string> { + // Generate SHA-256 hash of the data + const encoder = new TextEncoder(); + const dataBuffer = encoder.encode(data); + const hashBuffer = await crypto.subtle.digest("SHA-256", dataBuffer); + const hashArray = Array.from(new Uint8Array(hashBuffer)); + const sha = hashArray.map((b) => b.toString(16).padStart(2, "0")).join(""); + + // Store the data + this.storage.set(sha, data); + + // Return a URL with the SHA as the ID + return `https://storage.example.com/uploads/${sha}`; + } + + async download(url: string): Promise<{ data: string; found: boolean }> { + const sha = url.split("/").pop(); + if (!sha) { + return { data: "", found: false }; + } + + const data = this.storage.get(sha); + if (data === undefined) { + return { data: "", found: false }; + } + + return { data, found: true }; + } + + getStorageSize(): number { + return this.storage.size; + } + + clear(): void { + this.storage.clear(); + } +} + +describe("convertLargeObjectsToDataRefs", () => { + describe("Basic functionality", () => { + it("should not convert small objects", 
async () => { + const server = new MockUploadServer(); + const input = { + small: { value: "tiny" }, + number: 42, + string: "hello", + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 1000, // 1KB threshold + (data, type) => server.upload(data, type) + ); + + expect(result).toEqual(input); + expect(server.getStorageSize()).toBe(0); + }); + + it("should convert large objects to datarefs", async () => { + const server = new MockUploadServer(); + const largeData = { + description: "Large object with lots of data", + items: Array.from({ length: 100 }, (_, i) => ({ + id: i, + name: `Item ${i}`, + value: Math.random(), + })), + }; + + const input = { + metadata: { version: 1 }, + largeData, + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 100, // 100 bytes threshold + (data, type) => server.upload(data, type) + ); + + // Metadata should remain unchanged + expect(result.metadata).toEqual({ version: 1 }); + + // Large data should be converted to a data URL + expect(typeof result.largeData).toBe("string"); + expect(isDataUrl(result.largeData as unknown as string)).toBe(true); + + // Should have uploaded to server + expect(server.getStorageSize()).toBe(1); + + // Extract and verify the URL + const dataUrl = result.largeData as unknown as string; + const url = dataUrlToUrl(dataUrl); + expect(url).toContain("https://storage.example.com/uploads/"); + }); + + it("should preserve original type information in dataref", async () => { + const server = new MockUploadServer(); + const largeArray = Array.from({ length: 50 }, (_, i) => i); + + const input = { + data: largeArray, + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 50, // Small threshold + (data, type) => server.upload(data, type) + ); + + const dataUrl = result.data as unknown as string; + expect(dataUrl).toContain("type=array"); + }); + + it("should handle nested large objects", async () => { + const server = new MockUploadServer(); + const largeNested = { 
+ level1: { + level2: { + data: Array.from({ length: 100 }, (_, i) => i), + }, + }, + }; + + const input = { + root: largeNested, + small: "value", + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 100, + (data, type) => server.upload(data, type) + ); + + expect(result.small).toBe("value"); + expect(typeof result.root).toBe("string"); + expect(isDataUrl(result.root as unknown as string)).toBe(true); + expect(server.getStorageSize()).toBe(1); + }); + + it("should handle multiple large objects", async () => { + const server = new MockUploadServer(); + const large1 = { data: Array.from({ length: 100 }, () => "x") }; + const large2 = { data: Array.from({ length: 100 }, () => "y") }; + const large3 = { data: Array.from({ length: 100 }, () => "z") }; + + const input = { + obj1: large1, + obj2: large2, + obj3: large3, + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 100, + (data, type) => server.upload(data, type) + ); + + expect(isDataUrl(result.obj1 as unknown as string)).toBe(true); + expect(isDataUrl(result.obj2 as unknown as string)).toBe(true); + expect(isDataUrl(result.obj3 as unknown as string)).toBe(true); + expect(server.getStorageSize()).toBe(3); + + // Each should have a different URL (different SHA) + const url1 = dataUrlToUrl(result.obj1 as unknown as string); + const url2 = dataUrlToUrl(result.obj2 as unknown as string); + const url3 = dataUrlToUrl(result.obj3 as unknown as string); + expect(url1).not.toBe(url2); + expect(url2).not.toBe(url3); + expect(url1).not.toBe(url3); + }); + + it("should not process existing datarefs", async () => { + const server = new MockUploadServer(); + const existingDataUrl = "data:text/plain,hello"; + + const input = { + existing: existingDataUrl, + large: { data: Array.from({ length: 100 }, () => "x") }, + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 100, + (data, type) => server.upload(data, type) + ); + + // Existing dataref should remain unchanged + 
expect(result.existing).toBe(existingDataUrl); + + // Large object should be converted + expect(isDataUrl(result.large as unknown as string)).toBe(true); + expect(server.getStorageSize()).toBe(1); + }); + }); + + describe("Integration with dereferenceDataRefs", () => { + it("should round-trip: convert then dereference", async () => { + const server = new MockUploadServer(); + const originalData = { + metadata: { version: 1 }, + largeObject: { + description: "Large dataset", + values: Array.from({ length: 100 }, (_, i) => i), + }, + small: "unchanged", + }; + + // Step 1: Convert large objects to refs + const converted = await convertLargeObjectsToDataRefs( + originalData, + 100, + (data, type) => server.upload(data, type) + ); + + // Verify conversion happened + expect(typeof converted.largeObject).toBe("string"); + expect(converted.small).toBe("unchanged"); + + // Step 2: Mock dereferencing by downloading from server + // In a real scenario, dereferenceDataRefs would fetch from the URL + const dataUrl = converted.largeObject as unknown as string; + const url = dataUrlToUrl(dataUrl); + expect(url).not.toBeNull(); + + const { data: downloadedData, found } = await server.download(url!); + expect(found).toBe(true); + + const reconstructed = JSON.parse(downloadedData); + expect(reconstructed).toEqual(originalData.largeObject); + }); + + it("should handle multiple round-trips", async () => { + const server = new MockUploadServer(); + const data = { + level1: { + level2: { + largeData: Array.from({ length: 50 }, (_, i) => ({ + id: i, + value: i * 2, + })), + }, + }, + }; + + // Convert + const converted = await convertLargeObjectsToDataRefs( + data, + 100, + (data, type) => server.upload(data, type) + ); + + // Verify + expect(typeof converted.level1).toBe("string"); + + // Download and reconstruct + const url = dataUrlToUrl(converted.level1 as unknown as string); + const { data: downloaded } = await server.download(url!); + const reconstructed = JSON.parse(downloaded); 
+ + expect(reconstructed).toEqual(data.level1); + }); + }); + + describe("Type preservation", () => { + it("should preserve object type", async () => { + const server = new MockUploadServer(); + const input = { + data: { large: Array.from({ length: 100 }, () => "x") }, + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 50, + (data, type) => server.upload(data, type) + ); + + const dataUrl = result.data as unknown as string; + expect(dataUrl).toContain("type=object"); + }); + + it("should preserve array type", async () => { + const server = new MockUploadServer(); + const input = { + data: Array.from({ length: 100 }, () => "x"), + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 50, + (data, type) => server.upload(data, type) + ); + + const dataUrl = result.data as unknown as string; + expect(dataUrl).toContain("type=array"); + }); + }); + + describe("Edge cases", () => { + it("should handle empty objects", async () => { + const server = new MockUploadServer(); + const input = {}; + + const result = await convertLargeObjectsToDataRefs( + input, + 100, + (data, type) => server.upload(data, type) + ); + + expect(result).toEqual({}); + expect(server.getStorageSize()).toBe(0); + }); + + it("should handle null and undefined values", async () => { + const server = new MockUploadServer(); + const input = { + nullValue: null, + undefinedValue: undefined, + large: { data: Array.from({ length: 100 }, () => "x") }, + }; + + const result = await convertLargeObjectsToDataRefs( + input, + 50, + (data, type) => server.upload(data, type) + ); + + expect(result.nullValue).toBeNull(); + expect(result.undefinedValue).toBeUndefined(); + expect(isDataUrl(result.large as unknown as string)).toBe(true); + }); + + it("should handle primitives at root level", async () => { + const server = new MockUploadServer(); + + const stringResult = await convertLargeObjectsToDataRefs( + "simple string", + 100, + (data, type) => server.upload(data, type) + ); + 
+ const numberResult = await convertLargeObjectsToDataRefs( + 42, + 100, + (data, type) => server.upload(data, type) + ); + + expect(stringResult).toBe("simple string"); + expect(numberResult).toBe(42); + expect(server.getStorageSize()).toBe(0); + }); + + it("should generate consistent SHAs for identical data", async () => { + const server = new MockUploadServer(); + const data = { values: Array.from({ length: 50 }, (_, i) => i) }; + + const input1 = { data: { ...data } }; + const input2 = { data: { ...data } }; + + const result1 = await convertLargeObjectsToDataRefs( + input1, + 50, + (data, type) => server.upload(data, type) + ); + + const result2 = await convertLargeObjectsToDataRefs( + input2, + 50, + (data, type) => server.upload(data, type) + ); + + // Same data should generate same SHA/URL + expect(result1.data).toBe(result2.data); + + // But server should only store it once (SHA-based deduplication) + expect(server.getStorageSize()).toBe(1); + }); + }); + + describe("Performance", () => { + it("should process multiple large objects in parallel", async () => { + const server = new MockUploadServer(); + const uploadTimes: number[] = []; + + const slowUpload = async ( + data: string, + type: string + ): Promise<string> => { + const start = Date.now(); + await new Promise((resolve) => setTimeout(resolve, 10)); // 10ms delay + const url = await server.upload(data, type); + uploadTimes.push(Date.now() - start); + return url; + }; + + const large1 = { data: Array.from({ length: 100 }, () => "x") }; + const large2 = { data: Array.from({ length: 100 }, () => "y") }; + const large3 = { data: Array.from({ length: 100 }, () => "z") }; + + const input = { obj1: large1, obj2: large2, obj3: large3 }; + + const start = Date.now(); + await convertLargeObjectsToDataRefs(input, 50, slowUpload); + const totalTime = Date.now() - start; + + // If parallel, total time should be ~10ms (one delay) + // If sequential, total time would be ~30ms (three delays) + // Allow some margin for test 
execution overhead + expect(uploadTimes.length).toBe(3); + expect(totalTime).toBeLessThan(Math.max(...uploadTimes) * 2); // bound relative to the slowest measured upload instead of a fixed 25ms budget, so slow timers on CI cannot cause false failures + }); + }); +}); diff --git a/src/test/v2.test.ts b/src/test/v2.test.ts index 66e7bfa..806cb25 100644 --- a/src/test/v2.test.ts +++ b/src/test/v2.test.ts @@ -226,6 +226,30 @@ describe("v2 DataRef - Basic Type Conversions", () => { await dataUrlToTypedArray(dataUrl); expect(decodedArray).toEqual(originalArray); }); + + it("should round-trip Float32Array through a base64 data URL with type preservation", async () => { + // Step 1: Create original Float32Array + const originalArray = new Float32Array([1.1, 2.2, 3.3, 4.4, 5.5]); + + // Step 2: Encode as an inline data URL (nothing is uploaded in this test) + const binaryDataUrl = typedArrayToDataUrl(originalArray, "Float32Array"); + + // Verify it has the type parameter + expect(binaryDataUrl).toContain("type=Float32Array"); + + // Step 3: Decode the data URL back to a Float32Array + const downloadedArray = await dataUrlToTypedArray(binaryDataUrl); + + // Step 4: Verify the decoded array matches the original + expect(downloadedArray.constructor.name).toBe("Float32Array"); + expect(downloadedArray.length).toBe(originalArray.length); + for (let i = 0; i < originalArray.length; i++) { + expect(downloadedArray[i]).toBeCloseTo(originalArray[i]); + } + + // Verify they're exactly equal + expect(downloadedArray).toEqual(originalArray); + }); }); describe("URL handling", () => { diff --git a/src/v2/dataref.ts b/src/v2/dataref.ts index 3bfea0e..bc49cef 100644 --- a/src/v2/dataref.ts +++ b/src/v2/dataref.ts @@ -218,6 +218,120 @@ export const fetchDataUrlContent = async ( // Import mutative for efficient JSON traversal and modification import { create } from "mutative"; +/** + * Converts large objects in a JSON structure to data URL references. + * Objects and arrays (never the root value, never primitives such as strings) + * whose serialized size exceeds the threshold are uploaded using the provided + * function and replaced with URL-based datarefs that preserve type information. 
+ * + * @param json - The JSON object to process + * @param maxSizeBytes - Size threshold in bytes (objects larger than this are converted to refs) + * @param uploadFn - Async function that takes serialized data and original type, returns a URL + * @returns A new JSON object with large values replaced by data URL references + */ +export const convertLargeObjectsToDataRefs = async <T>( + json: T, + maxSizeBytes: number, + uploadFn: (data: string, originalType: string) => Promise<string> +): Promise<T> => { + // Track all promises for async uploads + const promises: Array<{ + path: (string | number)[]; + promise: Promise<{ url: string; originalType: string }>; + }> = []; + + // Helper to get the UTF-8 byte size of a serialized value; the encoder is + // created once and shared across the whole traversal + const encoder = new TextEncoder(); + const getSize = (serialized: string): number => encoder.encode(serialized).length; + + // Helper to determine the type of value + const getValueType = (value: any): string => { + if (value === null) return "null"; + if (Array.isArray(value)) return "array"; + if (value instanceof Uint8Array) return "Uint8Array"; + if (value instanceof ArrayBuffer) return "ArrayBuffer"; + if (ArrayBuffer.isView(value)) { + return value.constructor.name; + } + return typeof value; + }; + + // Helper function to traverse and collect upload promises + const collectUploads = (obj: any, path: (string | number)[] = []) => { + if (obj === null || obj === undefined) { + return; + } + + // Check if this is a primitive type + if (typeof obj !== "object") { + return; + } + + // Strings (including existing data URLs) are primitives and returned + // above, so only objects and arrays reach this point. Serialize once and + // reuse the result for both the size check and the upload. + const serialized = JSON.stringify(obj); + + // Check size for both arrays and objects (the root is never uploaded) + const size = getSize(serialized); + const isTooLarge = size > maxSizeBytes && path.length > 0; + + if (isTooLarge) { + // This object/array is too large, upload it as a whole + const originalType = getValueType(obj); + const promise = uploadFn(serialized, originalType).then((url) => ({ + url, + originalType, + })); + 
promises.push({ path: [...path], promise }); + return; // Don't traverse into this object/array further + } + + // Object/array is not too large, traverse its children + if (Array.isArray(obj)) { + obj.forEach((item, index) => { + collectUploads(item, [...path, index]); + }); + } else { + Object.keys(obj).forEach((key) => { + collectUploads(obj[key], [...path, key]); + }); + } + }; + + // First pass: collect all upload promises + collectUploads(json); + + // If no large objects found, return original + if (promises.length === 0) { + return json; + } + + // Wait for all uploads to complete + const results = await Promise.all(promises.map((p) => p.promise)); + + // Second pass: use mutative to update the JSON with URL datarefs + return create(json, (draft: any) => { + promises.forEach(({ path }, index) => { + const { url, originalType } = results[index]; + + // Create a data URL that references the uploaded URL + // Include the original type as a parameter + const encodedUrl = encodeURIComponent(url); + const dataUrl = `data:${MIME_TYPES.URI};type=${originalType};charset=utf-8,${encodedUrl}`; + + // Navigate to the parent and set the value + let current = draft; + for (let i = 0; i < path.length - 1; i++) { + current = current[path[i]]; + } + const lastKey = path[path.length - 1]; + current[lastKey] = dataUrl; + }); + }); +}; + /** * Traverses a JSON object and converts any data ref strings (v2 data URLs) * into their dereferenced data. Returns a new JSON object with all datarefs resolved.