diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index e5a438ee1..6a7852567 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,6 +1,6 @@ # GitHub Copilot Instructions for vscode-documentdb -VS Code Extension for Azure Cosmos DB and MongoDB. TypeScript (strict mode), React webviews, Jest testing. +VS Code Extension for Azure Cosmos DB and the MongoDB API. TypeScript (strict mode), React webviews, Jest testing. ## Critical Build Commands @@ -178,6 +178,22 @@ For Discovery View, both `treeId` and `clusterId` are sanitized (all `/` replace See `src/tree/models/BaseClusterModel.ts` and `docs/analysis/08-cluster-model-simplification-plan.md` for details. +## Terminology + +This is a **DocumentDB** extension that uses the **MongoDB-compatible wire protocol**. + +- Use **"DocumentDB"** when referring to the database service itself. +- Use **"MongoDB API"** or **"DocumentDB API"** when referring to the wire protocol, query language, or API compatibility layer. +- **Never use "MongoDB" alone** as a product name in code, comments, docs, or user-facing strings. + +| ✅ Do | ❌ Don't | +| ---------------------------------------------------- | -------------------------------- | +| `// Query operators supported by the DocumentDB API` | `// MongoDB query operators` | +| `// BSON types per the MongoDB API spec` | `// Uses MongoDB's $match stage` | +| `documentdbQuery` (variable name) | `mongoQuery` | + +This applies to: code comments, JSDoc/TSDoc, naming (prefer `documentdb` prefix), user-facing strings, docs, and test descriptions. 
+ ## Additional Patterns For detailed patterns, see: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ac02668fb..7ea04b6ef 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -55,6 +55,9 @@ jobs: - name: 📦 Install Dependencies (npm ci) run: npm ci --prefer-offline --no-audit --no-fund --progress=false --verbose + - name: 🔨 Build Workspace Packages + run: npm run build --workspaces --if-present + - name: 🌐 Check Localization Files run: npm run l10n:check diff --git a/.gitignore b/.gitignore index 7bc4f395e..f8c691f1e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. +/docs/analysis/ +/docs/plan/ + # User-specific files *.suo *.user @@ -157,6 +160,9 @@ PublishScripts/ **/packages/* # except build/, which is used as an MSBuild target. !**/packages/build/ +# Include our monorepo packages at the root +!/packages/ +!/packages/** # Uncomment if necessary however generally it will be regenerated when needed #!**/packages/repositories.config # NuGet v3's project.json files produces more ignoreable files @@ -268,6 +274,7 @@ dist stats.json *.tgz *.zip +*.tsbuildinfo # Scrapbooks *.mongo diff --git a/jest.config.js b/jest.config.js index 7ad26361a..6db24cd52 100644 --- a/jest.config.js +++ b/jest.config.js @@ -1,11 +1,17 @@ /** @type {import('ts-jest').JestConfigWithTsJest} **/ module.exports = { - testEnvironment: 'node', - testMatch: ['/src/**/*.test.ts'], - transform: { - '^.+.tsx?$': ['ts-jest', {}], - }, // Limit workers to avoid OOM kills on machines with many cores. // Each ts-jest worker loads the TypeScript compiler and consumes ~500MB+. 
maxWorkers: '50%', + projects: [ + { + displayName: 'extension', + testEnvironment: 'node', + testMatch: ['/src/**/*.test.ts'], + transform: { + '^.+\\.tsx?$': ['ts-jest', {}], + }, + }, + '/packages/schema-analyzer', + ], }; diff --git a/l10n/bundle.l10n.json b/l10n/bundle.l10n.json index bd7aca008..ef1cd63d9 100644 --- a/l10n/bundle.l10n.json +++ b/l10n/bundle.l10n.json @@ -721,7 +721,6 @@ "No matching resources found.": "No matching resources found.", "No node selected.": "No node selected.", "No parent folder selected.": "No parent folder selected.", - "No properties found in the schema at path \"{0}\"": "No properties found in the schema at path \"{0}\"", "No public connectivity": "No public connectivity", "No result returned from the MongoDB shell.": "No result returned from the MongoDB shell.", "No results found": "No results found", diff --git a/package-lock.json b/package-lock.json index 68ba135c7..06ce802e7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,6 +8,9 @@ "name": "vscode-documentdb", "version": "0.7.0", "license": "SEE LICENSE IN LICENSE.md", + "workspaces": [ + "packages/*" + ], "dependencies": { "@azure/arm-compute": "^22.4.0", "@azure/arm-cosmosdb": "~16.4.0", @@ -26,6 +29,7 @@ "@mongodb-js/explain-plan-helper": "1.4.24", "@trpc/client": "~11.10.0", "@trpc/server": "~11.10.0", + "@vscode-documentdb/schema-analyzer": "*", "@vscode/l10n": "~0.0.18", "antlr4ts": "^0.5.0-alpha.4", "bson": "~7.0.0", @@ -7307,6 +7311,10 @@ "win32" ] }, + "node_modules/@vscode-documentdb/schema-analyzer": { + "resolved": "packages/schema-analyzer", + "link": true + }, "node_modules/@vscode/extension-telemetry": { "version": "0.9.9", "resolved": "https://registry.npmjs.org/@vscode/extension-telemetry/-/extension-telemetry-0.9.9.tgz", @@ -22209,6 +22217,17 @@ "type": "github", "url": "https://github.com/sponsors/wooorm" } + }, + "packages/schema-analyzer": { + "name": "@vscode-documentdb/schema-analyzer", + "version": "1.0.0", + "license": "MIT", + 
"dependencies": { + "denque": "~2.1.0" + }, + "peerDependencies": { + "mongodb": ">=6.0.0" + } } } } diff --git a/package.json b/package.json index 9a87a3fe5..5071f6698 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,9 @@ "type": "git", "url": "https://github.com/microsoft/vscode-documentdb" }, + "workspaces": [ + "packages/*" + ], "main": "./main", "l10n": "./l10n", "activationEvents": [ @@ -55,8 +58,9 @@ "onUri" ], "scripts": { + "prebuild": "npm run build --workspaces --if-present", "build": "tsc", - "clean": "git clean -dfx", + "clean": "rimraf out dist coverage && npm run clean --workspaces --if-present", "compile": "tsc -watch", "package": "run-script-os", "package:win32": "npm run webpack-prod && cd dist && npm pkg delete \"scripts.vscode:prepublish\" && npx vsce package --no-dependencies --out ../%npm_package_name%-%npm_package_version%.vsix", @@ -67,9 +71,10 @@ "lint": "eslint --quiet .", "lint-fix": "eslint . --fix", "prettier": "prettier -c \"(src|test|l10n|grammar|docs)/**/*.@(js|ts|jsx|tsx|json)\" \"./*.@(js|ts|jsx|tsx|json)\"", - "prettier-fix": "prettier -w \"(src|test|l10n|grammar|docs)/**/*.@(js|ts|jsx|tsx|json)\" \"./*.@(js|ts|jsx|tsx|json)\"", + "prettier-fix": "prettier -w \"(src|test|l10n|grammar|docs|packages)/**/*.@(js|ts|jsx|tsx|json)\" \"./*.@(js|ts|jsx|tsx|json)\"", "pretest": "npm run build", "test": "vscode-test", + "prejesttest": "npm run build --workspaces --if-present", "jesttest": "jest", "update-grammar": "antlr4ts -visitor ./grammar/mongo.g4 -o src/documentdb/grammar", "webpack-dev": "rimraf ./dist && npm run webpack-dev-ext && npm run webpack-dev-wv", @@ -165,6 +170,7 @@ "@trpc/client": "~11.10.0", "@trpc/server": "~11.10.0", "@vscode/l10n": "~0.0.18", + "@vscode-documentdb/schema-analyzer": "*", "antlr4ts": "^0.5.0-alpha.4", "bson": "~7.0.0", "denque": "~2.1.0", diff --git a/packages/schema-analyzer/README.md b/packages/schema-analyzer/README.md new file mode 100644 index 000000000..2010f2e7a --- /dev/null +++ 
b/packages/schema-analyzer/README.md @@ -0,0 +1,43 @@ +# @vscode-documentdb/schema-analyzer + +Incremental JSON Schema analyzer for DocumentDB API and MongoDB API documents. Processes documents one at a time (or in batches) and produces an extended JSON Schema with statistical metadata — field occurrence counts, BSON type distributions, min/max values, and array length stats. + +> **Note:** This package is not yet published to npm. We plan to publish it once the API stabilizes. For now, it is consumed internally via npm workspaces within the [vscode-documentdb](https://github.com/microsoft/vscode-documentdb) repository. + +## Overview + +The `SchemaAnalyzer` incrementally builds a JSON Schema by inspecting DocumentDB API / MongoDB API documents. It is designed for scenarios where documents arrive over time (streaming, pagination) and the schema needs to evolve as new documents are observed. + +Key capabilities: + +- **Incremental analysis** — add documents one at a time or in batches; the schema updates in place. +- **BSON type awareness** — recognizes BSON types defined by the MongoDB API (`ObjectId`, `Decimal128`, `Binary`, `UUID`, etc.) and annotates them with `x-bsonType`. +- **Statistical extensions** — tracks field occurrence (`x-occurrence`), type frequency (`x-typeOccurrence`), min/max values, string lengths, array sizes, and document counts (`x-documentsInspected`). +- **Known fields extraction** — derives a flat list of known field paths with their types and occurrence probabilities, useful for autocomplete and UI rendering. +- **Version tracking & caching** — a monotonic version counter enables efficient cache invalidation for derived data like `getKnownFields()`. 
+ +## Usage + +```typescript +import { SchemaAnalyzer } from '@vscode-documentdb/schema-analyzer'; + +// Create an analyzer and feed it documents +const analyzer = new SchemaAnalyzer(); +analyzer.addDocument(doc1); +analyzer.addDocuments([doc2, doc3, doc4]); + +// Get the JSON Schema with statistical extensions +const schema = analyzer.getSchema(); + +// Get a flat list of known fields (cached, version-aware) +const fields = analyzer.getKnownFields(); +``` + +## Requirements + +- **Node.js** ≥ 18 +- **mongodb** driver ≥ 6.0.0 (peer dependency) + +## License + +[MIT](../../LICENSE.md) diff --git a/packages/schema-analyzer/jest.config.js b/packages/schema-analyzer/jest.config.js new file mode 100644 index 000000000..388d1e1d1 --- /dev/null +++ b/packages/schema-analyzer/jest.config.js @@ -0,0 +1,8 @@ +/** @type {import('ts-jest').JestConfigWithTsJest} **/ +module.exports = { + testEnvironment: 'node', + testMatch: ['/test/**/*.test.ts'], + transform: { + '^.+\\.tsx?$': ['ts-jest', {}], + }, +}; diff --git a/packages/schema-analyzer/package.json b/packages/schema-analyzer/package.json new file mode 100644 index 000000000..3751cdba2 --- /dev/null +++ b/packages/schema-analyzer/package.json @@ -0,0 +1,27 @@ +{ + "name": "@vscode-documentdb/schema-analyzer", + "version": "1.0.0", + "description": "Incremental JSON Schema analyzer for DocumentDB API / MongoDB API documents with statistical extensions", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "files": [ + "dist" + ], + "scripts": { + "build": "tsc -p .", + "clean": "rimraf dist tsconfig.tsbuildinfo", + "test": "jest --config jest.config.js" + }, + "repository": { + "type": "git", + "url": "https://github.com/microsoft/vscode-documentdb", + "directory": "packages/schema-analyzer" + }, + "license": "MIT", + "peerDependencies": { + "mongodb": ">=6.0.0" + }, + "dependencies": { + "denque": "~2.1.0" + } +} diff --git a/packages/schema-analyzer/src/BSONTypes.ts b/packages/schema-analyzer/src/BSONTypes.ts new 
file mode 100644 index 000000000..b8fb92f16 --- /dev/null +++ b/packages/schema-analyzer/src/BSONTypes.ts @@ -0,0 +1,199 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { + Binary, + BSONSymbol, + Code, + DBRef, + Decimal128, + Double, + Int32, + Long, + MaxKey, + MinKey, + ObjectId, + Timestamp, + UUID, +} from 'mongodb'; + +/** + * Represents the different data types that can be stored in a DocumentDB API / MongoDB API document. + * The string representation is case-sensitive and should match the MongoDB API documentation. + * https://www.mongodb.com/docs/manual/reference/bson-types/ + */ +export enum BSONTypes { + String = 'string', + Number = 'number', + Int32 = 'int32', + Double = 'double', + Decimal128 = 'decimal128', + Long = 'long', + Boolean = 'boolean', + Object = 'object', + Array = 'array', + Null = 'null', + Undefined = 'undefined', + Date = 'date', + RegExp = 'regexp', + Binary = 'binary', + ObjectId = 'objectid', + Symbol = 'symbol', + Timestamp = 'timestamp', + UUID = 'uuid', + UUID_LEGACY = 'uuid-legacy', // old UUID subtype, used in some legacy data + MinKey = 'minkey', + MaxKey = 'maxkey', + DBRef = 'dbref', + Code = 'code', + CodeWithScope = 'codewithscope', + Map = 'map', + // Add any deprecated types if necessary + _UNKNOWN_ = '_unknown_', // Catch-all for unknown types +} + +export namespace BSONTypes { + const displayStringMap: Record = { + [BSONTypes.String]: 'String', + [BSONTypes.Number]: 'Number', + [BSONTypes.Int32]: 'Int32', + [BSONTypes.Double]: 'Double', + [BSONTypes.Decimal128]: 'Decimal128', + [BSONTypes.Long]: 'Long', + [BSONTypes.Boolean]: 'Boolean', + [BSONTypes.Object]: 'Object', + [BSONTypes.Array]: 'Array', + 
[BSONTypes.Null]: 'Null', + [BSONTypes.Undefined]: 'Undefined', + [BSONTypes.Date]: 'Date', + [BSONTypes.RegExp]: 'RegExp', + [BSONTypes.Binary]: 'Binary', + [BSONTypes.ObjectId]: 'ObjectId', + [BSONTypes.Symbol]: 'Symbol', + [BSONTypes.Timestamp]: 'Timestamp', + [BSONTypes.MinKey]: 'MinKey', + [BSONTypes.MaxKey]: 'MaxKey', + [BSONTypes.DBRef]: 'DBRef', + [BSONTypes.Code]: 'Code', + [BSONTypes.CodeWithScope]: 'CodeWithScope', + [BSONTypes.Map]: 'Map', + [BSONTypes._UNKNOWN_]: 'Unknown', + [BSONTypes.UUID]: 'UUID', + [BSONTypes.UUID_LEGACY]: 'UUID (Legacy)', + }; + + export function toDisplayString(type: BSONTypes): string { + return displayStringMap[type] || 'Unknown'; + } + + export function toString(type: BSONTypes): string { + return type; + } + + /** + * Converts a MongoDB API data type to a case-sensitive JSON data type + * @param type The MongoDB API data type + * @returns A corresponding JSON data type (please note: it's case sensitive) + */ + export function toJSONType(type: BSONTypes): string { + switch (type) { + case BSONTypes.String: + case BSONTypes.Symbol: + case BSONTypes.Date: + case BSONTypes.Timestamp: + case BSONTypes.ObjectId: + case BSONTypes.RegExp: + case BSONTypes.Binary: + case BSONTypes.Code: + case BSONTypes.UUID: + case BSONTypes.UUID_LEGACY: + return 'string'; + + case BSONTypes.Boolean: + return 'boolean'; + + case BSONTypes.Int32: + case BSONTypes.Long: + case BSONTypes.Double: + case BSONTypes.Decimal128: + return 'number'; + + case BSONTypes.Object: + case BSONTypes.Map: + case BSONTypes.DBRef: + case BSONTypes.CodeWithScope: + return 'object'; + + case BSONTypes.Array: + return 'array'; + + case BSONTypes.Null: + case BSONTypes.Undefined: + case BSONTypes.MinKey: + case BSONTypes.MaxKey: + return 'null'; + + default: + return 'string'; // Default to string for unknown types + } + } + + /** + * Accepts a value from a MongoDB API `Document` object and returns the inferred type. 
+ * @param value The value of a field in a MongoDB API `Document` object + * @returns + */ + export function inferType(value: unknown): BSONTypes { + if (value === null) return BSONTypes.Null; + if (value === undefined) return BSONTypes.Undefined; + + switch (typeof value) { + case 'string': + return BSONTypes.String; + case 'number': + return BSONTypes.Double; // JavaScript numbers are doubles + case 'boolean': + return BSONTypes.Boolean; + case 'object': + if (Array.isArray(value)) { + return BSONTypes.Array; + } + + // Check for common BSON types first + if (value instanceof ObjectId) return BSONTypes.ObjectId; + if (value instanceof Int32) return BSONTypes.Int32; + if (value instanceof Double) return BSONTypes.Double; + if (value instanceof Date) return BSONTypes.Date; + if (value instanceof Timestamp) return BSONTypes.Timestamp; + + // Less common types + if (value instanceof Decimal128) return BSONTypes.Decimal128; + if (value instanceof Long) return BSONTypes.Long; + if (value instanceof MinKey) return BSONTypes.MinKey; + if (value instanceof MaxKey) return BSONTypes.MaxKey; + if (value instanceof BSONSymbol) return BSONTypes.Symbol; + if (value instanceof DBRef) return BSONTypes.DBRef; + if (value instanceof Map) return BSONTypes.Map; + if (value instanceof UUID && value.sub_type === Binary.SUBTYPE_UUID) return BSONTypes.UUID; + if (value instanceof UUID && value.sub_type === Binary.SUBTYPE_UUID_OLD) return BSONTypes.UUID_LEGACY; + if (value instanceof Buffer || value instanceof Binary) return BSONTypes.Binary; + if (value instanceof RegExp) return BSONTypes.RegExp; + if (value instanceof Code) { + if (value.scope) { + return BSONTypes.CodeWithScope; + } else { + return BSONTypes.Code; + } + } + + // Default to Object if none of the above match + return BSONTypes.Object; + default: + // This should never happen, but if it does, we'll catch it here + // TODO: add telemetry somewhere to know when it happens (not here, this could get hit too often) + return 
BSONTypes._UNKNOWN_; + } + } +} diff --git a/src/utils/json/JSONSchema.ts b/packages/schema-analyzer/src/JSONSchema.ts similarity index 80% rename from src/utils/json/JSONSchema.ts rename to packages/schema-analyzer/src/JSONSchema.ts index 467669ed5..3127932d6 100644 --- a/src/utils/json/JSONSchema.ts +++ b/packages/schema-analyzer/src/JSONSchema.ts @@ -24,16 +24,14 @@ export interface JSONSchema { $id?: string; $schema?: string; type?: string | string[]; - 'x-documentsInspected'?: number; - 'x-occurrence'?: number; - 'x-typeOccurrence'?: number; - 'x-bsonType'?: string; // Explicitly declare the key with a dash using quotes title?: string; + description?: string; definitions?: { [name: string]: JSONSchema; }; - description?: string; - properties?: JSONSchema; // changed from: JSONSchemaMap; + + // Structure + properties?: JSONSchemaMap; patternProperties?: JSONSchemaMap; additionalProperties?: JSONSchemaRef; minProperties?: number; @@ -44,7 +42,6 @@ export interface JSONSchema { [prop: string]: string[]; }; items?: JSONSchemaRef | JSONSchemaRef[]; - required?: string[]; $ref?: string; anyOf?: JSONSchemaRef[]; @@ -58,14 +55,35 @@ export interface JSONSchema { propertyNames?: JSONSchemaRef; examples?: undefined[]; $comment?: string; - $defs?: { [name: string]: JSONSchema; }; + + // Monaco extensions markdownEnumDescriptions?: string[]; markdownDescription?: string; doNotSuggest?: boolean; suggestSortText?: string; + + // SchemaAnalyzer extensions — document/field level + 'x-documentsInspected'?: number; + 'x-occurrence'?: number; + + // SchemaAnalyzer extensions — type entry level (on entries in anyOf) + 'x-bsonType'?: string; + 'x-typeOccurrence'?: number; + 'x-minValue'?: number; + 'x-maxValue'?: number; + 'x-minLength'?: number; + 'x-maxLength'?: number; + 'x-minDate'?: number; + 'x-maxDate'?: number; + 'x-trueCount'?: number; + 'x-falseCount'?: number; + 'x-minItems'?: number; + 'x-maxItems'?: number; + 'x-minProperties'?: number; + 'x-maxProperties'?: number; } 
export interface JSONSchemaMap { [name: string]: JSONSchemaRef; diff --git a/src/utils/json/mongo/SchemaAnalyzer.ts b/packages/schema-analyzer/src/SchemaAnalyzer.ts similarity index 56% rename from src/utils/json/mongo/SchemaAnalyzer.ts rename to packages/schema-analyzer/src/SchemaAnalyzer.ts index 278f51fc4..8f24d532a 100644 --- a/src/utils/json/mongo/SchemaAnalyzer.ts +++ b/packages/schema-analyzer/src/SchemaAnalyzer.ts @@ -3,66 +3,125 @@ * Licensed under the MIT License. See License.txt in the project root for license information. *--------------------------------------------------------------------------------------------*/ +import Denque from 'denque'; +import { type Document, type WithId } from 'mongodb'; +import assert from 'node:assert/strict'; +import { BSONTypes } from './BSONTypes'; +import { type JSONSchema, type JSONSchemaRef } from './JSONSchema'; +import { type FieldEntry, getKnownFields as getKnownFieldsFromSchema } from './getKnownFields'; + /** - * This is an example of a JSON Schema document that will be generated from MongoDB documents. - * It's optimized for the use-case of generating a schema for a table view, the monaco editor, and schema statistics. - * - * This is a 'work in progress' and will be updated as we progress with the project. - * - * Curent focus is: - * - discovery of the document structure - * - basic pre for future statistics work + * Incremental schema analyzer for documents from the MongoDB API / DocumentDB API. * - * Future tasks: - * - statistics aggregation - * - meaningful 'description' and 'markdownDescription' - * - add more properties to the schema, incl. properties like '$id', '$schema', and enable schema sharing/download + * Analyzes documents one at a time (or in batches) and builds a cumulative + * JSON Schema with statistical extensions (x-occurrence, x-bsonType, etc.). * + * The output schema follows JSON Schema draft-07 with custom x- extensions. 
+ */ +export class SchemaAnalyzer { + private _schema: JSONSchema = {}; + private _version: number = 0; + private _knownFieldsCache: FieldEntry[] | null = null; + private _knownFieldsCacheVersion: number = -1; -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "https://example.com/sample.schema.json", - "title": "Sample Document Schema", - "type": "object", - "properties": { - "a-propert-root-level": { - "description": "a description as text", - "anyOf": [ // anyOf is used to indicate that the value can be of any of the types listed - { - "type": "string" - }, - { - "type": "string" + /** + * A monotonically increasing version counter. Incremented on every mutation + * (addDocument, addDocuments, reset). Adapters can store this value alongside + * their cached derived data and recompute only when it changes. + */ + get version(): number { + return this._version; + } + + /** + * Adds a single document to the accumulated schema. + * This is the primary incremental API — call once per document. + */ + addDocument(document: WithId): void { + updateSchemaWithDocumentInternal(this._schema, document); + this._version++; + } + + /** + * Adds multiple documents to the accumulated schema. + * Convenience method equivalent to calling addDocument() for each. + * Increments version once for the entire batch — not per document. + */ + addDocuments(documents: ReadonlyArray>): void { + for (const doc of documents) { + updateSchemaWithDocumentInternal(this._schema, doc); } - ] - }, - "isOpen": { - "description": "Indicates if the item is open", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number" + this._version++; + } + + /** + * Returns the current accumulated JSON Schema. + * The returned object is a live reference (not a copy) — do not mutate externally. + */ + getSchema(): JSONSchema { + return this._schema; + } + + /** + * Returns the number of documents analyzed so far. 
+ */ + getDocumentCount(): number { + return (this._schema['x-documentsInspected'] as number) ?? 0; + } + + /** + * Resets the analyzer to its initial empty state. + */ + reset(): void { + this._schema = {}; + this._version++; + } + + /** + * Creates a deep copy of this analyzer, including all accumulated schema data. + * Useful for aggregation stage branching where each stage needs its own schema state. + * The clone starts with version 0, independent from the original. + */ + clone(): SchemaAnalyzer { + const copy = new SchemaAnalyzer(); + copy._schema = structuredClone(this._schema); + return copy; + } + + /** + * Returns the cached list of known fields (all nesting levels, sorted). + * Recomputed only when the schema version has changed since the last call. + */ + getKnownFields(): FieldEntry[] { + if (this._knownFieldsCacheVersion !== this._version || this._knownFieldsCache === null) { + this._knownFieldsCache = getKnownFieldsFromSchema(this._schema); + this._knownFieldsCacheVersion = this._version; } - ] + return this._knownFieldsCache; } - }, - "required": ["isOpen"] -} - * - * - */ + /** + * Creates a SchemaAnalyzer from a single document. + * Equivalent to creating an instance and calling addDocument() once. + */ + static fromDocument(document: WithId): SchemaAnalyzer { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(document); + return analyzer; + } -import * as l10n from '@vscode/l10n'; -import { assert } from 'console'; -import Denque from 'denque'; -import { type Document, type WithId } from 'mongodb'; -import { type JSONSchema } from '../JSONSchema'; -import { MongoBSONTypes } from './MongoBSONTypes'; + /** + * Creates a SchemaAnalyzer from multiple documents. + * Equivalent to creating an instance and calling addDocuments(). 
+ */ + static fromDocuments(documents: ReadonlyArray>): SchemaAnalyzer { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocuments(documents); + return analyzer; + } +} -export function updateSchemaWithDocument(schema: JSONSchema, document: WithId): void { +function updateSchemaWithDocumentInternal(schema: JSONSchema, document: WithId): void { // Initialize schema if it's empty if (!schema.properties) { schema.properties = {}; @@ -74,7 +133,7 @@ export function updateSchemaWithDocument(schema: JSONSchema, document: WithId; const objKeysCount = Object.keys(objValue).length; // Update min and max property counts updateMinMaxStats(item.propertySchema, 'x-minProperties', 'x-maxProperties', objKeysCount); + // Track how many object instances contributed to this sub-schema. + // This enables uniform probability computation at every nesting level: + // probability = property.x-occurrence / parentObject.x-documentsInspected + // + // Without this, array-embedded objects have no denominator for probability. + // Example: doc1.a=[], doc2.a=[{b:1},...,{b:100}] + // b.x-occurrence = 100, root.x-documentsInspected = 2 + // Naive: 100/2 = 5000% — wrong! + // With fix: objectEntry.x-documentsInspected = 100, so 100/100 = 100% + item.propertySchema['x-documentsInspected'] = (item.propertySchema['x-documentsInspected'] ?? 
0) + 1; + // Ensure 'properties' exists if (!item.propertySchema.properties) { item.propertySchema.properties = {}; @@ -158,7 +228,7 @@ export function updateSchemaWithDocument(schema: JSONSchema, document: WithId = new Map(); - // Iterate over the array elements for (const element of arrayValue) { - const elementMongoType = MongoBSONTypes.inferType(element); + const elementMongoType = BSONTypes.inferType(element); // Find or create the type entry in 'items.anyOf' let itemEntry = findTypeEntry(itemsSchema.anyOf as JSONSchema[], elementMongoType); + const isNewTypeEntry = !itemEntry; if (!itemEntry) { // Create a new type entry itemEntry = { - type: MongoBSONTypes.toJSONType(elementMongoType), + type: BSONTypes.toJSONType(elementMongoType), 'x-bsonType': elementMongoType, 'x-typeOccurrence': 0, }; @@ -249,18 +317,19 @@ export function updateSchemaWithDocument(schema: JSONSchema, document: WithId entry['x-bsonType'] === bsonType); } @@ -299,221 +368,69 @@ function findTypeEntry(anyOfArray: JSONSchema[], bsonType: MongoBSONTypes): JSON * Helper function to update min and max stats */ function updateMinMaxStats(schema: JSONSchema, minKey: string, maxKey: string, value: number): void { - if (schema[minKey] === undefined || value < schema[minKey]) { - schema[minKey] = value; + const record = schema as Record; + if (record[minKey] === undefined || value < (record[minKey] as number)) { + record[minKey] = value; } - if (schema[maxKey] === undefined || value > schema[maxKey]) { - schema[maxKey] = value; + if (record[maxKey] === undefined || value > (record[maxKey] as number)) { + record[maxKey] = value; } } -export function getSchemaFromDocument(document: WithId): JSONSchema { - const schema: JSONSchema = {}; - schema['x-documentsInspected'] = 1; // we're inspecting one document, this will make sense when we start aggregating stats - schema.properties = {}; - - type WorkItem = { - fieldName: string; - fieldMongoType: MongoBSONTypes; // the inferred BSON type - 
propertyTypeEntry: JSONSchema; // points to the entry within the 'anyOf' property of the schema - fieldValue: unknown; - pathSoFar: string; // used for debugging - }; - - // having some import/require issues with Denque atm - // prototype with an array - //const fifoQueue = new Denque(); - const fifoQueue: WorkItem[] = []; - - /** - * Push all elements from the root of the document into the queue - */ - for (const [name, value] of Object.entries(document)) { - const mongoDatatype = MongoBSONTypes.inferType(value); - - const typeEntry = { - type: MongoBSONTypes.toJSONType(mongoDatatype), - 'x-bsonType': mongoDatatype, - 'x-typeOccurrence': 1, - }; - - // please note (1/2): we're adding the type entry to the schema here - schema.properties[name] = { anyOf: [typeEntry], 'x-occurrence': 1 }; - - fifoQueue.push({ - fieldName: name, - fieldMongoType: mongoDatatype, - propertyTypeEntry: typeEntry, // please note (2/2): and we're keeping a reference to it here for further updates - fieldValue: value, - pathSoFar: name, - }); - } - - /** - * Work through the queue, adding elements to the schema as we go. 
- * This is a breadth-first search of the document, do note special - * handling on objects/arrays - */ - while (fifoQueue.length > 0) { - const item = fifoQueue.shift(); // todo, replace with a proper queue - if (item === undefined) { - // unexpected, but let's try to continue - continue; - } - - switch (item.fieldMongoType) { - case MongoBSONTypes.Object: { - const objKeys = Object.keys(item.fieldValue as object).length; - item.propertyTypeEntry['x-maxLength'] = objKeys; - item.propertyTypeEntry['x-minLength'] = objKeys; - - // prepare an entry for the object properties - item.propertyTypeEntry.properties = {}; - - for (const [name, value] of Object.entries(item.fieldValue as object)) { - const mongoDatatype = MongoBSONTypes.inferType(value); - - const typeEntry = { - type: MongoBSONTypes.toJSONType(mongoDatatype), - 'x-bsonType': mongoDatatype, - 'x-typeOccurrence': 1, - }; - - // please note (1/2): we're adding the entry to the main schema here - item.propertyTypeEntry.properties[name] = { anyOf: [typeEntry], 'x-occurrence': 1 }; - - fifoQueue.push({ - fieldName: name, - fieldMongoType: mongoDatatype, - propertyTypeEntry: typeEntry, // please note (2/2): and we're keeping a reference to it here for further updates to the schema - fieldValue: value, - pathSoFar: `${item.pathSoFar}.${item.fieldName}`, - }); - } - break; - } - case MongoBSONTypes.Array: { - const arrayLength = (item.fieldValue as unknown[]).length; - item.propertyTypeEntry['x-maxLength'] = arrayLength; - item.propertyTypeEntry['x-minLength'] = arrayLength; - - // preapare the array items entry (in two lines for ts not to compalin about the missing type later on) - item.propertyTypeEntry.items = {}; - item.propertyTypeEntry.items.anyOf = []; - - const encounteredMongoTypes: Map = new Map(); - - // iterate over the array and infer the type of each element - for (const element of item.fieldValue as unknown[]) { - const elementMongoType = MongoBSONTypes.inferType(element); - - let itemEntry: 
JSONSchema; - - if (!encounteredMongoTypes.has(elementMongoType)) { - itemEntry = { - type: MongoBSONTypes.toJSONType(elementMongoType), - 'x-bsonType': elementMongoType, - 'x-typeOccurrence': 1, // Initialize type occurrence counter - }; - item.propertyTypeEntry.items.anyOf.push(itemEntry); - encounteredMongoTypes.set(elementMongoType, itemEntry); - - initializeStatsForValue(element, elementMongoType, itemEntry); - } else { - // if we've already encountered this type, we'll just add the type to the existing entry - itemEntry = encounteredMongoTypes.get(elementMongoType) as JSONSchema; - - if (itemEntry === undefined) continue; // unexpected, but let's try to continue - - if (itemEntry['x-typeOccurrence'] !== undefined) { - itemEntry['x-typeOccurrence'] += 1; - } - - // Aggregate stats with the new value - aggregateStatsForValue(element, elementMongoType, itemEntry); - } - - // an imporant exception for arrays as we have to start adding them already now to the schema - // (if we want to avoid more iterations over the data) - if (elementMongoType === MongoBSONTypes.Object || elementMongoType === MongoBSONTypes.Array) { - fifoQueue.push({ - fieldName: '[]', // Array items don't have a field name - fieldMongoType: elementMongoType, - propertyTypeEntry: itemEntry, - fieldValue: element, - pathSoFar: `${item.pathSoFar}.${item.fieldName}.items`, - }); - } - } - - break; - } - - default: { - // For all other types, update stats for the value - initializeStatsForValue(item.fieldValue, item.fieldMongoType, item.propertyTypeEntry); - break; - } - } - } - - return schema; -} - /** * Helper function to compute stats for a value based on its MongoDB data type * Updates the provided propertyTypeEntry with the computed stats */ -function initializeStatsForValue(value: unknown, mongoType: MongoBSONTypes, propertyTypeEntry: JSONSchema): void { +function initializeStatsForValue(value: unknown, mongoType: BSONTypes, propertyTypeEntry: JSONSchema): void { switch (mongoType) { - case 
MongoBSONTypes.String: { + case BSONTypes.String: { const currentLength = (value as string).length; propertyTypeEntry['x-maxLength'] = currentLength; propertyTypeEntry['x-minLength'] = currentLength; break; } - case MongoBSONTypes.Number: - case MongoBSONTypes.Int32: - case MongoBSONTypes.Long: - case MongoBSONTypes.Double: - case MongoBSONTypes.Decimal128: { + case BSONTypes.Number: + case BSONTypes.Int32: + case BSONTypes.Long: + case BSONTypes.Double: + case BSONTypes.Decimal128: { const numericValue = Number(value); propertyTypeEntry['x-maxValue'] = numericValue; propertyTypeEntry['x-minValue'] = numericValue; break; } - case MongoBSONTypes.Boolean: { + case BSONTypes.Boolean: { const boolValue = value as boolean; propertyTypeEntry['x-trueCount'] = boolValue ? 1 : 0; propertyTypeEntry['x-falseCount'] = boolValue ? 0 : 1; break; } - case MongoBSONTypes.Date: { + case BSONTypes.Date: { const dateValue = (value as Date).getTime(); propertyTypeEntry['x-maxDate'] = dateValue; propertyTypeEntry['x-minDate'] = dateValue; break; } - case MongoBSONTypes.Binary: { + case BSONTypes.Binary: { const binaryLength = (value as Buffer).length; propertyTypeEntry['x-maxLength'] = binaryLength; propertyTypeEntry['x-minLength'] = binaryLength; break; } - case MongoBSONTypes.Null: - case MongoBSONTypes.RegExp: - case MongoBSONTypes.ObjectId: - case MongoBSONTypes.MinKey: - case MongoBSONTypes.MaxKey: - case MongoBSONTypes.Symbol: - case MongoBSONTypes.Timestamp: - case MongoBSONTypes.DBRef: - case MongoBSONTypes.Map: + case BSONTypes.Null: + case BSONTypes.RegExp: + case BSONTypes.ObjectId: + case BSONTypes.MinKey: + case BSONTypes.MaxKey: + case BSONTypes.Symbol: + case BSONTypes.Timestamp: + case BSONTypes.DBRef: + case BSONTypes.Map: // No stats computation for other types break; @@ -527,9 +444,9 @@ function initializeStatsForValue(value: unknown, mongoType: MongoBSONTypes, prop * Helper function to aggregate stats for a value based on its MongoDB data type * Used when processing 
multiple values (e.g., elements in arrays) */ -function aggregateStatsForValue(value: unknown, mongoType: MongoBSONTypes, propertyTypeEntry: JSONSchema): void { +function aggregateStatsForValue(value: unknown, mongoType: BSONTypes, propertyTypeEntry: JSONSchema): void { switch (mongoType) { - case MongoBSONTypes.String: { + case BSONTypes.String: { const currentLength = (value as string).length; // Update minLength @@ -544,11 +461,11 @@ function aggregateStatsForValue(value: unknown, mongoType: MongoBSONTypes, prope break; } - case MongoBSONTypes.Number: - case MongoBSONTypes.Int32: - case MongoBSONTypes.Long: - case MongoBSONTypes.Double: - case MongoBSONTypes.Decimal128: { + case BSONTypes.Number: + case BSONTypes.Int32: + case BSONTypes.Long: + case BSONTypes.Double: + case BSONTypes.Decimal128: { const numericValue = Number(value); // Update minValue @@ -563,7 +480,7 @@ function aggregateStatsForValue(value: unknown, mongoType: MongoBSONTypes, prope break; } - case MongoBSONTypes.Boolean: { + case BSONTypes.Boolean: { const boolValue = value as boolean; // Update trueCount and falseCount @@ -581,7 +498,7 @@ function aggregateStatsForValue(value: unknown, mongoType: MongoBSONTypes, prope break; } - case MongoBSONTypes.Date: { + case BSONTypes.Date: { const dateValue = (value as Date).getTime(); // Update minDate @@ -596,7 +513,7 @@ function aggregateStatsForValue(value: unknown, mongoType: MongoBSONTypes, prope break; } - case MongoBSONTypes.Binary: { + case BSONTypes.Binary: { const binaryLength = (value as Buffer).length; // Update minLength @@ -617,17 +534,12 @@ function aggregateStatsForValue(value: unknown, mongoType: MongoBSONTypes, prope } } -function getSchemaAtPath(schema: JSONSchema, path: string[]): JSONSchema { - let currentNode = schema; +function getSchemaAtPath(schema: JSONSchema, path: string[]): JSONSchema | undefined { + let currentNode: JSONSchema | undefined = schema; for (let i = 0; i < path.length; i++) { const key = path[i]; - // If the 
current node is an array, we should move to its `items` - // if (currentNode.type === 'array' && currentNode.items) { - // currentNode = currentNode.items; - // } - // Move to the next property in the schema if (currentNode && currentNode.properties && currentNode.properties[key]) { const nextNode: JSONSchema = currentNode.properties[key] as JSONSchema; @@ -636,13 +548,15 @@ function getSchemaAtPath(schema: JSONSchema, path: string[]): JSONSchema { * We're looking at the "Object"-one, because these have the properties we're interested in. */ if (nextNode.anyOf && nextNode.anyOf.length > 0) { - currentNode = nextNode.anyOf.find((entry: JSONSchema) => entry.type === 'object') as JSONSchema; + currentNode = nextNode.anyOf.find( + (entry: JSONSchemaRef): entry is JSONSchema => typeof entry === 'object' && entry.type === 'object', + ); } else { // we can't continue, as we're missing the next node, we abort at the last node we managed to extract return currentNode; } } else { - throw new Error(l10n.t('No properties found in the schema at path "{0}"', path.slice(0, i + 1).join('/'))); + throw new Error(`No properties found in the schema at path "${path.slice(0, i + 1).join('/')}"`); } } @@ -653,7 +567,7 @@ export function getPropertyNamesAtLevel(jsonSchema: JSONSchema, path: string[]): const headers = new Set(); // Explore the schema and apply the callback to collect headers at the specified path - const selectedSchema: JSONSchema = getSchemaAtPath(jsonSchema, path); + const selectedSchema = getSchemaAtPath(jsonSchema, path); if (selectedSchema && selectedSchema.properties) { Object.keys(selectedSchema.properties).forEach((key) => { diff --git a/src/utils/json/mongo/MongoValueFormatters.ts b/packages/schema-analyzer/src/ValueFormatters.ts similarity index 56% rename from src/utils/json/mongo/MongoValueFormatters.ts rename to packages/schema-analyzer/src/ValueFormatters.ts index 243ce2631..7f9e8e5fa 100644 --- a/src/utils/json/mongo/MongoValueFormatters.ts +++ 
b/packages/schema-analyzer/src/ValueFormatters.ts @@ -4,16 +4,16 @@ *--------------------------------------------------------------------------------------------*/ import { type Binary, type BSONRegExp, type ObjectId } from 'mongodb'; -import { MongoBSONTypes } from './MongoBSONTypes'; +import { BSONTypes } from './BSONTypes'; /** - * Converts a MongoDB value to its display string representation based on its type. + * Converts a MongoDB API value to its display string representation based on its type. * * @param value - The value to be converted to a display string. - * @param type - The MongoDB data type of the value. + * @param type - The MongoDB API data type of the value. * @returns The string representation of the value. * - * The function handles various MongoDB data types including: + * The function handles various MongoDB API data types including: * - String * - Number, Int32, Double, Decimal128, Long * - Boolean @@ -24,60 +24,60 @@ import { MongoBSONTypes } from './MongoBSONTypes'; * * For unsupported or unknown types, the function defaults to JSON stringification. 
*/ -export function valueToDisplayString(value: unknown, type: MongoBSONTypes): string { +export function valueToDisplayString(value: unknown, type: BSONTypes): string { switch (type) { - case MongoBSONTypes.String: { + case BSONTypes.String: { return value as string; } - case MongoBSONTypes.Number: - case MongoBSONTypes.Int32: - case MongoBSONTypes.Double: - case MongoBSONTypes.Decimal128: - case MongoBSONTypes.Long: { + case BSONTypes.Number: + case BSONTypes.Int32: + case BSONTypes.Double: + case BSONTypes.Decimal128: + case BSONTypes.Long: { return (value as number).toString(); } - case MongoBSONTypes.Boolean: { + case BSONTypes.Boolean: { return (value as boolean).toString(); } - case MongoBSONTypes.Date: { + case BSONTypes.Date: { return (value as Date).toISOString(); } - case MongoBSONTypes.ObjectId: { + case BSONTypes.ObjectId: { return (value as ObjectId).toHexString(); } - case MongoBSONTypes.Null: { + case BSONTypes.Null: { return 'null'; } - case MongoBSONTypes.RegExp: { + case BSONTypes.RegExp: { const v = value as BSONRegExp; return `${v.pattern} ${v.options}`; } - case MongoBSONTypes.Binary: { + case BSONTypes.Binary: { return `Binary[${(value as Binary).length()}]`; } - case MongoBSONTypes.Symbol: { + case BSONTypes.Symbol: { return (value as symbol).toString(); } - case MongoBSONTypes.Timestamp: { + case BSONTypes.Timestamp: { return (value as { toString: () => string }).toString(); } - case MongoBSONTypes.MinKey: { + case BSONTypes.MinKey: { return 'MinKey'; } - case MongoBSONTypes.MaxKey: { + case BSONTypes.MaxKey: { return 'MaxKey'; } - case MongoBSONTypes.Code: - case MongoBSONTypes.CodeWithScope: { + case BSONTypes.Code: + case BSONTypes.CodeWithScope: { return JSON.stringify(value); } - case MongoBSONTypes.Array: - case MongoBSONTypes.Object: - case MongoBSONTypes.Map: - case MongoBSONTypes.DBRef: - case MongoBSONTypes.Undefined: - case MongoBSONTypes._UNKNOWN_: + case BSONTypes.Array: + case BSONTypes.Object: + case BSONTypes.Map: + case 
BSONTypes.DBRef: + case BSONTypes.Undefined: + case BSONTypes._UNKNOWN_: default: { return JSON.stringify(value); } diff --git a/packages/schema-analyzer/src/getKnownFields.ts b/packages/schema-analyzer/src/getKnownFields.ts new file mode 100644 index 000000000..f5da314b6 --- /dev/null +++ b/packages/schema-analyzer/src/getKnownFields.ts @@ -0,0 +1,219 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import Denque from 'denque'; +import { type JSONSchema } from './JSONSchema'; + +export interface FieldEntry { + /** Dot-notated path (e.g., "user.profile.name") */ + path: string; + /** JSON type of the dominant type entry ("string", "number", "object", "array", etc.) */ + type: string; + /** Dominant BSON type from x-bsonType on the most common type entry ("date", "objectid", "int32", etc.) */ + bsonType: string; + /** All observed BSON types for this field (for polymorphic fields) */ + bsonTypes?: string[]; + /** + * True if this field was not present in every inspected document + * (x-occurrence < parent x-documentsInspected). + * + * This is a statistical observation, not a schema constraint — in the MongoDB API / DocumentDB API, + * all fields are implicitly optional. + */ + isSparse?: boolean; + /** If the field is an array, the dominant element BSON type */ + arrayItemBsonType?: string; +} + +/** + * This function traverses our JSON Schema object and collects all leaf property paths + * along with their most common data types. + * + * This information is needed for auto-completion support + * + * The approach is as follows: + * - Initialize a queue with the root properties of the schema to perform a breadth-first traversal. 
+ * - While the queue is not empty: + * - Dequeue the next item, which includes the current schema node and its path. + * - Determine the most common type for the current node by looking at the 'x-typeOccurrence' field. + * - If the most common type is an object with properties: + * - Enqueue its child properties with their updated paths into the queue for further traversal. + * - Else if the most common type is a leaf type (e.g., string, number, boolean): + * - Add the current path and type to the result array as it represents a leaf property. + * - Continue this process until all nodes have been processed. + * - Return the result array containing objects with 'path' and 'type' for each leaf property. + */ +export function getKnownFields(schema: JSONSchema): FieldEntry[] { + const result: FieldEntry[] = []; + + type QueueItem = { + path: string; + schemaNode: JSONSchema; + parentDocumentsInspected: number; + }; + + const rootDocumentsInspected = (schema['x-documentsInspected'] as number) ?? 0; + const queue: Denque = new Denque(); + + // Initialize the queue with root properties + // + // Note: JSON Schema allows boolean values as schema references (true = accept all, + // false = reject all), but our SchemaAnalyzer never produces boolean refs — it always + // emits full schema objects. The cast to JSONSchema below is therefore safe for our + // use case. If this function were ever reused with externally-sourced schemas, a + // `typeof propSchema === 'boolean'` guard should be added here and in the nested + // property loop below. 
+ if (schema.properties) { + for (const propName of Object.keys(schema.properties)) { + const propSchema = schema.properties[propName] as JSONSchema; + queue.push({ + path: propName, + schemaNode: propSchema, + parentDocumentsInspected: rootDocumentsInspected, + }); + } + } + + while (queue.length > 0) { + const item = queue.shift(); + if (!item) continue; + + const { path, schemaNode, parentDocumentsInspected } = item; + const mostCommonTypeEntry = getMostCommonTypeEntry(schemaNode); + + if (mostCommonTypeEntry) { + if (mostCommonTypeEntry.type === 'object' && mostCommonTypeEntry.properties) { + // Not a leaf node, enqueue its properties + const objectDocumentsInspected = (mostCommonTypeEntry['x-documentsInspected'] as number) ?? 0; + for (const childName of Object.keys(mostCommonTypeEntry.properties)) { + const childSchema = mostCommonTypeEntry.properties[childName] as JSONSchema; + // TODO: Dot-delimited path concatenation is ambiguous when a field name + // itself contains a literal dot. For example, a root-level field named + // "a.b" produces path "a.b", indistinguishable from a nested field + // { a: { b: ... } }. Fields with literal dots in their names were + // prohibited before MongoDB API 3.6 and remain rare in practice. + // + // Future improvement: change `path` from `string` to `string[]` + // (segment array) to preserve the distinction between nesting and + // literal dots, pushing escaping/formatting decisions to consumers + // (TS definitions, completion items, aggregation references, etc.). + queue.push({ + path: `${path}.${childName}`, + schemaNode: childSchema, + parentDocumentsInspected: objectDocumentsInspected, + }); + } + } else { + // Leaf node, build the FieldEntry + const bsonType = (mostCommonTypeEntry['x-bsonType'] as string) ?? 
(mostCommonTypeEntry.type as string); + + const entry: FieldEntry = { + path, + type: mostCommonTypeEntry.type as string, + bsonType, + }; + + // bsonTypes: collect all distinct x-bsonType values from anyOf entries + const allBsonTypes = collectBsonTypes(schemaNode); + if (allBsonTypes.length >= 2) { + entry.bsonTypes = allBsonTypes; + } + + // isSparse: field was not observed in every document + const occurrence = (schemaNode['x-occurrence'] as number) ?? 0; + if (parentDocumentsInspected > 0 && occurrence < parentDocumentsInspected) { + entry.isSparse = true; + } + + // arrayItemBsonType: for array fields, find the dominant element type + if (mostCommonTypeEntry.type === 'array') { + const itemBsonType = getDominantArrayItemBsonType(mostCommonTypeEntry); + if (itemBsonType) { + entry.arrayItemBsonType = itemBsonType; + } + } + + result.push(entry); + } + } + } + + // Sort: _id first, then alphabetical by path + result.sort((a, b) => { + if (a.path === '_id') return -1; + if (b.path === '_id') return 1; + return a.path.localeCompare(b.path); + }); + + return result; +} + +/** + * Helper function to get the most common type entry from a schema node. + * It looks for the 'anyOf' array and selects the type with the highest 'x-typeOccurrence'. + */ +function getMostCommonTypeEntry(schemaNode: JSONSchema): JSONSchema | null { + if (schemaNode.anyOf && schemaNode.anyOf.length > 0) { + let maxOccurrence = -1; + let mostCommonTypeEntry: JSONSchema | null = null; + + for (const typeEntry of schemaNode.anyOf as JSONSchema[]) { + const occurrence = typeEntry['x-typeOccurrence'] || 0; + if (occurrence > maxOccurrence) { + maxOccurrence = occurrence; + mostCommonTypeEntry = typeEntry; + } + } + return mostCommonTypeEntry; + } else if (schemaNode.type) { + // If 'anyOf' is not present, use the 'type' field directly + return schemaNode; + } + return null; +} + +/** + * Collects all distinct x-bsonType values from a schema node's anyOf entries. 
+ * Returns them sorted alphabetically for determinism. + */ +function collectBsonTypes(schemaNode: JSONSchema): string[] { + if (!schemaNode.anyOf || schemaNode.anyOf.length === 0) { + return []; + } + + const bsonTypes = new Set(); + for (const entry of schemaNode.anyOf as JSONSchema[]) { + const bsonType = entry['x-bsonType'] as string | undefined; + if (bsonType) { + bsonTypes.add(bsonType); + } + } + + return Array.from(bsonTypes).sort(); +} + +/** + * For an array type entry, finds the dominant element BSON type by looking at + * items.anyOf and selecting the entry with the highest x-typeOccurrence. + */ +function getDominantArrayItemBsonType(arrayTypeEntry: JSONSchema): string | undefined { + const itemsSchema = arrayTypeEntry.items as JSONSchema | undefined; + if (!itemsSchema?.anyOf || itemsSchema.anyOf.length === 0) { + return undefined; + } + + let maxOccurrence = -1; + let dominantBsonType: string | undefined; + + for (const entry of itemsSchema.anyOf as JSONSchema[]) { + const occurrence = (entry['x-typeOccurrence'] as number) ?? 0; + if (occurrence > maxOccurrence) { + maxOccurrence = occurrence; + dominantBsonType = entry['x-bsonType'] as string | undefined; + } + } + + return dominantBsonType; +} diff --git a/packages/schema-analyzer/src/index.ts b/packages/schema-analyzer/src/index.ts new file mode 100644 index 000000000..871fd61f8 --- /dev/null +++ b/packages/schema-analyzer/src/index.ts @@ -0,0 +1,10 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. 
+ *--------------------------------------------------------------------------------------------*/ + +export { BSONTypes } from './BSONTypes'; +export { getKnownFields, type FieldEntry } from './getKnownFields'; +export { type JSONSchema, type JSONSchemaMap, type JSONSchemaRef } from './JSONSchema'; +export { SchemaAnalyzer, buildFullPaths, getPropertyNamesAtLevel } from './SchemaAnalyzer'; +export { valueToDisplayString } from './ValueFormatters'; diff --git a/packages/schema-analyzer/test/SchemaAnalyzer.arrayStats.test.ts b/packages/schema-analyzer/test/SchemaAnalyzer.arrayStats.test.ts new file mode 100644 index 000000000..2669d5214 --- /dev/null +++ b/packages/schema-analyzer/test/SchemaAnalyzer.arrayStats.test.ts @@ -0,0 +1,464 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { ObjectId, type Document, type WithId } from 'mongodb'; +import { type JSONSchema } from '../src/JSONSchema'; +import { SchemaAnalyzer } from '../src/SchemaAnalyzer'; + +/** + * This test file investigates the array element occurrence/stats problem. + * + * The core issue: When an array contains mixed types (e.g., strings AND objects), + * `x-typeOccurrence` on the items' type entries counts individual elements across + * ALL documents, not occurrences-per-document. This makes "field presence probability" + * for nested object properties inside arrays hard to interpret. 
+ * + * Example scenario: + * doc1.data = ["a", "b", "c", {"value": 23}] → 3 strings, 1 object + * doc2.data = ["x", "y", {"value": 42, "flag": true}] → 2 strings, 1 object + * doc3.data = ["z"] → 1 string, 0 objects + * + * After processing 3 docs: + * - items.anyOf[string].x-typeOccurrence = 6 (total string elements across all docs) + * - items.anyOf[object].x-typeOccurrence = 2 (total object elements across all docs) + * - items.anyOf[object].properties.value.x-occurrence = 2 (from 2 object elements) + * - items.anyOf[object].properties.flag.x-occurrence = 1 (from 1 object element) + * + * The problem: what is items.anyOf[object].properties.value's "probability"? + * - 2/2? (present in every object element → makes sense) + * - 2/3? (present in 2 of 3 documents → misleading, doc3 has no objects at all) + * - 2/6? (present in 2 of 6 total elements → nonsensical, mixes types) + * + * There's no x-documentsInspected equivalent at the array level to anchor + * the occurrence count. + */ +describe('Array element occurrence analysis', () => { + it('counts element types across multiple documents', () => { + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId = { + _id: new ObjectId(), + data: ['a', 'b', 'c', { value: 23 }], + }; + const doc2: WithId = { + _id: new ObjectId(), + data: ['x', 'y', { value: 42, flag: true }], + }; + const doc3: WithId = { + _id: new ObjectId(), + data: ['z'], + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + analyzer.addDocument(doc3); + const schema = analyzer.getSchema(); + + // data field: array seen in 3 docs + const dataField = schema.properties?.['data'] as JSONSchema; + expect(dataField['x-occurrence']).toBe(3); + + // The array type entry + const arrayTypeEntry = dataField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + expect(arrayTypeEntry).toBeDefined(); + expect(arrayTypeEntry['x-typeOccurrence']).toBe(3); + + // Array items + const itemsSchema = arrayTypeEntry.items 
as JSONSchema; + const stringEntry = itemsSchema.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'string') as JSONSchema; + const objectEntry = itemsSchema.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'object') as JSONSchema; + + // String elements: "a","b","c","x","y","z" = 6 total + expect(stringEntry['x-typeOccurrence']).toBe(6); + + // Object elements: {value:23}, {value:42,flag:true} = 2 total + expect(objectEntry['x-typeOccurrence']).toBe(2); + + // Properties inside the object elements + const valueField = objectEntry.properties?.['value'] as JSONSchema; + const flagField = objectEntry.properties?.['flag'] as JSONSchema; + + // "value" appeared in both objects → x-occurrence = 2 + expect(valueField['x-occurrence']).toBe(2); + + // "flag" appeared in 1 object → x-occurrence = 1 + expect(flagField['x-occurrence']).toBe(1); + + // THE CORE QUESTION: What is the denominator for probability? + // + // We know objectEntry['x-typeOccurrence'] = 2 (2 objects total across all arrays). + // So valueField probability = 2/2 = 100% (correct: every object had "value") + // And flagField probability = 1/2 = 50% (correct: half of objects had "flag") + // + // BUT: there is NO x-documentsInspected on objectEntry to formally define + // the denominator. The consumer has to know to use x-typeOccurrence as the + // denominator for nested properties inside array elements. + // + // This actually WORKS — the semantics are: + // "of the N objects observed inside this array, M had this property" + // + // It just isn't obvious from the schema structure. 
+ }); + + it('tracks min/max array lengths across documents', () => { + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId = { + _id: new ObjectId(), + tags: ['a', 'b', 'c'], + }; + const doc2: WithId = { + _id: new ObjectId(), + tags: ['x'], + }; + const doc3: WithId = { + _id: new ObjectId(), + tags: ['p', 'q', 'r', 's', 't'], + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + analyzer.addDocument(doc3); + const schema = analyzer.getSchema(); + + const tagsField = schema.properties?.['tags'] as JSONSchema; + const arrayEntry = tagsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + + expect(arrayEntry['x-minItems']).toBe(1); + expect(arrayEntry['x-maxItems']).toBe(5); + }); + + it('accumulates nested object properties from objects inside arrays across documents', () => { + const analyzer = new SchemaAnalyzer(); + + // doc1 has two objects with different properties in the items array + const doc1: WithId = { + _id: new ObjectId(), + items: [ + { name: 'Laptop', price: 999 }, + { name: 'Mouse', price: 29, discount: true }, + ], + }; + + // doc2 has one object with yet another property + const doc2: WithId = { + _id: new ObjectId(), + items: [{ name: 'Desk', weight: 50 }], + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + const schema = analyzer.getSchema(); + + const itemsField = schema.properties?.['items'] as JSONSchema; + const arrayEntry = itemsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const objEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + const props = objEntry.properties as Record; + + // "name" appeared in all 3 object elements + expect(props['name']['x-occurrence']).toBe(3); + + // "price" appeared in 2 of 3 object elements + expect(props['price']['x-occurrence']).toBe(2); + + // "discount" appeared in 1 of 3 object elements + 
expect(props['discount']['x-occurrence']).toBe(1); + + // "weight" appeared in 1 of 3 object elements + expect(props['weight']['x-occurrence']).toBe(1); + + // Total object elements = 3 (2 from doc1 + 1 from doc2) + expect(objEntry['x-typeOccurrence']).toBe(3); + + // So probability interpretations: + // name: 3/3 = 100% + // price: 2/3 = 67% + // discount: 1/3 = 33% + // weight: 1/3 = 33% + // + // This is correct! x-typeOccurrence serves as the denominator. + }); + + it('handles arrays that ONLY contain primitives (no occurrence complexity)', () => { + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId = { + _id: new ObjectId(), + scores: [90, 85, 78], + }; + const doc2: WithId = { + _id: new ObjectId(), + scores: [100, 55], + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + const schema = analyzer.getSchema(); + + const scoresField = schema.properties?.['scores'] as JSONSchema; + const arrayEntry = scoresField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + + const numEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'double', + ) as JSONSchema; + + // 5 total numeric elements + expect(numEntry['x-typeOccurrence']).toBe(5); + + // Stats across all elements + expect(numEntry['x-minValue']).toBe(55); + expect(numEntry['x-maxValue']).toBe(100); + + // Array length stats + expect(arrayEntry['x-minItems']).toBe(2); + expect(arrayEntry['x-maxItems']).toBe(3); + }); + + it('verifies that encounteredMongoTypes map is per-document', () => { + // The encounteredMongoTypes map is created inside the Array case handler. + // It controls whether initializeStatsForValue or aggregateStatsForValue is called. + // If it's per-array-occurrence (per document), stats should initialize fresh for each doc. + // + // BUT WAIT: The map is local to the switch case, which processes ONE array per queue item. 
+ // Multiple documents contribute different queue items, and the map is re-created for each. + // However, the stats update goes to the SAME itemEntry across documents (because + // findTypeEntry finds the existing entry). So: + // + // doc1.scores = [10, 20] → first array processing, encounteredMongoTypes fresh + // - element 10: initializeStatsForValue (sets x-minValue=10, x-maxValue=10) + // - element 20: aggregateStatsForValue (updates x-maxValue=20) + // + // doc2.scores = [5, 30] → second array processing, encounteredMongoTypes fresh + // - element 5: initializeStatsForValue ← BUT x-minValue is already 10 from doc1! + // initializeStatsForValue OVERWRITES x-minValue to 5 (correct by accident here) + // Actually let's check... initializeStatsForValue sets x-maxValue = 5 + // and x-minValue = 5. So the 20 from doc1 would be lost! + // + // This is a REAL BUG: initializeStatsForValue is called for the first occurrence + // per array, but the typeEntry already has stats from previous arrays. 
+ + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId = { + _id: new ObjectId(), + scores: [10, 20, 30], + }; + const doc2: WithId = { + _id: new ObjectId(), + scores: [5, 15], + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + const schema = analyzer.getSchema(); + + const scoresField = schema.properties?.['scores'] as JSONSchema; + const arrayEntry = scoresField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + + const numEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'double', + ) as JSONSchema; + + // Expected correct values: + // All 5 elements: 10, 20, 30, 5, 15 + // Global min = 5, global max = 30 + + // If there's a bug, doc2 processing re-initializes: + // after doc1: min=10, max=30 + // doc2 first element (5): initializeStatsForValue → sets min=5, max=5 + // doc2 second element (15): aggregateStatsForValue → max becomes 15 + // final: min=5, max=15 ← WRONG (lost 30 from doc1) + + // This test documents the actual behavior (might be buggy): + expect(numEntry['x-minValue']).toBe(5); + // If the bug exists, this will be 15 instead of 30: + expect(numEntry['x-maxValue']).toBe(30); // should be 30 if correct + }); +}); + +describe('Array probability denominator problem', () => { + it('reproduces the >100% probability bug: empty array + large array', () => { + // User scenario: + // doc1: a = [] → 0 objects + // doc2: a = [{b:1}, {b:2}, ..., {b:100}] → 100 objects + // + // Naively computing probability as: + // occurrence_of_b / root.x-documentsInspected = 100 / 2 = 5000% + // + // The correct probability should be: + // occurrence_of_b / objectEntry.x-typeOccurrence = 100 / 100 = 100% + // + // FIX: Set x-documentsInspected on the object type entry so the uniform + // formula `x-occurrence / parent.x-documentsInspected` works at every + // nesting level. 
+ + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId = { + _id: new ObjectId(), + a: [], // empty array + }; + + // doc2: 100 objects, each with property "b" + const objectElements: Record[] = []; + for (let i = 1; i <= 100; i++) { + objectElements.push({ b: i }); + } + const doc2: WithId = { + _id: new ObjectId(), + a: objectElements, + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + const schema = analyzer.getSchema(); + + // Root level + expect(schema['x-documentsInspected']).toBe(2); + + // Navigate to the object type entry inside the array + const aField = schema.properties?.['a'] as JSONSchema; + expect(aField['x-occurrence']).toBe(2); // both docs have 'a' + + const arrayEntry = aField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const objectEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + // 100 object elements total + expect(objectEntry['x-typeOccurrence']).toBe(100); + + // Property "b" appears in all 100 objects + const bField = objectEntry.properties?.['b'] as JSONSchema; + expect(bField['x-occurrence']).toBe(100); + + // THE FIX: objectEntry should have x-documentsInspected = 100 + // so that the uniform formula works: + // probability = b.x-occurrence / objectEntry.x-documentsInspected + // = 100 / 100 = 100% + expect(objectEntry['x-documentsInspected']).toBe(100); + }); + + it('correctly computes probability for sparse properties in array objects', () => { + // doc1: items = [{name:"A", price:10}, {name:"B"}] → 2 objects, name in both, price in 1 + // doc2: items = [{name:"C", discount:true}] → 1 object + // + // Total objects = 3 + // name: 3/3 = 100% + // price: 1/3 = 33% + // discount: 1/3 = 33% + + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId = { + _id: new ObjectId(), + items: [{ name: 'A', price: 10 }, { name: 'B' }], + }; + const doc2: WithId = { + _id: new ObjectId(), + items: 
[{ name: 'C', discount: true }], + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + const schema = analyzer.getSchema(); + + const itemsField = schema.properties?.['items'] as JSONSchema; + const arrayEntry = itemsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const objectEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + // The object type entry should have x-documentsInspected = 3 + expect(objectEntry['x-documentsInspected']).toBe(3); + + const props = objectEntry.properties as Record<string, JSONSchema>; + + // Probability = x-occurrence / x-documentsInspected (uniform formula) + expect(props['name']['x-occurrence']).toBe(3); // 3/3 = 100% + expect(props['price']['x-occurrence']).toBe(1); // 1/3 = 33% + expect(props['discount']['x-occurrence']).toBe(1); // 1/3 = 33% + }); + + it('sets x-documentsInspected on nested objects at all levels', () => { + // items: [{address: {city: "NY", zip: "10001"}}, {address: {city: "LA"}}] + // + // At items.anyOf[object] level: x-documentsInspected = 2 + // At address.anyOf[object] level: x-documentsInspected = 2 + // city: 2/2 = 100%, zip: 1/2 = 50% + + const analyzer = new SchemaAnalyzer(); + + const doc: WithId<Document> = { + _id: new ObjectId(), + items: [{ address: { city: 'NY', zip: '10001' } }, { address: { city: 'LA' } }], + }; + + analyzer.addDocument(doc); + const schema = analyzer.getSchema(); + + const itemsField = schema.properties?.['items'] as JSONSchema; + const arrayEntry = itemsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const objectEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + // 2 objects in the array + expect(objectEntry['x-documentsInspected']).toBe(2); + + // address.anyOf[object] — the nested object type + const addressProp = objectEntry.properties?.['address'] as 
JSONSchema; + const addressObjEntry = addressProp.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + // Both objects had address, and both addresses were objects + expect(addressObjEntry['x-documentsInspected']).toBe(2); + + const addrProps = addressObjEntry.properties as Record<string, JSONSchema>; + expect(addrProps['city']['x-occurrence']).toBe(2); // 2/2 = 100% + expect(addrProps['zip']['x-occurrence']).toBe(1); // 1/2 = 50% + }); + + it('does NOT change x-documentsInspected at root level (root keeps document count)', () => { + const analyzer = new SchemaAnalyzer(); + + const doc1: WithId<Document> = { + _id: new ObjectId(), + name: 'Alice', + address: { city: 'NY' }, + }; + const doc2: WithId<Document> = { + _id: new ObjectId(), + name: 'Bob', + address: { city: 'LA', zip: '90001' }, + }; + + analyzer.addDocument(doc1); + analyzer.addDocument(doc2); + const schema = analyzer.getSchema(); + + // Root x-documentsInspected is document count, not affected by the fix + expect(schema['x-documentsInspected']).toBe(2); + + // Root-level probability still works: name.occurrence(2) / documentsInspected(2) = 100% + const nameField = schema.properties?.['name'] as JSONSchema; + expect(nameField['x-occurrence']).toBe(2); + + // Nested object: address.anyOf[object] should have x-documentsInspected = 2 + const addressField = schema.properties?.['address'] as JSONSchema; + const addressObjEntry = addressField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + expect(addressObjEntry['x-documentsInspected']).toBe(2); + + const addrProps = addressObjEntry.properties as Record<string, JSONSchema>; + expect(addrProps['city']['x-occurrence']).toBe(2); // 2/2 = 100% + expect(addrProps['zip']['x-occurrence']).toBe(1); // 1/2 = 50% + }); +}); diff --git a/packages/schema-analyzer/test/SchemaAnalyzer.test.ts b/packages/schema-analyzer/test/SchemaAnalyzer.test.ts new file mode 100644 index 000000000..f23a97bdf --- /dev/null +++ 
b/packages/schema-analyzer/test/SchemaAnalyzer.test.ts @@ -0,0 +1,349 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { type JSONSchema, type JSONSchemaMap, type JSONSchemaRef } from '../src/JSONSchema'; +import { getPropertyNamesAtLevel, SchemaAnalyzer } from '../src/SchemaAnalyzer'; +import { + arraysWithDifferentDataTypes, + complexDocument, + complexDocumentsArray, + complexDocumentWithOddTypes, + embeddedDocumentOnly, + flatDocument, + sparseDocumentsArray, +} from './mongoTestDocuments'; + +describe('DocumentDB Schema Analyzer', () => { + it('prints out schema for testing', () => { + const analyzer = SchemaAnalyzer.fromDocument(embeddedDocumentOnly); + const schema = analyzer.getSchema(); + expect(schema).toBeDefined(); + }); + + it('supports many documents', () => { + const analyzer = SchemaAnalyzer.fromDocuments(sparseDocumentsArray); + const schema = analyzer.getSchema(); + expect(schema).toBeDefined(); + + // Check that 'x-documentsInspected' is correct + expect(schema['x-documentsInspected']).toBe(sparseDocumentsArray.length); + + // Check that the schema has the correct root properties + const expectedRootProperties = new Set(['_id', 'name', 'age', 'email', 'isActive', 'score', 'description']); + + expect(Object.keys(schema.properties || {})).toEqual( + expect.arrayContaining(Array.from(expectedRootProperties)), + ); + + // Check that the 'name' field is detected correctly + const nameField = schema.properties?.['name'] as JSONSchema; + expect(nameField).toBeDefined(); + expect(nameField?.['x-occurrence']).toBeGreaterThan(0); + + // Access 'anyOf' to get the type entries + const nameFieldTypes = nameField.anyOf?.map((typeEntry) => 
(typeEntry as JSONSchema)['type']); + expect(nameFieldTypes).toContain('string'); + + // Check that the 'age' field has the correct type + const ageField = schema.properties?.['age'] as JSONSchema; + expect(ageField).toBeDefined(); + const ageFieldTypes = ageField.anyOf?.map((typeEntry) => (typeEntry as JSONSchema)['type']); + expect(ageFieldTypes).toContain('number'); + + // Check that the 'isActive' field is a boolean + const isActiveField = schema.properties?.['isActive'] as JSONSchema; + expect(isActiveField).toBeDefined(); + const isActiveTypes = isActiveField.anyOf?.map((typeEntry) => (typeEntry as JSONSchema)['type']); + expect(isActiveTypes).toContain('boolean'); + + // Check that the 'description' field is optional (occurs in some documents) + const descriptionField = schema.properties?.['description'] as JSONSchema | undefined; + expect(descriptionField).toBeDefined(); + expect(descriptionField?.['x-occurrence']).toBeLessThan(sparseDocumentsArray.length); + }); + + it('detects all BSON types from flatDocument', () => { + const analyzer = SchemaAnalyzer.fromDocument(flatDocument); + const schema = analyzer.getSchema(); + + // Check that all fields are detected + const expectedFields = Object.keys(flatDocument); + expect(Object.keys(schema.properties || {})).toEqual(expect.arrayContaining(expectedFields)); + + // Helper function to get the 'x-bsonType' from a field + function getBsonType(fieldName: string): string | undefined { + const field = schema.properties?.[fieldName] as JSONSchema | undefined; + const anyOf = field?.anyOf; + return anyOf && (anyOf[0] as JSONSchema | undefined)?.['x-bsonType']; + } + + // Check that specific BSON types are correctly identified + expect(getBsonType('int32Field')).toBe('int32'); + expect(getBsonType('doubleField')).toBe('double'); + expect(getBsonType('decimalField')).toBe('decimal128'); + expect(getBsonType('dateField')).toBe('date'); + expect(getBsonType('objectIdField')).toBe('objectid'); + 
expect(getBsonType('codeField')).toBe('code'); + expect(getBsonType('uuidField')).toBe('uuid'); + expect(getBsonType('uuidLegacyField')).toBe('uuid-legacy'); + }); + + it('detects embedded objects correctly', () => { + const analyzer = SchemaAnalyzer.fromDocument(embeddedDocumentOnly); + const schema = analyzer.getSchema(); + + // Check that the root properties are detected + expect(schema.properties).toHaveProperty('personalInfo'); + expect(schema.properties).toHaveProperty('jobInfo'); + + // Access 'personalInfo' properties + const personalInfoAnyOf = + schema.properties && (schema.properties['personalInfo'] as JSONSchema | undefined)?.anyOf; + const personalInfoProperties = (personalInfoAnyOf?.[0] as JSONSchema | undefined)?.properties; + expect(personalInfoProperties).toBeDefined(); + expect(personalInfoProperties).toHaveProperty('name'); + expect(personalInfoProperties).toHaveProperty('age'); + expect(personalInfoProperties).toHaveProperty('married'); + expect(personalInfoProperties).toHaveProperty('address'); + + // Access 'address' properties within 'personalInfo' + const addressAnyOf = ((personalInfoProperties as JSONSchemaMap)['address'] as JSONSchema).anyOf; + const addressProperties = (addressAnyOf?.[0] as JSONSchema | undefined)?.properties; + expect(addressProperties).toBeDefined(); + expect(addressProperties).toHaveProperty('street'); + expect(addressProperties).toHaveProperty('city'); + expect(addressProperties).toHaveProperty('zip'); + }); + + it('detects arrays and their element types correctly', () => { + const analyzer = SchemaAnalyzer.fromDocument(arraysWithDifferentDataTypes); + const schema = analyzer.getSchema(); + + // Check that arrays are detected + expect(schema.properties).toHaveProperty('integersArray'); + expect(schema.properties).toHaveProperty('stringsArray'); + expect(schema.properties).toHaveProperty('booleansArray'); + expect(schema.properties).toHaveProperty('mixedArray'); + expect(schema.properties).toHaveProperty('datesArray'); 
+ + // Helper function to get item types from an array field + function getArrayItemTypes(fieldName: string): string[] | undefined { + const field = schema.properties?.[fieldName] as JSONSchema | undefined; + const anyOf = field?.anyOf; + const itemsAnyOf: JSONSchemaRef[] | undefined = ( + (anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined + )?.anyOf; + return itemsAnyOf?.map((typeEntry) => (typeEntry as JSONSchema)['type'] as string); + } + + // Check that 'integersArray' has elements of type 'number' + const integerItemTypes = getArrayItemTypes('integersArray'); + expect(integerItemTypes).toContain('number'); + + // Check that 'stringsArray' has elements of type 'string' + const stringItemTypes = getArrayItemTypes('stringsArray'); + expect(stringItemTypes).toContain('string'); + + // Check that 'mixedArray' contains multiple types + const mixedItemTypes = getArrayItemTypes('mixedArray'); + expect(mixedItemTypes).toEqual(expect.arrayContaining(['number', 'string', 'boolean', 'object', 'null'])); + }); + + it('handles arrays within objects and objects within arrays', () => { + const analyzer = SchemaAnalyzer.fromDocument(complexDocument); + const schema = analyzer.getSchema(); + + // Access 'user.profile.hobbies' + const user = schema.properties?.['user'] as JSONSchema | undefined; + const userProfile = (user?.anyOf?.[0] as JSONSchema | undefined)?.properties?.['profile'] as + | JSONSchema + | undefined; + const hobbies = (userProfile?.anyOf?.[0] as JSONSchema | undefined)?.properties?.['hobbies'] as + | JSONSchema + | undefined; + const hobbiesItems = (hobbies?.anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined; + const hobbiesItemTypes = hobbiesItems?.anyOf?.map((typeEntry) => (typeEntry as JSONSchema).type); + expect(hobbiesItemTypes).toContain('string'); + + // Access 'user.profile.addresses' + const addresses = (userProfile?.anyOf?.[0] as JSONSchema | undefined)?.properties?.['addresses'] as + | JSONSchema + | 
undefined; + const addressesItems = (addresses?.anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined; + const addressItemTypes = addressesItems?.anyOf?.map((typeEntry) => (typeEntry as JSONSchema).type); + expect(addressItemTypes).toContain('object'); + + // Check that 'orders' is an array + const orders = schema.properties?.['orders'] as JSONSchema | undefined; + expect(orders).toBeDefined(); + const ordersType = (orders?.anyOf?.[0] as JSONSchema | undefined)?.type; + expect(ordersType).toBe('array'); + + // Access 'items' within 'orders' + const orderItemsParent = (orders?.anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined; + const orderItems = (orderItemsParent?.anyOf?.[0] as JSONSchema | undefined)?.properties?.['items'] as + | JSONSchema + | undefined; + const orderItemsType = (orderItems?.anyOf?.[0] as JSONSchema | undefined)?.type; + expect(orderItemsType).toBe('array'); + }); + + it('updates schema correctly when processing multiple documents', () => { + const analyzer = SchemaAnalyzer.fromDocuments(complexDocumentsArray); + const schema = analyzer.getSchema(); + + // Check that 'x-documentsInspected' is correct + expect(schema['x-documentsInspected']).toBe(complexDocumentsArray.length); + + // Check that some fields are present from different documents + expect(schema.properties).toHaveProperty('stringField'); + expect(schema.properties).toHaveProperty('personalInfo'); + expect(schema.properties).toHaveProperty('integersArray'); + expect(schema.properties).toHaveProperty('user'); + + // Check that 'integersArray' has correct min and max values + const integersArray = schema.properties?.['integersArray'] as JSONSchema | undefined; + const integerItemType = ((integersArray?.anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined) + ?.anyOf?.[0] as JSONSchema | undefined; + expect(integerItemType?.['x-minValue']).toBe(1); + expect(integerItemType?.['x-maxValue']).toBe(5); + + // Check that 
'orders.items.price' is detected as Decimal128 + const orders2 = schema.properties?.['orders'] as JSONSchema | undefined; + const orderItemsParent2 = (orders2?.anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined; + const orderItems = (orderItemsParent2?.anyOf?.[0] as JSONSchema | undefined)?.properties?.['items'] as + | JSONSchema + | undefined; + const priceFieldParent = ((orderItems?.anyOf?.[0] as JSONSchema | undefined)?.items as JSONSchema | undefined) + ?.anyOf?.[0] as JSONSchema | undefined; + const priceField = priceFieldParent?.properties?.['price'] as JSONSchema | undefined; + const priceFieldType = priceField?.anyOf?.[0] as JSONSchema | undefined; + expect(priceFieldType?.['x-bsonType']).toBe('decimal128'); + }); + + describe('traverses schema', () => { + it('with valid paths', () => { + const analyzer = SchemaAnalyzer.fromDocument(complexDocument); + const schema = analyzer.getSchema(); + + let propertiesAtRoot = getPropertyNamesAtLevel(schema, []); + expect(propertiesAtRoot).toHaveLength(4); + + propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user']); + expect(propertiesAtRoot).toHaveLength(3); + + propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user', 'profile']); + expect(propertiesAtRoot).toHaveLength(4); + }); + + it('with broken paths', () => { + const analyzer = SchemaAnalyzer.fromDocument(complexDocument); + const schema = analyzer.getSchema(); + + const propertiesAtRoot = getPropertyNamesAtLevel(schema, []); + expect(propertiesAtRoot).toHaveLength(4); + + expect(() => getPropertyNamesAtLevel(schema, ['no-entry'])).toThrow(); + + expect(() => getPropertyNamesAtLevel(schema, ['user', 'no-entry'])).toThrow(); + }); + + it('with sparse docs and mixed types', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(complexDocument); + analyzer.addDocument(complexDocumentWithOddTypes); + const schema = analyzer.getSchema(); + + let propertiesAtRoot = getPropertyNamesAtLevel(schema, []); + 
expect(propertiesAtRoot).toHaveLength(4); + + propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user']); + expect(propertiesAtRoot).toHaveLength(3); + expect(propertiesAtRoot).toEqual(['email', 'profile', 'username']); + + propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user', 'profile']); + expect(propertiesAtRoot).toHaveLength(4); + expect(propertiesAtRoot).toEqual(['addresses', 'firstName', 'hobbies', 'lastName']); + + propertiesAtRoot = getPropertyNamesAtLevel(schema, ['history']); + expect(propertiesAtRoot).toHaveLength(6); + }); + }); + + describe('SchemaAnalyzer class methods', () => { + it('clone() creates an independent deep copy', () => { + // Use embeddedDocumentOnly (plain JS types) to avoid structuredClone issues with BSON types + const original = SchemaAnalyzer.fromDocument(embeddedDocumentOnly); + const cloned = original.clone(); + + // Clone has the same document count + expect(cloned.getDocumentCount()).toBe(1); + + // Clone has the same properties + const originalProps = Object.keys(original.getSchema().properties || {}); + const clonedProps = Object.keys(cloned.getSchema().properties || {}); + expect(clonedProps).toEqual(originalProps); + + // Add another document to the original only + original.addDocument(arraysWithDifferentDataTypes); + expect(original.getDocumentCount()).toBe(2); + expect(cloned.getDocumentCount()).toBe(1); + + // Clone's schema was NOT affected by the mutation + const originalPropsAfter = Object.keys(original.getSchema().properties || {}); + const clonedPropsAfter = Object.keys(cloned.getSchema().properties || {}); + expect(originalPropsAfter).toContain('integersArray'); + expect(originalPropsAfter).toContain('stringsArray'); + expect(clonedPropsAfter).not.toContain('integersArray'); + expect(clonedPropsAfter).not.toContain('stringsArray'); + }); + + it('reset() clears all accumulated state', () => { + const analyzer = SchemaAnalyzer.fromDocument(flatDocument); + 
expect(analyzer.getDocumentCount()).toBeGreaterThan(0); + expect(Object.keys(analyzer.getSchema().properties || {})).not.toHaveLength(0); + + analyzer.reset(); + + expect(analyzer.getDocumentCount()).toBe(0); + const schema = analyzer.getSchema(); + expect(schema.properties).toBeUndefined(); + expect(schema['x-documentsInspected']).toBeUndefined(); + }); + + it('fromDocument() creates analyzer with single document', () => { + const analyzer = SchemaAnalyzer.fromDocument(flatDocument); + expect(analyzer.getDocumentCount()).toBe(1); + + const schema = analyzer.getSchema(); + const expectedFields = Object.keys(flatDocument); + expect(Object.keys(schema.properties || {})).toEqual(expect.arrayContaining(expectedFields)); + }); + + it('fromDocuments() creates analyzer with multiple documents', () => { + const analyzer = SchemaAnalyzer.fromDocuments(sparseDocumentsArray); + expect(analyzer.getDocumentCount()).toBe(sparseDocumentsArray.length); + + // Compare with manually-built analyzer + const manual = new SchemaAnalyzer(); + manual.addDocuments(sparseDocumentsArray); + + expect(JSON.stringify(analyzer.getSchema())).toBe(JSON.stringify(manual.getSchema())); + }); + + it('addDocuments() is equivalent to multiple addDocument() calls', () => { + const batch = new SchemaAnalyzer(); + batch.addDocuments(complexDocumentsArray); + + const sequential = new SchemaAnalyzer(); + for (const doc of complexDocumentsArray) { + sequential.addDocument(doc); + } + + expect(batch.getDocumentCount()).toBe(sequential.getDocumentCount()); + expect(JSON.stringify(batch.getSchema())).toBe(JSON.stringify(sequential.getSchema())); + }); + }); +}); diff --git a/packages/schema-analyzer/test/SchemaAnalyzer.versioning.test.ts b/packages/schema-analyzer/test/SchemaAnalyzer.versioning.test.ts new file mode 100644 index 000000000..38ef144a6 --- /dev/null +++ b/packages/schema-analyzer/test/SchemaAnalyzer.versioning.test.ts @@ -0,0 +1,663 @@ 
+/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { ObjectId, type Document, type WithId } from 'mongodb'; +import { type JSONSchema } from '../src/JSONSchema'; +import { SchemaAnalyzer } from '../src/SchemaAnalyzer'; + +// ------------------------------------------------------------------ +// Test fixtures +// ------------------------------------------------------------------ + +function makeDoc(fields: Record<string, unknown> = {}): WithId<Document> { + return { _id: new ObjectId(), ...fields }; +} + +// ------------------------------------------------------------------ +// Version counter +// ------------------------------------------------------------------ +describe('SchemaAnalyzer version counter', () => { + it('starts at 0 for a new analyzer', () => { + const analyzer = new SchemaAnalyzer(); + expect(analyzer.version).toBe(0); + }); + + it('increments on addDocument()', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ a: 1 })); + expect(analyzer.version).toBe(1); + + analyzer.addDocument(makeDoc({ b: 2 })); + expect(analyzer.version).toBe(2); + }); + + it('increments only once for addDocuments() (batch)', () => { + const analyzer = new SchemaAnalyzer(); + const docs = [makeDoc({ a: 1 }), makeDoc({ b: 2 }), makeDoc({ c: 3 })]; + + analyzer.addDocuments(docs); + expect(analyzer.version).toBe(1); + }); + + it('increments on reset()', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ x: 1 })); + expect(analyzer.version).toBe(1); + + analyzer.reset(); + expect(analyzer.version).toBe(2); + }); + + it('cloned analyzer starts with version 0 (independent from original)', () => { + const original = new SchemaAnalyzer(); + 
original.addDocument(makeDoc({ a: 1 })); + original.addDocument(makeDoc({ b: 2 })); + expect(original.version).toBe(2); + + const cloned = original.clone(); + expect(cloned.version).toBe(0); + + // Mutating the clone does not affect the original's version + cloned.addDocument(makeDoc({ c: 3 })); + expect(cloned.version).toBe(1); + expect(original.version).toBe(2); + }); + + it('accumulates across mixed operations', () => { + const analyzer = new SchemaAnalyzer(); + // addDocument +1 + analyzer.addDocument(makeDoc()); + expect(analyzer.version).toBe(1); + + // addDocuments +1 (batch) + analyzer.addDocuments([makeDoc(), makeDoc()]); + expect(analyzer.version).toBe(2); + + // reset +1 + analyzer.reset(); + expect(analyzer.version).toBe(3); + + // addDocument after reset +1 + analyzer.addDocument(makeDoc()); + expect(analyzer.version).toBe(4); + }); + + it('fromDocument() factory yields version 1', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ a: 1 })); + expect(analyzer.version).toBe(1); + }); + + it('fromDocuments() factory yields version 1', () => { + const analyzer = SchemaAnalyzer.fromDocuments([makeDoc(), makeDoc(), makeDoc()]); + expect(analyzer.version).toBe(1); + }); +}); + +// ------------------------------------------------------------------ +// Version-based caching (getKnownFields cache) +// ------------------------------------------------------------------ +describe('SchemaAnalyzer getKnownFields cache', () => { + it('is populated on first call to getKnownFields()', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice', age: 30 })); + const fields = analyzer.getKnownFields(); + + expect(fields.length).toBeGreaterThan(0); + // Should contain _id, age, name + const paths = fields.map((f) => f.path); + expect(paths).toContain('_id'); + expect(paths).toContain('name'); + expect(paths).toContain('age'); + }); + + it('is reused when version has not changed (same reference)', () => { + const analyzer = 
SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice' })); + const first = analyzer.getKnownFields(); + const second = analyzer.getKnownFields(); + + // Same array reference — cache was reused, not recomputed + expect(second).toBe(first); + }); + + it('is invalidated when addDocument() is called', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice' })); + const before = analyzer.getKnownFields(); + + analyzer.addDocument(makeDoc({ name: 'Bob', email: 'bob@test.com' })); + const after = analyzer.getKnownFields(); + + // Different reference — cache was recomputed + expect(after).not.toBe(before); + // New field should be present + expect(after.map((f) => f.path)).toContain('email'); + }); + + it('is invalidated when addDocuments() is called', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice' })); + const before = analyzer.getKnownFields(); + + analyzer.addDocuments([makeDoc({ score: 42 }), makeDoc({ level: 7 })]); + const after = analyzer.getKnownFields(); + + expect(after).not.toBe(before); + const paths = after.map((f) => f.path); + expect(paths).toContain('score'); + expect(paths).toContain('level'); + }); + + it('is invalidated when reset() is called', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice' })); + const before = analyzer.getKnownFields(); + expect(before.length).toBeGreaterThan(0); + + analyzer.reset(); + const after = analyzer.getKnownFields(); + + expect(after).not.toBe(before); + // After reset the schema is empty so no fields + expect(after).toHaveLength(0); + }); + + it('returns updated results after cache invalidation', () => { + const analyzer = new SchemaAnalyzer(); + // Empty analyzer → no known fields + expect(analyzer.getKnownFields()).toHaveLength(0); + + // Add first doc + analyzer.addDocument(makeDoc({ x: 1 })); + const fields1 = analyzer.getKnownFields(); + expect(fields1.map((f) => f.path)).toEqual(expect.arrayContaining(['_id', 'x'])); + + // Add second 
doc with new field + analyzer.addDocument(makeDoc({ x: 2, y: 'hello' })); + const fields2 = analyzer.getKnownFields(); + expect(fields2).not.toBe(fields1); + expect(fields2.map((f) => f.path)).toContain('y'); + }); + + it('clone gets its own independent cache', () => { + const original = SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice' })); + const originalFields = original.getKnownFields(); + + const cloned = original.clone(); + const clonedFields = cloned.getKnownFields(); + + // Both should have the same content but be independent objects + expect(clonedFields).not.toBe(originalFields); + expect(clonedFields.map((f) => f.path)).toEqual(originalFields.map((f) => f.path)); + + // Mutating the clone should not affect the original cache + cloned.addDocument(makeDoc({ extra: true })); + const clonedFieldsAfter = cloned.getKnownFields(); + expect(clonedFieldsAfter.map((f) => f.path)).toContain('extra'); + expect(original.getKnownFields().map((f) => f.path)).not.toContain('extra'); + }); +}); + +// ------------------------------------------------------------------ +// Instances and types counting +// ------------------------------------------------------------------ +describe('SchemaAnalyzer instances and types counting', () => { + describe('x-occurrence (field instance counting)', () => { + it('counts 1 for a field present in a single document', () => { + const analyzer = SchemaAnalyzer.fromDocument(makeDoc({ name: 'Alice' })); + const schema = analyzer.getSchema(); + const nameField = schema.properties?.['name'] as JSONSchema; + expect(nameField['x-occurrence']).toBe(1); + }); + + it('counts correctly across multiple documents', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ name: 'Alice', age: 30 })); + analyzer.addDocument(makeDoc({ name: 'Bob', age: 25 })); + analyzer.addDocument(makeDoc({ name: 'Carol' })); // no age + + const schema = analyzer.getSchema(); + expect((schema.properties?.['name'] as 
JSONSchema)['x-occurrence']).toBe(3); + expect((schema.properties?.['age'] as JSONSchema)['x-occurrence']).toBe(2); + }); + + it('counts sparse fields correctly (field missing in some documents)', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ a: 1, b: 2, c: 3 })); + analyzer.addDocument(makeDoc({ a: 10 })); // only 'a' + analyzer.addDocument(makeDoc({ a: 100, c: 300 })); // 'a' and 'c' + + const schema = analyzer.getSchema(); + expect((schema.properties?.['a'] as JSONSchema)['x-occurrence']).toBe(3); + expect((schema.properties?.['b'] as JSONSchema)['x-occurrence']).toBe(1); + expect((schema.properties?.['c'] as JSONSchema)['x-occurrence']).toBe(2); + }); + + it('counts occurrences for nested object properties', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ user: { name: 'Alice', age: 30 } })); + analyzer.addDocument(makeDoc({ user: { name: 'Bob' } })); // no age + + const schema = analyzer.getSchema(); + const userField = schema.properties?.['user'] as JSONSchema; + const objectEntry = userField.anyOf?.find((e) => (e as JSONSchema).type === 'object') as JSONSchema; + + expect((objectEntry.properties?.['name'] as JSONSchema)['x-occurrence']).toBe(2); + expect((objectEntry.properties?.['age'] as JSONSchema)['x-occurrence']).toBe(1); + }); + }); + + describe('x-typeOccurrence (type counting)', () => { + it('counts type occurrences for a single-type field', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ name: 'Alice' })); + analyzer.addDocument(makeDoc({ name: 'Bob' })); + analyzer.addDocument(makeDoc({ name: 'Carol' })); + + const schema = analyzer.getSchema(); + const nameField = schema.properties?.['name'] as JSONSchema; + const stringEntry = nameField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'string', + ) as JSONSchema; + + expect(stringEntry['x-typeOccurrence']).toBe(3); + }); + + it('counts type occurrences for polymorphic fields', () => { 
+ const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ value: 'hello' })); + analyzer.addDocument(makeDoc({ value: 42 })); + analyzer.addDocument(makeDoc({ value: 'world' })); + analyzer.addDocument(makeDoc({ value: true })); + + const schema = analyzer.getSchema(); + const valueField = schema.properties?.['value'] as JSONSchema; + + const stringEntry = valueField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'string', + ) as JSONSchema; + const booleanEntry = valueField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'boolean', + ) as JSONSchema; + + // 2 strings, 1 number, 1 boolean + expect(stringEntry['x-typeOccurrence']).toBe(2); + expect(booleanEntry['x-typeOccurrence']).toBe(1); + + // total x-occurrence should equal sum of x-typeOccurrence values + const totalTypeOccurrence = (valueField.anyOf as JSONSchema[]).reduce( + (sum, entry) => sum + ((entry['x-typeOccurrence'] as number) ?? 0), + 0, + ); + expect(valueField['x-occurrence']).toBe(totalTypeOccurrence); + }); + + it('counts array element types across documents', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ tags: ['a', 'b'] })); // 2 strings + analyzer.addDocument(makeDoc({ tags: ['c', 42] })); // 1 string + 1 number + analyzer.addDocument(makeDoc({ tags: [true] })); // 1 boolean + + const schema = analyzer.getSchema(); + const tagsField = schema.properties?.['tags'] as JSONSchema; + const arrayEntry = tagsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const itemsSchema = arrayEntry.items as JSONSchema; + + const stringEntry = itemsSchema.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'string', + ) as JSONSchema; + const booleanEntry = itemsSchema.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'boolean', + ) as JSONSchema; + + // 3 string elements total: "a", "b", "c" + expect(stringEntry['x-typeOccurrence']).toBe(3); + + // 1 boolean element + 
expect(booleanEntry['x-typeOccurrence']).toBe(1); + }); + + it('type occurrence count equals field occurrence for a single-type field', () => { + const analyzer = new SchemaAnalyzer(); + for (let i = 0; i < 5; i++) { + analyzer.addDocument(makeDoc({ score: i * 10 })); + } + + const schema = analyzer.getSchema(); + const scoreField = schema.properties?.['score'] as JSONSchema; + const typeEntries = scoreField.anyOf as JSONSchema[]; + + // Only one type, so its typeOccurrence should equal the field occurrence + expect(typeEntries).toHaveLength(1); + expect(typeEntries[0]['x-typeOccurrence']).toBe(scoreField['x-occurrence']); + }); + }); + + describe('x-documentsInspected counting', () => { + it('tracks document count at root level', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ a: 1 })); + analyzer.addDocument(makeDoc({ b: 2 })); + analyzer.addDocument(makeDoc({ c: 3 })); + + expect(analyzer.getSchema()['x-documentsInspected']).toBe(3); + expect(analyzer.getDocumentCount()).toBe(3); + }); + + it('tracks object instances for nested objects', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ info: { x: 1 } })); + analyzer.addDocument(makeDoc({ info: { x: 2, y: 3 } })); + + const schema = analyzer.getSchema(); + const infoField = schema.properties?.['info'] as JSONSchema; + const objectEntry = infoField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + expect(objectEntry['x-documentsInspected']).toBe(2); + }); + + it('tracks object instances inside arrays accurately', () => { + const analyzer = new SchemaAnalyzer(); + // doc1: array with 2 objects + analyzer.addDocument(makeDoc({ items: [{ a: 1 }, { a: 2 }] })); + // doc2: array with 1 object + analyzer.addDocument(makeDoc({ items: [{ a: 3, b: 4 }] })); + + const schema = analyzer.getSchema(); + const itemsField = schema.properties?.['items'] as JSONSchema; + const arrayEntry = itemsField.anyOf?.find((e) => (e as 
JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const objectEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + // 3 objects total (2 from doc1, 1 from doc2) + expect(objectEntry['x-documentsInspected']).toBe(3); + // "a" appears in all 3 objects + expect((objectEntry.properties?.['a'] as JSONSchema)['x-occurrence']).toBe(3); + // "b" appears in 1 of 3 objects + expect((objectEntry.properties?.['b'] as JSONSchema)['x-occurrence']).toBe(1); + }); + + it('resets to 0 after reset()', () => { + const analyzer = SchemaAnalyzer.fromDocuments([makeDoc({ a: 1 }), makeDoc({ b: 2 })]); + expect(analyzer.getDocumentCount()).toBe(2); + + analyzer.reset(); + expect(analyzer.getDocumentCount()).toBe(0); + }); + }); + + describe('probability correctness (occurrence / documentsInspected)', () => { + it('yields 100% for fields present in every document', () => { + const analyzer = new SchemaAnalyzer(); + for (let i = 0; i < 10; i++) { + analyzer.addDocument(makeDoc({ name: `user-${i}` })); + } + + const schema = analyzer.getSchema(); + const occurrence = (schema.properties?.['name'] as JSONSchema)['x-occurrence'] as number; + const total = schema['x-documentsInspected'] as number; + expect(occurrence / total).toBe(1); + }); + + it('yields correct fraction for sparse fields', () => { + const analyzer = new SchemaAnalyzer(); + // 3 docs with 'a', 1 doc with 'b' + analyzer.addDocument(makeDoc({ a: 1, b: 10 })); + analyzer.addDocument(makeDoc({ a: 2 })); + analyzer.addDocument(makeDoc({ a: 3 })); + + const schema = analyzer.getSchema(); + const total = schema['x-documentsInspected'] as number; + const aOccurrence = (schema.properties?.['a'] as JSONSchema)['x-occurrence'] as number; + const bOccurrence = (schema.properties?.['b'] as JSONSchema)['x-occurrence'] as number; + + expect(aOccurrence / total).toBe(1); // 3/3 + expect(bOccurrence / total).toBeCloseTo(1 / 3); // 1/3 + }); + + it('yields 
correct fraction for nested objects inside arrays', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument( + makeDoc({ + items: [ + { name: 'A', price: 10 }, + { name: 'B' }, // no price + ], + }), + ); + analyzer.addDocument(makeDoc({ items: [{ name: 'C', price: 20 }] })); + + const schema = analyzer.getSchema(); + const itemsField = schema.properties?.['items'] as JSONSchema; + const arrayEntry = itemsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const objectEntry = (arrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + const denominator = objectEntry['x-documentsInspected'] as number; + const nameOccurrence = (objectEntry.properties?.['name'] as JSONSchema)['x-occurrence'] as number; + const priceOccurrence = (objectEntry.properties?.['price'] as JSONSchema)['x-occurrence'] as number; + + expect(denominator).toBe(3); // 3 objects total + expect(nameOccurrence / denominator).toBe(1); // 3/3 + expect(priceOccurrence / denominator).toBeCloseTo(2 / 3); // 2/3 + }); + }); + + describe('array and nested array counting', () => { + it('counts x-typeOccurrence for the array type entry across documents', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ tags: ['a'] })); + analyzer.addDocument(makeDoc({ tags: ['b', 'c'] })); + analyzer.addDocument(makeDoc({ tags: 42 })); // not an array + + const schema = analyzer.getSchema(); + const tagsField = schema.properties?.['tags'] as JSONSchema; + + // Field seen 3 times total + expect(tagsField['x-occurrence']).toBe(3); + + const arrayEntry = tagsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + + // Array type seen 2 out of 3 times + expect(arrayEntry['x-typeOccurrence']).toBe(2); + + // x-minItems / x-maxItems tracked across array instances + expect(arrayEntry['x-minItems']).toBe(1); + expect(arrayEntry['x-maxItems']).toBe(2); + }); 
+ + it('counts x-minItems / x-maxItems for arrays across documents', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ nums: [1, 2, 3] })); // length 3 + analyzer.addDocument(makeDoc({ nums: [10] })); // length 1 + analyzer.addDocument(makeDoc({ nums: [4, 5, 6, 7, 8] })); // length 5 + + const schema = analyzer.getSchema(); + const numsField = schema.properties?.['nums'] as JSONSchema; + const arrayEntry = numsField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + + expect(arrayEntry['x-minItems']).toBe(1); + expect(arrayEntry['x-maxItems']).toBe(5); + expect(arrayEntry['x-typeOccurrence']).toBe(3); + }); + + it('counts nested arrays (arrays within arrays)', () => { + const analyzer = new SchemaAnalyzer(); + // matrix is an array of arrays of numbers + analyzer.addDocument( + makeDoc({ + matrix: [ + [1, 2], + [3, 4, 5], + ], + }), + ); + analyzer.addDocument(makeDoc({ matrix: [[10]] })); + + const schema = analyzer.getSchema(); + const matrixField = schema.properties?.['matrix'] as JSONSchema; + const outerArrayEntry = matrixField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'array', + ) as JSONSchema; + + // Outer array seen in 2 documents + expect(outerArrayEntry['x-typeOccurrence']).toBe(2); + // doc1 has 2 inner arrays, doc2 has 1 + expect(outerArrayEntry['x-minItems']).toBe(1); + expect(outerArrayEntry['x-maxItems']).toBe(2); + + // Inner arrays: items type should be 'array' + const innerArrayEntry = (outerArrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'array', + ) as JSONSchema; + expect(innerArrayEntry).toBeDefined(); + // 3 inner arrays total: [1,2], [3,4,5], [10] + expect(innerArrayEntry['x-typeOccurrence']).toBe(3); + // inner array lengths: 2, 3, 1 + expect(innerArrayEntry['x-minItems']).toBe(1); + expect(innerArrayEntry['x-maxItems']).toBe(3); + + // Elements inside inner arrays are numbers + const numberEntry = (innerArrayEntry.items 
as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema).type === 'number', + ) as JSONSchema; + expect(numberEntry).toBeDefined(); + // 6 numbers total: 1,2,3,4,5,10 + expect(numberEntry['x-typeOccurrence']).toBe(6); + }); + + it('counts objects within arrays within objects (deep nesting)', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument( + makeDoc({ + company: { + departments: [ + { name: 'Eng', employees: [{ role: 'Dev' }, { role: 'QA', level: 3 }] }, + { name: 'Sales' }, + ], + }, + }), + ); + analyzer.addDocument( + makeDoc({ + company: { + departments: [{ name: 'HR', employees: [{ role: 'Recruiter' }] }], + }, + }), + ); + + const schema = analyzer.getSchema(); + + // company is an object + const companyField = schema.properties?.['company'] as JSONSchema; + const companyObj = companyField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + expect(companyObj['x-documentsInspected']).toBe(2); + + // departments is an array inside company + const deptField = companyObj.properties?.['departments'] as JSONSchema; + const deptArrayEntry = deptField.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'array', + ) as JSONSchema; + expect(deptArrayEntry['x-typeOccurrence']).toBe(2); + + // department objects: 2 from doc1 + 1 from doc2 = 3 + const deptObjEntry = (deptArrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + expect(deptObjEntry['x-documentsInspected']).toBe(3); + expect(deptObjEntry['x-typeOccurrence']).toBe(3); + + // "name" in all 3 department objects, "employees" in 2 of 3 + expect((deptObjEntry.properties?.['name'] as JSONSchema)['x-occurrence']).toBe(3); + expect((deptObjEntry.properties?.['employees'] as JSONSchema)['x-occurrence']).toBe(2); + + // employees is an array inside department objects + const empField = deptObjEntry.properties?.['employees'] as JSONSchema; + const empArrayEntry = empField.anyOf?.find( + (e) => 
(e as JSONSchema)['x-bsonType'] === 'array', + ) as JSONSchema; + expect(empArrayEntry['x-typeOccurrence']).toBe(2); + + // employee objects: 2 from first dept + 1 from HR = 3 + const empObjEntry = (empArrayEntry.items as JSONSchema).anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + expect(empObjEntry['x-documentsInspected']).toBe(3); + + // "role" in all 3 employee objects, "level" in 1 + expect((empObjEntry.properties?.['role'] as JSONSchema)['x-occurrence']).toBe(3); + expect((empObjEntry.properties?.['level'] as JSONSchema)['x-occurrence']).toBe(1); + }); + + it('tracks mixed types inside arrays (objects + primitives)', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument(makeDoc({ data: ['hello', { key: 'val' }, 42] })); + analyzer.addDocument(makeDoc({ data: [{ key: 'v2', extra: true }] })); + + const schema = analyzer.getSchema(); + const dataField = schema.properties?.['data'] as JSONSchema; + const arrayEntry = dataField.anyOf?.find((e) => (e as JSONSchema)['x-bsonType'] === 'array') as JSONSchema; + const itemsSchema = arrayEntry.items as JSONSchema; + + // string: 1, object: 2, number: 1 + const stringEntry = itemsSchema.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'string', + ) as JSONSchema; + const objectEntry = itemsSchema.anyOf?.find( + (e) => (e as JSONSchema)['x-bsonType'] === 'object', + ) as JSONSchema; + + expect(stringEntry['x-typeOccurrence']).toBe(1); + expect(objectEntry['x-typeOccurrence']).toBe(2); + expect(objectEntry['x-documentsInspected']).toBe(2); + + // "key" in both objects, "extra" in 1 + expect((objectEntry.properties?.['key'] as JSONSchema)['x-occurrence']).toBe(2); + expect((objectEntry.properties?.['extra'] as JSONSchema)['x-occurrence']).toBe(1); + }); + }); + + describe('addDocuments vs sequential addDocument equivalence', () => { + it('produces identical occurrence counts', () => { + const docs = [makeDoc({ a: 1, b: 'x' }), makeDoc({ a: 2 }), makeDoc({ 
a: 3, c: true })]; + + const batch = new SchemaAnalyzer(); + batch.addDocuments(docs); + + const sequential = new SchemaAnalyzer(); + for (const doc of docs) { + sequential.addDocument(doc); + } + + const batchSchema = batch.getSchema(); + const seqSchema = sequential.getSchema(); + + // Root counts match + expect(batchSchema['x-documentsInspected']).toBe(seqSchema['x-documentsInspected']); + + // Field-level occurrence counts match + for (const key of Object.keys(batchSchema.properties ?? {})) { + const batchField = batchSchema.properties?.[key] as JSONSchema; + const seqField = seqSchema.properties?.[key] as JSONSchema; + expect(batchField['x-occurrence']).toBe(seqField['x-occurrence']); + } + }); + + it('produces identical type occurrence counts', () => { + const docs = [makeDoc({ value: 'hello' }), makeDoc({ value: 42 }), makeDoc({ value: 'world' })]; + + const batch = new SchemaAnalyzer(); + batch.addDocuments(docs); + + const sequential = new SchemaAnalyzer(); + for (const doc of docs) { + sequential.addDocument(doc); + } + + // Stringify the schemas to compare their full type entry structures + expect(JSON.stringify(batch.getSchema())).toBe(JSON.stringify(sequential.getSchema())); + }); + }); +}); diff --git a/src/utils/json/mongo/mongoTestDocuments.ts b/packages/schema-analyzer/test/mongoTestDocuments.ts similarity index 100% rename from src/utils/json/mongo/mongoTestDocuments.ts rename to packages/schema-analyzer/test/mongoTestDocuments.ts diff --git a/packages/schema-analyzer/tsconfig.json b/packages/schema-analyzer/tsconfig.json new file mode 100644 index 000000000..8688f97ff --- /dev/null +++ b/packages/schema-analyzer/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "composite": true, + "declaration": true, + "declarationMap": true, + "module": "commonjs", + "target": "ES2023", + "lib": ["ES2023"], + "rootDir": "./src", + "outDir": "./dist", + "strict": true, + "esModuleInterop": true, + "allowSyntheticDefaultImports": true, + "skipLibCheck": 
true, + "resolveJsonModule": true, + "sourceMap": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} diff --git a/src/documentdb/ClusterSession.ts b/src/documentdb/ClusterSession.ts index da81218fe..27ae9b8e4 100644 --- a/src/documentdb/ClusterSession.ts +++ b/src/documentdb/ClusterSession.ts @@ -3,11 +3,16 @@ * Licensed under the MIT License. See License.txt in the project root for license information. *--------------------------------------------------------------------------------------------*/ +import { + SchemaAnalyzer, + getPropertyNamesAtLevel, + type FieldEntry, + type JSONSchema, +} from '@vscode-documentdb/schema-analyzer'; import * as l10n from '@vscode/l10n'; import { EJSON } from 'bson'; import { ObjectId, type Document, type Filter, type WithId } from 'mongodb'; -import { type JSONSchema } from '../utils/json/JSONSchema'; -import { getPropertyNamesAtLevel, updateSchemaWithDocument } from '../utils/json/mongo/SchemaAnalyzer'; +import { ext } from '../extensionVariables'; import { getDataAtPath } from '../utils/slickgrid/mongo/toSlickGridTable'; import { toSlickGridTree, type TreeData } from '../utils/slickgrid/mongo/toSlickGridTree'; import { ClustersClient, type FindQueryParams } from './ClustersClient'; @@ -78,7 +83,7 @@ export class ClusterSession { * Updates progressively as users navigate through different pages. * Reset when the query or page size changes. */ - private _accumulatedJsonSchema: JSONSchema = {}; + private _schemaAnalyzer: SchemaAnalyzer = new SchemaAnalyzer(); /** * Tracks the highest page number that has been accumulated into the schema. 
@@ -162,7 +167,8 @@ export class ClusterSession { } // The user's query has changed, invalidate all caches - this._accumulatedJsonSchema = {}; + this._schemaAnalyzer.reset(); + ext.outputChannel.trace('[SchemaAnalyzer] Reset — query changed'); this._highestPageAccumulated = 0; this._currentPageSize = null; this._currentRawDocuments = []; @@ -185,7 +191,8 @@ export class ClusterSession { private resetAccumulationIfPageSizeChanged(newPageSize: number): void { if (this._currentPageSize !== null && this._currentPageSize !== newPageSize) { // Page size changed, reset accumulation tracking - this._accumulatedJsonSchema = {}; + this._schemaAnalyzer.reset(); + ext.outputChannel.trace('[SchemaAnalyzer] Reset — page size changed'); this._highestPageAccumulated = 0; } this._currentPageSize = newPageSize; @@ -298,8 +305,12 @@ export class ClusterSession { // Since navigation is sequential and starts at page 1, we only need to track // the highest page number accumulated if (pageNumber > this._highestPageAccumulated) { - this._currentRawDocuments.map((doc) => updateSchemaWithDocument(this._accumulatedJsonSchema, doc)); + this._schemaAnalyzer.addDocuments(this._currentRawDocuments); this._highestPageAccumulated = pageNumber; + + ext.outputChannel.trace( + `[SchemaAnalyzer] Analyzed ${String(this._schemaAnalyzer.getDocumentCount())} documents, ${String(this._schemaAnalyzer.getKnownFields().length)} known fields`, + ); } return documents.length; @@ -355,7 +366,7 @@ export class ClusterSession { public getCurrentPageAsTable(path: string[]): TableData { const responsePack: TableData = { path: path, - headers: getPropertyNamesAtLevel(this._accumulatedJsonSchema, path), + headers: getPropertyNamesAtLevel(this._schemaAnalyzer.getSchema(), path), data: getDataAtPath(this._currentRawDocuments, path), }; @@ -363,7 +374,15 @@ export class ClusterSession { } public getCurrentSchema(): JSONSchema { - return this._accumulatedJsonSchema; + return this._schemaAnalyzer.getSchema(); + } + + /** + 
* Returns the cached list of known fields from the accumulated schema. + * Uses SchemaAnalyzer's version-based caching — only recomputed when the schema changes. + */ + public getKnownFields(): FieldEntry[] { + return this._schemaAnalyzer.getKnownFields(); } // ============================================================================ diff --git a/src/utils/json/mongo/autocomplete/basicMongoFindFilterSchema.json b/src/utils/json/data-api/autocomplete/basicMongoFindFilterSchema.json similarity index 100% rename from src/utils/json/mongo/autocomplete/basicMongoFindFilterSchema.json rename to src/utils/json/data-api/autocomplete/basicMongoFindFilterSchema.json diff --git a/src/utils/json/data-api/autocomplete/future-work.md b/src/utils/json/data-api/autocomplete/future-work.md new file mode 100644 index 000000000..660113c7d --- /dev/null +++ b/src/utils/json/data-api/autocomplete/future-work.md @@ -0,0 +1,161 @@ +# Autocomplete — Future Work + +Outstanding TODOs flagged in code during the schema transformer implementation (PR #506). +These must be resolved before the completion providers ship to users. + +--- + +## ~~1. `SPECIAL_CHARS_PATTERN` is incomplete + `insertText` quoting doesn't escape~~ ✅ RESOLVED + +**Resolved in:** PR #506 (commit addressing copilot review comment) + +Replaced `SPECIAL_CHARS_PATTERN` with `JS_IDENTIFIER_PATTERN` — a proper identifier validity check. +Added `\` → `\\` and `"` → `\"` escaping when quoting `insertText`. +Tests cover dashes, brackets, digits, embedded quotes, and backslashes. + +--- + +## 2. `referenceText` is invalid MQL for special field names + +**Severity:** Medium — will generate broken aggregation expressions +**File:** `toFieldCompletionItems.ts` — `referenceText` construction +**When to fix:** Before the aggregation completion provider is wired up + +### Problem + +`referenceText` is always `$${entry.path}` (e.g., `$address.city`). 
In MQL, the `$field.path` syntax only works when every segment is a valid identifier without dots, spaces, or `$`. For field names like `order-items`, `a.b`, or `my field`, the `$` prefix syntax produces invalid references. + +### Examples + +| Field name | Current `referenceText` | Valid? | Correct MQL | +| ------------------- | ----------------------- | -------------- | ------------------------------------ | +| `age` | `$age` | ✅ | `$age` | +| `address.city` | `$address.city` | ✅ (nested) | `$address.city` | +| `order-items` | `$order-items` | ❌ | `{ $getField: "order-items" }` | +| `a.b` (literal dot) | `$a.b` | ❌ (ambiguous) | `{ $getField: { $literal: "a.b" } }` | +| `my field` | `$my field` | ❌ | `{ $getField: "my field" }` | + +### Proposed approaches + +**Option A — Make `referenceText` optional:** Return `undefined` for fields that can't use `$`-prefix syntax. The completion provider would omit the reference suggestion for those fields. + +**Option B — Use `$getField` for special names:** + +```typescript +referenceText: needsQuoting + ? `{ $getField: "${escaped}" }` + : `$${entry.path}`, +``` + +**Option C — Provide both forms:** Add a `referenceTextRaw` (always `$path`) and `referenceTextSafe` (uses `$getField` when needed). Let the completion provider choose based on context. + +**Recommendation:** Option B is pragmatic. Option C is more flexible if we later need to support both forms in different contexts (e.g., `$match` vs `$project`). + +--- + +## 3. `FieldEntry.path` dot-concatenation is ambiguous for literal dots + +**Severity:** Low (rare in practice) — fields with literal dots were prohibited before MongoDB API 3.6 +**File:** `getKnownFields.ts` — path concatenation at `` path: `${path}.${childName}` `` +**When to fix:** When we encounter real-world schemas with literal dots, or during the next `FieldEntry` interface revision + +### Problem + +Paths are built by concatenating segments with `.` as separator.
A root-level field named `"a.b"` produces `path: "a.b"`, which is indistinguishable from a nested field `{ a: { b: ... } }`. + +This ambiguity flows downstream to all consumers: `toTypeScriptDefinition`, `toFieldCompletionItems`, `generateDescriptions`, and any future completion provider. + +### Examples + +| Document shape | Resulting `path` | Ambiguous? | +| --------------------- | ---------------- | ----------------------------- | +| `{ a: { b: 1 } }` | `"a.b"` | — | +| `{ "a.b": 1 }` | `"a.b"` | ✅ Same as above | +| `{ x: { "y.z": 1 } }` | `"x.y.z"` | ✅ Looks like 3-level nesting | + +### Proposed fix + +Change `FieldEntry.path` from `string` to `string[]` (segment array): + +```typescript +// Before +interface FieldEntry { + path: string; // "address.city" + ... +} + +// After +interface FieldEntry { + path: string[]; // ["address", "city"] + ... +} +``` + +Each consumer then formats the path for its own context: + +- **TypeScript definitions:** Already use schema `properties` keys directly (no change needed there) +- **Completion items:** `entry.path.join('.')` for display, bracket notation for special segments +- **Aggregation references:** `$` + segments joined with `.`, or `$getField` chains for special segments + +### Impact + +This is a **breaking change** to the `FieldEntry` interface. Affected consumers: + +- `toFieldCompletionItems.ts` +- `toTypeScriptDefinition.ts` (indirect — uses schema, not FieldEntry paths) +- `generateDescriptions.ts` (uses schema, not FieldEntry paths) +- `collectionViewRouter.ts` (imports `FieldEntry` type) +- `ClusterSession.ts` (imports `FieldEntry` type) +- `generateMongoFindJsonSchema.ts` (imports `FieldEntry` type) +- `SchemaAnalyzer.ts` (returns `FieldEntry[]` via `getKnownFields`) + +**Recommendation:** Defer until the completion provider is built. The ambiguity only matters for fields with literal dots, which are uncommon. When fixing, do it as a single atomic change across all consumers. + +--- + +## 4. 
TypeScript definition output references undeclared BSON type names + +**Severity:** Low — the TS definition is for display/hover only, not compiled or type-checked +**File:** `toTypeScriptDefinition.ts` — `bsonToTypeScriptMap` +**When to fix:** Before the TS definition is used in a context where type correctness matters (e.g., Monaco intellisense with an actual TS language service) + +### Problem + +The BSON-to-TypeScript type mapping emits non-built-in type names such as `ObjectId`, `Binary`, `Timestamp`, `MinKey`, `MaxKey`, `Code`, `DBRef`, and `UUID`. These are MongoDB API BSON driver types, but the generated definition string doesn't include `import` statements or `declare` stubs for them. + +If the output is ever fed to a TypeScript compiler or language service (e.g., Monaco with full TS checking), it will report "Cannot find name 'ObjectId'" etc. + +### Current state + +The generated output is used for documentation/hover display only — it's rendered as syntax-highlighted text, not compiled. So this is purely cosmetic today. + +### Proposed fix (when needed) + +**Option A — Emit `import type`:** + +```typescript +import type { ObjectId, Binary, Timestamp, MinKey, MaxKey, Code, DBRef, UUID } from 'mongodb'; +``` + +Only include types that actually appear in the schema. + +**Option B — Emit `declare type` stubs:** + +```typescript +declare type ObjectId = { toString(): string }; +declare type Binary = { length(): number }; +// ... etc. +``` + +Lightweight, no dependency on the `mongodb` package. + +**Option C — Map everything to primitive types:** + +```typescript +ObjectId → string // (its string representation) +Binary → Uint8Array +Timestamp → { t: number; i: number } +``` + +Loses semantic precision but avoids the undeclared-type problem entirely. + +**Recommendation:** Option A is the most correct approach. Collect the set of non-built-in types actually used in the schema, then prepend a single `import type` line. 
Defer until the output is consumed by a real TS language service. diff --git a/src/utils/json/data-api/autocomplete/generateDescriptions.test.ts b/src/utils/json/data-api/autocomplete/generateDescriptions.test.ts new file mode 100644 index 000000000..32a103431 --- /dev/null +++ b/src/utils/json/data-api/autocomplete/generateDescriptions.test.ts @@ -0,0 +1,210 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { type JSONSchema } from '@vscode-documentdb/schema-analyzer'; +import { generateDescriptions } from './generateDescriptions'; + +describe('generateDescriptions', () => { + it('adds descriptions with type and percentage for simple document', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + name: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + const nameSchema = schema.properties?.name as JSONSchema; + expect(nameSchema.description).toBe('String · 100%'); + }); + + it('includes min/max stats for numeric fields', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + age: { + 'x-occurrence': 95, + anyOf: [ + { + type: 'number', + 'x-bsonType': 'int32', + 'x-typeOccurrence': 95, + 'x-minValue': 18, + 'x-maxValue': 95, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + const ageSchema = schema.properties?.age as JSONSchema; + expect(ageSchema.description).toBe('Int32 · 95% · range: 18–95'); + }); + + it('includes length stats for string fields', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + name: { + 
'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + 'x-minLength': 3, + 'x-maxLength': 50, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + const nameSchema = schema.properties?.name as JSONSchema; + expect(nameSchema.description).toBe('String · 100% · length: 3–50'); + }); + + it('includes date range stats for date fields', () => { + const minDate = new Date('2020-01-01T00:00:00.000Z').getTime(); + const maxDate = new Date('2024-12-31T00:00:00.000Z').getTime(); + + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + createdAt: { + 'x-occurrence': 80, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'date', + 'x-typeOccurrence': 80, + 'x-minDate': minDate, + 'x-maxDate': maxDate, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + const createdAtSchema = schema.properties?.createdAt as JSONSchema; + expect(createdAtSchema.description).toBe('Date · 80% · range: 2020-01-01 – 2024-12-31'); + }); + + it('includes true/false counts for boolean fields', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + active: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'boolean', + 'x-bsonType': 'boolean', + 'x-typeOccurrence': 100, + 'x-trueCount': 80, + 'x-falseCount': 20, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + const activeSchema = schema.properties?.active as JSONSchema; + expect(activeSchema.description).toBe('Boolean · 100% · true: 80, false: 20'); + }); + + it('handles nested object fields (descriptions at nested level)', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + address: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'object', + 'x-bsonType': 'object', + 'x-typeOccurrence': 100, + 'x-documentsInspected': 100, + properties: { + city: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + 'x-minLength': 2, + 
'x-maxLength': 30, + }, + ], + }, + }, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + // The parent (address) should also get a description + const addressSchema = schema.properties?.address as JSONSchema; + expect(addressSchema.description).toBe('Object · 100%'); + + // The nested city should get its own description + const addressTypeEntry = (addressSchema.anyOf as JSONSchema[])[0]; + const citySchema = addressTypeEntry.properties?.city as JSONSchema; + expect(citySchema.description).toBe('String · 100% · length: 2–30'); + }); + + it('handles polymorphic fields (shows multiple types)', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + value: { + 'x-occurrence': 95, + anyOf: [ + { + type: 'number', + 'x-bsonType': 'int32', + 'x-typeOccurrence': 60, + 'x-minValue': 1, + 'x-maxValue': 100, + }, + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 35, + }, + ], + }, + }, + }; + + generateDescriptions(schema); + + const valueSchema = schema.properties?.value as JSONSchema; + // Dominant type first, then secondary + expect(valueSchema.description).toBe('Int32 | String · 95% · range: 1–100'); + }); +}); diff --git a/src/utils/json/data-api/autocomplete/generateDescriptions.ts b/src/utils/json/data-api/autocomplete/generateDescriptions.ts new file mode 100644 index 000000000..2f4f28867 --- /dev/null +++ b/src/utils/json/data-api/autocomplete/generateDescriptions.ts @@ -0,0 +1,218 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { BSONTypes, type JSONSchema } from '@vscode-documentdb/schema-analyzer'; +import Denque from 'denque'; + +/** + * Work item for BFS traversal of the schema tree. 
+ */
+interface WorkItem {
+    /** Property schema node that receives the generated `description`. */
+    schemaNode: JSONSchema;
+    /** Denominator for the occurrence percentage (the containing object's `x-documentsInspected`). */
+    parentDocumentsInspected: number;
+}
+
+/**
+ * Post-processor that mutates the schema in-place, adding human-readable
+ * `description` strings to property nodes. Each description joins, with ` · `:
+ * - Type display name(s), most frequent first
+ * - Occurrence percentage (`x-occurrence / parentDocumentsInspected`, rounded to a
+ *   whole percent; omitted when the denominator is 0)
+ * - Type-specific stats of the dominant type (length, range, true/false counts, etc.)
+ *
+ * Traversal is breadth-first. It starts at the root `properties` and descends only
+ * through a node's dominant type entry, and only when that entry is an object with
+ * `properties`. Array `items` and non-dominant object variants are not visited.
+ * A node for which no part can be produced keeps its existing `description`.
+ *
+ * @param schema Root schema to annotate; modified in place.
+ */
+export function generateDescriptions(schema: JSONSchema): void {
+    const rootDocumentsInspected = (schema['x-documentsInspected'] as number) ?? 0;
+
+    // Typed queue: without the element type every dequeued item would be `any`.
+    const queue = new Denque<WorkItem>();
+
+    // Seed the queue with root-level properties
+    if (schema.properties) {
+        enqueueProperties(queue, schema.properties, rootDocumentsInspected);
+    }
+
+    while (queue.length > 0) {
+        const item = queue.shift();
+        if (!item) continue;
+
+        const { schemaNode, parentDocumentsInspected } = item;
+
+        // Build description parts
+        const parts: string[] = [];
+
+        // Part 1: Type info (all anyOf types, dominant first)
+        const typeNames = collectTypeDisplayNames(schemaNode);
+        if (typeNames.length > 0) {
+            parts.push(typeNames.join(' | '));
+        }
+
+        // Part 2: Occurrence percentage (skipped when there is nothing to divide by)
+        if (parentDocumentsInspected > 0) {
+            const occurrence = (schemaNode['x-occurrence'] as number) ?? 0;
+            const percentage = ((occurrence / parentDocumentsInspected) * 100).toFixed(0);
+            parts.push(`${percentage}%`);
+        }
+
+        // Part 3: Stats from the dominant type entry
+        const dominantEntry = getDominantTypeEntry(schemaNode);
+        if (dominantEntry) {
+            const statString = getStatString(dominantEntry);
+            if (statString) {
+                parts.push(statString);
+            }
+
+            // If the dominant entry is an object with properties, enqueue children.
+            // Their percentages are relative to the number of object instances seen.
+            if (dominantEntry.type === 'object' && dominantEntry.properties) {
+                const objectDocumentsInspected = (dominantEntry['x-documentsInspected'] as number) ?? 0;
+                enqueueProperties(queue, dominantEntry.properties, objectDocumentsInspected);
+            }
+        }
+
+        // Set the description
+        if (parts.length > 0) {
+            schemaNode.description = parts.join(' · ');
+        }
+    }
+}
+
+/**
+ * Queues every non-boolean property schema of `properties`, tagging each with the
+ * occurrence denominator of the object that contains it.
+ */
+function enqueueProperties(
+    queue: Denque<WorkItem>,
+    properties: NonNullable<JSONSchema['properties']>,
+    parentDocumentsInspected: number,
+): void {
+    for (const propName of Object.keys(properties)) {
+        const propSchema = properties[propName] as JSONSchema;
+        // A boolean sub-schema is not an object, so there is nothing to annotate.
+        if (typeof propSchema === 'boolean') continue;
+
+        queue.push({ schemaNode: propSchema, parentDocumentsInspected });
+    }
+}
+
+/**
+ * Collects display names for all types in a schema node's `anyOf` entries.
+ * Returns them ordered by descending `x-typeOccurrence` (missing counts are
+ * treated as 0). Boolean entries are skipped. An entry without `x-bsonType`
+ * falls back to its JSON Schema `type`, then to `'Unknown'`.
+ */
+function collectTypeDisplayNames(schemaNode: JSONSchema): string[] {
+    if (!schemaNode.anyOf || schemaNode.anyOf.length === 0) {
+        return [];
+    }
+
+    const entries: Array<{ name: string; occurrence: number }> = [];
+    for (const entry of schemaNode.anyOf) {
+        if (typeof entry === 'boolean') continue;
+        const bsonType = (entry['x-bsonType'] as string) ?? '';
+        const occurrence = (entry['x-typeOccurrence'] as number) ?? 0;
+        const name = bsonType
+            ? BSONTypes.toDisplayString(bsonType as BSONTypes)
+            : ((entry.type as string) ?? 'Unknown');
+        entries.push({ name, occurrence });
+    }
+
+    // Sort by occurrence descending so dominant type comes first
+    entries.sort((a, b) => b.occurrence - a.occurrence);
+    return entries.map((e) => e.name);
+}
+
+/**
+ * Returns the anyOf entry with the highest `x-typeOccurrence`.
+ */ +function getDominantTypeEntry(schemaNode: JSONSchema): JSONSchema | null { + if (!schemaNode.anyOf || schemaNode.anyOf.length === 0) { + return null; + } + + let maxOccurrence = -1; + let dominant: JSONSchema | null = null; + + for (const entry of schemaNode.anyOf) { + if (typeof entry === 'boolean') continue; + const occurrence = (entry['x-typeOccurrence'] as number) ?? 0; + if (occurrence > maxOccurrence) { + maxOccurrence = occurrence; + dominant = entry; + } + } + + return dominant; +} + +/** + * Returns a type-specific stats string for the given type entry, or undefined if + * no relevant stats are available. + */ +function getStatString(typeEntry: JSONSchema): string | undefined { + const bsonType = (typeEntry['x-bsonType'] as string) ?? ''; + + switch (bsonType) { + case 'string': + case 'binary': { + const minLen = typeEntry['x-minLength'] as number | undefined; + const maxLen = typeEntry['x-maxLength'] as number | undefined; + if (minLen !== undefined && maxLen !== undefined) { + return `length: ${String(minLen)}–${String(maxLen)}`; + } + return undefined; + } + + case 'int32': + case 'double': + case 'long': + case 'decimal128': + case 'number': { + const minVal = typeEntry['x-minValue'] as number | undefined; + const maxVal = typeEntry['x-maxValue'] as number | undefined; + if (minVal !== undefined && maxVal !== undefined) { + return `range: ${String(minVal)}–${String(maxVal)}`; + } + return undefined; + } + + case 'date': { + const minDate = typeEntry['x-minDate'] as number | undefined; + const maxDate = typeEntry['x-maxDate'] as number | undefined; + if (minDate !== undefined && maxDate !== undefined) { + const minISO = new Date(minDate).toISOString().split('T')[0]; + const maxISO = new Date(maxDate).toISOString().split('T')[0]; + return `range: ${minISO} – ${maxISO}`; + } + return undefined; + } + + case 'boolean': { + const trueCount = typeEntry['x-trueCount'] as number | undefined; + const falseCount = typeEntry['x-falseCount'] as number | 
undefined; + if (trueCount !== undefined && falseCount !== undefined) { + return `true: ${String(trueCount)}, false: ${String(falseCount)}`; + } + return undefined; + } + + case 'array': { + const minItems = typeEntry['x-minItems'] as number | undefined; + const maxItems = typeEntry['x-maxItems'] as number | undefined; + if (minItems !== undefined && maxItems !== undefined) { + return `items: ${String(minItems)}–${String(maxItems)}`; + } + return undefined; + } + + case 'object': { + const minProps = typeEntry['x-minProperties'] as number | undefined; + const maxProps = typeEntry['x-maxProperties'] as number | undefined; + if (minProps !== undefined && maxProps !== undefined) { + return `properties: ${String(minProps)}–${String(maxProps)}`; + } + return undefined; + } + + default: + return undefined; + } +} diff --git a/src/utils/json/mongo/autocomplete/generateMongoFindJsonSchema.ts b/src/utils/json/data-api/autocomplete/generateMongoFindJsonSchema.ts similarity index 99% rename from src/utils/json/mongo/autocomplete/generateMongoFindJsonSchema.ts rename to src/utils/json/data-api/autocomplete/generateMongoFindJsonSchema.ts index 0f0fa7bbe..edaa8a64a 100644 --- a/src/utils/json/mongo/autocomplete/generateMongoFindJsonSchema.ts +++ b/src/utils/json/data-api/autocomplete/generateMongoFindJsonSchema.ts @@ -3,7 +3,7 @@ * Licensed under the MIT License. See License.txt in the project root for license information. *--------------------------------------------------------------------------------------------*/ -import { type FieldEntry } from './getKnownFields'; +import { type FieldEntry } from '@vscode-documentdb/schema-analyzer'; /** * Generates a JSON schema for MongoDB find filter queries. 
diff --git a/src/utils/json/data-api/autocomplete/getKnownFields.test.ts b/src/utils/json/data-api/autocomplete/getKnownFields.test.ts new file mode 100644 index 000000000..d0680e2f3 --- /dev/null +++ b/src/utils/json/data-api/autocomplete/getKnownFields.test.ts @@ -0,0 +1,128 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { type FieldEntry, getKnownFields, SchemaAnalyzer } from '@vscode-documentdb/schema-analyzer'; +import { ObjectId } from 'bson'; + +describe('getKnownFields', () => { + it('returns bsonType for primitive fields', () => { + const analyzer = SchemaAnalyzer.fromDocument({ + _id: new ObjectId(), + name: 'Alice', + age: 42, + score: 3.14, + active: true, + }); + const fields = getKnownFields(analyzer.getSchema()); + + const nameField = fields.find((f: FieldEntry) => f.path === 'name'); + expect(nameField?.type).toBe('string'); + expect(nameField?.bsonType).toBe('string'); + + const ageField = fields.find((f: FieldEntry) => f.path === 'age'); + expect(ageField?.type).toBe('number'); + // bsonType could be 'double' or 'int32' depending on JS runtime + expect(['double', 'int32']).toContain(ageField?.bsonType); + + const activeField = fields.find((f: FieldEntry) => f.path === 'active'); + expect(activeField?.type).toBe('boolean'); + expect(activeField?.bsonType).toBe('boolean'); + }); + + it('returns _id first and sorts alphabetically', () => { + const analyzer = SchemaAnalyzer.fromDocument({ + _id: new ObjectId(), + zebra: 1, + apple: 2, + mango: 3, + }); + const fields = getKnownFields(analyzer.getSchema()); + const paths = fields.map((f: FieldEntry) => f.path); + + expect(paths[0]).toBe('_id'); + // Remaining should be alphabetical + 
expect(paths.slice(1)).toEqual(['apple', 'mango', 'zebra']); + }); + + it('detects optional fields', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument({ _id: new ObjectId(), name: 'Alice', age: 30 }); + analyzer.addDocument({ _id: new ObjectId(), name: 'Bob' }); // no 'age' + + const fields = getKnownFields(analyzer.getSchema()); + + const nameField = fields.find((f: FieldEntry) => f.path === 'name'); + expect(nameField?.isSparse).toBeUndefined(); // present in all docs + + const ageField = fields.find((f: FieldEntry) => f.path === 'age'); + expect(ageField?.isSparse).toBe(true); // missing in doc2 + }); + + it('returns bsonTypes for polymorphic fields', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument({ _id: new ObjectId(), value: 'hello' }); + analyzer.addDocument({ _id: new ObjectId(), value: 42 }); + + const fields = getKnownFields(analyzer.getSchema()); + const valueField = fields.find((f: FieldEntry) => f.path === 'value'); + + expect(valueField?.bsonTypes).toBeDefined(); + expect(valueField?.bsonTypes).toHaveLength(2); + expect(valueField?.bsonTypes).toContain('string'); + // Could be 'double' or 'int32' + expect(valueField?.bsonTypes?.some((t: string) => ['double', 'int32'].includes(t))).toBe(true); + }); + + it('returns arrayItemBsonType for array fields', () => { + const analyzer = SchemaAnalyzer.fromDocument({ + _id: new ObjectId(), + tags: ['a', 'b', 'c'], + scores: [10, 20, 30], + }); + const fields = getKnownFields(analyzer.getSchema()); + + const tagsField = fields.find((f: FieldEntry) => f.path === 'tags'); + expect(tagsField?.type).toBe('array'); + expect(tagsField?.bsonType).toBe('array'); + expect(tagsField?.arrayItemBsonType).toBe('string'); + + const scoresField = fields.find((f: FieldEntry) => f.path === 'scores'); + expect(scoresField?.type).toBe('array'); + expect(scoresField?.arrayItemBsonType).toBeDefined(); + }); + + it('handles nested object fields', () => { + const analyzer = 
SchemaAnalyzer.fromDocument({ + _id: new ObjectId(), + user: { + name: 'Alice', + profile: { + bio: 'hello', + }, + }, + }); + const fields = getKnownFields(analyzer.getSchema()); + const paths = fields.map((f: FieldEntry) => f.path); + + // Objects are expanded, not leaf nodes + expect(paths).not.toContain('user'); + expect(paths).toContain('user.name'); + expect(paths).toContain('user.profile.bio'); + }); + + it('detects optional nested fields', () => { + const analyzer = new SchemaAnalyzer(); + analyzer.addDocument({ _id: new ObjectId(), user: { name: 'Alice', age: 30 } }); + analyzer.addDocument({ _id: new ObjectId(), user: { name: 'Bob' } }); // no age in nested obj + + const fields = getKnownFields(analyzer.getSchema()); + + const nameField = fields.find((f: FieldEntry) => f.path === 'user.name'); + expect(nameField?.isSparse).toBeUndefined(); // present in both objects + + const ageField = fields.find((f: FieldEntry) => f.path === 'user.age'); + expect(ageField?.isSparse).toBe(true); // missing in doc2's user object + }); +}); diff --git a/src/utils/json/data-api/autocomplete/toFieldCompletionItems.test.ts b/src/utils/json/data-api/autocomplete/toFieldCompletionItems.test.ts new file mode 100644 index 000000000..37a7ecc4e --- /dev/null +++ b/src/utils/json/data-api/autocomplete/toFieldCompletionItems.test.ts @@ -0,0 +1,129 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. 
+ *--------------------------------------------------------------------------------------------*/ + +import { type FieldEntry } from '@vscode-documentdb/schema-analyzer'; +import { toFieldCompletionItems } from './toFieldCompletionItems'; + +describe('toFieldCompletionItems', () => { + it('converts simple fields', () => { + const fields: FieldEntry[] = [ + { path: 'name', type: 'string', bsonType: 'string' }, + { path: 'age', type: 'number', bsonType: 'int32' }, + ]; + + const result = toFieldCompletionItems(fields); + + expect(result).toHaveLength(2); + expect(result[0].fieldName).toBe('name'); + expect(result[0].displayType).toBe('String'); + expect(result[0].bsonType).toBe('string'); + expect(result[0].insertText).toBe('name'); + + expect(result[1].fieldName).toBe('age'); + expect(result[1].displayType).toBe('Int32'); + expect(result[1].bsonType).toBe('int32'); + expect(result[1].insertText).toBe('age'); + }); + + it('escapes dotted paths in insertText', () => { + const fields: FieldEntry[] = [ + { path: 'address.city', type: 'string', bsonType: 'string' }, + { path: 'user.profile.bio', type: 'string', bsonType: 'string' }, + ]; + + const result = toFieldCompletionItems(fields); + + expect(result[0].insertText).toBe('"address.city"'); + expect(result[1].insertText).toBe('"user.profile.bio"'); + }); + + it('quotes field names with dashes', () => { + const fields: FieldEntry[] = [{ path: 'order-items', type: 'string', bsonType: 'string' }]; + const result = toFieldCompletionItems(fields); + expect(result[0].insertText).toBe('"order-items"'); + expect(result[0].fieldName).toBe('order-items'); // display stays unescaped + }); + + it('quotes field names with brackets', () => { + const fields: FieldEntry[] = [{ path: 'items[0]', type: 'string', bsonType: 'string' }]; + const result = toFieldCompletionItems(fields); + expect(result[0].insertText).toBe('"items[0]"'); + }); + + it('quotes field names starting with a digit', () => { + const fields: FieldEntry[] = [{ path: 
'123abc', type: 'string', bsonType: 'string' }]; + const result = toFieldCompletionItems(fields); + expect(result[0].insertText).toBe('"123abc"'); + }); + + it('escapes embedded double quotes in insertText', () => { + const fields: FieldEntry[] = [{ path: 'say"hi"', type: 'string', bsonType: 'string' }]; + const result = toFieldCompletionItems(fields); + expect(result[0].insertText).toBe('"say\\"hi\\""'); + expect(result[0].fieldName).toBe('say"hi"'); // display stays unescaped + }); + + it('escapes backslashes in insertText', () => { + const fields: FieldEntry[] = [{ path: 'back\\slash', type: 'string', bsonType: 'string' }]; + const result = toFieldCompletionItems(fields); + expect(result[0].insertText).toBe('"back\\\\slash"'); + }); + + it('does not quote valid identifiers', () => { + const fields: FieldEntry[] = [ + { path: 'name', type: 'string', bsonType: 'string' }, + { path: '_id', type: 'string', bsonType: 'objectid' }, + { path: '$type', type: 'string', bsonType: 'string' }, + ]; + const result = toFieldCompletionItems(fields); + expect(result[0].insertText).toBe('name'); + expect(result[1].insertText).toBe('_id'); + expect(result[2].insertText).toBe('$type'); + }); + + it('adds $ prefix to referenceText', () => { + const fields: FieldEntry[] = [ + { path: 'age', type: 'number', bsonType: 'int32' }, + { path: 'address.city', type: 'string', bsonType: 'string' }, + ]; + + const result = toFieldCompletionItems(fields); + + expect(result[0].referenceText).toBe('$age'); + expect(result[1].referenceText).toBe('$address.city'); + }); + + it('preserves isSparse', () => { + const fields: FieldEntry[] = [ + { path: 'name', type: 'string', bsonType: 'string', isSparse: false }, + { path: 'nickname', type: 'string', bsonType: 'string', isSparse: true }, + { path: 'email', type: 'string', bsonType: 'string' }, // undefined → false + ]; + + const result = toFieldCompletionItems(fields); + + expect(result[0].isSparse).toBe(false); + 
expect(result[1].isSparse).toBe(true); + expect(result[2].isSparse).toBe(false); + }); + + it('uses correct displayType', () => { + const fields: FieldEntry[] = [ + { path: '_id', type: 'string', bsonType: 'objectid' }, + { path: 'createdAt', type: 'string', bsonType: 'date' }, + { path: 'active', type: 'boolean', bsonType: 'boolean' }, + { path: 'score', type: 'number', bsonType: 'double' }, + { path: 'tags', type: 'array', bsonType: 'array' }, + ]; + + const result = toFieldCompletionItems(fields); + + expect(result[0].displayType).toBe('ObjectId'); + expect(result[1].displayType).toBe('Date'); + expect(result[2].displayType).toBe('Boolean'); + expect(result[3].displayType).toBe('Double'); + expect(result[4].displayType).toBe('Array'); + }); +}); diff --git a/src/utils/json/data-api/autocomplete/toFieldCompletionItems.ts b/src/utils/json/data-api/autocomplete/toFieldCompletionItems.ts new file mode 100644 index 000000000..63e85bd92 --- /dev/null +++ b/src/utils/json/data-api/autocomplete/toFieldCompletionItems.ts @@ -0,0 +1,80 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { BSONTypes, type FieldEntry } from '@vscode-documentdb/schema-analyzer'; + +/** + * Completion-ready data for a single field entry. + * + * Design intent: + * - `fieldName` is the human-readable, unescaped field path shown in the completion list. + * Users see clean names like "address.city" or "order-items" without quotes or escaping. + * - `insertText` is the escaped/quoted form that gets inserted when the user selects a + * completion item. For simple identifiers it matches `fieldName`; for names containing + * special characters (dots, spaces, `$`, etc.) 
/**
 * Completion-ready data for a single field entry.
 *
 * - `fieldName` is the unescaped field path shown in the completion list
 *   (e.g. "address.city", "order-items").
 * - `insertText` is what gets inserted when the item is selected. It equals `fieldName`
 *   when the path matches `JS_IDENTIFIER_PATTERN` (so `_id` and `$type` stay bare);
 *   any other path (dots, dashes, spaces, brackets, leading digit, ...) is emitted as a
 *   double-quoted, escaped string literal.
 * - `referenceText` is the `$`-prefixed aggregation field reference (e.g. "$age").
 */
export interface FieldCompletionData {
    /** The full dot-notated field name, e.g., "address.city" — kept unescaped for display */
    fieldName: string;
    /** Human-readable type display, e.g., "String", "Date", "ObjectId" */
    displayType: string;
    /** Raw BSON type from FieldEntry */
    bsonType: string;
    /** Whether the field was not present in every inspected document (statistical observation, not a constraint) */
    isSparse: boolean;
    /** Text to insert when the user selects this completion — quoted/escaped if the field name is not a plain identifier */
    insertText: string;
    /**
     * Field reference for aggregation expressions, e.g., "$age", "$address.city".
     *
     * TODO: The simple `$field.path` syntax is invalid MQL for field names containing dots,
     * spaces, or `$` characters. For such fields, the correct MQL syntax is
     * `{ $getField: "fieldName" }`. This should be addressed when the aggregation
     * completion provider is wired up — either by using `$getField` for special names
     * or by making `referenceText` optional for fields that cannot use the `$` prefix syntax.
     */
    referenceText: string;
}

/**
 * Matches the ASCII subset of JavaScript/TypeScript identifiers:
 * a letter, underscore, or dollar sign, followed by zero or more letters,
 * digits, underscores, or dollar signs.
 *
 * Field names that do NOT match this pattern must be quoted and escaped
 * in `insertText` to produce valid query expressions.
 */
const JS_IDENTIFIER_PATTERN = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/;

/**
 * Returns the text to insert for a field path: the path itself when it is a plain
 * identifier, otherwise a double-quoted string literal.
 *
 * `JSON.stringify` is used for quoting so that, besides `\` and `"`, control characters
 * such as newlines and tabs are escaped too and the result is always a single-line,
 * well-formed string literal.
 */
function quoteIfNeeded(path: string): string {
    return JS_IDENTIFIER_PATTERN.test(path) ? path : JSON.stringify(path);
}

/**
 * Converts an array of FieldEntry objects into completion-ready FieldCompletionData items.
 *
 * @param fields - Array of FieldEntry objects from getKnownFields
 * @returns Array of FieldCompletionData ready for use in editor completions, in input order
 */
export function toFieldCompletionItems(fields: FieldEntry[]): FieldCompletionData[] {
    return fields.map((entry) => ({
        fieldName: entry.path,
        displayType: BSONTypes.toDisplayString(entry.bsonType as BSONTypes),
        bsonType: entry.bsonType,
        // `isSparse` is optional on the entry; absence means "seen in every document".
        isSparse: entry.isSparse ?? false,
        insertText: quoteIfNeeded(entry.path),
        referenceText: `$${entry.path}`,
    }));
}
+ *--------------------------------------------------------------------------------------------*/ + +import { type JSONSchema } from '@vscode-documentdb/schema-analyzer'; +import { toTypeScriptDefinition } from './toTypeScriptDefinition'; + +describe('toTypeScriptDefinition', () => { + it('generates basic interface with primitive types', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + _id: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'objectid', + 'x-typeOccurrence': 100, + }, + ], + }, + name: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + age: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'number', + 'x-bsonType': 'int32', + 'x-typeOccurrence': 100, + }, + ], + }, + }, + }; + + const result = toTypeScriptDefinition(schema, 'users'); + + expect(result).toContain('interface UsersDocument {'); + expect(result).toContain(' _id: ObjectId;'); + expect(result).toContain(' name: string;'); + expect(result).toContain(' age: number;'); + expect(result).toContain('}'); + }); + + it('marks optional fields with ?', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + name: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + nickname: { + 'x-occurrence': 50, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 50, + }, + ], + }, + }, + }; + + const result = toTypeScriptDefinition(schema, 'users'); + + expect(result).toContain(' name: string;'); + expect(result).toContain(' nickname?: string;'); + }); + + it('handles nested objects as inline blocks', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + address: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'object', + 'x-bsonType': 'object', + 'x-typeOccurrence': 100, + 'x-documentsInspected': 100, + 
properties: { + city: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + zip: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + }, + }, + ], + }, + }, + }; + + const result = toTypeScriptDefinition(schema, 'users'); + + expect(result).toContain(' address: {'); + expect(result).toContain(' city: string;'); + expect(result).toContain(' zip: string;'); + expect(result).toContain(' };'); + }); + + it('handles arrays with element types', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + tags: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'array', + 'x-bsonType': 'array', + 'x-typeOccurrence': 100, + items: { + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + }, + ], + }, + }, + }; + + const result = toTypeScriptDefinition(schema, 'posts'); + + expect(result).toContain(' tags: string[];'); + }); + + it('handles polymorphic fields as unions', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + metadata: { + 'x-occurrence': 80, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 50, + }, + { + type: 'number', + 'x-bsonType': 'int32', + 'x-typeOccurrence': 20, + }, + { + type: 'null', + 'x-bsonType': 'null', + 'x-typeOccurrence': 10, + }, + ], + }, + }, + }; + + const result = toTypeScriptDefinition(schema, 'items'); + + expect(result).toContain(' metadata?: string | number | null;'); + }); + + it('PascalCase conversion for collection name', () => { + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, 'users')).toContain('interface UsersDocument'); + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, 'order_items')).toContain( + 'interface OrderItemsDocument', + ); + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, 
'my-awesome-collection')).toContain( + 'interface MyAwesomeCollectionDocument', + ); + }); + + it('prefixes with _ when collection name starts with a digit', () => { + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, '123abc')).toContain('interface _123abcDocument'); + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, '99_bottles')).toContain( + 'interface _99BottlesDocument', + ); + }); + + it('falls back to CollectionDocument when name is only separators', () => { + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, '---')).toContain('interface CollectionDocument'); + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, '_ _ _')).toContain( + 'interface CollectionDocument', + ); + }); + + it('falls back to CollectionDocument for empty string', () => { + expect(toTypeScriptDefinition({ 'x-documentsInspected': 0 }, '')).toContain('interface CollectionDocument'); + }); + + describe('special character field names', () => { + function makeSchemaWithField(fieldName: string): JSONSchema { + return { + 'x-documentsInspected': 100, + properties: { + [fieldName]: { + 'x-occurrence': 100, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 100, + }, + ], + }, + }, + }; + } + + it('leaves valid identifiers unquoted', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('age'), 'test'); + expect(result).toContain(' age: string;'); + }); + + it('leaves underscore-prefixed identifiers unquoted', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('_id'), 'test'); + expect(result).toContain(' _id: string;'); + }); + + it('leaves dollar-prefixed identifiers unquoted', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('$type'), 'test'); + expect(result).toContain(' $type: string;'); + }); + + it('quotes field names with dashes', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('order-items'), 'test'); + expect(result).toContain(' "order-items": 
string;'); + }); + + it('quotes field names with dots', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('a.b'), 'test'); + expect(result).toContain(' "a.b": string;'); + }); + + it('quotes field names with spaces', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('my field'), 'test'); + expect(result).toContain(' "my field": string;'); + }); + + it('quotes field names with brackets', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('items[0]'), 'test'); + expect(result).toContain(' "items[0]": string;'); + }); + + it('escapes embedded double quotes in field names', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('say"hi"'), 'test'); + expect(result).toContain(' "say\\"hi\\"": string;'); + }); + + it('escapes backslashes in field names', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('back\\slash'), 'test'); + expect(result).toContain(' "back\\\\slash": string;'); + }); + + it('quotes field names that start with a digit', () => { + const result = toTypeScriptDefinition(makeSchemaWithField('123abc'), 'test'); + expect(result).toContain(' "123abc": string;'); + }); + + it('preserves optionality with quoted field names', () => { + const schema: JSONSchema = { + 'x-documentsInspected': 100, + properties: { + 'order-items': { + 'x-occurrence': 50, + anyOf: [ + { + type: 'string', + 'x-bsonType': 'string', + 'x-typeOccurrence': 50, + }, + ], + }, + }, + }; + + const result = toTypeScriptDefinition(schema, 'test'); + expect(result).toContain(' "order-items"?: string;'); + }); + }); +}); diff --git a/src/utils/json/data-api/autocomplete/toTypeScriptDefinition.ts b/src/utils/json/data-api/autocomplete/toTypeScriptDefinition.ts new file mode 100644 index 000000000..17328dfeb --- /dev/null +++ b/src/utils/json/data-api/autocomplete/toTypeScriptDefinition.ts @@ -0,0 +1,272 @@ +/*--------------------------------------------------------------------------------------------- + * 
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { BSONTypes, type JSONSchema } from '@vscode-documentdb/schema-analyzer';

/**
 * Maps a BSON type string to the corresponding TypeScript type representation.
 * Types without an entry here (for example `object` and `array`, which are rendered
 * structurally elsewhere) resolve to `unknown` via `bsonTypeToTS`.
 */
const bsonToTypeScriptMap: Record<string, string> = {
    [BSONTypes.String]: 'string',
    [BSONTypes.Int32]: 'number',
    [BSONTypes.Double]: 'number',
    [BSONTypes.Long]: 'number',
    [BSONTypes.Decimal128]: 'number',
    [BSONTypes.Number]: 'number',
    [BSONTypes.Boolean]: 'boolean',
    [BSONTypes.Date]: 'Date',
    [BSONTypes.ObjectId]: 'ObjectId',
    [BSONTypes.Null]: 'null',
    [BSONTypes.Undefined]: 'undefined',
    [BSONTypes.Binary]: 'Binary',
    [BSONTypes.RegExp]: 'RegExp',
    [BSONTypes.UUID]: 'UUID',
    [BSONTypes.UUID_LEGACY]: 'UUID',
    [BSONTypes.Timestamp]: 'Timestamp',
    [BSONTypes.MinKey]: 'MinKey',
    [BSONTypes.MaxKey]: 'MaxKey',
    [BSONTypes.Code]: 'Code',
    [BSONTypes.CodeWithScope]: 'Code',
    [BSONTypes.DBRef]: 'DBRef',
    [BSONTypes.Map]: 'Map',
    [BSONTypes.Symbol]: 'symbol',
};

/**
 * Converts a BSON type string to a TypeScript type string.
 *
 * @param bsonType - BSON type name as stored in `x-bsonType`
 * @returns The mapped TypeScript type, or `'unknown'` when there is no mapping
 */
function bsonTypeToTS(bsonType: string): string {
    // Own-property check: a plain-object lookup would otherwise resolve inherited keys
    // such as "constructor" or "toString" to non-string values.
    const mapped = Object.prototype.hasOwnProperty.call(bsonToTypeScriptMap, bsonType)
        ? bsonToTypeScriptMap[bsonType]
        : undefined;
    return mapped ?? 'unknown';
}

/**
 * Matches the ASCII subset of JavaScript/TypeScript identifiers:
 * a letter, underscore, or dollar sign, followed by zero or more letters,
 * digits, underscores, or dollar signs.
 */
const JS_IDENTIFIER_PATTERN = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/;
/**
 * Returns a safe TypeScript property name for use in interface definitions.
 * If the name matches `JS_IDENTIFIER_PATTERN`, it is returned as-is. Otherwise it is
 * emitted as a double-quoted string literal via `JSON.stringify`, which escapes quotes,
 * backslashes and control characters (e.g. newlines) so the property stays on one line.
 *
 * Examples:
 * - "age" → "age" (valid identifier, unchanged)
 * - "order-items" → '"order-items"' (dash)
 * - "a.b" → '"a.b"' (dot)
 * - "my field" → '"my field"' (space)
 * - 'say"hi"' → '"say\\"hi\\""' (embedded quotes escaped)
 */
function safePropertyName(name: string): string {
    return JS_IDENTIFIER_PATTERN.test(name) ? name : JSON.stringify(name);
}

/**
 * Converts a collection name to PascalCase and appends "Document".
 * Every run of characters that cannot appear in an identifier (anything other than a
 * letter, a decimal digit, or `$`) is treated as a word separator, so names such as
 * "system.profile" still yield a valid interface name.
 * If the result would start with a digit, a leading `_` is prepended.
 * If the collection name contains only separators or is empty, falls back to "CollectionDocument".
 *
 * Examples:
 * - "users" → "UsersDocument"
 * - "order_items" → "OrderItemsDocument"
 * - "system.profile" → "SystemProfileDocument"
 * - "123abc" → "_123abcDocument"
 * - "---" → "CollectionDocument"
 */
function toInterfaceName(collectionName: string): string {
    const pascal = collectionName
        .split(/[^\p{L}\p{Nd}$]+/u)
        .filter((segment) => segment.length > 0)
        .map((segment) => segment.charAt(0).toUpperCase() + segment.slice(1))
        .join('');

    if (pascal.length === 0) {
        return 'CollectionDocument';
    }

    // Prefix with _ if the first character is a digit (invalid TS identifier start)
    const prefix = /^\p{Nd}/u.test(pascal) ? '_' : '';
    return `${prefix}${pascal}Document`;
}

/**
 * Generates a TypeScript interface definition string from a JSONSchema
 * produced by the SchemaAnalyzer.
 *
 * @param schema - The JSON Schema with x- extensions from SchemaAnalyzer
 * @param collectionName - The MongoDB API collection name, used to derive the interface name
 * @returns A formatted TypeScript interface definition string
 */
export function toTypeScriptDefinition(schema: JSONSchema, collectionName: string): string {
    const interfaceName = toInterfaceName(collectionName);
    const rootDocumentsInspected = (schema['x-documentsInspected'] as number) ?? 0;

    const lines: string[] = [`interface ${interfaceName} {`];

    if (schema.properties) {
        renderProperties(schema.properties, rootDocumentsInspected, 1, lines);
    }

    lines.push('}');
    return lines.join('\n');
}

/**
 * Renders property lines for a set of JSON Schema properties at a given indent level,
 * appending one `name[?]: type;` line per non-boolean property to `lines`.
 */
function renderProperties(
    properties: NonNullable<JSONSchema['properties']>,
    parentDocumentsInspected: number,
    indentLevel: number,
    lines: string[],
): void {
    // NOTE(review): the indent unit shows as a single space in this view of the file;
    // confirm the intended width against the expected output.
    const indent = ' '.repeat(indentLevel);

    for (const [propName, propSchema] of Object.entries(properties)) {
        if (typeof propSchema === 'boolean') continue;

        const optionalMarker = isFieldOptional(propSchema, parentDocumentsInspected) ? '?' : '';
        const tsType = resolveTypeString(propSchema, indentLevel);

        lines.push(`${indent}${safePropertyName(propName)}${optionalMarker}: ${tsType};`);
    }
}

/**
 * Returns true if the field's occurrence is less than the parent's document count.
 * A non-positive parent count never marks a field optional.
 */
function isFieldOptional(schemaNode: JSONSchema, parentDocumentsInspected: number): boolean {
    const occurrence = (schemaNode['x-occurrence'] as number) ?? 0;
    return parentDocumentsInspected > 0 && occurrence < parentDocumentsInspected;
}

/**
 * Converts each non-boolean `anyOf` entry to its TypeScript type string and returns the
 * distinct results in first-seen order.
 */
function collectTypeStrings(entries: NonNullable<JSONSchema['anyOf']>, indentLevel: number): string[] {
    const distinct: string[] = [];

    for (const entry of entries) {
        if (typeof entry === 'boolean') continue;
        const ts = singleEntryToTS(entry, indentLevel);
        if (ts && !distinct.includes(ts)) {
            distinct.push(ts);
        }
    }

    return distinct;
}

/**
 * Resolves a full TypeScript type string for a schema node by examining its
 * `anyOf` entries. Handles primitives, objects (inline blocks), and arrays.
 * Returns `'unknown'` when no entry yields a type.
 */
function resolveTypeString(schemaNode: JSONSchema, indentLevel: number): string {
    if (!schemaNode.anyOf || schemaNode.anyOf.length === 0) {
        return 'unknown';
    }

    const typeStrings = collectTypeStrings(schemaNode.anyOf, indentLevel);
    return typeStrings.length === 0 ? 'unknown' : typeStrings.join(' | ');
}

/**
 * Converts a single `anyOf` type entry to a TypeScript type string.
 */
function singleEntryToTS(entry: JSONSchema, indentLevel: number): string {
    const bsonType = (entry['x-bsonType'] as string) ?? '';

    // Object with nested properties → inline block
    if (entry.type === 'object' && entry.properties) {
        return renderInlineObject(entry, indentLevel);
    }

    // Array → determine element types
    if (entry.type === 'array' || bsonType === (BSONTypes.Array as string)) {
        return renderArrayType(entry, indentLevel);
    }

    // Primitive or mapped type
    if (bsonType) {
        return bsonTypeToTS(bsonType);
    }

    // Fallback to JSON type
    const jsonType = entry.type as string | undefined;
    return jsonType ? jsonType : 'unknown';
}

/**
 * Renders an inline object type `{ field: type; ... }`. Nested properties are indented one
 * level deeper than the owning property and the closing brace aligns with it.
 */
function renderInlineObject(entry: JSONSchema, indentLevel: number): string {
    const objectDocumentsInspected = (entry['x-documentsInspected'] as number) ?? 0;
    const lines: string[] = ['{'];

    if (entry.properties) {
        renderProperties(entry.properties, objectDocumentsInspected, indentLevel + 1, lines);
    }

    const closingIndent = ' '.repeat(indentLevel);
    lines.push(`${closingIndent}}`);

    return lines.join('\n');
}

/**
 * Renders an array type, e.g., `string[]` or `(string | number)[]`.
 * Falls back to `unknown[]` when `items` is missing, boolean, a tuple (array of schemas),
 * or yields no element type.
 */
function renderArrayType(entry: JSONSchema, indentLevel: number): string {
    const itemsSchema = entry.items;

    // Only a single item schema is interpreted; tuple-style `items` is not.
    if (!itemsSchema || typeof itemsSchema === 'boolean' || Array.isArray(itemsSchema)) {
        return 'unknown[]';
    }

    const itemSchema = itemsSchema as JSONSchema;

    if (itemSchema.anyOf && itemSchema.anyOf.length > 0) {
        const elementTypes = collectTypeStrings(itemSchema.anyOf, indentLevel);

        if (elementTypes.length === 0) {
            return 'unknown[]';
        }

        // A union needs parentheses so `[]` applies to the whole union.
        return elementTypes.length === 1 ? `${elementTypes[0]}[]` : `(${elementTypes.join(' | ')})[]`;
    }

    // Single item type without anyOf
    const bsonType = (itemSchema['x-bsonType'] as string) ?? '';
    return bsonType ? `${bsonTypeToTS(bsonType)}[]` : 'unknown[]';
}
- * The string representation is casesensitive and should match the MongoDB documentation. - * https://www.mongodb.com/docs/manual/reference/bson-types/ - */ -export enum MongoBSONTypes { - String = 'string', - Number = 'number', - Int32 = 'int32', - Double = 'double', - Decimal128 = 'decimal128', - Long = 'long', - Boolean = 'boolean', - Object = 'object', - Array = 'array', - Null = 'null', - Undefined = 'undefined', - Date = 'date', - RegExp = 'regexp', - Binary = 'binary', - ObjectId = 'objectid', - Symbol = 'symbol', - Timestamp = 'timestamp', - UUID = 'uuid', - UUID_LEGACY = 'uuid-legacy', // old UUID subtype, used in some legacy data - MinKey = 'minkey', - MaxKey = 'maxkey', - DBRef = 'dbref', - Code = 'code', - CodeWithScope = 'codewithscope', - Map = 'map', - // Add any deprecated types if necessary - _UNKNOWN_ = '_unknown_', // Catch-all for unknown types -} - -export namespace MongoBSONTypes { - const displayStringMap: Record = { - [MongoBSONTypes.String]: 'String', - [MongoBSONTypes.Number]: 'Number', - [MongoBSONTypes.Int32]: 'Int32', - [MongoBSONTypes.Double]: 'Double', - [MongoBSONTypes.Decimal128]: 'Decimal128', - [MongoBSONTypes.Long]: 'Long', - [MongoBSONTypes.Boolean]: 'Boolean', - [MongoBSONTypes.Object]: 'Object', - [MongoBSONTypes.Array]: 'Array', - [MongoBSONTypes.Null]: 'Null', - [MongoBSONTypes.Undefined]: 'Undefined', - [MongoBSONTypes.Date]: 'Date', - [MongoBSONTypes.RegExp]: 'RegExp', - [MongoBSONTypes.Binary]: 'Binary', - [MongoBSONTypes.ObjectId]: 'ObjectId', - [MongoBSONTypes.Symbol]: 'Symbol', - [MongoBSONTypes.Timestamp]: 'Timestamp', - [MongoBSONTypes.MinKey]: 'MinKey', - [MongoBSONTypes.MaxKey]: 'MaxKey', - [MongoBSONTypes.DBRef]: 'DBRef', - [MongoBSONTypes.Code]: 'Code', - [MongoBSONTypes.CodeWithScope]: 'CodeWithScope', - [MongoBSONTypes.Map]: 'Map', - [MongoBSONTypes._UNKNOWN_]: 'Unknown', - [MongoBSONTypes.UUID]: 'UUID', - [MongoBSONTypes.UUID_LEGACY]: 'UUID (Legacy)', - }; - - export function toDisplayString(type: 
MongoBSONTypes): string { - return displayStringMap[type] || 'Unknown'; - } - - export function toString(type: MongoBSONTypes): string { - return type; - } - - /** - * Converts a MongoDB data type to a case sensitive JSON data type - * @param type The MongoDB data type - * @returns A corresponding JSON data type (please note: it's case sensitive) - */ - export function toJSONType(type: MongoBSONTypes): string { - switch (type) { - case MongoBSONTypes.String: - case MongoBSONTypes.Symbol: - case MongoBSONTypes.Date: - case MongoBSONTypes.Timestamp: - case MongoBSONTypes.ObjectId: - case MongoBSONTypes.RegExp: - case MongoBSONTypes.Binary: - case MongoBSONTypes.Code: - case MongoBSONTypes.UUID: - case MongoBSONTypes.UUID_LEGACY: - return 'string'; - - case MongoBSONTypes.Boolean: - return 'boolean'; - - case MongoBSONTypes.Int32: - case MongoBSONTypes.Long: - case MongoBSONTypes.Double: - case MongoBSONTypes.Decimal128: - return 'number'; - - case MongoBSONTypes.Object: - case MongoBSONTypes.Map: - case MongoBSONTypes.DBRef: - case MongoBSONTypes.CodeWithScope: - return 'object'; - - case MongoBSONTypes.Array: - return 'array'; - - case MongoBSONTypes.Null: - case MongoBSONTypes.Undefined: - case MongoBSONTypes.MinKey: - case MongoBSONTypes.MaxKey: - return 'null'; - - default: - return 'string'; // Default to string for unknown types - } - } - - /** - * Accepts a value from a MongoDB 'Document' object and returns the inferred type. 
- * @param value The value of a field in a MongoDB 'Document' object - * @returns - */ - export function inferType(value: unknown): MongoBSONTypes { - if (value === null) return MongoBSONTypes.Null; - if (value === undefined) return MongoBSONTypes.Undefined; - - switch (typeof value) { - case 'string': - return MongoBSONTypes.String; - case 'number': - return MongoBSONTypes.Double; // JavaScript numbers are doubles - case 'boolean': - return MongoBSONTypes.Boolean; - case 'object': - if (Array.isArray(value)) { - return MongoBSONTypes.Array; - } - - // Check for common BSON types first - if (value instanceof ObjectId) return MongoBSONTypes.ObjectId; - if (value instanceof Int32) return MongoBSONTypes.Int32; - if (value instanceof Double) return MongoBSONTypes.Double; - if (value instanceof Date) return MongoBSONTypes.Date; - if (value instanceof Timestamp) return MongoBSONTypes.Timestamp; - - // Less common types - if (value instanceof Decimal128) return MongoBSONTypes.Decimal128; - if (value instanceof Long) return MongoBSONTypes.Long; - if (value instanceof MinKey) return MongoBSONTypes.MinKey; - if (value instanceof MaxKey) return MongoBSONTypes.MaxKey; - if (value instanceof BSONSymbol) return MongoBSONTypes.Symbol; - if (value instanceof DBRef) return MongoBSONTypes.DBRef; - if (value instanceof Map) return MongoBSONTypes.Map; - if (value instanceof UUID && value.sub_type === Binary.SUBTYPE_UUID) return MongoBSONTypes.UUID; - if (value instanceof UUID && value.sub_type === Binary.SUBTYPE_UUID_OLD) - return MongoBSONTypes.UUID_LEGACY; - if (value instanceof Buffer || value instanceof Binary) return MongoBSONTypes.Binary; - if (value instanceof RegExp) return MongoBSONTypes.RegExp; - if (value instanceof Code) { - if (value.scope) { - return MongoBSONTypes.CodeWithScope; - } else { - return MongoBSONTypes.Code; - } - } - - // Default to Object if none of the above match - return MongoBSONTypes.Object; - default: - // This should never happen, but if it does, 
we'll catch it here - // TODO: add telemetry somewhere to know when it happens (not here, this could get hit too often) - return MongoBSONTypes._UNKNOWN_; - } - } -} diff --git a/src/utils/json/mongo/SchemaAnalyzer.test.ts b/src/utils/json/mongo/SchemaAnalyzer.test.ts deleted file mode 100644 index 731791611..000000000 --- a/src/utils/json/mongo/SchemaAnalyzer.test.ts +++ /dev/null @@ -1,255 +0,0 @@ -/*--------------------------------------------------------------------------------------------- - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See License.txt in the project root for license information. - *--------------------------------------------------------------------------------------------*/ - -import { type JSONSchema, type JSONSchemaRef } from '../JSONSchema'; -import { getPropertyNamesAtLevel, updateSchemaWithDocument } from './SchemaAnalyzer'; -import { - arraysWithDifferentDataTypes, - complexDocument, - complexDocumentsArray, - complexDocumentWithOddTypes, - embeddedDocumentOnly, - flatDocument, - sparseDocumentsArray, -} from './mongoTestDocuments'; - -describe('DocumentDB Schema Analyzer', () => { - it('prints out schema for testing', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, embeddedDocumentOnly); - console.log(JSON.stringify(schema, null, 2)); - expect(schema).toBeDefined(); - }); - - it('supports many documents', () => { - const schema: JSONSchema = {}; - sparseDocumentsArray.forEach((doc) => updateSchemaWithDocument(schema, doc)); - expect(schema).toBeDefined(); - - // Check that 'x-documentsInspected' is correct - expect(schema['x-documentsInspected']).toBe(sparseDocumentsArray.length); - - // Check that the schema has the correct root properties - const expectedRootProperties = new Set(['_id', 'name', 'age', 'email', 'isActive', 'score', 'description']); - - expect(Object.keys(schema.properties || {})).toEqual( - 
expect.arrayContaining(Array.from(expectedRootProperties)), - ); - - // Check that the 'name' field is detected correctly - const nameField: JSONSchema = schema.properties?.['name']; - expect(nameField).toBeDefined(); - expect(nameField?.['x-occurrence']).toBeGreaterThan(0); - - // Access 'anyOf' to get the type entries - const nameFieldTypes = nameField.anyOf?.map((typeEntry) => typeEntry['type']); - expect(nameFieldTypes).toContain('string'); - - // Check that the 'age' field has the correct type - const ageField: JSONSchema = schema.properties?.['age']; - expect(ageField).toBeDefined(); - const ageFieldTypes = ageField.anyOf?.map((typeEntry) => typeEntry['type']); - expect(ageFieldTypes).toContain('number'); - - // Check that the 'isActive' field is a boolean - const isActiveField: JSONSchema = schema.properties?.['isActive']; - expect(isActiveField).toBeDefined(); - const isActiveTypes = isActiveField.anyOf?.map((typeEntry) => typeEntry['type']); - expect(isActiveTypes).toContain('boolean'); - - // Check that the 'description' field is optional (occurs in some documents) - const descriptionField = schema.properties?.['description']; - expect(descriptionField).toBeDefined(); - expect(descriptionField?.['x-occurrence']).toBeLessThan(sparseDocumentsArray.length); - }); - - it('detects all BSON types from flatDocument', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, flatDocument); - - // Check that all fields are detected - const expectedFields = Object.keys(flatDocument); - expect(Object.keys(schema.properties || {})).toEqual(expect.arrayContaining(expectedFields)); - - // Helper function to get the 'x-bsonType' from a field - function getBsonType(fieldName: string): string | undefined { - const field = schema.properties?.[fieldName]; - const anyOf = field?.anyOf; - return anyOf && anyOf[0]?.['x-bsonType']; - } - - // Check that specific BSON types are correctly identified - expect(getBsonType('int32Field')).toBe('int32'); - 
expect(getBsonType('doubleField')).toBe('double'); - expect(getBsonType('decimalField')).toBe('decimal128'); - expect(getBsonType('dateField')).toBe('date'); - expect(getBsonType('objectIdField')).toBe('objectid'); - expect(getBsonType('codeField')).toBe('code'); - expect(getBsonType('uuidField')).toBe('uuid'); - expect(getBsonType('uuidLegacyField')).toBe('uuid-legacy'); - }); - - it('detects embedded objects correctly', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, embeddedDocumentOnly); - - // Check that the root properties are detected - expect(schema.properties).toHaveProperty('personalInfo'); - expect(schema.properties).toHaveProperty('jobInfo'); - - // Access 'personalInfo' properties - const personalInfoAnyOf = schema.properties && schema.properties['personalInfo']?.anyOf; - const personalInfoProperties = personalInfoAnyOf?.[0]?.properties; - expect(personalInfoProperties).toBeDefined(); - expect(personalInfoProperties).toHaveProperty('name'); - expect(personalInfoProperties).toHaveProperty('age'); - expect(personalInfoProperties).toHaveProperty('married'); - expect(personalInfoProperties).toHaveProperty('address'); - - // Access 'address' properties within 'personalInfo' - const addressAnyOf = personalInfoProperties['address'].anyOf; - const addressProperties = addressAnyOf?.[0]?.properties; - expect(addressProperties).toBeDefined(); - expect(addressProperties).toHaveProperty('street'); - expect(addressProperties).toHaveProperty('city'); - expect(addressProperties).toHaveProperty('zip'); - }); - - it('detects arrays and their element types correctly', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, arraysWithDifferentDataTypes); - - // Check that arrays are detected - expect(schema.properties).toHaveProperty('integersArray'); - expect(schema.properties).toHaveProperty('stringsArray'); - expect(schema.properties).toHaveProperty('booleansArray'); - 
expect(schema.properties).toHaveProperty('mixedArray'); - expect(schema.properties).toHaveProperty('datesArray'); - - // Helper function to get item types from an array field - function getArrayItemTypes(fieldName: string): string[] | undefined { - const field = schema.properties?.[fieldName]; - const anyOf = field?.anyOf; - const itemsAnyOf: JSONSchemaRef[] = anyOf?.[0]?.items?.anyOf; - return itemsAnyOf?.map((typeEntry) => typeEntry['type']); - } - - // Check that 'integersArray' has elements of type 'number' - const integerItemTypes = getArrayItemTypes('integersArray'); - expect(integerItemTypes).toContain('number'); - - // Check that 'stringsArray' has elements of type 'string' - const stringItemTypes = getArrayItemTypes('stringsArray'); - expect(stringItemTypes).toContain('string'); - - // Check that 'mixedArray' contains multiple types - const mixedItemTypes = getArrayItemTypes('mixedArray'); - expect(mixedItemTypes).toEqual(expect.arrayContaining(['number', 'string', 'boolean', 'object', 'null'])); - }); - - it('handles arrays within objects and objects within arrays', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, complexDocument); - - // Access 'user.profile.hobbies' - const userProfile = schema.properties && schema.properties['user'].anyOf?.[0]?.properties?.['profile']; - const hobbies = userProfile?.anyOf?.[0]?.properties?.['hobbies']; - // eslint-disable-next-line @typescript-eslint/no-unsafe-call - const hobbiesItemTypes = hobbies?.anyOf?.[0]?.items?.anyOf?.map((typeEntry) => typeEntry['type']); - expect(hobbiesItemTypes).toContain('string'); - - // Access 'user.profile.addresses' - const addresses = userProfile?.anyOf?.[0]?.properties?.['addresses']; - // eslint-disable-next-line @typescript-eslint/no-unsafe-call - const addressItemTypes = addresses?.anyOf?.[0]?.items?.anyOf?.map((typeEntry) => typeEntry['type']); - expect(addressItemTypes).toContain('object'); - - // Check that 'orders' is an array - const orders = 
schema.properties && schema.properties['orders']; - expect(orders).toBeDefined(); - const ordersType = orders.anyOf?.[0]?.type; - expect(ordersType).toBe('array'); - - // Access 'items' within 'orders' - const orderItems = orders.anyOf?.[0]?.items?.anyOf?.[0]?.properties?.['items']; - const orderItemsType = orderItems?.anyOf?.[0]?.type; - expect(orderItemsType).toBe('array'); - }); - - it('updates schema correctly when processing multiple documents', () => { - const schema: JSONSchema = {}; - complexDocumentsArray.forEach((doc) => updateSchemaWithDocument(schema, doc)); - - // Check that 'x-documentsInspected' is correct - expect(schema['x-documentsInspected']).toBe(complexDocumentsArray.length); - - // Check that some fields are present from different documents - expect(schema.properties).toHaveProperty('stringField'); - expect(schema.properties).toHaveProperty('personalInfo'); - expect(schema.properties).toHaveProperty('integersArray'); - expect(schema.properties).toHaveProperty('user'); - - // Check that 'integersArray' has correct min and max values - const integersArray = schema.properties && schema.properties['integersArray']; - const integerItemType = integersArray.anyOf?.[0]?.items?.anyOf?.[0]; - expect(integerItemType?.['x-minValue']).toBe(1); - expect(integerItemType?.['x-maxValue']).toBe(5); - - // Check that 'orders.items.price' is detected as Decimal128 - const orders = schema.properties && schema.properties['orders']; - const orderItems = orders.anyOf?.[0]?.items?.anyOf?.[0]?.properties?.['items']; - const priceField = orderItems?.anyOf?.[0]?.items?.anyOf?.[0]?.properties?.['price']; - const priceFieldType = priceField?.anyOf?.[0]; - expect(priceFieldType?.['x-bsonType']).toBe('decimal128'); - }); - - describe('traverses schema', () => { - it('with valid paths', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, complexDocument); - - let propertiesAtRoot = getPropertyNamesAtLevel(schema, []); - 
expect(propertiesAtRoot).toHaveLength(4); - - propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user']); - expect(propertiesAtRoot).toHaveLength(3); - - propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user', 'profile']); - expect(propertiesAtRoot).toHaveLength(4); - }); - - it('with broken paths', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, complexDocument); - - const propertiesAtRoot = getPropertyNamesAtLevel(schema, []); - expect(propertiesAtRoot).toHaveLength(4); - - expect(() => getPropertyNamesAtLevel(schema, ['no-entry'])).toThrow(); - - expect(() => getPropertyNamesAtLevel(schema, ['user', 'no-entry'])).toThrow(); - }); - - it('with sparse docs and mixed types', () => { - const schema: JSONSchema = {}; - updateSchemaWithDocument(schema, complexDocument); - updateSchemaWithDocument(schema, complexDocumentWithOddTypes); - - let propertiesAtRoot = getPropertyNamesAtLevel(schema, []); - expect(propertiesAtRoot).toHaveLength(4); - - propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user']); - expect(propertiesAtRoot).toHaveLength(3); - expect(propertiesAtRoot).toEqual(['email', 'profile', 'username']); - - propertiesAtRoot = getPropertyNamesAtLevel(schema, ['user', 'profile']); - expect(propertiesAtRoot).toHaveLength(4); - expect(propertiesAtRoot).toEqual(['addresses', 'firstName', 'hobbies', 'lastName']); - - propertiesAtRoot = getPropertyNamesAtLevel(schema, ['history']); - expect(propertiesAtRoot).toHaveLength(6); - }); - }); -}); diff --git a/src/utils/json/mongo/autocomplete/getKnownFields.ts b/src/utils/json/mongo/autocomplete/getKnownFields.ts deleted file mode 100644 index a82277a73..000000000 --- a/src/utils/json/mongo/autocomplete/getKnownFields.ts +++ /dev/null @@ -1,95 +0,0 @@ -/*--------------------------------------------------------------------------------------------- - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. 
See License.txt in the project root for license information. - *--------------------------------------------------------------------------------------------*/ - -import Denque from 'denque'; -import { type JSONSchema } from '../../JSONSchema'; - -export interface FieldEntry { - path: string; - type: string; -} - -/** - * This function traverses our JSON Schema object and collects all leaf property paths - * along with their most common data types. - * - * This information is needed for auto-completion support - * - * The approach is as follows: - * - Initialize a queue with the root properties of the schema to perform a breadth-first traversal. - * - While the queue is not empty: - * - Dequeue the next item, which includes the current schema node and its path. - * - Determine the most common type for the current node by looking at the 'x-typeOccurrence' field. - * - If the most common type is an object with properties: - * - Enqueue its child properties with their updated paths into the queue for further traversal. - * - Else if the most common type is a leaf type (e.g., string, number, boolean): - * - Add the current path and type to the result array as it represents a leaf property. - * - Continue this process until all nodes have been processed. - * - Return the result array containing objects with 'path' and 'type' for each leaf property. 
- */ -export function getKnownFields(schema: JSONSchema): FieldEntry[] { - const result: Array<{ path: string; type: string }> = []; - type QueueItem = { - path: string; - schemaNode: JSONSchema; - }; - - const queue: Denque = new Denque(); - - // Initialize the queue with root properties - if (schema.properties) { - for (const propName of Object.keys(schema.properties)) { - const propSchema = schema.properties[propName] as JSONSchema; - queue.push({ path: propName, schemaNode: propSchema }); - } - } - - while (queue.length > 0) { - const item = queue.shift(); - if (!item) continue; - - const { path, schemaNode } = item; - const mostCommonTypeEntry = getMostCommonTypeEntry(schemaNode); - - if (mostCommonTypeEntry) { - if (mostCommonTypeEntry.type === 'object' && mostCommonTypeEntry.properties) { - // Not a leaf node, enqueue its properties - for (const childName of Object.keys(mostCommonTypeEntry.properties)) { - const childSchema = mostCommonTypeEntry.properties[childName] as JSONSchema; - queue.push({ path: `${path}.${childName}`, schemaNode: childSchema }); - } - } else { - // Leaf node, add to result - result.push({ path: path, type: mostCommonTypeEntry.type as string }); - } - } - } - - return result; -} - -/** - * Helper function to get the most common type entry from a schema node. - * It looks for the 'anyOf' array and selects the type with the highest 'x-typeOccurrence'. 
- */ -function getMostCommonTypeEntry(schemaNode: JSONSchema): JSONSchema | null { - if (schemaNode.anyOf && schemaNode.anyOf.length > 0) { - let maxOccurrence = -1; - let mostCommonTypeEntry: JSONSchema | null = null; - - for (const typeEntry of schemaNode.anyOf as JSONSchema[]) { - const occurrence = typeEntry['x-typeOccurrence'] || 0; - if (occurrence > maxOccurrence) { - maxOccurrence = occurrence; - mostCommonTypeEntry = typeEntry; - } - } - return mostCommonTypeEntry; - } else if (schemaNode.type) { - // If 'anyOf' is not present, use the 'type' field directly - return schemaNode; - } - return null; -} diff --git a/src/utils/slickgrid/mongo/toSlickGridTable.test.ts b/src/utils/slickgrid/mongo/toSlickGridTable.test.ts index 69156bf1b..4b1d0af3f 100644 --- a/src/utils/slickgrid/mongo/toSlickGridTable.test.ts +++ b/src/utils/slickgrid/mongo/toSlickGridTable.test.ts @@ -76,7 +76,6 @@ describe('toSlickGridTable', () => { it('at a nested level', () => { const tableData = getDataAtPath(mongoDocuments, ['nestedDocument']); - console.log(tableData); expect(tableData).toHaveLength(5); expect(tableData[0]['key']).toBeDefined(); diff --git a/src/utils/slickgrid/mongo/toSlickGridTable.ts b/src/utils/slickgrid/mongo/toSlickGridTable.ts index 737fcb7c0..5deb51fe0 100644 --- a/src/utils/slickgrid/mongo/toSlickGridTable.ts +++ b/src/utils/slickgrid/mongo/toSlickGridTable.ts @@ -3,11 +3,10 @@ * Licensed under the MIT License. See License.txt in the project root for license information. 
*--------------------------------------------------------------------------------------------*/ +import { BSONTypes, valueToDisplayString } from '@vscode-documentdb/schema-analyzer'; import { EJSON } from 'bson'; import { type Document, type WithId } from 'mongodb'; import { type TableDataEntry } from '../../../documentdb/ClusterSession'; -import { MongoBSONTypes } from '../../json/mongo/MongoBSONTypes'; -import { valueToDisplayString } from '../../json/mongo/MongoValueFormatters'; /** * Extracts data from a list of MongoDB documents at a specified path. @@ -45,8 +44,8 @@ export function getDataAtPath(documents: WithId[], path: string[]): Ta // we also make sure that the '_id' field is always included in the data! if (doc._id) { row['_id'] = { - value: valueToDisplayString(doc._id, MongoBSONTypes.inferType(doc._id)), - type: MongoBSONTypes.inferType(doc._id), + value: valueToDisplayString(doc._id, BSONTypes.inferType(doc._id)), + type: BSONTypes.inferType(doc._id), }; // TODO: problem here -> what if the user has a field with this name... 
row['x-objectid'] = EJSON.stringify(doc._id, { relaxed: false }); // this is crucial, we need to retain the _id field for future queries from the table view @@ -72,13 +71,13 @@ export function getDataAtPath(documents: WithId[], path: string[]): Ta continue; } else { const value: unknown = subdocument[key]; - const type: MongoBSONTypes = MongoBSONTypes.inferType(value); + const type: BSONTypes = BSONTypes.inferType(value); // eslint-disable-next-line if (value instanceof Array) { row[key] = { value: `array[${value.length}]`, - type: MongoBSONTypes.Array, + type: BSONTypes.Array, }; } else { row[key] = { value: valueToDisplayString(value, type), type: type }; diff --git a/src/utils/slickgrid/mongo/toSlickGridTree.ts b/src/utils/slickgrid/mongo/toSlickGridTree.ts index 9d3742cfe..849ad42b6 100644 --- a/src/utils/slickgrid/mongo/toSlickGridTree.ts +++ b/src/utils/slickgrid/mongo/toSlickGridTree.ts @@ -3,9 +3,8 @@ * Licensed under the MIT License. See License.txt in the project root for license information. *--------------------------------------------------------------------------------------------*/ +import { BSONTypes, valueToDisplayString } from '@vscode-documentdb/schema-analyzer'; import { type Document, type ObjectId, type WithId } from 'mongodb'; -import { MongoBSONTypes } from '../../json/mongo/MongoBSONTypes'; -import { valueToDisplayString } from '../../json/mongo/MongoValueFormatters'; /** * The data structure for a single node entry in the tree data structure for SlickGrid. 
@@ -113,10 +112,10 @@ export function documentToSlickGridTree(document: WithId, idPrefix?: s continue; } - const dataType: MongoBSONTypes = MongoBSONTypes.inferType(stackEntry.value); + const dataType: BSONTypes = BSONTypes.inferType(stackEntry.value); switch (dataType) { - case MongoBSONTypes.Object: { + case BSONTypes.Object: { tree.push({ id: globalEntryId, field: `${stackEntry.key}`, @@ -131,7 +130,7 @@ export function documentToSlickGridTree(document: WithId, idPrefix?: s }); break; } - case MongoBSONTypes.Array: { + case BSONTypes.Array: { const value = stackEntry.value as unknown[]; tree.push({ @@ -157,7 +156,7 @@ export function documentToSlickGridTree(document: WithId, idPrefix?: s id: globalEntryId, field: `${stackEntry.key}`, value: valueToDisplayString(stackEntry.value, dataType), - type: MongoBSONTypes.toDisplayString(MongoBSONTypes.inferType(stackEntry.value)), + type: BSONTypes.toDisplayString(BSONTypes.inferType(stackEntry.value)), parentId: stackEntry.parentId, }); break; diff --git a/src/webviews/documentdb/collectionView/collectionViewRouter.ts b/src/webviews/documentdb/collectionView/collectionViewRouter.ts index fec8eb05d..62ef2f476 100644 --- a/src/webviews/documentdb/collectionView/collectionViewRouter.ts +++ b/src/webviews/documentdb/collectionView/collectionViewRouter.ts @@ -4,6 +4,7 @@ *--------------------------------------------------------------------------------------------*/ import { callWithTelemetryAndErrorHandling, type IActionContext } from '@microsoft/vscode-azext-utils'; +import { type FieldEntry } from '@vscode-documentdb/schema-analyzer'; import * as fs from 'fs'; import { type Document } from 'mongodb'; import * as path from 'path'; @@ -12,7 +13,6 @@ import { type JSONSchema } from 'vscode-json-languageservice'; import { z } from 'zod'; import { ClusterSession } from '../../../documentdb/ClusterSession'; import { getConfirmationAsInSettings } from '../../../utils/dialogs/getConfirmation'; -import { getKnownFields, type 
FieldEntry } from '../../../utils/json/mongo/autocomplete/getKnownFields'; import { publicProcedureWithTelemetry, router, type WithTelemetry } from '../../api/extension-server/trpc'; import * as l10n from '@vscode/l10n'; @@ -40,8 +40,8 @@ import { ext } from '../../../extensionVariables'; import { QueryInsightsAIService } from '../../../services/ai/QueryInsightsAIService'; import { type CollectionItem } from '../../../tree/documentdb/CollectionItem'; // eslint-disable-next-line import/no-internal-modules -import basicFindQuerySchema from '../../../utils/json/mongo/autocomplete/basicMongoFindFilterSchema.json'; -import { generateMongoFindJsonSchema } from '../../../utils/json/mongo/autocomplete/generateMongoFindJsonSchema'; +import basicFindQuerySchema from '../../../utils/json/data-api/autocomplete/basicMongoFindFilterSchema.json'; +import { generateMongoFindJsonSchema } from '../../../utils/json/data-api/autocomplete/generateMongoFindJsonSchema'; import { promptAfterActionEventually } from '../../../utils/survey'; import { UsageImpact } from '../../../utils/surveyTypes'; import { type BaseRouterContext } from '../../api/configuration/appRouter'; @@ -241,8 +241,7 @@ export const collectionsViewRouter = router({ const session: ClusterSession = ClusterSession.getSession(myCtx.sessionId); - const _currentJsonSchema = session.getCurrentSchema(); - const autoCompletionData: FieldEntry[] = getKnownFields(_currentJsonSchema); + const autoCompletionData: FieldEntry[] = session.getKnownFields(); let querySchema: JSONSchema; diff --git a/src/webviews/documentdb/collectionView/components/queryEditor/QueryEditor.tsx b/src/webviews/documentdb/collectionView/components/queryEditor/QueryEditor.tsx index 7b6d7e8ec..b03e5e9bb 100644 --- a/src/webviews/documentdb/collectionView/components/queryEditor/QueryEditor.tsx +++ b/src/webviews/documentdb/collectionView/components/queryEditor/QueryEditor.tsx @@ -11,7 +11,7 @@ import { InputWithProgress } from 
'../../../../components/InputWithProgress'; // eslint-disable-next-line import/no-internal-modules import type * as monacoEditor from 'monaco-editor/esm/vs/editor/editor.api'; // eslint-disable-next-line import/no-internal-modules -import basicFindQuerySchema from '../../../../../utils/json/mongo/autocomplete/basicMongoFindFilterSchema.json'; +import basicFindQuerySchema from '../../../../../utils/json/data-api/autocomplete/basicMongoFindFilterSchema.json'; import { useConfiguration } from '../../../../api/webview-client/useConfiguration'; import { type CollectionViewWebviewConfigurationType } from '../../collectionViewController'; diff --git a/test/mongoGetCommand.test.ts b/test/mongoGetCommand.test.ts index 7b4ce3f4d..bf34fa867 100644 --- a/test/mongoGetCommand.test.ts +++ b/test/mongoGetCommand.test.ts @@ -797,7 +797,6 @@ suite('scrapbook parsing Tests', () => { const commands: MongoCommand[] = getAllCommandsFromText(text); const command: MongoCommand = findCommandAtPosition(commands, new Position(0, 0)); const generatedRegExp = (nonNullProp(command, 'argumentObjects')[0]).sku; - console.log('generatedRegExp', generatedRegExp); assert.deepEqual(generatedRegExp.options, 'i'); assert.deepEqual(generatedRegExp.pattern, '789$'); }); @@ -838,11 +837,8 @@ suite('scrapbook parsing Tests', () => { // The regex parsing tests following this test should help zero-in on which case isn't handled properly. 
test('test regular expression parsing - with many special cases', () => { const text = `db.test1.beep.find({ sku: /^(hello?= world).*[^0-9]+|(world\\b\\*){0,2}$/ })`; - console.log(text); const commands: MongoCommand[] = getAllCommandsFromText(text); - console.log('commands', commands); const command: MongoCommand = findCommandAtPosition(commands, new Position(0, 0)); - console.log('command', command); const generatedRegExp = (nonNullProp(command, 'argumentObjects')[0]).sku; assert.deepEqual(generatedRegExp.options, ''); assert.deepEqual(generatedRegExp.pattern, '^(hello?= world).*[^0-9]+|(world\\b\\*){0,2}$'); diff --git a/tsconfig.json b/tsconfig.json index 894220ad0..ae69b6474 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -30,5 +30,6 @@ ] */ }, - "exclude": ["node_modules", ".vscode-test"] + "exclude": ["node_modules", ".vscode-test", "packages/*/dist"], + "references": [{ "path": "packages/schema-analyzer" }] }