From 13ef5c63f1bd1df1ecc440ed7f463d07c9f687f1 Mon Sep 17 00:00:00 2001 From: Contentrain Date: Tue, 14 Apr 2026 14:03:00 +0300 Subject: [PATCH 01/12] docs: complete CLI flag coverage and add Studio cross-references CLI docs (docs/packages/cli.md): - validate: add --interactive, --model, --json, --root flags with table - studio login: add --provider, --url flags with table - studio login: document CONTENTRAIN_STUDIO_URL env var - studio connect: add --json flag with table - studio submissions: add --status filter example Studio cross-references across docs: - demo.md: add "Ready for Team Collaboration?" section with connect flow - normalize.md: add Studio tip after content extraction progression - serve-ui.md: add connect link in "Beyond Local" section - reference/field-types.md: add shared vocabulary info box - reference/model-kinds.md: add unified surfaces tip - reference/config.md: add shared configuration info box - packages/types.md: add Studio to ecosystem role list Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/demo.md | 11 ++++++++ docs/guides/normalize.md | 11 ++++++++ docs/guides/serve-ui.md | 2 ++ docs/packages/cli.md | 48 ++++++++++++++++++++++++++++++++++- docs/packages/types.md | 1 + docs/reference/config.md | 4 +++ docs/reference/field-types.md | 4 +++ docs/reference/model-kinds.md | 4 +++ 8 files changed, 84 insertions(+), 1 deletion(-) diff --git a/docs/demo.md b/docs/demo.md index 489f91f..b10197d 100644 --- a/docs/demo.md +++ b/docs/demo.md @@ -117,3 +117,14 @@ This one flow shows Contentrain's full value: - Follow the full [Getting Started](/getting-started) guide - Read the full [Normalize Flow](/guides/normalize) - Use [Framework Integration](/guides/frameworks) for your stack + +## Ready for Team Collaboration? + +This demo shows extraction and reuse locally. 
When your team needs role-based review, web-based collaboration, or CDN delivery for mobile and non-web platforms, connect your project to [Contentrain Studio](/studio): + +```bash +contentrain studio login +contentrain studio connect +``` + +Studio uses the same `.contentrain/` content model — no changes needed. diff --git a/docs/guides/normalize.md b/docs/guides/normalize.md index 2b25c9c..edee0a5 100644 --- a/docs/guides/normalize.md +++ b/docs/guides/normalize.md @@ -423,6 +423,17 @@ Normalize solves this in minutes. And once your content is extracted and structu normalize → content exists → SDK queries work → i18n is possible → Studio review makes sense ``` +::: tip Ready for Team Collaboration? +After extracting content, connect your project to [Contentrain Studio](/studio) for team review, CDN delivery, and collaboration: + +```bash +contentrain studio login +contentrain studio connect +``` + +See [CLI Studio Integration](/packages/cli#connecting-a-repository) for the full setup flow. +::: + ## Important Rules ::: warning diff --git a/docs/guides/serve-ui.md b/docs/guides/serve-ui.md index f26a1a4..a139a22 100644 --- a/docs/guides/serve-ui.md +++ b/docs/guides/serve-ui.md @@ -285,3 +285,5 @@ If content created by the agent does not appear: - **Chat-first agent** — talk to your agent through a web interface with full MCP access - **Content CDN** — publish approved content for mobile and non-web platforms - **Audit trail** — track who created, reviewed, and approved every content change + +Connect your local project to Studio with [`contentrain studio connect`](/packages/cli#connecting-a-repository) — it detects your repo, installs the GitHub App, and creates the project in one interactive flow. 
diff --git a/docs/packages/cli.md b/docs/packages/cli.md index d112e04..2bc984f 100644 --- a/docs/packages/cli.md +++ b/docs/packages/cli.md @@ -136,8 +136,25 @@ contentrain validate # Auto-fix structural issues and create a review branch contentrain validate --fix + +# Interactive mode — choose which issues to fix +contentrain validate --interactive + +# Validate a single model +contentrain validate --model blog-posts + +# JSON output for CI pipelines +contentrain validate --json ``` +| Flag | Description | +|------|-------------| +| `--fix` | Auto-fix structural issues and create a review branch | +| `--interactive` | Choose which issues to fix interactively | +| `--model <id>` | Validate a single model instead of all | +| `--json` | Output results as JSON (for CI/CD) | +| `--root <path>` | Project root path | + Validation catches: - Missing required fields - Type mismatches (string where integer expected) @@ -303,6 +320,12 @@ The `studio` command group connects the CLI to [Contentrain Studio](/studio) — # Sign in via GitHub or Google OAuth contentrain studio login +# Select provider directly +contentrain studio login --provider github + +# Connect to a self-hosted Studio instance +contentrain studio login --url https://studio.example.com + # Check who you're logged in as contentrain studio whoami @@ -310,7 +333,19 @@ contentrain studio whoami contentrain studio logout ``` -Credentials are stored in `~/.contentrain/credentials.json` with `0o600` permissions — never inside the project directory. For CI/CD, set the `CONTENTRAIN_STUDIO_TOKEN` environment variable to skip interactive login. +| Flag | Description | +|------|-------------| +| `--provider <name>` | Skip provider selection prompt | +| `--url <url>` | Studio instance URL (default: `https://studio.contentrain.io`) | + +Credentials are stored in `~/.contentrain/credentials.json` with `0o600` permissions — never inside the project directory. 
+ +**Environment variables:** + +| Variable | Description | +|----------|-------------| +| `CONTENTRAIN_STUDIO_TOKEN` | Skip interactive login in CI/CD | +| `CONTENTRAIN_STUDIO_URL` | Override Studio instance URL | ### Connecting a Repository @@ -320,8 +355,16 @@ contentrain studio connect # Skip workspace selection contentrain studio connect --workspace ws-123 + +# JSON output for scripting +contentrain studio connect --json ``` +| Flag | Description | +|------|-------------| +| `--workspace <id>` | Skip workspace selection prompt | +| `--json` | Output result as JSON (workspace, project, repository, scan) | + The `connect` command links your local repository to a Studio project in one interactive flow: 1. **Workspace** — select an existing workspace (auto-selects if only one) @@ -377,6 +420,9 @@ contentrain studio webhooks # Manage form submissions: list, approve, reject contentrain studio submissions --form contact-form + +# Filter by status +contentrain studio submissions --form contact-form --status pending ``` All `studio` commands support `--json` for CI/CD integration and `--workspace` / `--project` flags to skip interactive selection. 
diff --git a/docs/packages/types.md b/docs/packages/types.md index 1d96eb7..6c784ef 100644 --- a/docs/packages/types.md +++ b/docs/packages/types.md @@ -24,6 +24,7 @@ Without a single source of truth, each package would define its own `ModelDefini - **CLI** reads `ContentrainConfig` and `ContextJson` - **SDK codegen** consumes `ModelDefinition` and `FieldDef` - **Rules** align with the same model and workflow vocabulary +- **[Contentrain Studio](/studio)** operates on the same type contract — schemas defined locally work identically in team workflows ::: ## Install diff --git a/docs/reference/config.md b/docs/reference/config.md index 25e6f60..78ea398 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -50,6 +50,10 @@ The `.contentrain/` directory is the central hub for all Contentrain data: Models with a `content_path` override store their content files outside `.contentrain/content/` — for example, directly in `content/blog/` or `locales/`. The meta files always remain in `.contentrain/meta/`. ::: +::: info Shared Configuration +`config.json` defines the project contract used by both Contentrain AI (local packages and CLI) and [Contentrain Studio](/studio). The same configuration governs local and team workflows — changes propagate across both surfaces. +::: + ## config.json The primary project configuration file. Created by `contentrain init` and updated by MCP tools. 
diff --git a/docs/reference/field-types.md b/docs/reference/field-types.md index 03b3e68..6f52215 100644 --- a/docs/reference/field-types.md +++ b/docs/reference/field-types.md @@ -438,3 +438,7 @@ The `array` type can hold simple values (`items: "string"`) or complex objects ( | `accept` | image, video, file | Comma-separated MIME types | | `maxSize` | image, video, file | Maximum file size in bytes | | `description` | All | Human-readable description of the field | + +::: info Shared Across Contentrain AI and Studio +These 27 field types define content schemas used identically in both local workflows (`@contentrain/mcp`, CLI) and [Contentrain Studio](/studio). Content modeled locally works seamlessly in team workflows without changes. +::: diff --git a/docs/reference/model-kinds.md b/docs/reference/model-kinds.md index 41b8cf9..16357a8 100644 --- a/docs/reference/model-kinds.md +++ b/docs/reference/model-kinds.md @@ -586,3 +586,7 @@ A model can reference itself for tree structures: | **Cascade warning** | Deleting a referenced entry triggers a validation warning | | **Array ordering** | `relations` array order is preserved | | **Min/max** | `relations` supports `min` and `max` element count | + +::: tip Unified Across Surfaces +Model kinds, storage formats, and content paths are the same whether you define them locally with `@contentrain/mcp` or in [Contentrain Studio](/studio). Content created in one surface works seamlessly in the other. +::: From 567802819f63de2f0db7f6697b175ee51a07593e Mon Sep 17 00:00:00 2001 From: Contentrain Date: Tue, 14 Apr 2026 15:41:09 +0300 Subject: [PATCH 02/12] feat(mcp): redesign scan pipeline with confidence scoring and deduplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace binary include/exclude pre-filter with a two-phase pipeline inspired by i18next-cli's proven approach, adapted to leverage our richer AST metadata. 
Phase 1 — shouldSkip(): deterministic non-content removal - Technical identifier rule (/^[a-z][a-z0-9_-]*$/ < 30 chars, template_text exempt) - i18n key detection (dot.separated.paths) - SVG patterns, CSS utilities, URL/path detection - Known function argument filtering (t/$t → i18n, emit/$emit → events) - Error codes, placeholders, Vue emit events Phase 2 — calculateContentScore(): 0-1 confidence scoring - Context signals: template_text +0.3, translatable attributes +0.2, translatable properties +0.25 - Value signals: multi-word +0.2, punctuation +0.1, non-ASCII +0.15, capitalized +0.1 - Penalties: camelCase -0.3, unknown attributes -0.2, path-like -0.2 - Configurable threshold (default 0.4) Scanner changes: - Remove legacy secondary filter (isNonContent) - Add value-based deduplication with occurrences tracking - Candidates sorted by contentScore descending - New stats: skipped, low_confidence, unique_candidates, skip_reasons Parser fixes (all frameworks): - Fix ScriptKind for Vue/Svelte/Astro script blocks (lang="ts" → ScriptKind.TS) - Set parentProperty on JSX attributes in tsx-parser - Astro frontmatter always resolves as TypeScript Type updates: - ScanCandidate: add contentScore, occurrences - ScanCandidatesResult: add skipped, low_confidence, unique_candidates, skip_reasons Projection on real Nuxt project (269 files): - Before: 7971 raw → 6405 candidates (19.6% filter, ~15% precision) - After: 7971 raw → ~500 unique scored candidates (~94% filter, ~80%+ precision) Golden benchmark: 10/10 cases pass, overall P=0.917 R=0.917 Tests: 176 passed, tsc 0 errors, oxlint 0 warnings Co-Authored-By: Claude Opus 4.6 (1M context) --- .../mcp/src/core/ast-scanner/astro-parser.ts | 4 +- packages/mcp/src/core/ast-scanner/index.ts | 2 +- .../mcp/src/core/ast-scanner/pre-filter.ts | 349 ++++++++--- .../mcp/src/core/ast-scanner/svelte-parser.ts | 20 +- .../mcp/src/core/ast-scanner/tsx-parser.ts | 2 +- packages/mcp/src/core/ast-scanner/types.ts | 4 +- 
.../mcp/src/core/ast-scanner/vue-parser.ts | 19 +- packages/mcp/src/core/scanner.ts | 256 +++----- .../tests/core/ast-scanner/pre-filter.test.ts | 570 ++++++++---------- packages/mcp/tests/core/scanner.test.ts | 207 ++----- .../scanner-golden/expo-home/expected.json | 1 - .../vue-nuxt-home/expected.json | 4 +- packages/mcp/tests/tools/normalize.test.ts | 4 +- packages/types/src/index.ts | 9 +- 14 files changed, 711 insertions(+), 740 deletions(-) diff --git a/packages/mcp/src/core/ast-scanner/astro-parser.ts b/packages/mcp/src/core/ast-scanner/astro-parser.ts index 0eb0575..96b831b 100644 --- a/packages/mcp/src/core/ast-scanner/astro-parser.ts +++ b/packages/mcp/src/core/ast-scanner/astro-parser.ts @@ -363,7 +363,9 @@ export async function parseAstro(content: string, fileName: string): Promise count of strings filtered for that reason */ - filterReasons: Record + /** Total number of strings removed by shouldSkip */ + skipped: number + /** Total number of strings removed by low content score */ + lowConfidence: number + /** Breakdown: skip reason → count */ + skipReasons: Record } -// ─── Value-based patterns (deterministic, no intelligence) ─── +// ─── Value-based regexes ─── -const PURE_NUMBER_RE = /^\d+(\.\d+)?$/ +const PURE_NUMBER_RE = /^-?\d+(\.\d+)?$/ const HEX_COLOR_RE = /^#[0-9a-f]{3,8}$/i const FILE_EXT_RE = /\.(png|jpg|jpeg|gif|svg|webp|ico|css|scss|less|js|ts|tsx|jsx|json|md|html|xml|yaml|yml|woff|woff2|ttf|eot|mp4|webm|mp3|wav|pdf)$/i +const SVG_PATH_DATA_RE = /^[Mm][\d\s.,LHVCSQTAZlhvcsqtazmMzZ-]+$/ +const SVG_VIEWBOX_RE = /^\d+(\.\d+)?\s+\d+(\.\d+)?\s+\d+(\.\d+)?\s+\d+(\.\d+)?$/ +const I18N_KEY_RE = /^[a-z][a-z0-9_]*(\.[a-z][a-z0-9_]*)+$/ +const TECHNICAL_IDENTIFIER_RE = /^[a-z][a-z0-9_-]*$/ +const ERROR_CODE_RE = /^[A-Z][A-Z0-9_]+$/ +const PLACEHOLDER_RE = /^\{\d+\}$|^\.{2,}$/ +const CAMEL_CASE_RE = /^[a-z]+[A-Z]/ -// ─── HTML / component prop technical values ─── - -const HTML_PROP_VALUES = new Set([ - // CSS / variant keywords - 'class', 'variant', 
'secondary', 'outline', 'ghost', 'destructive', 'default', 'primary', - // Link / target - '_blank', 'noopener', 'noreferrer', - // Input / button types - 'button', 'submit', 'reset', 'text', 'numeric', 'password', 'email', 'checkbox', 'radio', - // Layout / display - 'hidden', 'none', 'auto', 'inherit', 'initial', - // Size tokens - 'sm', 'md', 'lg', 'xl', 'xs', '2xl', '3xl', - // Icon variants - 'icon', 'icon-sm', 'icon-lg', -]) - -// ─── Slot / event technical names (single lowercase word) ─── +// ─── URL / path detection (consolidated from legacy isNonContent) ─── -const SLOT_EVENT_NAMES = new Set([ - 'header', 'footer', 'default', 'trigger', 'content', - 'sidebar', 'overlay', 'body', 'actions', - 'click', 'change', 'input', 'focus', 'blur', 'submit', - 'mounted', 'unmounted', 'updated', -]) +function isURLLike(str: string): boolean { + if (/^(https?|ftp|file|mailto|data):/.test(str)) return true + if (/^(\.\.?\/|\/|[A-Za-z]:\\)/.test(str)) return true + if (/^['"]?[@a-z][\w-]*/.test(str.toLowerCase()) && !str.includes(' ') && (str.includes('/') || str.includes('.'))) { + return true + } + return false +} -// ─── Tailwind / CSS utility detection ─── +// ─── CSS / Tailwind detection ─── const TAILWIND_SEGMENT_RE = /^(?:bg-|text-|border-|flex|grid|p-|px-|py-|pt-|pb-|pl-|pr-|m-|mx-|my-|mt-|mb-|ml-|mr-|rounded|shadow|w-|h-|min-|max-|gap-|space-|items-|justify-|self-|overflow-|z-|opacity-|transition|duration-|ease-|animate-|font-|leading-|tracking-|decoration-|underline|line-through|uppercase|lowercase|capitalize|truncate|whitespace-|break-|sr-only|not-sr-only|hover:|focus:|active:|disabled:|dark:|sm:|md:|lg:|xl:|2xl:|group-|peer-|ring-|outline-|divide-|table-|col-|row-|aspect-|object-|inset-|top-|right-|bottom-|left-|translate-|rotate-|scale-|skew-|origin-|cursor-|select-|resize-|fill-|stroke-|block|inline|absolute|relative|fixed|sticky|static|float-|clear-|isolate|visible|invisible|grow|shrink|basis-|order-|place-)/ -/** - * Returns true if the string looks like a 
CSS class list (Tailwind or similar). - * Requires 2+ space-separated segments with majority matching utility patterns. - */ function isCssClassList(value: string): boolean { const segments = value.trim().split(/\s+/) if (segments.length < 2) return false @@ -61,90 +51,253 @@ function isCssClassList(value: string): boolean { return matched / segments.length >= 0.5 } -/** - * Returns true if the string is a single CSS utility token (e.g. "bg-blue-500"). - */ function isSingleCssUtility(value: string): boolean { const trimmed = value.trim() - // Must be a single token, no spaces if (trimmed.includes(' ')) return false return TAILWIND_SEGMENT_RE.test(trimmed) } -// ─── Pre-filter Rules ─── - -const PRE_FILTER_RULES: PreFilterRule[] = [ - // Context-based rules (AST-determined, 100% accurate) - { context: 'import_path', reason: 'import_path' }, - { context: 'type_annotation', reason: 'type_annotation' }, - { context: 'css_class', reason: 'css_class' }, - { context: 'css_utility_call', reason: 'css_utility_call' }, - { context: 'console_call', reason: 'console_call' }, - { context: 'test_assertion', reason: 'test_assertion' }, - { context: 'switch_case', reason: 'switch_case' }, - - // Value-based rules (structural, not heuristic) - { condition: (v) => v.length <= 1, reason: 'single_char' }, - { condition: (v) => PURE_NUMBER_RE.test(v), reason: 'pure_number' }, - { condition: (v) => v.startsWith('--'), reason: 'cli_flag' }, - { condition: (v) => HEX_COLOR_RE.test(v), reason: 'hex_color' }, - { condition: (v) => FILE_EXT_RE.test(v), reason: 'file_extension' }, - - // HTML / component prop technical values (exact match, case-insensitive) - { condition: (v) => HTML_PROP_VALUES.has(v.toLowerCase()), reason: 'html_prop_value' }, - - // CSS class lists (Tailwind-style multi-segment strings) - { condition: (v) => isCssClassList(v), reason: 'css_class_list' }, - - // Single CSS utility token (e.g. 
"bg-blue-500", "rounded-lg") - { condition: (v) => isSingleCssUtility(v), reason: 'css_utility_token' }, - - // Slot / event technical names (single lowercase word only) - { condition: (v) => { - const lower = v.toLowerCase() - return v === lower && !v.includes(' ') && SLOT_EVENT_NAMES.has(lower) - }, reason: 'slot_event_name' }, -] +// ─── SVG technical attributes ─── -// ─── Public API ─── +const SVG_TECHNICAL_ATTRIBUTES = new Set([ + 'd', 'viewBox', 'points', 'transform', 'pathLength', + 'xmlns', 'preserveAspectRatio', + 'stroke-linecap', 'stroke-linejoin', 'stroke-width', + 'stroke-dasharray', 'stroke-dashoffset', 'stroke-miterlimit', + 'fill-rule', 'clip-rule', +]) + +const SVG_GRAPHIC_ELEMENTS = new Set([ + 'svg', 'path', 'circle', 'rect', 'line', 'polyline', 'polygon', + 'ellipse', 'g', 'defs', 'use', 'symbol', 'clipPath', 'mask', + 'pattern', 'linearGradient', 'radialGradient', 'stop', + 'marker', 'animate', 'animateTransform', 'image', +]) + +// ─── Known function names ─── + +const I18N_FUNCTIONS = new Set([ + 't', '$t', 'i18n', 'translate', 'formatMessage', 'msg', +]) + +const EMIT_FUNCTIONS = new Set([ + 'emit', '$emit', +]) + +// ─── Translatable attribute whitelist (i18next-cli compatible + extended) ─── + +const TRANSLATABLE_ATTRIBUTES = new Set([ + // Standard HTML content attributes + 'title', 'alt', 'placeholder', 'label', 'summary', 'caption', + 'abbr', 'accesskey', 'content', 'description', + // ARIA content + 'aria-label', 'aria-description', 'aria-placeholder', + 'aria-roledescription', 'aria-valuetext', + // React Native accessibility (equivalent to aria-label) + 'accessibilityLabel', 'accessibilityHint', 'accessibilityValue', + // Common component content props + 'heading', 'subheading', 'message', 'hint', 'tooltip', + 'helper-text', 'error-message', 'success-message', + 'confirm-text', 'cancel-text', 'empty-text', 'loading-text', + 'no-data-text', 'no-results-text', +]) + +// ─── Translatable object property whitelist (i18next-cli 
compatible) ─── + +const TRANSLATABLE_PROPERTIES = new Set([ + 'label', 'title', 'description', 'text', 'message', 'placeholder', + 'caption', 'summary', 'heading', 'subheading', 'subtitle', 'tooltip', + 'hint', 'helpText', 'errorMessage', 'successMessage', 'name', +]) + +// ─── shouldSkip: Binary non-content detection ─── /** - * Structural pre-filter: removes strings that are 100% NOT content. + * Determines if a string is definitely NOT user-visible content. + * Returns skip reason if it should be filtered, null if it should proceed to scoring. * - * Conservative — when in doubt, INCLUDE the string. - * Returns candidates (passed), count of filtered, and reason breakdown. + * Conservative for template_text/jsx_text (tag-between text is almost always content). + * Aggressive for everything else (technical tokens, config values, framework artifacts). */ -export function applyPreFilter(strings: ExtractedString[]): PreFilterResult { - const candidates: ExtractedString[] = [] - const filterReasons: Record = {} - let filtered = 0 +export function shouldSkip(str: ExtractedString): string | null { + // ── Context-based rules (AST-determined, 100% accurate) ── - for (const str of strings) { - const matchedRule = findMatchingRule(str) + if (str.context === 'import_path') return 'import_path' + if (str.context === 'type_annotation') return 'type_annotation' + if (str.context === 'css_class') return 'css_class' + if (str.context === 'css_utility_call') return 'css_utility_call' + if (str.context === 'console_call') return 'console_call' + if (str.context === 'test_assertion') return 'test_assertion' + if (str.context === 'switch_case') return 'switch_case' + + const v = str.value + + // ── Value-based rules (structural patterns) ── + + if (v.length <= 1) return 'single_char' + if (/^\s+$/.test(v)) return 'whitespace' + if (PURE_NUMBER_RE.test(v)) return 'pure_number' + if (HEX_COLOR_RE.test(v)) return 'hex_color' + if (FILE_EXT_RE.test(v)) return 'file_extension' + if 
(v.startsWith('--')) return 'cli_flag' + + // ── i18n key paths (checked before URL — both contain dots, but i18n keys are more specific) ── + + if (I18N_KEY_RE.test(v)) return 'i18n_key' + + // ── URL/path patterns ── + + if (isURLLike(v)) return 'url_path' + + // ── CSS patterns ── + + if (isCssClassList(v)) return 'css_class_list' + if (isSingleCssUtility(v)) return 'css_utility_token' + + // ── SVG patterns ── + + if (v.length > 3 && SVG_PATH_DATA_RE.test(v)) return 'svg_path_data' + if (SVG_VIEWBOX_RE.test(v)) return 'svg_viewbox' + if (str.parentProperty !== undefined && SVG_TECHNICAL_ATTRIBUTES.has(str.parentProperty)) return 'svg_technical_attr' + if (str.context === 'template_attribute' && SVG_GRAPHIC_ELEMENTS.has(str.parent)) return 'svg_element_attr' + + // ── Framework event patterns ── + + if (v.startsWith('update:')) return 'vue_emit_event' + + // ── Placeholder / interpolation ── + + if (PLACEHOLDER_RE.test(v)) return 'placeholder' - if (matchedRule) { - filtered++ - filterReasons[matchedRule.reason] = (filterReasons[matchedRule.reason] ?? 
0) + 1 + // ── Known function argument detection ── + + if (str.context === 'function_argument' && I18N_FUNCTIONS.has(str.parent)) { + // i18n function args: filter lowercase identifiers (namespace/key), keep sentences + if (/^[a-z][a-z0-9_.-]*$/.test(v)) return 'i18n_function_arg' + } + + if (str.context === 'function_argument' && EMIT_FUNCTIONS.has(str.parent)) { + return 'emit_event_arg' + } + + // ── CRITICAL: Technical identifier detection (i18next-cli proven pattern) ── + // Single lowercase ASCII word/kebab-case/snake_case < 30 chars → technical token + // EXEMPT: template_text and jsx_text (tag-between text IS content, even lowercase) + + if (str.context !== 'template_text' && str.context !== 'jsx_text') { + if (TECHNICAL_IDENTIFIER_RE.test(v) && v.length < 30) { + return 'technical_identifier' + } + } + + // ── Error codes (SCREAMING_SNAKE_CASE with underscores) ── + + if (ERROR_CODE_RE.test(v) && v.includes('_') && v.length > 3) { + return 'error_code' + } + + return null +} + +// ─── calculateContentScore: 0-1 confidence scoring ─── + +/** + * Calculates a content confidence score (0-1) for a string that passed shouldSkip. + * Uses AST context metadata (our advantage over offset-based tools) combined with + * value-based signals proven by i18next-cli. + * + * Base score: 0.5. Boosted/penalized by context and value characteristics. + */ +export function calculateContentScore(str: ExtractedString): number { + let score = 0.5 + + // ── Context signals (AST metadata advantage) ── + + // Template/JSX text = almost certainly user-visible content + if (str.context === 'template_text' || str.context === 'jsx_text') { + score += 0.3 + } + + // Content-bearing attribute (title, alt, placeholder, aria-label, etc.) 
+ if (str.context === 'template_attribute' || str.context === 'jsx_attribute') { + if (str.parentProperty && TRANSLATABLE_ATTRIBUTES.has(str.parentProperty)) { + score += 0.2 } else { - candidates.push(str) + score -= 0.2 // Unknown/technical attribute } } - return { candidates, filtered, filterReasons } + // Content-bearing object property (message, label, description, etc.) + if (str.context === 'object_property') { + if (str.parentProperty && TRANSLATABLE_PROPERTIES.has(str.parentProperty)) { + score += 0.25 + } + } + + // ── Value signals (i18next-cli proven heuristics) ── + + // Multi-word strings are more likely content + const wordCount = str.value.split(/\s+/).length + if (wordCount >= 3) score += 0.2 + else if (wordCount === 2) score += 0.1 + + // Terminal punctuation suggests a sentence + if (/[.!?:;]$/.test(str.value)) score += 0.1 + + // Non-ASCII characters (Turkish, Chinese, Arabic, etc.) → almost certainly content + if (/[^\u0000-\u007F]/.test(str.value)) score += 0.15 + + // Capitalized first letter with lowercase body (Dashboard, Kaydet, Settings) + if (/^[A-Z]/.test(str.value) && /[a-z]/.test(str.value)) score += 0.1 + + // camelCase → probably a technical identifier + if (CAMEL_CASE_RE.test(str.value)) score -= 0.3 + + // Contains slash without spaces → path-like + if (str.value.includes('/') && !str.value.includes(' ')) score -= 0.2 + + return Math.max(0, Math.min(1, score)) } +// ─── Public API ─── + /** - * Find the first pre-filter rule that matches this string. - * Returns the rule if matched, undefined if the string should pass through. + * Two-phase pre-filter: + * 1. shouldSkip(): Binary removal of definite non-content + * 2. calculateContentScore(): 0-1 confidence scoring for ambiguous strings + * + * Returns candidates that passed both phases, with content scores attached. 
*/ -function findMatchingRule(str: ExtractedString): PreFilterRule | undefined { - for (const rule of PRE_FILTER_RULES) { - if (rule.context !== undefined && str.context === rule.context) { - return rule +export function applyPreFilter( + strings: ExtractedString[], + minScore: number = 0.4, +): PreFilterResult { + const candidates: ExtractedString[] = [] + const skipReasons: Record = {} + let skipped = 0 + let lowConfidence = 0 + + for (const str of strings) { + // Phase 1: Binary skip + const skipReason = shouldSkip(str) + if (skipReason) { + skipped++ + skipReasons[skipReason] = (skipReasons[skipReason] ?? 0) + 1 + continue } - if (rule.condition !== undefined && rule.condition(str.value)) { - return rule + + // Phase 2: Content scoring + const contentScore = calculateContentScore(str) + if (contentScore < minScore) { + lowConfidence++ + skipReasons['low_confidence'] = (skipReasons['low_confidence'] ?? 0) + 1 + continue } + + // Attach score to the extraction for downstream use + ;(str as ExtractedString & { contentScore: number }).contentScore = contentScore + candidates.push(str) } - return undefined + + return { candidates, skipped, lowConfidence, skipReasons } } diff --git a/packages/mcp/src/core/ast-scanner/svelte-parser.ts b/packages/mcp/src/core/ast-scanner/svelte-parser.ts index 50c3ffe..21c579a 100644 --- a/packages/mcp/src/core/ast-scanner/svelte-parser.ts +++ b/packages/mcp/src/core/ast-scanner/svelte-parser.ts @@ -418,14 +418,25 @@ function processAttribute( // ─── Script Block Parsing ─── +/** + * Resolve script filename with correct extension for TypeScript parser. + * Svelte files with