databricks-solutions · qian-yu-db · May 27, 2026
diff --git a/databricks-skills/databricks-ai-functions/1-task-functions.md b/databricks-skills/databricks-ai-functions/1-task-functions.md
@@ -1,6 +1,6 @@
 # Task-Specific AI Functions — Full Reference
 
-These functions require no model endpoint selection. They call pre-configured Foundation Model APIs optimized for each task. All require DBR 15.1+ (15.4 ML LTS for batch); `ai_parse_document` requires DBR 17.1+.
+These functions require no model endpoint selection. They call pre-configured Foundation Model APIs optimized for each task. All require DBR 15.1+ (15.4 ML LTS for batch); `ai_parse_document` requires DBR 17.3+; `ai_prep_search` requires DBR 18.2+ (serverless env v3+).
 
 ---
 
@@ -34,10 +34,13 @@ df.withColumn("sentiment", expr("ai_analyze_sentiment(review_text)")).display()
   - With descriptions: `'{"billing_error": "Payment, invoice, or refund issues", "product_defect": "Any malfunction or bug"}'` (descriptions up to 1000 chars each)
   - 2–500 labels, each 1–100 characters
 - `options`: optional MAP\<STRING, STRING\>:
+  - `version`: `"2.0"` (recommended) or `"1.0"` for backward compatibility
   - `instructions`: task context to improve accuracy (max 20,000 chars)
   - `multilabel`: `"true"` to return multiple matching labels (default `"false"`)
 
-Returns VARIANT. Returns `NULL` if content is `NULL`.
+Returns VARIANT `{"response": ["label", ...], "error_message": null}`. Returns `NULL` if content is `NULL`.
+
+**Constraints:** total input + labels context capped at **128,000 tokens**; not available on Databricks SQL Classic.
 
 ```sql
 -- simple labels
@@ -91,12 +94,15 @@ df.withColumn(
       "line_items": {"type": "array", "items": {"type": "object", "properties": {...}}}
     }
     ```
-  - Supported types: `string`, `integer`, `number`, `boolean`, `enum`
-  - Max 128 fields, 7 nesting levels, 500 enum values
+  - Supported types: `string`, `integer`, `number`, `boolean`, `enum`, `object` (with `properties`), `array` (with `items`)
+  - Max 128 fields, field names up to 150 chars, 7 nesting levels, 500 enum values, 128,000 token total context
 - `options`: optional MAP\<STRING, STRING\>:
+  - `version`: `"2.1"` (recommended) / `"2.0"` / `"1.0"`
   - `instructions`: task context to improve extraction quality (max 20,000 chars)
+  - `enableCitations`: `"true"` to attach `citation_ids` to each extracted field
+  - `enableConfidenceScores`: `"true"` to attach a per-field `confidence_score` (0–1)
 
-Returns VARIANT `{"response": {...}, "error_message": null}`. Returns `NULL` if content is `NULL`.
+Returns VARIANT `{"response": {...}, "error_message": null}`. With `enableCitations` or `enableConfidenceScores` enabled, each scalar field becomes an object `{"value": ..., "citation_ids": [...], "confidence_score": 0.x}` and a `metadata` block is added at the top level. Returns `NULL` if content is `NULL`.
 
 ```sql
 -- simple schema
@@ -129,6 +135,32 @@ df = df.withColumn(
 df.display()
 ```
 
+### Version 2.1: citations and confidence scores
+
+Pass `version => 2.1` with `enableCitations` and/or `enableConfidenceScores` to attach provenance and reliability metadata to each extracted field. Useful for review queues and downstream filtering by confidence.
+
+```sql
+SELECT ai_extract(
+    document_text,
+    '["invoice_id", "vendor_name", "total_amount"]',
+    MAP(
+        'version', '2.1',
+        'enableCitations', 'true',
+        'enableConfidenceScores', 'true'
+    )
+) AS extracted
+FROM parsed_documents;
+
+-- Each scalar field is now an object: {value, citation_ids, confidence_score}
+-- Access:
+SELECT
+    extracted:response:invoice_id:value::STRING       AS invoice_id,
+    extracted:response:invoice_id:confidence_score::DOUBLE AS invoice_id_conf,
+    extracted:response:total_amount:value::DOUBLE     AS total_amount,
+    extracted:metadata                                AS metadata
+FROM extracted_invoices;
+```
+
 ---
 
 ## `ai_fix_grammar`
@@ -300,38 +332,44 @@ df.withColumn(
 
 **Docs:** https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document
 
-**Requires:** DBR 17.1+
+**Requires:** DBR 17.3+ (serverless env v3+ for VARIANT). Region-restricted — check feature availability.
 
 **Syntax:** `ai_parse_document(content [, options])`
 - `content`: BINARY — document content loaded from `read_files()` or `spark.read.format("binaryFile")`
 - `options`: MAP\<STRING, STRING\> (optional) — parsing configuration
 
-**Supported formats:** PDF, JPG/JPEG, PNG, DOCX, PPTX
+**Supported formats:** PDF, JPG/JPEG, PNG, TIFF/TIF, DOC/DOCX, PPT/PPTX
 
-Returns a VARIANT with pages, elements (text paragraphs, tables, figures, headers, footers), bounding boxes, and error metadata.
+Returns a VARIANT with pages, elements (text, tables, figures, titles, captions, section headers, page headers/footers, page numbers, footnotes), bounding boxes, confidence scores, and error metadata.
 
 **Options:**
 
 | Key | Values | Description |
 |-----|--------|-------------|
 | `version` | `'2.0'` | Output schema version |
-| `imageOutputPath` | Volume path | Save rendered page images |
-| `descriptionElementTypes` | `''`, `'figure'`, `'*'` | AI-generated descriptions (default: `'*'` for all) |
+| `imageOutputPath` | Volume path | Save rendered page images to a UC Volume |
+| `descriptionElementTypes` | `''`, `'figure'`, `'*'` | AI-generated descriptions (default: `'*'` for all). Set to `''` to disable and reduce cost. |
+| `pageRange` | e.g. `'1,3,5-10'` | Restrict parsing to a subset of pages (1-indexed) |
 
-**Output schema:**
+**Output schema (v2.0):**
 
 ```
 document
-├── pages[]          -- page id, image_uri
+├── pages[]          -- id, image_uri
 └── elements[]       -- extracted content
-    ├── type         -- "text", "table", "figure", etc.
+    ├── id           -- per-element id
+    ├── type         -- text | table | figure | title | caption | section_header
+    │                --   | page_header | page_footer | page_number | footnote
     ├── content      -- extracted text
-    ├── bbox         -- bounding box coordinates
-    └── description  -- AI-generated description
-metadata             -- file info, schema version
-error_status[]       -- errors per page (if any)
+    ├── confidence   -- DOUBLE 0–1
+    ├── bbox         -- [{coord:[...], page_id}]
+    └── description  -- AI-generated description (figures/tables when enabled)
+metadata             -- id, version, file_metadata
+error_status[]       -- {error_message, page_id} per page (if any)
 ```
 
+**Limits:** max 500 pages per document, max 100 MB file size.
+
 ```sql
 -- Parse and extract text blocks
 SELECT
@@ -353,6 +391,13 @@ SELECT ai_parse_document(
     )
 ) AS parsed
 FROM read_files('/Volumes/catalog/schema/volume/invoices/', format => 'binaryFile');
+
+-- Parse only specific pages (cheaper for large documents)
+SELECT ai_parse_document(
+    content,
+    map('version', '2.0', 'pageRange', '1,3,5-10')
+) AS parsed
+FROM read_files('/Volumes/catalog/schema/volume/contracts/', format => 'binaryFile');
 ```
 
 ```python
@@ -380,6 +425,106 @@ df.display()
 ```
 
 **Limitations:**
+- Max 500 pages per document, max 100 MB file size
 - Processing is slow for dense or low-resolution documents
-- Suboptimal for non-Latin alphabets and digitally signed PDFs
+- Suboptimal for non-Latin alphabets (e.g., Japanese, Korean in images) and digitally signed PDFs
 - Custom models not supported — always uses the built-in parsing model
+
+---
+
+## `ai_prep_search`
+
+**Docs:** https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_prep_search
+
+**Requires:** DBR **18.2+** (serverless env v3+ for VARIANT support).
+
+Takes the VARIANT output of `ai_parse_document` and returns RAG-ready chunks. The function performs:
+1. **Semantic chunking** — splits document content into retrieval-sized chunks at natural boundaries (paragraphs, sections, tables).
+2. **Context enrichment** — adds document title, section headers, page numbers, and captions to each chunk's embedding text so Vector Search can match on context, not just chunk content.
+
+Use this instead of hand-rolled `variant_get` + `explode` + `md5` chunking when feeding `ai_parse_document` output into Databricks Vector Search.
+
+**Syntax:** `ai_prep_search(parsed [, options])`
+- `parsed`: VARIANT — output from `ai_parse_document`
+- `options`: optional MAP\<STRING, STRING\>:
+  - `version`: output schema version (major.minor; minor upgrades are backward-compatible)
+
+**Returns:** VARIANT with chunks ready for Vector Search:
+
+```
+chunks[]
+├── chunk_id            -- unique id (document_id + position) — use as PK
+├── chunk_position      -- ordinal within the document
+├── chunk_to_retrieve   -- raw chunk text (return this to the LLM)
+└── chunk_to_embed      -- context-enriched text (use this as the embedding source)
+pages[]                 -- page index + image_uri (when imageOutputPath was set on ai_parse_document)
+source_uri              -- input document path
+error_status            -- per-page error info, if any
+```
+
+**End-to-end SQL — parse, prep, persist for Vector Search:**
+
+```sql
+CREATE OR REPLACE TABLE catalog.schema.parsed_chunks AS
+WITH parsed AS (
+  SELECT
+    path AS source_path,
+    ai_parse_document(content) AS parsed
+  FROM read_files('/Volumes/catalog/schema/docs/', format => 'binaryFile')
+),
+prepped AS (
+  SELECT
+    source_path,
+    ai_prep_search(parsed) AS prep
+  FROM parsed
+),
+chunks AS (
+  SELECT
+    source_path,
+    explode(variant_get(prep, '$.chunks', 'ARRAY<VARIANT>')) AS chunk
+  FROM prepped
+)
+SELECT
+  variant_get(chunk, '$.chunk_id',          'STRING') AS chunk_id,
+  variant_get(chunk, '$.chunk_position',    'INT')    AS chunk_position,
+  variant_get(chunk, '$.chunk_to_retrieve', 'STRING') AS chunk_to_retrieve,
+  variant_get(chunk, '$.chunk_to_embed',    'STRING') AS chunk_to_embed,
+  source_path,
+  current_timestamp() AS prepped_at
+FROM chunks;
+
+-- Enable CDF so Vector Search Delta Sync picks up incremental changes
+ALTER TABLE catalog.schema.parsed_chunks
+SET TBLPROPERTIES (delta.enableChangeDataFeed = true);
+```
+
+**PySpark equivalent:**
+
+```python
+from pyspark.sql.functions import expr, current_timestamp
+
+chunks_df = (
+    spark.read.format("binaryFile")
+    .load("/Volumes/catalog/schema/docs/")
+    .withColumn("parsed", expr("ai_parse_document(content)"))
+    .withColumn("prep",   expr("ai_prep_search(parsed)"))
+    .withColumn("chunk",  expr("explode(variant_get(prep, '$.chunks', 'ARRAY<VARIANT>'))"))
+    .selectExpr(
+        "variant_get(chunk, '$.chunk_id',          'STRING') AS chunk_id",
+        "variant_get(chunk, '$.chunk_position',    'INT')    AS chunk_position",
+        "variant_get(chunk, '$.chunk_to_retrieve', 'STRING') AS chunk_to_retrieve",
+        "variant_get(chunk, '$.chunk_to_embed',    'STRING') AS chunk_to_embed",
+        "path AS source_path",
+    )
+    .withColumn("prepped_at", current_timestamp())
+)
+
+chunks_df.write.format("delta").mode("overwrite").saveAsTable("catalog.schema.parsed_chunks")
+```
+
+**Vector Search integration:** Point a Delta Sync index at this table with `chunk_to_embed` as the embedding source column and `chunk_id` as the primary key. The `chunk_to_retrieve` column is what you return to the LLM at query time.
+
+**Tips:**
+- Pass `imageOutputPath` on the upstream `ai_parse_document` call if you want page image URIs available in the prep output for multimodal retrieval.
+- Schema is versioned major.minor; minor upgrades are backward-compatible — pin `version` only if you need to lock schema across deployments.
+- On DBR < 18.2, fall back to manual chunking via `variant_get` + `explode` on `ai_parse_document` output.