From 231826e35bc090cca37279becc382a443c64507a Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Mon, 25 May 2026 13:58:21 +0000 Subject: [PATCH 1/2] skills(unstructured-pdf): reframe around Databricks RAG-eval workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Lennart's audit on #73 ("this is almost not very Databricks-specific at all?"): the skill's value is the synthetic-PDFs-on-UC-volume workflow shape for RAG / unstructured-document retrieval evaluation, not the HTML → PDF generation step itself (any local HTML → PDF tool works for that — weasyprint, wkhtmltopdf, playwright pdf, plutoprint). Reframe SKILL.md to put the Databricks-specific value up front: - Frontmatter description now leads with "Build RAG / unstructured-document evaluation datasets on Databricks"; PDF generation is positioned as a step, not the headline. - Body intro states explicitly that the Databricks-specific value is the workflow shape (UC volume layout, paired question files, hand-off to downstream `ai_extract` / `ai_parse_document` / mlflow.genai eval), not the local HTML → PDF tooling. - Adds a one-line note: "if you only need ad-hoc PDFs, any local HTML → PDF tool works directly — this skill exists for the synthetic-dataset-on-UC end-to-end shape". No content removed; this is a framing change so users (and reviewers) can tell what the Databricks-specific value of the skill is at a glance. Manifest regenerated to pick up the new description. 
Co-authored-by: Isaac --- .../SKILL.md | 16 +++--- manifest.json | 56 +++++++++---------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/experimental/databricks-unstructured-pdf-generation/SKILL.md b/experimental/databricks-unstructured-pdf-generation/SKILL.md index 1a1a636..16705ac 100644 --- a/experimental/databricks-unstructured-pdf-generation/SKILL.md +++ b/experimental/databricks-unstructured-pdf-generation/SKILL.md @@ -1,18 +1,20 @@ --- name: databricks-unstructured-pdf-generation -description: "Generate PDF documents from HTML and upload to Unity Catalog volumes. Use for creating test PDFs, demo documents, reports, or evaluation datasets." +description: "Build RAG / unstructured-document evaluation datasets on Databricks: generate synthetic PDFs locally, upload to Unity Catalog volumes, and pair each document with test questions for retrieval evaluation." --- -# PDF Generation from HTML +# Unstructured-Document Eval Datasets on Databricks -Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. +Workflow for producing **synthetic PDF documents + paired test questions** as a Unity Catalog-resident dataset for RAG / unstructured-document retrieval evaluation on Databricks. The PDF-generation step uses standard local HTML → PDF tooling; the Databricks-specific value is the workflow shape — UC volume layout, paired question files, and integration with downstream Databricks retrieval / `ai_extract` / `ai_parse_document` evaluation. ## Workflow -1. Write HTML files to `./raw_data/html/` (write multiple files in parallel for speed) -2. Convert HTML → PDF using `/scripts/pdf_generator.py` (parallel conversion) -3. Upload PDFs to Unity Catalog volume using `databricks fs cp` -4. Generate `doc_questions.json` with test questions for each document +1. Write HTML files to `./raw_data/html/` (write multiple files in parallel for speed) — domain-shaped to match the documents your retrieval pipeline will see in production. +2. 
Convert HTML → PDF using `/scripts/pdf_generator.py` (parallel conversion, wraps `plutoprint`). +3. Upload PDFs to a Unity Catalog volume via `databricks fs cp` — same volume shape your production pipeline will read from. +4. Generate `doc_questions.json` pairing each document with retrieval-eval questions; this becomes the gold dataset for `mlflow.genai.evaluate()` or comparable retrieval-quality scorers. + +> If you only need ad-hoc PDFs (no Databricks workflow), any HTML → PDF tool (`weasyprint`, `wkhtmltopdf`, `playwright pdf`, `plutoprint`) works directly — this skill exists for the synthetic-dataset-on-UC end-to-end shape, not as a general PDF generator. > **Path convention:** `` below = the directory containing this SKILL.md. Resolve to the absolute install path (e.g. `~/.claude/skills/databricks-unstructured-pdf-generation`). `./raw_data/...` paths are relative to your own project cwd. diff --git a/manifest.json b/manifest.json index f0fe8fe..e8f84d8 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-05-22T20:18:49Z", + "updated_at": "2026-05-25T13:58:07Z", "skills": { "databricks-apps": { "version": "0.1.2", "description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "repo_dir": "skills", - "updated_at": "2026-05-22T15:54:04Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -33,7 +33,7 @@ "version": "0.1.0", "description": "Core Databricks skill for CLI, auth, and data exploration", "repo_dir": "skills", - "updated_at": "2026-05-15T09:44:24Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -48,7 +48,7 @@ "version": "0.0.1", "description": "Declarative Automation Bundles (DABs) for deploying and managing Databricks resources", "repo_dir": "skills", - "updated_at": "2026-05-12T15:39:50Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", 
@@ -66,7 +66,7 @@ "version": "0.2.0", "description": "Develop and deploy Lakeflow Jobs on Databricks via DABs, Python SDK, or the CLI \u2014 covers all task types, triggers, notifications, and worked examples", "repo_dir": "skills", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -82,7 +82,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "repo_dir": "skills", - "updated_at": "2026-05-22T15:54:04Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -101,7 +101,7 @@ "version": "0.1.0", "description": "Databricks Model Serving endpoint management", "repo_dir": "skills", - "updated_at": "2026-05-22T15:54:04Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -114,7 +114,7 @@ "version": "0.1.0", "description": "Databricks Spark Declarative Pipelines (SDP) for ETL and streaming", "repo_dir": "skills", - "updated_at": "2026-05-12T15:39:50Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -161,7 +161,7 @@ "version": "0.1.0", "description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes", "repo_dir": "skills", - "updated_at": "2026-05-12T15:39:50Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -178,7 +178,7 @@ "version": "0.0.1", "description": "Create Agent Bricks: Knowledge Assistants (KA) for document Q&A and Supervisor Agents for multi-agent orchestration (MAS).", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:18:49Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "1-knowledge-assistants.md", "2-supervisor-agents.md", @@ -192,7 +192,7 @@ "version": "0.0.1", "description": "Use Databricks built-in AI Functions (ai_classify, ai_extract, ai_summarize, ai_mask, 
ai_translate, ai_fix_grammar, ai_gen, ai_analyze_sentiment, ai_similarity, ai_parse_document, ai_query, ai_forecast) to add AI capabilities directly to SQL and PySpark pipelines without managing model endpoints. Also covers document parsing and building custom RAG pipelines (parse \u2192 chunk \u2192 index \u2192 query).", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "1-task-functions.md", "2-ai-query.md", @@ -208,7 +208,7 @@ "version": "0.0.1", "description": "Create Databricks AI/BI dashboards. Must use when creating, updating, or deploying Lakeview dashboards as Databricks Dashboard have a unique json structure. CRITICAL: You MUST test ALL SQL queries via CLI BEFORE deploying. Follow guidelines strictly.", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "1-widget-specifications.md", "2-advanced-widget-specifications.md", @@ -225,7 +225,7 @@ "version": "0.0.1", "description": "Builds Databricks applications. Prefers AppKit (TypeScript + React SDK) for new apps; falls back to Python frameworks (Dash, Streamlit, Gradio, Flask, FastAPI, Reflex) when Python is required. Handles OAuth authorization, app resources, SQL warehouse and Lakebase connectivity, model serving, foundation model APIs, and deployment. Use when building web apps, dashboards, ML demos, or REST APIs for Databricks, or when the user mentions AppKit, Streamlit, Dash, Gradio, Flask, FastAPI, Reflex, or Databricks app.", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "1-authorization.md", "2-app-resources.md", @@ -247,7 +247,7 @@ "version": "0.0.1", "description": "Databricks SQL (DBSQL) advanced features and SQL warehouse capabilities. 
This skill MUST be invoked when the user mentions: \"DBSQL\", \"Databricks SQL\", \"SQL warehouse\", \"SQL scripting\", \"stored procedure\", \"CALL procedure\", \"materialized view\", \"CREATE MATERIALIZED VIEW\", \"pipe syntax\", \"|>\", \"geospatial\", \"H3\", \"ST_\", \"spatial SQL\", \"collation\", \"COLLATE\", \"ai_query\", \"ai_classify\", \"ai_extract\", \"ai_gen\", \"AI function\", \"http_request\", \"remote_query\", \"read_files\", \"Lakehouse Federation\", \"recursive CTE\", \"WITH RECURSIVE\", \"multi-statement transaction\", \"temp table\", \"temporary view\", \"pipe operator\". SHOULD also invoke when the user asks about SQL best practices, data modeling patterns, or advanced SQL features on Databricks.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -264,7 +264,7 @@ "version": "0.0.1", "description": "Databricks documentation reference via llms.txt index. Use when other skills do not cover a topic, looking up unfamiliar Databricks features, or needing authoritative docs on APIs, configurations, or platform capabilities.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -276,7 +276,7 @@ "version": "0.0.1", "description": "Execute code and manage compute on Databricks. 
Use this skill when the user mentions: \"run code\", \"execute\", \"run on databricks\", \"serverless\", \"no cluster\", \"run python\", \"run scala\", \"run sql\", \"run R\", \"run file\", \"push and run\", \"notebook run\", \"batch script\", \"model training\", \"run script on cluster\", \"create cluster\", \"new cluster\", \"resize cluster\", \"modify cluster\", \"delete cluster\", \"terminate cluster\", \"create warehouse\", \"new warehouse\", \"resize warehouse\", \"delete warehouse\", \"node types\", \"runtime versions\", \"DBR versions\", \"spin up compute\", \"provision cluster\".", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:57:09Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -292,7 +292,7 @@ "version": "0.0.1", "description": "Apache Iceberg tables on Databricks \u2014 Managed Iceberg tables, External Iceberg Reads (fka Uniform), Compatibility Mode, Iceberg REST Catalog (IRC), Iceberg v3, Snowflake interop, PyIceberg, OSS Spark, external engine access and credential vending. Use when creating Iceberg tables, enabling External Iceberg Reads (uniform) on Delta tables (including Streaming Tables and Materialized Views via compatibility mode), configuring external engines to read Databricks tables via Unity Catalog IRC, integrating with Snowflake catalog to read Foreign Iceberg tables", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "1-managed-iceberg-tables.md", "2-uniform-and-compatibility.md", @@ -309,7 +309,7 @@ "version": "0.0.1", "description": "Unity Catalog metric views: define, create, query, and manage governed business metrics in YAML. 
Use when building standardized KPIs, revenue metrics, order analytics, or any reusable business metrics that need consistent definitions across teams and tools.", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -323,7 +323,7 @@ "version": "0.0.1", "description": "MLflow 3 GenAI agent evaluation. Use when writing mlflow.genai.evaluate() code, creating @scorer functions, using built-in scorers (Guidelines, Correctness, Safety, RetrievalGroundedness), building eval datasets from traces, setting up trace ingestion and production monitoring, aligning judges with MemAlign from domain expert feedback, or running optimize_prompts() with GEPA for automated prompt improvement.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:56:43Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -346,7 +346,7 @@ "version": "0.0.1", "description": "Databricks development guidance including Python SDK, Databricks Connect, CLI, and REST API. Use when working with databricks-sdk, databricks-connect, or Databricks APIs.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -364,7 +364,7 @@ "version": "0.0.1", "description": "Comprehensive guide to Spark Structured Streaming for production workloads. 
Use when building streaming pipelines, working with Kafka ingestion, implementing Real-Time Mode (RTM), configuring triggers (processingTime, availableNow), handling stateful operations with watermarks, optimizing checkpoints, performing stream-stream or stream-static joins, writing to multiple sinks, or tuning streaming cost and performance.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -385,7 +385,7 @@ "version": "0.0.1", "description": "Generate realistic synthetic data using Spark + Faker (strongly recommended). Supports serverless execution, multiple output formats (Parquet/JSON/CSV/Delta), and scales from thousands to millions of rows. For small datasets (<10K rows), can optionally generate locally and upload to volumes. Use when user mentions 'synthetic data', 'test data', 'generate data', 'demo dataset', 'Faker', or 'sample data'.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -400,7 +400,7 @@ "version": "0.0.1", "description": "Unity Catalog system tables and volumes. Use when querying system tables (audit, lineage, billing) or working with volume file operations (upload, download, list files in /Volumes/).", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "5-system-tables.md", "6-volumes.md", @@ -413,9 +413,9 @@ }, "databricks-unstructured-pdf-generation": { "version": "0.0.1", - "description": "Generate PDF documents from HTML and upload to Unity Catalog volumes. 
Use for creating test PDFs, demo documents, reports, or evaluation datasets.", + "description": "Build RAG / unstructured-document evaluation datasets on Databricks: generate synthetic PDFs locally, upload to Unity Catalog volumes, and pair each document with test questions for retrieval evaluation.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:56:43Z", + "updated_at": "2026-05-25T13:58:02Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -428,7 +428,7 @@ "version": "0.0.1", "description": "Patterns for Databricks Vector Search: create endpoints and indexes, query with filters, manage embeddings. Use when building RAG applications, semantic search, or similarity matching. Covers both storage-optimized and standard endpoints.", "repo_dir": "experimental", - "updated_at": "2026-05-22T15:54:01Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -444,7 +444,7 @@ "version": "0.0.1", "description": "Build Zerobus Ingest clients for near real-time data ingestion into Databricks Delta tables via gRPC. Use when creating producers that write directly to Unity Catalog tables without a message bus, working with the Zerobus Ingest SDK in Python/Java/Go/TypeScript/Rust, generating Protobuf schemas from UC tables, or implementing stream-based ingestion with ACK handling and retry logic.", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "1-setup-and-authentication.md", "2-python-client.md", @@ -461,7 +461,7 @@ "version": "0.0.1", "description": "Build custom Python data sources for Apache Spark using the PySpark DataSource API \u2014 batch and streaming readers/writers for external systems. 
Use this skill whenever someone wants to connect Spark to an external system (database, API, message queue, custom protocol), build a Spark connector or plugin in Python, implement a DataSourceReader or DataSourceWriter, pull data from or push data to a system via Spark, or work with the PySpark DataSource API in any way. Even if they just say \"read from X in Spark\" or \"write DataFrame to Y\" and there's no native connector, this skill applies.", "repo_dir": "experimental", - "updated_at": "2026-05-22T20:17:46Z", + "updated_at": "2026-05-25T13:57:24Z", "files": [ "SKILL.md", "agents/openai.yaml", From 61d0d8bfefe4ee9b588de5ce8f3240fea9833c6c Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Thu, 28 May 2026 11:10:06 +0000 Subject: [PATCH 2/2] skills(unstructured-pdf): broaden reframe to cover demos as well as eval Per @QuentinAmbard's review: a lot of real usage is generating synthetic PDFs for demos with Knowledge Assistant, not just eval datasets. Reword the frontmatter description, H1, and intro paragraph to name both surfaces explicitly. Manifest synopsis regenerates from the frontmatter. This PR was prepared by Claude. --- .../databricks-unstructured-pdf-generation/SKILL.md | 6 +++--- manifest.json | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/experimental/databricks-unstructured-pdf-generation/SKILL.md b/experimental/databricks-unstructured-pdf-generation/SKILL.md index 16705ac..c70f152 100644 --- a/experimental/databricks-unstructured-pdf-generation/SKILL.md +++ b/experimental/databricks-unstructured-pdf-generation/SKILL.md @@ -1,11 +1,11 @@ --- name: databricks-unstructured-pdf-generation -description: "Build RAG / unstructured-document evaluation datasets on Databricks: generate synthetic PDFs locally, upload to Unity Catalog volumes, and pair each document with test questions for retrieval evaluation." 
+description: "Build synthetic PDF document sets on Databricks for Knowledge Assistant demos and RAG / unstructured-document evaluation: generate synthetic PDFs locally, upload to Unity Catalog volumes, and pair each document with test questions for retrieval evaluation." --- -# Unstructured-Document Eval Datasets on Databricks +# Unstructured Documents for Demos and Eval Datasets on Databricks -Workflow for producing **synthetic PDF documents + paired test questions** as a Unity Catalog-resident dataset for RAG / unstructured-document retrieval evaluation on Databricks. The PDF-generation step uses standard local HTML → PDF tooling; the Databricks-specific value is the workflow shape — UC volume layout, paired question files, and integration with downstream Databricks retrieval / `ai_extract` / `ai_parse_document` evaluation. +Workflow for producing **synthetic PDF documents + paired test questions** as a Unity Catalog-resident dataset for Knowledge Assistant demos and RAG / unstructured-document retrieval evaluation on Databricks. The PDF-generation step uses standard local HTML → PDF tooling; the Databricks-specific value is the workflow shape — UC volume layout, paired question files, and integration with downstream Databricks retrieval / `ai_extract` / `ai_parse_document` evaluation. 
## Workflow diff --git a/manifest.json b/manifest.json index ce78036..a9a237f 100644 --- a/manifest.json +++ b/manifest.json @@ -394,7 +394,7 @@ "version": "0.0.1" }, "databricks-unstructured-pdf-generation": { - "description": "Build RAG / unstructured-document evaluation datasets on Databricks: generate synthetic PDFs locally, upload to Unity Catalog volumes, and pair each document with test questions for retrieval evaluation.", + "description": "Build synthetic PDF document sets on Databricks for Knowledge Assistant demos and RAG / unstructured-document evaluation: generate synthetic PDFs locally, upload to Unity Catalog volumes, and pair each document with test questions for retrieval evaluation.", "files": [ "SKILL.md", "agents/openai.yaml",