From fca1866677e76f87d0deff7968fccb7580272683 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Sun, 24 May 2026 20:51:53 +0000 Subject: [PATCH 1/8] skills(model-serving): port FM API endpoints reference from a-d-k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of #73's TODO #1b. Adds references/fm-api-endpoints.md with the curated Foundation Model API endpoint table (chat/instruct + embedding models) from databricks-solutions/ai-dev-kit's model-serving skill, plus common defaults and query examples (CLI + SDK). Stripped: the cloud/language prefix on the docs link, and the leftover MCP-tool references in the source. The endpoint table itself is static catalog data — no MCP coupling. SKILL.md updates: - bump version to 0.2.0 - point Endpoint Types table at the new reference - point the Foundation Model discovery bullet at the new reference manifest.json updates: - bump databricks-model-serving to 0.2.0 and register references/fm-api-endpoints.md - drop the `base_revision` field from two other skill entries Subsequent phases (separate PRs / commits) port the remaining dev-side content: classical-ml autolog patterns, Custom PyFunc signatures, ResponsesAgent with the create_text_output_item gotcha, UCFunctionToolkit + VectorSearchRetrieverTool resource passthrough. 
Co-authored-by: Isaac --- manifest.json | 9 +-- skills/databricks-model-serving/SKILL.md | 6 +- .../references/fm-api-endpoints.md | 73 +++++++++++++++++++ 3 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 skills/databricks-model-serving/references/fm-api-endpoints.md diff --git a/manifest.json b/manifest.json index b64dd62..ffa6a0a 100644 --- a/manifest.json +++ b/manifest.json @@ -55,8 +55,7 @@ "references/deploy-and-run.md", "references/resource-permissions.md", "references/sdp-pipelines.md" - ], - "base_revision": "e742f36e8ab1" + ] }, "databricks-jobs": { "version": "0.2.0", @@ -92,7 +91,7 @@ ] }, "databricks-model-serving": { - "version": "0.1.0", + "version": "0.2.0", "description": "Databricks Model Serving endpoint management", "repo_dir": "skills", "files": [ @@ -100,6 +99,7 @@ "agents/openai.yaml", "assets/databricks.png", "assets/databricks.svg", + "references/fm-api-endpoints.md", "references/off-platform-streaming.md" ] }, @@ -146,8 +146,7 @@ "references/view-sql.md", "references/view.md", "references/write-spark-declarative-pipelines.md" - ], - "base_revision": "5c4b4fb0a82a" + ] }, "databricks-serverless-migration": { "version": "0.1.0", diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index 628fd33..7087c81 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -3,7 +3,7 @@ name: databricks-model-serving description: "Manage Databricks Model Serving endpoints via CLI. Use when asked to create, configure, query, or manage model serving endpoints for LLM inference, custom models, or external models." 
compatibility: Requires databricks CLI (>= v0.294.0) metadata: - version: "0.1.0" + version: "0.2.0" parent: databricks-core --- @@ -17,7 +17,7 @@ Model Serving provides managed endpoints for serving LLMs, custom ML models, and | Type | When to Use | Key Detail | |------|-------------|------------| -| Pay-per-token | Foundation Model APIs (Llama, DBRX, etc.) | Uses `system.ai.*` catalog models, simplest setup | +| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, simplest setup. Endpoint names in [references/fm-api-endpoints.md](references/fm-api-endpoints.md). | | Provisioned throughput | Dedicated GPU capacity | Guaranteed throughput, higher cost | | Custom model | Your own MLflow models or containers | Deploy any model with an MLflow signature | @@ -74,7 +74,7 @@ databricks serving-endpoints create \ }' --profile ``` -- Discover available Foundation Models: check the `system.ai` catalog in Unity Catalog, or use `databricks serving-endpoints list --profile ` to see available endpoints. Use `databricks serving-endpoints get-open-api --profile ` to inspect the endpoint's API schema. +- Discover available Foundation Models: see [references/fm-api-endpoints.md](references/fm-api-endpoints.md) for the curated table of pay-per-token endpoint names. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile ` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. - Long-running operation; the CLI waits for completion by default. 
Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get --profile diff --git a/skills/databricks-model-serving/references/fm-api-endpoints.md b/skills/databricks-model-serving/references/fm-api-endpoints.md new file mode 100644 index 0000000..e35e3ee --- /dev/null +++ b/skills/databricks-model-serving/references/fm-api-endpoints.md @@ -0,0 +1,73 @@ +# Foundation Model API Endpoints + +Pay-per-token Foundation Model API endpoints available in every workspace. Use the **exact endpoint name** from the tables below as `served_entities[].entity_name` (or as the model identifier when calling `serving-endpoints query`); never abbreviate or guess. + +For production-grade workloads, consider provisioned throughput mode. See the docs page for [supported models](https://docs.databricks.com/machine-learning/foundation-model-apis/supported-models). + +## Chat / Instruct Models + +| Endpoint Name | Provider | Notes | +|--------------|----------|-------| +| `databricks-gpt-5-2` | OpenAI | Latest GPT, 400K context | +| `databricks-gpt-5-1` | OpenAI | Instant + Thinking modes | +| `databricks-gpt-5-1-codex-max` | OpenAI | Code-specialized (high perf) | +| `databricks-gpt-5-1-codex-mini` | OpenAI | Code-specialized (cost-opt) | +| `databricks-gpt-5` | OpenAI | 400K context, reasoning | +| `databricks-gpt-5-mini` | OpenAI | Cost-optimized reasoning | +| `databricks-gpt-5-nano` | OpenAI | High-throughput, lightweight | +| `databricks-gpt-oss-120b` | OpenAI | Open-weight, 128K context | +| `databricks-gpt-oss-20b` | OpenAI | Lightweight open-weight | +| `databricks-claude-opus-4-6` | Anthropic | Most capable, 1M context | +| `databricks-claude-sonnet-4-6` | Anthropic | Hybrid reasoning | +| `databricks-claude-sonnet-4-5` | Anthropic | Hybrid reasoning | +| `databricks-claude-opus-4-5` | Anthropic | Deep analysis, 200K context | +| `databricks-claude-sonnet-4` | Anthropic | Hybrid reasoning | +| `databricks-claude-opus-4-1` | Anthropic | 200K context, 
32K output | +| `databricks-claude-haiku-4-5` | Anthropic | Fastest, cost-effective | +| `databricks-claude-3-7-sonnet` | Anthropic | Retiring April 2026 | +| `databricks-meta-llama-3-3-70b-instruct` | Meta | 128K context, multilingual | +| `databricks-meta-llama-3-1-405b-instruct` | Meta | Retiring May 2026 (PT) | +| `databricks-meta-llama-3-1-8b-instruct` | Meta | Lightweight, 128K context | +| `databricks-llama-4-maverick` | Meta | MoE architecture | +| `databricks-gemini-3-1-pro` | Google | 1M context, hybrid reasoning | +| `databricks-gemini-3-pro` | Google | 1M context, hybrid reasoning | +| `databricks-gemini-3-flash` | Google | Fast, cost-efficient | +| `databricks-gemini-2-5-pro` | Google | 1M context, Deep Think | +| `databricks-gemini-2-5-flash` | Google | 1M context, hybrid reasoning | +| `databricks-gemma-3-12b` | Google | 128K context, multilingual | +| `databricks-qwen3-next-80b-a3b-instruct` | Alibaba | Efficient MoE | + +## Embedding Models + +| Endpoint Name | Dimensions | Max Tokens | Notes | +|--------------|-----------|------------|-------| +| `databricks-gte-large-en` | 1024 | 8192 | English, not normalized | +| `databricks-bge-large-en` | 1024 | 512 | English, normalized | +| `databricks-qwen3-embedding-0-6b` | up to 1024 | ~32K | 100+ languages, instruction-aware | + +## Common defaults + +- **Agent LLM**: `databricks-meta-llama-3-3-70b-instruct` — good balance of quality/cost. +- **Embedding**: `databricks-gte-large-en`. +- **Code tasks**: `databricks-gpt-5-1-codex-mini` (cost-opt) or `databricks-gpt-5-1-codex-max` (high perf). + +## Querying + +Pay-per-token endpoints are pre-deployed; you don't need to create them. 
Query directly: + +```bash +databricks serving-endpoints query databricks-meta-llama-3-3-70b-instruct \ + --json '{"messages":[{"role":"user","content":"hi"}]}' +``` + +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() +response = w.serving_endpoints.query( + name="databricks-meta-llama-3-3-70b-instruct", + messages=[{"role": "user", "content": "hi"}], +) +``` + +For a full list of available models in the current workspace, browse the `system.ai` catalog in Unity Catalog. Endpoint names listed above match the model identifiers under `system.ai.*`. From d400effa1cfd3d53010c05b55b13bbefb50a4856 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 26 May 2026 09:44:53 +0000 Subject: [PATCH 2/8] fix(model-serving): refresh FM API endpoint catalog against current docs Aligns the verbatim a-d-k port with the live docs.databricks.com supported-models page (validated via WebFetch on 2026-05-26): ADDED (missing from a-d-k snapshot): - databricks-claude-opus-4-7 (now most capable Claude) - databricks-gpt-5-5-pro, 5-5 - databricks-gpt-5-4, 5-4-mini, 5-4-nano - databricks-gpt-5-3-codex, 5-2-codex - databricks-gemini-3-1-flash-lite, 3-5-flash - databricks-qwen35-122b-a10b (Preview) REMOVED (retired, no longer in docs): - databricks-claude-3-7-sonnet - databricks-meta-llama-3-1-405b-instruct UPDATED notes: - claude-opus-4-6 no longer "Most capable" - gpt-5-2 no longer "Latest" - gpt-5-1-codex-{max,mini} + gpt-5-2-codex marked retiring 2026-07-16 - gemini-3-pro marked retired 2026-03-26 with redirect through 2026-06-07 - Several Gemini / Codex endpoints annotated with cross-geo requirement - qwen3-next-80b annotated as Preview OPENING PARAGRAPH: - "available in every workspace" -> "available in supported Model Serving regions"; calls out cross-geo requirement for several endpoints NOT TOUCHED (out of scope: not docs-validatable from supported-models page): - served_entities[].entity_name guidance (line 3 second half) - SKILL.md "system.ai.* 
catalog" claim on the pay-per-token row These remain as in the a-d-k snapshot and should be revisited if/when docs cover them directly. Test plan: `scripts/skills.py validate` -> "Everything is up to date"; `scripts/skills.py generate` -> only refreshes manifest.json timestamps. Co-authored-by: Isaac --- .../references/fm-api-endpoints.md | 61 +++++++++++-------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/skills/databricks-model-serving/references/fm-api-endpoints.md b/skills/databricks-model-serving/references/fm-api-endpoints.md index e35e3ee..83e6fb7 100644 --- a/skills/databricks-model-serving/references/fm-api-endpoints.md +++ b/skills/databricks-model-serving/references/fm-api-endpoints.md @@ -1,41 +1,50 @@ # Foundation Model API Endpoints -Pay-per-token Foundation Model API endpoints available in every workspace. Use the **exact endpoint name** from the tables below as `served_entities[].entity_name` (or as the model identifier when calling `serving-endpoints query`); never abbreviate or guess. +Pay-per-token Foundation Model API endpoints available in supported Model Serving regions; several endpoints (notably the Gemini family) require cross-geography routing — see the [supported models](https://docs.databricks.com/machine-learning/foundation-model-apis/supported-models) page for per-endpoint region and cross-geo notes. Use the **exact endpoint name** from the tables below as `served_entities[].entity_name` (or as the model identifier when calling `serving-endpoints query`); never abbreviate or guess. -For production-grade workloads, consider provisioned throughput mode. See the docs page for [supported models](https://docs.databricks.com/machine-learning/foundation-model-apis/supported-models). +For production-grade workloads, consider provisioned throughput mode. 
## Chat / Instruct Models | Endpoint Name | Provider | Notes | |--------------|----------|-------| -| `databricks-gpt-5-2` | OpenAI | Latest GPT, 400K context | -| `databricks-gpt-5-1` | OpenAI | Instant + Thinking modes | -| `databricks-gpt-5-1-codex-max` | OpenAI | Code-specialized (high perf) | -| `databricks-gpt-5-1-codex-mini` | OpenAI | Code-specialized (cost-opt) | +| `databricks-gpt-5-5-pro` | OpenAI | Multimodal, 400K context, extended prompt caching | +| `databricks-gpt-5-5` | OpenAI | Multimodal, 400K context, extended prompt caching | +| `databricks-gpt-5-4` | OpenAI | Multimodal, 400K context, general-purpose reasoning | +| `databricks-gpt-5-4-mini` | OpenAI | Multimodal, 400K context, cost-optimized | +| `databricks-gpt-5-4-nano` | OpenAI | Multimodal, 400K context, high-throughput | +| `databricks-gpt-5-3-codex` | OpenAI | Multimodal, 400K context, agentic coding (not in AI Playground) | +| `databricks-gpt-5-2-codex` | OpenAI | Code-specialized; **retiring 2026-07-16** | +| `databricks-gpt-5-2` | OpenAI | Multimodal, 400K context, general-purpose reasoning | +| `databricks-gpt-5-1` | OpenAI | Instant + Thinking modes, 400K context | +| `databricks-gpt-5-1-codex-max` | OpenAI | Code-specialized (high perf); global endpoint, cross-geo required; **retiring 2026-07-16** | +| `databricks-gpt-5-1-codex-mini` | OpenAI | Code-specialized (cost-opt); global endpoint, cross-geo required; **retiring 2026-07-16** | | `databricks-gpt-5` | OpenAI | 400K context, reasoning | | `databricks-gpt-5-mini` | OpenAI | Cost-optimized reasoning | | `databricks-gpt-5-nano` | OpenAI | High-throughput, lightweight | -| `databricks-gpt-oss-120b` | OpenAI | Open-weight, 128K context | -| `databricks-gpt-oss-20b` | OpenAI | Lightweight open-weight | -| `databricks-claude-opus-4-6` | Anthropic | Most capable, 1M context | -| `databricks-claude-sonnet-4-6` | Anthropic | Hybrid reasoning | -| `databricks-claude-sonnet-4-5` | Anthropic | Hybrid reasoning | -| 
`databricks-claude-opus-4-5` | Anthropic | Deep analysis, 200K context | -| `databricks-claude-sonnet-4` | Anthropic | Hybrid reasoning | -| `databricks-claude-opus-4-1` | Anthropic | 200K context, 32K output | -| `databricks-claude-haiku-4-5` | Anthropic | Fastest, cost-effective | -| `databricks-claude-3-7-sonnet` | Anthropic | Retiring April 2026 | +| `databricks-gpt-oss-120b` | OpenAI | Open-weight reasoning, 128K context | +| `databricks-gpt-oss-20b` | OpenAI | Lightweight open-weight, 128K context | +| `databricks-claude-opus-4-7` | Anthropic | Most capable Claude; 1M context, improved vision | +| `databricks-claude-opus-4-6` | Anthropic | Adaptive thinking with max-effort mode, 1M context | +| `databricks-claude-opus-4-5` | Anthropic | Hybrid reasoning, 200K context | +| `databricks-claude-opus-4-1` | Anthropic | General-purpose hybrid reasoning, 200K/32K output | +| `databricks-claude-sonnet-4-6` | Anthropic | Hybrid reasoning with two modes | +| `databricks-claude-sonnet-4-5` | Anthropic | Hybrid reasoning with two modes | +| `databricks-claude-sonnet-4` | Anthropic | Hybrid reasoning with two modes | +| `databricks-claude-haiku-4-5` | Anthropic | Fastest, most cost-effective Claude | | `databricks-meta-llama-3-3-70b-instruct` | Meta | 128K context, multilingual | -| `databricks-meta-llama-3-1-405b-instruct` | Meta | Retiring May 2026 (PT) | | `databricks-meta-llama-3-1-8b-instruct` | Meta | Lightweight, 128K context | -| `databricks-llama-4-maverick` | Meta | MoE architecture | -| `databricks-gemini-3-1-pro` | Google | 1M context, hybrid reasoning | -| `databricks-gemini-3-pro` | Google | 1M context, hybrid reasoning | -| `databricks-gemini-3-flash` | Google | Fast, cost-efficient | -| `databricks-gemini-2-5-pro` | Google | 1M context, Deep Think | -| `databricks-gemini-2-5-flash` | Google | 1M context, hybrid reasoning | -| `databricks-gemma-3-12b` | Google | 128K context, multilingual | -| `databricks-qwen3-next-80b-a3b-instruct` | Alibaba | Efficient 
MoE | +| `databricks-llama-4-maverick` | Meta | Multimodal, mixture-of-experts | +| `databricks-gemini-3-1-pro` | Google | 1M context; global endpoint, cross-geo required | +| `databricks-gemini-3-pro` | Google | **Retired 2026-03-26**; redirects to Gemini 3.1 Pro through 2026-06-07 | +| `databricks-gemini-3-1-flash-lite` | Google | Multimodal (text/image/video/audio); global endpoint, cross-geo required | +| `databricks-gemini-3-5-flash` | Google | Multimodal; cross-geo required outside US/EU | +| `databricks-gemini-3-flash` | Google | Multimodal; global endpoint, cross-geo required | +| `databricks-gemini-2-5-pro` | Google | 1M context, Deep Think mode, audio output | +| `databricks-gemini-2-5-flash` | Google | 1M context, fully hybrid reasoning | +| `databricks-gemma-3-12b` | Google | Multimodal, 128K context, 140+ languages | +| `databricks-qwen35-122b-a10b` | Alibaba | Reasoning-only (cannot disable), 256K context (Preview) | +| `databricks-qwen3-next-80b-a3b-instruct` | Alibaba | Ultra-long context, instruction-following (Preview) | ## Embedding Models @@ -49,7 +58,7 @@ For production-grade workloads, consider provisioned throughput mode. See the do - **Agent LLM**: `databricks-meta-llama-3-3-70b-instruct` — good balance of quality/cost. - **Embedding**: `databricks-gte-large-en`. -- **Code tasks**: `databricks-gpt-5-1-codex-mini` (cost-opt) or `databricks-gpt-5-1-codex-max` (high perf). +- **Code tasks**: `databricks-gpt-5-3-codex` (current agentic-coding endpoint; predecessor `databricks-gpt-5-2-codex` and the `databricks-gpt-5-1-codex-*` pair are retiring 2026-07-16). 
## Querying From c148500d9d4cc79ab1c3a7834f6e77f4fab1f713 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 26 May 2026 15:38:33 +0000 Subject: [PATCH 3/8] =?UTF-8?q?fix(model-serving):=20port=20from=20experim?= =?UTF-8?q?ental=20properly=20=E2=80=94=20runtime=20list,=20not=20static?= =?UTF-8?q?=20catalog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quentin pointed out (PR #84) that the prior two commits actually ported from `main:databricks-skills/databricks-model-serving/`, not `experimental:databricks-skills/databricks-ml-training-serving/` as the PR description claimed. The two skills take opposite approaches: - `main` ships a static catalog table of FM API endpoint names. - `experimental` deliberately rejects that ("a static skill list goes stale fast — always list at runtime instead of hard-coding names") and ships a `databricks serving-endpoints list | jq ...` one-liner plus runtime-resolved defaults (highest-numbered Claude Sonnet for agents, highest-numbered `-codex-max` for code). Re-port to match the experimental philosophy: - `references/fm-api-endpoints.md`: replace the static catalog with the runtime-list snippet (filtered by `databricks-` name prefix AND `system.ai.*` served entity, to exclude non-FM endpoints sharing the prefix), runtime-resolved family defaults, and CLI + SDK query examples that use a placeholder endpoint name rather than a hard-coded model. - `SKILL.md`: update the Endpoint Types row + the Foundation-Model discovery bullet to reframe the reference as "discover at runtime" rather than "curated table". Version stays at 0.2.0 (frontmatter unchanged → manifest unchanged). The 2026-05-26 catalog refresh in the previous commit is dropped here: the experimental skill's point is that no static table is the right shape, so curating one against docs.databricks.com isn't useful for the stable skill either. 
Co-authored-by: Isaac --- skills/databricks-model-serving/SKILL.md | 4 +- .../references/fm-api-endpoints.md | 83 ++++++------------- 2 files changed, 27 insertions(+), 60 deletions(-) diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index 7087c81..84e0fde 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -17,7 +17,7 @@ Model Serving provides managed endpoints for serving LLMs, custom ML models, and | Type | When to Use | Key Detail | |------|-------------|------------| -| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, simplest setup. Endpoint names in [references/fm-api-endpoints.md](references/fm-api-endpoints.md). | +| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, simplest setup. Discover endpoints at runtime — see [references/fm-api-endpoints.md](references/fm-api-endpoints.md). | | Provisioned throughput | Dedicated GPU capacity | Guaranteed throughput, higher cost | | Custom model | Your own MLflow models or containers | Deploy any model with an MLflow signature | @@ -74,7 +74,7 @@ databricks serving-endpoints create \ }' --profile ``` -- Discover available Foundation Models: see [references/fm-api-endpoints.md](references/fm-api-endpoints.md) for the curated table of pay-per-token endpoint names. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile ` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. 
+- Discover available Foundation Models: see [references/fm-api-endpoints.md](references/fm-api-endpoints.md) for the `databricks serving-endpoints list | jq ...` one-liner and runtime-resolved defaults (don't hard-code model names — new endpoints land regularly and old ones retire). The `system.ai` catalog in Unity Catalog is a second source of truth. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. - Long-running operation; the CLI waits for completion by default. Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get --profile diff --git a/skills/databricks-model-serving/references/fm-api-endpoints.md b/skills/databricks-model-serving/references/fm-api-endpoints.md index 83e6fb7..1d88be4 100644 --- a/skills/databricks-model-serving/references/fm-api-endpoints.md +++ b/skills/databricks-model-serving/references/fm-api-endpoints.md @@ -1,72 +1,39 @@ -# Foundation Model API Endpoints +# Foundation Model API endpoints -Pay-per-token Foundation Model API endpoints available in supported Model Serving regions; several endpoints (notably the Gemini family) require cross-geography routing — see the [supported models](https://docs.databricks.com/machine-learning/foundation-model-apis/supported-models) page for per-endpoint region and cross-geo notes. Use the **exact endpoint name** from the tables below as `served_entities[].entity_name` (or as the model identifier when calling `serving-endpoints query`); never abbreviate or guess. +Pay-per-token Foundation Model API endpoints are pre-provisioned in every supported workspace. New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. -For production-grade workloads, consider provisioned throughput mode. +Filter by the `databricks-` name prefix AND by the served entity living under `system.ai.*` — other endpoints (e.g. 
`databricks-app-template-serving`) share the prefix but aren't FM API endpoints. -## Chat / Instruct Models +```bash +# Foundation Model API endpoints in this workspace, grouped by task (chat / embeddings / etc.) +databricks serving-endpoints list --profile \ + | jq -r '.[] + | select(.name | startswith("databricks-")) + | select((.config.served_entities[0].entity_name // "") | startswith("system.ai.")) + | "\(.task)\t\(.name)"' \ + | sort +``` -| Endpoint Name | Provider | Notes | -|--------------|----------|-------| -| `databricks-gpt-5-5-pro` | OpenAI | Multimodal, 400K context, extended prompt caching | -| `databricks-gpt-5-5` | OpenAI | Multimodal, 400K context, extended prompt caching | -| `databricks-gpt-5-4` | OpenAI | Multimodal, 400K context, general-purpose reasoning | -| `databricks-gpt-5-4-mini` | OpenAI | Multimodal, 400K context, cost-optimized | -| `databricks-gpt-5-4-nano` | OpenAI | Multimodal, 400K context, high-throughput | -| `databricks-gpt-5-3-codex` | OpenAI | Multimodal, 400K context, agentic coding (not in AI Playground) | -| `databricks-gpt-5-2-codex` | OpenAI | Code-specialized; **retiring 2026-07-16** | -| `databricks-gpt-5-2` | OpenAI | Multimodal, 400K context, general-purpose reasoning | -| `databricks-gpt-5-1` | OpenAI | Instant + Thinking modes, 400K context | -| `databricks-gpt-5-1-codex-max` | OpenAI | Code-specialized (high perf); global endpoint, cross-geo required; **retiring 2026-07-16** | -| `databricks-gpt-5-1-codex-mini` | OpenAI | Code-specialized (cost-opt); global endpoint, cross-geo required; **retiring 2026-07-16** | -| `databricks-gpt-5` | OpenAI | 400K context, reasoning | -| `databricks-gpt-5-mini` | OpenAI | Cost-optimized reasoning | -| `databricks-gpt-5-nano` | OpenAI | High-throughput, lightweight | -| `databricks-gpt-oss-120b` | OpenAI | Open-weight reasoning, 128K context | -| `databricks-gpt-oss-20b` | OpenAI | Lightweight open-weight, 128K context | -| `databricks-claude-opus-4-7` | Anthropic | Most capable 
Claude; 1M context, improved vision | -| `databricks-claude-opus-4-6` | Anthropic | Adaptive thinking with max-effort mode, 1M context | -| `databricks-claude-opus-4-5` | Anthropic | Hybrid reasoning, 200K context | -| `databricks-claude-opus-4-1` | Anthropic | General-purpose hybrid reasoning, 200K/32K output | -| `databricks-claude-sonnet-4-6` | Anthropic | Hybrid reasoning with two modes | -| `databricks-claude-sonnet-4-5` | Anthropic | Hybrid reasoning with two modes | -| `databricks-claude-sonnet-4` | Anthropic | Hybrid reasoning with two modes | -| `databricks-claude-haiku-4-5` | Anthropic | Fastest, most cost-effective Claude | -| `databricks-meta-llama-3-3-70b-instruct` | Meta | 128K context, multilingual | -| `databricks-meta-llama-3-1-8b-instruct` | Meta | Lightweight, 128K context | -| `databricks-llama-4-maverick` | Meta | Multimodal, mixture-of-experts | -| `databricks-gemini-3-1-pro` | Google | 1M context; global endpoint, cross-geo required | -| `databricks-gemini-3-pro` | Google | **Retired 2026-03-26**; redirects to Gemini 3.1 Pro through 2026-06-07 | -| `databricks-gemini-3-1-flash-lite` | Google | Multimodal (text/image/video/audio); global endpoint, cross-geo required | -| `databricks-gemini-3-5-flash` | Google | Multimodal; cross-geo required outside US/EU | -| `databricks-gemini-3-flash` | Google | Multimodal; global endpoint, cross-geo required | -| `databricks-gemini-2-5-pro` | Google | 1M context, Deep Think mode, audio output | -| `databricks-gemini-2-5-flash` | Google | 1M context, fully hybrid reasoning | -| `databricks-gemma-3-12b` | Google | Multimodal, 128K context, 140+ languages | -| `databricks-qwen35-122b-a10b` | Alibaba | Reasoning-only (cannot disable), 256K context (Preview) | -| `databricks-qwen3-next-80b-a3b-instruct` | Alibaba | Ultra-long context, instruction-following (Preview) | +For per-endpoint region, cross-geo, and retirement notes, see the [supported 
models](https://docs.databricks.com/machine-learning/foundation-model-apis/supported-models) docs page. For production-grade workloads, consider provisioned throughput mode. -## Embedding Models +## Defaults when the user doesn't specify -| Endpoint Name | Dimensions | Max Tokens | Notes | -|--------------|-----------|------------|-------| -| `databricks-gte-large-en` | 1024 | 8192 | English, not normalized | -| `databricks-bge-large-en` | 1024 | 512 | English, normalized | -| `databricks-qwen3-embedding-0-6b` | up to 1024 | ~32K | 100+ languages, instruction-aware | +Resolve actual names from the live list above; pick by family rather than memorising a version: -## Common defaults +- **Agent / chat LLM**: highest-numbered `databricks-claude-sonnet-*` (good quality / latency / cost balance). +- **Code tasks**: highest-numbered `databricks-gpt-*-codex` available (older `databricks-gpt-5-1-codex-*` and `databricks-gpt-5-2-codex` are scheduled to retire 2026-07-16). +- **Embeddings**: `databricks-gte-large-en` (1024 dims, 8192 max tokens). -- **Agent LLM**: `databricks-meta-llama-3-3-70b-instruct` — good balance of quality/cost. -- **Embedding**: `databricks-gte-large-en`. -- **Code tasks**: `databricks-gpt-5-3-codex` (current agentic-coding endpoint; predecessor `databricks-gpt-5-2-codex` and the `databricks-gpt-5-1-codex-*` pair are retiring 2026-07-16). +The `system.ai` catalog in Unity Catalog is a second source of truth — endpoint names match the model identifiers under `system.ai.*`. ## Querying -Pay-per-token endpoints are pre-deployed; you don't need to create them. Query directly: +Pay-per-token endpoints are pre-deployed; you don't need to create them. 
Use the endpoint name resolved from the runtime list: ```bash -databricks serving-endpoints query databricks-meta-llama-3-3-70b-instruct \ - --json '{"messages":[{"role":"user","content":"hi"}]}' +databricks serving-endpoints query \ + --json '{"messages":[{"role":"user","content":"hi"}]}' \ + --profile ``` ```python @@ -74,9 +41,9 @@ from databricks.sdk import WorkspaceClient w = WorkspaceClient() response = w.serving_endpoints.query( - name="databricks-meta-llama-3-3-70b-instruct", + name="", messages=[{"role": "user", "content": "hi"}], ) ``` -For a full list of available models in the current workspace, browse the `system.ai` catalog in Unity Catalog. Endpoint names listed above match the model identifiers under `system.ai.*`. +Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's request/response schema before constructing non-chat payloads (e.g. embeddings). From 1604082336fdd3301e0be3f447178e8bd828b91f Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 26 May 2026 15:59:05 +0000 Subject: [PATCH 4/8] fix(model-serving): replace fm-api-endpoints.md with verbatim experimental port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous commit (c148500) restated the experimental section in my own words and added a "Querying" section + provisioned-throughput aside + docs-link gloss that aren't in the upstream skill. The PR's stated goal is to port from experimental — do an actual port, not a paraphrase. `references/fm-api-endpoints.md` now mirrors the `## Foundation Model API endpoints` section of `experimental:databricks-ml-training-serving/SKILL.md` verbatim (heading promoted from `##` to `#` since this is a standalone file): intro paragraph + the `databricks serving-endpoints list | jq ...` one-liner + the family-based default-picking rule. Nothing else. 
Also trim the SKILL.md discovery bullet back toward its original shape — link to the reference file for the runtime-list snippet, then the same `system.ai` / `serving-endpoints list` / `get-open-api` alternatives that were already there. Co-authored-by: Isaac --- skills/databricks-model-serving/SKILL.md | 2 +- .../references/fm-api-endpoints.md | 40 ++----------------- 2 files changed, 4 insertions(+), 38 deletions(-) diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index 84e0fde..379682b 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -74,7 +74,7 @@ databricks serving-endpoints create \ }' --profile ``` -- Discover available Foundation Models: see [references/fm-api-endpoints.md](references/fm-api-endpoints.md) for the `databricks serving-endpoints list | jq ...` one-liner and runtime-resolved defaults (don't hard-code model names — new endpoints land regularly and old ones retire). The `system.ai` catalog in Unity Catalog is a second source of truth. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. +- Discover available Foundation Models: see [references/fm-api-endpoints.md](references/fm-api-endpoints.md) for the runtime-list snippet and default-picking rules. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile ` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. - Long-running operation; the CLI waits for completion by default. 
Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get --profile diff --git a/skills/databricks-model-serving/references/fm-api-endpoints.md b/skills/databricks-model-serving/references/fm-api-endpoints.md index 1d88be4..123e04a 100644 --- a/skills/databricks-model-serving/references/fm-api-endpoints.md +++ b/skills/databricks-model-serving/references/fm-api-endpoints.md @@ -1,12 +1,10 @@ # Foundation Model API endpoints -Pay-per-token Foundation Model API endpoints are pre-provisioned in every supported workspace. New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. - -Filter by the `databricks-` name prefix AND by the served entity living under `system.ai.*` — other endpoints (e.g. `databricks-app-template-serving`) share the prefix but aren't FM API endpoints. +Pay-per-token, pre-provisioned in every workspace. New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. Filter by the `databricks-` name prefix AND by the served entity being in `system.ai.*` (other endpoints like `databricks-app-template-serving` share the prefix but aren't FM API endpoints). ```bash # Foundation Model API endpoints in this workspace, grouped by task (chat / embeddings / etc.) -databricks serving-endpoints list --profile \ +databricks serving-endpoints list \ | jq -r '.[] | select(.name | startswith("databricks-")) | select((.config.served_entities[0].entity_name // "") | startswith("system.ai.")) @@ -14,36 +12,4 @@ databricks serving-endpoints list --profile \ | sort ``` -For per-endpoint region, cross-geo, and retirement notes, see the [supported models](https://docs.databricks.com/machine-learning/foundation-model-apis/supported-models) docs page. For production-grade workloads, consider provisioned throughput mode. 
- -## Defaults when the user doesn't specify - -Resolve actual names from the live list above; pick by family rather than memorising a version: - -- **Agent / chat LLM**: highest-numbered `databricks-claude-sonnet-*` (good quality / latency / cost balance). -- **Code tasks**: highest-numbered `databricks-gpt-*-codex` available (older `databricks-gpt-5-1-codex-*` and `databricks-gpt-5-2-codex` are scheduled to retire 2026-07-16). -- **Embeddings**: `databricks-gte-large-en` (1024 dims, 8192 max tokens). - -The `system.ai` catalog in Unity Catalog is a second source of truth — endpoint names match the model identifiers under `system.ai.*`. - -## Querying - -Pay-per-token endpoints are pre-deployed; you don't need to create them. Use the endpoint name resolved from the runtime list: - -```bash -databricks serving-endpoints query \ - --json '{"messages":[{"role":"user","content":"hi"}]}' \ - --profile -``` - -```python -from databricks.sdk import WorkspaceClient - -w = WorkspaceClient() -response = w.serving_endpoints.query( - name="", - messages=[{"role": "user", "content": "hi"}], -) -``` - -Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's request/response schema before constructing non-chat payloads (e.g. embeddings). +**Defaults when the user doesn't specify**: pick the highest-numbered Claude Sonnet for agents, the highest-numbered `-codex-max` for code, `databricks-gte-large-en` for embeddings — resolve actual names from the live list above. From c4f3362e05aecc239f9cc438ccebe367ada6969d Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 26 May 2026 16:16:32 +0000 Subject: [PATCH 5/8] skills(model-serving): port full dev-side content from a-d-k experimental MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expands the port from the FM-endpoints-only scope to cover every section of `experimental:databricks-ml-training-serving/`. 
Mirrors the experimental skill's 3-file structure 1:1 into stable's `references/` directory; the standalone fm-api-endpoints.md added in earlier commits goes away (its content lives inline in training-and-serving.md exactly as it does in experimental's SKILL.md). Added (all verbatim ports, mechanical adjustments only): references/training-and-serving.md Ports experimental SKILL.md content. Mechanical changes only: frontmatter stripped (destination is a reference file, not a SKILL.md); `1-custom-pyfunc.md` → `custom-pyfunc.md`, `2-genai-agents.md` → `genai-agents.md` (filename renames); `..//SKILL.md` → `../..//SKILL.md` (one more level of nesting since this file is in references/ rather than at the skill root). Content covers: canonical train/register/serve flow, `mlflow.{sklearn,xgboost,…}.autolog()` patterns, UC alias-based promotion, batch scoring via `spark_udf`, real-time endpoint create + zero-downtime version swap, `state.ready` vs `state.config_update` poll-both gotcha, `jobs submit --no-wait` serverless deploy pattern, Foundation Model API endpoints runtime-list, and the full gotchas trap-table. references/custom-pyfunc.md Ports experimental 1-custom-pyfunc.md verbatim. Mechanical change: `[SKILL.md]` → `[training-and-serving.md]` where the original cross-referenced its parent SKILL.md. Content: file-based PyFunc ("Models from Code"), `infer_signature`, `code_paths`, pre-deploy validation via `mlflow.models.predict(env_manager="uv")`. references/genai-agents.md Ports experimental 2-genai-agents.md verbatim. Mechanical changes: cross-skill paths bumped one level deeper; `[SKILL.md]` → `[training-and-serving.md]`. 
Content covers: `ResponsesAgent` interface, LangGraph agent with `UCFunctionToolkit` + `VectorSearchRetrieverTool`, the `create_text_output_item` raw-dict-silently-fails gotcha, the `resources=[...]` passthrough-auth list (DatabricksServingEndpoint, DatabricksFunction, DatabricksVectorSearchIndex, DatabricksLakebase), async deploy via `agents.deploy()` from a serverless job, query via CLI and OpenAI-compatible client. Removed: references/fm-api-endpoints.md Standalone file from earlier commits; its content lives inline in training-and-serving.md exactly as it does in experimental's SKILL.md, so the deliberate split is no longer needed. Stable SKILL.md updates (minimal, ops-focus preserved): - FM-endpoint link targets updated from `references/fm-api-endpoints.md` to `references/training-and-serving.md#foundation-model-api-endpoints` in the Endpoint Types table row and the FM-discovery bullet. - New `### Develop & deploy new models` subsection under "What's Next" with a 3-row table pointing at the new dev-side references, framed as "this skill is ops-focused; for the dev-side flow, see below". Manifest regenerated. 
Co-authored-by: Isaac --- manifest.json | 6 +- skills/databricks-model-serving/SKILL.md | 14 +- .../references/custom-pyfunc.md | 106 +++++++ .../references/fm-api-endpoints.md | 15 - .../references/genai-agents.md | 237 ++++++++++++++ .../references/training-and-serving.md | 300 ++++++++++++++++++ 6 files changed, 659 insertions(+), 19 deletions(-) create mode 100644 skills/databricks-model-serving/references/custom-pyfunc.md delete mode 100644 skills/databricks-model-serving/references/fm-api-endpoints.md create mode 100644 skills/databricks-model-serving/references/genai-agents.md create mode 100644 skills/databricks-model-serving/references/training-and-serving.md diff --git a/manifest.json b/manifest.json index ffa6a0a..d98d970 100644 --- a/manifest.json +++ b/manifest.json @@ -99,8 +99,10 @@ "agents/openai.yaml", "assets/databricks.png", "assets/databricks.svg", - "references/fm-api-endpoints.md", - "references/off-platform-streaming.md" + "references/custom-pyfunc.md", + "references/genai-agents.md", + "references/off-platform-streaming.md", + "references/training-and-serving.md" ] }, "databricks-pipelines": { diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index 379682b..badff29 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -17,7 +17,7 @@ Model Serving provides managed endpoints for serving LLMs, custom ML models, and | Type | When to Use | Key Detail | |------|-------------|------------| -| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, simplest setup. Discover endpoints at runtime — see [references/fm-api-endpoints.md](references/fm-api-endpoints.md). | +| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, simplest setup. 
Discover endpoints at runtime — see [references/training-and-serving.md § Foundation Model API endpoints](references/training-and-serving.md#foundation-model-api-endpoints). | | Provisioned throughput | Dedicated GPU capacity | Guaranteed throughput, higher cost | | Custom model | Your own MLflow models or containers | Deploy any model with an MLflow signature | @@ -74,7 +74,7 @@ databricks serving-endpoints create \ }' --profile ``` -- Discover available Foundation Models: see [references/fm-api-endpoints.md](references/fm-api-endpoints.md) for the runtime-list snippet and default-picking rules. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile ` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. +- Discover available Foundation Models: see [references/training-and-serving.md § Foundation Model API endpoints](references/training-and-serving.md#foundation-model-api-endpoints) for the runtime-list snippet and default-picking rules. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile ` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api --profile ` to inspect a specific endpoint's API schema. - Long-running operation; the CLI waits for completion by default. Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get --profile @@ -164,6 +164,16 @@ env: Then add a tRPC route to call it from your app. For the full app integration pattern, use the **`databricks-apps`** skill and read the [Model Serving Guide](../databricks-apps/references/appkit/model-serving.md). +### Develop & deploy new models + +This skill is ops-focused (manage existing endpoints). For the dev-side flow — train a model, register to Unity Catalog, log a PyFunc or `ResponsesAgent`, deploy — see the references below. 
+ +| Reference | When to read | +|---|---| +| [references/training-and-serving.md](references/training-and-serving.md) | Train + register classical ML with `mlflow.autolog`, alias-based promotion (`@prod`), batch scoring via `spark_udf`, real-time endpoint create + zero-downtime version swap, async deploy via `jobs submit --no-wait`. Includes the Foundation Model API endpoints runtime-list and the gotchas table. | +| [references/custom-pyfunc.md](references/custom-pyfunc.md) | When `autolog` isn't enough — file-based `PythonModel` ("Models from Code"), `infer_signature`, `code_paths`, pre-deploy validation with `mlflow.models.predict(env_manager="uv")`. | +| [references/genai-agents.md](references/genai-agents.md) | Hand-rolled `ResponsesAgent` with LangGraph + `UCFunctionToolkit` + `VectorSearchRetrieverTool`. Includes the `create_text_output_item` helper-method gotcha and the `resources=[...]` passthrough-auth list. | + ## Troubleshooting | Error | Solution | diff --git a/skills/databricks-model-serving/references/custom-pyfunc.md b/skills/databricks-model-serving/references/custom-pyfunc.md new file mode 100644 index 0000000..8d85c3e --- /dev/null +++ b/skills/databricks-model-serving/references/custom-pyfunc.md @@ -0,0 +1,106 @@ +# Custom pyfunc model + +When sklearn / XGBoost autolog isn't enough: custom preprocessing not captured by a sklearn pipeline, multiple sub-models behind one endpoint, external API calls during inference, business-logic-heavy post-processing. + +Same UC registry + serving story as classical ML — only the *logging* step changes. + +## End-to-end example: file-based pyfunc with preprocessing + sub-model + +Project layout: + +``` +my_model/ +├── model.py # PythonModel + mlflow.models.set_model(...) +├── log_model.py # Logs + registers to UC +└── artifacts/ + ├── preprocessor.pkl + └── booster.json +``` + +```python +# model.py — logged verbatim via python_model="model.py" (Models from Code). 
+# DO NOT pickle a class instance; use this file-path pattern instead. +import json, pickle, pandas as pd +import mlflow +from mlflow.pyfunc import PythonModel + +class TurbineRiskModel(PythonModel): + def load_context(self, context): + with open(context.artifacts["preprocessor"], "rb") as f: + self.pre = pickle.load(f) + from xgboost import Booster + self.booster = Booster() + self.booster.load_model(context.artifacts["booster"]) + + def predict(self, context, model_input: pd.DataFrame, params=None) -> pd.DataFrame: + X = self.pre.transform(model_input) + proba = self.booster.predict(X) + return pd.DataFrame({ + "risk_score": proba, + "risk_level": ["HIGH" if p > 0.7 else "MEDIUM" if p > 0.4 else "LOW" for p in proba], + }) + +mlflow.models.set_model(TurbineRiskModel()) +``` + +```python +# log_model.py +import mlflow, pandas as pd +from mlflow.models import infer_signature +from mlflow.tracking import MlflowClient + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_experiment("/Users/me@example.com/turbine_risk") + +CATALOG, SCHEMA, NAME = "ai_demo_gen", "wind_farm", "turbine_risk" +FULL_NAME = f"{CATALOG}.{SCHEMA}.{NAME}" + +sample_input = pd.DataFrame({"vib_rms": [0.4], "rpm_mean": [18.2], "bearing_temp_max": [71.3]}) +sample_output = pd.DataFrame({"risk_score": [0.0], "risk_level": ["LOW"]}) + +with mlflow.start_run(): + info = mlflow.pyfunc.log_model( + name="model", + python_model="model.py", # file path, not an instance + artifacts={ + "preprocessor": "artifacts/preprocessor.pkl", + "booster": "artifacts/booster.json", + }, + signature=infer_signature(sample_input, sample_output), + input_example=sample_input, + # Pin exact versions — endpoint rebuilds the env from these: + pip_requirements=["mlflow==2.22.0", "xgboost==2.1.3", "scikit-learn==1.5.2", "pandas"], + # Extra modules to ship with the model (e.g.
shared util libs): + # code_paths=["src/utils.py"], + registered_model_name=FULL_NAME, + ) + +# Pre-deploy validation — rebuilds the env locally and runs predict(). +# Catches missing deps / signature drift BEFORE the endpoint does. +mlflow.models.predict( + model_uri=info.model_uri, + input_data=sample_input, + env_manager="uv", # MLflow ≥ 2.22; falls back to "virtualenv" otherwise +) + +# Promote to @prod +client = MlflowClient(registry_uri="databricks-uc") +v = max(client.search_model_versions(f"name='{FULL_NAME}'"), key=lambda x: int(x.version)).version +client.set_registered_model_alias(FULL_NAME, "prod", v) +``` + +**Why `python_model="model.py"`**: file logged verbatim, no class pickling — avoids Python-version unpickle crashes between training and serving runtimes. Pair with `code_paths=[...]` to ship companion modules; `mlflow.models.set_model(instance)` at end of file is the contract (exactly one call). + +## Consume + +Same two paths as autologged classical ML — see [training-and-serving.md](training-and-serving.md#consume-batch-scoring-over-delta). + +- **Batch**: `mlflow.pyfunc.spark_udf(spark, model_uri=f"models:/{FULL_NAME}@prod", env_manager="local")` over a Delta table. +- **Real-time**: `client.create_endpoint(...)` (see training-and-serving.md). Query returns a DataFrame-shaped JSON since `predict` returns a DataFrame. + +```bash +databricks serving-endpoints query turbine-risk-endpoint --json '{ + "dataframe_records": [{"vib_rms": 0.6, "rpm_mean": 19.0, "bearing_temp_max": 78.0}] +}' +# → {"predictions": [{"risk_score": 0.82, "risk_level": "HIGH"}]} +``` diff --git a/skills/databricks-model-serving/references/fm-api-endpoints.md b/skills/databricks-model-serving/references/fm-api-endpoints.md deleted file mode 100644 index 123e04a..0000000 --- a/skills/databricks-model-serving/references/fm-api-endpoints.md +++ /dev/null @@ -1,15 +0,0 @@ -# Foundation Model API endpoints - -Pay-per-token, pre-provisioned in every workspace. 
New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. Filter by the `databricks-` name prefix AND by the served entity being in `system.ai.*` (other endpoints like `databricks-app-template-serving` share the prefix but aren't FM API endpoints). - -```bash -# Foundation Model API endpoints in this workspace, grouped by task (chat / embeddings / etc.) -databricks serving-endpoints list \ - | jq -r '.[] - | select(.name | startswith("databricks-")) - | select((.config.served_entities[0].entity_name // "") | startswith("system.ai.")) - | "\(.task)\t\(.name)"' \ - | sort -``` - -**Defaults when the user doesn't specify**: pick the highest-numbered Claude Sonnet for agents, the highest-numbered `-codex-max` for code, `databricks-gte-large-en` for embeddings — resolve actual names from the live list above. diff --git a/skills/databricks-model-serving/references/genai-agents.md b/skills/databricks-model-serving/references/genai-agents.md new file mode 100644 index 0000000..8cc32cd --- /dev/null +++ b/skills/databricks-model-serving/references/genai-agents.md @@ -0,0 +1,237 @@ +# Custom GenAI agents with MLflow ResponsesAgent + +Edge case. **For most demos, use [databricks-agent-bricks](../../databricks-agent-bricks/SKILL.md)** — pre-built Knowledge Assistants and Supervisor Agents wire up Genie + KAs + tools without any agent code. Hand-roll a `ResponsesAgent` only when you need a custom orchestration the supervisor can't express (custom routing logic, multi-step plans, agent that calls another agent over HTTP). + +## What ResponsesAgent is + +MLflow 3's standardized agent interface. OpenAI-compatible request/response (`{input: [{role, content}]}` → `{output: [...]}`). Supports streaming. Logs with `python_model="agent.py"` (file-based) and deploys via `databricks.agents.deploy()` to a serving endpoint with built-in tracing and eval hooks. 
+ +## Full example: LangGraph agent with UC Function + Vector Search tools + +Project layout: + +``` +my_agent/ +├── agent.py # LangGraphAgent + tools + mlflow.models.set_model(...) +├── log_model.py # Logs with resources= for auto-auth, registers to UC +└── deploy_agent.py # Submitted as a job because deploy takes ~15 min +``` + +```python +# agent.py +import mlflow +from mlflow.pyfunc import ResponsesAgent +from mlflow.types.responses import ( + ResponsesAgentRequest, ResponsesAgentResponse, ResponsesAgentStreamEvent, + output_to_responses_items_stream, to_chat_completions_input, +) +from databricks_langchain import ( + ChatDatabricks, UCFunctionToolkit, VectorSearchRetrieverTool, +) +from langchain_core.messages import AIMessage +from langchain_core.runnables import RunnableLambda +from langgraph.graph import END, StateGraph +from langgraph.graph.message import add_messages +from langgraph.prebuilt.tool_node import ToolNode +from typing import Annotated, Generator, Sequence, TypedDict + +LLM_ENDPOINT = "databricks-claude-sonnet-4-6" # resolve at runtime — see training-and-serving.md +VS_INDEX = "ai_demo_gen.wind_farm.docs_index" +UC_FUNCTIONS = ["ai_demo_gen.wind_farm.lookup_turbine_history"] +SYSTEM_PROMPT = ( + "You are a turbine ops assistant. Use lookup_turbine_history for hardware " + "history queries, the docs retriever for procedure questions." +) + +class State(TypedDict): + messages: Annotated[Sequence, add_messages] + +class TurbineAgent(ResponsesAgent): + def __init__(self): + self.llm = ChatDatabricks(endpoint=LLM_ENDPOINT, temperature=0.1) + # Tools — UC functions and Vector Search both come from databricks_langchain. 
+ self.tools = list(UCFunctionToolkit(function_names=UC_FUNCTIONS).tools) + self.vs_tool = VectorSearchRetrieverTool( + index_name=VS_INDEX, num_results=5, + columns=["content", "doc_uri", "title"], + ) + self.tools.append(self.vs_tool) + self.llm_with_tools = self.llm.bind_tools(self.tools) + + def _graph(self): + def call_model(state): + msgs = [{"role": "system", "content": SYSTEM_PROMPT}] + state["messages"] + return {"messages": [self.llm_with_tools.invoke(msgs)]} + def should_continue(state): + last = state["messages"][-1] + return "tools" if isinstance(last, AIMessage) and last.tool_calls else "end" + + g = StateGraph(State) + g.add_node("agent", RunnableLambda(call_model)) + g.add_node("tools", ToolNode(self.tools)) + g.set_entry_point("agent") + g.add_conditional_edges("agent", should_continue, {"tools": "tools", "end": END}) + g.add_edge("tools", "agent") + return g.compile() + + def predict_stream(self, req: ResponsesAgentRequest) -> Generator[ResponsesAgentStreamEvent, None, None]: + msgs = to_chat_completions_input([m.model_dump() for m in req.input]) + for kind, payload in self._graph().stream({"messages": msgs}, stream_mode=["updates"]): + if kind != "updates": continue + for node in payload.values(): + if node.get("messages"): + yield from output_to_responses_items_stream(node["messages"]) + + def predict(self, req: ResponsesAgentRequest) -> ResponsesAgentResponse: + items = [ev.item for ev in self.predict_stream(req) + if ev.type == "response.output_item.done"] + return ResponsesAgentResponse(output=items) + +mlflow.langchain.autolog() +mlflow.models.set_model(TurbineAgent()) +``` + +### CRITICAL: output items must use helper methods + +The supervisor will silently drop your output if you return raw dicts: + +```python +# WRONG — raw dicts silently fail +return ResponsesAgentResponse(output=[{"role": "assistant", "content": "..."}]) + +# CORRECT +return ResponsesAgentResponse(output=[ + self.create_text_output_item(text="...", id="msg_1"), +]) +``` 
+ +Three helpers on `ResponsesAgent`: +- `self.create_text_output_item(text, id)` — text response. +- `self.create_function_call_item(id, call_id, name, arguments)` — tool call. +- `self.create_function_call_output_item(call_id, output)` — tool result. + +LangGraph's `output_to_responses_items_stream` (used above) emits these correctly, so the helpers are mainly relevant when hand-building events. + +## Log + register + +The non-obvious bit: `resources=[...]` is mandatory for auto-passthrough auth. Without it the deployed endpoint has no creds for the LLM, the UC functions, or the Vector Search index — every query returns `PERMISSION_DENIED` and the error doesn't explain why. + +```python +# log_model.py +import mlflow +from mlflow.models.resources import ( + DatabricksServingEndpoint, DatabricksFunction, DatabricksVectorSearchIndex, +) +from mlflow.tracking import MlflowClient +from agent import LLM_ENDPOINT, VS_INDEX, UC_FUNCTIONS + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_experiment("/Users/me@example.com/turbine_agent") + +FULL_NAME = "ai_demo_gen.wind_farm.turbine_agent" + +resources = [ + DatabricksServingEndpoint(endpoint_name=LLM_ENDPOINT), + DatabricksVectorSearchIndex(index_name=VS_INDEX), + *[DatabricksFunction(function_name=f) for f in UC_FUNCTIONS], +] + +with mlflow.start_run(): + info = mlflow.pyfunc.log_model( + name="agent", + python_model="agent.py", # file path; agent.py calls set_model() + resources=resources, # auto-auth — DO NOT skip + input_example={"input": [{"role": "user", "content": "What's the maintenance history for turbine WTG-12?"}]}, + pip_requirements=[ + "mlflow==2.22.0", + "databricks-langchain", + "langgraph==0.3.4", + "databricks-agents", + "pydantic>=2", + ], + registered_model_name=FULL_NAME, + ) + +# Pre-deploy validation — rebuild the env, run a request, surface failures early. 
+mlflow.models.predict( + model_uri=info.model_uri, + input_data={"input": [{"role": "user", "content": "ping"}]}, + env_manager="uv", +) + +client = MlflowClient(registry_uri="databricks-uc") +v = max(client.search_model_versions(f"name='{FULL_NAME}'"), key=lambda x: int(x.version)).version +client.set_registered_model_alias(FULL_NAME, "prod", v) +``` + +### Resources that need passthrough auth + +| Resource | Import (`mlflow.models.resources`) | +|---|---| +| Foundation Model API / custom serving endpoint | `DatabricksServingEndpoint(endpoint_name=...)` | +| UC SQL/Python function | `DatabricksFunction(function_name=...)` | +| Vector Search index | `DatabricksVectorSearchIndex(index_name=...)` | +| Lakebase Postgres | `DatabricksLakebase(database_instance_name=...)` | + +Anything the agent calls that isn't covered here will hit auth errors at the endpoint. + +## Deploy (async job, ~15 min) + +`databricks.agents.deploy()` blocks for ~15 minutes — don't run it inline from the CLI. Submit as a serverless job so the chat session doesn't hold the connection. + +```python +# deploy_agent.py +import json, sys +from databricks import agents + +model_name = sys.argv[1] +version = sys.argv[2] +endpoint_name = sys.argv[3] if len(sys.argv) > 3 else None + +# Always pass endpoint_name explicitly — auto-derived names are +# `agents_--` with dots → dashes, which is unpredictable. +kwargs = {"tags": {"aidevkit_project": "ai-dev-kit"}} +if endpoint_name: + kwargs["endpoint_name"] = endpoint_name + +deployment = agents.deploy(model_name, version, **kwargs) + +# Land structured output via dbutils.notebook.exit — print() unreliable on serverless. +dbutils.notebook.exit(json.dumps({ + "endpoint_name": deployment.endpoint_name, + "query_endpoint": deployment.query_endpoint, +})) +``` + +Submit via the same `jobs submit --no-wait` pattern shown in [training-and-serving.md](training-and-serving.md#train--deploy-as-a-serverless-job) — same script, just `deploy_agent.py` as the notebook. 
+ +## Query + +```bash +databricks serving-endpoints query turbine-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is the maintenance history for WTG-12?"}], + "max_tokens": 800 +}' +``` + +OpenAI-compatible client also works: + +```python +from openai import OpenAI +client = OpenAI( + base_url=f"{WORKSPACE_URL}/serving-endpoints", + api_key=DATABRICKS_TOKEN, +) +client.chat.completions.create( + model="turbine-agent-endpoint", + messages=[{"role": "user", "content": "..."}], +) +``` + +## Iteration + +`databricks workspace import-dir ./my_agent ... --overwrite` then re-run `log_model.py`. `agents.deploy()` with a new version **updates the existing endpoint in place** — no need to recreate. Re-deploy only when changing endpoint config (workload size, route splits). + +## Packages + +DBR 16.1+ has `mlflow` 3.x, `langchain`, `pydantic`, `databricks-sdk` pre-installed. You typically only need `%pip install -q databricks-langchain langgraph databricks-agents`. diff --git a/skills/databricks-model-serving/references/training-and-serving.md b/skills/databricks-model-serving/references/training-and-serving.md new file mode 100644 index 0000000..3b71474 --- /dev/null +++ b/skills/databricks-model-serving/references/training-and-serving.md @@ -0,0 +1,300 @@ +# ML Training & Serving on Databricks + +Train with MLflow → register to Unity Catalog → consume the **same artifact** as either a batch Spark UDF over Delta or a real-time REST endpoint (~5–15 min cold start, quota-bound — only when the user asks for per-request low-latency scoring). + +> **Always train on Databricks** (serverless job or notebook), never in the local Python process the agent is running in. Local training has no access to the silver tables, no MLflow tracking server, no UC registry path, and dies if the chat session drops — submit `databricks jobs submit --no-wait` (see "Train + deploy as a serverless job" below).
Only fall back to local execution if the user explicitly asks for it. + +| Consumption | When | How | +|---|---|---| +| **Batch UDF** | Dashboards, daily/hourly scores, precomputed ~daily predictions, read by Genie/Dashboards, or app (typically synched to a lakebase table) | `mlflow.pyfunc.spark_udf(...)` → `INSERT INTO gold_predictions` | +| **Real-time endpoint** | Score on a user action (fraud at authorization, rec at page load) — sub-100ms | `mlflow.deployments.get_deploy_client()` (classical) / `agents.deploy()` (agents) | + +## Canonical flow + +``` +silver_ + silver_ + ▼ + notebook (as a serverless job): + ├── train with mlflow.autolog (XGBoost / sklearn / etc.) + ├── mlflow.register_model → UC: {catalog}.{schema}.{model} + ├── set_registered_model_alias(name, "prod", version) + └── spark_udf(@prod) over latest features → MERGE into gold_predictions + ▼ +gold__predictions ◄── dashboards, apps, Genie read this +``` + +One notebook, one artifact. Re-running = retraining. Gold is where truth lives — read paths never call the model directly. Keep label-window logic (`failure occurred within 7 days`) in the notebook during dev; once stable, promote to a silver materialized view in SDP. + +--- + +## Train and register (the 90% case) + +`mlflow.autolog()` captures params, metrics, code, and the model artifact for every run; `registered_model_name=...` auto-registers the best run to UC (auto-incremented version). Wrap training with **Optuna** so each trial is a child run and the best one is what gets registered. + +**Always `mlflow.set_registry_uri("databricks-uc")`** — without it, models land in the deprecated workspace registry. **The experiment's parent folder must exist** — `set_experiment` does NOT auto-create it (fails with `NOT_FOUND: Parent directory does not exist`). Pre-create it once with `databricks workspace mkdirs` before the job runs. + +```bash +# Once per project — create the parent folder for the MLflow experiment. 
+databricks workspace mkdirs /Users/me@example.com/turbine_project +``` + +```python +import mlflow, mlflow.xgboost, optuna +from mlflow.tracking import MlflowClient +from xgboost import XGBClassifier +from sklearn.metrics import roc_auc_score + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_experiment("/Users/me@example.com/turbine_project/mlflow_experiment") + +CATALOG, SCHEMA, NAME = "ai_demo_gen", "wind_farm", "turbine_failure" +FULL_NAME = f"{CATALOG}.{SCHEMA}.{NAME}" + +mlflow.xgboost.autolog(log_input_examples=True, registered_model_name=FULL_NAME) + +# For imbalanced labels: stratify the split, set scale_pos_weight = neg/pos. +def objective(trial): + params = { + "n_estimators": trial.suggest_int("n_estimators", 100, 400), + "max_depth": trial.suggest_int("max_depth", 3, 10), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True), + } + with mlflow.start_run(nested=True): + m = XGBClassifier(**params).fit(X_train, y_train) + return roc_auc_score(y_test, m.predict_proba(X_test)[:, 1]) + +with mlflow.start_run(run_name="hpo"): + optuna.create_study(direction="maximize").optimize(objective, n_trials=20) + +# Move @prod alias to the just-registered version. Stages are deprecated — aliases only. +client = MlflowClient(registry_uri="databricks-uc") +latest = max(client.search_model_versions(f"name='{FULL_NAME}'"), + key=lambda v: int(v.version)) +client.set_registered_model_alias(FULL_NAME, "prod", latest.version) +``` + +**Framework autolog**: `mlflow.{sklearn,xgboost,lightgbm,pytorch,tensorflow,spark}.autolog()`. + +**Aliases, not stages**: UC dropped `Staging`/`Production`. Use movable `@prod`/`@challenger`; load with `models:/{full_name}@prod`. Promoting a new version is one `set_registered_model_alias` call. + +--- + +## Consume: batch scoring over Delta + +The cheap, default path. Load the registered model as a Spark UDF and score a Delta table; write predictions to a gold table that downstream consumers read. 
+ +```python +import mlflow + +# env_manager rules: +# "local" → same runtime as training (same notebook/job). Fastest for the demo, keep that. +# "virtualenv"→ different runtime than training; rebuilds the model's env. +# "uv" → same as virtualenv but faster (MLflow ≥ 2.22). +predict = mlflow.pyfunc.spark_udf( + spark, + model_uri=f"models:/{FULL_NAME}@prod", + env_manager="local", +) + +features = spark.table(f"{CATALOG}.{SCHEMA}.silver_turbine_features_latest") +scored = features.withColumn("risk_score", predict(*[features[c] for c in feature_cols])) + +# Overwrite-per-run pattern for "latest score per entity": +scored.select("turbine_id", "risk_score", F.current_timestamp().alias("scored_at")) \ + .write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.gold_turbine_predictions") +``` + +For incremental scoring with history, MERGE into the predictions table instead of overwriting. + +--- + +## Consume: real-time serving endpoint (only when required) + +Use the MLflow Deployments client. `workload_size: "Small"` + `scale_to_zero_enabled: true` is the default for demos and dev. First deploy can take ~5 min for classical ML. + +```python +from mlflow.deployments import get_deploy_client + +client = get_deploy_client("databricks") +client.create_endpoint( + name="turbine-risk-endpoint", + config={ + "served_entities": [{ + "entity_name": FULL_NAME, + "entity_version": latest.version, + "workload_size": "Small", + "scale_to_zero_enabled": True, # Always + }], + # served_model_name = "<model-name>-<version>"; the API auto-derives it but + # you reference this exact string in traffic_config. + "traffic_config": {"routes": [ + {"served_model_name": f"{NAME}-{latest.version}", "traffic_percentage": 100} + ]}, + }, + # Tags are TOP-LEVEL — NOT inside `config`. Same {key, value} shape used + # by `serving-endpoints patch --add-tags`. Tag every demo resource for cleanup.
+ tags=[{"key": "aidevkit_project", "value": "ai-dev-kit"}], +) +``` + +**Zero-downtime version swap.** Repoint the alias *and* call `update_endpoint`: + +```python +client.set_registered_model_alias(FULL_NAME, "prod", new_version) +client.update_endpoint(endpoint="turbine-risk-endpoint", config={ + "served_entities": [{"entity_name": FULL_NAME, "entity_version": new_version, + "workload_size": "Small", "scale_to_zero_enabled": True}], + "traffic_config": {"routes": [ + {"served_model_name": f"{NAME}-{new_version}", "traffic_percentage": 100} + ]}, +}) +``` + +### Endpoint management (CLI) + +```bash +databricks serving-endpoints list +databricks serving-endpoints get turbine-risk-endpoint +databricks serving-endpoints delete turbine-risk-endpoint + +# Query a classical ML endpoint +databricks serving-endpoints query turbine-risk-endpoint --json '{ + "dataframe_records": [{"vibration": 0.42, "rpm": 18.3, "temp_c": 71.2}] +}' + +# Query a chat/agent endpoint +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role":"user","content":"Hello"}], "max_tokens": 500 +}' + +# Tag for project tracking +databricks serving-endpoints patch turbine-risk-endpoint --json '{ + "add_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}] +}' +``` + +### Readiness has TWO state fields + +`databricks serving-endpoints get` returns both: + +- `state.ready` — `READY` once the endpoint has any working config (first deploy). +- `state.config_update` — `NOT_UPDATING` once the *current* config update finishes; `IN_PROGRESS` during a version swap. + +A loop watching only `state.ready` will say "ready" mid version-swap while the old version is still serving. 
Poll **both**: + +```bash +databricks serving-endpoints get turbine-risk-endpoint \ + | jq '{ready: .state.ready, config_update: .state.config_update}' +``` + +--- + +## Train + deploy as a serverless job + +Training notebooks run a few minutes (Optuna + UC register; endpoint warmup adds 5–15 min if you also deploy). Submit as a serverless one-time run so the CLI doesn't block. The notebook ends with `dbutils.notebook.exit(json.dumps({...}))` so the structured result (`model_version`, `val_auc`, `endpoint_name`) reaches `.notebook_output.result`. + +```bash +# 1. Upload the training notebook +databricks workspace import /Workspace/Users/me@example.com/turbine_project/train \ + --file ./train_notebook.py --format SOURCE --language PYTHON --overwrite + +# 2. Submit as serverless one-time run (returns {"run_id": N} immediately with --no-wait) +RUN_ID=$(databricks jobs submit --no-wait --json '{ + "run_name": "turbine-train-and-deploy", + "tasks": [{ + "task_key": "train", + "notebook_task": {"notebook_path": "/Workspace/Users/me@example.com/turbine_project/train"}, + "environment_key": "ml_env" + }], + "environments": [{ + "environment_key": "ml_env", + "spec": { + "client": "4", + "dependencies": ["mlflow==2.22.0", "xgboost==2.1.3", "optuna==4.1.0", "scikit-learn==1.5.2"] + } + }] +}' | jq -r .run_id) + +# 3. Poll until a terminal life_cycle_state. +for _ in $(seq 60); do + STATE=$(databricks jobs get-run "$RUN_ID" | jq -r '.state.life_cycle_state // "UNKNOWN"') + echo "$(date +%H:%M:%S) $STATE" + [[ "$STATE" =~ ^(TERMINATED|SKIPPED|INTERNAL_ERROR)$ ]] && break + sleep 30 +done +[[ "$STATE" =~ ^(TERMINATED|SKIPPED|INTERNAL_ERROR)$ ]] || { databricks jobs cancel-run "$RUN_ID"; exit 1; } + +# life_cycle_state TERMINATED only means "the run ended" — check result_state +# (SUCCESS / FAILED / TIMEDOUT / CANCELED / SUCCESS_WITH_FAILURES / …) for outcome. 
+RESULT=$(databricks jobs get-run "$RUN_ID" | jq -r '.state.result_state // "UNKNOWN"') +echo "result_state=$RESULT" +[[ "$RESULT" == "SUCCESS" ]] || { echo "Run did not succeed"; exit 1; } + +# 4. Pull structured output via the TASK run_id (NOT the submit run_id). +TASK_RUN_ID=$(databricks jobs get-run "$RUN_ID" | jq -r '.tasks[0].run_id') +databricks jobs get-run-output "$TASK_RUN_ID" | jq '.notebook_output.result' +# → '{"model_version":"3","val_auc":0.91,"rows_scored":124,"endpoint":"turbine-risk-endpoint"}' +``` + +**Serving UI hides SP-owned endpoints by default.** If the deploy ran as a service principal, the Serving page won't show the new endpoint until you switch from "Owned by me" to "All". Or just `databricks serving-endpoints list`. + +For the four `jobs submit` traps (`spec.client: "4"` requirement, TASK-vs-submit run_id, `print()` unreliable, tags rejected) and full debugging flow, see **[databricks-jobs](../../databricks-jobs/SKILL.md#one-time-runs-jobs-submit--async-pattern-for-notebooks)**. + +--- + +## Custom pyfunc + +When sklearn/XGBoost autolog isn't enough — custom preprocessing, multiple sub-models, external API calls, ensemble logic. See **[custom-pyfunc.md](custom-pyfunc.md)** for a full worked example. Two non-obvious things: + +- **`python_model="path/to/file.py"`** (file path, not class instance) + `mlflow.models.set_model(MyModel())` at the end of that file. This is the "Models from Code" pattern — the file is logged verbatim, no pickling of the class. +- **`mlflow.models.predict(model_uri=..., input_data=..., env_manager="uv")`** before deploying. Catches missing deps before the endpoint does. + +--- + +## Foundation Model API endpoints + +Pay-per-token, pre-provisioned in every workspace. New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. 
Filter by the `databricks-` name prefix AND by the served entity being in `system.ai.*` (other endpoints like `databricks-app-template-serving` share the prefix but aren't FM API endpoints). + +```bash +# Foundation Model API endpoints in this workspace, grouped by task (chat / embeddings / etc.) +databricks serving-endpoints list \ + | jq -r '.[] + | select(.name | startswith("databricks-")) + | select((.config.served_entities[0].entity_name // "") | startswith("system.ai.")) + | "\(.task)\t\(.name)"' \ + | sort +``` + +**Defaults when the user doesn't specify**: pick the highest-numbered Claude Sonnet for agents, the highest-numbered `-codex-max` for code, `databricks-gte-large-en` for embeddings — resolve actual names from the live list above. + +--- + +## Gotchas (the ones that cost time) + +| Trap | Fix | +|---|---| +| Model lands in workspace registry, not UC | `mlflow.set_registry_uri("databricks-uc")` *before* logging | +| Endpoint returns PERMISSION_DENIED at first query | Pass `resources=[...]` to `log_model` (covers UC functions, VS indexes, other endpoints, Lakebase) — see [genai-agents.md](genai-agents.md#resources-that-need-passthrough-auth) for the full list | +| Used `transition_model_version_stage` | Stages are deprecated in UC. Use `client.set_registered_model_alias(name, "prod", version)` | +| `spark_udf` rebuilds a virtualenv on every call | Pass `env_manager="local"` when training+scoring share a runtime | +| Endpoint version swap says "ready" but old version still serving | Poll **both** `state.ready` AND `state.config_update` — see "Readiness has TWO state fields" | +| `pip_requirements` mismatch crashes endpoint at load | Pin exact versions; or pull live with `f"mlflow=={get_distribution('mlflow').version}"` | +| `agents.deploy()` produced a weirdly-named endpoint | Pass `endpoint_name=...` explicitly. Auto-derived name is `agents_--` | +| Endpoint missing from Serving UI | UI filter defaults to "Owned by me"; deploy jobs run as SP. 
Switch to "All" or use `serving-endpoints list` | + +--- + +## Reference files + +| File | Contents | +|---|---| +| [custom-pyfunc.md](custom-pyfunc.md) | Single end-to-end custom pyfunc example: artifacts, signature, code_paths, log → register → deploy → query. | +| [genai-agents.md](genai-agents.md) | Edge case: deploying a LangGraph `ResponsesAgent` with UC Function + Vector Search tools. For supervised multi-agent tiles, use **databricks-agent-bricks** instead. | + +## Related skills + +- **[databricks-agent-bricks](../../databricks-agent-bricks/SKILL.md)** — no-code Knowledge Assistants and Supervisor Agents. Prefer this over hand-rolling agents. +- **[databricks-mlflow-evaluation](../../databricks-mlflow-evaluation/SKILL.md)** — evaluate model/agent quality before promoting `@prod`. +- **[databricks-vector-search](../../databricks-vector-search/SKILL.md)** — vector indexes used as retrieval tools in agents. +- **[databricks-jobs](../../databricks-jobs/SKILL.md)** — async deploy pattern (`--no-wait`, TASK run_id trap). +- **[databricks-unity-catalog](../../databricks-unity-catalog/SKILL.md)** — UC governs the registered model: permissions, lineage, audit. From afa0d90a2ee692958743cd699ee7d69cbc0b3c77 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Wed, 27 May 2026 08:20:10 +0000 Subject: [PATCH 6/8] fix(model-serving): repair cross-skill links + broaden frontmatter scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - The mechanical `../` → `../../` rewrite in the verbatim port assumed every peer skill is stable, but 4 of them live in `experimental/`. `../../<skill>/SKILL.md` resolved to `skills/<skill>/SKILL.md`, which does not exist for `databricks-agent-bricks`, `databricks-mlflow-evaluation`, `databricks-vector-search`, `databricks-unity-catalog`. Repointed to `../../../experimental/<skill>/SKILL.md`. `databricks-jobs` link unchanged (it's stable).
- SKILL.md frontmatter `description` only described the ops surface, so agents wouldn't route dev-side asks (train, register, PyFunc, ResponsesAgent) to this skill. Broadened to cover both ops and the new dev surface. - Version bumped 0.2.0 → 0.3.0 + manifest regenerated. Co-authored-by: Isaac --- manifest.json | 2 +- skills/databricks-model-serving/SKILL.md | 4 ++-- .../databricks-model-serving/references/genai-agents.md | 2 +- .../references/training-and-serving.md | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/manifest.json b/manifest.json index d98d970..71ece5a 100644 --- a/manifest.json +++ b/manifest.json @@ -91,7 +91,7 @@ ] }, "databricks-model-serving": { - "version": "0.2.0", + "version": "0.3.0", "description": "Databricks Model Serving endpoint management", "repo_dir": "skills", "files": [ diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index badff29..52213e1 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -1,9 +1,9 @@ --- name: databricks-model-serving -description: "Manage Databricks Model Serving endpoints via CLI. Use when asked to create, configure, query, or manage model serving endpoints for LLM inference, custom models, or external models." +description: "Databricks Model Serving (ops) and MLflow model development (dev). Use when asked to: manage serving endpoints via CLI (create, query, update-config, scale, version-swap, integrate with Apps); train classical ML with MLflow autolog (sklearn/XGBoost/LightGBM/PyTorch); register models to Unity Catalog and promote via `@prod`/`@challenger` aliases; load registered models for batch scoring via `mlflow.pyfunc.spark_udf`; build custom PyFunc models (Models from Code); deploy a LangGraph `ResponsesAgent` with UC Function or Vector Search tools; discover Foundation Model API endpoints at runtime. 
NOT for: no-code Knowledge Assistants or Supervisor Agents (use databricks-agent-bricks); MLflow evaluation / scorers (use databricks-mlflow-evaluation)." compatibility: Requires databricks CLI (>= v0.294.0) metadata: - version: "0.2.0" + version: "0.3.0" parent: databricks-core --- diff --git a/skills/databricks-model-serving/references/genai-agents.md b/skills/databricks-model-serving/references/genai-agents.md index 8cc32cd..d635872 100644 --- a/skills/databricks-model-serving/references/genai-agents.md +++ b/skills/databricks-model-serving/references/genai-agents.md @@ -1,6 +1,6 @@ # Custom GenAI agents with MLflow ResponsesAgent -Edge case. **For most demos, use [databricks-agent-bricks](../../databricks-agent-bricks/SKILL.md)** — pre-built Knowledge Assistants and Supervisor Agents wire up Genie + KAs + tools without any agent code. Hand-roll a `ResponsesAgent` only when you need a custom orchestration the supervisor can't express (custom routing logic, multi-step plans, agent that calls another agent over HTTP). +Edge case. **For most demos, use [databricks-agent-bricks](../../../experimental/databricks-agent-bricks/SKILL.md)** — pre-built Knowledge Assistants and Supervisor Agents wire up Genie + KAs + tools without any agent code. Hand-roll a `ResponsesAgent` only when you need a custom orchestration the supervisor can't express (custom routing logic, multi-step plans, agent that calls another agent over HTTP). ## What ResponsesAgent is diff --git a/skills/databricks-model-serving/references/training-and-serving.md b/skills/databricks-model-serving/references/training-and-serving.md index 3b71474..3078a2d 100644 --- a/skills/databricks-model-serving/references/training-and-serving.md +++ b/skills/databricks-model-serving/references/training-and-serving.md @@ -293,8 +293,8 @@ databricks serving-endpoints list \ ## Related skills -- **[databricks-agent-bricks](../../databricks-agent-bricks/SKILL.md)** — no-code Knowledge Assistants and Supervisor Agents. 
Prefer this over hand-rolling agents. -- **[databricks-mlflow-evaluation](../../databricks-mlflow-evaluation/SKILL.md)** — evaluate model/agent quality before promoting `@prod`. -- **[databricks-vector-search](../../databricks-vector-search/SKILL.md)** — vector indexes used as retrieval tools in agents. +- **[databricks-agent-bricks](../../../experimental/databricks-agent-bricks/SKILL.md)** — no-code Knowledge Assistants and Supervisor Agents. Prefer this over hand-rolling agents. +- **[databricks-mlflow-evaluation](../../../experimental/databricks-mlflow-evaluation/SKILL.md)** — evaluate model/agent quality before promoting `@prod`. +- **[databricks-vector-search](../../../experimental/databricks-vector-search/SKILL.md)** — vector indexes used as retrieval tools in agents. - **[databricks-jobs](../../databricks-jobs/SKILL.md)** — async deploy pattern (`--no-wait`, TASK run_id trap). -- **[databricks-unity-catalog](../../databricks-unity-catalog/SKILL.md)** — UC governs the registered model: permissions, lineage, audit. +- **[databricks-unity-catalog](../../../experimental/databricks-unity-catalog/SKILL.md)** — UC governs the registered model: permissions, lineage, audit. From 8c8a1b39f4e41c933930b30d323b06cb012f694d Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Wed, 27 May 2026 14:38:19 +0000 Subject: [PATCH 7/8] skills(model-serving): add idempotency check before deploy submit Per @simonfaltum review: before resubmitting a deploy serverless job, agents should check whether a run is already in flight (active job runs filtered on run_name) or whether the target endpoint already exists in the right state. Avoids wasting ~15 min of serverless and racing for the same endpoint name. 
Co-authored-by: Isaac --- .../references/genai-agents.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/skills/databricks-model-serving/references/genai-agents.md b/skills/databricks-model-serving/references/genai-agents.md index d635872..b4573ae 100644 --- a/skills/databricks-model-serving/references/genai-agents.md +++ b/skills/databricks-model-serving/references/genai-agents.md @@ -179,6 +179,20 @@ Anything the agent calls that isn't covered here will hit auth errors at the end `databricks.agents.deploy()` blocks for ~15 minutes — don't run it inline from the CLI. Submit as a serverless job so the chat session doesn't hold the connection. +**Before submitting, check whether a deploy is already in flight or already done.** Re-submitting on top of a running deploy wastes ~15 min of serverless and can race for the same endpoint name. + +```bash +# 1. Is a deploy_agent run already active for this model? Match on run_name. +databricks jobs list-runs --active-only --output json \ + | jq --arg name "deploy_${MODEL_NAME}" '.runs[]? | select(.run_name == $name) | {run_id, state}' + +# 2. Does the target endpoint already exist? If READY on the right version, skip the redeploy. +databricks serving-endpoints get "${ENDPOINT_NAME}" 2>/dev/null \ + | jq '{ready: .state.ready, served: [.config.served_models[] | {name, model_version}]}' +``` + +If either check returns a hit, follow the existing run with `jobs get-run <run_id>` instead of submitting a new one.
+ ```python # deploy_agent.py import json, sys From 43341ca428efe7a21d6fef62b5c697006211d37c Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Thu, 28 May 2026 10:34:04 +0200 Subject: [PATCH 8/8] skills: split databricks-model-serving into ops + experimental databricks-ml-training Splits the post-port databricks-model-serving skill into two skills with clean responsibility boundaries: databricks-model-serving keeps the endpoint lifecycle / ops surface, and a new experimental databricks-ml-training owns the dev-side training, MLflow tracking, UC registration, custom PyFunc, and hand-rolled ResponsesAgent content. Also closes five small gaps in databricks-model-serving where non-obvious serving behavior from the original a-d-k port had fallen through the cracks (Python deployments client gotchas, zero-downtime version swap, two-field readiness rationale, classical-ML query shape, Serving-UI SP filter). Co-authored-by: Isaac --- .../databricks-ml-training/SKILL.md | 181 +++++++----------- .../databricks-ml-training/agents/openai.yaml | 7 + .../assets/databricks.png | Bin 0 -> 15366 bytes .../assets/databricks.svg | 3 + .../references/custom-pyfunc.md | 4 +- .../references/genai-agents.md | 4 +- manifest.json | 20 +- skills/databricks-model-serving/SKILL.md | 89 +++++++-- 8 files changed, 168 insertions(+), 140 deletions(-) rename skills/databricks-model-serving/references/training-and-serving.md => experimental/databricks-ml-training/SKILL.md (57%) create mode 100644 experimental/databricks-ml-training/agents/openai.yaml create mode 100644 experimental/databricks-ml-training/assets/databricks.png create mode 100644 experimental/databricks-ml-training/assets/databricks.svg rename {skills/databricks-model-serving => experimental/databricks-ml-training}/references/custom-pyfunc.md (91%) rename {skills/databricks-model-serving => experimental/databricks-ml-training}/references/genai-agents.md (94%) diff --git 
a/skills/databricks-model-serving/references/training-and-serving.md b/experimental/databricks-ml-training/SKILL.md similarity index 57% rename from skills/databricks-model-serving/references/training-and-serving.md rename to experimental/databricks-ml-training/SKILL.md index 3078a2d..9fd85fe 100644 --- a/skills/databricks-model-serving/references/training-and-serving.md +++ b/experimental/databricks-ml-training/SKILL.md @@ -1,15 +1,28 @@ -# ML Training & Serving on Databricks +--- +name: databricks-ml-training +description: "Classical ML and custom-agent model training, MLflow tracking, and Unity Catalog model registration on Databricks. Use when the user asks to: train models (with MLflow, sklearn, XGBoost, LightGBM, PyTorch, custom pyfunc, etc.); run hyperparameter tuning with Optuna; register models to Unity Catalog and promote versions with `@prod` / `@challenger` aliases; load a registered model for batch scoring via `mlflow.pyfunc.spark_udf`; run batch inference; build custom MLflow PyFunc models (Models from Code); author a custom MLflow `ResponsesAgent` (LangGraph, OpenAI-compatible chat) with UC Function or Vector Search tools. NOT for: managing existing serving endpoints (use databricks-model-serving); no-code Knowledge Assistants or Supervisor Agents (use databricks-agent-bricks); MLflow evaluation / scorers (use databricks-mlflow-evaluation)." +compatibility: Requires databricks CLI (>= v0.294.0) +metadata: + version: "0.1.0" +parent: databricks-core +--- + +# ML Training on Databricks -Train with MLflow → register to Unity Catalog → consume the **same artifact** as either a batch Spark UDF over Delta or a real-time REST endpoint (~5–15 min cold start, quota-bound — only when the user asks for per-request low-latency scoring). +**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, and profile selection.
+ +Train with MLflow → register to Unity Catalog → consume the **same artifact** as either a batch Spark UDF over Delta or (when low-latency is required) a real-time serving endpoint. > **Always train on Databricks** (serverless job or notebook), never in the local Python process the agent is running in. Local training has no access to the silver tables, no MLflow tracking server, no UC registry path, and dies if the chat session drops — submit `databricks jobs submit --no-wait` (see "Train + deploy as a serverless job" below). Only fall back to local execution if the user explicitly asks for it. +If you need to deploy a real time model serving endpoint **after** the model is registered (creating endpoints, traffic config, version-swapping, querying, Foundation Model API endpoints), see [databricks-model-serving](../../skills/databricks-model-serving/SKILL.md). + | Consumption | When | How | |---|---|---| -| **Batch UDF** | Dashboards, daily/hourly scores, precomputed ~daily predictions, read by Genie/Dashboards, or app (typically synched to a lakebase table) | `mlflow.pyfunc.spark_udf(...)` → `INSERT INTO gold_predictions` | -| **Real-time endpoint** | Score on a user action (fraud at authorization, rec at page load) — sub-100ms | `mlflow.deployments.get_deploy_client()` (classical) / `agents.deploy()` (agents) | +| **Batch UDF** | Dashboards, daily/hourly scores, predictions read by Genie/Dashboards or an app (often synced to a Lakebase table) | `mlflow.pyfunc.spark_udf(...)` → `INSERT INTO gold_predictions` | +| **Real-time endpoint** | Score on a user action (fraud at authorization, rec at page load) — sub-100ms | `mlflow.deployments.get_deploy_client()` (classical) / `agents.deploy()` (agents). Endpoint lifecycle: see [databricks-model-serving](../../skills/databricks-model-serving/SKILL.md). | -## Canonical flow +## Default Canonical flow ``` silver_ + silver_ @@ -38,7 +51,26 @@ One notebook, one artifact. Re-running = retraining. 
Gold is where truth lives databricks workspace mkdirs /Users/me@example.com/turbine_project ``` +Use the Databricks notebook source format (`# Databricks notebook source` header, `# COMMAND ----------` separators, `# MAGIC %md`/`%sql` magics for markdown/SQL cells): + ```python +# Databricks notebook source +# MAGIC %md +# MAGIC # Turbine failure prediction +# MAGIC +# MAGIC Train an XGBoost classifier on engineered turbine telemetry features. +# MAGIC ## Data exploration + +# COMMAND ---------- + +# (basic data exploration — class balance, schema sanity, etc.) + +# COMMAND ---------- +# MAGIC %md +# MAGIC ## Training the model + +# COMMAND ---------- + import mlflow, mlflow.xgboost, optuna from mlflow.tracking import MlflowClient from xgboost import XGBClassifier @@ -66,7 +98,12 @@ def objective(trial): with mlflow.start_run(run_name="hpo"): optuna.create_study(direction="maximize").optimize(objective, n_trials=20) -# Move @prod alias to the just-registered version. Stages are deprecated — aliases only. +# COMMAND ---------- +# MAGIC %md +# MAGIC ## Promote to @prod alias + +# COMMAND ---------- +# Stages are deprecated — UC uses movable aliases. client = MlflowClient(registry_uri="databricks-uc") latest = max(client.search_model_versions(f"name='{FULL_NAME}'"), key=lambda v: int(v.version)) @@ -84,10 +121,15 @@ client.set_registered_model_alias(FULL_NAME, "prod", latest.version) The cheap, default path. Load the registered model as a Spark UDF and score a Delta table; write predictions to a gold table that downstream consumers read. ```python +# COMMAND ---------- +# MAGIC %md +# MAGIC ## Score and save to a gold predictions table + +# COMMAND ---------- import mlflow # env_manager rules: -# "local" → same runtime as training (same notebook/job). Fastest for the demo, keep that. +# "local" → same runtime as training (same notebook/job). Fastest, default in dev/demo. # "virtualenv"→ different runtime than training; rebuilds the model's env. 
# "uv" → same as virtualenv but faster (MLflow ≥ 2.22). predict = mlflow.pyfunc.spark_udf( @@ -108,84 +150,11 @@ For incremental scoring with history, MERGE into the predictions table instead o --- -## Consume: real-time serving endpoint (only when required) - -Use the MLflow Deployments client. `workload_size: "Small"` + `scale_to_zero_enabled: true` is the default for demos and dev. First deploy can take ~5 min for classical ML - -```python -from mlflow.deployments import get_deploy_client - -client = get_deploy_client("databricks") -client.create_endpoint( - name="turbine-risk-endpoint", - config={ - "served_entities": [{ - "entity_name": FULL_NAME, - "entity_version": latest.version, - "workload_size": "Small", - "scale_to_zero_enabled": True, # Always - }], - # served_model_name = "-"; the API auto-derives it but - # you reference this exact string in traffic_config. - "traffic_config": {"routes": [ - {"served_model_name": f"{NAME}-{latest.version}", "traffic_percentage": 100} - ]}, - }, - # Tags are TOP-LEVEL — NOT inside `config`. Same {key, value} shape used - # by `serving-endpoints patch --add-tags`. Tag every demo resource for cleanup. 
- tags=[{"key": "aidevkit_project", "value": "ai-dev-kit"}], -) -``` - -**Zero-downtime version swap.** Repoint the alias *and* call `update_endpoint`: - -```python -client.set_registered_model_alias(FULL_NAME, "prod", new_version) -client.update_endpoint(endpoint="turbine-risk-endpoint", config={ - "served_entities": [{"entity_name": FULL_NAME, "entity_version": new_version, - "workload_size": "Small", "scale_to_zero_enabled": True}], - "traffic_config": {"routes": [ - {"served_model_name": f"{NAME}-{new_version}", "traffic_percentage": 100} - ]}, -}) -``` - -### Endpoint management (CLI) - -```bash -databricks serving-endpoints list -databricks serving-endpoints get turbine-risk-endpoint -databricks serving-endpoints delete turbine-risk-endpoint - -# Query a classical ML endpoint -databricks serving-endpoints query turbine-risk-endpoint --json '{ - "dataframe_records": [{"vibration": 0.42, "rpm": 18.3, "temp_c": 71.2}] -}' - -# Query a chat/agent endpoint -databricks serving-endpoints query my-agent-endpoint --json '{ - "messages": [{"role":"user","content":"Hello"}], "max_tokens": 500 -}' - -# Tag for project tracking -databricks serving-endpoints patch turbine-risk-endpoint --json '{ - "add_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}] -}' -``` - -### Readiness has TWO state fields +## Real-time serving (when required) -`databricks serving-endpoints get` returns both: +After registering a model to UC, deploy it behind a Model Serving endpoint. The dev-side call is `mlflow.deployments.get_deploy_client("databricks").create_endpoint(...)` for classical ML or `agents.deploy(...)` for `ResponsesAgent`s. First deploy is ~5 min for classical ML. -- `state.ready` — `READY` once the endpoint has any working config (first deploy). -- `state.config_update` — `NOT_UPDATING` once the *current* config update finishes; `IN_PROGRESS` during a version swap. 
- -A loop watching only `state.ready` will say "ready" mid version-swap while the old version is still serving. Poll **both**: - -```bash -databricks serving-endpoints get turbine-risk-endpoint \ - | jq '{ready: .state.ready, config_update: .state.config_update}' -``` +For endpoint create / update / version-swap, traffic config, AI Gateway, querying, the `state.ready` + `state.config_update` two-field readiness check, and Foundation Model API endpoints, see **[databricks-model-serving](../../skills/databricks-model-serving/SKILL.md)**. --- @@ -224,8 +193,7 @@ for _ in $(seq 60); do done [[ "$STATE" =~ ^(TERMINATED|SKIPPED|INTERNAL_ERROR)$ ]] || { databricks jobs cancel-run "$RUN_ID"; exit 1; } -# life_cycle_state TERMINATED only means "the run ended" — check result_state -# (SUCCESS / FAILED / TIMEDOUT / CANCELED / SUCCESS_WITH_FAILURES / …) for outcome. +# life_cycle_state TERMINATED only means "the run ended" — check result_state. RESULT=$(databricks jobs get-run "$RUN_ID" | jq -r '.state.result_state // "UNKNOWN"') echo "result_state=$RESULT" [[ "$RESULT" == "SUCCESS" ]] || { echo "Run did not succeed"; exit 1; } @@ -236,36 +204,24 @@ databricks jobs get-run-output "$TASK_RUN_ID" | jq '.notebook_output.result' # → '{"model_version":"3","val_auc":0.91,"rows_scored":124,"endpoint":"turbine-risk-endpoint"}' ``` -**Serving UI hides SP-owned endpoints by default.** If the deploy ran as a service principal, the Serving page won't show the new endpoint until you switch from "Owned by me" to "All". Or just `databricks serving-endpoints list`. - -For the four `jobs submit` traps (`spec.client: "4"` requirement, TASK-vs-submit run_id, `print()` unreliable, tags rejected) and full debugging flow, see **[databricks-jobs](../../databricks-jobs/SKILL.md#one-time-runs-jobs-submit--async-pattern-for-notebooks)**. 
+For the four `jobs submit` traps (`spec.client: "4"` requirement, TASK-vs-submit run_id, `print()` unreliable, tags rejected) and full debugging flow, see **[databricks-jobs](../../skills/databricks-jobs/SKILL.md#one-time-runs-jobs-submit--async-pattern-for-notebooks)**. --- ## Custom pyfunc -When sklearn/XGBoost autolog isn't enough — custom preprocessing, multiple sub-models, external API calls, ensemble logic. See **[custom-pyfunc.md](custom-pyfunc.md)** for a full worked example. Two non-obvious things: +When sklearn/XGBoost autolog isn't enough — custom preprocessing, multiple sub-models, external API calls, ensemble logic. See **[references/custom-pyfunc.md](references/custom-pyfunc.md)** for a full worked example. Two non-obvious things: - **`python_model="path/to/file.py"`** (file path, not class instance) + `mlflow.models.set_model(MyModel())` at the end of that file. This is the "Models from Code" pattern — the file is logged verbatim, no pickling of the class. - **`mlflow.models.predict(model_uri=..., input_data=..., env_manager="uv")`** before deploying. Catches missing deps before the endpoint does. --- -## Foundation Model API endpoints +## Custom GenAI agents -Pay-per-token, pre-provisioned in every workspace. New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. Filter by the `databricks-` name prefix AND by the served entity being in `system.ai.*` (other endpoints like `databricks-app-template-serving` share the prefix but aren't FM API endpoints). +Hand-rolled `ResponsesAgent` (LangGraph + UC Function tools + Vector Search retrieval) — see **[references/genai-agents.md](references/genai-agents.md)**. -```bash -# Foundation Model API endpoints in this workspace, grouped by task (chat / embeddings / etc.) 
-databricks serving-endpoints list \ - | jq -r '.[] - | select(.name | startswith("databricks-")) - | select((.config.served_entities[0].entity_name // "") | startswith("system.ai.")) - | "\(.task)\t\(.name)"' \ - | sort -``` - -**Defaults when the user doesn't specify**: pick the highest-numbered Claude Sonnet for agents, the highest-numbered `-codex-max` for code, `databricks-gte-large-en` for embeddings — resolve actual names from the live list above. +Prefer no-code authoring via [databricks-agent-bricks](../databricks-agent-bricks/SKILL.md) (Knowledge Assistants, Supervisor Agents) unless the user explicitly needs a custom LangGraph agent. --- @@ -274,13 +230,13 @@ databricks serving-endpoints list \ | Trap | Fix | |---|---| | Model lands in workspace registry, not UC | `mlflow.set_registry_uri("databricks-uc")` *before* logging | -| Endpoint returns PERMISSION_DENIED at first query | Pass `resources=[...]` to `log_model` (covers UC functions, VS indexes, other endpoints, Lakebase) — see [genai-agents.md](genai-agents.md#resources-that-need-passthrough-auth) for the full list | +| Endpoint returns PERMISSION_DENIED at first query | Pass `resources=[...]` to `log_model` (covers UC functions, VS indexes, other endpoints, Lakebase) — see [references/genai-agents.md#resources-that-need-passthrough-auth](references/genai-agents.md#resources-that-need-passthrough-auth) for the full list | | Used `transition_model_version_stage` | Stages are deprecated in UC. 
Use `client.set_registered_model_alias(name, "prod", version)` | | `spark_udf` rebuilds a virtualenv on every call | Pass `env_manager="local"` when training+scoring share a runtime | -| Endpoint version swap says "ready" but old version still serving | Poll **both** `state.ready` AND `state.config_update` — see "Readiness has TWO state fields" | | `pip_requirements` mismatch crashes endpoint at load | Pin exact versions; or pull live with `f"mlflow=={get_distribution('mlflow').version}"` | | `agents.deploy()` produced a weirdly-named endpoint | Pass `endpoint_name=...` explicitly. Auto-derived name is `agents_<catalog>-<schema>-<model>` | -| Endpoint missing from Serving UI | UI filter defaults to "Owned by me"; deploy jobs run as SP. Switch to "All" or use `serving-endpoints list` | + +Endpoint-lifecycle gotchas (readiness two-state, version-swap, Serving-UI SP filter) live in [databricks-model-serving](../../skills/databricks-model-serving/SKILL.md). --- @@ -288,13 +244,14 @@ databricks serving-endpoints list \ | File | Contents | |---|---| -| [custom-pyfunc.md](custom-pyfunc.md) | Single end-to-end custom pyfunc example: artifacts, signature, code_paths, log → register → deploy → query. | -| [genai-agents.md](genai-agents.md) | Edge case: deploying a LangGraph `ResponsesAgent` with UC Function + Vector Search tools. For supervised multi-agent tiles, use **databricks-agent-bricks** instead. | +| [references/custom-pyfunc.md](references/custom-pyfunc.md) | Single end-to-end custom pyfunc example: artifacts, signature, code_paths, log → register → deploy → query. | +| [references/genai-agents.md](references/genai-agents.md) | Custom LangGraph `ResponsesAgent` with UC Function + Vector Search tools. `create_text_output_item` gotcha and the `resources=[...]` passthrough-auth list. For no-code agents prefer **databricks-agent-bricks**. 
| ## Related skills -- **[databricks-agent-bricks](../../../experimental/databricks-agent-bricks/SKILL.md)** — no-code Knowledge Assistants and Supervisor Agents. Prefer this over hand-rolling agents. -- **[databricks-mlflow-evaluation](../../../experimental/databricks-mlflow-evaluation/SKILL.md)** — evaluate model/agent quality before promoting `@prod`. -- **[databricks-vector-search](../../../experimental/databricks-vector-search/SKILL.md)** — vector indexes used as retrieval tools in agents. -- **[databricks-jobs](../../databricks-jobs/SKILL.md)** — async deploy pattern (`--no-wait`, TASK run_id trap). -- **[databricks-unity-catalog](../../../experimental/databricks-unity-catalog/SKILL.md)** — UC governs the registered model: permissions, lineage, audit. +- **[databricks-model-serving](../../skills/databricks-model-serving/SKILL.md)** — serving-endpoint lifecycle (create, query, update-config, version-swap, AI Gateway, Foundation Model API endpoints). +- **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** — no-code Knowledge Assistants and Supervisor Agents. Prefer this over hand-rolling agents. +- **[databricks-mlflow-evaluation](../databricks-mlflow-evaluation/SKILL.md)** — evaluate model/agent quality before promoting `@prod`. +- **[databricks-vector-search](../databricks-vector-search/SKILL.md)** — vector indexes used as retrieval tools in agents. +- **[databricks-jobs](../../skills/databricks-jobs/SKILL.md)** — async deploy pattern (`--no-wait`, TASK run_id trap). +- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** — UC governs the registered model: permissions, lineage, audit. 
diff --git a/experimental/databricks-ml-training/agents/openai.yaml b/experimental/databricks-ml-training/agents/openai.yaml new file mode 100644 index 0000000..f875f53 --- /dev/null +++ b/experimental/databricks-ml-training/agents/openai.yaml @@ -0,0 +1,7 @@ +interface: + display_name: "Databricks ML Training" + short_description: "Train and register ML models on Databricks with MLflow" + icon_small: "./assets/databricks.svg" + icon_large: "./assets/databricks.png" + brand_color: "#FF3621" + default_prompt: "Use $databricks-ml-training for training and registering ML models on Databricks." diff --git a/experimental/databricks-ml-training/assets/databricks.png b/experimental/databricks-ml-training/assets/databricks.png new file mode 100644 index 0000000000000000000000000000000000000000..263fe98b84e8ff3516edc93e7c99230fb8fb3113 GIT binary patch literal 15366 zcmeHuwGvL_%Pi2UiaF2#a{7SAsT9mWLN30;^Ey{$?uT3zOpOTyHdK{j^8hhfvh5NwU}M^})*R($^?%1Z)$O zv6^?FX-TiMYx?H}6GoR%4rDx|NW8K`Drt!9hxATaunL8TRATtgiasJ(Q{PhVpMNi)U<1Ugcz_bCAU;~4{5W3% z5W$DAqRanFT@??0-Ol$8^xsbi-0mN(|4;h=odd}sIzOAK*$SI5KK$-&e7Gh)_%Zq+ z5gXOfSo)LD>!=#p3^9Dt*&Pw%L%YAZ7t5}5PezRE``s=U8v*!45K%_IS;-gEhzPwMWE=_QBR8TZ%nf|E~Ns4QQ zo-p>@gQ2AH!E-$3T;wb(ezjD?`9$2^z1i=E37uyZNW?>m-ne;7dKS-L%BhhsVf62=-{0M$lV7G(h{>xM zez-sySdqqLiJY|~n91da9X4#Rki0qBIQOGKZ1je*!}|j9kfVf@80lyH{sW@3(_z1l zcN)%mi%w3NcbX-Gk4Oo800%y|ovstTrVu?p5xl__L2R53ty&+g{&ve(luo~h0gZ>e z(bA{wVou=Fir+@7wKdSi?VERb2G3ylXh#?yIk~zt)m#OS!OStL!k+^1MK$G=RvHg( zgrdktULkPyvC$XoA$2iXkNs#FZI%ofFf-WR+i9(wj!NpexzT%w(&)FN>)#KF}R5`TG0 zX$YOQ&K;pp=}1Dg3ZC&iq_$|tk=d2@7yXn#44k%_!^exNKj2555E?j!V1COIMuMk3;Tcp)oyUGH^1MNc-BQ^FjKlg>g6;DfuE&ezv z;K5yA!a^UgXmH^DOQVlGb3W&Y1Be=!? 
z-fFUfE`kR&fkQbYR9%=d<1n9li=TwY9GUsmI(_mcfMj}8X*QY=NwnfS_zb%x)8143 zv%uFw^=CQn)(Bc!o&aJdKl(z55WiYxQTUY*kWL8Ej<46QPtr7sS8E1eI^KfA0m8_N zf-&Q@^s141gb{Wci)J&F^S_2}J&B7uGBXWxN|6QMk96Z>J)QiXdr}C?Pu0YA z{#ZO@l}0n)X9MRz-_)f+Cco!~y!AVij|t5prz?5eGAB8My`E#Y1ZMdpUvT@nBY}L( zA)%##sCc=r(<~aLi0YFsIFxuF<7x8IoQxbJ9zKOM#F|8k)FXt@fyBR89^8+6qI{3*CXCh|Hk)~swk8o258(Z#m*~= z6x-A!ryOMHY7c8|p~-X(nL6Lr*RJ`j6bRQDZ8jPhJp6@SvD>e{i{e4tXnawv&OGY5 z-5RHFqHgZH^;jZ&c5qS@9%ZUTbuIXdIWe_-&cM9 z$D#I@?3_>X<=2RrE4j)af>`^1$90^XNa!eF(8BYLFZkeBcAAqfT z>Cvjn?v9bp*`0)+tuGwg%qWs(S(u+^a-Ov=MZoxm%ZX`X?rzGA^9(ynWqdBIdSbgC zK%Z!<9S^jWeuLwL62WMJR1;IdgG+Rw9o|y1l8eUbO|*GCCoGX3+filj{UCxNpA!nE z)-wk~zio(B_wWqXT2GG^3Nj%H&6V~c9u#uK$X{QNkuFFGoz19zfjpz5PF;`jWMHi} za0$XAu65Km;CtpFI-Tz@Lx#}gKGqx6tp?M~`7|F6d2^RSnGvO_(Z!47L&7&WevQ~L zCIEjb80bqBOn5L)eRi9F>&Y;G#55(zFBUi>`tIy^=fFGUw+|C61_C#Cy<6~k?F2X+ zNO5$Pn{-b^y%4;4X}C5U1P5dTsM8i6)w%6I8zyszCB}y#UiDe+goeX@d_A6{Pp{MK zA>knRNTgBRW6ua0u9&S^WYn6>K6^#8eH$&1(eJJ^rp2f+2p8lHC}$){!(qn#o}6gB zXN~^H;8T9d!%<#O7)kjjQ+~j}{#|UA%2UG}!=bO$yD2c_GI4T+$21Pb6DC|xJ!L$p zaw<@)){LTeSN){p4{I{`RKLhBlBcjwL9+KU$z$=I2(EPJWlsxqzM{5*o*iu~fIK}s zDZh4n1`Zl&l~m3%{EOi+rlj0Iw@$JZ`df0X#-$}!QYfmrD**OWb+l`Lk$9hyWT=j> z=Qo?ovzW$$Ata>(xFeB_eT_%o%l2RhZA0C<6Pxj?;X@MO&T*r6Gxs1EXRiOJ?1&h(wT&~VcLVkR_1Wf7YoKbRuj z#WB*CYYp}+t#UWooon{H@#iuaqMO!wjSBv%!>9${u#DECf;Gc?;V5JiMPE;|#4SsE zz1>9s+++gx1F5J<-lPyWc>q(ZZJ9s6XGQd4>;sCB)+;dUT;N-GUFL zz_~*T1ifJ;tU}2*C7SU_{;fMX9P?~ zcIp8;FZcdz(dHXzny0&2dUWNv{i0t;2|W8f-2@BY=iHn+5igkRa3=x$kR+~|QjvU# zOu0@-SrW-&GbY13sNf84ta9PD)rchGrgop_Nz@H}(l@5^I3PT!sIA&{-L)t_d5AIg zOL{HOQ#6u-B>b7h+DaTs%-7GOdJSbGhef}mj>mIA)8_OVcInN$p(o1vF0fJYvj1xP zAQgTqw8CL|GDM`Qnl!3L3MqPZqCF;a-EwlK-!I<#?R$(gP@DT%LK}l%{ORAoANXq zs&b%A?RU;JFDU`+HoFET;zje?{R5hx3Xt3Y9uX^(YV*&wk=5k&Kkp5Rk`SChQY1`U zPfonN%C7|pe|h=hHR79sA{?!S8Y!`bt_R#9Z!fdb(u=i~eunUuT~Dyl=>EmPqfwgv z@q;Nv6Qvf6geSAD8fNvOQP@3S;{LcbQd})M8IFQQ-urcE?Xmw~LtSw*7`WvC`2u+% zb1F|MBXM69dll^%JO(cRRX6fd#x^8Kc19neJkjF)uTC9L53L0r8^masTgBb*c`2Xv z29D={1zdE{nNt60# 
zIyJtpBS;B@{j_^gEfYfcQ#EN_mX#Qve>a6rfdQT}n30^@Y_sP=xik(7mThdtYX}-p zfGi7cG@SRjUA$9pzQPzJbW5j*%wOX&l5&o!T_SuNa8KQ0CTY3MXg?P9x^#F&dw;>I zMer(!;34?D+fHYlYb?D{@C9O)Fc`UlFb-WD1N$t4IGGBLd3Xh6)AF;fQqSl~|LW46 zsE0zYSpDr}Lk~Wf@|E9T**f>ye>>{lirLh~83KCwMCH>v7s@6ag;3?sAn^V2_OF`) zrCdD|Zns!9?UZ#k0VURUHOtE4Mm@#*&y>O3XalWYuP5R=&7a9qyIE)7OK;sMaCyww z*DZH1L9eUo*Y-j{$!pn?t8cK7bXc$qD}Pwd4q@7pKF(qDYi9N+O1X%QAuMM+cSC;J z-|%djb_Rl7%I%Wht3~u|=-PIozb0GCcBbZ=AFHBYXE!A4_(SV>l{0h{pJu@{82g&P z<(_!3qfb_dPI|&vmsB56v^*n=UgHZzi4D^;jx_k z=37Kd>={Qht6tx&DSW29bsk-FSUGmtRU6;kLm0-HQyjCQwGuT6_0G5Ns|Kix4;cQ~ z7#`ubv#2_tKCLDuvVrwUX8+8Q>vI-(e1P|eNI%GivWp!Z^X!ce1=+LX&96nZ+jFtH zOuoEiEBy>?CoJTaZ8t&A%ZeX+O?!ep|EqP}P=}7CVQ*sAVWDzq`x|{*SH7>E7C1Bc z<4Nxq=W|B!OEoH7j_;SC2&LU%UgHd0!Osdgjl*5+CyzzCj;Dsn0Y4yqCY=$x?l6tO zxY{AFbF)a6`xlOw%p-aNLx@Kb^!7;ZFw>7KHLXg6K0&swV(}Bj<;;jlU%6C=5SHP& z$9#5F(G1!Ss9`7RYMI}YhGcJLh%6nnAafhRrj{8;nbR<^j7?1nwxa$_lH95qVI4`> zDerlYtcm?8aHBfkPsOM}B74zuXSBX+#57C7Ibm9L2Mv9)eeug($e9CrDRz-O;sopH zB%=O1t3B$3W<%CkMj!j9e$8(vkk^jrt3FmX_r|O@8^DtMtojMs2MPUuYu(b|(^!|W z`<=7wpMzXUU)*gBR1NP$!RY-FJvuwG`*=e1-S-RIpuAl&)37`l);9c$JM~u#ev%9s zrgJr4%% z=vGGx9@5G_VB|{sj;!rryVq%-*H$dughu26=}M1!s>CMDjEO{BZ^2c+e2A2*8pNqJ z0?h{S1-2bZ#86=1zRE~=v)>2yjGrkO(G0Pg7W^q;7Ia^MH8o-=LiV(;An102C4r|H zTFeEU^i=cOBqfRKggFdZne*t17XSOBOv`JU!coA+KjSznRSzDt}wAhb%m_M#7WM9S5MzV{y zK+MWSxc%GlO>tSl(E$&35J%?{d}7Ysced+Byei%2$?bkfo+l5NXu%(2&L6`D!LT+p z8+=b;(*`h_FrcuzP(wS_%{Xr$_lHc#L*}lJ6(hdF%H6(b2I=Ip!Exi{?~7)*zZ^2p zYzVw|y-(vd88X&BQDmfH70nS%ALA9mOCaf4F*L=4i2c~8^ieyG&mLquvWjHS;qscb z1FVTL%UGHLWsUS4QC4x;(Qz}}q*H|2^`TSe1cXTsqWpCNB|^~QYS`^YFUGznM4YH( zX#0Uun3c|lIBAGT2+!Q_{i8YWWjkAt|Hb=tRd-Fb9Vqb{mL!$2qk9r<?_`Z;DkOAPJbxH0{{io= zZ~NbD@#{3Fw$(DfggF9+2jQd4+}h9OAnZwoDw3W_etDJ6Z9i0=$(PkH*DQ%v+Qt^%ZhUF4^uq^}3xX1HD4-Zs0hmHdqQw_E(DAb%ACK_$X2%lr|n z$*yFv$`;8*>%45m4I3Cq>dyRL&7*3d3!MFku9ENi$TwQ=KsZ5e zD3WWg=R6@F@*G%zklf@QP%dFJ6*2WBV4eiI$qVmKHCXo)hPz$uqLhus1y64m<#_mK zD-GF5dx>jJsx=Su#^_z#xi=CRT7I2?vT@#_W@}aVG=`A< 
z+23Ccf{eIoocMCwuCQ}B^|wp<801(@^6K{{Rf;7uOYls`%YS(P5UD5kpa{0sV`=qU z8~SpS)aqgTr}jxg^@B3sd=JClt)eg{%q>u>(>poix64Jg!)hE7zDZId^<2AJsgd=o zw`oO%xr=hHxVr_)E74D_nthfP+$zKtO0ek_X@E>1cQV8ID}M)BO$0ZxDnj}9?;AXW z48O+x`-l5u4M&?_tK)v2Hdnk|*>3Rk=CGv*FPD5T#(c3CMb02AE{}^S8Pe&;>CsPl zUZ(g1ufLY>h7gYHX*&e`8GcODJMt0N0Ku&qe7%9JBF${d{4f{xxzjn}jsk@RbBc%Z^~Idh$Ng1?r8%|+(gbII%mGEr4 zo8=&dTXzEAEYmg_Wn1mY@;iOBnso90aH4ha%N3#;R`Ph9d3D&*_!c}o+Gbi8Vuq&O zuZgTc3$RBFxjan`VR_lbKd*HdmLU4fWYzlSi^Gv7b7n4N)eKkI-wG`3rN=uQ#+*kt za$mr!G^$0ivVvR6wtD_=tXoLe$IZ``7N<1uNsG8Ez7CvdBzI-1z0FxzbW8G)nRh%F zxz+Orwp)5y=S#AxRWP- zhOvfvfAUPPHdC*c(EYa6r(TVy2=<{C$#Y{An7CExS#dE-3f~*WQ=6i8ui09uYa38C z<>KEOxXnZX>I_O-qWqItZW3M+rLZRE=x*|mC9IbE@ll8CHAs}uWdBKlN5_CyXRgDn zwR~cA{PJ3tc(SuQoBUU!f0%U#Hd`{W5`Q1Z_jD|NfTCk3 zFOxyj;j8+p`@2aelPbk^k8rCj$Q;CqD_72G!lH^&jz4SFAnnle4_kf2WQAW>$*>mj0` z|Lc^N&jZi(V2=>R4oc@h1*bMoT+y=h_A%-?3lU^a&sj{}fuwjKUU1lfn_?bQSzI`p zFj~B4FZ>0)bmT%qMdiJ& zeb*NhhHp=MvFqHDHm}-S2=XqER1A6@1dEq8p=)}>#2=9NL4Jn-hZ(!SFSQ)b;-#N< zH(C7WwL>MQC@*&&sxCW1c#3zhFCu?eTEm!HRIen3K08V@Ou z#xr5H+7?NJcDd``%Au=P`lYn<7!#aknI&!GZ71oT?=QSL)ZPDVSja$BV23nKDHu>A zHrQITFgWPEueyo-`b8zZ&hS4+AJ%xFe-ex5^{K5J#3BnX*-4XGP9w|z1EasF4 zTdIr4VY~LN7qwiZgf29itdfQ(#)PMkJ9#IjuT$SH8*;@(ZN6JGWHo5ZlA23>$OdxN zy(6;bmeWVsgw_#n9oM_FjZyI>apGo^p_eXXO(1 z?%d+4kyEcYkzRrdpFk@SFVzJ5uiSJJR{2$rp<9#Uis%xN&%7jRaP)DRmw@*Trm8Nx zgs$pxb#*3+Ygi!F>Rt9)!yF|hsLD@Ca1VM1jC+m7A`>5MFJQGA&sW~?jg=wP(~2iw z3`5%uV)uoc4wNda8ai$nw9Tvb7+*l)JokDTk%?Whrvfo~C#%`)_jL=is^=G+56UE; ze2Q!{+H}tWB@51H%hbrYu9ykFDCgL9*VRXpLJ8eo>J9l1*X_`o?{ac#F1-51m`|*$ z&S(J1o=HG5hvaL59j1(Rsf_n{P)qR>`oRQyTpxpGI4x4oyk#8F{d|-;`Fcrqi@#8Z zIb@cP<|Llz-gueSaa#LfaH}QIm@D&1e*p1D1JcaSjSjoqN^-3C9qUJfsSLn!EFOwg z_$?8tjYQrxC57|g+jFCh4NULa488p5;%L;^(*UwsO3N#zJ` zfj)@vO2S->k2XH_1jRRV6ghtqB=BG?{Q2Thpe*$PmRId^ran zPwEB6k;B(L-9BrJD=?%(d|Ii*T79R_1Gi55marduE&1ye>t49LjJk9E+*4TiEb-Tu zsk?_T5^a4!8n(`Wh0n$22hgpb^Ii%Nc~PZg703fAQge>VR1dGw3jqT~j>^jM^iV}l zeNbGMZQr7JLbe5hvG~23NSAA73XlHzt$dRC=3FP{=!%uKZ)V<~49aH~v|_}OM{iGX 
z7rKC8yOq5-xU~Ls9i`l57K#O_TxQE#mM9px)5H_`pf_@BCy)}wjk&WzD z^m24(T0Hmofs)Q71+q)Wff+0(LO^Wd2q`_QY_v=2-(EkHPZ}CF~@lw-7@Ef9S)CPLeB8hX&l}v0LsiPOe-a z9p0GvTr}x4o-O?vgts`REWB9tazJd? zoxJ((en?IFuv4Q?g*@c2pY+d%1QH<9R7|C_&`wsl%VCe8!1Gm*yU$C!isJZ*U^PSYwqEcC-4g9U7&{rbFi7NEyGwKhGkCH%upmP_j_UF4whhwK0?K%e*T0q#XDe zv+SZ4IM2n(7g^D=Ay>qwy9k4hGF6*xBfOd8i}f-r-ZShCQj^qqbd*u=|FMUYS49yc ztD?vo&g4)>bLzK0pDRLLHy;<1hgj5{I)ys;EO<+kIZ1z@#rb<%_fRxF3HbiAH*0H% zuw1zXYS-?)xMiXDZ8D=DtN2F%>JnOu4^Ig&FR@+OY-w&9=x3I|`JNbz)c^g7{r(6h z$wSD+e0hu0l2E%tonAR14ig4nMMuEKrL>ZNPIU!VSBUE5_e)mgKP9=l^^p8%Q)G8T zXL>{`lyIr=$KSNUKTe==w7#Resk75s+?CIUUcBCH`n@?&-+|jR*o!*Q!%-{Djh0X{ zV@XYCV(-{d4h4SHAVcRK@a|0RQ9~KiM(`=_Noui>&cV}~SIESk%S0e8U zqj#Rl&Gp@ z1g@xmDiB6H_hI`*wdGJ^2#Xq|qAk0wD&L#hEnfB`Z`iOiqoXOIG_baZM+`_0nC2p3 zokZfu)ay8k#22}4Diyx@M~X^bRP{+X%}^>urQE|`y7^y^q1cMlyXT=W+A*-{CXxA$ zTfMsYZ`Oge%IwYNq?PKYs`*DV2zHiPse}@4+0tFsXi5BJAyBsaQjNe)4Nf(n>d--> z9D6TMGJ~-PNHNdSP(lNDF54E+tNBP90eCKO{gZEatI+uK5BnswDQoLg*+TalaL+lD z&ScSN+~1v+M=q5v@@xP4S6CCh6YpfyW;`}cb+tXvFdax7P>d6|nvj`P+_K~syD z7BFh*T1MiY|JnC=e(!hI*Sm8JW=WEU{FBh9t`owNn4i|i?*E$2g*_nFV2cDiJdo0l zDFNlGNY`*$#E@Y%v)J&U(Q6^e>iP0Jk7GqM5-Rt3P&$_%sND)SnPErQ3p|u+2Z;c5 z+N#Uh6WkCYsyI}V+>|>1Bt;WjEj#G0F{bj!)TQfmdL$`y>T}^RZma8La^kWp5gf?( z3`1RBU(%M$0K-P z!R6AU7yHTS*?RHQ)P+x4j^>=aG}dQQLPTd%T-{9#cLf}BR%LaZ8mBl7TMHYG_P??J zJv%r_Fg^@p-F-X~%nBOTw1=iRj4)Sfn0Jfl7h$ep)QzqMPT<4v9YBA8@stcM5p0D| zvhw|ln_xv+4$v}0icOLP4kcW8FqGxQUVX$rwt(Q!MLR+YAGEQdEu(pku7!yX3#WKG zVif5Hv47%yb(^NV^n9(SFloKt!ev7WL2aPT$=e46&*do zyBv6+w|QdlXIRk_&sJ5rLqS-)w66O~F`<*VABXv{^`t_iS|;l;rcW}bzv7RY9+%UF zV7+H0446IX|CqfPTS$Oc?o6%>rx&Yrnoex_@iXk_N*t!Y>f%V25nsEOja>9>DhNh< zPaYEFp!-|UfKi2KNVd{M@d2BAsj1hK%P?0eN0!GSP|p;q<(u z6_qeJy^0&bhafPh8o?Qer*BxBNzJ}l09t}|3eq4`bdaifbdqD`#M2`_blMCL|69iB zY$UiPJL(~Ybx?HK61z>Fn7h%Pn(%8Myz}Q?kOuSsI?X_$=D=-W!wtCM5}ZnPrk4{0ajC)4SJ zzPR!8%4_TjTtpA|XfJa&LAL#vX2)zwjeCqy&DR29 zX^Y=R9hPXDuJa{Ns2O2+cv2s#o;^BswE0-D-ocbBno)-r|A>is_<3p@=*dbL8JkqZ z!9g|s%C6OJR!!iOntcCGe#f(G^&nO9!7%8{gtE>DRL5@P%Ab+o!2 
zTsw}vW>caC+A29-*URlt8beobjH2>X@HuxIy+2N4+WrFTg!~^LmKN9nJAi6mAr0y>v762K6!3Y zo2z=Q=i>1FJVvMa&q$#HXm=}2T_}EdI}pPMdgs*kwZ<=&E=;I`D_Y}TJ&TK{MeI?N zXfLb1VdvnV3?baIvxCf;NUSP?ik?;1+kT3NL-$Y4_#&Ane}W*?1XsM~C}$8^G-K?v zbQ`^W{XLVZ2Sp2K?It#JB=Gd0qR2b$!1(LqLZ`$3LSPbw-6Nyx9DBExGX~*rg@u4? zyQl8J-5*bu#XiuGO-ZuRYtzXt>8R@yOh}Hs1F)+4Vln|97xp9dkJkkZxT+iz~v+ zm0A}eFg4Yom|Bix*0r8?FZh1-JO&aOa_B(Sb4*TpEkL&}MzoNWOPLTb;W4 z&h3gAIIdoQIsW_reBs%z_&}IW>zUXCeh2Psd+}?pIX%6+9?T3=Jmj6xE!ekHS=0Bj zFOPgbT?HCo<-Y`dq>xtDQqU~Mrx%PTA@}JXN&2ZuCv5BGOi~`SnyatC38!q)Vi2zSa5XRt0-za`p{j_gY$Y24GeSx8+Ko#sq2Gh z;lYi2z~|rpWnRb2Qh&Cr(Sn^h{t3XzRsmcsydQf~8UsfPSk!#ppO5sut8oN}N+18Tpv>OZmkvH1$`;W7H8xw_x zCjZdfp+fKr;7|qw);J&8_o*Y+Yr+zgiw1sH10ZSXFQjrPjISk{q`<#+kM-ji`Hhp8 z%=gGwENHT`c8%S4)l$?ZUPUxSTw_Pujat_{QHDu=dC%q^!z}1EEB8oN*gR60skQ4s z3CL7)tDbOQ4g$q9*yw#e^CX8DdAe=)5lGcJa0y?*0TfLWroRc5+#OI_u0b0EMXeE5 z4{BP4?`^mr6nt9f&(>5xuJxS$jT4zlGny+MFrbB)+0uT<& z_Ihx!gDDCt=gR{fC2;l)v^W0%kmaW_cWr|J(bn5-ObNfwDyFYiOvPY**myCh@r1=Y z5=jfS`NmVqS&p1&4hY93CU=$xABV9_+V1hV?0`>h$8&!?8GG+8>&cELyIJ6+xywo{ z+0GM#!@R?Km%Wg5n`XwLKF1J$&trOY{Sj;w^xEz5=6cBpTA(7RZA*TFw0L=duEl0wH zUrFqKq(TUa_*>%xR9Nx!GcmOFb1QKdbcx-`@$LH8x_zgzmnhxut!T0UQVz6RJN__Iy~HR*VFb3CY$)N9_ZxOXy0R} z)lB;_Z4xgJpd&UTl%Oj7T(`&>lm|b-yJbN^LBhz^G;Z^q4b?3KAwWdq?`i1Qc*ZEz z*4j$T$4>DoGbBxK>~FvKJt?noWl^|520 zyHeOmx{b`@p}Z_4;kn9%(7W0_(N^ew@BP$o7N1B*=(OrW_THirDi5=E3q)BRemlHs zzIe%rg8d9l2y@_^<@WX1RU29T&hW6~vI;RE2w5Ujun&3ZrU>cji*W)~JRufUsCkzw z-0D<79kUIDk!$TIjYGw|%dFHUtn~%4ie;}>7wQ4L$|&%=9}{ew8c}zcxBl~N06&&Al(h)3;3cwC z7dM#!;Jf~s)h*gU>iJ5}sn!6To`7XWbUR5yfz8p?mUmXv4T((i&J?;T;)%ZebkElrYUNLnpMdKk?`fgKOli-tn z#2nkxYwT|4H}q#a2bM_=ME+5hWoVg zQ>+;wv%LeZ*Xp?qI15Aa_XP@Z7$Ea50)TJO;eGe!8bB*w*%tkU@1M6Lt3JVQEr!mK zk7n(=4Ql3^&6*oz1e7gc-eT*QI|D-dTx)lJDdjewNM6F`J8ktPLLK{OFGMbJr{tfF z1|u6mV8s*ma-^u;vmYFY{09-o#l7KAoD)1T3JG^!>xhL+sxWRN4KPn@-2gE9)zbw@ zLhvqep3wlQ|L{EKP6K9@?N!vmCmF5qYQUGvyTuw`h>}K;X(^NO9eTyzxgwthOBVXJ zXn}F>A1LKsi=KJr&4J)edfzRF2Tvj8NpMrj2s(m0Nn^+`Y`m!e!QgsLtD%x~#ZKj# 
zhIcSa5&y`Ng?{;^lFepT4p-*?p(9EL;1^*#sjG-&RPIqG)=SJ&s^(5iiV~wEWw_BN zRbaFqUS|zyQS{Zx_8?*f6du##|FsS^G^wL5Js7%4a0!j1Cm@n*R5>k)7vexA4`(jU zC6a4~i5-gq1r`&!-g=)kqq38X>4L-ag%#IAVWX&YUu4}B*ceMgWAH@I&}XMmX%J>}?KxWL=S5x-i4qRp9$&7QGPLK9Ug+`h09z!H82HdBzv z#SVD|Yu{;R@na>g@_&nfYs7#zV}v~~v8pJPIKl6#yA)Tz^2?D$Z6F98w5kpmEh1u_ zjRL3H%M}Z`oO~p@17A3oj#3^9tYz$5L@5}!)2Xy7%P~ + + \ No newline at end of file diff --git a/skills/databricks-model-serving/references/custom-pyfunc.md b/experimental/databricks-ml-training/references/custom-pyfunc.md similarity index 91% rename from skills/databricks-model-serving/references/custom-pyfunc.md rename to experimental/databricks-ml-training/references/custom-pyfunc.md index 8d85c3e..034c35e 100644 --- a/skills/databricks-model-serving/references/custom-pyfunc.md +++ b/experimental/databricks-ml-training/references/custom-pyfunc.md @@ -93,10 +93,10 @@ client.set_registered_model_alias(FULL_NAME, "prod", v) ## Consume -Same two paths as autologged classical ML — see [training-and-serving.md](training-and-serving.md#consume-batch-scoring-over-delta). +Same two paths as autologged classical ML — see [SKILL.md § batch scoring](../SKILL.md#consume-batch-scoring-over-delta). - **Batch**: `mlflow.pyfunc.spark_udf(spark, model_uri=f"models:/{FULL_NAME}@prod", env_manager="local")` over a Delta table. -- **Real-time**: `client.create_endpoint(...)` (see training-and-serving.md). Query returns a DataFrame-shaped JSON since `predict` returns a DataFrame. +- **Real-time**: `client.create_endpoint(...)` for the dev-side call; endpoint lifecycle in [databricks-model-serving](../../../skills/databricks-model-serving/SKILL.md). Query returns a DataFrame-shaped JSON since `predict` returns a DataFrame. 
```bash databricks serving-endpoints query turbine-risk-endpoint --json '{ diff --git a/skills/databricks-model-serving/references/genai-agents.md b/experimental/databricks-ml-training/references/genai-agents.md similarity index 94% rename from skills/databricks-model-serving/references/genai-agents.md rename to experimental/databricks-ml-training/references/genai-agents.md index b4573ae..589cd7a 100644 --- a/skills/databricks-model-serving/references/genai-agents.md +++ b/experimental/databricks-ml-training/references/genai-agents.md @@ -1,6 +1,6 @@ # Custom GenAI agents with MLflow ResponsesAgent -Edge case. **For most demos, use [databricks-agent-bricks](../../../experimental/databricks-agent-bricks/SKILL.md)** — pre-built Knowledge Assistants and Supervisor Agents wire up Genie + KAs + tools without any agent code. Hand-roll a `ResponsesAgent` only when you need a custom orchestration the supervisor can't express (custom routing logic, multi-step plans, agent that calls another agent over HTTP). +Edge case. **For most demos, use [databricks-agent-bricks](../../databricks-agent-bricks/SKILL.md)** — pre-built Knowledge Assistants and Supervisor Agents wire up Genie + KAs + tools without any agent code. Hand-roll a `ResponsesAgent` only when you need a custom orchestration the supervisor can't express (custom routing logic, multi-step plans, agent that calls another agent over HTTP). ## What ResponsesAgent is @@ -217,7 +217,7 @@ dbutils.notebook.exit(json.dumps({ })) ``` -Submit via the same `jobs submit --no-wait` pattern shown in [training-and-serving.md](training-and-serving.md#train--deploy-as-a-serverless-job) — same script, just `deploy_agent.py` as the notebook. +Submit via the same `jobs submit --no-wait` pattern shown in [SKILL.md § Train + deploy as a serverless job](../SKILL.md#train--deploy-as-a-serverless-job) — same script, just `deploy_agent.py` as the notebook. 
## Query diff --git a/manifest.json b/manifest.json index 799ad1a..436f9ac 100644 --- a/manifest.json +++ b/manifest.json @@ -226,6 +226,19 @@ "repo_dir": "experimental", "version": "0.0.1" }, + "databricks-ml-training": { + "description": "Classical ML and custom-agent model training, MLflow tracking, and Unity Catalog model registration on Databricks. Use when the user asks to: train models (with MLflow, sklearn, XGBoost, LightGBM, PyTorch, custom pyfunc, etc.); run hyperparameter tuning with Optuna; register models to Unity Catalog and promote versions with `@prod` / `@challenger` aliases; load a registered model for batch scoring via `mlflow.pyfunc.spark_udf`; run inferences as batch, build custom MLflow PyFunc models (Models from Code); author a custom MLflow `ResponsesAgent` (LangGraph, OpenAI-compatible chat) with UC Function or Vector Search tools. NOT for: managing existing serving endpoints (use databricks-model-serving); no-code Knowledge Assistants or Supervisor Agents (use databricks-agent-bricks); MLflow evaluation / scorers (use databricks-mlflow-evaluation).", + "files": [ + "SKILL.md", + "agents/openai.yaml", + "assets/databricks.png", + "assets/databricks.svg", + "references/custom-pyfunc.md", + "references/genai-agents.md" + ], + "repo_dir": "experimental", + "version": "0.1.0" + }, "databricks-mlflow-evaluation": { "description": "MLflow 3 GenAI agent evaluation. 
Use when writing mlflow.genai.evaluate() code, creating @scorer functions, using built-in scorers (Guidelines, Correctness, Safety, RetrievalGroundedness), building eval datasets from traces, setting up trace ingestion and production monitoring, aligning judges with MemAlign from domain expert feedback, or running optimize_prompts() with GEPA for automated prompt improvement.", "files": [ @@ -255,13 +268,10 @@ "agents/openai.yaml", "assets/databricks.png", "assets/databricks.svg", - "references/custom-pyfunc.md", - "references/genai-agents.md", - "references/off-platform-streaming.md", - "references/training-and-serving.md" + "references/off-platform-streaming.md" ], "repo_dir": "skills", - "version": "0.3.0" + "version": "0.4.0" }, "databricks-pipelines": { "description": "Databricks Spark Declarative Pipelines (SDP) for ETL and streaming", diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index d99b437..2c23e78 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -1,9 +1,9 @@ --- name: databricks-model-serving -description: "Databricks Model Serving (ops) and MLflow model development (dev). Use when asked to: manage serving endpoints via CLI (create, query, update-config, scale, version-swap, integrate with Apps); train classical ML with MLflow autolog (sklearn/XGBoost/LightGBM/PyTorch); register models to Unity Catalog and promote via `@prod`/`@challenger` aliases; load registered models for batch scoring via `mlflow.pyfunc.spark_udf`; build custom PyFunc models (Models from Code); deploy a LangGraph `ResponsesAgent` with UC Function or Vector Search tools; discover Foundation Model API endpoints at runtime. NOT for: no-code Knowledge Assistants or Supervisor Agents (use databricks-agent-bricks); MLflow evaluation / scorers (use databricks-mlflow-evaluation)." +description: "Databricks Model Serving endpoint lifecycle and ops. 
Use when asked to: create, query, update, scale, or delete serving endpoints via CLI or the MLflow Deployments client; configure traffic routing for A/B / canary deployments; do zero-downtime version swaps; manage AI Gateway rate limits and usage tracking; discover Foundation Model API endpoints at runtime; integrate an endpoint into a Databricks App. NOT for: training models, MLflow autologging, UC model registration, custom PyFunc authoring, or hand-rolled ResponsesAgent code (use databricks-ml-training); no-code Knowledge Assistants or Supervisor Agents (use databricks-agent-bricks); MLflow evaluation / scorers (use databricks-mlflow-evaluation)." compatibility: Requires databricks CLI (>= v0.294.0) metadata: - version: "0.3.0" + version: "0.4.0" parent: databricks-core --- @@ -17,7 +17,7 @@ Model Serving provides managed endpoints for serving LLMs, custom ML models, and | Type | When to Use | Key Detail | |------|-------------|------------| -| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, simplest setup. Discover endpoints at runtime — see [references/training-and-serving.md § Foundation Model API endpoints](references/training-and-serving.md#foundation-model-api-endpoints). | +| Pay-per-token | Foundation Model APIs (Llama, GPT-5, Claude, Gemini, etc.) | Uses `system.ai.*` catalog models, pre-provisioned in every workspace. Discover at runtime — see [Foundation Model API endpoints](#foundation-model-api-endpoints) below. 
| | Provisioned throughput | Dedicated GPU capacity | Guaranteed throughput, higher cost | | Custom model | Your own MLflow models or containers | Deploy any model with an MLflow signature | @@ -74,7 +74,7 @@ databricks serving-endpoints create \ }' --profile <PROFILE> ``` -- Discover available Foundation Models: see [references/training-and-serving.md § Foundation Model API endpoints](references/training-and-serving.md#foundation-model-api-endpoints) for the runtime-list snippet and default-picking rules. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile <PROFILE>` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api <ENDPOINT_NAME> --profile <PROFILE>` to inspect a specific endpoint's API schema. +- Discover available Foundation Models: see [Foundation Model API endpoints](#foundation-model-api-endpoints) below for the runtime-list snippet and default-picking rules. You can also check the `system.ai` catalog in Unity Catalog, or run `databricks serving-endpoints list --profile <PROFILE>` to see what's deployed in the workspace. Use `databricks serving-endpoints get-open-api <ENDPOINT_NAME> --profile <PROFILE>` to inspect a specific endpoint's API schema. - Long-running operation; the CLI waits for completion by default. Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get <ENDPOINT_NAME> --profile <PROFILE> @@ -82,29 +82,65 @@ databricks serving-endpoints create \ ``` - For provisioned throughput or custom model endpoints, run `databricks serving-endpoints create -h` to discover the required JSON fields for your endpoint type. +### MLflow Deployments client (Python alternative) + +`mlflow.deployments.get_deploy_client("databricks").create_endpoint(name=..., config={...})` takes the same JSON shape as the CLI. Two gotchas: + +- **`tags=` is a top-level kwarg**, NOT a field inside `config`. Same `[{key, value}]` shape as `serving-endpoints patch --add-tags`. +- **`traffic_config.routes[].served_model_name` = `"<model_name>-<version>"`** (e.g. 
`"turbine_failure-3"`). The API auto-derives this from the entity, but you reference the exact string in `traffic_config` — get the format wrong and the route silently doesn't match. + +### Zero-downtime version swap + +To roll an endpoint to a new model version: repoint the alias **and** call `update_endpoint` with the new `served_entities` + matching `traffic_config`. Missing either half is the common bug — alias-only doesn't update the endpoint; `update_endpoint`-only leaves the alias pointing at the old version. + +```python +client.set_registered_model_alias(FULL_NAME, "prod", new_version) +client.update_endpoint(endpoint=ENDPOINT_NAME, config={ + "served_entities": [{"entity_name": FULL_NAME, "entity_version": new_version, + "workload_size": "Small", "scale_to_zero_enabled": True}], + "traffic_config": {"routes": [ + {"served_model_name": f"{NAME}-{new_version}", "traffic_percentage": 100} + ]}, +}) +``` + +The CLI equivalent is `databricks serving-endpoints update-config --json '...'`. Either way, poll both `state.ready` and `state.config_update` afterward — see Endpoint Readiness below. + ### Endpoint Readiness -After `create` or `update-config`, the endpoint provisions compute and loads the model. **Do not query the endpoint until it is ready.** +After `create` or `update-config`, the endpoint provisions compute and loads the model. **Do not query the endpoint until it is ready.** Two state fields matter and they mean different things: + +- `state.ready` — `READY` once the endpoint has any working config. Stays `READY` during a version swap. +- `state.config_update` — `NOT_UPDATING` once the *current* config update finishes; `IN_PROGRESS` during a version swap. -Poll for readiness: +A loop watching only `state.ready` will say "ready" mid version-swap while the old version is still serving. 
**Poll both:** ```bash -databricks serving-endpoints get <ENDPOINT_NAME> --profile <PROFILE> -o json -# Ready when: state.ready == "READY" AND state.config_update == "NOT_UPDATING" +databricks serving-endpoints get <ENDPOINT_NAME> --profile <PROFILE> \ + | jq '{ready: .state.ready, config_update: .state.config_update}' +# Fully ready when ready == "READY" AND config_update == "NOT_UPDATING" ``` -Provisioning may take several minutes. Provisioned throughput endpoints take the longest (GPU allocation). Queries to endpoints that are not yet `READY` return 404 or 503 errors. +Provisioning may take several minutes. Provisioned throughput endpoints take the longest (GPU allocation). Queries to endpoints that are not yet `READY` return 404 or 503. ## Query an Endpoint +Chat / agent endpoints use the `messages` array: + +```bash +databricks serving-endpoints query <ENDPOINT_NAME> \ + --json '{"messages": [{"role": "user", "content": "Hello"}]}' --profile <PROFILE> +``` + +Classical-ML endpoints use `dataframe_records` (one record per row): + ```bash databricks serving-endpoints query <ENDPOINT_NAME> \ - --json '{"messages": [{"role": "user", "content": "Hello, how are you?"}]}' \ - --profile <PROFILE> + --json '{"dataframe_records": [{"vibration": 0.42, "rpm": 18.3, "temp_c": 71.2}]}' ``` -- Use `--stream` for streaming responses. -- For non-chat endpoints (embeddings, custom models): use `get-open-api <ENDPOINT_NAME>` first to discover the request/response schema, then construct the appropriate JSON payload. +- Use `--stream` for streaming responses on chat endpoints. +- For embeddings or other custom schemas: use `get-open-api <ENDPOINT_NAME>` first to discover the request/response shape. ## Get Endpoint Schema (OpenAPI) @@ -179,13 +215,27 @@ Then add a tRPC route to call it from your app. For the full app integration pat ### Develop & deploy new models -This skill is ops-focused (manage existing endpoints). 
For the dev-side flow — training, MLflow tracking, UC registration, custom PyFunc authoring, and hand-rolled `ResponsesAgent` code — see **[databricks-ml-training](../../experimental/databricks-ml-training/SKILL.md)** (experimental). + +## Foundation Model API endpoints + +Pay-per-token, pre-provisioned in every workspace. New models land regularly and a static skill list goes stale fast — **always list at runtime instead of hard-coding names**. Filter by the `databricks-` name prefix AND by the served entity being in `system.ai.*` (other endpoints like `databricks-app-template-serving` share the prefix but aren't FM API endpoints). + +```bash +# FM API endpoints in this workspace, grouped by task (chat / embeddings / etc.) +databricks serving-endpoints list \ + | jq -r '.[] + | select(.name | startswith("databricks-")) + | select((.config.served_entities[0].entity_name // "") | startswith("system.ai.")) + | "\(.task)\t\(.name)"' \ + | sort +``` + +**Defaults when the user doesn't specify**: pick the highest-numbered Claude Sonnet for agents, the highest-numbered `-codex-max` for code, `databricks-gte-large-en` for embeddings — resolve actual names from the live list above. + +## Off-platform streaming -| Reference | When to read | -|---|---| -| [references/training-and-serving.md](references/training-and-serving.md) | Train + register classical ML with `mlflow.autolog`, alias-based promotion (`@prod`), batch scoring via `spark_udf`, real-time endpoint create + zero-downtime version swap, async deploy via `jobs submit --no-wait`. Includes the Foundation Model API endpoints runtime-list and the gotchas table. | -| [references/custom-pyfunc.md](references/custom-pyfunc.md) | When `autolog` isn't enough — file-based `PythonModel` ("Models from Code"), `infer_signature`, `code_paths`, pre-deploy validation with `mlflow.models.predict(env_manager="uv")`. 
| -| [references/genai-agents.md](references/genai-agents.md) | Hand-rolled `ResponsesAgent` with LangGraph + `UCFunctionToolkit` + `VectorSearchRetrieverTool`. Includes the `create_text_output_item` helper-method gotcha and the `resources=[...]` passthrough-auth list. | +For apps deployed **outside** Databricks Apps (Vercel, AWS, standalone Node.js) hitting Databricks AI Gateway with Vercel AI SDK v6, see [references/off-platform-streaming.md](references/off-platform-streaming.md). For AppKit-based apps, use the `databricks-apps` skill's built-in serving plugin instead. ## Troubleshooting @@ -197,3 +247,4 @@ This skill is ops-focused (manage existing endpoints). For the dev-side flow — | `RESOURCE_DOES_NOT_EXIST` | Verify endpoint name with `list` | | Query returns 404 | Endpoint may still be provisioning; check `state.ready` via `get` | | `RATE_LIMIT_EXCEEDED` (429) | AI Gateway rate limit; check `put-ai-gateway` config or retry after backoff | +| Endpoint missing from the Serving UI after deploy | UI filter defaults to "Owned by me". Deploy jobs run as a service principal, so the endpoint is hidden until you switch to "All". `databricks serving-endpoints list` always shows it. |