diff --git a/manifest.json b/manifest.json index 54ec72f..3553b78 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-04-30T11:02:41Z", + "updated_at": "2026-05-04T13:00:55Z", "skills": { "databricks-apps": { "version": "0.1.1", "description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "experimental": false, - "updated_at": "2026-04-30T11:00:26Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -33,7 +33,7 @@ "version": "0.1.0", "description": "Core Databricks skill for CLI, auth, and data exploration", "experimental": false, - "updated_at": "2026-04-23T13:47:44Z", + "updated_at": "2026-05-04T12:38:42Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -48,7 +48,7 @@ "version": "0.0.0", "description": "Declarative Automation Bundles (DABs) for deploying and managing Databricks resources", "experimental": false, - "updated_at": "2026-04-23T13:47:44Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -66,7 +66,7 @@ "version": "0.1.0", "description": "Databricks Jobs orchestration and scheduling", "experimental": false, - "updated_at": "2026-04-23T13:47:44Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-04-30T11:02:37Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -93,7 +93,7 @@ "version": "0.1.0", "description": "Databricks Model Serving endpoint management", "experimental": false, - "updated_at": "2026-04-23T13:47:44Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -105,7 +105,7 @@ "version": "0.1.0", "description": "Databricks Pipelines (DLT) for ETL and streaming", 
"experimental": false, - "updated_at": "2026-04-23T13:47:44Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -152,7 +152,7 @@ "version": "0.1.0", "description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes", "experimental": false, - "updated_at": "2026-04-24T15:10:23Z", + "updated_at": "2026-04-30T11:19:36Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -164,6 +164,25 @@ "references/networking-and-security.md", "references/streaming-migration.md" ] + }, + "databricks-unitycatalog": { + "version": "0.1.0", + "description": "Unity Catalog governance: discovery, grants, volumes, external locations, lineage, and UC-managed objects", + "experimental": false, + "updated_at": "2026-05-04T12:42:21Z", + "files": [ + "SKILL.md", + "agents/openai.yaml", + "assets/databricks.png", + "assets/databricks.svg", + "references/access-control.md", + "references/ai-ml-objects.md", + "references/lineage-and-observability.md", + "references/namespace-and-objects.md", + "references/operations-and-migration.md", + "references/storage-and-connections.md", + "references/volumes.md" + ] } } } diff --git a/skills/databricks-unitycatalog/SKILL.md b/skills/databricks-unitycatalog/SKILL.md new file mode 100644 index 0000000..2cc7822 --- /dev/null +++ b/skills/databricks-unitycatalog/SKILL.md @@ -0,0 +1,49 @@ +--- +name: databricks-unitycatalog +description: "Unity Catalog governance operations: discovery, grants, volumes, external locations, and UC object workflows." +compatibility: Requires databricks CLI (>= v0.292.0) +metadata: + version: "0.1.0" +parent: databricks-core +--- + +# Databricks Unity Catalog + +**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, and profile selection. 
+ +Use this skill for Unity Catalog governance and day-2 operations: namespaces and objects, discovery, grants/privileges, volumes, external locations, storage credentials, lineage/observability, and UC-managed AI/ML objects. + +## Required Reading by Task + +| Task | READ BEFORE proceeding | +|------|------------------------| +| Discover catalogs/schemas/tables; search metadata | [Namespace & discovery](references/namespace-and-objects.md) | +| Grants, privileges, ownership/MANAGE, RLS/CLS | [Access control](references/access-control.md) | +| Read/write files via Volumes | [Volumes](references/volumes.md) | +| External locations, storage credentials, federation, sharing | [Storage & connections](references/storage-and-connections.md) | +| Lineage, tags, audit logs, cost attribution | [Lineage & observability](references/lineage-and-observability.md) | +| Maintenance, time travel, migration, constraints, clone | [Operations & migration](references/operations-and-migration.md) | +| Models, functions, vector search, feature tables | [AI & ML objects](references/ai-ml-objects.md) | + +## Priorities (P1 → P3) + +- **P1**: Access control (grants/privileges), volumes + external locations, and metadata discovery (`information_schema`) +- **P2**: Lineage/observability (tags, audit logs), federation/sharing patterns, and operational best practices +- **P3**: Billing and cost attribution patterns (system tables) + +## Key gotchas (do not skip) + +- **CLI args**: many UC list/get commands use **positional** arguments (see parent `databricks-core` quick reference). +- **File privileges**: **`WRITE FILES` requires `READ FILES`** (common cause of confusing permission errors). +- **Discovery without data**: `BROWSE` enables seeing objects without reading table data. +- **Ownership vs MANAGE**: these are not interchangeable; confirm which is required for the operation. 
+ +## Reference Guides + +- [Namespace & discovery](references/namespace-and-objects.md) +- [Access control](references/access-control.md) +- [Volumes](references/volumes.md) +- [Storage & connections](references/storage-and-connections.md) +- [Lineage & observability](references/lineage-and-observability.md) +- [Operations & migration](references/operations-and-migration.md) +- [AI & ML objects](references/ai-ml-objects.md) diff --git a/skills/databricks-unitycatalog/agents/openai.yaml b/skills/databricks-unitycatalog/agents/openai.yaml new file mode 100644 index 0000000..e9e90ef --- /dev/null +++ b/skills/databricks-unitycatalog/agents/openai.yaml @@ -0,0 +1,7 @@ +interface: + display_name: "Databricks Unity Catalog" + short_description: "UC governance: grants, volumes, external locations" + icon_small: "./assets/databricks.svg" + icon_large: "./assets/databricks.png" + brand_color: "#FF3621" + default_prompt: "Use $databricks-unitycatalog for Unity Catalog governance tasks (grants, volumes, external locations, discovery)." 
diff --git a/skills/databricks-unitycatalog/assets/databricks.png b/skills/databricks-unitycatalog/assets/databricks.png new file mode 100644 index 0000000..263fe98 Binary files /dev/null and b/skills/databricks-unitycatalog/assets/databricks.png differ diff --git a/skills/databricks-unitycatalog/assets/databricks.svg b/skills/databricks-unitycatalog/assets/databricks.svg new file mode 100644 index 0000000..9d19110 --- /dev/null +++ b/skills/databricks-unitycatalog/assets/databricks.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/databricks-unitycatalog/references/access-control.md b/skills/databricks-unitycatalog/references/access-control.md new file mode 100644 index 0000000..370c0f0 --- /dev/null +++ b/skills/databricks-unitycatalog/references/access-control.md @@ -0,0 +1,75 @@ +# Access control (grants, privileges, RLS/CLS) + +## When to use this reference + +Use this doc for: + +- Grant/revoke workflows (`GRANT`, `REVOKE`, `SHOW GRANTS`) +- “I can’t see the table” vs “I can’t query the table” debugging +- Permissions on volumes / external locations (file privileges) +- Row and column-level security (row filters, column masks) + +## Core concepts (keep straight) + +- **Privileges** are granted on UC securables (catalogs, schemas, tables/views, volumes, external locations, functions, etc.). +- **Discovery** can be separated from data access via `BROWSE`. +- **Namespace traversal** often requires `USE CATALOG` + `USE SCHEMA` even when `SELECT` exists. +- **Ownership** is not the same as `MANAGE` (workspaces differ on what each enables). +- **File privileges gotcha**: **`WRITE FILES` requires `READ FILES`**. + +## Quick checklist: “why can’t user X query object Y?” + +1. Confirm the user/principal identity (``). +2. Check grants on: + - catalog + schema (traversal / discovery) + - the target object (table/view/volume/external location) +3. 
If the error mentions files/paths, verify file privileges (`READ FILES`, `WRITE FILES`) and underlying external location grants. +4. If the query returns fewer rows or masked values, check row filters / column masks. + +## Common SQL patterns + +```sql +-- Inspect grants (examples) +SHOW GRANTS ON CATALOG <catalog>; +SHOW GRANTS ON SCHEMA <catalog>.<schema>; +SHOW GRANTS ON TABLE <catalog>.<schema>.<table>; +SHOW GRANTS ON VIEW <catalog>.<schema>.<view>; +SHOW GRANTS ON VOLUME <catalog>.<schema>.<volume>; + +-- Minimal traversal + discovery (lets users find objects) +GRANT USE CATALOG ON CATALOG <catalog> TO `<principal>`; +GRANT USE SCHEMA ON SCHEMA <catalog>.<schema> TO `<principal>`; +GRANT BROWSE ON CATALOG <catalog> TO `<principal>`; + +-- Data access +GRANT SELECT ON TABLE <catalog>.<schema>.<table>
TO `<principal>`; + +-- Revoke +REVOKE SELECT ON TABLE <catalog>.<schema>.<table>
FROM ``; +``` + +### Troubleshooting: “not found” vs “permission denied” + +- **“Not found” / can’t list** often means missing `USE CATALOG` / `USE SCHEMA` and/or `BROWSE`. +- **“Permission denied” on query** usually means missing `SELECT`, or a denied row/column policy, or file privileges on underlying storage paths. + +## `ALL PRIVILEGES` notes + +Treat `ALL PRIVILEGES` as a convenience that depends on object type and platform semantics. Prefer granting only what is required and verifying with `SHOW GRANTS`. + +## Ownership vs `MANAGE` + +Document which operations require ownership vs `MANAGE` in your environment. Do not assume one implies the other. + +## RLS/CLS: row filters + column masks + +Unity Catalog can enforce: + +- **Row filters**: restrict which rows a principal can see +- **Column masks**: redact/transform specific columns + +Debug workflow: + +- Start with a minimal query selecting non-sensitive columns +- If results differ by principal, inspect applicable row/column policies +- Confirm base privileges first (`USE CATALOG`, `USE SCHEMA`, `SELECT`) diff --git a/skills/databricks-unitycatalog/references/ai-ml-objects.md b/skills/databricks-unitycatalog/references/ai-ml-objects.md new file mode 100644 index 0000000..b132b0b --- /dev/null +++ b/skills/databricks-unitycatalog/references/ai-ml-objects.md @@ -0,0 +1,62 @@ +# AI & ML objects in Unity Catalog (models, functions, vector, features) + +## When to use this reference + +Use this doc when working with UC-governed AI/ML primitives: + +- registered models +- UC functions (including those used as governed “tools”) +- vector search indexes +- feature tables and online store publishing (if applicable) + +## Registered models (governance mindset) + +UC can govern registered models and their lifecycle (versions, aliases/stages depending on setup). 
Treat model governance similarly to table governance: + +- who can read / write / deploy +- how changes are audited +- how environments (dev/stage/prod) are separated + +## UC functions as governed tools + +UC functions can be a controlled “tool surface” when used intentionally. + +Checklist: + +- Add a clear `COMMENT` describing safe usage and inputs/outputs. +- Ensure callers have `EXECUTE` privilege (and only what they need). +- Avoid designs that require embedding secrets in function bodies or configs. + +## Python UDFs / UDTFs (validate constraints early) + +Support, packaging, and runtime constraints vary by environment. Validate: + +- runtime compatibility +- dependency strategy (what can/can’t be packaged) +- permissions (who can create/alter/execute) + +## Vector Search indexes + +Common patterns: + +- direct index over data +- Delta Sync-managed refresh + +Pick based on freshness requirements and operational overhead. + +## Feature tables / online store publishing + +Typical workflow: + +- curate feature tables with stable keys and definitions +- publish/sync to an online store (if used) + +Confirm which feature APIs your workspace supports and which principal will run publish/sync jobs (human vs service principal). 
+ +## External access from functions/UDFs + +If functions/UDFs access external cloud services: + +- keep credentials out of code (no embedded tokens/secrets) +- confirm egress/networking policies allow access +- enforce least privilege and auditability diff --git a/skills/databricks-unitycatalog/references/lineage-and-observability.md b/skills/databricks-unitycatalog/references/lineage-and-observability.md new file mode 100644 index 0000000..20672e1 --- /dev/null +++ b/skills/databricks-unitycatalog/references/lineage-and-observability.md @@ -0,0 +1,66 @@ +# Lineage & observability (metadata, tags, audit, billing) + +## When to use this reference + +Use this doc when you need to: + +- Verify lineage exists for a table/model/dashboard/pipeline +- Bring lineage from external systems (or document gaps) +- Apply or audit tags (system vs governed) +- Investigate access and permission changes via audit logs +- Attribute costs using system billing tables + +## Automated lineage (how to reason about it) + +Unity Catalog can capture lineage across common compute and platform surfaces (tables, pipelines, dashboards, models). Coverage varies by feature/integration. + +Checklist: + +- Validate lineage on a representative object first (don’t assume global coverage). +- If lineage is missing, determine whether it’s a tooling gap, a permissions gap, or an unsupported integration path. + +## External lineage (BYO) + +For systems outside Databricks (BI tools, SaaS sources, external warehouses), use external lineage ingestion where available. If not possible, document: + +- what lineage will remain missing +- what identifiers can be used to correlate (table names, URLs, workbook IDs, etc.) + +## Tags (system vs governed) + +- **System tags**: platform-generated metadata. +- **Governed tags**: curated taxonomy with controlled assignment. 
+ +When using governed tags, principals may require privileges such as: + +- `APPLY TAG` +- an assignment permission (often called `ASSIGN`) depending on the governed-tag system in use + +## Audit logs (`system.access.audit`) + +Use audit logs to answer “who did what, when” and to diagnose unexpected permission/access patterns. + +```sql +-- Recent grant/revoke-related actions +SELECT * +FROM system.access.audit +WHERE event_time >= current_timestamp() - INTERVAL 7 DAYS + AND ( + lower(action_name) LIKE '%grant%' + OR lower(action_name) LIKE '%revoke%' + ) +ORDER BY event_time DESC +LIMIT 200; +``` + +## Billing / cost attribution (`system.billing.usage`) + +Use usage tables for cost attribution by workspace, identity, SKU, and time range. + +```sql +SELECT * +FROM system.billing.usage +WHERE usage_start_time >= current_timestamp() - INTERVAL 30 DAYS +ORDER BY usage_start_time DESC +LIMIT 200; +``` diff --git a/skills/databricks-unitycatalog/references/namespace-and-objects.md b/skills/databricks-unitycatalog/references/namespace-and-objects.md new file mode 100644 index 0000000..d17b0f7 --- /dev/null +++ b/skills/databricks-unitycatalog/references/namespace-and-objects.md @@ -0,0 +1,76 @@ +# Namespace & objects (Unity Catalog) + +## When to use this reference + +Use this doc when you need to: + +- Navigate the 3-level namespace (`catalog.schema.table`) +- Inventory catalogs/schemas/tables quickly +- Search metadata at scale via `information_schema` +- Decide between managed vs external tables, and understand view types + +## Core model: 3-level namespace + +- **Fully-qualified**: `catalog.schema.table` +- Most governance and discovery flows require the ability to traverse the namespace (often `USE CATALOG` + `USE SCHEMA`, and sometimes `BROWSE`). + +## CLI discovery (fastest for inventory) + +Always include `--profile `. 
+ +```bash +# list catalogs +databricks catalogs list --profile <profile> + +# list schemas in a catalog (⚠️ positional arg) +databricks schemas list <catalog> --profile <profile> + +# list tables in a schema (⚠️ positional args) +databricks tables list <catalog> <schema> --profile <profile> + +# inspect a table +databricks tables get <catalog>.<schema>.<table>
--profile <profile> +``` + +### CLI gotcha: positional args + +Many UC commands use **positional** arguments (e.g. `schemas list <catalog>`). Do not invent flags like `--catalog-name` unless `--help` shows them. + +## SQL discovery via `information_schema` (best for search) + +Run on a SQL warehouse. + +```sql +-- Find tables by name pattern +SELECT table_catalog, table_schema, table_name +FROM system.information_schema.tables +WHERE lower(table_name) LIKE '%customer%'; + +-- Find columns across the lakehouse (handy for “where is field X?”) +SELECT table_catalog, table_schema, table_name, column_name, data_type +FROM system.information_schema.columns +WHERE lower(column_name) LIKE '%email%'; + +-- Inspect columns for one table +SELECT column_name, data_type, is_nullable, comment +FROM system.information_schema.columns +WHERE table_catalog = '<catalog>' + AND table_schema = '<schema>' + AND table_name = '<table>
' +ORDER BY ordinal_position; +``` + +If `system.information_schema` is unavailable, fall back to per-catalog `information_schema` (availability varies by workspace and configuration). + +## Managed vs external tables (decision guide) + +- **Managed tables**: simplest ops; UC controls the storage lifecycle. +- **External tables**: data lives in customer-controlled cloud storage; common for shared paths, multi-tool interoperability, and explicit storage ownership. + +Default to **managed** unless you have a specific requirement to own the underlying storage path and lifecycle. + +## Views, materialized views, metric views (quick mental model) + +- **Views**: computed at query time; no storage of results. +- **Materialized views**: persisted results to accelerate repeated workloads (refresh semantics vary). +- **Metric views**: newer abstraction; verify feature availability and semantics in the target workspace before relying on it. diff --git a/skills/databricks-unitycatalog/references/operations-and-migration.md b/skills/databricks-unitycatalog/references/operations-and-migration.md new file mode 100644 index 0000000..63a2e12 --- /dev/null +++ b/skills/databricks-unitycatalog/references/operations-and-migration.md @@ -0,0 +1,66 @@ +# Operations & migration (maintenance, time travel, constraints, clone) + +## When to use this reference + +Use this doc for day-2 table operations and migrations: + +- maintenance (`OPTIMIZE`, `VACUUM`, clustering) +- time travel for debugging/recovery +- migration of legacy (Hive) tables into UC +- constraints and cloning strategies + +## Maintenance operations (what to be careful about) + +Common operations: + +- `OPTIMIZE`: improves data layout / compacts files for performance. +- `VACUUM`: deletes old data files; **verify retention/compliance** before changing defaults. +- liquid clustering / auto-clustering: capability varies; confirm the workspace’s current behavior and defaults. 
+ +## Time travel (debugging + recovery) + +Be mindful of retention and any runtime-specific limitations. + +```sql +-- version-based +SELECT * +FROM ..
VERSION AS OF 123; + +-- timestamp-based +SELECT * +FROM <catalog>.<schema>.<table>
TIMESTAMP AS OF '2026-01-01T00:00:00Z'; +``` + +Debug checklist: + +- Confirm the queried version/timestamp is within retention. +- If results differ across environments, check table properties and retention configuration. + +## Predictive optimization + +Enablement may be per-table, per-schema, or per-catalog depending on workspace defaults and policy. Verify what’s enabled before assuming optimizations will occur automatically. + +## Migrating Hive tables to Unity Catalog (SYNC workflow) + +Treat migrations as a controlled change: + +- Validate schema compatibility. +- Map permissions intentionally (don’t assume inheritance matches legacy ACLs). +- Inventory downstream dependencies (jobs, dashboards, notebooks, apps). +- Migrate a small subset first, then expand. + +## Constraints (validate support + enforcement) + +Common constraint types: + +- `NOT NULL`, `CHECK` +- `PRIMARY KEY`, `FOREIGN KEY` + +Support and enforcement semantics can vary; validate behavior in the target workspace before depending on constraint enforcement. + +## CLONE (deep vs shallow) + +- **Shallow clone**: fast; depends on source data retention and access to underlying files. +- **Deep clone**: copies data; safer isolation, higher cost. + +Pick based on isolation and retention guarantees you need. 
diff --git a/skills/databricks-unitycatalog/references/storage-and-connections.md b/skills/databricks-unitycatalog/references/storage-and-connections.md new file mode 100644 index 0000000..370048d --- /dev/null +++ b/skills/databricks-unitycatalog/references/storage-and-connections.md @@ -0,0 +1,60 @@ +# Storage & connections (credentials, external locations, federation, sharing) + +## When to use this reference + +Use this doc when you’re working with governed access to storage and external systems: + +- storage credentials (identity used to access cloud storage) +- external locations (governed mapping to cloud paths) +- federation / foreign catalogs via connections +- Delta Sharing as provider or recipient + +## Storage credentials (what they are) + +Storage credentials define **how Databricks authenticates to cloud storage** for governed access. Commonly backed by: + +- managed identity +- service principal + +Keep all examples obfuscated (no real workspace URLs, account names, or IDs). + +## External locations (the key governance primitive) + +External locations bind a cloud storage URL to a UC securable so permissions can be managed centrally. + +### Operational guidelines + +- Validate location access at creation time where possible. +- Use **read-only** locations for shared datasets. +- For write-enabled locations, explicitly grant file privileges. + +### Gotcha: file privileges + +**`WRITE FILES` requires `READ FILES`**. Always grant both when enabling writes. + +## Federation / foreign catalogs (via connections) + +Connections can expose external systems as foreign catalogs. Before building workflows: + +- confirm supported operations (read-only vs read/write) +- confirm identity and credential scope used by the connection +- confirm performance expectations and pushdown behavior + +## Iceberg REST catalog (optional integration) + +Iceberg REST catalog support (especially writes) varies by workspace and connector maturity. 
Treat as an optional integration and verify current support before committing to it. + +## Delta Sharing (provider/recipient mental model) + +Objects and workflows include: + +- providers +- shares +- recipients +- token rotation / credential hygiene + +Troubleshooting checklist: + +- Identify the role: **provider** vs **recipient** +- Identify which principal is used (human, service principal, workspace identity) +- Confirm that the recipient’s token/credentials are current and stored securely (no embedded secrets in code) diff --git a/skills/databricks-unitycatalog/references/volumes.md b/skills/databricks-unitycatalog/references/volumes.md new file mode 100644 index 0000000..7c3b172 --- /dev/null +++ b/skills/databricks-unitycatalog/references/volumes.md @@ -0,0 +1,47 @@ +# Volumes (managed vs external) + +## When to use this reference + +Use this doc when you need UC-governed file access for: + +- notebooks / jobs reading and writing files +- data ingestion/export paths governed by UC +- sharing “known-good” paths with clear permissions (instead of ad-hoc mounts) + +## Managed vs external volumes (decision guide) + +- **Managed volume**: simplest lifecycle; storage managed by Databricks/UC. +- **External volume**: backed by customer-owned cloud storage (via an external location); best when you must control the underlying path and lifecycle. + +Default to **managed** unless you have a clear reason to control the cloud path. + +## Create volumes (SQL) + +```sql +-- Managed volume +CREATE VOLUME <catalog>.<schema>.<volume>; + +-- External volume (location is a cloud path; cloud-specific scheme varies) +CREATE EXTERNAL VOLUME <catalog>.<schema>.<volume> +LOCATION '<cloud_storage_url>/<path>/<volume_dir>'; +``` + +## Use volumes in code: canonical paths + +Prefer the `/Volumes/...` path so code is portable across notebooks/jobs: + +- `/Volumes/<catalog>/<schema>/<volume>/some/file.parquet` + +`dbutils.fs` can be used as an API surface, but the **path** should still typically be a `/Volumes/...` path. 
+ +## Permissions: the two-layer model (common failure source) + +When something fails, check both layers: + +- **Volume grants** (the UC object you read/write; volume-level privileges are `READ VOLUME` / `WRITE VOLUME`) +- **External location grants** (for external volumes; file privileges are `READ FILES` / `WRITE FILES`) + +### Gotchas + +- **`WRITE FILES` requires `READ FILES`** (grant both). +- If users can list but not read, you may be missing `READ VOLUME` on the volume, file privileges, or underlying external location permissions.