Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions manifest.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"version": "2",
"updated_at": "2026-05-11T13:22:07Z",
"updated_at": "2026-05-18T12:22:38Z",
"skills": {
"databricks-apps": {
"version": "0.1.1",
"description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)",
"experimental": false,
"updated_at": "2026-05-11T13:22:01Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -33,7 +33,7 @@
"version": "0.1.0",
"description": "Core Databricks skill for CLI, auth, and data exploration",
"experimental": false,
"updated_at": "2026-05-11T10:22:59Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -48,7 +48,7 @@
"version": "0.0.0",
"description": "Declarative Automation Bundles (DABs) for deploying and managing Databricks resources",
"experimental": false,
"updated_at": "2026-05-05T15:31:42Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -66,7 +66,7 @@
"version": "0.1.0",
"description": "Databricks Jobs orchestration and scheduling",
"experimental": false,
"updated_at": "2026-05-07T15:19:50Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -78,7 +78,7 @@
"version": "0.1.0",
"description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API",
"experimental": false,
"updated_at": "2026-05-11T10:23:05Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -93,7 +93,7 @@
"version": "0.1.0",
"description": "Databricks Model Serving endpoint management",
"experimental": false,
"updated_at": "2026-05-07T15:19:45Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -105,7 +105,7 @@
"version": "0.1.0",
"description": "Databricks Pipelines (DLT) for ETL and streaming",
"experimental": false,
"updated_at": "2026-05-07T15:19:55Z",
"updated_at": "2026-05-18T12:11:44Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand Down Expand Up @@ -152,7 +152,7 @@
"version": "0.1.0",
"description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes",
"experimental": false,
"updated_at": "2026-05-07T15:19:59Z",
"updated_at": "2026-05-18T12:22:14Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
Expand All @@ -164,6 +164,22 @@
"references/networking-and-security.md",
"references/streaming-migration.md"
]
},
"databricks-serverless-storage-check": {
"version": "0.1.0",
"description": "Detect cross-task file-sharing antipatterns in serverless jobs (writes to /local_disk0, /tmp, or trustedTemp that are read by sibling or child tasks) and recommend UC Volumes or /Workspace handoff",
"experimental": false,
"updated_at": "2026-05-18T12:21:52Z",
"files": [
"SKILL.md",
"agents/openai.yaml",
"assets/databricks.png",
"assets/databricks.svg",
"references/pattern-catalog.md",
"references/remediation-guide.md",
"scripts/preflight.py",
"scripts/test_preflight.py"
]
}
}
}
4 changes: 4 additions & 0 deletions scripts/skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
"description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes",
"experimental": False,
},
"databricks-serverless-storage-check": {
"description": "Detect cross-task file-sharing antipatterns in serverless jobs (writes to /local_disk0, /tmp, or trustedTemp that are read by sibling or child tasks) and recommend UC Volumes or /Workspace handoff",
"experimental": False,
},
}


Expand Down
2 changes: 1 addition & 1 deletion skills/databricks-serverless-migration/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Scan the code for patterns that are incompatible with the serverless compute arc
| Pattern | Severity | Fix |
|---------|----------|-----|
| `dbfs:/` or `/dbfs/` paths (persistent data) | Blocker | Replace with `/Volumes/<your_catalog>/schema/volume/path` |
| `dbfs:/tmp/`, `/dbfs/tmp/`, paths with `cache`/`scratch`/`temp` | Warning | Use `/tmp/` or `/local_disk0/tmp/` (local driver disk) — do not use Volumes for temp files due to performance |
| `dbfs:/tmp/`, `/dbfs/tmp/`, paths with `cache`/`scratch`/`temp` | Warning | Use `/tmp/` or `/local_disk0/tmp/` (local driver disk) — do not use Volumes for temp files due to performance. **Per-task scratch only**: if another task (child notebook, sibling job task, or pipeline) needs to read the file, use UC Volumes or `/Workspace` — see [`databricks-serverless-storage-check`](../databricks-serverless-storage-check/SKILL.md). |
| `file:///dbfs/` FUSE mount paths | Warning | Replace persistent paths with `/Volumes/...`; replace temp paths with `/local_disk0/tmp/` |
| `dbutils.fs.mount(...)` | Blocker | Create UC external location + external volume |
| `hive_metastore.db.table` | Warning | Migrate to UC or use HMS Federation: `CREATE FOREIGN CATALOG ... USING CONNECTION hms_connection` |
Expand Down
149 changes: 149 additions & 0 deletions skills/databricks-serverless-storage-check/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
---
name: databricks-serverless-storage-check
description: "Detect cross-task file-sharing antipatterns in Databricks serverless jobs (writes to /local_disk0, /tmp, or trustedTemp that are read by sibling or child tasks on potentially different compute nodes) and recommend UC Volumes or /Workspace for handoff. Use when a serverless job fails with `INTERNAL_ERROR: [Errno 13] Permission denied` on /local_disk0 paths, when parallel child notebooks fail intermittently, when reviewing a DAB job before deploying to serverless, or when the user mentions trustedTemp, fan-out, or cross-task file handoff. Complements databricks-serverless-migration (which covers single-notebook migration)."
compatibility: Requires databricks CLI (>= v0.292.0) for --job-id and --run-id modes; --notebook / --dir / --job-yaml modes have no external dependencies.
metadata:
version: "0.1.0"
parent: databricks-core
---

# Serverless Storage Check

**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, and profile selection.

This skill detects a specific class of serverless failure: **cross-task file handoffs through local disk**. On serverless compute, each task may run on a different node, so a path written by a parent task to `/local_disk0`, `/tmp`, or a `trustedTemp` directory is not guaranteed to be visible to a child task. The typical symptom is:

```
INTERNAL_ERROR: [Errno 13] Permission denied:
'/local_disk0/spark-<id>/trustedTemp-<id>/tmp<id>'
```

The fix is to move the handoff off local disk and onto durable, cross-node storage — UC Volumes (preferred) or `/Workspace` (fallback) — or replace the file handoff entirely with `dbutils.jobs.taskValues` for small payloads.

This skill ships an executable preflight scanner (`scripts/preflight.py`) that statically detects these antipatterns and emits remediation guidance. It is intentionally narrow: it does **not** try to fix `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT`, which is a separate, platform-side intermittent issue (see "What this skill does NOT cover" below).

## When to use this skill

Use this skill when any of these triggers appear:

- A serverless job fails with `INTERNAL_ERROR: [Errno 13] Permission denied` on `/local_disk0`, `/tmp`, or a path containing `trustedTemp`
- Parallel child notebooks (`dbutils.notebook.run`) fail intermittently while the same logic succeeds when run sequentially in a single notebook
- A DAB job is about to be deployed to serverless and has multiple `notebook_task` or `pipeline_task` tasks
- The user mentions "trustedTemp", "fan-out", "cross-task file sharing", or `/local_disk0`
- A new serverless job design needs a sanity check before first run

This skill is **complementary to**, not a replacement for, [`databricks-serverless-migration`](../databricks-serverless-migration/SKILL.md). That skill handles single-notebook migration and explicitly recommends `/local_disk0/tmp` for per-task scratch — which is correct *inside* a task. The boundary between the two skills:

| Concern | Use skill |
|---------|-----------|
| Migrating one notebook from classic DBR to serverless | `databricks-serverless-migration` |
| Per-task scratch storage (intra-task) | `databricks-serverless-migration` (recommends `/local_disk0/tmp`) |
| **Cross-task file handoff between parent/child notebooks or sibling tasks** | **this skill** |
| Permission-denied on `/local_disk0` during a multi-task run | **this skill** |

## Quick start

Run the preflight scanner against any of: a single notebook, a directory, a DAB job YAML, a remote job, or a failed run.

```bash
# Single notebook
python3 scripts/preflight.py --notebook path/to/notebook.ipynb

# Recursive scan of a directory
python3 scripts/preflight.py --dir path/to/repo/

# A DAB job YAML (auto-resolves referenced notebooks)
python3 scripts/preflight.py --job-yaml resources/my_job.job.yml

# A remote job (pulls notebook source via databricks workspace export)
python3 scripts/preflight.py --job-id 123456789 --profile DEFAULT

# A failed run (classifies the error trace as fan-out vs env-sync)
python3 scripts/preflight.py --run-id 987654321 --profile DEFAULT

# Machine-readable output for CI gating
python3 scripts/preflight.py --dir . --json
```

## Interpreting the output

The scanner prints findings grouped by severity. Each finding includes the pattern ID, file, line, code snippet, and a recommended fix snippet.

| Severity | Meaning | Exit code |
|----------|---------|-----------|
| **Blocker** | Will fail on serverless. Must fix before deploy. | `2` |
| **Warning** | Likely to fail under parallel execution. Should fix. | `1` |
| **Info** | Awareness-only or escalation routing (e.g. env-sync error). | `0` |

Clean scans exit `0`. Use `--json` for CI: pipe to `jq` or fail builds when blockers are found.

## The core rule

The boundary between safe and unsafe local-disk use on serverless:

> **Local disk (`/local_disk0`, `/tmp`, `trustedTemp`) is per-task only.** Anything one task writes that another task reads MUST live on `/Volumes` or `/Workspace`.

This rule is taken verbatim from the guidance in the BSI thread (the incident discussion this skill is based on; see "What this skill does NOT cover" below). The reasoning: when the parent task writes to local disk and the child task tries to read it, the child may be running on a different node, so the file either won't exist or the read will fail with `Permission denied`. See [`references/remediation-guide.md`](references/remediation-guide.md) for concrete before/after patterns.

## Pattern catalog (summary)

| ID | Severity | What it detects |
|----|----------|-----------------|
| `FANOUT001` | Blocker | Local-disk path written then passed to `dbutils.notebook.run`, `taskValues.set`, or job-task parameter |
| `FANOUT002` | Blocker | Child notebook reads from `/local_disk0` or `/tmp` via widget, parameter, or `taskValues.get` |
| `FANOUT003` | Warning | DAB job with multiple sibling tasks referencing the same local-disk path |
| `FANOUT004` | Warning | `pipeline_task` immediately downstream of a `notebook_task` that wrote to local temp |
| `FANOUT005` | Info | `dbutils.fs.cp` from local path to local path inside a multi-task job (heuristic) |
| `FANOUT006` | Blocker | Hardcoded path matching the BSI signature `/local_disk0/spark-*/trustedTemp-*/...` |
| `ENV001` | Info | Run output contains `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` — route to escalation |

Full rules, sample matches, and per-pattern fixes are in [`references/pattern-catalog.md`](references/pattern-catalog.md).

## Remediation summary

When the scanner flags a finding, prefer fixes in this order:

1. **UC Volumes** (preferred): `/Volumes/<catalog>/<schema>/<volume>/handoff/<run_id>/...`
- Durable, cross-node, UC-governed, works for any file size
- Requires `WRITE VOLUME` on the volume for the task that writes the handoff (and `READ VOLUME` for every task that reads it), and a parent that creates the volume per run or per job

2. **`/Workspace`** (fallback): `/Workspace/Shared/<job_name>/handoff/...`
- Durable and cross-node, no UC dependency
- Best for smaller files; subject to workspace storage limits

3. **`dbutils.jobs.taskValues`** (small payloads only): no file at all
- For scalars and small JSON (the JSON representation of each value must stay under 48 KiB)
- Replaces the file entirely — preferred when the handoff is just a parameter, config, or summary

4. **Keep `/local_disk0/tmp`** for **intra-task scratch only**. Never for cross-task.

Full before/after code is in [`references/remediation-guide.md`](references/remediation-guide.md).

## What this skill does NOT cover

The original BSI thread combined two distinct failures. This skill addresses only the storage one. The other failure, `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` / "Virtual environment changed while syncing", is a rare, platform-side issue that the Databricks team treats as an engineering escalation. The scanner detects it in `--run-id` mode and emits an `ENV001` info finding routing the user to support, but does not attempt to fix it.

If the scanner emits `ENV001`:

1. Open a Databricks engineering support ticket (use the `/jira-actions` skill or `/support-escalation` if available) with the run ID and error trace
2. As a temporary mitigation, reduce dependency setup during child notebook startup (move heavy `%pip install` to the parent or a job-level environment spec)
3. Add retries on the affected task — the error is usually transient

## Related skills

- [`databricks-serverless-migration`](../databricks-serverless-migration/SKILL.md) — single-notebook classic-to-serverless migration. **Use that skill first** if the workload hasn't been migrated yet.
- [`databricks-dabs`](../databricks-dabs/SKILL.md) — DAB structure and resource definitions. Use when authoring or fixing the `job.yml` flagged by `FANOUT003` or `FANOUT004`.
- [`databricks-jobs`](../databricks-jobs/SKILL.md) — Lakeflow Jobs orchestration. Use when restructuring task dependencies to avoid the fan-out antipattern.
- [`databricks-core`](../databricks-core/SKILL.md) — parent skill for CLI auth and profile selection.

## Reference docs

- [Pattern catalog](references/pattern-catalog.md) — all detection rules with examples
- [Remediation guide](references/remediation-guide.md) — before/after code for Volumes, Workspace, and taskValues handoffs

## External documentation

- [Serverless compute limitations](https://docs.databricks.com/en/compute/serverless/limitations) — official local-disk scoping rules
- [Unity Catalog volumes](https://docs.databricks.com/en/connect/unity-catalog/volumes.html) — the preferred handoff target
- [Workspace files](https://docs.databricks.com/en/files/workspace.html) — the fallback handoff target
- [`dbutils.jobs.taskValues`](https://docs.databricks.com/en/dev-tools/databricks-utils.html#task-values-utility-dbutilsjobstaskvalues) — for non-file handoffs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
interface:
display_name: "Databricks Serverless Storage Check"
short_description: "Detect cross-task local-disk handoffs in serverless jobs"
icon_small: "./assets/databricks.svg"
icon_large: "./assets/databricks.png"
brand_color: "#FF3621"
default_prompt: "Use $databricks-serverless-storage-check to scan a serverless job, notebook, or DAB for cross-task file handoffs through /local_disk0, /tmp, or trustedTemp."
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading