diff --git a/manifest.json b/manifest.json index ba154bf..d3ea0ba 100644 --- a/manifest.json +++ b/manifest.json @@ -21,6 +21,9 @@ "references/appkit/overview.md", "references/appkit/proto-contracts.md", "references/appkit/proto-first.md", + "references/appkit/contract-testing.md", + "references/appkit/scenario-testing.md", + "references/appkit/testing-patterns.md", "references/appkit/sql-queries.md", "references/appkit/trpc.md", "references/other-frameworks.md", diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index 67cabc4..15ccd5c 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -27,6 +27,9 @@ Build apps that deploy to Databricks Apps platform. | Using Model Serving (ML inference) | [Model Serving Guide](references/appkit/model-serving.md) | | Typed data contracts (proto-first design) | [Proto-First Guide](references/appkit/proto-first.md) and [Plugin Contracts](references/appkit/proto-contracts.md) | | Managing files in UC Volumes | [Files Guide](references/appkit/files.md) | +| Testing progression | [Testing Patterns](references/appkit/testing-patterns.md) | +| Contract testing | [Contract Testing](references/appkit/contract-testing.md) | +| Scenario/E2E testing | [Scenario Testing](references/appkit/scenario-testing.md) | | Platform rules (permissions, deployment, limits) | [Platform Guide](references/platform-guide.md) — READ for ALL apps including AppKit | | Non-AppKit app (Streamlit, FastAPI, Flask, Gradio, Next.js, etc.) | [Other Frameworks](references/other-frameworks.md) | diff --git a/skills/databricks-apps/references/appkit/contract-testing.md b/skills/databricks-apps/references/appkit/contract-testing.md new file mode 100644 index 0000000..576936c --- /dev/null +++ b/skills/databricks-apps/references/appkit/contract-testing.md @@ -0,0 +1,198 @@ +# Contract Testing + +PACT-style contract tests for AppKit apps. Consumer defines expectations, provider verifies. 
Use when multiple modules produce or consume the same data shapes. + +**When to use:** Multi-module apps, apps with jobs that produce/consume data, or any app where two or more modules share a data boundary. Skip for single-module prototypes. + +**Not mandatory.** Add contract tests when the app has multiple producers/consumers of the same data, or when a boundary change has caused a runtime bug. + +## What Contract Tests Are + +Contract tests verify that a producer's output matches what the consumer expects. They are NOT integration tests -- they run without network calls, databases, or live services. + +The pattern (PACT-style): +1. **Consumer** defines a contract: "I expect this shape, with these constraints." +2. **Provider** is verified against that contract: "Does my output satisfy every consumer?" +3. If either side changes, the contract test fails at build time -- not in production. + +``` +Consumer (Dashboard) Provider (Eval API) + | | + +-- expects scores 0..1 | + +-- expects run_id as string | + +-- expects status enum | + | | + +---- contract.test.ts -----------+ + | + vitest runs + at build time +``` + +## Contract Boundaries in AppKit Apps + +Each module boundary is a potential contract surface: + +| Boundary | Producer | Consumer | What to test | +|----------|----------|----------|-------------| +| frontend <-> server | tRPC router | React components | Response shapes, error codes, field presence | +| server <-> lakebase | Lakebase migrations/queries | tRPC procedures | Row shapes, column types, NULL handling | +| server <-> files | Files plugin | tRPC procedures | Volume paths, content types, metadata keys | +| job <-> job | Upstream job task | Downstream job task | Task output shapes, status codes, payload encoding | + +## How to Write Contract Tests with Vitest + +Contract tests live alongside unit tests and run with `vitest`. 
+ +### Basic Example + +```ts +import { describe, it, expect } from "vitest"; + +// Simulated provider response -- in practice, import the type +// and construct a minimal valid instance. +const result = { + run_id: "run-abc-123", + appeval100: 0.87, + status: "COMPLETED", + metrics: { accuracy: 0.92, latency_ms: 340 }, +}; + +describe("Dashboard expects Eval API", () => { + it("returns a valid run_id", () => { + expect(typeof result.run_id).toBe("string"); + expect(result.run_id.length).toBeGreaterThan(0); + }); + + it("returns results with scores between 0 and 1", () => { + expect(result.appeval100).toBeGreaterThanOrEqual(0); + expect(result.appeval100).toBeLessThanOrEqual(1); + }); + + it("returns a known status enum value", () => { + expect(["PENDING", "RUNNING", "COMPLETED", "FAILED"]).toContain( + result.status + ); + }); + + it("includes metrics as a record of numbers", () => { + for (const [key, value] of Object.entries(result.metrics)) { + expect(typeof key).toBe("string"); + expect(typeof value).toBe("number"); + } + }); +}); +``` + +### Testing Lakebase Row Shapes + +```ts +import { describe, it, expect } from "vitest"; +import type { RunRecord } from "../proto/gen/app/v1/database"; + +// Minimal valid row -- mirrors what Lakebase would return. 
+const row: RunRecord = { + run_id: "run-001", + app_name: "my-app", + status: "RUN_STATUS_PENDING", + started_at: new Date().toISOString(), + completed_at: "", + error_message: "", + config_json: "{}", +}; + +describe("API module expects RunRecord from Lakebase", () => { + it("has required fields", () => { + expect(row.run_id).toBeTruthy(); + expect(row.app_name).toBeTruthy(); + }); + + it("status is a valid enum string", () => { + expect(row.status).toMatch(/^RUN_STATUS_/); + }); + + it("config_json is valid JSON", () => { + expect(() => JSON.parse(row.config_json)).not.toThrow(); + }); +}); +``` + +### Testing Job Task Outputs + +```ts +import { describe, it, expect } from "vitest"; +import type { JobTaskOutput } from "../proto/gen/app/v1/compute"; + +const output: JobTaskOutput = { + task_id: "task-001", + run_id: "run-001", + success: true, + error: "", + output_payload: new Uint8Array([]), + duration_ms: 1200, + metrics: { rows_processed: "5000" }, +}; + +describe("API module expects JobTaskOutput", () => { + it("has matching run_id and task_id", () => { + expect(output.run_id).toBeTruthy(); + expect(output.task_id).toBeTruthy(); + }); + + it("duration_ms is non-negative", () => { + expect(output.duration_ms).toBeGreaterThanOrEqual(0); + }); + + it("on success, error is empty", () => { + if (output.success) { + expect(output.error).toBe(""); + } + }); +}); +``` + +## Proto-First Contract Derivation + +The recommended workflow ties contract tests directly to proto definitions: + +``` +1. Write the contract test -> "Dashboard expects scores 0..1" +2. Derive the proto message -> message EvalResult { double score = 1; } +3. Generate TypeScript types -> buf generate proto/ +4. Implement provider -> tRPC route returns EvalResult +5. Contract test passes -> Consumer expectation met +``` + +This inverts the usual flow. Instead of writing the proto first and hoping consumers are satisfied, you start from what the consumer needs and work backward to the schema. 
The proto becomes the verified bridge. + +## Running Contract Tests + +Contract tests run with the rest of the vitest suite: + +```bash +# Run all tests including contracts +npx vitest run + +# Run only contract tests (by convention, name files *.contract.test.ts) +npx vitest run --reporter=verbose "contract" +``` + +## File Naming Convention + +``` +tests/ + contracts/ + dashboard-eval-api.contract.test.ts + api-lakebase-runs.contract.test.ts + job-upstream-downstream.contract.test.ts +``` + +Name each file `<consumer>-<provider>.contract.test.ts` so the boundary is obvious at a glance. + +## Common Traps + +| Trap | Why it fails | Fix | +|------|-------------|-----| +| Testing implementation, not shape | Test breaks on refactor even though contract holds | Assert on shape and constraints, not internal logic | +| No contract for job boundaries | Job output changes silently, downstream breaks | Add contract test for every job->job and job->api boundary | +| Duplicating validation logic | Contract and runtime validation diverge | Derive both from the proto; contract test checks the shape, runtime uses generated validators | +| Testing only happy path | Missing fields or null values slip through | Add cases for empty strings, zero values, missing optional fields | diff --git a/skills/databricks-apps/references/appkit/scenario-testing.md b/skills/databricks-apps/references/appkit/scenario-testing.md new file mode 100644 index 0000000..d32c686 --- /dev/null +++ b/skills/databricks-apps/references/appkit/scenario-testing.md @@ -0,0 +1,224 @@ +# Scenario Testing (pw-evals) + +Playwright end-to-end tests parameterized by JSON test cases. Use for apps that need UI-level acceptance testing across multiple input/output scenarios. + +**When to use:** Apps with deterministic user workflows where you can define input/output pairs. Particularly useful for evaluation-driven development where an agent builds the app and automated tests verify correctness. 
+ +**Not mandatory.** Add scenario tests when the app needs UI-level acceptance testing beyond smoke tests. + +## What Scenario Tests Are + +Scenario tests combine Playwright browser automation with JSON-defined test cases. Each case specifies inputs to enter and expected outputs to verify. The same Playwright spec runs once per case, producing a pass/fail for each. + +``` +cases.json + spec.ts = scenario test +(what to test) (how to test) (automated verdict) +``` + +This separates **test data** (cases) from **test logic** (spec), so you can add new scenarios without writing new test code. + +## Directory Structure + +Each task (a testable unit of the app) gets its own directory: + +``` +pw-evals/ + task-name/ + meta.json # Task metadata: appCommand, appUrl, timeout + public/ + cases.json # Dev verification cases (visible to agent) + private/ + cases.json # Evaluation cases (hidden from agent) + tests/ + task-name.spec.ts # Playwright test spec +``` + +### meta.json + +Defines how to start the app and where to find it: + +```json +{ + "appCommand": "npm run dev", + "appUrl": "http://localhost:5173", + "timeout": 30000 +} +``` + +| Field | Purpose | +|-------|---------| +| `appCommand` | Shell command to start the app (run before tests) | +| `appUrl` | URL Playwright navigates to | +| `timeout` | Max milliseconds per test case | + +## How to Write cases.json + +Each case is an object with `inputs` (what to enter/select in the UI) and `expected` (what to verify in the UI after the action completes). 
+ +```json +[ + { + "id": "basic-addition", + "description": "Adds two positive numbers", + "inputs": { + "operand_a": "5", + "operand_b": "3", + "operation": "add" + }, + "expected": { + "result": "8" + } + }, + { + "id": "division-by-zero", + "description": "Shows error on division by zero", + "inputs": { + "operand_a": "10", + "operand_b": "0", + "operation": "divide" + }, + "expected": { + "error": "Cannot divide by zero" + } + } +] +``` + +### Case Design Rules + +- **Each case is independent** -- no case depends on state from a previous case. +- **Inputs map to UI controls** -- field names match ARIA labels or test IDs in the app. +- **Expected values are literal strings** -- the spec checks that the output contains the expected text (`toContainText`), not a regex or fuzzy match. +- **Keep cases minimal** -- test one behavior per case. Combine related assertions in `expected`, not multiple behaviors. + +## How to Write the Playwright Spec + +The spec loads `cases.json`, loops over each case, and uses ARIA selectors to interact with the app. 
+ +```ts +import { test, expect } from "@playwright/test"; +import cases from "../public/cases.json"; + +const CASES_PATH = process.env.TASK_CASES_PATH || "public/cases.json"; + +// Dynamically load cases based on environment variable +const loadCases = async () => { + const path = require("path").resolve(__dirname, "..", CASES_PATH); + return require(path); +}; + +test.describe("Calculator scenarios", () => { + for (const testCase of cases) { + test(`${testCase.id}: ${testCase.description}`, async ({ page }) => { + await page.goto(process.env.APP_URL || "http://localhost:5173"); + + // Fill inputs using ARIA selectors + for (const [field, value] of Object.entries(testCase.inputs)) { + await page.getByRole("textbox", { name: field }).fill(value as string); + } + + // If there's a select/dropdown input + if (testCase.inputs.operation) { + await page + .getByRole("combobox", { name: "operation" }) + .selectOption(testCase.inputs.operation); + } + + // Trigger the action + await page.getByRole("button", { name: /calculate|submit/i }).click(); + + // Verify expected outputs + for (const [field, value] of Object.entries(testCase.expected)) { + await expect( + page.getByRole("status").or(page.getByTestId(field)) + ).toContainText(value as string); + } + }); + } +}); +``` + +### Selector Strategy + +Use ARIA roles and accessible names, not CSS selectors: + +| UI Element | Selector | +|-----------|----------| +| Text input | `page.getByRole("textbox", { name: "field_name" })` | +| Button | `page.getByRole("button", { name: /pattern/i })` | +| Dropdown | `page.getByRole("combobox", { name: "field_name" })` | +| Output text | `page.getByRole("status")` or `page.getByTestId("field")` | +| Heading | `page.getByRole("heading", { name: "text" })` | + +This makes tests resilient to CSS and layout changes. 
+ +## Public vs Private Split + +The split enables evaluation-driven development: + +| Set | Who sees it | Purpose | +|-----|------------|---------| +| `public/cases.json` | Agent + developer | Development and debugging. Agent uses these to verify its implementation works. | +| `private/cases.json` | Evaluator only | Grading. Hidden from the agent during development to prevent overfitting. | + +Both files have the same schema. The difference is visibility: +- During development, the agent runs tests against `public/cases.json`. +- During evaluation, the harness runs tests against `private/cases.json`. +- If the app handles public cases correctly, it should generalize to private cases (assuming cases test the same behaviors). + +## Running Scenario Tests + +```bash +# Run with public (dev) cases +TASK_CASES_PATH=public/cases.json npx playwright test + +# Run with private (eval) cases +TASK_CASES_PATH=private/cases.json npx playwright test + +# Run a specific task +TASK_CASES_PATH=public/cases.json npx playwright test pw-evals/calculator/tests/ + +# Run with visible browser (debugging) +TASK_CASES_PATH=public/cases.json npx playwright test --headed +``` + +### Playwright Configuration + +Add to `playwright.config.ts`: + +```ts +import { defineConfig } from "@playwright/test"; + +export default defineConfig({ + testDir: "./pw-evals", + timeout: 30_000, + use: { + baseURL: process.env.APP_URL || "http://localhost:5173", + }, + webServer: { + command: "npm run dev", + url: "http://localhost:5173", + reuseExistingServer: true, + }, +}); +``` + +## Adding a New Task + +1. Create the directory: `pw-evals/<task-name>/` +2. Write `meta.json` with app startup config. +3. Write `public/cases.json` with 3-5 representative cases. +4. Write the Playwright spec in `tests/<task-name>.spec.ts`. +5. Run against public cases to verify. +6. Optionally add `private/cases.json` with additional edge cases for evaluation. 
+ +## Common Traps + +| Trap | Why it fails | Fix | +|------|-------------|-----| +| Cases depend on execution order | Flaky tests when run in parallel | Make each case fully independent | +| CSS selectors in specs | Tests break on style changes | Use ARIA roles and accessible names | +| Hardcoded URLs | Fails in CI or different environments | Use `process.env.APP_URL` with fallback | +| Too many cases in public set | Agent overfits to specific inputs | Keep public set small (3-5 cases), test general behaviors | +| No timeout per case | Slow cases block the entire suite | Set `timeout` in meta.json and Playwright config | +| Asserting exact layout | Brittle to responsive changes | Assert on text content, not position or size | diff --git a/skills/databricks-apps/references/appkit/testing-patterns.md b/skills/databricks-apps/references/appkit/testing-patterns.md new file mode 100644 index 0000000..833dde4 --- /dev/null +++ b/skills/databricks-apps/references/appkit/testing-patterns.md @@ -0,0 +1,243 @@ +# Testing Patterns + +Testing progression for AppKit apps. Each level builds on the previous one. Only Level 1 (smoke tests) is mandatory -- add higher levels incrementally as the app grows in complexity. + +## The Testing Pyramid + +``` + +-------------------+ + | Level 5: Eval | Automated generation + evaluation at scale + +-------------------+ + / \ + +-----------------------+ + | Level 4: Scenarios | Playwright + JSON cases, verify user workflows + +-----------------------+ + / \ + +---------------------------+ + | Level 3: Contracts | PACT-style, verify cross-boundary data shapes + +---------------------------+ + / \ + +-------------------------------+ + | Level 2: Unit Tests | vitest, test business logic and API routes + +-------------------------------+ + / \ + +-----------------------------------+ + | Level 1: Smoke Tests | Generated by appkit init, run on every build + +-----------------------------------+ +``` + +Each level is optional except Level 1. 
Add levels when the cost of a bug at that layer exceeds the cost of writing the test. + +## Level 1: Smoke Tests (Mandatory) + +**Generated automatically by `appkit init`.** Run on every build and validation. + +Smoke tests answer one question: **does the app start and render without errors?** + +```ts +// tests/smoke.spec.ts (auto-generated, MUST be updated for your app) +import { test, expect } from "@playwright/test"; + +test("app renders without errors", async ({ page }) => { + // Attach the console listener before navigating so load-time errors are captured + const errors: string[] = []; + page.on("console", (msg) => { + if (msg.type() === "error") errors.push(msg.text()); + }); + await page.goto("/"); + // Update these selectors to match YOUR app's actual content + await expect(page.getByRole("heading", { name: /my app/i })).toBeVisible(); + await page.waitForTimeout(2000); + expect(errors).toHaveLength(0); +}); +``` + +**Critical:** The default template checks for a "Minimal Databricks App" heading. You MUST update selectors to match your app's content before running `databricks apps validate`. + +**When to update:** Every time you change the app's initial render (headings, layout, route structure). + +**Run:** +```bash +databricks apps validate --profile <PROFILE> +``` + +## Level 2: Unit Tests + +Test business logic, utility functions, and tRPC API routes in isolation. No browser needed. + +**When to add:** When your app has logic beyond simple data display -- calculations, transformations, validation rules, or tRPC procedures with business logic. 
+ +### What to Test + +| Target | Example | +|--------|---------| +| Utility functions | `formatCurrency(1234.5)` returns `"$1,234.50"` | +| Data transformers | `aggregateMetrics(rows)` groups and sums correctly | +| Validation logic | `validateConfig(input)` rejects missing fields | +| tRPC procedures | `router.getStatus({ runId })` returns expected shape | + +### Example + +```ts +// tests/unit/format.test.ts +import { describe, it, expect } from "vitest"; +import { formatCurrency, formatPercent } from "../../client/src/utils/format"; + +describe("formatCurrency", () => { + it("formats positive numbers with commas and dollar sign", () => { + expect(formatCurrency(1234.5)).toBe("$1,234.50"); + }); + + it("handles zero", () => { + expect(formatCurrency(0)).toBe("$0.00"); + }); + + it("handles negative numbers", () => { + expect(formatCurrency(-42)).toBe("-$42.00"); + }); +}); + +describe("formatPercent", () => { + it("formats decimal as percentage", () => { + expect(formatPercent(0.856)).toBe("85.6%"); + }); +}); +``` + +### Testing tRPC Procedures + +```ts +// tests/unit/api.test.ts +import { describe, it, expect } from "vitest"; +import { appRouter } from "../../server/server"; + +const caller = appRouter.createCaller({}); + +describe("status procedure", () => { + it("returns status for a valid run", async () => { + const result = await caller.getStatus({ runId: "run-001" }); + expect(result).toHaveProperty("status"); + expect(result).toHaveProperty("startedAt"); + }); +}); +``` + +**Run:** +```bash +npx vitest run +npx vitest run --reporter=verbose # detailed output +npx vitest --watch # watch mode during development +``` + +## Level 3: Contract Tests + +PACT-style tests that verify data shapes at module boundaries. Consumer defines expectations, provider is verified against them. 
+ +**When to add:** When your app has multiple modules that produce/consume the same data (e.g., a jobs plugin producing results that the server plugin reads, or a frontend expecting a specific response shape from tRPC). + +**Full guide:** [Contract Testing](contract-testing.md) + +### Quick Example + +```ts +// tests/contracts/dashboard-eval-api.contract.test.ts +import { describe, it, expect } from "vitest"; + +const result = { + run_id: "run-abc-123", + appeval100: 0.87, + status: "COMPLETED", +}; + +describe("Dashboard expects Eval API", () => { + it("returns results with scores between 0 and 1", () => { + expect(result.appeval100).toBeGreaterThanOrEqual(0); + expect(result.appeval100).toBeLessThanOrEqual(1); + }); +}); +``` + +**Key boundaries:** frontend<->server, server<->lakebase, server<->files, job<->job. + +## Level 4: Scenario Tests + +Playwright end-to-end tests parameterized by JSON test cases. Each case defines inputs and expected outputs. The same spec runs once per case. + +**When to add:** When the app has deterministic user workflows that can be expressed as input/output pairs, especially for evaluation-driven development. 
+ +**Full guide:** [Scenario Testing](scenario-testing.md) + +### Quick Example + +```json +// pw-evals/calculator/public/cases.json +[ + { + "id": "basic-addition", + "inputs": { "a": "5", "b": "3" }, + "expected": { "result": "8" } + } +] +``` + +```ts +// pw-evals/calculator/tests/calculator.spec.ts +import { test, expect } from "@playwright/test"; +import cases from "../public/cases.json"; + +for (const c of cases) { + test(c.id, async ({ page }) => { + await page.goto("/"); + await page.getByRole("textbox", { name: "a" }).fill(c.inputs.a); + await page.getByRole("textbox", { name: "b" }).fill(c.inputs.b); + await page.getByRole("button", { name: /calculate/i }).click(); + await expect(page.getByTestId("result")).toContainText(c.expected.result); + }); +} +``` + +**Run:** +```bash +TASK_CASES_PATH=public/cases.json npx playwright test +``` + +## Level 5: Eval Pipeline + +Automated generation and evaluation at scale. An orchestrator generates test inputs, runs them through the app, and scores the outputs. + +**When to add:** When you need to evaluate app quality across hundreds or thousands of cases, typically for apps that involve AI/ML components (e.g., generation quality, retrieval accuracy). + +This level is app-specific and typically involves: +1. **Case generation** -- programmatically create input cases from data catalogs or templates. +2. **Automated execution** -- run scenario tests against generated cases via CI. +3. **Scoring** -- compute metrics (accuracy, latency, error rates) across all cases. +4. **Regression tracking** -- compare scores across builds to detect regressions. + +The eval pipeline builds on Level 4 (scenario tests) by automating the creation and scoring of `cases.json` files at scale. 
+ +## Choosing Your Level + +| App Complexity | Recommended Levels | +|---------------|-------------------| +| Simple dashboard (1 SQL query, 1 chart) | Level 1 only | +| Multi-query analytics app | Levels 1 + 2 | +| App with lakebase + files + jobs | Levels 1 + 2 + 3 | +| App with deterministic user workflows | Levels 1 + 2 + 4 | +| Full multi-module app with eval requirements | Levels 1 + 2 + 3 + 4 + 5 | + +## Adding Levels Incrementally + +Do not try to add all levels at once. Follow this progression: + +1. **Start with Level 1** -- it is already there after `appkit init`. Just update the selectors. +2. **Add Level 2 when you extract logic** -- as soon as you have a utility function or a tRPC procedure with non-trivial logic, add a unit test. +3. **Add Level 3 when you add a second module** -- when the app grows beyond frontend+server to include jobs, files, or lakebase, add contract tests for each new boundary. +4. **Add Level 4 when you need acceptance criteria** -- when the app has user-facing workflows that must work end-to-end, add scenario tests. +5. **Add Level 5 when you need scale** -- when manual case creation is insufficient for your quality bar, automate it. + +Each level catches bugs that lower levels miss: +- Smoke tests catch "app won't start" bugs. +- Unit tests catch logic bugs. +- Contract tests catch interface mismatch bugs. +- Scenario tests catch integration and UX bugs. +- Eval pipelines catch quality regressions at scale.