diff --git a/.fastflowtransform/target/catalog.json b/.fastflowtransform/target/catalog.json index a9f7728..10951b1 100644 --- a/.fastflowtransform/target/catalog.json +++ b/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-28T19:05:15+00:00", + "generated_at": "2025-10-29T17:08:19+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/.fastflowtransform/target/manifest.json b/.fastflowtransform/target/manifest.json index d3fd433..c281cc1 100644 --- a/.fastflowtransform/target/manifest.json +++ b/.fastflowtransform/target/manifest.json @@ -3,7 +3,7 @@ "snake": "text.py" }, "metadata": { - "generated_at": "2025-10-28T19:05:15+00:00", + "generated_at": "2025-10-29T17:08:19+00:00", "tool": "fastflowtransform" }, "nodes": { diff --git a/.fastflowtransform/target/run_results.json b/.fastflowtransform/target/run_results.json index fd4e084..eaf4044 100644 --- a/.fastflowtransform/target/run_results.json +++ b/.fastflowtransform/target/run_results.json @@ -1,19 +1,19 @@ { "metadata": { - "generated_at": "2025-10-28T19:05:15+00:00", + "generated_at": "2025-10-29T17:08:19+00:00", "tool": "fastflowtransform" }, "results": [ { "duration_ms": 0, - "finished_at": "2025-10-28T19:05:15+00:00", + "finished_at": "2025-10-29T17:08:19+00:00", "http": null, "message": "'boom'", "name": "failing", - "started_at": "2025-10-28T19:05:15+00:00", + "started_at": "2025-10-29T17:08:19+00:00", "status": "error" } ], - "run_finished_at": "2025-10-28T19:05:15+00:00", - "run_started_at": "2025-10-28T19:05:15+00:00" + "run_finished_at": "2025-10-29T17:08:19+00:00", + "run_started_at": "2025-10-29T17:08:19+00:00" } diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 103b30b..6388799 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -32,6 +32,9 @@ jobs: - name: Install docs (and dev) dependencies run: | uv sync --extra docs + + - name: Generate API reference pages (real files) + run: uv 
run python scripts/generate_api_docs.py - name: Build site run: uv run mkdocs build --strict diff --git a/.gitignore b/.gitignore index a9b5a99..dfabe9d 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,11 @@ htmlcov/ # Build Artifacts build/ dist/ +site/dag +spark-warehouse +metastore_db +derby.log +.fastflowtransform # Editors / IDEs .vscode/ diff --git a/Combined.md b/Combined.md new file mode 100644 index 0000000..33f3e76 --- /dev/null +++ b/Combined.md @@ -0,0 +1,3226 @@ +# Combined Documentation + + + + + +# FastFlowTransform Documentation Hub + +Welcome! This page is your starting point for FastFlowTransform docs. Pick the track that matches what you want to do and follow the links to the detailed guides. + +--- + +## Docs Navigation +- **Getting Started** — you are here (`docs/index.md`) +- [User Guide](./Technical_Overview.md#part-i-operational-guide) +- [Modeling Reference](./Config_and_Macros.md) +- [Parallelism & Cache](./Cache_and_Parallelism.md) +- [API calls in Python models](./Api_Models.md) +- [Incremental Models](./Incremental.md) +- [YAML Tests (Schema-bound)](./YAML_Tests.md) +- [Data Quality Tests Reference](./Data_Quality_Tests.md) +- [Profiles & Environments](./Profiles.md) +- [Sources Declaration](./Sources.md) +- [Project Configuration](./Project_Config.md) +- [State Selection (changed & results)](./State_Selection.md) +- [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) +- [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs-lineage) +- [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) + +## Table of Contents + +- [Docs Navigation](#docs-navigation) +- [Choose Your Path](#choose-your-path) +- [Reference Map](#reference-map) +- [Need Help?](#need-help) + +--- + +## Choose Your Path + +### 1. Build & Operate Projects (Data Practitioners) + +- **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. 
+- **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, and Databricks Spark. +- **Understand the project layout & CLI workflow:** see *Project Layout*, *Makefile Targets*, and *CLI Flows* in the [Technical Overview](Technical_Overview.md#project-layout). +- **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles-environment-overrides). +- **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fft-utest) covers unit tests, troubleshooting tips, and exit codes. +- **Explore runnable demos:** browse the `examples/` directory in the repo; each subproject comes with its own README. + +### 2. Extend FastFlowTransform (Developers & Contributors) + +- **Dive into architecture & core modules:** start with [Architecture Overview](Technical_Overview.md#architecture-overview) and [Core Modules](Technical_Overview.md#core-modules) for registry, DAG, executors, validation, and more. +- **Add tests & seeds:** see [Sample Models](Technical_Overview.md#sample-models), [Seeds & Example Data](Technical_Overview.md#seeds-example-data), and the unit test guide in [Model Unit Tests](Technical_Overview.md#model-unit-tests-fft-utest). +- **Contribute code:** follow the workflow described in [`./Contributing.md`](./Contributing.md) and consult the module-level docs for internal APIs. +- **Plan ahead:** check the roadmap snapshot in the [Technical Overview](Technical_Overview.md#roadmap-snapshot) to understand upcoming work. 
+ +--- + +## Reference Map + +- **Modeling reference** — Jinja configuration, macros, helper functions: [`Config_and_Macros.md`](Config_and_Macros.md) +- **CLI entry point & commands** — `src/fastflowtransform/cli.py` +- **Registry & node loading** — `src/fastflowtransform/core.py` +- **Unit test runner** — `src/fastflowtransform/utest.py` +- **Rendered DAG templates** — `src/fastflowtransform/docs/templates/` + +--- + +## Need Help? + +- Open an issue or PR — see [`./Contributing.md`](./Contributing.md) for guidelines. +- Join the discussion (planning doc / roadmap highlights) — see the roadmap section in the [Technical Overview](Technical_Overview.md#roadmap-snapshot). +- If you spot gaps in the docs, file an issue with the context and links to the relevant section. + + + + + +# Quickstart + +This guide walks you through creating a minimal FastFlowTransform project from scratch and running it end-to-end. + +## 1. Install & bootstrap + +```bash +python -m venv .venv +. .venv/bin/activate +pip install -e ./fastflowtransform +fft --help +``` + +## 2. Create project layout + +```bash +mkdir -p demo/{models,seeds} +cat <<'YAML' > demo/sources.yml +version: 2 + +sources: + - name: raw + schema: staging + tables: + - name: users + identifier: seed_users +YAML + +cat <<'CSV' > demo/seeds/seed_users.csv +id,email +1,a@example.com +2,b@example.com +CSV + +cat <<'SQL' > demo/models/users.ff.sql +{{ config(materialized='table') }} +select id, email +from {{ source('raw', 'users') }} +SQL +``` + +## 3. Seed static inputs + +```bash +fft seed demo --profile dev +``` + +This materializes the CSV into the configured engine (DuckDB by default) using `seed_users` as the physical table. + +## 4. Run the pipeline + +```bash +fft run demo --cache off +``` + +You should see log lines similar to `✓ L01 [DUCK] users.ff`. The resulting table lives in the target schema (`staging` in this example). + +## 5. 
Inspect artifacts + +- `.fastflowtransform/target/manifest.json` → model graph + sources +- `.fastflowtransform/target/run_results.json` → run outcomes and durations + +## 6. Add more models (optional) + +- Reference other models with `{{ ref('model_name') }}` +- Configure tags or materializations via `{{ config(...) }}` at the top of each SQL file + +## 7. Next steps + +- Add `project.yml` for reusable `vars:` and metadata +- Explore `fft docs` to generate HTML documentation +- Use engine profiles under `profiles.yml` to target Postgres, BigQuery, or Databricks (path-based sources supported via `format` + `location` overrides) + +Refer to `docs/Config_and_Macros.md` for advanced configuration options. + + + + + +# 🧭 FastFlowTransform – Technical Developer Documentation (v0.4) + +> Status: latest updates from your context dump. This document consolidates project structure, architecture, core APIs, error handling, CLI, examples, and roadmap into a print/git-friendly Markdown. +> +> Looking for an overview? Start at the [`docs/index.md`](./index.md) hub, then dive back here when you need details. +> +> Project: **FastFlowTransform** — SQL & Python Data Modeling (Batch + Streaming), DAG, CLI, Auto-Docs, DQ Tests. + +--- + +## Docs Navigation +1. [Getting Started](./index.md) +2. **User Guide** — see [Part I – Operational Guide](#part-i-operational-guide) (this document) +3. [Modeling Reference](./Config_and_Macros.md) +4. 
**Developer Guide** — see [Part II – Architecture & Internals](#part-ii-architecture-internals) (this document) + +--- + +## Table of Contents + +- [Docs Navigation](#docs-navigation) +- [Part I – Operational Guide](#part-i-operational-guide) + - [Project Layout](#project-layout) + - [Sample Models](#sample-models) + - [Seeds & Example Data](#seeds-example-data) + - [Makefile Targets](#makefile-targets) + - [CLI Flows](#cli-flows) + - [Logging & Verbosity](#logging-verbosity) + - [Model Unit Tests (`fft utest`)](#model-unit-tests-fft-utest) + - [Troubleshooting](#troubleshooting) + - [Error Codes](#error-codes) + - [Profiles & Environment Overrides](#profiles-environment-overrides) + - [Parallel Scheduler (v0.3)](#parallel-scheduler-v03) + - [Cache Policy (v0.3)](#cache-policy-v03) + - [Fingerprint Formula (v0.3)](#fingerprint-formula-v03) + - [Meta Table Schema (v0.3)](#meta-table-schema-v03) + - [Jinja DSL Quick Reference](#jinja-dsl-quick-reference) + - [Roadmap Snapshot](#roadmap-snapshot) + - [Cross-Table Reconciliations](#cross-table-reconciliations) + - [Auto-Docs & Lineage](#auto-docs-lineage) +- [Part II – Architecture & Internals](#part-ii-architecture-internals) + - [Architecture Overview](#architecture-overview) + - [Core Modules](#core-modules) + - [`core.py`](#corepy) + - [`dag.py`](#dagpy) + - [`errors.py`](#errorspy) + - [Executors](#executors) + - [`validation.py`](#validationpy) + - [`testing.py`](#testingpy) + - [`docs.py` & Templates](#docspy-templates) + - [`seeding.py`](#seedingpy) + - [CLI Implementation](#cli-implementation) + - [Settings Infrastructure](#settings-infrastructure) + - [Streaming Components](#streaming-components) + - [Mini End-to-End Example (Python API)](#mini-end-to-end-example-python-api) + +--- + +## Part I – Operational Guide + +### Project Layout + +```text +fastflowtransform/ +├── pyproject.toml +├── src/ +│ └── fastflowtransform/ +│ ├── __init__.py +│ ├── cli.py +│ ├── core.py +│ ├── dag.py +│ ├── docs.py +│ ├── 
errors.py +│ ├── settings.py +│ ├── seeding.py +│ ├── testing.py +│ ├── validation.py +│ ├── decorators.py # optional, if not kept in core.py +│ ├── docs/ +│ │ └── templates/ +│ │ ├── index.html.j2 +│ │ └── model.html.j2 +│ ├── executors/ +│ │ ├── __init__.py +│ │ ├── base.py +│ │ ├── duckdb_exec.py +│ │ ├── postgres_exec.py +│ │ ├── bigquery_exec.py # pandas + BigQuery client +│ │ ├── bigquery_bf_exec.py # BigQuery DataFrames (bigframes) +│ │ ├── databricks_spark_exec.py # PySpark (without pandas) +│ │ └── snowflake_snowpark_exec.py# Snowpark (without pandas) +│ └── streaming/ +│ ├── __init__.py +│ ├── file_tail.py +│ └── sessionizer.py +│ +├── examples/ +│ ├── simple_duckdb/ +│ │ ├── models/ +│ │ │ ├── users.ff.sql +│ │ │ ├── users_enriched.ff.py +│ │ │ ├── orders.ff.sql +│ │ │ ├── mart_orders_enriched.ff.py +│ │ │ └── mart_users.ff.sql +│ │ ├── seeds/ +│ │ │ ├── seed_users.csv +│ │ │ └── seed_orders.csv +│ │ ├── sources.yml +│ │ ├── project.yml +│ │ ├── Makefile +│ │ └── .local/demo.duckdb (after make seed/run) +│ └── postgres/ # similar structure if needed +│ +├── tests/ +│ ├── conftest.py +│ ├── duckdb/ … # end-to-end + unit +│ ├── postgres/ … +│ └── streaming/ … +└── README.md +``` + +### Sample Models + +The demo project `examples/simple_duckdb` showcases the typical mix of SQL and Python models plus downstream marts. Use it as a template for your own projects. + +- Batch models live under `models/` (`*.ff.sql`, `*.ff.py`). +- External tables are declared in `sources.yml`; reusable tests in `project.yml`. +- Seeds in `seeds/` keep demos deterministic. + +> ℹ️ **Need full code samples and decorator details?** +> See [Model Fundamentals](./Config_and_Macros.md#1-model-fundamentals) in the Modeling Reference. 
+ +### Seeds & Example Data + +`seeds/seed_users.csv` + +```csv +id,email +1,a@example.com +2,b@gmail.com +3,c@gmail.com +``` + +`seeds/seed_orders.csv` + +```csv +order_id,user_id,amount +100,1,19.9 +101,2,0 +``` + +### Makefile Targets + +```makefile +DB ?= .local/demo.duckdb +PROJECT ?= examples/simple_duckdb + +seed: + fft seed $(PROJECT) --env dev + +run: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev + +dag: + fft dag "$(PROJECT)" --env dev --html + +test: + fft test "$(PROJECT)" --env dev --select batch +``` + +Targets wrap the CLI commands showcased below. Feel free to copy the pattern into your own projects. + +### CLI Flows + +- CLI flags and internals are documented under [CLI Implementation](#cli-implementation). +- Automation examples appear in the [Makefile Targets](#makefile-targets). + + +#### HTTP/API in Python models +See [API calls in Python models](./Api_Models.md) for `get_json`/`get_df`, pagination, cache/offline flags. + + +#### DAG & Documentation + +- Narrow the graph with `fft dag ... --select ` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini site. +- Control schema introspection via `--with-schema/--no-schema`. Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). +- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. + +#### Sync Database Comments + +`fft sync-db-comments --env ` pushes model and column descriptions from project YAML or Markdown into database comments. The command currently supports Postgres and Snowflake Snowpark: + +- Start with `--dry-run` to review the generated `COMMENT` statements. +- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). +- Snowflake reuses the session or connection exposed by the executor. 
+ +If no descriptions are found, the command exits without making changes. + +### Logging & Verbosity + +FastFlowTransform exposes uniform logging controls across all CLI commands plus a dedicated SQL debug channel. + +#### Flags + +- `-q` / `--quiet` → only errors (`ERROR`) +- *(default)* → concise warnings (`WARNING`) +- `-v` / `--verbose` → progress/info (`INFO`) +- `-vv` → full debug (`DEBUG`), including SQL debug output + +`-vv` flips on the SQL debug channel automatically (same as setting `FFT_SQL_DEBUG=1`). + +#### SQL debug channel + +Enable it to inspect Python-model inputs, dependency columns, and helper SQL emitted by data-quality checks: + +```bash +# full debug (recommended) +fft run . -vv + +# equivalent using the env var (legacy behaviour retained) +FFT_SQL_DEBUG=1 fft run . +``` + +#### Usage patterns + +```bash +fft run . -q # quiet (errors only) +fft run . # default (concise) +fft run . -v # verbose progress (model names, executor info) +fft run . -vv # full debug + SQL channel +``` + +#### Parallel logging UX + +- Per node: start/end lines with duration, truncated name, and engine abbrev (DUCK/PG/BQ/…). +- Output is line-stable via a thread-safe log queue; per-level summaries at the end. +- On errors, the familiar “error block” is shown per node. + +**Notes** + +- SQL debug output routes through the `fastflowtransform.sql` logger; use `-vv` or the env var to see it. +- Existing projects do not need changes: the env var continues to work even without `-vv`. + +### Model Unit Tests (`fft utest`) + +`fft utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. + +#### Unit tests & cache + +`fft utest --cache {off|ro|rw}` (default: `off`) + +- `off`: deterministic, never skips. +- `ro`: skip on cache hit; on miss, build but **do not write** cache. 
+- `rw`: skip on hit; on miss, build **and write** fingerprint. + +Notes: +- UTests key the cache with `profile="utest"`. +- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. +- `--reuse-meta` is currently a reserved flag: it is exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. + + +#### Why? + +- Fast feedback on transformation logic without full DAG runs +- Small, reproducible fixtures (rows inline or external CSV) +- Engine-agnostic: swap DuckDB/Postgres to spot dialect differences + +#### Folder layout + +Specs live under `/tests/unit/*.yml` relative to the project root (the directory passed to the CLI that contains `models/`): + +``` +your-project/ +├── models/ +│ ├── users.ff.sql +│ ├── users_enriched.ff.py +│ └── mart_users.ff.sql +└── tests/ + └── unit/ + ├── users_enriched.yml + └── mart_users.yml +``` + +#### YAML DSL (with `defaults`) + +Each file targets one logical node (the DAG name). Defaults are deep-merged into every case so you can share inputs/expectations and override per scenario. + +```yaml +# tests/unit/users_enriched.yml +model: users_enriched + +defaults: + inputs: + users: + rows: + - {id: 1, email: "a@example.com"} + - {id: 2, email: "b@gmail.com"} + expect: + relation: users_enriched + order_by: [id] + +cases: + - name: basic_gmail_flag + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} + + - name: override_inputs + inputs: + users: + rows: + - {id: 3, email: "c@hotmail.com"} + - {id: 4, email: "d@gmail.com"} + expect: + rows: + - {id: 3, email: "c@hotmail.com", is_gmail: false} + - {id: 4, email: "d@gmail.com", is_gmail: true} +``` + +SQL models use the file stem (including `.ff`) as `model`. 
Provide expected relation names that match the materialized table/view: + +```yaml +# tests/unit/mart_users.yml +model: mart_users.ff + +defaults: + inputs: + users_enriched: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} + expect: + relation: mart_users + order_by: [id] + +cases: + - name: passthrough_columns + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} +``` + +For multi-dependency models, include every physical relation name (what `relation_for(dep)` returns): + +```yaml +model: mart_orders_enriched +defaults: + inputs: + users_enriched: + rows: + - {id: 1, email: "x@gmail.com", is_gmail: true} + orders: + rows: + - {order_id: 10, user_id: 1, amount: 19.9} + - {order_id: 11, user_id: 1, amount: -1.0} +cases: + - name: join_and_flag + expect: + any_order: true + rows: + - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} + - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} +``` + +#### Input formats + +- `rows`: inline dictionaries per row +- `csv`: reference a CSV file (relative paths allowed) + +Keys under `inputs` are physical relations; use `relation_for('users.ff')` if unsure. + +#### Expected output & comparison + +- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`) +- Ordering: `order_by: [...]` or `any_order: true` +- Columns: `ignore_columns: [...]`, `subset: true` +- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` + (numbers can be plain `1e-9` or quoted; they are cast to float) + +#### Running utests + +```bash +fft utest . # discover all specs +fft utest . --env dev # use a specific profile +fft utest . --model users_enriched +fft utest . --model mart_orders_enriched --case join_and_flag +fft utest . 
--path tests/unit/users_enriched.yml +``` + +Override the executor for all specs (ensure credentials/DSNs are set): + +```bash +export FF_PG_DSN="postgresql+psycopg://postgres:postgres@localhost:5432/ffdb" +export FF_PG_SCHEMA="public" +fft utest . --engine postgres +``` + +Executor precedence (highest → lowest): CLI `--engine`, YAML `engine:` (optional), `profiles.yml`, environment overrides. + +#### Design notes + +- Only the target model runs; supply all upstream relations the model expects. +- `defaults` deep-merge: dicts merge, lists/scalars overwrite. +- Results compare as DataFrames with configurable order, subset, ignored columns, and numeric tolerances. +- Exit codes: `0` for success, `2` when at least one case fails (compact CSV-style diff is printed). + +**CI example (GitHub Actions)** + +```yaml +name: utests +on: [push, pull_request] +jobs: + duckdb: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: "3.11" } + - run: pip install -e . + - run: fft utest . --env dev +``` + +(For Postgres, add a service container and run `fft utest . --engine postgres` with `FF_PG_DSN` / `FF_PG_SCHEMA`.) + +### Troubleshooting + +- **DuckDB seeds not visible** → ensure `FF_DUCKDB_PATH` (or profile path) is identical for `seed`, `run`, `dag`, and `test`. +- **Postgres connection refused** → confirm `FF_PG_DSN`, container status (`docker ps`), and that port `5432` is open. +- **BigQuery permissions** → set `GOOGLE_APPLICATION_CREDENTIALS` and match dataset/location to your profile. +- **HTML docs missing** → run `fft dag --html` and open `/docs/index.html`. +- **Unexpected test failures** → inspect rendered SQL in CLI output, refine selection via `--select`, refresh seeds if needed. +- **Dependency table not found** in utests → provide all physical upstream relations in the YAML spec. 
+ +### Error Codes + +| Type | Class/Source | Exit | Notes | +|---------------------------|---------------------------|------|---------------------------------------------------------| +| Missing dependency | `DependencyNotFoundError` | 1 | Per-node list; tips for `ref()` / names | +| Cycle in DAG | `ModelCycleError` | 1 | "Cycle detected among nodes: ..." | +| Model execution (KeyError)| `cli.py` → formatted block| 1 | Inspect columns, use `relation_for(dep)` as keys | +| Data quality failures | `cli test` → summary | 2 | "Totals ... passed/failed"; each failure on its own line | +| Unknown/unexpected | generic | 99 | Optional trace via `FFT_TRACE=1` | + +Error types map to the classes documented in [Core Modules](#core-modules) and [CLI Implementation](#cli-implementation). + +### Profiles & Environment Overrides + +**`profiles.yml` example:** + +```yaml +default: + engine: duckdb + duckdb: { path: ":memory:" } + +stg: + engine: postgres + postgres: + dsn: postgresql+psycopg://postgres:postgres@localhost:5432/ffdb + db_schema: public + +bq: + engine: bigquery + bigquery: + project: my-gcp-proj + dataset: demo + location: EU + use_bigframes: false +``` + +**ENV overrides (examples):** + +`FF_ENGINE`, `FF_DUCKDB_PATH`, `FF_PG_DSN`, `FF_PG_SCHEMA`, `FF_BQ_DATASET`, `FF_BQ_LOCATION`, `FF_BQ_USE_BIGFRAMES=1` + +**Priority (lowest → highest):** `profiles.yml` < environment variables (`FF_*`) < CLI flags (e.g. `--engine`). + +For the Pydantic models and resolution flow, see [Settings Infrastructure](#settings-infrastructure). + +### Parallel Scheduler (v0.3) + +FastFlowTransform executes the DAG in **levels**. Each level contains nodes without mutual dependencies. + +- `--jobs N` limits the **maximum concurrency per level**. +- `--keep-going` keeps tasks within the current level running even if one fails; subsequent levels are not started. + +**CLI** +```bash +fft run . --env dev --jobs 4 # parallel (level-wise) +fft run . --env dev --jobs 4 --keep-going + +fft run . 
--select model_b --jobs 4 # Run only model_b and whatever it depends on +fft run . --rebuild-only model_b # Rebuild only model_b, even if cache hits +``` + +**Internals** +- `dag.levels(nodes)` builds level lists using indegrees. +- `run_executor.schedule(levels, jobs, fail_policy)` spawns a thread pool per level and aggregates timings. + +### Cache Policy (v0.3) + +**Modes** +``` +off – always build +rw – default; skip if fingerprint matches and relation exists; write cache after build +ro – skip on match; on miss build but do not write cache +wo – always build and write cache +``` +`--rebuild ` ignores cache for matching nodes. + +**Skip condition** +1) Fingerprint matches the stored value (file-backed cache) +2) Physical relation exists on the target engine + +**Examples** +```bash +fft run . --env dev --cache=rw +fft run . --env dev --cache=ro +fft run . --env dev --cache=rw --rebuild marts_daily.ff +``` + +### Fingerprint Formula (v0.3) + +**SQL nodes**: +`fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` + +**Python nodes**: +`fingerprint_py(node, func_src, env_ctx, dep_fps)` + +**`env_ctx` content** +- `engine` (e.g. duckdb, postgres, bigquery) +- `profile_name` (CLI `--env`) +- selected environment keys/values: all `FF_*` +- normalized excerpt of `sources.yml` (sorted dump) + +**Properties** +- Same inputs ⇒ same hash. +- Minimal change in SQL/function ⇒ different hash. +- Any dependency fingerprint change bubbles downstream via `dep_fps`. + +### Meta Table Schema (v0.3) + +FastFlowTransform writes a per-node audit row after successful builds: + +``` +_ff_meta ( + node_name TEXT / STRING -- logical name, e.g. "users.ff" + relation TEXT / STRING -- physical name, e.g. "users" + fingerprint TEXT / STRING + engine TEXT / STRING + built_at TIMESTAMP +) +``` + +**Backends** +- DuckDB: table `_ff_meta` in `main`. +- Postgres: table `_ff_meta` in the active schema. +- BigQuery: table `._ff_meta`. 
+ +**Notes** +- Meta is currently used for auditing and tooling; skip logic relies on fingerprint cache + relation existence checks. + +#### Executor meta hook + +After a successful materialization the executor calls: + on_node_built(node, relation, fingerprint) + +This performs an upsert into `_ff_meta` with `(node_name, relation, fingerprint, built_at, engine)`. + +Skipped nodes do **not** touch the meta table. + + +### Jinja DSL Quick Reference + +`ref()`, `source()`, `var()`, `config()`, `this` – see details in the [Modeling Reference](./Config_and_Macros.md). + +### Roadmap Snapshot + +| Version | Content | +|---------|---------------------------------------------------| +| 0.2 | `config(materialized=...)`, Jinja macros, variables | +| 0.3 | Parallel execution, cache | +| 0.4 | Incremental models | +| 0.5 | Streaming connectors (Kafka, S3) | +| 1.0 | Stable API, plugin SDK | + +> See also: feature pyramid & roadmap phases (OSS/SaaS) in the separate document. + +--- + +### Cross-Table Reconciliations + +FastFlowTransform can compare aggregates and key coverage **across two tables** and surface drift with clear, numeric messages. These checks run via the standard `fft test` entrypoint and integrate into the DQ summary output. + +**CLI** +```bash +# only run reconciliation checks +fft test . --env dev --select reconcile +``` + +**YAML DSL** + +All checks live under `project.yml → tests:` and should carry the tag `reconcile` for easy selection. 
+ +1) **Equality / Approx Equality** +```yaml +- type: reconcile_equal + name: orders_total_equals_mart + tags: [reconcile] + left: { table: orders, expr: "sum(amount)" } + right: { table: mart_orders_enriched, expr: "sum(amount)", where: "valid_amt" } + # optional tolerances: + abs_tolerance: 0.01 # |L - R| <= 0.01 + rel_tolerance_pct: 0.1 # |L - R| / max(|R|, eps) <= 0.1% (0.1) +``` + +2) **Ratio within bounds** +```yaml +- type: reconcile_ratio_within + name: orders_vs_mart_ratio + tags: [reconcile] + left: { table: orders, expr: "sum(amount)" } + right: { table: mart_orders_enriched, expr: "sum(amount)" } + min_ratio: 0.999 + max_ratio: 1.001 +``` + +3) **Absolute difference within limit** +```yaml +- type: reconcile_diff_within + name: count_stability + tags: [reconcile] + left: { table: events_raw, expr: "count(*)", where: "event_type='purchase'" } + right: { table: fct_sales, expr: "sum(txn_count)" } + max_abs_diff: 10 +``` + +4) **Coverage (anti-join = 0)** +```yaml +- type: reconcile_coverage + name: all_orders_covered + tags: [reconcile] + source: { table: orders, key: "order_id" } + target: { table: mart_orders_enriched, key: "order_id" } + # optional filters + source_where: "order_date >= current_date - interval '7 days'" + target_where: "valid_amt" +``` + +**Parameter semantics** +- `expr`: SQL snippet placed into `SELECT {expr} FROM {table}` (keep it engine-neutral: `sum(...)`, `count(*)`, simple filters). +- `where`: optional SQL appended as `WHERE {where}`. +- `abs_tolerance`: absolute tolerance on the difference. +- `rel_tolerance_pct`: relative tolerance in **percent**; denominator is `max(|right|, 1e-12)`. +- `min_ratio` / `max_ratio`: inclusive bounds for `left/right`. +- Coverage uses an anti-join (`source` minus `target` on the given key). The check passes if missing = 0. 
+ +**Summary output** +Each reconciliation contributes a line in the summary with a compact scope, e.g.: +``` +✅ reconcile_equal orders ⇔ mart_orders_enriched (4ms) +✅ reconcile_coverage orders ⇒ mart_orders_enriched (3ms) +``` + +**Engine notes** +- DuckDB and Postgres are supported out-of-the-box. BigQuery works with simple aggregates/filters (expressions should avoid dialect-specific functions). +- For relative tolerances, the implementation guards against zero denominators with a small epsilon (`1e-12`). + + +### Auto-Docs & Lineage + +FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) from your project: + +```bash +# Classic +fft dag . --env dev --html + +# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) +fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json +``` + +Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. + +**Descriptions** can be provided in YAML (project.yml) and/or Markdown files. Markdown has higher priority. + +YAML in `project.yml`: + +```yaml +docs: + models: + users.ff: + description: "Raw users table imported from CRM." + columns: + id: "Primary key." + email: "User email address." + users_enriched: + description: "Adds gmail flag." + columns: + is_gmail: "True if email ends with @gmail.com" +``` + +Markdown (overrides YAML if present): + +``` +/docs/models/.md +/docs/columns//.md +``` + +Optional front matter is ignored for now (title/tags may be used later). + +**Column lineage (heuristic, best effort).** + +- SQL models: expressions like `col` / `alias AS out` / `upper(u.email) AS email_upper` are parsed; + `u` must come from a `FROM ... AS u` that resolves to a relation. Functions mark lineage as *transformed*. +- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. 
+- You can override hints in YAML: + +```yaml +docs: + models: + mart_orders_enriched: + lineage: + email_upper: + from: [{ table: users, column: email }] + transformed: true +``` + +**JSON manifest** (optional via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), +and lineage per column. This is useful for custom doc portals or CI checks. + +Notes: +- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. +- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. + + + +## Part II – Architecture & Internals + +### Architecture Overview + +``` +CLI (Typer) +│ +├── Registry (core.py) +│ ├── Discover models (*.ff.sql / *.ff.py) +│ ├── Load Python models (decorator) +│ ├── Parse/validate dependencies +│ └── Jinja environment + sources.yml +│ +├── DAG (dag.py) +│ ├── topo_sort (Kahn, deterministic) +│ └── mermaid() (styled + stable IDs) +│ +├── Executors (executors/*) +│ ├── BaseExecutor (SQL rendering, dependency loading, materialization, requires guard) +│ ├── DuckExecutor (DuckDB) +│ ├── PostgresExecutor (SQLAlchemy, shims) +│ ├── BigQueryExecutor (pandas) +│ ├── BigQueryBFExecutor (BigQuery DataFrames / bigframes) +│ ├── DatabricksSparkExecutor (PySpark, without pandas) +│ └── SnowflakeSnowparkExecutor (Snowpark, without pandas) +│ +├── Testing (testing.py) +│ ├── generic _exec / _scalar +│ └── Checks: not_null, unique, row_count_between, greater_equal, non_negative_sum, freshness +│ +├── Seeding (seeding.py) +│ └── Load seeds (CSV/Parquet/SQL) → engine agnostic +│ +├── Docs (docs.py + templates/) +│ ├── Mermaid + overview table (index.html) +│ └── Model detail pages (model.html) +│ +├── Settings/Profiles (settings.py) +│ └── Pydantic v2 discriminated union + ENV overrides +│ +└── Streaming (streaming/*) + ├── FileTailSource + └── StreamSessionizer +``` + +--- + +### Core Modules + +#### `core.py` + +Key data structures and the 
project loading process. + +```python +@dataclass +class Node: + name: str # logical name (stem or @model(name=...)) + kind: str # "sql" | "python" + path: Path + deps: List[str] = field(default_factory=list) + +class Registry: + def load_project(self, project_dir: Path) -> None: ... + def _register_node(self, node: Node) -> None: ... + def _load_py_module(self, path: Path) -> types.ModuleType: ... + def _scan_sql_deps(self, path: Path) -> List[str]: ... +``` + +**Helpers & decorator:** + +```python +def relation_for(node_name: str) -> str: ... +def ref(name: str) -> str: ... +def source(source_name: str, table_name: str) -> str: ... + +def model(name=None, deps=None, requires=None) -> Callable[[Callable[..., Any]], Callable[..., Any]]: ... +``` + +**Python models (example):** + +```python +@model(name="users_enriched", deps=["users.ff"], requires={"users": {"id","email"}}) +def enrich(df: pd.DataFrame) -> pd.DataFrame: ... +``` + +--- + +#### `dag.py` + +Deterministic topological sort plus Mermaid export. + +```python +def topo_sort(nodes: Dict[str, Node]) -> List[str]: ... +def mermaid(nodes: Dict[str, Node]) -> str: ... +``` + +--- + +#### `errors.py` + +Primary error types with helpful messages. + +```python +class FastFlowTransformError(Exception): ... +class ModuleLoadError(FastFlowTransformError): ... +class DependencyNotFoundError(FastFlowTransformError): ... +class ModelCycleError(FastFlowTransformError): ... +class TestFailureError(FastFlowTransformError): ... +``` + +--- + +#### Executors + +Shared logic (`BaseExecutor`) plus engine implementations. + +```python +class BaseExecutor(ABC): + def render_sql(self, node: Node, env: Environment, ref_resolver=None, source_resolver=None) -> str: ... + def run_python(self, node: Node) -> None: ... + @abstractmethod + def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: ... 
+ @abstractmethod + def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: ... +``` + +**DuckDB (`duckdb_exec.py`)** + +- `run_sql(node, env)` renders Jinja (`ref/source`) and executes the SQL. +- `_read_relation` loads a table as `DataFrame`; surfaces actionable errors when a dependency is missing. +- `_materialize_relation` writes the `DataFrame` as a table (`create or replace table ...`). + +**Postgres (`postgres_exec.py`)** + +- `_SAConnShim` (compatible with `testing._exec`). +- `run_sql` renders SQL and rewrites `CREATE OR REPLACE TABLE` to `DROP + CREATE AS`. +- `_read_relation` uses pandas, handles schemas, and provides clear guidance. +- `_materialize_relation` writes via `to_sql(if_exists="replace")`. + +**BigQuery / BigQuery DataFrames / Spark / Snowpark** + +- Identical signatures; IO uses the respective native dataframes (no pandas for Spark/Snowpark). + +--- + +#### `validation.py` + +Required-column checks for Python models (single and multi dependency). + +```python +class RequiredColumnsError(ValueError): ... +def validate_required_columns(node_name: str, inputs: Any, requires: dict[str, set[str]]): ... +``` + +--- + +#### `testing.py` + +Minimal data quality framework (engine agnostic via `_exec`). + +**Checks:** `not_null`, `unique`, `greater_equal`, `non_negative_sum`, `row_count_between`, `freshness` + +```python +class TestFailure(Exception): ... +def _exec(con: Any, sql: Any): ... +def _scalar(con: Any, sql: Any): ... +``` + +--- + +#### `docs.py` & Templates + +- `render_site(out_dir, nodes)` produces `index.html` plus `model.html` per model. +- Templates (`docs/templates/`) include dark mode, filters, copy buttons, legend. +- Uses `dag.mermaid(nodes)` for the graph. + +--- + +#### `seeding.py` + +Engine-agnostic seed loading (CSV/Parquet/SQL). + +```python +def seed_project(project_dir: Path, executor, schema: Optional[str] = None) -> int: ... 
+``` + +--- + +### CLI Implementation + +Operational usage lives in [CLI Flows](#cli-flows). This section drills into the Typer command definitions in `cli.py`. + +**Commands:** + +- `fft run [--env dev] [--engine ...]` +- `fft dag [--env dev] [--html] [--select ...] [--with-schema/--no-schema]` +- `fft docgen [--env dev] [--out dir] [--emit-json path] [--open-source]` +- `fft test [--env dev] [--select batch|streaming|tag:...]` +- `fft seed [--env dev]` +- `fft sync-db-comments [--env dev] [--dry-run]` +- `fft utest [--env dev] [--cache off|ro|rw] [--reuse-meta]` +- `fft --version` + +**Key components:** + +```python +def _load_project_and_env(project_arg) -> tuple[Path, Environment]: ... +def _resolve_profile(env_name, engine, proj) -> tuple[EnvSettings, Profile]: ... +def _get_test_con(executor: Any) -> Any: ... +``` + +**Test summary (exit 2 on failures):** + +``` +Data Quality Summary +──────────────────── +✅ not_null users.email (3ms) +❌ unique users.id (2ms) + ↳ users.id has 1 duplicate + +Totals +────── +✓ passed: 1 +✗ failed: 1 +``` + +--- + +### Settings Infrastructure + +`settings.py` uses a **Pydantic v2 discriminated union** (`engine` as discriminator) plus ENV overrides. + +Profile types: +- `DuckDBProfile(engine="duckdb", duckdb: {path})` +- `PostgresProfile(engine="postgres", postgres: {dsn, db_schema})` +- `BigQueryProfile(engine="bigquery", bigquery: {project?, dataset, location?, use_bigframes?})` +- `DatabricksSparkProfile(engine="databricks_spark", ...)` +- `SnowflakeSnowparkProfile(engine="snowflake_snowpark", ...)` + +Resolver idea: + +```python +def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profile: ... +``` + +--- + +### Streaming Components + +**`streaming/sessionizer.py`** + +- Normalizes events (JSONL / batch DF) and writes `fct_sessions_streaming`. +- `process_batch(df)` aggregates sessions (start/end, pageviews, revenue). 
+ +**Smoke test (DuckDB):** + +```python +def test_stream_sessionizer_produces_sessions(): ... +``` + +--- + +### Mini End-to-End Example (Python API) + +```python +from pathlib import Path +from jinja2 import Environment, FileSystemLoader +from fastflowtransform.core import REGISTRY +from fastflowtransform.dag import topo_sort +from fastflowtransform.executors.duckdb_exec import DuckExecutor + +proj = Path("examples/simple_duckdb").resolve() +REGISTRY.load_project(proj) +env = REGISTRY.env # Jinja env from the registry load + +order = topo_sort(REGISTRY.nodes) +ex = DuckExecutor(db_path=str(proj / ".local" / "demo.duckdb")) + +for name in order: + node = REGISTRY.nodes[name] + if node.kind == "sql": + ex.run_sql(node, env) + else: + ex.run_python(node) + +print("✓ Done") +``` + +--- + +Need a different angle? Head back to the [Docs Hub](./index.md) or deep-dive into the [Modeling Reference](./Config_and_Macros.md). + + + + + +# API Calls in Python Models + +> **Status:** Experimental but stable for demos and smaller workflows. +> **Goal:** Query HTTP APIs from Python models, return responses as DataFrames, cache and instrument them cleanly, and support reproducible offline runs. + +* [Motivation](#motivation) +* [Quickstart](#quickstart) +* [Programming API](#programming-api) + * [`get_json`](#get_json) + * [`get_df`](#get_df) + * [Pagination](#pagination) + * [Context & Telemetry](#context-telemetry) +* [CLI Flags & Environment Variables](#cli-flags-environment-variables) +* [Example Model](#example-model) +* [Artifacts](#artifacts) +* [Tests & Offline Demos](#tests-offline-demos) +* [Best Practices](#best-practices) +* [Troubleshooting](#troubleshooting) +* [Security & Compliance](#security-compliance) +* [FAQ](#faq) + +--- + +## Motivation + +Many pipelines need small, reliable API fetchers: configuration tables, miniature dimensions, feature flags, SaaS exports. 
This feature provides: + +- Simple HTTP calls inside Python models +- File-backed cache (reproducible builds, works offline) +- Per-node telemetry (requests, hits, bytes, hashes) +- CLI switches `--offline` and `--http-cache` for reproducible runs + +--- + +## Quickstart + +1. **Optionally enable flags** (recommended): + + ```bash + # No network - cache hits only + fft run . --env dev --offline + # Cache mode + fft run . --env dev --http-cache rw # rw|ro|off + ``` + +2. **Write a Python model**: + + ```python + # models/users_from_api.ff.py + import pandas as pd + from fastflowtransform.core import model + from fastflowtransform.api.http import get_df + + @model(name="users_from_api", deps=["users.ff"]) + def fetch(_: pd.DataFrame) -> pd.DataFrame: + df = get_df( + url="https://api.example.com/users", + params={"page": 1}, + record_path=["data"], # JSON -> list -> DataFrame + ) + return df + ``` + +3. **Run it**: + + ```bash + fft run . --env dev --select users_from_api + ``` + +--- + +## Programming API + +> Module: `fastflowtransform.api.http` + +### `get_json` + +```python +from fastflowtransform.api.http import get_json + +data = get_json( + url="https://api.example.com/objects", + params={"page": 1}, # optional + headers={"Authorization": "Bearer ..."}, # optional + timeout=20, # optional (seconds) +) +# -> Python dict / list +``` + +**Behavior** + +- Reads from the local cache (when present and valid). +- Writes to the cache (`rw` mode), including the response body. +- Respects offline mode (no network traffic). 
+ +### `get_df` + +```python +from fastflowtransform.api.http import get_df + +df = get_df( + url="https://api.example.com/users", + params={"page": 1}, + record_path=["data"], # path to the JSON list + normalize=True, # optional: flatten nested objects + paginator=None, # optional: pagination strategy (see below) + output="pandas", # pandas|spark (default=pandas) +) +# -> pandas.DataFrame +``` + +**Conversion** + +- Default: `record_path` points to the array payload (for example `["data"]`). +- `normalize=True` delegates to `json_normalize` for deeper structures. +- `output='spark'` (plus an optional `session=SparkSession`) converts the normalized result into a `pyspark.sql.DataFrame`. Additional backends will reuse the same parameter. + +### Pagination + +For paged APIs you can describe the next request declaratively: + +```python +def paginator(url: str, params: dict | None, json_obj: dict): + next_url = json_obj.get("next") # e.g. absolute URL + if next_url: + return {"next_request": {"url": next_url}} + return None + +df = get_df( + "https://api.example.com/users?page=1", + paginator=paginator, + record_path=["data"], +) +``` + +The paginator may return the following fields: + +- `{"next_request": {"url": "...", "params": {...}, "headers": {...}}}` + (any missing field keeps its previous value) + +### Context & Telemetry + +During a model run the executor collects telemetry per node and writes it into `run_results.json`: + +- `requests` (count) +- `cache_hits` +- `bytes` (sum of response bodies) +- `used_offline` (bool) +- `keys` (cache keys) +- `entries` (optional compact array with URL, status, content hash) + +You will find these metrics under the `http` block of each node (see [Artifacts](#artifacts)). + +--- + +## CLI Flags & Environment Variables + +**CLI** + +- `--offline` + Sets `FF_HTTP_OFFLINE=1`; network requests are blocked, **cache hits only**. +- `--http-cache {off|ro|rw}` + Sets `FF_HTTP_CACHE_MODE`: + + - `off`: neither read nor write. 
+ - `ro`: read-only (hits), **no** writes. + - `rw`: read and write (default). + +**Environment (optional to set directly)** + +| Variable | Default | Effect | +| ------------------------ | ------------------------------- | ----------------------------------- | +| `FF_HTTP_OFFLINE` | `0` | `1/true/on` -> offline mode | +| `FF_HTTP_CACHE_MODE` | `rw` | `off` / `ro` / `rw` | +| `FF_HTTP_CACHE_DIR` | `.fastflowtransform/http_cache` | Cache directory | +| `FF_HTTP_TTL` | `0` | Seconds; 0 = never expires | +| `FF_HTTP_TIMEOUT` | `20` | Request timeout (seconds) | +| `FF_HTTP_MAX_RETRIES` | `3` | Basic retry count | +| `FF_HTTP_RATE_LIMIT_RPS` | `0` | Requests per second (0 = unlimited) | + +--- + +## Example Model + +```python +# models/dim_countries_from_api.ff.py +import pandas as pd +from fastflowtransform.core import model +from fastflowtransform.api.http import get_df + +@model(name="dim_countries_from_api", deps=["users.ff"]) +def countries(_: pd.DataFrame) -> pd.DataFrame: + def pager(u, p, js): + nxt = js.get("paging", {}).get("next") + return {"next_request": {"url": nxt}} if nxt else None + + df = get_df( + url="https://api.example.com/countries?page=1", + paginator=pager, + record_path=["data"], + normalize=True, + ) + # lightweight post-processing + if "code" in df.columns: + df["code"] = df["code"].str.upper() + return df +``` + +Run: + +```bash +fft run . 
--env dev --select dim_countries_from_api --http-cache ro +``` + +--- + +## Artifacts + +`/.fastflowtransform/target/run_results.json` (excerpt): + +```json +{ + "results": [ + { + "name": "dim_countries_from_api", + "status": "success", + "duration_ms": 153, + "http": { + "requests": 2, + "cache_hits": 2, + "bytes": 1842, + "used_offline": true, + "keys": ["GET:https://api.example.com/countries?page=1|{}|{}", "..."], + "entries": [ + {"url": "https://api.example.com/countries?page=1", "status": 200, "content_hash": "sha256:..."}, + {"url": "https://api.example.com/countries?page=2", "status": 200, "content_hash": "sha256:..."} + ] + } + } + ] +} +``` + +> Note: When a node is **skipped** (fingerprint cache hit), no new `http` block is emitted - the model did not run. + +--- + +## Tests & Offline Demos + +- Place unit tests under `tests/api/...` and seed the cache directly (no real HTTP calls). +- Suggested scenarios: + + - **Offline hit:** set `FF_HTTP_OFFLINE=1`, seed the cache, `get_json/get_df` must succeed. + - **Cache mode `off`:** even with cache entries, **no** reads; expect a failure in offline mode. + - **`ro`:** allow read hits; **no** cache writes after a real or mocked request. + - **Pagination:** stitch several pages from offline fixtures; telemetry should count requests/hits. + +--- + +## Best Practices + +- **Stable URLs and parameter order** produce identical cache keys and reproducible builds. +- **Keep `record_path` shallow**; use `normalize=True` only when necessary (performance). +- **Never cache secrets:** provide tokens via headers; the response body and metadata are cached. +- **Use `--offline` in CI** for deterministic tests with a pre-seeded cache. +- **Set TTL intentionally** when APIs change frequently. +- **Scope engine-specific variants** with `engine_model(only=...)` so each execution backend registers only the models it can run (pair with SQL `config(engines=[...])` when duplicating logical names). 
+ +--- + +## Troubleshooting + +- **“offline + cache miss”** + Seed the cache (see tests) or disable offline mode. +- **“Schema mismatch”** + Harmonize columns after `get_df` (types, missing keys). +- **“Too many requests”** + Configure `FF_HTTP_RATE_LIMIT_RPS`; make pagination more efficient (larger `page_size`). +- **“No http block”** + Was the node **skipped** (fingerprint cache)? Or did the model avoid HTTP calls altogether? + +--- + +## Security & Compliance + +- **Do not commit secrets** - use environment variables or a secret manager. +- **PII/GDPR:** verify whether the API returns personal data; minimise retention. +- **Cache directory:** keep it in `.gitignore`; encrypt or isolate it if necessary. + +--- + +## FAQ + +**Q:** Can I call other libraries (for example `requests`, `httpx`) directly? +**A:** Yes, but you lose telemetry and caching. The recommended entrypoint is `fastflowtransform.api.http`. + +**Q:** How do I add custom headers (for example OAuth)? +**A:** Pass `headers={...}`. Store sensitive values in env vars and inject them into your models. + +**Q:** Does this work for POST requests? +**A:** Release R1 focuses on GET. Please open an issue for POST/PUT support; the design can be extended. + +--- + +**See also:** + +- Technical guide: *Developer Guide – Architecture & Internals* +- Unit tests: `tests/api/test_http_*.py` +- Runtime & cache: *Parallelism & Cache (v0.3)* + + + + + +# FastFlowTransform Modeling Reference (v0.1) + +> Authoritative reference for FastFlowTransform’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. +> Works with FastFlowTransform v0.1 (T1–T11). Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. +> **Execution & Cache (v0.3) quick notes** +> - Parallelism is level-wise; use `fft run --jobs N`. +> - Use `--cache={off|ro|rw|wo}` to control skipping behavior. 
+> - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. +> - Change any of these → downstream nodes rebuild. +> - `--rebuild <glob>` forces rebuilding selected models (ignores cache). + + +For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [Technical Overview](./Technical_Overview.md). This document focuses purely on how you author and test models. + +--- + +## Docs Navigation +1. [Getting Started](./index.md) +2. [User Guide](./Technical_Overview.md#part-i-operational-guide) +3. **Modeling Reference** — you are here (`Config_and_Macros.md`) +4. [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) + +--- + +## Table of Contents + +- [Docs Navigation](#docs-navigation) +- [1. Model Fundamentals](#1-model-fundamentals) + - [1.1 SQL models (`*.ff.sql`)](#11-sql-models-ffsql) + - [1.2 Python models (`*.ff.py`)](#12-python-models-ffpy) + - [1.3 Seeds, sources, and dependencies](#13-seeds-sources-and-dependencies) +- [2. `config()` options](#2-config-options) +- [3. Variables with `var()`](#3-variables-with-var) +- [4. Template context & helpers](#4-template-context-helpers) +- [5. Macros & reusable Jinja code](#5-macros-reusable-jinja-code) +- [6. Materialization semantics](#6-materialization-semantics) +- [7. Testing & quality gates](#7-testing-quality-gates) +- [8. Quick cheat sheet](#8-quick-cheat-sheet) + +--- + +## 1. Model Fundamentals + +FastFlowTransform discovers models under `<project>/models/` with two primary flavours: + +### 1.1 SQL models (`*.ff.sql`) + +- File stem defines the logical DAG node (`users.ff.sql` → `users.ff`). +- Jinja template rendered with FastFlowTransform context (helpers like `ref`, `source`, `var`, `config`, `this`). +- Output relation defaults to the stem without `.ff` (configurable via `config(alias=...)` if supported in future releases). 
+ +```sql +-- models/users.ff.sql +{{ config(materialized='table', tags=['staging']) }} +create or replace table users as +select id, email +from {{ source('crm', 'users') }}; +``` + +### 1.2 Python models (`*.ff.py`) + +Use the `@model` decorator from `fastflowtransform.core` to register a callable. The decorator accepts: + +- `name` (optional) → overrides the logical name (defaults to stem). +- `deps` → list of dependency nodes (file stems or logical names). +- `requires` → column contract per dependency (validated via `validation.validate_required_columns`). + +Dependencies determine the call signature: + +- Single dependency → function receives a single `pandas.DataFrame`. +- Multiple dependencies → function receives `dict[str, pandas.DataFrame]` keyed by physical relation name (e.g. `"users"`). + +```python +# models/users_enriched.ff.py +from fastflowtransform.core import model +import pandas as pd + +@model( + name="users_enriched", + deps=["users.ff"], + requires={"users": {"id", "email"}} +) +def enrich(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + out["is_gmail"] = out["email"].str.endswith("@gmail.com") + return out +``` + +#### Engine-scoped registration + +When the same project supports multiple execution backends, use `engine_model` to register a Python model only for specific engines. The decorator wraps `@model` but bails out early if the active engine (from `FF_ENGINE` or the selected profile) is not allowed. + +```python +from fastflowtransform import engine_model +import pandas as pd + +@engine_model( + only=("duckdb", "postgres"), + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + ... +``` + +Allowed values are case-insensitive strings or tuples. If the engine does not match, the function is left undecorated and no node is created, preventing duplicate registrations across engine-specific folders. 
+ +### 1.3 Seeds, sources, and dependencies + +- Declare external tables in `sources.yml`; they become available via `source('group','table')`. +- Provide reproducible inputs with CSV/Parquet seeds in `/seeds/`. +- FastFlowTransform auto-detects dependencies: + - SQL models → parse `ref()` / `source()` calls. + - Python models → use the decorator’s `deps`. + - Additional runtime dependencies can be expressed via `relation_for()`. + +> **Warning:** SQL dependency detection is static. Only literal calls such as `ref('users.ff')` are registered. When you need to gate a dependency behind a variable, materialise the options in a mapping (`{'foo': ref('foo'), 'bar': ref('bar')}`) and pick from that map at runtime; a bare `ref(variable)` will not show up in the DAG. + +- Persistence (e.g. Spark/Databricks): configure default targets under `project.yml → models.storage` (and optionally `seeds.storage`). Example: + + ```yaml + models: + storage: + api_users_http: + path: ".local/spark/api_users_http" + format: delta + options: + mergeSchema: true + + seeds: + storage: + users: + path: ".local/spark/seeds/users" + ``` + + Entries end up in `node.meta["storage"]` (keys: `path`, `format`, `options`) and are respected by the matching executor. + +```yaml +# sources.yml +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users + - name: erp + tables: + - name: orders + identifier: seed_orders +``` + +Each source can declare defaults such as `schema`, `database`, or `catalog`. Tables may +override those defaults, add per-engine overrides, or point at files: + +```yaml + - name: raw + schema: staging + tables: + - name: seed_users + identifier: seed_users + overrides: + postgres: + schema: raw + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" +``` + +--- + +## 2. `config()` options + +Call `config()` at the top of SQL models (and optionally within Python models via decorator kwargs in future versions). 
+ +```sql +{{ config( + materialized='view', + tags=['mart', 'daily'] +) }} +``` + +Supported keys (v0.1): + +| Key | Type | Description | +|----------------|-----------------|------------------------------------------------------------------------------| +| `materialized` | `"table" \| "view" \| "ephemeral"` | Controls how FastFlowTransform persists the model. See [Materialization semantics](#6-materialization-semantics). | +| `tags` | `list[str]` | Arbitrary labels surfaced in docs / selection tooling. | +| `engines` | `list[str]` or `str` | Restrict registration to the listed engines (case-insensitive). Requires the active engine to be known (profile selection or `FF_ENGINE`). | +| (future) | – | Additional metadata is stored under `node.meta[...]` if added later. | + +**Tips** + +- Place `config()` before any SQL text. +- Use tags to power custom filters in docs or to drive test selection. +- Combine `engines=[...]` with per-engine subfolders to keep one physical file per backend without name clashes. When no engine is active, FastFlowTransform raises a clear error to avoid silent skips. +- Ephemeral models inline into downstream SQL; pick `view` for shareable logic without materializing a table. + +--- + +## 3. Variables with `var()` + +Project-level variables live under `project.yml → vars:` and can be overridden from the CLI: + +```yaml +# project.yml +vars: + snapshot_day: "2000-01-01" + limit: 100 +``` + +```bash +fft run . --vars snapshot_day='2025-10-01' limit=50 +``` + +Usage in templates: + +```sql +select * +from {{ source('crm','users') }} +where signup_date <= '{{ var("snapshot_day", "1970-01-01") }}' +limit {{ var("limit", 1000) }} +``` + +Resolution order: CLI overrides → project vars → default argument. + +--- + +## 4. Template context & helpers + +Every model (SQL & Python) gets a rich Jinja context. 
Key helpers: + +| Helper | Purpose | +|--------------------|------------------------------------------------------------------------------------------| +| `this` | Object exposing `name`, `relation`, `materialized`, `schema`, `database`. | +| `ref("model")` | Resolves another model’s physical relation (or inlines ephemeral SQL). | +| `source("group","table")` | Resolves entries defined in `sources.yml`. | +| `relation_for(node)` (Python utility) | Maps logical node names to physical relations (helpful inside UDFs/tests). | +| `var("key", default)` | Retrieves project/CLI variables (see above). | + +Example: + +```sql +{{ config(materialized='view') }} +select + u.id, + u.email, + {{ var("country_column", "'US'") }} as country_code +from {{ ref('users.ff') }} as u +-- rendered relation for logging/debugging +-- {{ this.relation }} +``` + +--- + +## 5. Macros & reusable Jinja code + +Organise shared SQL snippets in `models/macros/` (all `.sql` files are auto-loaded): + +``` +models/ + macros/ + string_utils.sql + marts/ + users.ff.sql +``` + +```jinja +{# models/macros/string_utils.sql #} +{% macro safe_lower(col) -%} +lower(trim({{ col }})) +{%- endmacro %} +``` + +Use the macro anywhere within the project: + +```sql +select {{ safe_lower("email") }} as email_lower +from {{ ref('users.ff') }}; +``` + +**Best practices** + +- Keep macros idempotent and side-effect free. +- Group related macros per file (e.g., string utilities, date helpers). +- Document macros with inline comments; FastFlowTransform’s generated docs list each macro with its path. + +--- + +## 6. Materialization semantics + +### SQL models + +| Materialization | Behaviour | +|-----------------|-----------| +| `table` | `CREATE OR REPLACE TABLE … AS <SELECT>` | +| `ephemeral` | No object is created; downstream `ref()` expands to a subquery. | + +**Postgres-specific:** FastFlowTransform rewrites the “create or replace” pattern into `DROP TABLE IF EXISTS …; CREATE TABLE … AS …` for compatibility. 
+ +### Python models + +- Default → materialized as `table`. +- `materialized='view'` produces an engine-specific temporary table first, then creates/overwrites a view that selects from it. +- Ephemeral Python models are not supported in v0.1. + +--- + +## 7. Testing & quality gates + +### 7.1 Column contracts (`requires`) + +Use the decorator’s `requires` argument (Python models) to ensure upstream inputs carry expected columns. Under the hood FastFlowTransform calls `validation.validate_required_columns`, raising `RequiredColumnsError` with a descriptive diff. + +```python +@model( + deps=["orders.ff", "users_enriched"], + requires={ + "orders": {"order_id", "user_id", "amount"}, + "users_enriched": {"id", "email", "is_gmail"} + } +) +def join_orders(inputs: dict[str, pd.DataFrame]) -> pd.DataFrame: + ... +``` + +### 7.2 Data quality tests (`project.yml`) + +Declare checks under `project.yml → tests:`. Each entry maps directly to a function in `fastflowtransform.testing` (`not_null`, `unique`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`). Run them via `fft test …`. + +```yaml +tests: + - type: not_null + table: users + column: email + tags: [batch] +``` + +### 7.3 Model unit tests (`fft utest`) + +Keep transformation logic honest with small, engine-agnostic specs: + +- Place YAML files under `/tests/unit/`. +- Express inputs via inline rows or CSV paths. +- Declare expected output rows plus comparison options (`order_by`, `any_order`, `ignore_columns`, `approx`). + +```yaml +# tests/unit/users_enriched.yml +model: users_enriched +defaults: + inputs: + users: + rows: + - {id: 1, email: "a@example.com"} + - {id: 2, email: "b@gmail.com"} + expect: + relation: users_enriched + order_by: [id] + +cases: + - name: flags_gmail + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} +``` + +Run with: + +```bash +fft utest . --env dev +fft utest . 
--model users_enriched --case flags_gmail +``` + +See the [Technical Overview](./Technical_Overview.md#model-unit-tests-fft-utest) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). + +--- + +## 8. Quick cheat sheet + +| Task | Snippet / Pointer | +|------|-------------------| +| Set materialization | `{{ config(materialized='view') }}` | +| Add tags | `{{ config(tags=['mart','daily']) }}` | +| Read project variable | `{{ var('run_date', '1970-01-01') }}` | +| Current relation name | `{{ this.relation }}` | +| Reference another model | `{{ ref('users.ff') }}` | +| Reference source | `{{ source('crm','users') }}` | +| Macro definition | `models/macros/*.sql` | +| Guarantee columns (Python) | `@model(..., requires={'users': {'id','email'}})` | +| Data-quality test | `project.yml → tests` + `fft test …` | +| Unit test | `tests/unit/*.yml` + `fft utest …` | + +--- + +Return to the [Docs Hub](./index.md) or switch to the [User/Developer Guide](./Technical_Overview.md). + + + + + +### 🆕 `docs/Cache_and_Parallelism.md` + + +# Parallelism & Cache (FastFlowTransform v0.3) + +FastFlowTransform 0.3 introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. + +--- + +## Table of Contents +- [Parallel Scheduler](#parallel-scheduler) +- [Cache Policy](#cache-policy) +- [Fingerprint Formula](#fingerprint-formula) +- [Meta Table Schema](#meta-table-schema) +- [CLI Recipes](#cli-recipes) +- [Troubleshooting & FAQ](#troubleshooting--faq) +- [Example: simple_duckdb](#example-simple_duckdb) +- [Appendix: Environment Inputs](#appendix-environment-inputs) + +--- + +## Parallel Scheduler + +FastFlowTransform splits the DAG into **levels** (all nodes that can run together without violating dependencies). 
Within a level, up to `--jobs` nodes execute in **parallel**. + +- Dependencies are **never** violated. +- `--keep-going`: tasks already started in a level finish; **subsequent levels won’t start** if any task in the current level fails. +- Logs are serialized through an internal queue to keep lines readable and per-node timing visible. + +**Quick start** +```bash +# Run with 4 workers per level +fft run . --env dev --jobs 4 + +# Keep tasks in the same level running even if one fails +fft run . --env dev --jobs 4 --keep-going +```` + +--- + +## Cache Policy + +The cache decides whether a node can be **skipped** when nothing relevant changed. Modes: + +``` +--cache=off # always build +--cache=rw # default; skip on match; write cache after build +--cache=ro # skip on match; on miss build but don't write cache +--cache=wo # always build and write cache +--rebuild # ignore cache for matching nodes +--no-cache # alias for --cache=off +``` + +### Skip condition + +A node is skipped iff: + +1. The current **fingerprint** matches the on-disk cache value, **and** +2. The **physical relation exists** on the target engine. + +If the relation was dropped externally, FastFlowTransform will **rebuild** even if the fingerprint matches. + +--- + +## Fingerprint Formula + +Fingerprints are stable hashes that change on any relevant input: + +* **SQL models**: `fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` + + * Uses **rendered** SQL (after Jinja), not the raw template. +* **Python models**: `fingerprint_py(node, func_src, env_ctx, dep_fps)` + + * Uses `inspect.getsource(func)` with a **file-content fallback** if needed. + +`env_ctx` includes: + +* `engine` (e.g., `duckdb`, `postgres`, `bigquery`) +* `profile_name` (CLI `--env`) +* Selected environment entries: **all `FF_*` keys** (key + value) +* A **normalized** portion of `sources.yml` (sorted keys/dump) + +`dep_fps` are upstream fingerprints; **any upstream change** invalidates downstream fingerprints. 
+ +**Properties** + +* Same inputs ⇒ same hash. +* Minimal change in SQL/function ⇒ different hash. +* Dependency changes propagate downstream. + +--- + +## Meta Table Schema + +After a successful build, FastFlowTransform writes a per-node audit row: + +``` +_ff_meta ( + node_name TEXT/STRING, -- logical name, e.g. "users.ff" + relation TEXT/STRING, -- physical table/view, e.g. "users" + fingerprint TEXT/STRING, + engine TEXT/STRING, + built_at TIMESTAMP +) +``` + +Backends: + +* **DuckDB:** table `_ff_meta` in `main`. +* **Postgres:** table `_ff_meta` in the active schema. +* **BigQuery:** table `<dataset>._ff_meta`. + +> Note: Skip logic uses the file-backed fingerprint cache and a direct relation existence check; the meta table is for auditing and tooling. + +--- + +## CLI Recipes + +```bash +# First run — builds everything, writes cache and meta +fft run . --env dev --cache=rw + +# No-op run — should skip all nodes (if nothing changed) +fft run . --env dev --cache=rw + +# Force rebuild of a single model (ignores cache for it) +fft run . --env dev --cache=rw --rebuild marts_daily.ff + +# Read-only cache (skip on match, build on miss, no writes) +fft run . --env dev --cache=ro + +# Always build and write cache +fft run . --env dev --cache=wo + +# Disable cache entirely +fft run . --env dev --no-cache +``` + +With parallelism: + +```bash +fft run . --env dev --jobs 4 +fft run . --env dev --jobs 4 --keep-going +``` + +--- + +## Troubleshooting & FAQ + +**“Why did it skip?”** +A skip requires a fingerprint match and an existing relation. Fingerprints include: + +* rendered SQL / Python function source, +* `sources.yml` (normalized), +* engine/profile, +* **all `FF_*` environment variables**, +* upstream fingerprints. + +Any change in the above triggers a rebuild downstream. + +**“Relation missing but cache says skip?”** +We also check relation existence. If the table/view was dropped externally, FastFlowTransform will **rebuild**.
+ +**“My logs interleave under parallelism.”** +Logs are serialized via a queue; use `-v` / `-vv` for richer but still stable output. Each node prints start/end and duration; levels summarize. + +**“Utest cache?”** +`fft utest --cache {off|ro|rw}` defaults to `off` for deterministic runs. With `rw`, expensive unit cases can be accelerated. Unit tests do not rely on the meta table by default. + +--- + +## Example: simple_duckdb + +The demo contains two independent staging nodes (`users.ff.sql`, `orders.ff.sql`). They run in **parallel** within the same level. + +Makefile targets: + +```makefile +run_parallel: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --jobs 4 + +cache_rw_first: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw + +cache_rw_second: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw + +cache_invalidate_env: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" FF_DEMO_TOGGLE=1 fft run "$(PROJECT)" --env dev --cache=rw +``` + +--- + +## Appendix: Environment Inputs + +Only environment variables with the `FF_` prefix affect fingerprints (keys and values). If you change one (e.g., `FF_RUN_DATE`, `FF_REGION`), fingerprints change and downstream nodes rebuild. + +```bash +# Will invalidate fingerprints and rebuild affected nodes +FF_RUN_DATE=2025-01-01 fft run . 
--env dev --cache=rw +``` + +```` + +--- + +### 🔗 `docs/index.md` – Link to the new chapter + +```diff +--- a/docs/index.md ++++ b/docs/index.md +@@ -10,6 +10,7 @@ + - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) + - [Modeling Reference](./Config_and_Macros.md) + - [Parallelism & Cache (v0.3)](./Cache_and_Parallelism.md) + - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) +```` + + + + + +# Incremental Models (R1) + +This guide explains how to configure incremental models, use `is_incremental()` in SQL, engine compatibility, and schema change policies. + +## Quick Start + +A minimal incremental model: + +```sql +-- examples/r1_demo/models/fct_events_inc.ff.sql +{{ config( + materialized='incremental', + unique_key=['event_id'], + on_schema_change='append_new_columns' -- or 'sync_all_columns' +) }} +with src as ( + select * from {{ source('app', 'events') }} + {% if is_incremental() %} + where ingested_at > (select coalesce(max(ingested_at), timestamp '1970-01-01') from {{ this.name }}) + {% endif %} +) +select + event_id, + user_id, + event_type, + ingested_at, + -- evolving column: will appear later + meta_json +from src; +```` + +### `is_incremental()` + +* Available in SQL templates during rendering. +* Returns `true` when the model exists and the current `materialized='incremental'` run chooses an incremental path (insert/merge) instead of full rebuild. +* Typical usage: filter the source to “new” rows only.
+ +### Engine Matrix (MVP) + +| Engine | Incremental Insert | Merge/Upsert | Schema Change Policy | +| ------------------ | ------------------ | ------------ | -------------------- | +| DuckDB | ✅ insert | 🚧 fallback* | ✅ append new cols | +| Postgres | ✅ insert | 🚧 fallback* | ✅ append new cols | +| BigQuery (classic) | ✅ insert | 🚧 fallback* | 🚧 best-effort | +| BigQuery BigFrames | ✅ insert | 🚧 fallback* | 🚧 best-effort | +| Databricks Spark | ✅ insert | 🚧 fallback* | 🚧 best-effort | +| Snowflake Snowpark | ✅ insert | 🚧 fallback* | 🚧 best-effort | + +* Fallback strategy merges by delete-on-keys + insert (best effort) if native merge isn’t wired. + +### Schema Change Policies + +* `append_new_columns` (default): new columns appear in target if they show up in the select. +* `sync_all_columns` (planned): attempt to keep type/nullable alignment. Currently not enforced; prefer append in R1. + +### End-to-End + +```bash +# Seeds → initial incremental build → run again with filter +fft seed examples/r1_demo --env dev +fft run examples/r1_demo --env dev --select fct_events_inc.ff +# simulate new data (re-seed or append), then: +fft run examples/r1_demo --env dev --select fct_events_inc.ff +``` + +**Artifacts:** see `.fastflowtransform/target/{manifest.json, run_results.json, catalog.json}`. + + + + + +# Profiles Configuration + +FastFlowTransform uses `profiles.yml` to describe how each environment connects to the execution engine (DuckDB, Postgres, BigQuery, Databricks Spark, Snowflake Snowpark, …). This document covers file layout, supported features, environment overrides, and loading precedence. + +## File Location + +`profiles.yml` lives at the project root (same level as `models/`, `project.yml`). The CLI loads it whenever you run `fft` commands (seed/run/test/dag/utest/docgen …). + +``` +project/ +├── models/ +├── project.yml +└── profiles.yml +``` + +## Basic Structure + +The file is parsed as YAML after optional Jinja rendering. 
Top-level keys represent profile “names” (e.g. `dev`, `prod`, `dev_postgres`). Each profile must include an `engine` plus engine-specific configuration. + +```yaml +dev: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" + +stg: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +prod: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET') }}" + location: EU + +default: + engine: duckdb + duckdb: + path: ":memory:" +``` + +### Engines and Sections + +Supported engines and their expected sections: + +| Engine | Section | Key Fields | +|----------------------|--------------------|---------------------------------------------------| +| `duckdb` | `duckdb` | `path` (file path or `:memory:`) | +| `postgres` | `postgres` | `dsn`, `db_schema` | +| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location` | +| `databricks_spark` | `databricks_spark` | `master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `database`, `table_format`, `table_options` | +| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `db_schema`, optional `role` | + +Each profile can define its own `vars:` block (values exposed via `var('key')` inside templates). + +## Environment Variables + +`profiles.yml` supports Jinja expressions. The helper `env('FF_VAR', 'fallback')` reads process environment variables and substitutes the default if unset. Examples: + +```yaml +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'analytics') }}" +``` + +These expressions are rendered *before* YAML parsing. If the environment variable is missing and no default is provided, the expression resolves to an empty string and validation will fail with a clear error message. 
+ +## Loading Order & Precedence + +When running `fft` commands, `_load_dotenv_layered()` loads `.env` files in ascending precedence: + +1. `<repo>/.env` +2. `<project>/.env` +3. `<project>/.env.local` +4. `<project>/.env.<env>` +5. `<project>/.env.<env>.local` + +Earlier values fill defaults; later files override earlier ones *only for keys that are not already defined*. **Values set in the shell (e.g. via `FF_ENGINE=duckdb fft run …`) have highest priority**—they remain untouched, even if `.env` files define the same key. + +After `.env` loading, `profiles.yml` is rendered with Jinja (using the current `os.environ`) and parsed by Pydantic. Validation ensures required fields are present for each engine and produces human-readable errors for missing DSNs, schemas, etc. + +## Selecting Profiles + +- **Via `--env` flag**: `fft run . --env dev_postgres` +- **Via `FFT_ACTIVE_ENV`**: set in shell or `.env` to choose the active profile name. +- **Legacy `FF_ENGINE`** (overrides `engine` field post-parse): useful for quick experiments but explicit `profiles.yml` entries are preferred. + +Example Makefile snippet that switches profiles without exposing secrets: + +```make +ENGINE ?= duckdb + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres +endif + +seed: + FFT_ACTIVE_ENV=$(PROFILE_ENV) uv run fft seed . --env $(PROFILE_ENV) +``` + +## Using `.env` for Secrets + +Keep sensitive credentials out of VCS by storing them in `.env` files referenced above: + +``` +examples/api_demo/ +├── .env.dev_duckdb # FF_DUCKDB_PATH=... +├── .env.dev_postgres # FF_PG_DSN=..., FF_PG_SCHEMA=... +├── .env.dev_databricks # FF_SPARK_MASTER=..., FF_SPARK_APP_NAME=... +└── profiles.yml +``` + +These files stay out of git (via `.gitignore`), while `profiles.yml` contains only non-sensitive wiring. + +## Summary of Features + +- Multiple profiles in a single YAML file. +- Jinja templating with `env()` helper for dynamic values.
+- `.env` layered loading with shell overrides taking precedence. +- Validation for engine-specific parameters (clear error messages). +- Profile-specific `vars` exposed to Jinja `var()` function in models. +- Works seamlessly across CLI commands: seed, run, dag, test, docgen, utest. + +Keep `profiles.yml` declarative, `.env` files secret, and use CLI or Makefiles to select the active profile per run. This pattern scales from local DuckDB demos to production Postgres/BigQuery/Snowflake deployments. + + + + + +# Sources Configuration + +`sources.yml` declares external tables (seeds, raw inputs, lakehouse paths) that models can reference via `{{ source('group', 'table') }}`. This document covers the schema, engine overrides, file paths, and best practices. + +## File Location + +Place `sources.yml` at your project root (same level as `models/`). Example: + +``` +project/ +├── models/ +├── sources.yml +└── seeds/ +``` + +## YAML Schema (Version 2) + +FastFlowTransform expects a dbt-style structure: + +```yaml +version: 2 +sources: + - name: raw + schema: staging # default schema for this source group + overrides: + postgres: + schema: raw_main # engine-specific default override + + tables: + - name: seed_users + identifier: seed_users # optional physical name + overrides: + duckdb: + schema: main + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" +``` + +### Fields + +| Level | Field | Description | +|----------|-------------|-------------| +| source | `name` | Logical group identifier referenced by `source('name', ...)`. | +| | `schema` | Default target schema/database for the group. | +| | `database`/`catalog` | Optional qualifiers per engine (BigQuery, Snowflake). | +| | `overrides` | Map of engine → config snippet (schema overrides, formats, locations). | +| table | `name` | Logical table name (second argument in `source()`). | +| | `identifier`| Physical name; defaults to `name` if omitted. 
| +| | `location` | File/path location (used with `format`). | +| | `format` | Ingestion format for engines supporting path-based sources (`delta`, `parquet`, …). | +| | `options` | Dict of format options (Spark/Databricks). | +| | `overrides` | Additional engine-specific settings merged with source-level overrides. | + +Engine-specific overrides follow this merge order: + +1. Source defaults (`schema`, `database`, …) +2. Source-level `overrides[engine]` +3. Table-level `overrides[engine]` + +### Engine Behavior + +- **DuckDB / Postgres / BigQuery / Snowflake**: expect `identifier` (plus `schema`/`database` where relevant). Path-based sources raise errors. +- **Databricks Spark**: supports `format` + `location`. The executor registers a temp view with optional `options` (e.g. `compression`). + +### Path-Based Sources Example + +```yaml + - name: raw_events + tables: + - name: landing + overrides: + databricks_spark: + format: json + location: "abfss://landing@storage.dfs.core.windows.net/events/*.json" + options: + multiline: true +``` + +## Referencing Sources in Models + +```sql +select id, email +from {{ source('raw', 'seed_users') }} +``` + +After rendering, the executor resolves the fully-qualified relation or path depending on the active engine. + +## Seed Integration + +When combined with `seeds/schema.yml`, you can map CSV/Parquet seeds into schemas per engine: + +```yaml +targets: + raw/users: + schema: raw + schema_by_engine: + duckdb: main + postgres: staging +``` + +## Validation & Errors + +- Missing `identifier` *and* `location` produce `KeyError` during rendering. +- Unknown source/table names raise `KeyError` with suggestions. +- Unsupported path-based sources on an engine (`location` provided but no `format`) raise descriptive `NotImplementedError`. + +Keep `sources.yml` declarative, use engine overrides for schema differences, and lean on `.env` files where credentials or URIs vary per environment. 
+ + + + + +# Project Configuration (`project.yml`) + +`project.yml` defines global metadata, documentation, variables, and data-quality tests for a FastFlowTransform project. This reference walks through the supported sections and common patterns. + +## File Location + +`project.yml` lives at the root of your project. + +``` +project/ +├── models/ +├── project.yml +└── profiles.yml +``` + +## Top-Level Keys + +```yaml +name: my_project +version: "0.1" +models_dir: models # optional, defaults to "models" + +docs: + dag_dir: site/dag # output for fft dag --html + models: + users: + description: "Raw users table" + columns: + id: "Primary key" + email: "Email address" + +vars: + snapshot_day: "2024-01-01" + default_limit: 100 + +tests: + - type: not_null + table: users + column: id + tags: [batch] +``` + +### Metadata + +| Key | Description | +|-------------|-------------| +| `name` | Project identifier (used in docs/metadata). | +| `version` | Arbitrary version string. | +| `models_dir`| Relative directory containing models (`*.ff.sql` / `*.ff.py`). | + +### Documentation (`docs`) + +- `dag_dir`: where `fft dag --html` writes the static site. +- `models`: per-model descriptions and column docs surfaced in the generated DAG/docs. + +### Variables (`vars`) + +Key/value pairs accessible via `{{ var('key', default) }}` in Jinja templates. CLI overrides (`--vars key=value`) take precedence. + +### Tests (`tests`) + +Project-wide data quality checks run by `fft test`. Each test is a dict with: + +- `type`: `not_null`, `unique`, `accepted_values`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`, or reconciliation checks (`reconcile_equal`, `reconcile_diff_within`, `reconcile_ratio_within`, `reconcile_coverage`). +- `table`: target table or relation. +- `column`: required for column-based tests. +- Optional: `tags`, `severity` (`error`/`warn`), additional parameters (e.g. `values`, `min`, `max`). 
+ +Example: + +```yaml +tests: + - type: accepted_values + table: mart_users + column: status + values: [active, invited] + severity: warn + - type: reconcile_equal + name: revenue_vs_bookings + left: { table: fct_revenue, expr: "sum(amount)" } + right: { table: fct_bookings, expr: "sum(expected_amount)" } + abs_tolerance: 5.0 +``` + +## Interaction with `.env` and Profiles + +`project.yml` does not read environment variables directly. However: + +- `vars:` can reference `var('key')` defaults overridden by CLI or `.env`. +- Tests often depend on `profiles.yml` and `sources.yml` for the actual connection details. +- Makefiles may set `FFT_ACTIVE_ENV` or other `FF_*` variables influencing runs, but `project.yml` remains static. + +## Best Practices + +- Keep `project.yml` committed to version control (no secrets). +- Use `docs/` to provide richer Markdown descriptions; reference them via `columns` or `description` fields if desired. +- Organize tests by tag (`tags: [batch]`, `tags: [reconcile]`) to support selective execution: `fft test . --select tag:reconcile`. + +Refer to `docs/Data_Quality_Tests.md` for detailed test semantics and `docs/Profiles.md` for profile/env loading behavior. + + + + + +# State Selection — R1 + +Build only changed nodes or select by last run results. + +## Changed Nodes + +- `state:modified` — models that have changed since last cached fingerprint. +- `state:modified+` — the above plus all downstream dependents. 
+ +```bash +# First run populates cache +fft run examples/r1_demo --env dev --cache rw +# Touch files / change SQL → next run: +fft run examples/r1_demo --env dev --cache rw --select state:modified +fft run examples/r1_demo --env dev --cache rw --select state:modified+ +```` + +## Result-based Selection + +Use the last `run_results.json`: + +* `result:ok` — successful models (no warnings) +* `result:warn` — successful but with warnings +* `result:fail` — alias of `result:error` +* `result:error`— failed models + +```bash +fft run examples/r1_demo --env dev --select result:error +``` + +### Artifacts + +``` +examples/r1_demo/.fastflowtransform/target/ +├── manifest.json +├── run_results.json +└── catalog.json +``` + + + + + +# YAML Tests (Schema-bound) + +Schema-bound tests live in `models/*.yml` or `models/**/schema.yml` and complement (or replace) `project.yml`-based tests. + +## Example + +```yaml +# examples/r1_demo/models/users_enriched.yml +version: 2 +models: + - name: users_enriched + description: "Adds gmail flag" + columns: + - name: id + tests: + - not_null: { severity: error } + - unique + - name: email + tests: + - not_null + - accepted_values: + values: ["a@example.com","b@example.com","c@gmail.com"] + severity: warn +```` + +### Severities + +* `error` → contributes to failures (exit code 2). +* `warn` → surfaced in summary as ❕, does not affect exit code. + +### Run + +```bash +fft test examples/r1_demo --env dev +# Select only tests tagged 'reconcile' (if present) +fft test examples/r1_demo --env dev --select tag:reconcile +``` + +### Output (excerpt) + +``` +Data Quality Summary +──────────────────── +✅ not_null users.id (3ms) +❌ unique users.id (2ms) + ↳ [unique] users.id: found 1 duplicate +❕ accepted_values users_enriched.email (1ms) + +Totals +────── +✓ passed: 2 +✗ failed: 1 +! 
warnings: 1 +``` + + + + + +# Data Quality Test Reference + +FastFlowTransform exposes a set of built-in data quality checks that you can configure in `project.yml → tests:` and execute with `fft test`. This document lists every supported test, required parameters, and example configurations. + +## Usage Overview + +```yaml +# project.yml +tests: + - type: not_null + table: users + column: id + severity: error # default (omit for error) + tags: [batch] + + - type: unique + table: users + column: email + tags: [batch] + + - type: accepted_values + table: users + column: status + values: [active, invited] + severity: warn # warn keeps run green on failure + + - type: row_count_between + table: users_enriched + min: 1 + max: 100000 + + - type: reconcile_equal + name: revenue_vs_bookings # optional label in summaries + tags: [reconcile] + left: { table: fct_revenue, expr: "sum(amount)" } + right: { table: fct_bookings, expr: "sum(expected_amount)" } + abs_tolerance: 5.0 +``` + +Every entry is a single dictionary describing one check. The common keys are: + +| Key | Description | +|------------|-------------| +| `type` | Test kind (see tables below). | +| `table` | Target table for table-level checks or display hint for reconciliations. | +| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | +| `severity` | `error` (default) or `warn`. | +| `tags` | Optional list of selectors for `fft test --select tag:...`. | +| `name` | Optional identifier surfaced in summaries (useful for reconciliations). | + +Run all configured checks: + +```bash +fft test . --env dev +``` + +Use `--select tag:<name>` to restrict by tags (legacy `--select batch` reads the same tags list). Tests always execute regardless of cache settings. + +Each entry produces a summary line. Failures stop the command unless `severity: warn` is set. + +## Table-Level Checks + +These checks operate on a single table (optionally filtered with `where:`).
Unless noted, they require a `column` argument. + +### `not_null` +- **Purpose:** Assert that a column never contains NULLs. +- **Parameters:** + - `column` *(str, required)* + - `where` *(str, optional)* — SQL predicate applied before the NULL check. +- **Failure:** Reports the number of NULL rows and shows the underlying SQL. + +### `unique` +- **Purpose:** Detect duplicates within a column. +- **Parameters:** + - `column` *(str, required)* + - `where` *(str, optional)* +- **Failure:** Indicates how many duplicate groups were found (HAVING count > 1) and shows a sample query. + +### `accepted_values` +- **Purpose:** Ensure every non-NULL value is inside an allowed set. +- **Parameters:** + - `column` *(str, required)* + - `values` *(list, required)* — permitted literals (strings are quoted automatically). + - `where` *(str, optional)* +- **Failure:** Shows the number of out-of-set values plus up to five sample values. + +### `greater_equal` +- **Purpose:** Require all values to be greater than or equal to a threshold. +- **Parameters:** + - `column` *(str, required)* + - `threshold` *(number, default `0`)* +- **Failure:** Lists how many rows fell below the threshold. + +### `non_negative_sum` +- **Purpose:** Validate that the sum of a numeric column is not negative. +- **Parameters:** + - `column` *(str, required)* +- **Failure:** Reports the signed sum when it is negative. + +### `row_count_between` +- **Purpose:** Guard minimum (and optional maximum) row counts for a table. +- **Parameters:** + - `min` *(int, default `1`)* + - `max` *(int, optional)* — omit for open-ended upper bounds. +- **Failure:** Indicates the observed row count when it falls outside `[min, max]`. + +### `freshness` +- **Purpose:** Warn when the latest timestamp is older than an allowed delay. +- **Parameters:** + - `column` *(str, required)* — timestamp column. + - `max_delay_minutes` *(int, required)* — permitted staleness. +- **Failure:** Reports the computed lag in minutes. 
Uses ANSI-style `DATE_PART` (works on DuckDB/Postgres; extend for other engines as needed). + +## Cross-Table Reconciliations + +Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. + +### `reconcile_equal` +- **Purpose:** Compare two scalar expressions with optional tolerances. +- **Parameters:** + - `left`, `right` *(dict, required)* with keys: + - `table` *(str, required)* + - `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). + - `where` *(str, optional)* + - `abs_tolerance` *(float, optional)* — maximum absolute difference. + - `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. +- **Failure:** Displays both values, absolute and relative differences. + +### `reconcile_ratio_within` +- **Purpose:** Constrain the ratio `left/right` within bounds. +- **Parameters:** + - `left`, `right` *(dict, required as above)* + - `min_ratio`, `max_ratio` *(float, required)* +- **Failure:** Shows the computed ratio and expected interval. + +### `reconcile_diff_within` +- **Purpose:** Limit the absolute difference between two aggregates. +- **Parameters:** + - `left`, `right` *(dict, required)* + - `max_abs_diff` *(float, required)* +- **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. + +### `reconcile_coverage` +- **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero). +- **Parameters:** + - `source` *(dict, required)* — `table` and `key` column. + - `target` *(dict, required)* — `table` and `key` column. + - `source_where` *(str, optional)* — filter applied to the source. + - `target_where` *(str, optional)* — filter applied to the target. +- **Failure:** Reports the number of missing keys. + +## Severity & Selectors + +- `severity: error` (default) makes failures stop the test run with exit code 1. 
+- `severity: warn` records the result but keeps the run successful. +- `tags:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. + +## CLI Summary Output + +Each executed check produces a line in the summary: + +``` +✓ not_null users.email (3ms) +✖ accepted_values events.status values=['new', 'active'] (warn) +``` + +Failures include the generated SQL (where available) to simplify debugging. Use `fft test --verbose` for more detail, or `FFT_SQL_DEBUG=1` to log the underlying queries. + +## Further Reading + +- [`docs/YAML_Tests.md`](YAML_Tests.md) – schema for YAML-defined tests and advanced scenarios. +- `fft test --help` — command-line switches, selectors, and cache options. + + + + + +# Environment Matrix (DuckDB-only) — Example + +This tiny project demonstrates **per-environment configuration** (dev / stg / prod) while keeping everything on **DuckDB**. +Each environment uses its **own DuckDB file**, so you can switch environments without changing code. + +It also includes a **seed step** (CSV → table) and three minimal models: + +* `env_vars.ff` (Python) — echoes which env is active and which DuckDB file is used +* `hello.ff` (SQL view) — shows how `{{ this.* }}` resolves from the active profile +* `users.ff` (SQL table) — reads from the seeded CSV table to prove seeding works + +--- + +## What this shows + +* Layered environment files: `.env.dev`, `.env.stg`, `.env.prod` (+ optional `*.local` overrides) +* `profiles.yml` that reads from `env('…')` so connection details live in env files +* All environments use **DuckDB**, but **different DB files** (e.g.
`.local/dev.duckdb`, `.local/stg.duckdb`, …) +* Seeding CSV → `seed_users` table, then a simple model consuming it + +--- + +## Project layout + +``` +examples/env_matrix/ +├─ models/ +│ ├─ env_vars.ff.py # Python model: shows env + DuckDB file info +│ └─ users.ff.sql # SQL table: reads from seeded 'seed_users' +├─ seeds/ +│ └─ users.csv # sample data for seeding (-> seed_users) +├─ profiles.yml # all envs = DuckDB, different paths +├─ .env # shared defaults (optional) +├─ .env.dev # dev environment vars +├─ .env.stg # stg environment vars +├─ .env.prod # prod environment vars +├─ .env.dev.local # private overrides (gitignored; optional) +├─ .env.stg.local # private overrides (gitignored; optional) +├─ .env.prod.local # private overrides (gitignored; optional) +└─ Makefile # convenience targets (run, seed, dag) +``` + +--- + +## Environment files + +Each env file sets a different DuckDB path: + +* `.env.dev` + + ``` + FFT_ACTIVE_ENV=dev + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.dev.duckdb + ``` + +* `.env.stg` + + ``` + FFT_ACTIVE_ENV=stg + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.stg.duckdb + ``` + +* `.env.prod` + + ``` + FFT_ACTIVE_ENV=prod + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.prod.duckdb + ``` + +> You can place secrets or machine-local tweaks in `.env.<env>.local` (ignored by git).
+> Optional toggles (if you want verbose SQL logs): +> `FFT_SQL_DEBUG=1`, `FFT_LOG_JSON=1` + +--- + +## `profiles.yml` (DuckDB for all envs) + +```yaml +default: + dev: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" + + stg: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" + + prod: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" +``` + +--- + +## Models + +### `models/env_vars.ff.py` (Python) + +Returns one row with: + +* `active_env_hint` (from `.env.*`), +* `ff_engine` (should be `duckdb` here), +* `duckdb_path`, `duckdb_exists`, `duckdb_size_bytes`. + +### `models/hello.ff.sql` (SQL view) + +Uses `{{ this.materialized }}`, `{{ this.schema }}`, `{{ this.database }}` so you can see what the active profile provides. (The simple `SELECT` is compatible with DuckDB; if you added casts like `::text`, they’re fine in DuckDB too.) + +### `models/users.ff.sql` (SQL table) + +Reads from the seeded table `seed_users`: + +```sql +{{ config(materialized='table', tags=['demo', 'seed']) }} + +select + id, + email +from "seed_users"; +``` + +> If you see an error “table seed_users does not exist”, you **haven’t run `fft seed`** for that environment yet. + +--- + +## Seeds + +`seeds/users.csv` is loaded by `fft seed` into a table named `seed_users`. +(That’s the default naming convention: `users.csv` → `seed_users`.) 
+ +--- + +## Running it + +From the repo root: + +### Using `uv` directly + +**Dev** + +```bash +uv run fft seed examples/env_matrix --env dev +uv run fft run examples/env_matrix --env dev +uv run fft dag examples/env_matrix --env dev --html +``` + +**Staging** + +```bash +uv run fft seed examples/env_matrix --env stg +uv run fft run examples/env_matrix --env stg +``` + +**Prod** + +```bash +uv run fft seed examples/env_matrix --env prod +uv run fft run examples/env_matrix --env prod +``` + +### Using the Makefile (inside `examples/env_matrix/`) + +```bash +make run-dev # runs the DAG on dev +make run-stg +make run-prod + +make seed-dev # seed only (dev) +make seed-stg +make seed-prod + +make dag-dev # generate HTML DAG for dev +make clean # remove .local/, docs/, site/, .fastflowtransform/ +``` + +> Tip: re-run `fft seed` whenever you switch environments or change `seeds/*.csv`. + +--- + +## Inspecting results + +* The **HTML DAG** (after `make dag-dev`) will be at: + + ``` + examples/env_matrix/site/dag/index.html + ``` +* The **artifacts** are under: + + ``` + examples/env_matrix/.fastflowtransform/target/{manifest.json, run_results.json, catalog.json} + ``` +* Query the DuckDB files directly with `duckdb` CLI or `python` + `duckdb` module if you want to peek inside. + +--- + +## Troubleshooting + +* **`seed_users` not found** + Run `fft seed` for the same environment: + `uv run fft seed examples/env_matrix --env dev` + +* **No logs showing** + Use `-v`/`-vv` and/or `--sql-debug` on the CLI, or set: + + ``` + FFT_SQL_DEBUG=1 + FFT_LOG_JSON=1 # optional JSON logs + ``` + +* **Wrong environment picked** + Double-check the `--env` flag in your CLI call and ensure the `.env.<env>` file exists.
+ +--- + +## Clean up + +```bash +make clean # from examples/env_matrix/ +# or manually: +rm -rf examples/env_matrix/.local examples/env_matrix/site examples/env_matrix/docs +rm -rf examples/env_matrix/.fastflowtransform +``` + + + + + +# API Demo Project + +The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local data, external APIs, and multiple execution engines. It highlights: + +- **Hybrid data model**: joins a local seed (`crm.users`) with live user data from JSONPlaceholder. +- **Multiple environments**: switch between DuckDB, Postgres, and Databricks Spark using `profiles.yml` + `.env.*`. +- **HTTP integration**: compare the built-in FastFlowTransform HTTP client (`api_users_http`) with a plain `requests` implementation (`api_users_requests`). +- **Offline caching & telemetry**: inspect HTTP snapshots via `run_results.json`. +- **Engine-aware registration**: scope Python models via `engine_model` and SQL models via `config(engines=[...])` so only the active engine’s nodes load. + +## Data Model + +1. **Seed staging** – `models/common/users.ff.sql` + ```sql + {{ config( + materialized='table', + tags=[ + 'example:api_demo', + 'scope:common', + 'kind:seed-consumer', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ] + ) }} + select id, email + from {{ source('crm', 'users') }}; + ``` + Consumes `sources.yml → crm.users` (seeded from `seeds/seed_users.csv`). + +2. **API enrichment** – two Python implementations under `models/engines/duckdb/`: + - `api_users_http.ff.py` uses the built-in HTTP wrapper (`fastflowtransform.api.http.get_df`) with cache/offline support. + - `api_users_requests.ff.py` uses raw `requests` for maximum flexibility. + - Wrap engine-specific callables with `engine_model(only="duckdb", ...)` to skip registration when another engine is selected. + +3. 
**Mart join** – `models/common/mart_users_join.ff.sql` + ```sql + {{ config(engines=['duckdb','postgres','databricks_spark']) }} + {% set api_users_model = var('api_users_model', 'api_users_http') %} + {% set api_users_refs = { + 'api_users_http': ref('api_users_http'), + 'api_users_requests': ref('api_users_requests') + } %} + {% set api_users_relation = api_users_refs.get(api_users_model, api_users_refs['api_users_http']) %} + with a as ( + select u.id as user_id, u.email from {{ ref('users.ff') }} u + ), + b as ( + select * from {{ api_users_relation }} + ) + select ... + ``` + Ties everything together and exposes the `var('api_users_model')` hook to choose the HTTP implementation while still keeping literal `ref('…')` calls in the template (required for DAG detection). `config(engines=[...])` keeps the SQL node registered only for the engines you list, preventing duplicate names across engine-specific folders. + + > **Warning:** The DAG builder only detects dependencies from literal `ref('model_name')` strings. A pure `ref(api_users_model)` (without the mapping shown above) compiles, but the graph would miss the edge to `api_users_http`/`api_users_requests`. + +## Profiles & Secrets + +`profiles.yml` defines per-engine profiles that reference environment variables: + +```yaml +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/api_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" +``` + +`.env.dev_*` files supply the actual values. `_load_dotenv_layered()` loads them in ascending priority order: repo `.env` → project `.env` → `.env.<env>` → shell overrides (highest priority). Secrets stay out of version control. + + +## Makefile Workflow + +`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`) and wraps the main commands: + +```make +ENGINE ?= duckdb + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb +endif +... 
+ +seed: + uv run fft seed "$(PROJECT)" --env $(PROFILE_ENV) +run: + env FFT_ACTIVE_ENV=$(PROFILE_ENV) ... uv run fft run ... +``` + +Common targets: + +| Target | Description | +|--------------------------|-------------| +| `make ENGINE=duckdb seed`| Materialize seeds into DuckDB. | +| `make ENGINE=postgres run`| Execute the full pipeline against Postgres. | +| `make dag` | Render documentation (`site/dag/`). | +| `make api-run` | Run only API models (uses HTTP cache). | +| `make api-offline` | Force offline mode (`FF_HTTP_OFFLINE=1`). | +| `make api-show-http` | Display HTTP snapshot metrics via `jq`. | + +HTTP tuning parameters (`FF_HTTP_ALLOWED_DOMAINS`, cache dir, timeouts) live in `.env` and are appended via `HTTP_ENV` when running commands. + +## End-to-End Demo + +1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres` or `ENGINE=databricks_spark` to switch. +2. **Seed data**: `make seed` +3. **Run pipeline**: `make run` +4. **Explore docs**: `make dag` → open `examples/api_demo/site/dag/index.html` +5. **Inspect HTTP usage**: `make api-show-http` + +This example demonstrates multi-engine configuration, environment-driven secrets, and API enrichment within FastFlowTransform. + + + + + +## Local Engine Setup + +### DuckDB + +- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). +- Create the target directory once: `mkdir -p examples/api_demo/.local`. +- Run `make ENGINE=duckdb seed run` to build the seeds and models inside the DuckDB file. + +### Postgres + +- Start a local database, e.g. via Docker: + `docker run --name fft-postgres -e POSTGRES_PASSWORD=postgres -p 5432:5432 -d postgres:15`. +- Set `FF_PG_DSN` in `.env.dev_postgres` (for example `postgresql+psycopg://postgres:postgres@localhost:5432/fft`) and optionally override `FF_PG_SCHEMA` (defaults to `api_demo`). + The executor ensures the schema exists via `CREATE SCHEMA IF NOT EXISTS` on first connect. 
+- Execute `make ENGINE=postgres seed run` to materialize seeds and models in Postgres. + +### Databricks Spark (local) + +- Install Java (JDK ≥ 17) and declare `JAVA_HOME`, for example: + `brew install openjdk@17` + `echo 'JAVA_HOME=/opt/homebrew/opt/openjdk@17' >> examples/api_demo/.env.dev_databricks`. +- Optionally tweak `FF_SPARK_MASTER` / `FF_SPARK_APP_NAME` in `.env.dev_databricks` (default: `local[*]`). +- To persist tables across separate `seed`/`run` sessions, enable the bundled Hive metastore defaults: + `FF_DBR_ENABLE_HIVE=1`, `FF_DBR_WAREHOUSE_DIR=examples/api_demo/spark-warehouse`, `FF_DBR_DATABASE=api_demo`. +- Switch the physical format by setting `FF_DBR_TABLE_FORMAT` (e.g. `delta`, requires the Delta Lake runtime); extra writer options can be supplied via `profiles.yml → databricks_spark.table_options`. +- Ensure your shell loads `.env.dev_databricks` (via `make`, `direnv`, or manual export) and run `make ENGINE=databricks_spark seed run`. + + + + + +--8<-- "Contributing.md" + + + + + +# License + +--8<-- "License" diff --git a/Contributing.md b/Contributing.md index 0166271..ea84ae2 100644 --- a/Contributing.md +++ b/Contributing.md @@ -87,4 +87,4 @@ Instances of abusive, harassing, or otherwise unacceptable behavior may be repor ## 📄 License -By contributing, you agree that your contributions will be licensed under the [Apache-2.0 License](LICENSE). +By contributing, you agree that your contributions will be licensed under the [Apache-2.0 License](License.md). diff --git a/License b/License index dbe706b..7638541 100644 --- a/License +++ b/License @@ -199,7 +199,7 @@ file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. -Copyright [2025] [Your Name or Your Company] +Copyright (c) 2025 Marko Lekic Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/Makefile.dev b/Makefile.dev index 9d24089..27125ff 100644 --- a/Makefile.dev +++ b/Makefile.dev @@ -14,10 +14,6 @@ dev-venv: $(UV) pip install --python .venv/bin/python --upgrade pip $(UV) pip install --python .venv/bin/python --editable . -# Lightweight local CI entrypoint (full matrix lives in GitHub Actions) -ci: - pytest -q - test-pg-batch: FFT_SQL_DEBUG=1 $(UV) run pytest -q tests/test_smoke_postgres.py::test_pg_batch_tests_green @@ -25,7 +21,7 @@ unittest: FFT_SQL_DEBUG=1 $(UV) run pytest -q tests cover: - uv run pytest --cov=src/fastflowtransform --cov-report=term-missing --cov-report=xml --cov-report=html + FFT_SQL_DEBUG=1 $(UV) run pytest -q tests --cov=fastflowtransform --cov-report=term-missing --cov-report=html utest: fft utest "$(FF_PROJECT)" --env "$(FF_ENV)" @@ -60,3 +56,6 @@ act-commit: -P ubuntu-latest=ghcr.io/catthehacker/ubuntu:act-22.04 \ --env UV_PROJECT_ENVIRONMENT=/tmp/uv-venv \ --env VIRTUAL_ENV= + +concat-docs: + $(UV) run python _scripts/concat_docs.py -o Combined.md diff --git a/README.md b/README.md index 345e03b..b0d43e9 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,15 @@ make install # upgrades pip + installs FastFlowTransform in editable mode ## Quickstart -> 📚 **Mehr lesen … CLI-Details** +### Project skeleton (optional) + +```bash +fft init ./demo_project --engine duckdb +``` + +`fft init` generates a non-interactive skeleton (no demo models) and adds inline comments pointing to the relevant documentation pages. + +> 📚 **Read more… CLI-Details** > For flag referencees, automatization and backgrounds see [`docs/Technical_Overview.md`](docs/Technical_Overview.md#cli-flows). 
Run the end-to-end DuckDB demo (seed → run → docs → tests) in under a minute: diff --git a/_scripts/concat_docs.py b/_scripts/concat_docs.py new file mode 100644 index 0000000..1c00d6e --- /dev/null +++ b/_scripts/concat_docs.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +# concat_docs.py +""" +Fügt alle Markdown-Dateien aus dem docs-Verzeichnis zu einer einzelnen Datei zusammen. +- Respektiert die Reihenfolge in mkdocs.yml (nav). +- Ignoriert doppelte Einträge / Anker (#...). +- Hängt übrige .md-Dateien (nicht in nav) am Ende an. +- Optional: Headings demoten (um mehrfaches H1 zu vermeiden). + +Beispiel: + python concat_docs.py -o Combined.md + python concat_docs.py -o Combined.md --demote --exclude "reference/**" --exclude "site/**" +""" + +from __future__ import annotations +import argparse +import fnmatch +import os +from pathlib import Path +import re +import sys + +try: + import yaml # PyYAML +except ImportError: + yaml = None + +DOCS_DIR_DEFAULT = "docs" +MKDOCS_YML = "mkdocs.yml" + + +def load_nav_order(project_root: Path) -> list[Path]: + """Liest mkdocs.yml und extrahiert eine geordnete Liste der Markdown-Pfade (ohne Anker).""" + yml_path = project_root / MKDOCS_YML + ordered: list[Path] = [] + if yaml is None or not yml_path.exists(): + return ordered # keine Order-Info -> leere Liste + data = yaml.safe_load(yml_path.read_text(encoding="utf-8")) + nav = data.get("nav") if isinstance(data, dict) else None + if not isinstance(nav, list): + return ordered + + def normalize_nav_item(item) -> list[str]: + # item kann dict ({"Title": "path.md" | ["subitems"]}) oder string sein + out: list[str] = [] + if isinstance(item, str): + out.append(item) + elif isinstance(item, dict): + for _, v in item.items(): + if isinstance(v, str): + out.append(v) + elif isinstance(v, list): + for sub in v: + out.extend(normalize_nav_item(sub)) + return out + + paths = [] + for entry in nav: + paths.extend(normalize_nav_item(entry)) + + seen = set() + for p in paths: + # Nur Dateien 
unter docs berücksichtigen; Anker entfernen + p_no_anchor = p.split("#", 1)[0] + if not p_no_anchor.lower().endswith(".md"): + continue + # mkdocs erlaubt relative Pfade; wir interpretieren sie relativ zu docs/ + # Falls der Pfad bereits "docs/..." enthält, normalisieren wir trotzdem + if p_no_anchor.startswith(DOCS_DIR_DEFAULT + "/"): + rel = Path(p_no_anchor).relative_to(DOCS_DIR_DEFAULT) + else: + rel = Path(p_no_anchor) + if rel.as_posix() not in seen: + seen.add(rel.as_posix()) + ordered.append(rel) + return ordered + + +def collect_md_files(docs_dir: Path) -> list[Path]: + return sorted([p.relative_to(docs_dir) for p in docs_dir.rglob("*.md")]) + + +def apply_excludes(paths: list[Path], patterns: list[str]) -> list[Path]: + if not patterns: + return paths + kept = [] + for p in paths: + posix = p.as_posix() + if any(fnmatch.fnmatch(posix, pat) for pat in patterns): + continue + kept.append(p) + return kept + + +def demote_headings(text: str, levels: int = 1) -> str: + """ + Erhöht die Anzahl der '#' um 'levels' für alle ATX-Headings (Markdown #). + Lässt Codeblöcke unberührt. + """ + if levels <= 0: + return text + + lines = text.splitlines() + in_code = False + fence_re = re.compile(r"^(```|~~~)") + heading_re = re.compile(r"^(#{1,6})\s+") + for i, line in enumerate(lines): + if fence_re.match(line.strip()): + in_code = not in_code + continue + if in_code: + continue + m = heading_re.match(line) + if m: + hashes = m.group(1) + new_level = min(len(hashes) + levels, 6) + lines[i] = "#" * new_level + line[m.end(1) :] + return "\n".join(lines) + + +def read_file(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except UnicodeDecodeError: + return path.read_text(encoding="utf-8", errors="replace") + + +def main(): + parser = argparse.ArgumentParser( + description="Concatenate Markdown files from docs/ into a single file." 
+ ) + parser.add_argument( + "-d", + "--docs-dir", + default=DOCS_DIR_DEFAULT, + help="Pfad zum docs-Verzeichnis (Default: docs)", + ) + parser.add_argument("-o", "--output", required=True, help="Ausgabedatei (z. B. Combined.md)") + parser.add_argument( + "--demote", + action="store_true", + help="Headings ab der zweiten Datei um eine Ebene demoten (# -> ##, usw.)", + ) + parser.add_argument( + "--exclude", + action="append", + default=[], + help="Glob-Pattern zum Ausschließen (z. B. 'reference/**'). Mehrfach nutzbar.", + ) + parser.add_argument( + "--no-nav", + action="store_true", + help="mkdocs.yml ignorieren und alphabetisch alle .md zusammenfügen", + ) + args = parser.parse_args() + + project_root = Path(".").resolve() + docs_dir = (project_root / args.docs_dir).resolve() + if not docs_dir.exists(): + print(f"Fehler: docs-Verzeichnis nicht gefunden: {docs_dir}", file=sys.stderr) + sys.exit(1) + + # 1) Reihenfolge aus mkdocs.yml (falls nicht deaktiviert / vorhanden) + nav_order = load_nav_order(project_root) if not args.no_nav else [] + all_md = collect_md_files(docs_dir) + all_md = apply_excludes(all_md, args.exclude) + + # 2) Liste zusammenstellen: zuerst nav, dann Rest (ohne Duplikate) + ordered: list[Path] = [] + seen = set() + for rel in nav_order: + if rel in all_md and rel.as_posix() not in seen: + ordered.append(rel) + seen.add(rel.as_posix()) + for rel in all_md: + if rel.as_posix() not in seen: + ordered.append(rel) + seen.add(rel.as_posix()) + + if not ordered: + print("Keine Markdown-Dateien gefunden.", file=sys.stderr) + sys.exit(2) + + out_path = Path(args.output).resolve() + out_path.parent.mkdir(parents=True, exist_ok=True) + + parts = [] + for i, rel in enumerate(ordered, start=1): + src = docs_dir / rel + content = read_file(src) + if i > 1 and args.demote: + content = demote_headings(content, levels=1) + + header = f"\n\n\n\n" + parts.append(header + content.strip() + "\n") + + out_text = f"# Combined Documentation\n\n" + "\n".join(parts) + 
out_path.write_text(out_text, encoding="utf-8") + print(f"✔️ {len(ordered)} Dateien zusammengeführt → {out_path}") + + +if __name__ == "__main__": + main() diff --git a/docs/Api_Models.md b/docs/Api_Models.md index 18708b3..0d93861 100644 --- a/docs/Api_Models.md +++ b/docs/Api_Models.md @@ -9,14 +9,14 @@ * [`get_json`](#get_json) * [`get_df`](#get_df) * [Pagination](#pagination) - * [Context & Telemetry](#context--telemetry) -* [CLI Flags & Environment Variables](#cli-flags--environment-variables) + * [Context & Telemetry](#context-telemetry) +* [CLI Flags & Environment Variables](#cli-flags-environment-variables) * [Example Model](#example-model) * [Artifacts](#artifacts) -* [Tests & Offline Demos](#tests--offline-demos) +* [Tests & Offline Demos](#tests-offline-demos) * [Best Practices](#best-practices) * [Troubleshooting](#troubleshooting) -* [Security & Compliance](#security--compliance) +* [Security & Compliance](#security-compliance) * [FAQ](#faq) --- @@ -104,6 +104,7 @@ df = get_df( record_path=["data"], # path to the JSON list normalize=True, # optional: flatten nested objects paginator=None, # optional: pagination strategy (see below) + output="pandas", # pandas|spark (default=pandas) ) # -> pandas.DataFrame ``` @@ -112,6 +113,7 @@ df = get_df( - Default: `record_path` points to the array payload (for example `["data"]`). - `normalize=True` delegates to `json_normalize` for deeper structures. +- `output='spark'` (plus an optional `session=SparkSession`) converts the normalized result into a `pyspark.sql.DataFrame`. Additional backends will reuse the same parameter. ### Pagination @@ -262,6 +264,7 @@ fft run . --env dev --select dim_countries_from_api --http-cache ro - **Never cache secrets:** provide tokens via headers; the response body and metadata are cached. - **Use `--offline` in CI** for deterministic tests with a pre-seeded cache. - **Set TTL intentionally** when APIs change frequently. 
+- **Scope engine-specific variants** with `engine_model(only=...)` so each execution backend registers only the models it can run (pair with SQL `config(engines=[...])` when duplicating logical names). --- diff --git a/docs/Config_and_Macros.md b/docs/Config_and_Macros.md index dcf3b05..6c1cd5d 100644 --- a/docs/Config_and_Macros.md +++ b/docs/Config_and_Macros.md @@ -16,9 +16,9 @@ For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [ ## Docs Navigation 1. [Getting Started](./index.md) -2. [User Guide](./Technical_Overview.md#part-i--operational-guide) +2. [User Guide](./Technical_Overview.md#part-i-operational-guide) 3. **Modeling Reference** — you are here (`Config_and_Macros.md`) -4. [Developer Guide](./Technical_Overview.md#part-ii--architecture--internals) +4. [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) --- @@ -31,10 +31,10 @@ For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [ - [1.3 Seeds, sources, and dependencies](#13-seeds-sources-and-dependencies) - [2. `config()` options](#2-config-options) - [3. Variables with `var()`](#3-variables-with-var) -- [4. Template context & helpers](#4-template-context--helpers) -- [5. Macros & reusable Jinja code](#5-macros--reusable-jinja-code) +- [4. Template context & helpers](#4-template-context-helpers) +- [5. Macros & reusable Jinja code](#5-macros-reusable-jinja-code) - [6. Materialization semantics](#6-materialization-semantics) -- [7. Testing & quality gates](#7-testing--quality-gates) +- [7. Testing & quality gates](#7-testing-quality-gates) - [8. Quick cheat sheet](#8-quick-cheat-sheet) --- @@ -86,6 +86,26 @@ def enrich(df: pd.DataFrame) -> pd.DataFrame: return out ``` +#### Engine-scoped registration + +When the same project supports multiple execution backends, use `engine_model` to register a Python model only for specific engines. 
The decorator wraps `@model` but bails out early if the active engine (from `FF_ENGINE` or the selected profile) is not allowed. + +```python +from fastflowtransform import engine_model +import pandas as pd + +@engine_model( + only=("duckdb", "postgres"), + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + ... +``` + +Allowed values are case-insensitive strings or tuples. If the engine does not match, the function is left undecorated and no node is created, preventing duplicate registrations across engine-specific folders. + ### 1.3 Seeds, sources, and dependencies - Declare external tables in `sources.yml`; they become available via `source('group','table')`. @@ -95,6 +115,27 @@ def enrich(df: pd.DataFrame) -> pd.DataFrame: - Python models → use the decorator’s `deps`. - Additional runtime dependencies can be expressed via `relation_for()`. +> **Warning:** SQL dependency detection is static. Only literal calls such as `ref('users.ff')` are registered. When you need to gate a dependency behind a variable, materialise the options in a mapping (`{'foo': ref('foo'), 'bar': ref('bar')}`) and pick from that map at runtime; a bare `ref(variable)` will not show up in the DAG. + +- Persistence (e.g. Spark/Databricks): configure default targets under `project.yml → models.storage` (and optionally `seeds.storage`). Example: + + ```yaml + models: + storage: + api_users_http: + path: ".local/spark/api_users_http" + format: delta + options: + mergeSchema: true + + seeds: + storage: + users: + path: ".local/spark/seeds/users" + ``` + + Entries end up in `node.meta["storage"]` (keys: `path`, `format`, `options`) and are respected by the matching executor. 
+ ```yaml # sources.yml version: 2 @@ -146,12 +187,14 @@ Supported keys (v0.1): |----------------|-----------------|------------------------------------------------------------------------------| | `materialized` | `"table" \| "view" \| "ephemeral"` | Controls how FastFlowTransform persists the model. See [Materialization semantics](#6-materialization-semantics). | | `tags` | `list[str]` | Arbitrary labels surfaced in docs / selection tooling. | +| `engines` | `list[str]` or `str` | Restrict registration to the listed engines (case-insensitive). Requires the active engine to be known (profile selection or `FF_ENGINE`). | | (future) | – | Additional metadata is stored under `node.meta[...]` if added later. | **Tips** - Place `config()` before any SQL text. - Use tags to power custom filters in docs or to drive test selection. +- Combine `engines=[...]` with per-engine subfolders to keep one physical file per backend without name clashes. When no engine is active, FastFlowTransform raises a clear error to avoid silent skips. - Ephemeral models inline into downstream SQL; pick `view` for shareable logic without materializing a table. --- @@ -331,7 +374,7 @@ fft utest . --env dev fft utest . --model users_enriched --case flags_gmail ``` -See the [Technical Overview](./Technical_Overview.md#model-unit-tests-fastflowtransform-utest) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). +See the [Technical Overview](./Technical_Overview.md#model-unit-tests-fft-utest) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). --- diff --git a/docs/Profiles.md b/docs/Profiles.md new file mode 100644 index 0000000..4ff4454 --- /dev/null +++ b/docs/Profiles.md @@ -0,0 +1,132 @@ +# Profiles Configuration + +FastFlowTransform uses `profiles.yml` to describe how each environment connects to the execution engine (DuckDB, Postgres, BigQuery, Databricks Spark, Snowflake Snowpark, …). 
This document covers file layout, supported features, environment overrides, and loading precedence. + +## File Location + +`profiles.yml` lives at the project root (same level as `models/`, `project.yml`). The CLI loads it whenever you run `fft` commands (seed/run/test/dag/utest/docgen …). + +``` +project/ +├── models/ +├── project.yml +└── profiles.yml +``` + +## Basic Structure + +The file is parsed as YAML after optional Jinja rendering. Top-level keys represent profile “names” (e.g. `dev`, `prod`, `dev_postgres`). Each profile must include an `engine` plus engine-specific configuration. + +```yaml +dev: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" + +stg: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +prod: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET') }}" + location: EU + +default: + engine: duckdb + duckdb: + path: ":memory:" +``` + +### Engines and Sections + +Supported engines and their expected sections: + +| Engine | Section | Key Fields | +|----------------------|--------------------|---------------------------------------------------| +| `duckdb` | `duckdb` | `path` (file path or `:memory:`) | +| `postgres` | `postgres` | `dsn`, `db_schema` | +| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location` | +| `databricks_spark` | `databricks_spark` | `master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `database`, `table_format`, `table_options` | +| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `db_schema`, optional `role` | + +Each profile can define its own `vars:` block (values exposed via `var('key')` inside templates). + +## Environment Variables + +`profiles.yml` supports Jinja expressions. 
The helper `env('FF_VAR', 'fallback')` reads process environment variables and substitutes the default if unset. Examples: + +```yaml +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'analytics') }}" +``` + +These expressions are rendered *before* YAML parsing. If the environment variable is missing and no default is provided, the expression resolves to an empty string and validation will fail with a clear error message. + +## Loading Order & Precedence + +When running `fft` commands, `_load_dotenv_layered()` loads `.env` files in ascending precedence: + +1. `<repo>/.env` +2. `<project>/.env` +3. `<project>/.env.local` +4. `<project>/.env.<env>` +5. `<project>/.env.<env>.local` + +Earlier values fill defaults; later files override earlier ones *only for keys that are not already defined*. **Values set in the shell (e.g. via `FF_ENGINE=duckdb fft run …`) have highest priority**—they remain untouched, even if `.env` files define the same key. + +After `.env` loading, `profiles.yml` is rendered with Jinja (using the current `os.environ`) and parsed by Pydantic. Validation ensures required fields are present for each engine and produces human-readable errors for missing DSNs, schemas, etc. + +## Selecting Profiles + +- **Via `--env` flag**: `fft run . --env dev_postgres` +- **Via `FFT_ACTIVE_ENV`**: set in shell or `.env` to choose the active profile name. +- **Legacy `FF_ENGINE`** (overrides `engine` field post-parse): useful for quick experiments but explicit `profiles.yml` entries are preferred. + +Example Makefile snippet that switches profiles without exposing secrets: + +```make +ENGINE ?= duckdb + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres +endif + +seed: + FFT_ACTIVE_ENV=$(PROFILE_ENV) uv run fft seed . 
--env $(PROFILE_ENV) +``` + +## Using `.env` for Secrets + +Keep sensitive credentials out of VCS by storing them in `.env` files referenced above: + +``` +examples/api_demo/ +├── .env.dev_duckdb # FF_DUCKDB_PATH=... +├── .env.dev_postgres # FF_PG_DSN=..., FF_PG_SCHEMA=... +├── .env.dev_databricks # FF_SPARK_MASTER=..., FF_SPARK_APP_NAME=... +└── profiles.yml +``` + +These files stay out of git (via `.gitignore`), while `profiles.yml` contains only non-sensitive wiring. + +## Summary of Features + +- Multiple profiles in a single YAML file. +- Jinja templating with `env()` helper for dynamic values. +- `.env` layered loading with shell overrides taking precedence. +- Validation for engine-specific parameters (clear error messages). +- Profile-specific `vars` exposed to Jinja `var()` function in models. +- Works seamlessly across CLI commands: seed, run, dag, test, docgen, utest. + +Keep `profiles.yml` declarative, `.env` files secret, and use CLI or Makefiles to select the active profile per run. This pattern scales from local DuckDB demos to production Postgres/BigQuery/Snowflake deployments. diff --git a/docs/Project_Config.md b/docs/Project_Config.md new file mode 100644 index 0000000..adb1571 --- /dev/null +++ b/docs/Project_Config.md @@ -0,0 +1,99 @@ +# Project Configuration (`project.yml`) + +`project.yml` defines global metadata, documentation, variables, and data-quality tests for a FastFlowTransform project. This reference walks through the supported sections and common patterns. + +## File Location + +`project.yml` lives at the root of your project. 
+ +``` +project/ +├── models/ +├── project.yml +└── profiles.yml +``` + +## Top-Level Keys + +```yaml +name: my_project +version: "0.1" +models_dir: models # optional, defaults to "models" + +docs: + dag_dir: site/dag # output for fft dag --html + models: + users: + description: "Raw users table" + columns: + id: "Primary key" + email: "Email address" + +vars: + snapshot_day: "2024-01-01" + default_limit: 100 + +tests: + - type: not_null + table: users + column: id + tags: [batch] +``` + +### Metadata + +| Key | Description | +|-------------|-------------| +| `name` | Project identifier (used in docs/metadata). | +| `version` | Arbitrary version string. | +| `models_dir`| Relative directory containing models (`*.ff.sql` / `*.ff.py`). | + +### Documentation (`docs`) + +- `dag_dir`: where `fft dag --html` writes the static site. +- `models`: per-model descriptions and column docs surfaced in the generated DAG/docs. + +### Variables (`vars`) + +Key/value pairs accessible via `{{ var('key', default) }}` in Jinja templates. CLI overrides (`--vars key=value`) take precedence. + +### Tests (`tests`) + +Project-wide data quality checks run by `fft test`. Each test is a dict with: + +- `type`: `not_null`, `unique`, `accepted_values`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`, or reconciliation checks (`reconcile_equal`, `reconcile_diff_within`, `reconcile_ratio_within`, `reconcile_coverage`). +- `table`: target table or relation. +- `column`: required for column-based tests. +- Optional: `tags`, `severity` (`error`/`warn`), additional parameters (e.g. `values`, `min`, `max`). 
+ +Example: + +```yaml +tests: + - type: accepted_values + table: mart_users + column: status + values: [active, invited] + severity: warn + - type: reconcile_equal + name: revenue_vs_bookings + left: { table: fct_revenue, expr: "sum(amount)" } + right: { table: fct_bookings, expr: "sum(expected_amount)" } + abs_tolerance: 5.0 +``` + +## Interaction with `.env` and Profiles + +`project.yml` does not read environment variables directly. However: + +- `vars:` can reference `var('key')` defaults overridden by CLI or `.env`. +- Tests often depend on `profiles.yml` and `sources.yml` for the actual connection details. +- Makefiles may set `FFT_ACTIVE_ENV` or other `FF_*` variables influencing runs, but `project.yml` remains static. + +## Best Practices + +- Keep `project.yml` committed to version control (no secrets). +- Use `docs/` to provide richer Markdown descriptions; reference them via `columns` or `description` fields if desired. +- Organize tests by tag (`tags: [batch]`, `tags: [reconcile]`) to support selective execution: `fft test . --select tag:reconcile`. + +Refer to `docs/Data_Quality_Tests.md` for detailed test semantics and `docs/Profiles.md` for profile/env loading behavior. diff --git a/docs/Quickstart.md b/docs/Quickstart.md index 1c20f12..2973c8b 100644 --- a/docs/Quickstart.md +++ b/docs/Quickstart.md @@ -2,6 +2,16 @@ This guide walks you through creating a minimal FastFlowTransform project from scratch and running it end-to-end. +## 0. Create a skeleton (optional) + +Start with a minimal project structure: + +```bash +fft init demo_project --engine duckdb +``` + +The command is non-interactive, refuses to overwrite existing directories, and leaves inline comments that point back to the relevant docs (`Project_Config.md`, `Profiles.md`, etc.). Populate the generated files before running the steps below. + ## 1. 
Install & bootstrap ```bash diff --git a/docs/Sources.md b/docs/Sources.md new file mode 100644 index 0000000..a2477cf --- /dev/null +++ b/docs/Sources.md @@ -0,0 +1,108 @@ +# Sources Configuration + +`sources.yml` declares external tables (seeds, raw inputs, lakehouse paths) that models can reference via `{{ source('group', 'table') }}`. This document covers the schema, engine overrides, file paths, and best practices. + +## File Location + +Place `sources.yml` at your project root (same level as `models/`). Example: + +``` +project/ +├── models/ +├── sources.yml +└── seeds/ +``` + +## YAML Schema (Version 2) + +FastFlowTransform expects a dbt-style structure: + +```yaml +version: 2 +sources: + - name: raw + schema: staging # default schema for this source group + overrides: + postgres: + schema: raw_main # engine-specific default override + + tables: + - name: seed_users + identifier: seed_users # optional physical name + overrides: + duckdb: + schema: main + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" +``` + +### Fields + +| Level | Field | Description | +|----------|-------------|-------------| +| source | `name` | Logical group identifier referenced by `source('name', ...)`. | +| | `schema` | Default target schema/database for the group. | +| | `database`/`catalog` | Optional qualifiers per engine (BigQuery, Snowflake). | +| | `overrides` | Map of engine → config snippet (schema overrides, formats, locations). | +| table | `name` | Logical table name (second argument in `source()`). | +| | `identifier`| Physical name; defaults to `name` if omitted. | +| | `location` | File/path location (used with `format`). | +| | `format` | Ingestion format for engines supporting path-based sources (`delta`, `parquet`, …). | +| | `options` | Dict of format options (Spark/Databricks). | +| | `overrides` | Additional engine-specific settings merged with source-level overrides. | + +Engine-specific overrides follow this merge order: + +1. 
Source defaults (`schema`, `database`, …) +2. Source-level `overrides[engine]` +3. Table-level `overrides[engine]` + +### Engine Behavior + +- **DuckDB / Postgres / BigQuery / Snowflake**: expect `identifier` (plus `schema`/`database` where relevant). Path-based sources raise errors. +- **Databricks Spark**: supports `format` + `location`. The executor registers a temp view with optional `options` (e.g. `compression`). + +### Path-Based Sources Example + +```yaml + - name: raw_events + tables: + - name: landing + overrides: + databricks_spark: + format: json + location: "abfss://landing@storage.dfs.core.windows.net/events/*.json" + options: + multiline: true +``` + +## Referencing Sources in Models + +```sql +select id, email +from {{ source('raw', 'seed_users') }} +``` + +After rendering, the executor resolves the fully-qualified relation or path depending on the active engine. + +## Seed Integration + +When combined with `seeds/schema.yml`, you can map CSV/Parquet seeds into schemas per engine: + +```yaml +targets: + raw/users: + schema: raw + schema_by_engine: + duckdb: main + postgres: staging +``` + +## Validation & Errors + +- Missing `identifier` *and* `location` produce `KeyError` during rendering. +- Unknown source/table names raise `KeyError` with suggestions. +- Unsupported path-based sources on an engine (`location` provided but no `format`) raise descriptive `NotImplementedError`. + +Keep `sources.yml` declarative, use engine overrides for schema differences, and lean on `.env` files where credentials or URIs vary per environment. diff --git a/docs/Technical_Overview.md b/docs/Technical_Overview.md index 4d7e555..e33c1c4 100644 --- a/docs/Technical_Overview.md +++ b/docs/Technical_Overview.md @@ -10,26 +10,26 @@ ## Docs Navigation 1. [Getting Started](./index.md) -2. **User Guide** — see [Part I – Operational Guide](#part-i--operational-guide) (this document) +2. 
**User Guide** — see [Part I – Operational Guide](#part-i-operational-guide) (this document) 3. [Modeling Reference](./Config_and_Macros.md) -4. **Developer Guide** — see [Part II – Architecture & Internals](#part-ii--architecture--internals) (this document) +4. **Developer Guide** — see [Part II – Architecture & Internals](#part-ii-architecture-internals) (this document) --- ## Table of Contents - [Docs Navigation](#docs-navigation) -- [Part I – Operational Guide](#part-i--operational-guide) +- [Part I – Operational Guide](#part-i-operational-guide) - [Project Layout](#project-layout) - [Sample Models](#sample-models) - - [Seeds & Example Data](#seeds--example-data) + - [Seeds & Example Data](#seeds-example-data) - [Makefile Targets](#makefile-targets) - [CLI Flows](#cli-flows) - - [Logging & Verbosity](#logging--verbosity) - - [Model Unit Tests (`fft utest`)](#model-unit-tests-fastflowtransform-utest) + - [Logging & Verbosity](#logging-verbosity) + - [Model Unit Tests (`fft utest`)](#model-unit-tests-fft-utest) - [Troubleshooting](#troubleshooting) - [Error Codes](#error-codes) - - [Profiles & Environment Overrides](#profiles--environment-overrides) + - [Profiles & Environment Overrides](#profiles-environment-overrides) - [Parallel Scheduler (v0.3)](#parallel-scheduler-v03) - [Cache Policy (v0.3)](#cache-policy-v03) - [Fingerprint Formula (v0.3)](#fingerprint-formula-v03) @@ -37,8 +37,8 @@ - [Jinja DSL Quick Reference](#jinja-dsl-quick-reference) - [Roadmap Snapshot](#roadmap-snapshot) - [Cross-Table Reconciliations](#cross-table-reconciliations) - - [Auto-Docs & Lineage](#auto-docs--lineage) -- [Part II – Architecture & Internals](#part-ii--architecture--internals) + - [Auto-Docs & Lineage](#auto-docs-lineage) +- [Part II – Architecture & Internals](#part-ii-architecture-internals) - [Architecture Overview](#architecture-overview) - [Core Modules](#core-modules) - [`core.py`](#corepy) @@ -47,7 +47,7 @@ - [Executors](#executors) - [`validation.py`](#validationpy) 
- [`testing.py`](#testingpy) - - [`docs.py` & Templates](#docspy--templates) + - [`docs.py` & Templates](#docspy-templates) - [`seeding.py`](#seedingpy) - [CLI Implementation](#cli-implementation) - [Settings Infrastructure](#settings-infrastructure) diff --git a/docs/_scripts/gen_api.py b/docs/_scripts/gen_api.py new file mode 100644 index 0000000..1ffb2a2 --- /dev/null +++ b/docs/_scripts/gen_api.py @@ -0,0 +1,106 @@ +# docs/_scripts/gen_api.py +from __future__ import annotations +from pathlib import Path +import mkdocs_gen_files + +# ------------------------------------------------------------------- +# Konfiguration +# Falls du den Namen kennst, trag ihn hier ein; None => Auto-Detect. +PACKAGE: str | None = "fastflowtransform" # <- bei Bedarf anpassen oder auf None setzen +SRC_DIR = Path("src") +# ------------------------------------------------------------------- + + +def detect_package() -> tuple[str, Path]: + """ + Liefert (package_name, package_root_path). + Prüft zuerst src-Layout (src//__init__.py), dann Flat-Layout (/__init__.py). + """ + candidates: list[tuple[str, str, Path]] = [] + + # src-Layout + if SRC_DIR.exists(): + for p in SRC_DIR.iterdir(): + if p.is_dir() and (p / "__init__.py").exists(): + candidates.append(("src", p.name, p)) + + # Flat-Layout (im Repo-Root) + root = Path(".") + ignore = { + "docs", + "site", + "build", + "dist", + "tests", + ".git", + "venv", + ".venv", + "src", + ".mypy_cache", + ".pytest_cache", + } + for p in root.iterdir(): + if p.is_dir() and (p / "__init__.py").exists() and p.name not in ignore: + candidates.append(("flat", p.name, p)) + + if PACKAGE: + for _, name, path in candidates: + if name == PACKAGE: + return name, path + raise AssertionError( + f'Paket "{PACKAGE}" nicht gefunden. Erwartet z. B. src/{PACKAGE}/ oder {PACKAGE}/ mit __init__.py' + ) + + unique = {(name, str(path)) for _, name, path in candidates} + if not unique: + raise AssertionError( + "Kein Paket gefunden. 
Lege src//__init__.py an oder setze PACKAGE im Skript." + ) + if len(unique) > 1: + formatted = "\n".join(f"- {name} @ {path}" for name, path in sorted(unique)) + raise AssertionError( + f"Mehrere mögliche Pakete gefunden:\n{formatted}\nSetze PACKAGE explizit im Skript." + ) + name, path_str = next(iter(unique)) + return name, Path(path_str) + + +package, pkg_root = detect_package() +print(f"[gen_api] Paket erkannt: {package} | Pfad: {pkg_root}") + +generated_files: list[tuple[str, str]] = [] # (module, doc_file) + +for path in sorted(pkg_root.rglob("*.py")): + # Rausfiltern + if any(part in {"__pycache__", ".pytest_cache"} for part in path.parts): + continue + + rel = path.with_suffix("").relative_to(pkg_root) + parts = list(rel.parts) + + # Modulname bestimmen + if path.name == "__init__.py": + module = package + ("" if not parts[:-1] else "." + ".".join(parts[:-1])) + else: + module = package + "." + ".".join(parts) + + # Zielpfad (mkdocs_gen_files.open legt Ordner an) + doc_file = f"reference/{module.replace('.', '/')}.md" + generated_files.append((module, doc_file)) + + with mkdocs_gen_files.open(doc_file, "w") as f: + f.write(f"# {module}\n\n") + f.write(f"::: {module}\n") + f.write(" options:\n") + f.write(" show_signature: true\n") + f.write(" filters:\n") + f.write(' - "!^_"\n') + +# Index-Seite erzeugen +index_path = "reference/index.md" +with mkdocs_gen_files.open(index_path, "w") as f: + f.write("# API Reference\n\n") + f.write("> Auto-generated per module\n\n") + for module, doc_file in generated_files: + rel = Path(doc_file).relative_to("reference").as_posix() + f.write(f"- [{module}]({rel})\n") diff --git a/docs/examples/API_Demo.md b/docs/examples/API_Demo.md new file mode 100644 index 0000000..1a9d6d3 --- /dev/null +++ b/docs/examples/API_Demo.md @@ -0,0 +1,116 @@ +# API Demo Project + +The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local data, external APIs, and multiple execution engines. 
It highlights: + +- **Hybrid data model**: joins a local seed (`crm.users`) with live user data from JSONPlaceholder. +- **Multiple environments**: switch between DuckDB, Postgres, and Databricks Spark using `profiles.yml` + `.env.*`. +- **HTTP integration**: compare the built-in FastFlowTransform HTTP client (`api_users_http`) with a plain `requests` implementation (`api_users_requests`). +- **Offline caching & telemetry**: inspect HTTP snapshots via `run_results.json`. +- **Engine-aware registration**: scope Python models via `engine_model` and SQL models via `config(engines=[...])` so only the active engine’s nodes load. + +## Data Model + +1. **Seed staging** – `models/common/users.ff.sql` + ```sql + {{ config( + materialized='table', + tags=[ + 'example:api_demo', + 'scope:common', + 'kind:seed-consumer', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ] + ) }} + select id, email + from {{ source('crm', 'users') }}; + ``` + Consumes `sources.yml → crm.users` (seeded from `seeds/seed_users.csv`). + +2. **API enrichment** – two Python implementations under `models/engines/duckdb/`: + - `api_users_http.ff.py` uses the built-in HTTP wrapper (`fastflowtransform.api.http.get_df`) with cache/offline support. + - `api_users_requests.ff.py` uses raw `requests` for maximum flexibility. + - Wrap engine-specific callables with `engine_model(only="duckdb", ...)` to skip registration when another engine is selected. + +3. 
**Mart join** – `models/common/mart_users_join.ff.sql` + ```sql + {{ config(engines=['duckdb','postgres','databricks_spark']) }} + {% set api_users_model = var('api_users_model', 'api_users_http') %} + {% set api_users_refs = { + 'api_users_http': ref('api_users_http'), + 'api_users_requests': ref('api_users_requests') + } %} + {% set api_users_relation = api_users_refs.get(api_users_model, api_users_refs['api_users_http']) %} + with a as ( + select u.id as user_id, u.email from {{ ref('users.ff') }} u + ), + b as ( + select * from {{ api_users_relation }} + ) + select ... + ``` + Ties everything together and exposes the `var('api_users_model')` hook to choose the HTTP implementation while still keeping literal `ref('…')` calls in the template (required for DAG detection). `config(engines=[...])` keeps the SQL node registered only for the engines you list, preventing duplicate names across engine-specific folders. + + > **Warning:** The DAG builder only detects dependencies from literal `ref('model_name')` strings. A pure `ref(api_users_model)` (without the mapping shown above) compiles, but the graph would miss the edge to `api_users_http`/`api_users_requests`. + +## Profiles & Secrets + +`profiles.yml` defines per-engine profiles that reference environment variables: + +```yaml +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/api_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" +``` + +`.env.dev_*` files supply the actual values. `_load_dotenv_layered()` loads them in priority order: repo `.env` → project `.env` → `.env.<env>` (e.g. `.env.dev_duckdb`) → shell overrides (highest priority). Secrets stay out of version control. + + +## Makefile Workflow + +`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`) and wraps the main commands: + +```make +ENGINE ?= duckdb + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb +endif +...
+ +seed: + uv run fft seed "$(PROJECT)" --env $(PROFILE_ENV) +run: + env FFT_ACTIVE_ENV=$(PROFILE_ENV) ... uv run fft run ... +``` + +Common targets: + +| Target | Description | +|--------------------------|-------------| +| `make ENGINE=duckdb seed`| Materialize seeds into DuckDB. | +| `make ENGINE=postgres run`| Execute the full pipeline against Postgres. | +| `make dag` | Render documentation (`site/dag/`). | +| `make api-run` | Run only API models (uses HTTP cache). | +| `make api-offline` | Force offline mode (`FF_HTTP_OFFLINE=1`). | +| `make api-show-http` | Display HTTP snapshot metrics via `jq`. | + +HTTP tuning parameters (`FF_HTTP_ALLOWED_DOMAINS`, cache dir, timeouts) live in `.env` and are appended via `HTTP_ENV` when running commands. + +## End-to-End Demo + +1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres` or `ENGINE=databricks_spark` to switch. +2. **Seed data**: `make seed` +3. **Run pipeline**: `make run` +4. **Explore docs**: `make dag` → open `examples/api_demo/site/dag/index.html` +5. **Inspect HTTP usage**: `make api-show-http` + +This example demonstrates multi-engine configuration, environment-driven secrets, and API enrichment within FastFlowTransform. diff --git a/docs/examples/Local_Engine_Setup.md b/docs/examples/Local_Engine_Setup.md new file mode 100644 index 0000000..b64e318 --- /dev/null +++ b/docs/examples/Local_Engine_Setup.md @@ -0,0 +1,26 @@ +## Local Engine Setup + +### DuckDB + +- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). +- Create the target directory once: `mkdir -p examples/api_demo/.local`. +- Run `make ENGINE=duckdb seed run` to build the seeds and models inside the DuckDB file. + +### Postgres + +- Start a local database, e.g. via Docker: + `docker run --name fft-postgres -e POSTGRES_PASSWORD=postgres -p 5432:5432 -d postgres:15`. 
+- Set `FF_PG_DSN` in `.env.dev_postgres` (for example `postgresql+psycopg://postgres:postgres@localhost:5432/fft`) and optionally override `FF_PG_SCHEMA` (defaults to `api_demo`). + The executor ensures the schema exists via `CREATE SCHEMA IF NOT EXISTS` on first connect. +- Execute `make ENGINE=postgres seed run` to materialize seeds and models in Postgres. + +### Databricks Spark (local) + +- Install Java (JDK ≥ 17) and declare `JAVA_HOME`, for example: + `brew install openjdk@17` + `echo 'JAVA_HOME=/opt/homebrew/opt/openjdk@17' >> examples/api_demo/.env.dev_databricks`. +- Optionally tweak `FF_SPARK_MASTER` / `FF_SPARK_APP_NAME` in `.env.dev_databricks` (default: `local[*]`). +- To persist tables across separate `seed`/`run` sessions, enable the bundled Hive metastore defaults: + `FF_DBR_ENABLE_HIVE=1`, `FF_DBR_WAREHOUSE_DIR=examples/api_demo/spark-warehouse`, `FF_DBR_DATABASE=api_demo`. +- Switch the physical format by setting `FF_DBR_TABLE_FORMAT` (e.g. `delta`, requires the Delta Lake runtime); extra writer options can be supplied via `profiles.yml → databricks_spark.table_options`. +- Ensure your shell loads `.env.dev_databricks` (via `make`, `direnv`, or manual export) and run `make ENGINE=databricks_spark seed run`. diff --git a/docs/index.md b/docs/index.md index d71d65d..36754ad 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,17 +6,20 @@ Welcome! This page is your starting point for FastFlowTransform docs. 
Pick the t ## Docs Navigation - **Getting Started** — you are here (`docs/index.md`) -- [User Guide](./Technical_Overview.md#part-i--operational-guide) +- [User Guide](./Technical_Overview.md#part-i-operational-guide) - [Modeling Reference](./Config_and_Macros.md) - [Parallelism & Cache](./Cache_and_Parallelism.md) - [API calls in Python models](./Api_Models.md) - [Incremental Models](./Incremental.md) - [YAML Tests (Schema-bound)](./YAML_Tests.md) - [Data Quality Tests Reference](./Data_Quality_Tests.md) +- [Profiles & Environments](./Profiles.md) +- [Sources Declaration](./Sources.md) +- [Project Configuration](./Project_Config.md) - [State Selection (changed & results)](./State_Selection.md) - [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) -- [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs--lineage) -- [Developer Guide](./Technical_Overview.md#part-ii--architecture--internals) +- [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs-lineage) +- [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) ## Table of Contents @@ -32,15 +35,16 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t ### 1. Build & Operate Projects (Data Practitioners) - **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. +- **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, and Databricks Spark. - **Understand the project layout & CLI workflow:** see *Project Layout*, *Makefile Targets*, and *CLI Flows* in the [Technical Overview](Technical_Overview.md#project-layout). -- **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles--environment-overrides). 
-- **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fastflowtransform-utest) covers unit tests, troubleshooting tips, and exit codes. +- **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles-environment-overrides). +- **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fft-utest) covers unit tests, troubleshooting tips, and exit codes. - **Explore runnable demos:** browse the `examples/` directory in the repo; each subproject comes with its own README. ### 2. Extend FastFlowTransform (Developers & Contributors) - **Dive into architecture & core modules:** start with [Architecture Overview](Technical_Overview.md#architecture-overview) and [Core Modules](Technical_Overview.md#core-modules) for registry, DAG, executors, validation, and more. -- **Add tests & seeds:** see [Sample Models](Technical_Overview.md#sample-models), [Seeds & Example Data](Technical_Overview.md#seeds--example-data), and the unit test guide in [Model Unit Tests](Technical_Overview.md#model-unit-tests-fastflowtransform-utest). +- **Add tests & seeds:** see [Sample Models](Technical_Overview.md#sample-models), [Seeds & Example Data](Technical_Overview.md#seeds-example-data), and the unit test guide in [Model Unit Tests](Technical_Overview.md#model-unit-tests-fft-utest). - **Contribute code:** follow the workflow described in [`./Contributing.md`](./Contributing.md) and consult the module-level docs for internal APIs. - **Plan ahead:** check the roadmap snapshot in the [Technical Overview](Technical_Overview.md#roadmap-snapshot) to understand upcoming work. 
diff --git a/examples/_scripts/cleanup_env.py b/examples/_scripts/cleanup_env.py new file mode 100644 index 0000000..abc98dd --- /dev/null +++ b/examples/_scripts/cleanup_env.py @@ -0,0 +1,327 @@ +from __future__ import annotations + +import argparse +import os +import shutil +import sys +from contextlib import suppress +from pathlib import Path +from typing import Any, Iterable + +from dotenv import dotenv_values + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] +SRC_DIR = PROJECT_ROOT / "src" +if SRC_DIR.exists() and str(SRC_DIR) not in sys.path: + sys.path.insert(0, str(SRC_DIR)) + +from fastflowtransform.settings import EnvSettings, resolve_profile + + +def _log(msg: str) -> None: + print(msg) + + +def _coerce_path(value: str | None, project: Path) -> Path | None: + if not value: + return None + p = Path(value) + if not p.is_absolute(): + p = (project / p).resolve() + return p + + +def _remove_paths(paths: Iterable[Path], *, dry_run: bool) -> None: + for path in paths: + if not path: + continue + if not path.exists(): + continue + if dry_run: + _log(f"[dry-run] Would remove {path}") + continue + if path.is_dir(): + shutil.rmtree(path, ignore_errors=True) + _log(f"Removed directory {path}") + else: + with suppress(Exception): + path.unlink() + _log(f"Removed file {path}") + + +def cleanup_duckdb(*, project: Path, db_path: str | None, dry_run: bool) -> None: + candidates: list[Path] = [] + env_path = os.getenv("FF_DUCKDB_PATH") + for raw in [db_path, env_path]: + candidate = _coerce_path(raw, project) + if candidate and candidate not in candidates: + candidates.append(candidate) + wal = candidate.with_suffix(candidate.suffix + ".wal") + if wal not in candidates: + candidates.append(wal) + if candidates: + _log("Cleaning DuckDB files") + _remove_paths(candidates, dry_run=dry_run) + + +def cleanup_postgres(*, dsn: str | None, schema: str | None, dry_run: bool) -> None: + if not dsn: + raise ValueError("Postgres cleanup requires 
FF_PG_DSN or --postgres-dsn") + if not schema: + raise ValueError("Postgres cleanup requires FF_PG_SCHEMA or --postgres-schema") + if dry_run: + _log(f"[dry-run] Would drop and recreate schema '{schema}' on {dsn}") + return + from sqlalchemy import ( + create_engine, + text, + ) # local import to avoid optional dependency at import + + engine = create_engine(dsn, isolation_level="AUTOCOMMIT") + _log(f"Dropping schema '{schema}' on {dsn}") + with engine.begin() as conn: + conn.execute(text(f'DROP SCHEMA IF EXISTS "{schema}" CASCADE')) + conn.execute(text(f'CREATE SCHEMA "{schema}"')) + + +def _env_flag(name: str, default: bool = False) -> bool: + val = os.getenv(name) + if val is None: + return default + return val.strip().lower() in {"1", "true", "yes", "on"} + + +def cleanup_databricks( + *, + project: Path, + master: str | None, + app_name: str | None, + warehouse_dir: str | None, + database: str | None, + catalog: str | None, + extra_conf: dict[str, Any] | None, + use_hive: bool, + dry_run: bool, +) -> Path | None: + master = master or os.getenv("FF_DBR_MASTER", "local[*]") + app_name = app_name or os.getenv("FF_DBR_APPNAME", "cleanup") + warehouse = warehouse_dir or os.getenv("FF_DBR_WAREHOUSE_DIR") + database = database or os.getenv("FF_DBR_DATABASE") + catalog = catalog or os.getenv("FF_DBR_CATALOG") + enable_hive = use_hive or _env_flag("FF_DBR_ENABLE_HIVE", False) + + if dry_run: + _log( + "[dry-run] Would reset Databricks/Spark environment " + f"(master={master}, database={database}, warehouse={warehouse})" + ) + return None + + try: + from pyspark.sql import SparkSession + except ModuleNotFoundError as exc: + raise RuntimeError("Databricks cleanup requires pyspark to be installed") from exc + + builder = SparkSession.builder.master(master).appName(app_name) + + warehouse_path: Path | None = None + if warehouse: + warehouse_path = Path(warehouse).expanduser() + if not warehouse_path.is_absolute(): + warehouse_path = (project / warehouse_path).resolve() + 
_log(f"Resetting warehouse directory {warehouse_path}") + warehouse_path.mkdir(parents=True, exist_ok=True) + builder = builder.config("spark.sql.warehouse.dir", str(warehouse_path)) + + if catalog: + builder = builder.config("spark.sql.catalog.spark_catalog", catalog) + + if extra_conf: + for key, value in extra_conf.items(): + if value is not None: + builder = builder.config(str(key), str(value)) + + if enable_hive: + builder = builder.config("spark.sql.catalogImplementation", "hive") + builder = builder.enableHiveSupport() + + spark = builder.getOrCreate() + try: + if database: + _log(f"Dropping database `{database}`") + spark.sql(f"DROP DATABASE IF EXISTS `{database}` CASCADE") + spark.sql(f"CREATE DATABASE `{database}`") + elif catalog: + _log(f"Clearing catalog `{catalog}` tables") + tables = spark.sql(f"SHOW TABLES IN `{catalog}`").collect() + for row in tables: + db = row["database"] + tbl = row["tableName"] + if db: + spark.sql(f"DROP TABLE IF EXISTS `{catalog}`.`{db}`.`{tbl}`") + else: + spark.sql(f"DROP TABLE IF EXISTS `{catalog}`.`{tbl}`") + finally: + with suppress(Exception): + spark.stop() + + if warehouse_path: + _log(f"Resetting warehouse directory {warehouse_path}") + shutil.rmtree(warehouse_path, ignore_errors=True) + warehouse_path.mkdir(parents=True, exist_ok=True) + return warehouse_path + + +def cleanup_common_artifacts( + *, project: Path, dry_run: bool, extra_paths: Iterable[Path] | None = None +) -> None: + targets = [ + project / ".fastflowtransform", + project / "docs", + project / "site", + project / "dist", + project / "build", + project / ".local", + ] + if extra_paths: + for path in extra_paths: + if path: + targets.append(Path(path)) + _remove_paths(targets, dry_run=dry_run) + extra = [p for p in project.glob("*.egg-info")] + _remove_paths(extra, dry_run=dry_run) + + +def _load_dotenv_layered(project_dir: Path, env_name: str) -> None: + original_env = dict(os.environ) + merged: dict[str, str] = {} + + def _merge(p: Path) -> None: + 
try: + if not p.exists(): + return + data = dotenv_values(p) + for key, value in (data or {}).items(): + if value is not None: + merged[key] = value + except Exception: + pass + + _merge(Path.cwd() / ".env") + _merge(project_dir / ".env") + _merge(project_dir / ".env.local") + _merge(project_dir / f".env.{env_name}") + _merge(project_dir / f".env.{env_name}.local") + + for key, value in merged.items(): + if key not in original_env and value is not None: + os.environ.setdefault(key, value) + + +def _load_profile(project: Path, env_name: str, engine: str | None): + env_settings = EnvSettings() + if engine: + env_settings = env_settings.model_copy(update={"ENGINE": engine}) + env_settings = env_settings.model_copy(update={"ENV": env_name}) + return resolve_profile(project, env_name, env_settings) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Reset FastFlowTransform example environments.") + parser.add_argument( + "--engine", required=True, choices=["duckdb", "postgres", "databricks_spark"] + ) + parser.add_argument("--project", default=".") + parser.add_argument("--env", help="Profile environment name (e.g. dev_duckdb).") + parser.add_argument("--duckdb-path") + parser.add_argument("--postgres-dsn") + parser.add_argument("--postgres-schema") + parser.add_argument("--spark-master") + parser.add_argument("--spark-app-name") + parser.add_argument("--spark-warehouse") + parser.add_argument("--spark-database") + parser.add_argument("--spark-catalog") + parser.add_argument( + "--spark-use-hive", action="store_true", help="Force Hive metastore enablement for cleanup." 
+ ) + parser.add_argument("--dry-run", action="store_true") + parser.add_argument( + "--skip-artifacts", + action="store_true", + help="Do not remove local artifacts (.fastflowtransform, docs, site, .local).", + ) + + args = parser.parse_args(argv) + project = Path(args.project).expanduser().resolve() + + env_name = ( + args.env + or os.getenv("FFT_ACTIVE_ENV") + or ("dev_" + args.engine if args.engine in {"duckdb", "postgres"} else "dev") + ) + + try: + os.environ["FFT_ACTIVE_ENV"] = env_name + _load_dotenv_layered(project, env_name) + profile = _load_profile(project, env_name, args.engine) + + warehouse_path: Path | None = None + if args.engine == "duckdb": + profile_duckdb = getattr(getattr(profile, "duckdb", None), "path", None) + db_path = args.duckdb_path or os.getenv("FF_DUCKDB_PATH") or profile_duckdb + cleanup_duckdb(project=project, db_path=db_path, dry_run=args.dry_run) + elif args.engine == "postgres": + profile_pg = getattr(profile, "postgres", None) + profile_dsn = getattr(profile_pg, "dsn", None) + profile_schema = getattr(profile_pg, "db_schema", None) + dsn = args.postgres_dsn or os.getenv("FF_PG_DSN") or profile_dsn + schema = args.postgres_schema or os.getenv("FF_PG_SCHEMA") or profile_schema + cleanup_postgres(dsn=dsn, schema=schema, dry_run=args.dry_run) + elif args.engine == "databricks_spark": + profile_db = getattr(profile, "databricks_spark", None) + profile_master = getattr(profile_db, "master", None) + profile_app = getattr(profile_db, "app_name", None) + profile_warehouse = getattr(profile_db, "warehouse_dir", None) + profile_database = getattr(profile_db, "database", None) + profile_catalog = getattr(profile_db, "catalog", None) + profile_use_hive = getattr(profile_db, "use_hive_metastore", False) + profile_extra_conf = getattr(profile_db, "extra_conf", None) + warehouse_path = cleanup_databricks( + project=project, + master=args.spark_master or profile_master, + app_name=args.spark_app_name or profile_app, + 
warehouse_dir=args.spark_warehouse or profile_warehouse, + database=args.spark_database or profile_database, + catalog=args.spark_catalog or profile_catalog, + extra_conf=profile_extra_conf, + use_hive=args.spark_use_hive or bool(profile_use_hive), + dry_run=args.dry_run, + ) + except Exception as exc: + _log(f"Cleanup failed: {exc}") + return 1 + + if not args.skip_artifacts: + extra_paths: list[Path] = [] + if args.engine == "databricks_spark": + configured = ( + args.spark_warehouse + or os.getenv("FF_DBR_WAREHOUSE_DIR") + or getattr(getattr(profile, "databricks_spark", None), "warehouse_dir", None) + ) + if configured: + p = Path(configured).expanduser() + if not p.is_absolute(): + p = (project / p).resolve() + extra_paths.append(p) + if warehouse_path: + extra_paths.append(warehouse_path) + extra_paths.append((project / "spark-warehouse").resolve()) + cleanup_common_artifacts(project=project, dry_run=args.dry_run, extra_paths=extra_paths) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/api_demo/Makefile b/examples/api_demo/Makefile index f4e0d63..965c1d8 100644 --- a/examples/api_demo/Makefile +++ b/examples/api_demo/Makefile @@ -5,10 +5,13 @@ # --- Config ------------------------------------------------------------------- # DuckDB database file and project path (for the API demo typically examples/api_demo) -DB ?= .local/demo.duckdb +DB ?= .local/api_demo.duckdb PROJECT ?= . 
UV ?= uv +# Engine selector (duckdb|postgres|databricks_spark) +ENGINE ?= duckdb + # HTTP wrapper defaults (override per call if needed) # Allowed domains are comma-separated (no https://) FF_HTTP_ALLOWED_DOMAINS ?= jsonplaceholder.typicode.com,api.github.com @@ -25,44 +28,52 @@ else OPENER := xdg-open endif -# --- Shared env for all runs (DuckDB + HTTP) --------------------------------- -RUN_ENV = FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" \ - FF_HTTP_ALLOWED_DOMAINS="$(FF_HTTP_ALLOWED_DOMAINS)" \ - FF_HTTP_CACHE_DIR="$(FF_HTTP_CACHE_DIR)" \ - FF_HTTP_MAX_RPS="$(FF_HTTP_MAX_RPS)" \ - FF_HTTP_MAX_RETRIES="$(FF_HTTP_MAX_RETRIES)" \ - FF_HTTP_TIMEOUT="$(FF_HTTP_TIMEOUT)" -# Engine env ifeq ($(ENGINE),duckdb) - ENGINE_ENV = FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" + PROFILE_ENV = dev_duckdb ENGINE_TAG = engine:duckdb endif ifeq ($(ENGINE),postgres) - ENGINE_ENV = FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" + PROFILE_ENV = dev_postgres ENGINE_TAG = engine:postgres endif ifeq ($(ENGINE),databricks_spark) - ENGINE_ENV = FF_ENGINE=databricks_spark FF_SPARK_MASTER="$(FF_SPARK_MASTER)" + PROFILE_ENV = dev_databricks ENGINE_TAG = engine:databricks_spark endif -# Select only common + this engine; keeps DAG clean and avoids executing foreign variants -SELECT = tag:example:api_demo,tag:scope:common,tag:$(ENGINE_TAG) + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +RUN_ENV = $(BASE_ENV) + +# Select only API demo models for the active engine (common models carry all engine tags) +SELECT_FLAGS = --select tag:example:api_demo --select tag:$(ENGINE_TAG) + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py + +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" --duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq 
($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) +endif # --- Standard R1 targets ------------------------------------------------------ seed: - $(ENGINE_ENV) $(UV) run fft seed "$(PROJECT)" --env dev + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) run: - $(ENGINE_ENV) $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) test: - $(ENGINE_ENV) $(UV) run fft test "$(PROJECT)" --env dev + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) dag: - $(ENGINE_ENV) $(UV) run fft dag "$(PROJECT)" --env dev --html + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html artifacts: @echo @@ -71,35 +82,23 @@ artifacts: @echo " DAG HTML: $(PROJECT)/site/dag/index.html" incr: - $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select fct_events_inc.ff --cache rw || true + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select fct_events_inc.ff --cache rw || true state-mod: @if [ -f "$(PROJECT)/models/users.ff.sql" ]; then touch "$(PROJECT)/models/users.ff.sql"; fi - $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --cache rw --select state:modified state-mod-plus: - $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified+ + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --cache rw --select state:modified+ res-error: - $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select result:error || true + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select result:error || true res-warn: - $(RUN_ENV) $(UV) run fft run 
"$(PROJECT)" --env dev --select result:warn || true - -pg-seed: - FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" $(UV) run fft seed "$(PROJECT)" --env stg - -pg-run: - FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" \ - FF_HTTP_ALLOWED_DOMAINS="$(FF_HTTP_ALLOWED_DOMAINS)" \ - FF_HTTP_CACHE_DIR="$(FF_HTTP_CACHE_DIR)" \ - FF_HTTP_MAX_RPS="$(FF_HTTP_MAX_RPS)" \ - FF_HTTP_MAX_RETRIES="$(FF_HTTP_MAX_RETRIES)" \ - FF_HTTP_TIMEOUT="$(FF_HTTP_TIMEOUT)" \ - $(UV) run fft run "$(PROJECT)" --env stg + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select result:warn || true clean: - rm -rf .local "$(PROJECT)/docs" dist build *.egg-info .fastflowtransform + $(CLEAN_CMD) demo-open: @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ @@ -110,7 +109,7 @@ demo-open: demo: clean @echo "== 🚀 R1 Demo (DuckDB) ==" - @echo "DB=$(DB) PROJECT=$(PROJECT)" + @echo "Profile=$(PROFILE_ENV) DB=$(DB) PROJECT=$(PROJECT)" +$(MAKE) seed +$(MAKE) run +$(MAKE) dag @@ -134,13 +133,13 @@ demo: clean # --- API-specific convenience targets ---------------------------------------- api-run: - $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select "kind:python" --cache rw + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select "kind:python" --cache rw api-warm: +$(MAKE) api-run api-offline: - $(RUN_ENV) FF_HTTP_OFFLINE=1 $(UV) run fft run "$(PROJECT)" --env dev --select "kind:python" --cache rw + env $(RUN_ENV) FF_HTTP_OFFLINE=1 $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select "kind:python" --cache rw api-cache-clear: rm -rf "$(FF_HTTP_CACHE_DIR)" @@ -156,7 +155,7 @@ api-show-http: api-demo: clean @echo "== 🌐 API Demo (DuckDB) ==" - @echo "DB=$(DB) PROJECT=$(PROJECT)" + @echo "Profile=$(PROFILE_ENV) DB=$(DB) PROJECT=$(PROJECT)" +$(MAKE) run +$(MAKE) dag +$(MAKE) api-show-http || true diff --git a/examples/api_demo/models/common/mart_users_join.ff.sql 
b/examples/api_demo/models/common/mart_users_join.ff.sql index 9ee550c..0197db7 100644 --- a/examples/api_demo/models/common/mart_users_join.ff.sql +++ b/examples/api_demo/models/common/mart_users_join.ff.sql @@ -1,17 +1,33 @@ -{{ config(materialized='table', tags=['example:api_demo','scope:common']) }} +{{ config( + materialized='table', + tags=[ + 'example:api_demo', + 'scope:common', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} {# Choose the producing model by variable. Default is the pandas HTTP version. #} {% set api_users_model = var('api_users_model', 'api_users_http') %} +{# materialize literal refs so the loader sees them #} +{% set _api_users_refs = { + 'api_users_http': ref('api_users_http'), + 'api_users_requests': ref('api_users_requests') +} %} + +{% set api_users_relation = _api_users_refs.get(api_users_model, _api_users_refs['api_users_http']) %} + + -- Join local seed users with API users by email (demo-only; real keys will differ) with a as ( select u.id as user_id, u.email from {{ ref('users.ff') }} u ), b as ( - -- Choose one of the API models: - -- select * from {{ ref('api_users_http') }} - select * from {{ ref('api_users_requests') }} + select * from {{ api_users_relation }} ) select a.user_id, diff --git a/examples/api_demo/models/common/users.ff.sql b/examples/api_demo/models/common/users.ff.sql index 639ce2d..ec68605 100644 --- a/examples/api_demo/models/common/users.ff.sql +++ b/examples/api_demo/models/common/users.ff.sql @@ -1,4 +1,14 @@ -{{ config(materialized='table', tags=['example:api_demo','scope:common','kind:seed-consumer']) }} +{{ config( + materialized='table', + tags=[ + 'example:api_demo', + 'scope:common', + 'kind:seed-consumer', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} -- Simple staging table from seed select id, email from {{ source('crm', 'users') }}; diff --git a/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py 
b/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py new file mode 100644 index 0000000..d3276e7 --- /dev/null +++ b/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py @@ -0,0 +1,33 @@ +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df +from pyspark.sql import DataFrame as SparkDataFrame +from pyspark.sql import SparkSession + + +@engine_model( + only="databricks_spark", + name="api_users_http", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:databricks_spark"], +) +def fetch(users_df: SparkDataFrame) -> SparkDataFrame: + """ + Fetch demo users via the FFT HTTP helper and return a Spark DataFrame. + Leverages get_df(..., output='spark') to stay entirely in Spark. + """ + spark = ( + users_df.sparkSession + if isinstance(users_df, SparkDataFrame) + else SparkSession.getActiveSession() + ) + if spark is None: + spark = SparkSession.builder.getOrCreate() + + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + output="spark", + session=spark, + ) + return df.select("id", "email", "username", "name").withColumnRenamed("id", "api_user_id") diff --git a/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py b/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py new file mode 100644 index 0000000..ad36b18 --- /dev/null +++ b/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py @@ -0,0 +1,45 @@ +from fastflowtransform import engine_model +from pyspark.sql import DataFrame as SparkDataFrame +from pyspark.sql import SparkSession + +try: + import requests +except Exception as _e: # pragma: no cover + raise RuntimeError("Please install 'requests' to run this model") from _e + + +@engine_model( + only="databricks_spark", + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:databricks_spark"], +) +def 
fetch(users_df: SparkDataFrame) -> SparkDataFrame: + """ + Plain requests-based HTTP fetch that returns a Spark DataFrame. + Useful when you need full control over authentication, retries, etc. + """ + spark = ( + users_df.sparkSession + if isinstance(users_df, SparkDataFrame) + else SparkSession.getActiveSession() + ) + if spark is None: + spark = SparkSession.builder.getOrCreate() + + resp = requests.get("https://jsonplaceholder.typicode.com/users", timeout=30) + resp.raise_for_status() + rows = resp.json() + + # Select a stable subset of columns and rename id -> api_user_id + projected = [ + { + "api_user_id": row.get("id"), + "email": row.get("email"), + "username": row.get("username"), + "name": row.get("name"), + } + for row in rows + ] + + return spark.createDataFrame(projected) diff --git a/examples/api_demo/models/engines/duckdb/api_users_http.ff.py b/examples/api_demo/models/engines/duckdb/api_users_http.ff.py index e75d7bc..3f900d4 100644 --- a/examples/api_demo/models/engines/duckdb/api_users_http.ff.py +++ b/examples/api_demo/models/engines/duckdb/api_users_http.ff.py @@ -1,11 +1,13 @@ -from fastflowtransform import model +from fastflowtransform import engine_model from fastflowtransform.api.http import get_df import pandas as pd -@model( +@engine_model( + only="duckdb", name="api_users_http", deps=["users.ff"], # at least one dependency is required by the executor contract + tags=["example:api_demo", "scope:engine", "engine:duckdb"], ) def fetch(_: pd.DataFrame) -> pd.DataFrame: """ diff --git a/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py b/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py index d1b731f..ec5254e 100644 --- a/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py +++ b/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py @@ -1,5 +1,5 @@ # NOTE: Plain Python variant (requests/httpx). No built-in FFT telemetry or HTTP cache here. 
-from fastflowtransform import model +from fastflowtransform import engine_model import pandas as pd try: @@ -8,9 +8,11 @@ raise RuntimeError("Please install 'requests' to run this model") from _e -@model( +@engine_model( + only="duckdb", name="api_users_requests", deps=["users.ff"], # keep a dependency for executor contract + tags=["example:api_demo", "scope:engine", "engine:duckdb"], ) def fetch(_: pd.DataFrame) -> pd.DataFrame: """ diff --git a/examples/api_demo/models/engines/postgres/api_users_http.ff.py b/examples/api_demo/models/engines/postgres/api_users_http.ff.py new file mode 100644 index 0000000..9407ae6 --- /dev/null +++ b/examples/api_demo/models/engines/postgres/api_users_http.ff.py @@ -0,0 +1,27 @@ +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df +import pandas as pd + + +@engine_model( + only="postgres", + name="api_users_http", + deps=["users.ff"], # at least one dependency is required by the executor contract + tags=["example:api_demo", "scope:engine", "engine:postgres"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + """ + Fetch users from a public demo API using the built-in HTTP wrapper. + Pros: caching, offline mode, telemetry in run_results.json. + """ + # Example endpoint (JSON Placeholder); replace with your real API. + # For paginated APIs you can add a `paginator` function. 
+ df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, # the outer JSON is already a list + normalize=True, # flatten objects to columns (address.*, company.*) + ) + + # Keep only a few columns to make joins simpler + cols = [c for c in df.columns if c in ("id", "email", "username", "name")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/api_demo/models/engines/postgres/api_users_requests.ff.py b/examples/api_demo/models/engines/postgres/api_users_requests.ff.py new file mode 100644 index 0000000..f67caec --- /dev/null +++ b/examples/api_demo/models/engines/postgres/api_users_requests.ff.py @@ -0,0 +1,35 @@ +# NOTE: Plain Python variant (requests/httpx). No built-in FFT telemetry or HTTP cache here. +from fastflowtransform import engine_model +import pandas as pd + +try: + import requests # you can swap this with httpx if you prefer +except Exception as _e: # pragma: no cover + raise RuntimeError("Please install 'requests' to run this model") from _e + + +@engine_model( + only="postgres", + name="api_users_requests", + deps=["users.ff"], # keep a dependency for executor contract + tags=["example:api_demo", "scope:engine", "engine:postgres"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + """ + Fetch users from the same demo API using plain Python code. + Pros: ultimate flexibility (custom auth, retry, shaping). + Cons: no built-in FFT telemetry or cache (unless you add it manually). 
+ """ + url = "https://jsonplaceholder.typicode.com/users" + headers = { + # Add your auth headers here if needed: + # "Authorization": f"Bearer {os.getenv('MY_TOKEN')}", + } + resp = requests.get(url, headers=headers, timeout=30) + resp.raise_for_status() + data = resp.json() # list[dict] + + # Example shaping + df = pd.DataFrame(data) + cols = [c for c in df.columns if c in ("id", "email", "username", "name")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/api_demo/profiles.yml b/examples/api_demo/profiles.yml index 1698a40..d55c6b5 100644 --- a/examples/api_demo/profiles.yml +++ b/examples/api_demo/profiles.yml @@ -1,21 +1,39 @@ -default: - dev: - engine: "{{ env('FF_ENGINE' }}" +# profiles.yml for API demo +# Actual connection details are sourced from environment variables (preferably via .env files). - # Router variable binds "api_users_model" to the *canonical* model name - # We keep the same value for all engines here (api_users_http), but it could differ - # (e.g. "api_users_requests" or a parquet loader) per engine in real projects. 
- vars: - api_users_model: "api_users_http" +dev_duckdb: + engine: duckdb + vars: + api_users_model: "api_users_http" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/api_demo.duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH') }}" +dev_postgres: + engine: postgres + vars: + api_users_model: "api_users_http" + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" - postgres: - # Safe default DSN for local dev; override via FF_PG_DSN in CI/Prod - dsn: "{{ env('FF_PG_DSN') }}" - db_schema: "{{ env('FF_PG_SCHEMA') }}" +dev_databricks: + engine: databricks_spark + vars: + api_users_model: "api_users_http" + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'api_demo') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + extra_conf: + spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" + spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" + spark.hadoop.datanucleus.schema.autoCreateAll: "true" + spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" - databricks_spark: - master: "{{ env('FF_SPARK_MASTER') }}" - app_name: "api_demo" +default: + engine: duckdb + vars: + api_users_model: "api_users_http" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" diff --git a/examples/api_demo/project.yml b/examples/api_demo/project.yml index b84f5a8..c6788e5 100644 --- a/examples/api_demo/project.yml +++ b/examples/api_demo/project.yml @@ -3,8 +3,29 @@ version: "0.1" vars: {} +models: + storage: + users: + path: ".local/spark/users" + format: parquet + api_users_http: + path: ".local/spark/api_users_http" + format: parquet + api_users_requests: + path: ".local/spark/api_users_requests" + format: parquet + 
mart_users_join.ff: + path: ".local/spark/mart_users_join" + format: parquet + +seeds: + storage: + seed_users: + path: ".local/spark/seed_users" + format: parquet + tests: - # Batch‑Tabellen + # Batch tables - type: not_null table: mart_users_join column: user_id diff --git a/examples/api_demo/site/dag/api_users_http.html b/examples/api_demo/site/dag/api_users_http.html index d4a491a..ef69b8d 100644 --- a/examples/api_demo/site/dag/api_users_http.html +++ b/examples/api_demo/site/dag/api_users_http.html @@ -90,7 +90,7 @@

Metadata

Path
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/api_users_http.ff.py + /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py
@@ -144,7 +144,7 @@

Columns

api_user_id - BIGINT + bigint yes @@ -157,20 +157,14 @@

Columns

- - ?.id - - transformed - - - + unknown - name - VARCHAR + email + string yes @@ -190,7 +184,7 @@

Columns

username - VARCHAR + string yes @@ -209,8 +203,8 @@

Columns

- email - VARCHAR + name + string yes diff --git a/examples/api_demo/site/dag/api_users_requests.html b/examples/api_demo/site/dag/api_users_requests.html index e52fa3d..429b215 100644 --- a/examples/api_demo/site/dag/api_users_requests.html +++ b/examples/api_demo/site/dag/api_users_requests.html @@ -90,7 +90,7 @@

Metadata

Path
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/api_users_requests.ff.py + /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py
@@ -144,7 +144,7 @@

Columns

api_user_id - BIGINT + bigint yes @@ -157,20 +157,14 @@

Columns

- - ?.id - - transformed - - - + unknown - name - VARCHAR + email + string yes @@ -189,8 +183,8 @@

Columns

- username - VARCHAR + name + string yes @@ -209,8 +203,8 @@

Columns

- email - VARCHAR + username + string yes diff --git a/examples/api_demo/site/dag/index.html b/examples/api_demo/site/dag/index.html index 7a50925..5e75b72 100644 --- a/examples/api_demo/site/dag/index.html +++ b/examples/api_demo/site/dag/index.html @@ -132,9 +132,9 @@

DAG

class mart_users_join_ff sql; users_ff["users.ff
(users)"] class users_ff sql; - users_ff --> mart_users_join_ff api_users_http --> mart_users_join_ff api_users_requests --> mart_users_join_ff + users_ff --> mart_users_join_ff users_ff --> api_users_requests users_ff --> api_users_http @@ -203,7 +203,7 @@

Models

sql mart_users_join - users.ff, api_users_http, api_users_requests + api_users_http, api_users_requests, users.ff table diff --git a/examples/api_demo/site/dag/mart_users_join.ff.html b/examples/api_demo/site/dag/mart_users_join.ff.html index 7913056..6562004 100644 --- a/examples/api_demo/site/dag/mart_users_join.ff.html +++ b/examples/api_demo/site/dag/mart_users_join.ff.html @@ -90,7 +90,7 @@

Metadata

Path
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/mart_users_join.ff.sql + /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/common/mart_users_join.ff.sql
@@ -99,12 +99,12 @@

Metadata

@@ -139,7 +139,7 @@

Columns

user_id - BIGINT + bigint yes @@ -165,7 +165,7 @@

Columns

email - VARCHAR + string yes @@ -191,7 +191,7 @@

Columns

api_user_id - BIGINT + bigint yes @@ -211,7 +211,7 @@

Columns

username - VARCHAR + string yes @@ -231,7 +231,7 @@

Columns

name - VARCHAR + string yes diff --git a/examples/api_demo/site/dag/users.ff.html b/examples/api_demo/site/dag/users.ff.html index ae430a3..2915cde 100644 --- a/examples/api_demo/site/dag/users.ff.html +++ b/examples/api_demo/site/dag/users.ff.html @@ -90,7 +90,7 @@

Metadata

Path
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/users.ff.sql + /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/common/users.ff.sql
@@ -144,7 +144,7 @@

Columns

id - BIGINT + bigint yes @@ -170,7 +170,7 @@

Columns

email - VARCHAR + string yes diff --git a/examples/basic_demo/README.md b/examples/basic_demo/README.md new file mode 100644 index 0000000..5e977f7 --- /dev/null +++ b/examples/basic_demo/README.md @@ -0,0 +1,7 @@ +# FastFlowTransform project scaffold + +This project was created with `fft init`. +Next steps: +1. Update `profiles.yml` with real connection details (docs/Profiles.md). +2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). +3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). diff --git a/examples/basic_demo/models/README.md b/examples/basic_demo/models/README.md new file mode 100644 index 0000000..32818bb --- /dev/null +++ b/examples/basic_demo/models/README.md @@ -0,0 +1,4 @@ +# Models directory + +Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here. +See docs/Config_and_Macros.md for modeling guidance and config options. diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml new file mode 100644 index 0000000..ca66403 --- /dev/null +++ b/examples/basic_demo/profiles.yml @@ -0,0 +1,13 @@ +# Profiles generated by `fft init`. +# Update these placeholders as described in docs/Profiles.md. +dev: + engine: duckdb + # DuckDB profile example. See docs/Profiles.md#duckdb for details. + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" # Path to your DuckDB database file. + +# Default in-memory profile for quick experiments. +default: + engine: duckdb + duckdb: + path: ":memory:" diff --git a/examples/basic_demo/project.yml b/examples/basic_demo/project.yml new file mode 100644 index 0000000..4d8ded1 --- /dev/null +++ b/examples/basic_demo/project.yml @@ -0,0 +1,18 @@ +# Project configuration generated by `fft init`. +# Read docs/Project_Config.md for the complete reference. 
+name: basic_demo +version: "0.1" +models_dir: models + +docs: + # Adjust `dag_dir` to change where `fft dag --html` writes documentation (docs/Technical_Overview.md#documentation). + dag_dir: site/dag + +# Project-level variables accessible via {{ var('key') }} inside models. +# Example: +# vars: +# run_date: "2024-01-01" +vars: {} + +# Declare project-wide data quality checks under `tests`. See docs/Data_Quality_Tests.md. +tests: [] diff --git a/examples/basic_demo/seeds/README.md b/examples/basic_demo/seeds/README.md new file mode 100644 index 0000000..2e553ed --- /dev/null +++ b/examples/basic_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory + +Add CSV or Parquet files for reproducible seeds. +Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. diff --git a/examples/basic_demo/sources.yml b/examples/basic_demo/sources.yml new file mode 100644 index 0000000..cf52a95 --- /dev/null +++ b/examples/basic_demo/sources.yml @@ -0,0 +1,9 @@ +# Source declarations describe external tables. See docs/Sources.md for details. +version: 2 +sources: + # Example: + # - name: raw + # schema: staging + # tables: + # - name: users + # identifier: seed_users diff --git a/examples/basic_demo/tests/unit/README.md b/examples/basic_demo/tests/unit/README.md new file mode 100644 index 0000000..b3c3c8d --- /dev/null +++ b/examples/basic_demo/tests/unit/README.md @@ -0,0 +1,4 @@ +# Unit tests + +Define YAML unit specs as described in docs/Config_and_Macros.md#73-model-unit-tests-fft-utest. +Invoke them with `fft utest --env `. 
diff --git a/examples/postgres/.fastflowtransform/target/catalog.json b/examples/postgres/.fastflowtransform/target/catalog.json index 28dcb5f..833aa53 100644 --- a/examples/postgres/.fastflowtransform/target/catalog.json +++ b/examples/postgres/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-28T19:05:08+00:00", + "generated_at": "2025-10-30T18:29:08+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/examples/postgres/.fastflowtransform/target/manifest.json b/examples/postgres/.fastflowtransform/target/manifest.json index 503cc94..1a5eed6 100644 --- a/examples/postgres/.fastflowtransform/target/manifest.json +++ b/examples/postgres/.fastflowtransform/target/manifest.json @@ -1,7 +1,7 @@ { "macros": {}, "metadata": { - "generated_at": "2025-10-28T19:05:08+00:00", + "generated_at": "2025-10-30T18:29:08+00:00", "tool": "fastflowtransform" }, "nodes": { diff --git a/examples/postgres/.fastflowtransform/target/run_results.json b/examples/postgres/.fastflowtransform/target/run_results.json index e1f3625..36067fb 100644 --- a/examples/postgres/.fastflowtransform/target/run_results.json +++ b/examples/postgres/.fastflowtransform/target/run_results.json @@ -1,55 +1,55 @@ { "metadata": { - "generated_at": "2025-10-28T19:05:08+00:00", + "generated_at": "2025-10-30T18:29:08+00:00", "tool": "fastflowtransform" }, "results": [ { "duration_ms": 0, - "finished_at": "2025-10-28T19:05:08+00:00", + "finished_at": "2025-10-30T18:29:08+00:00", "http": null, "message": null, "name": "mart_orders_enriched", - "started_at": "2025-10-28T19:05:08+00:00", + "started_at": "2025-10-30T18:29:08+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-28T19:05:08+00:00", + "finished_at": "2025-10-30T18:29:08+00:00", "http": null, "message": null, "name": "mart_users.ff", - "started_at": "2025-10-28T19:05:08+00:00", + "started_at": "2025-10-30T18:29:08+00:00", "status": "success" }, { "duration_ms": 0, - 
"finished_at": "2025-10-28T19:05:08+00:00", + "finished_at": "2025-10-30T18:29:08+00:00", "http": null, "message": null, "name": "orders.ff", - "started_at": "2025-10-28T19:05:08+00:00", + "started_at": "2025-10-30T18:29:08+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-28T19:05:08+00:00", + "finished_at": "2025-10-30T18:29:08+00:00", "http": null, "message": null, "name": "users.ff", - "started_at": "2025-10-28T19:05:08+00:00", + "started_at": "2025-10-30T18:29:08+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-28T19:05:08+00:00", + "finished_at": "2025-10-30T18:29:08+00:00", "http": null, "message": null, "name": "users_enriched", - "started_at": "2025-10-28T19:05:08+00:00", + "started_at": "2025-10-30T18:29:08+00:00", "status": "success" } ], - "run_finished_at": "2025-10-28T19:05:08+00:00", - "run_started_at": "2025-10-28T19:05:08+00:00" + "run_finished_at": "2025-10-30T18:29:08+00:00", + "run_started_at": "2025-10-30T18:29:08+00:00" } diff --git a/examples/simple_duckdb/.fastflowtransform/target/catalog.json b/examples/simple_duckdb/.fastflowtransform/target/catalog.json index 75cbf5b..4690161 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/catalog.json +++ b/examples/simple_duckdb/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-28T19:05:02+00:00", + "generated_at": "2025-10-30T18:29:03+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/examples/simple_duckdb/.fastflowtransform/target/manifest.json b/examples/simple_duckdb/.fastflowtransform/target/manifest.json index cfd2773..0aea182 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/manifest.json +++ b/examples/simple_duckdb/.fastflowtransform/target/manifest.json @@ -6,7 +6,7 @@ "upper_col": "models/macros/util.sql" }, "metadata": { - "generated_at": "2025-10-28T19:05:02+00:00", + "generated_at": "2025-10-30T18:29:03+00:00", "tool": "fastflowtransform" }, 
"nodes": { diff --git a/examples/simple_duckdb/.fastflowtransform/target/run_results.json b/examples/simple_duckdb/.fastflowtransform/target/run_results.json index 0f5c774..f257d89 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/run_results.json +++ b/examples/simple_duckdb/.fastflowtransform/target/run_results.json @@ -1,82 +1,82 @@ { "metadata": { - "generated_at": "2025-10-28T19:05:02+00:00", + "generated_at": "2025-10-30T18:29:03+00:00", "tool": "fastflowtransform" }, "results": [ { "duration_ms": 0, - "finished_at": "2025-10-28T19:05:02+00:00", + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "ephemeral_ids.ff", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { - "duration_ms": 1, - "finished_at": "2025-10-28T19:05:02+00:00", + "duration_ms": 0, + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "mart_orders_enriched", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { "duration_ms": 1, - "finished_at": "2025-10-28T19:05:02+00:00", + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "mart_users.ff", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { "duration_ms": 4, - "finished_at": "2025-10-28T19:05:02+00:00", + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "orders.ff", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { "duration_ms": 1, - "finished_at": "2025-10-28T19:05:02+00:00", + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "users.ff", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { - "duration_ms": 1, - "finished_at": "2025-10-28T19:05:02+00:00", 
+ "duration_ms": 0, + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "users_enriched", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { - "duration_ms": 2, - "finished_at": "2025-10-28T19:05:02+00:00", + "duration_ms": 1, + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "v_users.ff", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" }, { - "duration_ms": 2, - "finished_at": "2025-10-28T19:05:02+00:00", + "duration_ms": 0, + "finished_at": "2025-10-30T18:29:03+00:00", "http": null, "message": null, "name": "v_users_enriched.ff", - "started_at": "2025-10-28T19:05:02+00:00", + "started_at": "2025-10-30T18:29:03+00:00", "status": "success" } ], - "run_finished_at": "2025-10-28T19:05:02+00:00", - "run_started_at": "2025-10-28T19:05:02+00:00" + "run_finished_at": "2025-10-30T18:29:03+00:00", + "run_started_at": "2025-10-30T18:29:03+00:00" } diff --git a/mkdocs.yml b/mkdocs.yml index fe84938..0bc9890 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ theme: - search.highlight icon: repo: fontawesome/brands/github + nav: - Home: index.md - Quickstart: Quickstart.md @@ -25,22 +26,27 @@ nav: - Configuration & Macros: Config_and_Macros.md - Cache & Parallelism: Cache_and_Parallelism.md - Incremental Processing: Incremental.md + - Profiles & Environments: Profiles.md + - Sources Declaration: Sources.md + - Project Configuration: Project_Config.md - State Selection: State_Selection.md - YAML Tests: YAML_Tests.md - Data Quality Tests: Data_Quality_Tests.md + - API Reference: reference/index.md - Examples: - Environment Matrix: examples/Environment_Matrix.md + - API Demo Overview: examples/API_Demo.md + - API Demo Local Setup: examples/Local_Engine_Setup.md - Contributing: Contributing.md - - License: license.md + - License: License.md markdown_extensions: - admonition - - 
codehilite - toc: permalink: true - pymdownx.superfences - pymdownx.details - - pymdownx.highlight + - pymdownx.highlight # <— ersetzt codehilite - pymdownx.inlinehilite - pymdownx.snippets: base_path: @@ -51,6 +57,25 @@ markdown_extensions: plugins: - search + - autorefs + - gen-files: + scripts: + - docs/_scripts/gen_api.py + - mkdocstrings: + handlers: + python: + paths: ["src"] + options: + docstring_style: google + show_source: true + separate_signature: true + members_order: source + inherited_members: true + show_bases: true + show_if_no_docstring: false + filters: ["!^_"] + - section-index + extra: social: diff --git a/pyproject.toml b/pyproject.toml index b752cde..ae66b6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "pytest==8.4.*", + "pytest-cov==7.0.*", "ruff==0.14.*", "mypy==1.18.*", "pre-commit==3.*", @@ -53,6 +54,11 @@ docs = [ "mkdocs>=1.6", "mkdocs-material>=9.5", "pymdown-extensions>=10.0", + "mkdocstrings[python]>=0.25", + "mkdocs-autorefs>=1.0", + "mkdocs-section-index>=0.3", + "mkdocs-gen-files>=0.5", + "mkdocs-literate-nav>=0.6", ] [project.urls] @@ -93,6 +99,7 @@ select = [ ] ignore = [ "E203", # whitespace before ':', compatible with black-like formatting + "PLR2004", ] fixable = ["ALL"] @@ -159,9 +166,3 @@ omit = [ show_missing = true skip_covered = false fail_under = 80 - -[dependency-groups] -dev = [ - "coverage>=7.11.0", - "pytest-cov>=7.0.0", -] diff --git a/pytest.ini b/pytest.ini index 50e28cf..9a86424 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,13 @@ addopts = -q markers = duckdb: marks tests that require DuckDB postgres: marks tests that require Postgres + spark: marks tests that require Spark + artifacts: marks tests covering artifacts generation + render: marks tests for render-time helpers/templates + schema: marks schema loader/validation tests + http: marks tests that exercise the HTTP client/API cli: marks CLI smoke tests streaming: marks 
tests that exercise streaming functionality slow: marks slower end-to-end scenarios + unit: marks unit tests integration: integration tests diff --git a/src/fastflowtransform/__init__.py b/src/fastflowtransform/__init__.py index 551f5cd..07a16ed 100644 --- a/src/fastflowtransform/__init__.py +++ b/src/fastflowtransform/__init__.py @@ -13,10 +13,10 @@ # Optional convenience re-exports (safe, low-risk imports). # If you prefer a minimal surface, you can remove the block below. try: - from .core import REGISTRY, Node, relation_for - from .dag import levels, mermaid, topo_sort - from .decorators import model - from .fingerprint import ( + from fastflowtransform.core import REGISTRY, Node, relation_for + from fastflowtransform.dag import levels, mermaid, topo_sort + from fastflowtransform.decorators import engine_model, model + from fastflowtransform.fingerprint import ( EnvCtx, build_env_ctx, fingerprint_py, @@ -35,6 +35,7 @@ "Node", "__version__", "build_env_ctx", + "engine_model", "fingerprint_py", "fingerprint_sql", "get_function_source", diff --git a/src/fastflowtransform/api/http.py b/src/fastflowtransform/api/http.py index c15ac46..7f51bea 100644 --- a/src/fastflowtransform/api/http.py +++ b/src/fastflowtransform/api/http.py @@ -9,7 +9,7 @@ from contextlib import suppress from dataclasses import dataclass from pathlib import Path -from typing import Any, cast +from typing import Any, Literal, cast from urllib.parse import urlparse import httpx as _HTTP @@ -271,6 +271,9 @@ def _load_one(u: str, p: dict | None) -> tuple[Any, dict]: MetaParamOut = str | list[MetaEntry] | None +OutputBackend = Literal["pandas", "spark", "bigframes"] + + def get_df( url: str, *, @@ -284,7 +287,9 @@ def get_df( dtype: dict[str, str] | None = None, timeout: float | None = None, normalize: bool = False, -) -> pd.DataFrame: + output: OutputBackend = "pandas", + session: Any | None = None, +) -> Any: """ GET JSON and normalize into a DataFrame using pandas.json_normalize. 
If `paginator` is provided, concatenates pages over the same normalization logic. @@ -295,6 +300,14 @@ def get_df( Path to the list in the JSON to be normalized. meta : Sequence[str | Sequence[str]] | None Columns to include as metadata (top-level keys or nested paths). + output : {"pandas","spark","bigframes"} + Controls the returned frame type. "pandas" (default) yields a pandas DataFrame. + "spark" materialises a pyspark.sql.DataFrame using the provided session + (or an active/builder session). + "bigframes" is reserved for future integration and currently raises NotImplementedError. + session : Any | None + Optional backend handle. For Spark, pass a SparkSession; + otherwise the active session or a new one is used. """ def _extract(obj: Any) -> Any: @@ -343,9 +356,35 @@ def _to_df(js: Any) -> pd.DataFrame: df = df.astype({col: cast(Any, dt)}, copy=False) return df + def _finalize(pdf: pd.DataFrame) -> Any: + mode = (output or "pandas").lower() + if mode == "pandas": + return pdf + if mode == "spark": + try: + from pyspark.sql import SparkSession # noqa: PLC0415 + except Exception as exc: # pragma: no cover - pyspark optional dependency + raise RuntimeError( + "get_df(..., output='spark') requires pyspark to be installed." + ) from exc + spark = session + if spark is None: + spark = SparkSession.getActiveSession() + if spark is None: + spark = SparkSession.builder.getOrCreate() + return spark.createDataFrame(pdf) + if mode == "bigframes": + raise NotImplementedError( + "get_df(..., output='bigframes') is not implemented yet. " + "Open an issue if you need this backend." + ) + raise ValueError( + f"Unsupported output backend '{output}' (expected pandas|spark|bigframes)." 
+ ) + if paginator is None: js = get_json(url, params=params, headers=headers, ttl=ttl, timeout=timeout) - return _to_df(js) + return _finalize(_to_df(js)) pages = get_json( url, params=params, headers=headers, ttl=ttl, paginator=paginator, timeout=timeout @@ -354,5 +393,5 @@ def _to_df(js: Any) -> pd.DataFrame: for js in pages if isinstance(pages, list) else [pages]: frames.append(_to_df(js)) if not frames: - return pd.DataFrame() - return pd.concat(frames, ignore_index=True) + return _finalize(pd.DataFrame()) + return _finalize(pd.concat(frames, ignore_index=True)) diff --git a/src/fastflowtransform/artifacts.py b/src/fastflowtransform/artifacts.py index 2e849dc..11381e6 100644 --- a/src/fastflowtransform/artifacts.py +++ b/src/fastflowtransform/artifacts.py @@ -193,10 +193,72 @@ def _postgres_columns(con: Any, table: str, schema: str | None = None) -> list[d return [{"name": r[0], "dtype": r[1], "nullable": (r[2] == "YES")} for r in rows] +def _spark_columns(spark: Any, table: str) -> list[dict[str, Any]]: + """ + Retrieve column metadata for Spark/Databricks tables. + Supports names with optional schema/catalog. 
+ """ + db = None + tbl = table + cat = None + parts = table.split(".") + + if len(parts) == 2: + db, tbl = parts + elif len(parts) >= 3: + cat, db, tbl = parts[-3], parts[-2], parts[-1] + + def _list_cols(target_tbl: str, target_db: str | None) -> list[dict[str, Any]]: + ident = target_tbl if not target_db else f"{target_db}.{target_tbl}" + try: + cols = spark.catalog.listColumns(ident) + except TypeError: + cols = spark.catalog.listColumns(target_tbl, target_db) + except Exception: + return [] + out: list[dict[str, Any]] = [] + for c in cols: + name = getattr(c, "name", None) + if not name: + continue + dtype = str(getattr(c, "dataType", "")) + nullable = bool(getattr(c, "nullable", False)) + out.append({"name": name, "dtype": dtype, "nullable": nullable}) + return out + + cols = _list_cols(tbl, db) + if cols: + return cols + # Fallback: try fully qualified view via Spark SQL + ident = tbl + if db: + ident = f"`{db}`.`{tbl}`" + if cat and db: + ident = f"`{cat}`.`{db}`.`{tbl}`" + try: + df = spark.table(ident) + except Exception: + return [] + out: list[dict[str, Any]] = [] + for field in df.schema.fields: + dtype = ( + field.dataType.simpleString() + if hasattr(field.dataType, "simpleString") + else str(field.dataType) + ) + out.append({"name": field.name, "dtype": dtype, "nullable": field.nullable}) + return out + + def _try_columns_for(executor: Any, table: str) -> list[dict[str, Any]]: """ Best-effort column introspection for known engines. Returns [] if unsupported. 
""" + spark = getattr(executor, "spark", None) + if spark is not None: + cols = _spark_columns(spark, table) + if cols: + return cols con = getattr(executor, "con", None) # DuckDB detection (robust): class/module name contains 'duckdb' try: diff --git a/src/fastflowtransform/cli/__init__.py b/src/fastflowtransform/cli/__init__.py index 542ee2c..f3c8a2c 100644 --- a/src/fastflowtransform/cli/__init__.py +++ b/src/fastflowtransform/cli/__init__.py @@ -24,6 +24,7 @@ _resolve_dag_out_dir, _strip_html, ) +from fastflowtransform.cli.init_cmd import init, register as _register_init from fastflowtransform.cli.options import ( CacheMode, CacheOpt, @@ -126,6 +127,7 @@ def main( _register_utest(app) _register_docgen(app) _register_sync_db_comments(app) +_register_init(app) __all__ = [ @@ -184,6 +186,7 @@ def main( "dag", "dag_levels", "docgen", + "init", "render_site", "run", "schedule", diff --git a/src/fastflowtransform/cli/bootstrap.py b/src/fastflowtransform/cli/bootstrap.py index 1707a43..6147196 100644 --- a/src/fastflowtransform/cli/bootstrap.py +++ b/src/fastflowtransform/cli/bootstrap.py @@ -1,15 +1,15 @@ # fastflowtransform/cli/bootstrap.py from __future__ import annotations +import os from collections.abc import Callable -from contextlib import suppress from dataclasses import dataclass from pathlib import Path from typing import Any, NoReturn, cast import typer import yaml -from dotenv import load_dotenv +from dotenv import dotenv_values from jinja2 import Environment from fastflowtransform.core import REGISTRY @@ -55,20 +55,18 @@ def _resolve_project_path(project_arg: str) -> Path: p = Path(project_arg).expanduser().resolve() if not p.exists(): raise typer.BadParameter( - f"Project path not found: {p}\n" - "Tip: Benutze einen absoluten Pfad oder '.' im Projekt-Root." + f"Project path not found: {p}\nTip: Use an absolute path or '.' in the project root." 
) if not p.is_dir(): raise typer.BadParameter( - f"Project path is not a directory: {p}\n" - "Tip: Übergebe das Verzeichnis, nicht eine Datei." + f"Project path is not a directory: {p}\nTip: Pass the directory, not the file." ) models = p / "models" if not models.exists() or not models.is_dir(): raise typer.BadParameter( f"Invalid project at {p}\n" - "Erwartet ein Unterverzeichnis 'models/'.\n" - "Tip: Wechsle ins Projekt und nutze '.'." + "Expected eian subfolder 'models/'.\n" + "Tip: change directory to the root and use '.'." ) return p @@ -102,20 +100,34 @@ def _load_dotenv_layered(project_dir: Path, env_name: str) -> None: 5) /.env..local """ - def _safe_load(p: Path, override: bool) -> None: - with suppress(Exception): - load_dotenv(dotenv_path=p, override=override) + original_env = dict(os.environ) + merged: dict[str, str] = {} + + def _merge(p: Path) -> None: + try: + if not p.exists(): + return + data = dotenv_values(p) + for key, value in (data or {}).items(): + if value is not None: + merged[key] = value + except Exception: + pass # 1) Repo root defaults - _safe_load(Path.cwd() / ".env", override=False) + _merge(Path.cwd() / ".env") # 2) Project defaults - _safe_load(project_dir / ".env", override=True) + _merge(project_dir / ".env") # 3) Project local (gitignored) - _safe_load(project_dir / ".env.local", override=True) + _merge(project_dir / ".env.local") # 4) Env-specific - _safe_load(project_dir / f".env.{env_name}", override=True) + _merge(project_dir / f".env.{env_name}") # 5) Env-specific local (gitignored) - _safe_load(project_dir / f".env.{env_name}.local", override=True) + _merge(project_dir / f".env.{env_name}.local") + + for key, value in merged.items(): + if key not in original_env and value is not None: + os.environ.setdefault(key, value) def _resolve_profile( @@ -211,12 +223,23 @@ def _prepare_context( engine: EngineType | None, vars_opt: list[str] | None, ) -> CLIContext: - proj_raw, jenv = _load_project_and_env(project_arg) - proj = 
Path(proj_raw) + proj = _resolve_project_path(project_arg) _load_dotenv_layered(proj, env_name) - REGISTRY.set_cli_vars(_parse_cli_vars(vars_opt or [])) + env_settings, prof = _resolve_profile(env_name, engine, proj) _validate_profile_params(env_name, prof) + + engine_name = getattr(prof, "engine", None) + REGISTRY.set_active_engine(engine_name) + if engine_name: + os.environ["FF_ENGINE"] = engine_name + else: + os.environ.pop("FF_ENGINE", None) + + proj_raw, jenv = _load_project_and_env(str(proj)) + proj = Path(proj_raw) + + REGISTRY.set_cli_vars(_parse_cli_vars(vars_opt or [])) return CLIContext(project=proj, jinja_env=jenv, env_settings=env_settings, profile=prof) @@ -295,6 +318,13 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal ex = DatabricksSparkExecutor( master=prof.databricks_spark.master, app_name=prof.databricks_spark.app_name, + extra_conf=prof.databricks_spark.extra_conf, + warehouse_dir=prof.databricks_spark.warehouse_dir, + use_hive_metastore=prof.databricks_spark.use_hive_metastore, + catalog=prof.databricks_spark.catalog, + database=prof.databricks_spark.database, + table_format=prof.databricks_spark.table_format, + table_options=prof.databricks_spark.table_options, ) return ex, (lambda n: run_or_dispatch(ex, n, jenv)), ex.run_python diff --git a/src/fastflowtransform/cli/docs_utils.py b/src/fastflowtransform/cli/docs_utils.py index 6b88195..51645c9 100644 --- a/src/fastflowtransform/cli/docs_utils.py +++ b/src/fastflowtransform/cli/docs_utils.py @@ -1,3 +1,4 @@ +# fastflowtransform/cli/docs_utils.py from __future__ import annotations import re diff --git a/src/fastflowtransform/cli/init_cmd.py b/src/fastflowtransform/cli/init_cmd.py new file mode 100644 index 0000000..194ff4a --- /dev/null +++ b/src/fastflowtransform/cli/init_cmd.py @@ -0,0 +1,287 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Annotated + +import typer + +# Engines 
supported by the skeleton generator. +_SUPPORTED_ENGINES = { + "duckdb", + "postgres", + "bigquery", + "bigquery_bf", + "databricks_spark", + "snowflake_snowpark", +} + + +@dataclass(frozen=True) +class _InitContext: + project_dir: Path + project_name: str + profile_name: str + engine: str + + +def _build_profiles_yaml(ctx: _InitContext) -> str: + engine_block = { + "duckdb": [ + " # DuckDB profile example. See docs/Profiles.md#duckdb for details.", + " duckdb:", + " path: \"{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}\" # Path to your DuckDB database file.", # Noqa E501 + ], + "postgres": [ + " # Postgres profile example. See docs/Profiles.md#postgres for required keys.", + " postgres:", + " dsn: \"{{ env('FF_PG_DSN') }}\" # Full Postgres DSN, e.g. postgresql://user:pass@host/db", + " db_schema: \"{{ env('FF_PG_SCHEMA', 'analytics') }}\"", + ], + "bigquery": [ + " # BigQuery profile example. See docs/Profiles.md#bigquery.", + " bigquery:", + " project: \"{{ env('FF_BQ_PROJECT') }}\" # GCP project id.", + " dataset: \"{{ env('FF_BQ_DATASET') }}\" # Target dataset for models.", + " location: US # Update to match your dataset location.", + ], + "bigquery_bf": [ + " # BigQuery BigFrames profile example. See docs/Profiles.md#bigquery.", + " bigquery_bf:", + " project: \"{{ env('FF_BQ_PROJECT') }}\"", + " dataset: \"{{ env('FF_BQ_DATASET') }}\"", + " location: US", + ], + "databricks_spark": [ + " # Databricks Spark profile example. See docs/Profiles.md#databricks-spark.", + " databricks_spark:", + " master: \"{{ env('FF_SPARK_MASTER') }}\" # e.g. spark://host:7077 or a Databricks cluster URL.", # Noqa E501 + " app_name: \"{{ env('FF_SPARK_APP_NAME', 'fft-project') }}\"", + " warehouse_dir: \"{{ env('FF_SPARK_WAREHOUSE', '/tmp/fft-warehouse') }}\"", + " use_hive_metastore: false", + ], + "snowflake_snowpark": [ + " # Snowflake Snowpark profile example. 
See docs/Profiles.md#snowflake-snowpark.", + " snowflake_snowpark:", + " account: \"{{ env('FF_SF_ACCOUNT') }}\"", + " user: \"{{ env('FF_SF_USER') }}\"", + " password: \"{{ env('FF_SF_PASSWORD') }}\"", + " warehouse: \"{{ env('FF_SF_WAREHOUSE') }}\"", + " database: \"{{ env('FF_SF_DATABASE') }}\"", + " db_schema: \"{{ env('FF_SF_SCHEMA', 'PUBLIC') }}\"", + ], + }[ctx.engine] + + lines = [ + "# Profiles generated by `fft init`.", + "# Update these placeholders as described in docs/Profiles.md.", + f"{ctx.profile_name}:", + f" engine: {ctx.engine}", + *engine_block, + "", + "# Default in-memory profile for quick experiments.", + "default:", + " engine: duckdb", + " duckdb:", + ' path: ":memory:"', + "", + ] + return "\n".join(lines) + + +def _build_project_yaml(ctx: _InitContext) -> str: + return "\n".join( + [ + "# Project configuration generated by `fft init`.", + "# Read docs/Project_Config.md for the complete reference.", + f"name: {ctx.project_name}", + 'version: "0.1"', + "models_dir: models", + "", + "docs:", + " # Adjust `dag_dir` to change where `fft dag --html` writes documentation " + "(docs/Technical_Overview.md#documentation).", + " dag_dir: site/dag", + "", + "# Project-level variables accessible via {{ var('key') }} inside models.", + "# Example:", + "# vars:", + '# run_date: "2024-01-01"', + "vars: {}", + "", + "# Declare project-wide data quality checks under `tests`. " + "See docs/Data_Quality_Tests.md.", + "tests: []", + "", + ] + ) + + +def _build_sources_yaml() -> str: + return "\n".join( + [ + "# Source declarations describe external tables. 
See docs/Sources.md for details.", + "version: 2", + "sources:", + " # Example:", + " # - name: raw", + " # schema: staging", + " # tables:", + " # - name: users", + " # identifier: seed_users", + "", + ] + ) + + +def _write_file(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _create_directory_notes(target: Path) -> None: + notes = { + "models/README.md": "\n".join( + [ + "# Models directory", + "", + "Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here.", + "See docs/Config_and_Macros.md for modeling guidance and config options.", + "", + ] + ), + "seeds/README.md": "\n".join( + [ + "# Seeds directory", + "", + "Add CSV or Parquet files for reproducible seeds.", + "Usage examples are covered in docs/Quickstart.md and " + "docs/Config_and_Macros.md#13-seeds-sources-and-dependencies.", + "", + ] + ), + "tests/unit/README.md": "\n".join( + [ + "# Unit tests", + "", + "Define YAML unit specs as described in " + "docs/Config_and_Macros.md#73-model-unit-tests-fft-utest.", + "Invoke them with `fft utest --env `.", + "", + ] + ), + "docs/README.md": "\n".join( + [ + "# Project documentation", + "", + "Write operator or contributor notes here and keep " + "them in sync with generated docs.", + "See docs/Technical_Overview.md#documentation " + "for `fft dag` / `fft docgen` guidance.", + "", + ] + ), + } + for rel, text in notes.items(): + _write_file(target / rel, text) + + +def _build_root_readme(ctx: _InitContext) -> str: + return "\n".join( + [ + "# FastFlowTransform project scaffold", + "", + "This project was created with `fft init`.", + "Next steps:", + "1. Update `profiles.yml` with real connection details (docs/Profiles.md).", + "2. Add sources in `sources.yml` and author models " + "under `models/` (docs/Config_and_Macros.md).", + "3. 
Seed sample data with `fft seed` and execute models " + "with `fft run` (docs/Quickstart.md).", + "", + ] + ) + + +def init( + project_dir: Annotated[ + Path, + typer.Argument( + help="Directory to create (must not exist). For example: ./my_project", + ), + ], + name: Annotated[ + str | None, + typer.Option("--name", help="Project name; defaults to the target directory name."), + ] = None, + engine: Annotated[ + str, + typer.Option( + "--engine", + help=( + "Executor engine for the default profile. " + "Supported values: duckdb, postgres, bigquery, bigquery_bf, " + "databricks_spark, snowflake_snowpark." + ), + ), + ] = "duckdb", + profile_name: Annotated[ + str, + typer.Option("--profile-name", help="Profile name to generate inside profiles.yml."), + ] = "dev", +) -> None: + resolved_engine = engine.lower().strip() + if resolved_engine not in _SUPPORTED_ENGINES: + typer.secho( + ( + f"Unsupported engine '{engine}'. " + "Choose one of: {', '.join(sorted(_SUPPORTED_ENGINES))}." + ), + fg="red", + ) + raise typer.Exit(2) + + project_dir = project_dir.resolve() + project_name = name or project_dir.name + + try: + project_dir.mkdir(parents=True, exist_ok=False) + except FileExistsError as err: + typer.secho( + f"Cannot initialise project: directory '{project_dir}' already exists. 
" + "Choose a new path or remove the existing directory first.", + fg="red", + ) + raise typer.Exit(1) from err + + ctx = _InitContext( + project_dir=project_dir, + project_name=project_name, + profile_name=profile_name, + engine=resolved_engine, + ) + + for sub in ("models", "seeds", "tests/unit", "docs"): + (project_dir / sub).mkdir(parents=True, exist_ok=True) + + _write_file(project_dir / "project.yml", _build_project_yaml(ctx)) + _write_file(project_dir / "profiles.yml", _build_profiles_yaml(ctx)) + _write_file(project_dir / "sources.yml", _build_sources_yaml()) + _write_file(project_dir / "README.md", _build_root_readme(ctx)) + _create_directory_notes(project_dir) + + typer.secho(f"✓ Project skeleton created at {project_dir}", fg="green") + + +def register(app: typer.Typer) -> None: + app.command( + help=( + "Create a FastFlowTransform project skeleton (non-interactive).\n\n" + "Examples:\n" + " fft init ./analytics --name analytics --engine duckdb\n" + " fft init ~/projects/warehouse --engine postgres --profile-name prod\n" + ) + )(init) + + +__all__ = ["init", "register"] diff --git a/src/fastflowtransform/core.py b/src/fastflowtransform/core.py index c940518..25da7cf 100644 --- a/src/fastflowtransform/core.py +++ b/src/fastflowtransform/core.py @@ -3,6 +3,7 @@ import ast import importlib.util +import os import re import types from collections.abc import Callable, Iterable, Mapping @@ -15,7 +16,9 @@ import yaml from jinja2 import Environment, FileSystemLoader, StrictUndefined -from .errors import DependencyNotFoundError, ModuleLoadError +from fastflowtransform import storage +from fastflowtransform.errors import DependencyNotFoundError, ModuleLoadError +from fastflowtransform.logging import get_logger _SOURCE_CFG_FIELDS = { "identifier", @@ -279,6 +282,7 @@ def __init__(self): self.macros: dict[str, Path] = {} # macro_name -> file path self.project_vars: dict[str, Any] = {} # project.yml: vars self.cli_vars: dict[str, Any] = {} # CLI --vars overrides + 
self.active_engine: str | None = None def get_project_dir(self) -> Path: """Return the project directory after load_project(), or raise if not set.""" @@ -310,7 +314,218 @@ def set_cli_vars(self, overrides: dict[str, Any]) -> None: """Set CLI --vars overrides (highest precedence).""" self.cli_vars = dict(overrides or {}) + def set_active_engine(self, engine: str | None) -> None: + """Store active engine hint (case-insensitive) for conditional loading.""" + self.active_engine = engine.lower().strip() if isinstance(engine, str) else None + + def _lookup_storage_meta(self, node_name: str) -> dict[str, Any]: + """ + Return storage metadata for a given node (if configured in project.yml). + Accepts names with or without trailing '.ff'. + """ + return storage.get_model_storage(node_name) + + def _current_engine(self) -> str | None: + """ + Determine the active engine in precedence order: + 1) Explicit hint via set_active_engine() + 2) Environment variable FF_ENGINE + 3) project.yml vars → engine + 4) CLI --vars {engine: ...} + """ + if self.active_engine: + return self.active_engine + + env_engine = os.getenv("FF_ENGINE") + if isinstance(env_engine, str) and env_engine.strip(): + return env_engine.strip().lower() + + proj_engine = self.project_vars.get("engine") + if isinstance(proj_engine, str) and proj_engine.strip(): + return proj_engine.strip().lower() + + cli_engine = self.cli_vars.get("engine") + if isinstance(cli_engine, str) and cli_engine.strip(): + return cli_engine.strip().lower() + + return None + + def _should_register_for_engine(self, meta: Mapping[str, Any], *, path: Path) -> bool: + """ + SQL models may declare config(engines=[...]) to limit registration. + Returns True when the current engine matches (or no restriction given). 
+ """ + raw = meta.get("engines") + if raw is None: + return True + + tokens: Iterable[Any] + if isinstance(raw, str): + tokens = [raw] + elif isinstance(raw, Iterable) and not isinstance(raw, (str, Mapping)): + tokens = raw + else: + raise ModuleLoadError( + f"{path}: config(engines=...) must be a string or iterable of strings." + ) + + allowed: set[str] = set() + for tok in tokens: + if not isinstance(tok, (str, bytes)): + raise ModuleLoadError( + f"{path}: config(engines=...) expects strings, got {type(tok).__name__}." + ) + text = str(tok).strip() + if text: + allowed.add(text.lower()) + + if not allowed: + return True + + current = self._current_engine() + if current is None: + raise ModuleLoadError( + f"{path}: config(engines=...) requires an active engine.\n" + "Hint: Export FF_ENGINE or call REGISTRY.set_active_engine('duckdb'|...)." + ) + return current in allowed + + # def load_project(self, project_dir: Path) -> None: + # self.nodes.clear() + # self.py_funcs.clear() + # self.py_requires.clear() + # self.sources = {} + # self.project_vars = {} + # self.cli_vars = {} + # self.macros.clear() + + # storage.set_model_storage({}) + # storage.set_seed_storage({}) + + # self.project_dir = project_dir + # models_dir = project_dir / "models" + # self.env = Environment( + # loader=FileSystemLoader(str(models_dir)), + # undefined=StrictUndefined, + # autoescape=False, + # trim_blocks=True, + # lstrip_blocks=True, + # ) + + # # Make sure macros are available to all templates before model discovery. 
+ # self._load_macros(models_dir) + # self._load_py_macros(models_dir) + + # # load sources (version 2 schema) + # src_path = project_dir / "sources.yml" + # if src_path.exists(): + # raw_sources = yaml.safe_load(src_path.read_text(encoding="utf-8")) + # try: + # self.sources = _parse_sources_yaml(raw_sources) + # except ValueError as exc: + # raise ValueError(f"Failed to parse sources.yml: {exc}") from exc + # else: + # self.sources = {} + + # # load project.yml (vars) + # proj_path = project_dir / "project.yml" + # if proj_path.exists(): + # proj_cfg = yaml.safe_load(proj_path.read_text(encoding="utf-8")) or {} + # self.project_vars = dict(proj_cfg.get("vars", {}) or {}) + + # models_cfg = proj_cfg.get("models") if isinstance(proj_cfg, Mapping) else None + # model_storage_raw = None + # if isinstance(models_cfg, Mapping): + # candidate = models_cfg.get("storage") + # if isinstance(candidate, Mapping): + # model_storage_raw = candidate + # storage.set_model_storage( + # storage.normalize_storage_map(model_storage_raw, project_dir=project_dir) + # ) + + # seeds_cfg = proj_cfg.get("seeds") if isinstance(proj_cfg, Mapping) else None + # seed_storage_raw = None + # if isinstance(seeds_cfg, Mapping): + # candidate = seeds_cfg.get("storage") + # if isinstance(candidate, Mapping): + # seed_storage_raw = candidate + # storage.set_seed_storage( + # storage.normalize_storage_map(seed_storage_raw, project_dir=project_dir) + # ) + + # # discover models + # for p in models_dir.rglob("*.ff.sql"): + # name = p.stem + # deps = self._scan_sql_deps(p) + # meta = dict(self._parse_model_config(p)) + # storage_meta = self._lookup_storage_meta(name) + # if storage_meta: + # existing = dict(meta.get("storage") or {}) + # existing.update(storage_meta) + # meta["storage"] = existing + # if not self._should_register_for_engine(meta, path=p): + # continue + # self._add_node_or_fail(name, "sql", p, deps, meta=meta) + # for p in models_dir.rglob("*.ff.py"): + # self._load_py_module(p) + # for 
_, func in list(self.py_funcs.items()): + # func_path = Path(getattr(func, "__ff_path__", "")).resolve() + # if func_path == p.resolve(): + # name = getattr(func, "__ff_name__", func.__name__) + # deps = getattr(func, "__ff_deps__", []) + # kind = getattr(func, "__ff_kind__", "python") or "python" + + # meta = dict(getattr(func, "__ff_meta__", {}) or {}) + # storage_meta = self._lookup_storage_meta(name) + # if storage_meta: + # existing = dict(meta.get("storage") or {}) + # existing.update(storage_meta) + # meta["storage"] = existing + # tags = list(getattr(func, "__ff_tags__", []) or []) + # if tags: + # existing_tags = meta.get("tags") + # if isinstance(existing_tags, list): + # merged = existing_tags + [t for t in tags if t not in existing_tags] + # meta["tags"] = merged + # elif existing_tags is None: + # meta["tags"] = tags + # else: + # # Normalize non-list tags into a list while preserving the value + # meta["tags"] = [existing_tags, *tags] + + # self._add_node_or_fail(name, kind, p, deps, meta=meta) + + # req = getattr(func, "__ff_require__", None) + # if req: + # self.py_requires[name] = req + + # # ---- Dependency validation (early and clear) + # self._validate_dependencies() + def load_project(self, project_dir: Path) -> None: + """Load a FastFlowTransform project from the given directory.""" + self._reset_registry_state() + self.project_dir = project_dir + + models_dir = project_dir / "models" + self._init_jinja_env(models_dir) + + # macros first, because models may use them + self._load_macros(models_dir) + self._load_py_macros(models_dir) + + self._load_sources_yaml(project_dir) + self._load_project_yaml(project_dir) + + # discover models + self._discover_sql_models(models_dir) + self._discover_python_models(models_dir) + + # final validation + self._validate_dependencies() + + def _reset_registry_state(self) -> None: + """Reset in-memory registry structures to a clean state.""" self.nodes.clear() self.py_funcs.clear() self.py_requires.clear() @@ 
-318,9 +533,12 @@ def load_project(self, project_dir: Path) -> None: self.project_vars = {} self.cli_vars = {} self.macros.clear() + # reset storage maps + storage.set_model_storage({}) + storage.set_seed_storage({}) - self.project_dir = project_dir - models_dir = project_dir / "models" + def _init_jinja_env(self, models_dir: Path) -> None: + """Initialize the Jinja environment for this project.""" self.env = Environment( loader=FileSystemLoader(str(models_dir)), undefined=StrictUndefined, @@ -329,48 +547,104 @@ def load_project(self, project_dir: Path) -> None: lstrip_blocks=True, ) - # Make sure macros are available to all templates before model discovery. - self._load_macros(models_dir) - self._load_py_macros(models_dir) - - # load sources (version 2 schema) + def _load_sources_yaml(self, project_dir: Path) -> None: + """Load sources.yml (version 2) if present.""" src_path = project_dir / "sources.yml" - if src_path.exists(): - raw_sources = yaml.safe_load(src_path.read_text(encoding="utf-8")) - try: - self.sources = _parse_sources_yaml(raw_sources) - except ValueError as exc: - raise ValueError(f"Failed to parse sources.yml: {exc}") from exc - else: + if not src_path.exists(): self.sources = {} + return + + raw_sources = yaml.safe_load(src_path.read_text(encoding="utf-8")) + try: + self.sources = _parse_sources_yaml(raw_sources) + except ValueError as exc: + raise ValueError(f"Failed to parse sources.yml: {exc}") from exc - # load project.yml (vars) + def _load_project_yaml(self, project_dir: Path) -> None: + """Load project.yml (vars, storage blocks) if present.""" proj_path = project_dir / "project.yml" - if proj_path.exists(): - proj_cfg = yaml.safe_load(proj_path.read_text(encoding="utf-8")) or {} - self.project_vars = dict(proj_cfg.get("vars", {}) or {}) + if not proj_path.exists(): + return - # discover models - for p in models_dir.rglob("*.ff.sql"): - name = p.stem - deps = self._scan_sql_deps(p) - meta = self._parse_model_config(p) - 
self._add_node_or_fail(name, "sql", p, deps, meta=meta) - for p in models_dir.rglob("*.ff.py"): - self._load_py_module(p) + proj_cfg = yaml.safe_load(proj_path.read_text(encoding="utf-8")) or {} + self.project_vars = dict(proj_cfg.get("vars", {}) or {}) + + # models.storage + models_cfg = proj_cfg.get("models") if isinstance(proj_cfg, Mapping) else None + model_storage_raw = None + if isinstance(models_cfg, Mapping): + candidate = models_cfg.get("storage") + if isinstance(candidate, Mapping): + model_storage_raw = candidate + storage.set_model_storage( + storage.normalize_storage_map(model_storage_raw, project_dir=project_dir) + ) + + # seeds.storage + seeds_cfg = proj_cfg.get("seeds") if isinstance(proj_cfg, Mapping) else None + seed_storage_raw = None + if isinstance(seeds_cfg, Mapping): + candidate = seeds_cfg.get("storage") + if isinstance(candidate, Mapping): + seed_storage_raw = candidate + storage.set_seed_storage( + storage.normalize_storage_map(seed_storage_raw, project_dir=project_dir) + ) + + def _discover_sql_models(self, models_dir: Path) -> None: + """Scan *.ff.sql files, parse deps, and register nodes.""" + for path in models_dir.rglob("*.ff.sql"): + name = path.stem + deps = self._scan_sql_deps(path) + meta = dict(self._parse_model_config(path)) + storage_meta = self._lookup_storage_meta(name) + if storage_meta: + existing = dict(meta.get("storage") or {}) + existing.update(storage_meta) + meta["storage"] = existing + if not self._should_register_for_engine(meta, path=path): + continue + self._add_node_or_fail(name, "sql", path, deps, meta=meta) + + def _discover_python_models(self, models_dir: Path) -> None: + """Scan *.ff.py files, import them, and register decorated callables.""" + for path in models_dir.rglob("*.ff.py"): + self._load_py_module(path) + + # we might have loaded several functions; filter by file path for _, func in list(self.py_funcs.items()): func_path = Path(getattr(func, "__ff_path__", "")).resolve() - if func_path == 
p.resolve(): - name = getattr(func, "__ff_name__", func.__name__) - deps = getattr(func, "__ff_deps__", []) - self._add_node_or_fail(name, "python", p, deps, meta={}) + if func_path != path.resolve(): + continue - req = getattr(func, "__ff_require__", None) - if req: - self.py_requires[name] = req - - # ---- Dependency validation (early and clear) - self._validate_dependencies() + name = getattr(func, "__ff_name__", func.__name__) + deps = getattr(func, "__ff_deps__", []) + kind = getattr(func, "__ff_kind__", "python") or "python" + + meta = dict(getattr(func, "__ff_meta__", {}) or {}) + storage_meta = self._lookup_storage_meta(name) + if storage_meta: + existing = dict(meta.get("storage") or {}) + existing.update(storage_meta) + meta["storage"] = existing + + # merge tags from decorator into model meta.tags + tags = list(getattr(func, "__ff_tags__", []) or []) + if tags: + existing_tags = meta.get("tags") + if isinstance(existing_tags, list): + merged = existing_tags + [t for t in tags if t not in existing_tags] + meta["tags"] = merged + elif existing_tags is None: + meta["tags"] = tags + else: + meta["tags"] = [existing_tags, *tags] + + self._add_node_or_fail(name, kind, path, deps, meta=meta) + + req = getattr(func, "__ff_require__", None) + if req: + self.py_requires[name] = req # --- Macros --------------------------------------------------------- def _load_macros(self, models_dir: Path) -> None: @@ -458,18 +732,36 @@ def _add_node_or_fail( if name in self.nodes: other = self.nodes[name].path raise ModuleLoadError( - "Doppelter Modellname erkannt:\n" - f"• bereits registriert: {other}\n" - f"• weiterer Fund: {path}\n" - "Tipp: Benenne eines der Modelle um (Dateistamm = Node-Name) " - "oder nutze @model(name='…') für Python." + "Duplicate model name detected:\n" + f"• alredy registered: {other}\n" + f"• new model: {path}\n" + "Hint: Rename one of the models (file name = node name)" + "or use @model(name='…') for Python." 
) self.nodes[name] = Node(name=name, kind=kind, path=path, deps=deps, meta=meta) def _scan_sql_deps(self, path: Path) -> list[str]: txt = path.read_text(encoding="utf-8") - pattern = re.compile(r"ref\s*\(\s*['\"]([A-Za-z0-9_.\-]+)['\"]\s*\)") - return pattern.findall(txt) + literal = re.compile(r"ref\s*\(\s*['\"]([A-Za-z0-9_.\-]+)['\"]\s*\)") + dynamic = re.compile(r"ref\s*\(\s*([^)]+)\)") + + deps = literal.findall(txt) + + for expr in dynamic.findall(txt): + expr_stripped = expr.strip() + if not ( + (expr_stripped.startswith("'") and expr_stripped.endswith("'")) + or (expr_stripped.startswith('"') and expr_stripped.endswith('"')) + ): + logger = get_logger("registry") + logger.warning( + "%s: ref(%s) cannot be statically resolved; DAG may miss this dependency. " + "Wrap options in a mapping of literal ref('...') calls and pick from that map.", + path, + expr_stripped, + ) + + return deps # -------- {{ config(...) }} Head-Parser -------- def _parse_model_config(self, path: Path) -> dict[str, Any]: diff --git a/src/fastflowtransform/decorators.py b/src/fastflowtransform/decorators.py index 9635f00..86345dc 100644 --- a/src/fastflowtransform/decorators.py +++ b/src/fastflowtransform/decorators.py @@ -1,13 +1,14 @@ -# src/fastflowtransform/decorators.py (or wherever your decorator lives) +# src/fastflowtransform/decorators.py from __future__ import annotations import inspect +import os from collections.abc import Callable, Iterable, Mapping, Sequence from pathlib import Path from typing import Any, ParamSpec, Protocol, TypeVar, cast -from .core import REGISTRY, relation_for # relation_for is required for normalization -from .errors import ModuleLoadError +from fastflowtransform.core import REGISTRY, relation_for +from fastflowtransform.errors import ModuleLoadError P = ParamSpec("P") R_co = TypeVar("R_co", covariant=True) @@ -110,3 +111,17 @@ def deco(func: Callable[P, R_co]) -> HasFFMeta[P, R_co]: return cast(HasFFMeta[P, R_co], func) return deco + + +def 
engine_model( + *, only: str | tuple[str, ...], **model_kwargs: Any +) -> Callable[[Callable[P, R_co]], HasFFMeta[P, R_co]]: + allowed = {only} if isinstance(only, str) else {e.lower() for e in only} + + def deco(fn): + current = os.getenv("FF_ENGINE", "").lower() + if current in allowed: + return model(**model_kwargs)(fn) + return fn # stays undecorated → no registry entry + + return deco diff --git a/src/fastflowtransform/docs.py b/src/fastflowtransform/docs.py index 0fec0de..c8fdc16 100644 --- a/src/fastflowtransform/docs.py +++ b/src/fastflowtransform/docs.py @@ -45,6 +45,8 @@ def _collect_columns(executor: Any) -> dict[str, list[ColumnInfo]]: Returns an empty mapping if unsupported or on errors. """ try: + if hasattr(executor, "spark"): + return _columns_spark(executor.spark) if hasattr(executor, "con"): # DuckDB return _columns_duckdb(executor.con) if hasattr(executor, "engine"): # Postgres @@ -437,6 +439,62 @@ def _columns_snowflake(session: Any) -> dict[str, list[ColumnInfo]]: return out +def _columns_spark(spark: Any) -> dict[str, list[ColumnInfo]]: + """ + Collect column metadata from a SparkSession (Databricks / Spark SQL). + Uses catalog.listTables/listColumns, available on vanilla Spark 3+. 
+ """ + try: + tables = list(spark.catalog.listTables()) + except Exception: + return {} + + out: dict[str, list[ColumnInfo]] = {} + seen: set[tuple[str | None, str]] = set() + + def _list_columns(table_name: str, database: str | None) -> list[Any]: + ident = table_name if not database else f"{database}.{table_name}" + try: + return list(spark.catalog.listColumns(ident)) + except TypeError: + return list(spark.catalog.listColumns(table_name, database)) + + for tbl in tables: + database = getattr(tbl, "database", None) + raw_name = getattr(tbl, "name", None) + if not raw_name: + continue + table_name = str(raw_name) + key = (database, table_name) + if key in seen: + continue + seen.add(key) + try: + cols = _list_columns(table_name, database) + except Exception: + continue + if not cols: + continue + + keys: set[str] = {table_name} + catalog = getattr(tbl, "catalog", None) + if database: + keys.add(f"{database}.{table_name}") + if database and catalog: + keys.add(f"{catalog}.{database}.{table_name}") + for c in cols: + nullable = bool(getattr(c, "nullable", False)) + dtype = str(getattr(c, "dataType", "")) + col_name = getattr(c, "name", None) + if not col_name: + continue + info = ColumnInfo(str(col_name), dtype, nullable) + for k in keys: + out.setdefault(k, []).append(info) + + return out + + def read_docs_metadata(project_dir: Path) -> dict[str, Any]: """ Merge YAML + Markdown descriptions with priority: Markdown > YAML. 
diff --git a/src/fastflowtransform/errors.py b/src/fastflowtransform/errors.py index 7fe7de4..875268d 100644 --- a/src/fastflowtransform/errors.py +++ b/src/fastflowtransform/errors.py @@ -95,4 +95,5 @@ def __init__(self, node_name: str, relation: str, message: str, sql_snippet: str self.node_name = node_name self.relation = relation self.sql_snippet = sql_snippet + self.message = message super().__init__(message) diff --git a/src/fastflowtransform/executors/databricks_spark_exec.py b/src/fastflowtransform/executors/databricks_spark_exec.py index 66074b9..c476ba8 100644 --- a/src/fastflowtransform/executors/databricks_spark_exec.py +++ b/src/fastflowtransform/executors/databricks_spark_exec.py @@ -1,13 +1,20 @@ # src/fastflowtransform/executors/databricks_spark_exec.py from __future__ import annotations +import shutil from collections.abc import Iterable +from contextlib import suppress +from pathlib import Path from typing import Any +from urllib.parse import unquote, urlparse +from pyspark.errors.exceptions.base import AnalysisException from pyspark.sql import DataFrame as SDF, SparkSession from pyspark.sql.types import DataType -from fastflowtransform.core import Node, relation_for +from fastflowtransform import storage +from fastflowtransform.core import REGISTRY, Node, relation_for +from fastflowtransform.errors import ModelExecutionError from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.meta import ensure_meta_table, upsert_meta @@ -16,11 +23,60 @@ class DatabricksSparkExecutor(BaseExecutor[SDF]): ENGINE_NAME = "databricks_spark" """Spark/Databricks executor without pandas: Python models operate on Spark DataFrames.""" - def __init__(self, master: str = "local[*]", app_name: str = "fastflowtransform"): - self.spark = SparkSession.builder.master(master).appName(app_name).getOrCreate() + def __init__( + self, + master: str = "local[*]", + app_name: str = "fastflowtransform", + *, + extra_conf: dict[str, Any] | None = None, + 
warehouse_dir: str | None = None, + use_hive_metastore: bool = False, + catalog: str | None = None, + database: str | None = None, + table_format: str | None = "parquet", + table_options: dict[str, Any] | None = None, + ): + builder = SparkSession.builder.master(master).appName(app_name) + + warehouse_path: Path | None = None + if warehouse_dir: + warehouse_path = Path(warehouse_dir).expanduser() + if not warehouse_path.is_absolute(): + warehouse_path = Path.cwd() / warehouse_path + warehouse_path.mkdir(parents=True, exist_ok=True) + builder = builder.config("spark.sql.warehouse.dir", str(warehouse_path)) + + if catalog: + builder = builder.config("spark.sql.catalog.spark_catalog", catalog) + + if extra_conf: + for key, value in extra_conf.items(): + if value is not None: + builder = builder.config(str(key), str(value)) + + if use_hive_metastore: + builder = builder.config("spark.sql.catalogImplementation", "hive") + builder = builder.enableHiveSupport() + + self.spark = builder.getOrCreate() # Lightweight testing shim so tests can call executor.con.execute("SQL") self.con = _SparkConnShim(self.spark) self._registered_path_sources: dict[str, dict[str, Any]] = {} + self.warehouse_dir = warehouse_path + self.catalog = catalog + self.database = database + self.schema = database + if database: + self.spark.sql(f"CREATE DATABASE IF NOT EXISTS `{database}`") + with suppress(Exception): + self.spark.catalog.setCurrentDatabase(database) + + fmt = (table_format or "").strip().lower() + self.spark_table_format: str | None = fmt or None + if table_options: + self.spark_table_options = {str(k): str(v) for k, v in table_options.items()} + else: + self.spark_table_options = {} # ---------- Frame hooks (required) ---------- def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SDF: @@ -30,8 +86,12 @@ def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SDF: def _materialize_relation(self, relation: str, df: SDF, node: Node) -> None: if 
not self._is_frame(df): raise TypeError("Spark model must return a Spark DataFrame") + storage_meta = self._storage_meta(node, relation) + if storage_meta.get("path"): + self._write_to_storage_path(relation, df, storage_meta) + return # write as a table in Hive/Unity/Delta environments - df.write.mode("overwrite").saveAsTable(relation) + self._save_df_as_table(relation, df, storage=storage_meta) def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None: self.spark.sql(f"CREATE OR REPLACE VIEW `{view_name}` AS SELECT * FROM `{backing_table}`") @@ -70,13 +130,13 @@ def cols(df: SDF) -> set[str]: f"'{node_name}'.\n" + "\n".join(errors) ) - def _columns_of(self, frame: SDF) -> list[str]: + def _columns_of(self, frame: SDF) -> list[str]: # pragma: no cover return frame.schema.fieldNames() - def _is_frame(self, obj: Any) -> bool: + def _is_frame(self, obj: Any) -> bool: # pragma: no cover return isinstance(obj, SDF) - def _frame_name(self) -> str: + def _frame_name(self) -> str: # pragma: no cover return "Spark" # ---- Helpers ---- @@ -86,6 +146,45 @@ def _q_ident(value: str | None) -> str: return "" return f"`{value.replace('`', '``')}`" + def _storage_meta(self, node: Node | None, relation: str) -> dict[str, Any]: + """ + Retrieve configured storage overrides for the logical node backing `relation`. 
+ """ + rel_clean = self._strip_quotes(relation) + if node is not None: + meta = dict((node.meta or {}).get("storage") or {}) + if meta: + return meta + lookup = storage.get_model_storage(node.name) + if lookup: + return lookup + for cand in getattr(REGISTRY, "nodes", {}).values(): + try: + if self._strip_quotes(relation_for(cand.name)) == rel_clean: + meta = dict((cand.meta or {}).get("storage") or {}) + if meta: + return meta + lookup = storage.get_model_storage(cand.name) + if lookup: + return lookup + except Exception: + continue + return storage.get_model_storage(rel_clean) + + def _write_to_storage_path( + self, relation: str, df: SDF, storage_meta: dict[str, Any] + ) -> None: # pragma: no cover + parts = self._identifier_parts(relation) + identifier = ".".join(parts) + storage.spark_write_to_path( + self.spark, + identifier, + df, + storage=storage_meta, + default_format=self.spark_table_format, + default_options=self.spark_table_options, + ) + # ---- SQL hooks ---- def _format_relation_for_ref(self, name: str) -> str: return self._q_ident(relation_for(name)) @@ -130,11 +229,117 @@ def _format_source_reference( parts = [identifier] return ".".join(self._q_ident(str(part)) for part in parts) + # ---- Spark table helpers ---- + @staticmethod + def _strip_quotes(identifier: str) -> str: + return identifier.replace("`", "").replace('"', "") + + def _identifier_parts(self, identifier: str) -> list[str]: + cleaned = self._strip_quotes(identifier) + return [part for part in cleaned.split(".") if part] + + def _warehouse_base(self) -> Path | None: + try: + conf_val = self.spark.conf.get("spark.sql.warehouse.dir", "spark-warehouse") + except Exception: + conf_val = "spark-warehouse" + + if not isinstance(conf_val, str): + conf_val = str(conf_val) + parsed = urlparse(conf_val) + scheme = (parsed.scheme or "").lower() + + if scheme and scheme != "file": + return None + + if scheme == "file": + if parsed.netloc and parsed.netloc not in {"", "localhost"}: + return None + 
raw_path = unquote(parsed.path or "") + if not raw_path: + return None + base = Path(raw_path) + else: + base = Path(conf_val) + + if not base.is_absolute(): + base = Path.cwd() / base + return base + + def _table_location(self, parts: list[str]) -> Path | None: + base = self._warehouse_base() + if base is None or not parts: + return None + + filtered = [p for p in parts if p] + if not filtered: + return None + + catalog_cutoff = 3 + if len(filtered) >= catalog_cutoff and filtered[0].lower() in {"spark_catalog", "spark"}: + filtered = filtered[1:] + + table = filtered[-1] + schema_cutoff = 2 + schema = filtered[-2] if len(filtered) >= schema_cutoff else None + + location = base + if schema: + location = location / f"{schema}.db" + return location / table + + def _save_df_as_table( + self, identifier: str, df: SDF, *, storage: dict[str, Any] | None = None + ) -> None: + parts = self._identifier_parts(identifier) + if not parts: + raise ValueError(f"Invalid Spark table identifier: {identifier}") + + storage_meta = storage or self._storage_meta(None, identifier) + if storage_meta.get("path"): + self._write_to_storage_path(identifier, df, storage_meta) + return + + table_name = ".".join(parts) + target_location = self._table_location(parts) + + def _write() -> None: + writer = df.write.mode("overwrite") + if self.spark_table_format: + writer = writer.format(self.spark_table_format) + if self.spark_table_options: + writer = writer.options(**self.spark_table_options) + writer.saveAsTable(table_name) + + target_sql = ".".join(self._q_ident(p) for p in parts) + with suppress(Exception): + self.spark.sql(f"DROP TABLE IF EXISTS {target_sql}") + if target_location and target_location.exists(): + with suppress(Exception): + shutil.rmtree(target_location, ignore_errors=True) + + try: + _write() + except AnalysisException as exc: # pragma: no cover - requires real Spark/Delta error + message = str(exc) + if target_location and "LOCATION_ALREADY_EXISTS" in message.upper(): + with 
suppress(Exception): + shutil.rmtree(target_location, ignore_errors=True) + _write() + else: + raise + def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None: self.spark.sql(f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}") def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None: - self.spark.sql(f"CREATE OR REPLACE TABLE {target_sql} AS {select_body}") + preview = f"-- target={target_sql}\n{select_body}" + try: + df = self.spark.sql(select_body) + storage_meta = self._storage_meta(node, target_sql) + self._save_df_as_table(target_sql, df, storage=storage_meta) + except Exception as exc: + raise ModelExecutionError(node.name, target_sql, str(exc), sql_snippet=preview) from exc def _create_or_replace_view_from_table( self, view_name: str, backing_table: str, node: Node @@ -161,7 +366,8 @@ def exists_relation(self, relation: str) -> bool: def create_table_as(self, relation: str, select_sql: str) -> None: """CREATE TABLE AS with cleaned SELECT body.""" body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - self.spark.sql(f"CREATE TABLE {relation} AS {body}") + df = self.spark.sql(body) + self._save_df_as_table(relation, df) def incremental_insert(self, relation: str, select_sql: str) -> None: """INSERT INTO with cleaned SELECT body.""" @@ -187,7 +393,8 @@ def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str ) except Exception: # Fallback: Full replace is safer across lake formats - self.spark.sql(f"CREATE OR REPLACE TABLE {relation} AS {body}") + df = self.spark.sql(body) + self._save_df_as_table(relation, df) def alter_table_sync_schema( self, relation: str, select_sql: str, *, mode: str = "append_new_columns" @@ -239,7 +446,7 @@ def fetchone(self) -> tuple | None: return self._rows[0] if self._rows else None -class _SparkConnShim: +class _SparkConnShim: # pragma: no cover """Provide .execute(sql) with fetch* for test utilities.""" def __init__(self, 
spark: SparkSession): diff --git a/src/fastflowtransform/executors/postgres_exec.py b/src/fastflowtransform/executors/postgres_exec.py index 189d803..c59138d 100644 --- a/src/fastflowtransform/executors/postgres_exec.py +++ b/src/fastflowtransform/executors/postgres_exec.py @@ -31,6 +31,15 @@ def __init__(self, dsn: str, schema: str | None = None): self.engine: Engine = create_engine(dsn, future=True) self.schema = schema + if self.schema: + try: + with self.engine.begin() as conn: + conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {self._q_ident(self.schema)}")) + except SQLAlchemyError as exc: + raise ProfileConfigError( + f"Failed to ensure schema '{self.schema}' exists: {exc}" + ) from exc + # ⇣ fastflowtransform.testing expects executor.con.execute("SQL") self.con = SAConnShim(self.engine, schema=self.schema) diff --git a/src/fastflowtransform/seeding.py b/src/fastflowtransform/seeding.py index 06a10d0..9afe4c3 100644 --- a/src/fastflowtransform/seeding.py +++ b/src/fastflowtransform/seeding.py @@ -2,18 +2,26 @@ from __future__ import annotations import math +import shutil import uuid +from collections.abc import Callable, Iterable from contextlib import suppress from pathlib import Path from time import perf_counter from typing import Any, NamedTuple +from urllib.parse import unquote, urlparse -import duckdb as _dd import pandas as pd import yaml +from fastflowtransform import storage from fastflowtransform.logging import echo +try: # Optional Spark dependency + from pyspark.errors.exceptions.base import AnalysisException as _SparkAnalysisException +except Exception: # pragma: no cover - Spark not installed + _SparkAnalysisException = Exception # type: ignore + # If you use this in a CLI, your code elsewhere should provide _prepare_context. 
@@ -80,6 +88,65 @@ def _qualify(table: str, schema: str | None) -> str: return _dq(table) +def _spark_warehouse_base(spark: Any) -> Path | None: + """Resolve the Spark warehouse directory if it points to the local filesystem.""" + try: + conf_val = spark.conf.get("spark.sql.warehouse.dir", "spark-warehouse") + except Exception: + conf_val = "spark-warehouse" + + if not isinstance(conf_val, str): + conf_val = str(conf_val) + parsed = urlparse(conf_val) + scheme = (parsed.scheme or "").lower() + + if scheme and scheme != "file": + return None + + if scheme == "file": + # Treat file:// URIs as local filesystem paths. + if parsed.netloc and parsed.netloc not in {"", "localhost"}: + return None + raw_path = unquote(parsed.path or "") + if not raw_path: + return None + base = Path(raw_path) + else: + base = Path(conf_val) + + if not base.is_absolute(): + base = Path.cwd() / base + return base + + +def _spark_table_location(parts: list[str], spark: Any) -> Path | None: + """ + Best-effort guess of the filesystem location for a managed Spark table. + Works for default schema, schema.table, and catalog.schema.table patterns. + """ + base = _spark_warehouse_base(spark) + if base is None or not parts: + return None + + filtered = [p for p in parts if p] + if not filtered: + return None + + # Drop common catalog prefixes while retaining the schema name. 
+ catalog_cutoff = 3 + if len(filtered) >= catalog_cutoff and filtered[0].lower() in {"spark_catalog", "spark"}: + filtered = filtered[1:] + + table = filtered[-1] + schema_cutoff = 2 + schema = filtered[-2] if len(filtered) >= schema_cutoff else None + + location = base + if schema: + location = location / f"{schema}.db" + return location / table + + # -------------------------------- Pretty echo helpers --------------------------------- @@ -143,6 +210,8 @@ class SeedTarget(NamedTuple): def _engine_name_from_executor(executor: Any) -> str: """Infer a human/CFG-facing engine name from the executor object.""" + if getattr(executor, "spark", None) is not None: + return "spark" eng = getattr(executor, "engine", None) if eng is not None: name = getattr(getattr(eng, "dialect", None), "name", None) @@ -214,6 +283,181 @@ def _resolve_schema_and_table_by_cfg( # ------------------------------ Materialization (engines) ------------------------------ +# ------------------------------------------------------------ +# Engine-specifig Handlers +# ------------------------------------------------------------ + + +def _handle_duckdb(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: + """Versucht DuckDB zu erkennen und zu bedienen. 
Gibt True zurück, wenn ausgeführt.""" + con = getattr(executor, "con", None) + if con is None: + return False + + try: + import duckdb as _dd # Noqa PLC0415 + + is_duck_con = isinstance(con, _dd.DuckDBPyConnection) + except Exception: + is_duck_con = all(hasattr(con, m) for m in ("register", "execute")) + + if not is_duck_con: + return False + + full_name = _qualify(table, schema) + created_schema = False + if schema and not _is_qualified(table): + con.execute(f"create schema if not exists {_dq(schema)}") + created_schema = True + + t0 = perf_counter() + tmp = f"_ff_seed_{uuid.uuid4().hex[:8]}" + con.register(tmp, df) + try: + con.execute(f"create or replace table {full_name} as select * from {_dq(tmp)}") + finally: + with suppress(Exception): + con.unregister(tmp) # duckdb >= 0.8 + with suppress(Exception): + con.execute(f"drop view if exists {_dq(tmp)}") + + dt_ms = int((perf_counter() - t0) * 1000) + _echo_seed_line( + full_name=full_name, + rows=len(df), + cols=df.shape[1], + engine="duckdb", + ms=dt_ms, + created_schema=created_schema, + action="replaced", + ) + return True + + +def _handle_sqlalchemy(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: + """Versucht SQLAlchemy-Engine/-Connection zu erkennen und zu bedienen.""" + eng = getattr(executor, "engine", None) + if eng is None: + return False + # heuristik: viele SQLAlchemy-Engines haben 'sqlalchemy' im Modulpfad der Klasse + if "sqlalchemy" not in getattr(eng.__class__, "__module__", ""): + return False + + t0 = perf_counter() + # pandas übernimmt die DDL/DML — replace-Semantik wie im Original + df.to_sql(table, eng, if_exists="replace", index=False, schema=schema, method="multi") + dt_ms = int((perf_counter() - t0) * 1000) + + dialect = getattr(getattr(eng, "dialect", None), "name", "sqlalchemy") + _echo_seed_line( + full_name=_qualify(table, schema), + rows=len(df), + cols=df.shape[1], + engine=dialect, + ms=dt_ms, + created_schema=False, + action="replaced", + ) + return True 
+ + +def _handle_spark(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: + """Versucht Spark/Databricks zu erkennen und zu bedienen.""" + spark = getattr(executor, "spark", None) + if spark is None: + return False + + def _spark_ident(name: str) -> str: + return name.replace("`", "``") + + created_schema = False + if schema and not _is_qualified(table): + spark.sql(f"CREATE DATABASE IF NOT EXISTS `{_spark_ident(schema)}`") + created_schema = True + parts = [schema, table] + else: + parts = table.split(".") + + parts = [p for p in parts if p] + target_identifier = ".".join(parts) + target_sql = ".".join(f"`{_spark_ident(p)}`" for p in parts) + target_location = _spark_table_location(parts, spark) + + table_format = getattr(executor, "spark_table_format", None) + table_options = getattr(executor, "spark_table_options", None) or {} + + storage_meta = storage.get_seed_storage(target_identifier) + + t0 = perf_counter() + sdf = spark.createDataFrame(df) + cleanup_hint = None + + if storage_meta.get("path"): + storage.spark_write_to_path( + spark, + target_identifier, + sdf, + storage=storage_meta, + default_format=table_format, + default_options=table_options, + ) + cleanup_hint = "custom path" + else: + with suppress(Exception): + spark.sql(f"DROP TABLE IF EXISTS {target_sql}") + if target_location and target_location.exists(): + with suppress(Exception): + shutil.rmtree(target_location, ignore_errors=True) + cleanup_hint = "reset location" + + def _write() -> None: + writer = sdf.write.mode("overwrite") + if table_format: + writer = writer.format(table_format) + if table_options: + writer = writer.options(**table_options) + writer.saveAsTable(target_identifier) + + try: + _write() + except _SparkAnalysisException as exc: + message = str(exc) + if target_location and "LOCATION_ALREADY_EXISTS" in message.upper(): + with suppress(Exception): + shutil.rmtree(target_location, ignore_errors=True) + cleanup_hint = "reset location" + _write() + else: + 
raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc + except Exception as exc: + raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc + + dt_ms = int((perf_counter() - t0) * 1000) + _echo_seed_line( + full_name=target_sql, + rows=len(df), + cols=df.shape[1], + engine="spark", + ms=dt_ms, + created_schema=created_schema, + action="replaced", + extra=cleanup_hint, + ) + return True + + +# ------------------------------------------------------------ +# Dispatcher +# ------------------------------------------------------------ + +Handler = Callable[[str, pd.DataFrame, Any, str | None], bool] + +_HANDLERS: Iterable[Handler] = ( + _handle_duckdb, + _handle_sqlalchemy, + _handle_spark, +) + def materialize_seed( table: str, df: pd.DataFrame, executor: Any, schema: str | None = None @@ -221,74 +465,14 @@ def materialize_seed( """ Materialize a DataFrame as a database table across engines. - DuckDB: - - Registers a temporary view for the DataFrame and performs - CREATE OR REPLACE TABLE . AS SELECT * FROM . - - Ensures CREATE SCHEMA IF NOT EXISTS when requested. - - SQLAlchemy engines (e.g., Postgres): - - Uses pandas.DataFrame.to_sql(if_exists='replace', schema=schema). - - Raises: - RuntimeError if no supported executor connection is detected. + Engine-spezifische Logik ist in dedizierten Handlern gekapselt + (_handle_duckdb/_handle_sqlalchemy/_handle_spark). Der Dispatcher + ruft sie der Reihe nach auf, bis einer übernimmt. 
""" - # DuckDB path (robust detection) - con = getattr(executor, "con", None) - if con is not None: - try: - is_duck_con = isinstance(con, _dd.DuckDBPyConnection) - except Exception: - is_duck_con = all(hasattr(con, m) for m in ("register", "execute")) - - if is_duck_con: - full_name = _qualify(table, schema) - created_schema = False - if schema and not _is_qualified(table): - con.execute(f"create schema if not exists {_dq(schema)}") - created_schema = True - - t0 = perf_counter() - tmp = f"_ff_seed_{uuid.uuid4().hex[:8]}" - con.register(tmp, df) - try: - con.execute(f"create or replace table {full_name} as select * from {_dq(tmp)}") - finally: - try: - con.unregister(tmp) # duckdb >= 0.8 - except Exception: - con.execute(f"drop view if exists {_dq(tmp)}") - dt_ms = int((perf_counter() - t0) * 1000) - - _echo_seed_line( - full_name=full_name, - rows=len(df), - cols=df.shape[1], - engine="duckdb", - ms=dt_ms, - created_schema=created_schema, - action="replaced", - ) + for handler in _HANDLERS: + if handler(table, df, executor, schema): return - # SQLAlchemy Engine path - eng = getattr(executor, "engine", None) - if eng is not None and "sqlalchemy" in eng.__class__.__module__: - t0 = perf_counter() - df.to_sql(table, eng, if_exists="replace", index=False, schema=schema, method="multi") - dt_ms = int((perf_counter() - t0) * 1000) - dialect = getattr(getattr(eng, "dialect", None), "name", "sqlalchemy") - _echo_seed_line( - full_name=_qualify(table, schema), - rows=len(df), - cols=df.shape[1], - engine=dialect, - ms=dt_ms, - created_schema=False, - action="replaced", - ) - return - - # Fallback (not implemented): you could emit VALUES via executor.execute(sql) for tiny seeds. raise RuntimeError("No compatible executor connection for seeding found.") @@ -322,7 +506,7 @@ def seed_project(project_dir: Path, executor: Any, default_schema: str | None = Number of successfully materialized seed tables. 
Raises: - ValueError if schema.yml uses a plain stem key while multiple files share that stem. + ValueError: if schema.yml uses a plain stem key while multiple files share that stem. """ seeds_dir = project_dir / "seeds" if not seeds_dir.exists(): diff --git a/src/fastflowtransform/settings.py b/src/fastflowtransform/settings.py index c029f65..08dbdc1 100644 --- a/src/fastflowtransform/settings.py +++ b/src/fastflowtransform/settings.py @@ -6,6 +6,7 @@ from typing import Annotated, Any, Literal, cast import yaml +from jinja2 import Environment, StrictUndefined from pydantic import BaseModel, Field, TypeAdapter from pydantic_settings import BaseSettings, SettingsConfigDict @@ -34,6 +35,12 @@ class DatabricksSparkConfig(BaseModel): master: str = "local[*]" app_name: str = "fastflowtransform" extra_conf: dict[str, Any] | None = None + warehouse_dir: str | None = None + use_hive_metastore: bool = False + catalog: str | None = None + database: str | None = None + table_format: str = "parquet" + table_options: dict[str, Any] | None = None class SnowflakeSnowparkConfig(BaseModel): @@ -112,6 +119,10 @@ class EnvSettings(BaseSettings): # databricks spark DBR_MASTER: str | None = None DBR_APPNAME: str | None = None + DBR_ENABLE_HIVE: int | None = None + DBR_WAREHOUSE_DIR: str | None = None + DBR_TABLE_FORMAT: str | None = None + DBR_DATABASE: str | None = None # snowflake snowpark SF_ACCOUNT: str | None = None @@ -148,7 +159,26 @@ def load_profiles(project_dir: Path) -> dict: pf_path = project_dir / "profiles.yml" if not pf_path.exists(): return {} - return yaml.safe_load(pf_path.read_text(encoding="utf-8")) or {} + raw_text = pf_path.read_text(encoding="utf-8") + rendered = _render_profiles_template(raw_text, project_dir) + return yaml.safe_load(rendered) or {} + + +def _render_profiles_template(text: str, project_dir: Path) -> str: + def _env(name: str, default: str | None = "") -> str: + val = os.getenv(name) + if val is not None: + return val + return "" if default is 
None else str(default) + + jenv = Environment(autoescape=False, undefined=StrictUndefined) + jenv.globals["env"] = _env + jenv.globals["project_dir"] = lambda: str(project_dir) + template = jenv.from_string(text) + try: + return template.render() + except Exception as exc: + raise ProfileConfigError(f"Failed to render profiles.yml: {exc}") from exc # ---------- Resolver ---------- @@ -217,6 +247,16 @@ def _ov_databricks_spark(raw: dict[str, Any], env: EnvSettings) -> None: dbr = raw.setdefault("databricks_spark", {}) _set_if(dbr, "master", getattr(env, "DBR_MASTER", None)) _set_if(dbr, "app_name", getattr(env, "DBR_APPNAME", None)) + _set_if(dbr, "warehouse_dir", getattr(env, "DBR_WAREHOUSE_DIR", None)) + _set_if(dbr, "table_format", getattr(env, "DBR_TABLE_FORMAT", None)) + _set_if(dbr, "database", getattr(env, "DBR_DATABASE", None)) + + enable_hive = getattr(env, "DBR_ENABLE_HIVE", None) + if enable_hive is not None: + if isinstance(enable_hive, str): + dbr["use_hive_metastore"] = enable_hive.strip().lower() in {"1", "true", "yes", "on"} + else: + dbr["use_hive_metastore"] = bool(enable_hive) # ggf. 
weitere Connect-Parameter hier setzen diff --git a/src/fastflowtransform/storage.py b/src/fastflowtransform/storage.py new file mode 100644 index 0000000..a53d12f --- /dev/null +++ b/src/fastflowtransform/storage.py @@ -0,0 +1,144 @@ +# fastflowtransform/storage.py +from __future__ import annotations + +import shutil +from collections.abc import Mapping +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +@dataclass +class _StorageRegistry: + model: dict[str, dict[str, Any]] = field(default_factory=dict) + seed: dict[str, dict[str, Any]] = field(default_factory=dict) + + +_STORAGE = _StorageRegistry() + + +def _sanitize_key(name: str) -> str: + return name.replace("`", "").replace('"', "").strip() + + +def normalize_storage_map( + raw: Mapping[str, Any] | None, *, project_dir: Path +) -> dict[str, dict[str, Any]]: + if not raw or not isinstance(raw, Mapping): + return {} + + normalized: dict[str, dict[str, Any]] = {} + for key, cfg in raw.items(): + if not isinstance(cfg, Mapping): + continue + + entry: dict[str, Any] = {} + + if "path" in cfg and cfg["path"] is not None: + p = Path(str(cfg["path"])) + if not p.is_absolute(): + p = (project_dir / p).resolve() + entry["path"] = str(p) + + fmt = cfg.get("format") + if fmt: + entry["format"] = str(fmt) + + options = cfg.get("options") + if isinstance(options, Mapping): + entry["options"] = {str(k): v for k, v in options.items()} + + if entry: + normalized[_sanitize_key(str(key))] = entry + return normalized + + +def set_model_storage(mapping: Mapping[str, dict[str, Any]] | None) -> None: + _STORAGE.model = dict(mapping or {}) + + +def set_seed_storage(mapping: Mapping[str, dict[str, Any]] | None) -> None: + _STORAGE.seed = dict(mapping or {}) + + +def _lookup(storage_map: Mapping[str, dict[str, Any]], candidates: list[str]) -> dict[str, Any]: + for cand in candidates: + key = _sanitize_key(cand) + meta = storage_map.get(key) + if meta: + return dict(meta) + return {} + + 
+def get_model_storage(name: str) -> dict[str, Any]: + candidates = [name] + clean = _sanitize_key(name) + if clean.endswith(".ff"): + candidates.append(clean[:-3]) + else: + candidates.append(f"{clean}.ff") + parts = [p for p in clean.split(".") if p] + if parts: + candidates.append(parts[-1]) + return _lookup(_STORAGE.model, candidates) + + +def get_seed_storage(name: str) -> dict[str, Any]: + clean = _sanitize_key(name) + parts = [p for p in clean.split(".") if p] + candidates = [clean] + if parts: + candidates.append(parts[-1]) + return _lookup(_STORAGE.seed, candidates) + + +def spark_write_to_path( + spark: Any, + identifier: str, + df: Any, + *, + storage: Mapping[str, Any], + default_format: str | None = None, + default_options: Mapping[str, Any] | None = None, +) -> None: + """ + Persist a Spark DataFrame to an explicit filesystem location and register it as a table. + """ + path = storage.get("path") + if not path: + raise ValueError("storage path override requires 'path'") + + fmt = storage.get("format") or default_format + options = dict(default_options or {}) + extra_opts = storage.get("options") or {} + if isinstance(extra_opts, Mapping): + options.update({str(k): v for k, v in extra_opts.items()}) + + parts = [_sanitize_key(part) for part in identifier.split(".") if part] + if not parts: + raise ValueError(f"Invalid Spark identifier: {identifier}") + + def _quote(part: str) -> str: + return "`" + part.replace("`", "``") + "`" + + target_sql = ".".join(_quote(p) for p in parts) + + spark.sql(f"DROP TABLE IF EXISTS {target_sql}") + + path_str = str(path) + if "://" not in path_str: + target_path = Path(path_str) + if target_path.exists(): + shutil.rmtree(target_path, ignore_errors=True) + target_path.parent.mkdir(parents=True, exist_ok=True) + + writer = df.write.mode("overwrite") + if fmt: + writer = writer.format(fmt) + if options: + writer = writer.options(**options) + writer.save(path_str) + + using_clause = f"USING {fmt}" if fmt else "" + 
escaped_path = path_str.replace("'", "''") + spark.sql(f"CREATE TABLE {target_sql} {using_clause} LOCATION '{escaped_path}'") diff --git a/src/fastflowtransform/test_registry.py b/src/fastflowtransform/test_registry.py index 3997314..0eb7e81 100644 --- a/src/fastflowtransform/test_registry.py +++ b/src/fastflowtransform/test_registry.py @@ -9,9 +9,9 @@ class Runner(Protocol): """Callable signature for a generic test runner. Returns: - ok: Whether the test passed. - message: Optional human-friendly message (usually set on failure). - example_sql: Optional example SQL (shown in summary on failure). + ok (bool): Whether the test passed. + message (str | None): Optional human-friendly message (usually set on failure). + example_sql (str | None): Optional example SQL (shown in summary on failure). """ def __call__( diff --git a/src/fastflowtransform/utest.py b/src/fastflowtransform/utest.py index e448bd7..0b50db2 100644 --- a/src/fastflowtransform/utest.py +++ b/src/fastflowtransform/utest.py @@ -476,167 +476,6 @@ def validate_inputs_cover_deps(node: Node, inputs: dict[str, dict]) -> tuple[lis return expected, missing -# def run_unit_specs( -# specs, -# executor, -# jenv, -# only_case=None, -# *, -# cache_mode: str = "off", -# reuse_meta: bool = False, -# ): -# """ -# Execute discovered unit-test specs. Returns the number of failed cases. - -# Args: -# cache_mode: 'off' | 'ro' | 'rw'. Default 'off' for deterministic runs. -# (Reserved for future accelerations; currently no-op.) -# reuse_meta: If True, avoid meta cleanup between cases (reserved; currently no-op). 
-# """ -# # Normalize cache_mode: accept Enum or str; store as lower-case str -# if not isinstance(cache_mode, str): -# cache_mode = getattr(cache_mode, "value", str(cache_mode)) -# cache_mode = cache_mode.lower() - -# if cache_mode not in {"off", "ro", "rw"}: -# raise ValueError(f"unknown cache_mode: {cache_mode}") - -# failures = 0 - -# # ---- Build a cache context for utests (profile name 'utest') ---- -# try: -# project_dir = REGISTRY.get_project_dir() -# except Exception: -# project_dir = None - -# # Best-effort engine detection for env_ctx/cache keying -# if hasattr(executor, "con"): # duckdb -# engine_name = "duckdb" -# elif hasattr(executor, "engine"): # postgres -# engine_name = "postgres" -# elif hasattr(executor, "client"): # bigquery -# engine_name = "bigquery" -# else: -# engine_name = "unknown" - -# env_ctx: EnvCtx = build_env_ctx( -# engine=engine_name, -# profile_name="utest", -# relevant_env_keys=[k for k in os.environ if k.startswith("FF_")], -# sources=getattr(REGISTRY, "sources", {}), -# ) - -# cache = None -# if project_dir is not None: -# cache = FingerprintCache(project_dir, profile="utest", engine=engine_name) -# cache.load() - -# computed_fps: dict[str, str] = {} - -# for spec in specs: -# node = REGISTRY.nodes.get(spec.model) -# if not node: -# print(f"⚠️ Model '{spec.model}' not found (in {spec.path})") -# failures += 1 -# continue - -# for case in spec.cases: -# if only_case and case.name != only_case: -# continue -# print(f"→ {spec.model} :: {case.name}") - -# # Optional meta hygiene per case -# if not reuse_meta: -# with suppress(Exception): -# delete_meta_for_node(executor, node.name) - -# # ----- Fingerprint for THIS CASE ----- -# # 1) case id → keeps cases distinct -# dep_fps: dict[str, str] = { -# "__case__": f"{getattr(spec, 'path', 'spec')}::{getattr(case, 'name', 'case')}" -# } -# # 2) inputs → invalidates cache when unit-test inputs change -# dep_fps["__inputs__"] = _fingerprint_case_inputs(spec, case) - -# cand_fp: str | None 
= None -# try: -# if node.kind == "sql": -# # Render the SQL exactly like the executor would -# sql = executor.render_sql( -# node, -# jenv, -# ref_resolver=lambda nm: executor._resolve_ref(nm, jenv), -# source_resolver=executor._resolve_source, -# ) -# cand_fp = fingerprint_sql( -# node=node, rendered_sql=sql, env_ctx=env_ctx, dep_fps=dep_fps -# ) -# else: -# func = REGISTRY.py_funcs[node.name] -# src = get_function_source(func) -# cand_fp = fingerprint_py( -# node=node, func_src=src, env_ctx=env_ctx, dep_fps=dep_fps -# ) -# except Exception: -# # Fingerprint is best-effort in utests; if it fails, behave as cache miss. -# cand_fp = None - -# # 1) Inputs laden/prüfen (inkrementiert failures bei ungültigen Inputs) -# failures += _load_inputs_for_case(executor, spec, case, node) - -# # 2) Optional: skip execution on cache hit -# materialized = (getattr(node, "meta", {}) or {}).get("materialized", "table") -# skip = False -# if ( -# cand_fp -# and cache is not None -# and cache_mode in {"ro", "rw"} -# and can_skip_node( -# node_name=node.name, -# new_fp=cand_fp, -# cache=cache, -# executor=executor, -# materialized=materialized, -# ) -# ): -# # Only skip if both fingerprint AND relation existence agree (via helper) -# print(" ↻ skipped (utest cache hit)") -# skip = True - -# if not skip: -# ok, err = _execute_node(executor, node, jenv) -# if not ok: -# print(f" ❌ execution failed: {err}") -# failures += 1 -# continue -# # Update cache map if we are in a writing mode -# if cand_fp and cache is not None and cache_mode == "rw": -# computed_fps[node.name] = cand_fp - -# # 3) Ergebnis lesen -# ok, df_or_exc, target_rel = _read_target_df(executor, spec, case) -# if not ok: -# print(f" ❌ cannot read result '{target_rel}': {df_or_exc}") -# failures += 1 -# continue -# df = df_or_exc - -# # 4) Erwartungen prüfen -# ok, msg = _assert_expected_rows(df, case) -# if ok: -# print(" ✅ ok") -# else: -# print(f" ❌ {msg}") -# failures += 1 - -# # Persist cache once at the end (only for 
rw) -# if cache is not None and computed_fps and cache_mode == "rw": -# cache.update_many(computed_fps) -# cache.save() - -# return failures - - @dataclass class UtestCtx: executor: Any @@ -663,7 +502,7 @@ def _detect_engine_name(executor: Any) -> str: return "duckdb" if hasattr(executor, "engine"): return "postgres" - if hasattr(executor, "client"): + if hasattr(executor, "client"): # pragma: no cover return "bigquery" return "unknown" @@ -699,7 +538,7 @@ def _fingerprint_case(node: Any, spec: Any, case: Any, ctx: UtestCtx) -> str | N "__inputs__": _fingerprint_case_inputs(spec, case), } try: - if node.kind == "sql": + if node.kind == "sql": # pragma: no cover sql = ctx.executor.render_sql( node, ctx.jenv, @@ -714,7 +553,7 @@ def _fingerprint_case(node: Any, spec: Any, case: Any, ctx: UtestCtx) -> str | N src = get_function_source(func) return fingerprint_py(node=node, func_src=src, env_ctx=ctx.env_ctx, dep_fps=dep_fps) except Exception: - return None # Fingerprint optional + return None # fingerprint optional def _maybe_skip_by_cache(node: Any, cand_fp: str | None, ctx: UtestCtx) -> bool: @@ -729,7 +568,7 @@ def _maybe_skip_by_cache(node: Any, cand_fp: str | None, ctx: UtestCtx) -> bool: materialized=materialized, ): print(" ↻ skipped (utest cache hit)") - if ctx.cache_mode == "rw": # optional: beim Skip nicht nötig, aber harmless + if ctx.cache_mode == "rw": ctx.computed_fps.setdefault(node.name, cand_fp) return True return False @@ -826,7 +665,7 @@ def run_unit_specs( _read_and_assert(spec, case, ctx) # Cache persistieren (nur rw) - if ctx.cache and ctx.computed_fps and ctx.cache_mode == "rw": + if ctx.cache and ctx.computed_fps and ctx.cache_mode == "rw": # pragma: no cover ctx.cache.update_many(ctx.computed_fps) ctx.cache.save() diff --git a/src/fastflowtransform/validation.py b/src/fastflowtransform/validation.py index ea53c58..9dd9255 100644 --- a/src/fastflowtransform/validation.py +++ b/src/fastflowtransform/validation.py @@ -1,4 +1,4 @@ -# 
src/fastflowtransform/validation.py (neu) +# src/fastflowtransform/validation.py from __future__ import annotations from typing import Any diff --git a/tests/README.md b/tests/README.md index 72d4956..ee23711 100644 --- a/tests/README.md +++ b/tests/README.md @@ -15,8 +15,12 @@ We use markers to make selective runs easy: | `duckdb` | Requires DuckDB executor/fixtures | | `postgres`| Requires Postgres backend | | `cli` | Runs the CLI or Typer commands | +| `artifacts`| Exercises artifact generation helpers | +| `render` | Exercises render-time template helpers | +| `schema` | Exercises schema parsing/validation helpers | | `streaming`| Exercises streaming/sessionizer features | | `slow` | Slower end-to-end scenarios | +| `http` | Exercises the HTTP API client/cache | Example selective runs: diff --git a/tests/common/fixtures.py b/tests/common/fixtures.py index 7d596c5..4092c6a 100644 --- a/tests/common/fixtures.py +++ b/tests/common/fixtures.py @@ -2,13 +2,18 @@ import os from contextlib import suppress from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock, patch +import pandas as pd import psycopg import pytest from jinja2 import Environment, FileSystemLoader, select_autoescape from psycopg import sql +from fastflowtransform import utest from fastflowtransform.core import REGISTRY +from fastflowtransform.executors.databricks_spark_exec import DatabricksSparkExecutor from tests.common.utils import ROOT, run @@ -93,3 +98,113 @@ def pg_seeded(pg_project, pg_env): conn.commit() run(["fft", "seed", str(pg_project), "--env", "stg"], pg_env) yield + + +# ---- Spark ---- +@pytest.fixture +def exec_minimal(monkeypatch): + with patch("fastflowtransform.executors.databricks_spark_exec.SparkSession") as SP: + fake_spark = MagicMock() + SP.builder.master.return_value.appName.return_value.getOrCreate.return_value = fake_spark + ex = DatabricksSparkExecutor() + # accept mocks as frames in unit tests + monkeypatch.setattr(ex, "_is_frame", 
lambda obj: True) + return ex + + +@pytest.fixture +def exec_factory(): + """ + Build a DatabricksSparkExecutor with arbitrary __init__ kwargs, + but always with mocked SparkSession (no real JVM). + Returns (executor, fake_builder, fake_spark). + """ + + def _make(**kwargs): + with patch("fastflowtransform.executors.databricks_spark_exec.SparkSession") as SP: + fake_builder = SP.builder.master.return_value.appName.return_value + # make .config(...) chainable + fake_builder.config.return_value = fake_builder + fake_builder.enableHiveSupport.return_value = fake_builder + fake_spark = MagicMock() + fake_builder.getOrCreate.return_value = fake_spark + + ex = DatabricksSparkExecutor(**kwargs) + return ex, fake_builder, fake_spark + + return _make + + +@pytest.fixture(scope="session") +def spark_tmpdir(tmp_path_factory: pytest.TempPathFactory) -> Path: + return tmp_path_factory.mktemp("spark_wh") + + +@pytest.fixture(scope="session") +def spark_exec(spark_tmpdir: Path) -> DatabricksSparkExecutor: + return DatabricksSparkExecutor( + master="local[*]", + app_name="fft-it", + warehouse_dir=str(spark_tmpdir), + database="default", + ) + + +# ---- utest ---- +@pytest.fixture +def fake_registry(tmp_path, monkeypatch): + # wir brauchen ein REGISTRY mit projekt-dir und 1 node + node = SimpleNamespace(name="model_a", kind="sql", deps=["src1"]) + reg = SimpleNamespace( + nodes={"model_a": node}, + sources={}, + get_project_dir=lambda: tmp_path, + ) + monkeypatch.setattr(utest, "REGISTRY", reg) + # relation_for -> immer schema.model + monkeypatch.setattr(utest, "relation_for", lambda name: f"public.{name}") + return reg + + +@pytest.fixture +def duckdb_executor(): + """ + Fake-Executor, der dem DuckDB-Pfad ähnelt: + - hat .con + - con.register(...) + - con.execute(...) 
+ - con.table(...).df() + """ + con = MagicMock() + # für _read_result (duckdb) + table_df = pd.DataFrame([{"id": 1}]) + con.table.return_value.df.return_value = table_df + + class DuckEx: + def __init__(self, con): + self.con = con + + # für _execute_node(sql) + def run_sql(self, node, jenv): + # schreibt nix, simuliert nur Erfolg + return None + + def run_python(self, node): + return None + + return DuckEx(con) + + +@pytest.fixture +def postgres_executor(): + """ + Fake-Executor für den Postgres-Zweig in _read_result. + """ + engine = MagicMock() + + class PgEx: + def __init__(self, engine): + self.engine = engine + self.schema = "public" + + return PgEx(engine) diff --git a/tests/artifacts/test_catalog_duckdb.py b/tests/integration/artifacts/test_catalog_duckdb_integration.py similarity index 93% rename from tests/artifacts/test_catalog_duckdb.py rename to tests/integration/artifacts/test_catalog_duckdb_integration.py index b12ac59..23d202b 100644 --- a/tests/artifacts/test_catalog_duckdb.py +++ b/tests/integration/artifacts/test_catalog_duckdb_integration.py @@ -1,11 +1,15 @@ import json from pathlib import Path +import pytest + from fastflowtransform.artifacts import write_catalog from fastflowtransform.core import REGISTRY from fastflowtransform.executors.duckdb_exec import DuckExecutor +@pytest.mark.integration +@pytest.mark.duckdb def test_catalog_duckdb(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) (tmp_path / "models" / "t.ff.sql").write_text( diff --git a/tests/integration/executors/test_databricks_spark_exec_integration.py b/tests/integration/executors/test_databricks_spark_exec_integration.py new file mode 100644 index 0000000..15ec0b6 --- /dev/null +++ b/tests/integration/executors/test_databricks_spark_exec_integration.py @@ -0,0 +1,91 @@ +# tests/integration/executors/test_databricks_spark_exec_integration.py +from __future__ import annotations + +from pathlib import Path + +import pytest + +pyspark = pytest.importorskip("pyspark") + 
+from fastflowtransform.core import Node # noqa: E402 +from fastflowtransform.errors import ModelExecutionError # noqa: E402 +from fastflowtransform.executors.databricks_spark_exec import DatabricksSparkExecutor # noqa: E402 + + +@pytest.mark.integration +@pytest.mark.spark +def test_create_table_and_exists(spark_exec: DatabricksSparkExecutor): + spark_exec.create_table_as("default.it_users", "SELECT 1 AS id, 'x' AS name") + assert spark_exec.exists_relation("default.it_users") + assert spark_exec.exists_relation("it_users") + + +@pytest.mark.integration +@pytest.mark.spark +def test_incremental_insert_integration(spark_exec: DatabricksSparkExecutor): + spark_exec.create_table_as("it_inc", "SELECT 1 AS id") + spark_exec.incremental_insert("it_inc", "SELECT 2 AS id") + rows = [tuple(r) for r in spark_exec.spark.sql("SELECT * FROM it_inc ORDER BY id").collect()] + assert rows == [(1,), (2,)] + + +@pytest.mark.integration +@pytest.mark.spark +def test_incremental_merge_integration(spark_exec: DatabricksSparkExecutor): + spark_exec.create_table_as("it_merge", "SELECT 1 AS id, 'old' AS v") + sql = """ + SELECT * FROM ( + SELECT 1 AS id, 'new' AS v + UNION ALL + SELECT 2 AS id, 'other' AS v + ) s + """ + spark_exec.incremental_merge("it_merge", sql, unique_key=["id"]) + rows = {(r["id"], r["v"]) for r in spark_exec.spark.sql("SELECT * FROM it_merge").collect()} + assert rows == {(1, "new"), (2, "other")} + + +@pytest.mark.integration +@pytest.mark.spark +def test_alter_table_sync_schema_integration(spark_exec: DatabricksSparkExecutor): + spark_exec.create_table_as("it_schema", "SELECT 1 AS id") + spark_exec.alter_table_sync_schema("it_schema", "SELECT 1 AS id, 'x' AS extra") + cols = {f.name for f in spark_exec.spark.table("it_schema").schema.fields} + assert {"id", "extra"}.issubset(cols) + + +@pytest.mark.integration +@pytest.mark.spark +def test_create_or_replace_table_wraps_error(spark_exec: DatabricksSparkExecutor): + bad_sql = "SELECT * FROM not_there" + node = 
Node(name="bad_node", kind="sql", path=Path("dummy")) + with pytest.raises(ModelExecutionError): + spark_exec._create_or_replace_table("default.bad_tbl", bad_sql, node) + + +@pytest.mark.integration +@pytest.mark.spark +def test_materialize_relation_real(spark_exec: DatabricksSparkExecutor): + df = spark_exec.spark.createDataFrame([(1, "x")], ["id", "val"]) + node = Node(name="it_node", kind="python", path=Path("x")) + spark_exec._materialize_relation("default.it_tbl_mr", df, node) + rows = [tuple(r) for r in spark_exec.spark.sql("SELECT * FROM default.it_tbl_mr").collect()] + assert rows == [(1, "x")] + + +@pytest.mark.integration +@pytest.mark.spark +def test_create_view_over_table_real(spark_exec: DatabricksSparkExecutor): + """Create a table and a view over it using simple, backtick-safe names.""" + # 1) create a table WITHOUT a dot in the name + spark_exec.create_table_as("src_tbl", "SELECT 1 AS id") + + # 2) create a view over it + spark_exec._create_view_over_table( + "v_src_tbl", + "src_tbl", # <- no dot, so backticks are fine + Node(name="x", kind="sql", path=Path(".")), + ) + + rows = [tuple(r) for r in spark_exec.spark.sql("SELECT * FROM v_src_tbl").collect()] + assert rows == [(1,)] diff --git a/tests/integration/test_profiles_validation.py b/tests/integration/test_profiles_validation.py index 2fb79bb..06ce44e 100644 --- a/tests/integration/test_profiles_validation.py +++ b/tests/integration/test_profiles_validation.py @@ -1,6 +1,7 @@ # tests/integration/test_profiles_validation.py from __future__ import annotations +import os import textwrap from pathlib import Path @@ -124,9 +125,14 @@ def test_profiles_validation( env_kwargs: dict, expect_error: bool, expect_substring: str | None, + monkeypatch, ): _write_profiles(tmp_path, profiles_yml) + # Ensure FF_* env vars from the outer environment do not affect expectations. 
+ for key in [k for k in os.environ if k.startswith("FF_")]: + monkeypatch.delenv(key, raising=False) + env = EnvSettings(**env_kwargs) if expect_error: diff --git a/tests/test_registry/test_dispatch.py b/tests/integration/test_registry/test_dispatch_integration.py similarity index 92% rename from tests/test_registry/test_dispatch.py rename to tests/integration/test_registry/test_dispatch_integration.py index f3d5ae9..d87a352 100644 --- a/tests/test_registry/test_dispatch.py +++ b/tests/integration/test_registry/test_dispatch_integration.py @@ -1,7 +1,11 @@ +import pytest + from fastflowtransform.executors.duckdb_exec import DuckExecutor from fastflowtransform.test_registry import TESTS +@pytest.mark.integration +@pytest.mark.duckdb def test_registry_not_null_and_unique_and_params_and_sql(): ex = DuckExecutor(":memory:") ex.con.execute("create table t(id int, email varchar)") diff --git a/tests/api/test_http_cache_modes.py b/tests/unit/api/http/test_http_cache_modes_unit.py similarity index 96% rename from tests/api/test_http_cache_modes.py rename to tests/unit/api/http/test_http_cache_modes_unit.py index fd19f63..d3a2d18 100644 --- a/tests/api/test_http_cache_modes.py +++ b/tests/unit/api/http/test_http_cache_modes_unit.py @@ -13,6 +13,8 @@ def _seed_cache(http_mod, cache_dir: Path, url: str, params: dict | None, body_o http_mod._write_cache(key, 200, {}, body, url) +@pytest.mark.unit +@pytest.mark.http def test_http_cache_mode_off_disables_cache(monkeypatch, tmp_path): """ Mode=off: the cache is ignored; in offline mode this raises because @@ -34,6 +36,8 @@ def test_http_cache_mode_off_disables_cache(monkeypatch, tmp_path): assert "offline" in str(e.value).lower() and "cache miss" in str(e.value).lower() +@pytest.mark.unit +@pytest.mark.http def test_http_cache_mode_ro_does_not_write(monkeypatch, tmp_path): """ Mode=ro: reads are allowed, writes are forbidden. 
diff --git a/tests/api/test_http_offline_cache.py b/tests/unit/api/http/test_http_offline_cache_unit.py similarity index 96% rename from tests/api/test_http_offline_cache.py rename to tests/unit/api/http/test_http_offline_cache_unit.py index a8dcd81..f183179 100644 --- a/tests/api/test_http_offline_cache.py +++ b/tests/unit/api/http/test_http_offline_cache_unit.py @@ -2,6 +2,8 @@ import json from pathlib import Path +import pytest + from fastflowtransform.api import context as ctx, http @@ -14,6 +16,8 @@ def _seed_cache(http_mod, cache_dir: Path, url: str, params: dict | None, body_o http_mod._write_cache(key, 200, {}, body, url) +@pytest.mark.unit +@pytest.mark.http def test_get_json_offline_cache_hit_records_stats(monkeypatch, tmp_path): # Set ENV variables and reload the module so it picks them up monkeypatch.setenv("FF_HTTP_OFFLINE", "1") diff --git a/tests/api/test_http_pagination_df.py b/tests/unit/api/http/test_http_pagination_df_unit.py similarity index 97% rename from tests/api/test_http_pagination_df.py rename to tests/unit/api/http/test_http_pagination_df_unit.py index aeb5ee1..3ace3e2 100644 --- a/tests/api/test_http_pagination_df.py +++ b/tests/unit/api/http/test_http_pagination_df_unit.py @@ -3,6 +3,7 @@ from pathlib import Path import pandas as pd +import pytest from fastflowtransform.api import context as ctx, http @@ -13,6 +14,8 @@ def _seed_cache(http_mod, cache_dir: Path, url: str, params: dict | None, body_o http_mod._write_cache(key, 200, {}, body, url) +@pytest.mark.unit +@pytest.mark.http def test_get_df_pagination_concatenates(monkeypatch, tmp_path): """ get_df with a paginator joins two offline pages correctly. 
diff --git a/tests/unit/api/test_rate_limit_unit.py b/tests/unit/api/test_rate_limit_unit.py new file mode 100644 index 0000000..9fa8109 --- /dev/null +++ b/tests/unit/api/test_rate_limit_unit.py @@ -0,0 +1,233 @@ +# tests/unit/api/test_rate_limit_unit.py +from __future__ import annotations + +from typing import Any + +import pytest + +import fastflowtransform.api.rate_limit as rl_mod + + +@pytest.fixture(autouse=True) +def _reset_rate_limiter(): + """Ensure each test runs with a clean module-level state.""" + rl_mod.reset() + yield + rl_mod.reset() + + +@pytest.mark.unit +def test_tokenbucket_try_consume_enough_tokens(monkeypatch): + """try_consume should return True and deduct tokens when bucket has enough.""" + tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) + tb._tokens = 3.0 + monkeypatch.setattr(rl_mod, "monotonic", lambda: 100.0) + + ok = tb.try_consume(2.0) + assert ok is True + assert tb._tokens == pytest.approx(1.0) + + +@pytest.mark.unit +def test_tokenbucket_try_consume_not_enough_tokens(monkeypatch): + """try_consume should return False when bucket has not enough tokens.""" + tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) + tb._tokens = 0.5 + monkeypatch.setattr(rl_mod, "monotonic", lambda: 50.0) + + ok = tb.try_consume(1.0) + assert ok is False + # token count should stay the same + assert tb._tokens == pytest.approx(0.5) + + +@pytest.mark.unit +def test_tokenbucket_refills_on_try_consume(monkeypatch): + """try_consume should trigger a refill before checking tokens.""" + tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=2.0) + tb._tokens = 0.0 + tb._last_refill = 10.0 + # now - last_refill = 1s -> +2 tokens + monkeypatch.setattr(rl_mod, "monotonic", lambda: 11.0) + + ok = tb.try_consume(1.0) + assert ok is True + # 2 - 1 = 1 + assert tb._tokens == pytest.approx(1.0) + + +@pytest.mark.unit +def test_tokenbucket_wait_does_not_block_when_enough(monkeypatch): + """wait() should return immediately if enough tokens are present.""" + 
# freeze time first so both _last_refill and later calls are the same + monkeypatch.setattr(rl_mod, "monotonic", lambda: 200.0) + + tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) + tb._tokens = 4.0 + # make sure no extra refill happens inside wait() + tb._last_refill = 200.0 + + called: dict[str, bool] = {"sleep": False} + + def fake_sleep(_dur: float) -> None: + called["sleep"] = True + + monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) + + tb.wait(2.0) + + # should not have slept + assert called["sleep"] is False + # consumed exactly 2 + assert tb._tokens == pytest.approx(2.0) + + +@pytest.mark.unit +def test_tokenbucket_wait_blocks_once_and_consumes(monkeypatch): + """wait() should sleep exactly once when tokens are not yet available.""" + tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) + tb._tokens = 0.0 + tb._last_refill = 10.0 + + # 1st call -> 10.0 → not enough → sleep(1.0) + # 2nd call after sleep -> 11.0 → +1 token → consume + times = [10.0, 11.0] + + def fake_monotonic() -> float: + return times.pop(0) + + slept_for: list[float] = [] + + def fake_sleep(dur: float) -> None: + slept_for.append(dur) + + monkeypatch.setattr(rl_mod, "monotonic", fake_monotonic) + monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) + + tb.wait(1.0) + + assert len(slept_for) == 1 + assert slept_for[0] == pytest.approx(1.0) + # after consuming the freshly refilled token + assert tb._tokens == pytest.approx(0.0) + + +@pytest.mark.unit +def test_tokenbucket_wait_disabled_does_nothing(monkeypatch): + """If capacity/rps <= 0, wait() should be a no-op.""" + tb = rl_mod.TokenBucket(capacity=0.0, refill_per_sec=1.0) + called = {"sleep": False} + + def fake_sleep(*_: float) -> None: + called["sleep"] = True + + monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) + + tb.wait(10.0) + + assert called["sleep"] is False + + +# ---------------- module-level helpers ---------------- + + +@pytest.mark.unit +def test_init_rate_limiter_creates_bucket(): + 
"""init_rate_limiter should build a TokenBucket when params are positive.""" + rl_mod.init_rate_limiter(5, 2) + assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) + assert rl_mod._STATE.rl.capacity == 5 + assert rl_mod._STATE.rl.refill_per_sec == 2 + + +@pytest.mark.unit +def test_init_rate_limiter_disables_on_zero(): + """init_rate_limiter should disable when params are non-positive.""" + rl_mod.init_rate_limiter(0, 10) + assert rl_mod._STATE.rl is None + + rl_mod.init_rate_limiter(10, 0) + assert rl_mod._STATE.rl is None + + +@pytest.mark.unit +def test_set_params_on_uninitialized_creates_when_both_given(): + """set_params should create a bucket if none exists and both positive values are passed.""" + assert rl_mod._STATE.rl is None + rl_mod.set_params(capacity=3, rps=1) + assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) + assert rl_mod._STATE.rl.capacity == 3 + assert rl_mod._STATE.rl.refill_per_sec == 1 + + +@pytest.mark.unit +def test_set_params_updates_existing(): + """set_params should rebuild the bucket based on existing values when some params are None.""" + rl_mod.init_rate_limiter(5, 2) + rl_mod.set_params(rps=10) + assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) + assert rl_mod._STATE.rl.capacity == 5 + assert rl_mod._STATE.rl.refill_per_sec == 10 + + +@pytest.mark.unit +def test_set_params_can_disable(): + """set_params should disable limiter when resulting params are non-positive.""" + rl_mod.init_rate_limiter(5, 2) + rl_mod.set_params(capacity=0) + assert rl_mod._STATE.rl is None + + +@pytest.mark.unit +def test_rate_limit_delegates_when_initialized(monkeypatch): + """rate_limit() should call wait() on the current bucket.""" + rl_mod.init_rate_limiter(5, 1) + bucket = rl_mod._STATE.rl + assert bucket is not None + + called: dict[str, Any] = {"wait": False} + + def fake_wait(cost: float = 1.0) -> None: + called["wait"] = cost + + bucket.wait = fake_wait # type: ignore[assignment] + + rl_mod.rate_limit(3.5) + assert called["wait"] 
== 3.5 + + +@pytest.mark.unit +def test_rate_limit_noop_when_uninitialized(): + """rate_limit() should just return when limiter is not initialized.""" + assert rl_mod._STATE.rl is None + rl_mod.rate_limit(10.0) + assert rl_mod._STATE.rl is None + + +@pytest.mark.unit +def test_try_consume_noop_when_uninitialized(): + """try_consume() should return True when limiter is not initialized.""" + assert rl_mod._STATE.rl is None + assert rl_mod.try_consume(999.0) is True + + +@pytest.mark.unit +def test_try_consume_delegates_when_initialized(monkeypatch): + """try_consume() should delegate to bucket.try_consume().""" + rl_mod.init_rate_limiter(5, 1) + bucket = rl_mod._STATE.rl + assert bucket is not None + + monkeypatch.setattr(bucket, "try_consume", lambda cost=1.0: cost == 1.0) + + assert rl_mod.try_consume(1.0) is True + assert rl_mod.try_consume(2.0) is False + + +@pytest.mark.unit +def test_reset_clears_state(): + """reset() should clear the module-level bucket.""" + rl_mod.init_rate_limiter(5, 1) + assert rl_mod._STATE.rl is not None + rl_mod.reset() + assert rl_mod._STATE.rl is None diff --git a/tests/artifacts/test_manifest.py b/tests/unit/artifacts/test_manifest_unit.py similarity index 93% rename from tests/artifacts/test_manifest.py rename to tests/unit/artifacts/test_manifest_unit.py index 34a397b..9397a3b 100644 --- a/tests/artifacts/test_manifest.py +++ b/tests/unit/artifacts/test_manifest_unit.py @@ -1,10 +1,14 @@ import json from pathlib import Path +import pytest + from fastflowtransform.artifacts import write_manifest from fastflowtransform.core import REGISTRY +@pytest.mark.unit +@pytest.mark.artifacts def test_manifest_minimal(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) (tmp_path / "models" / "m.ff.sql").write_text("select 1 as x", encoding="utf-8") diff --git a/tests/artifacts/test_run_result.py b/tests/unit/artifacts/test_run_result_unit.py similarity index 94% rename from tests/artifacts/test_run_result.py rename to 
tests/unit/artifacts/test_run_result_unit.py index becc04e..000a47b 100644 --- a/tests/artifacts/test_run_result.py +++ b/tests/unit/artifacts/test_run_result_unit.py @@ -1,9 +1,13 @@ import json from pathlib import Path +import pytest + from fastflowtransform.artifacts import RunNodeResult, write_run_results +@pytest.mark.unit +@pytest.mark.artifacts def test_run_results_written(tmp_path: Path): started = "2025-01-01T00:00:00+00:00" finished = "2025-01-01T00:01:00+00:00" diff --git a/tests/unit/test_cache_policy_cli.py b/tests/unit/cache/test_cache_policy_cli.py similarity index 97% rename from tests/unit/test_cache_policy_cli.py rename to tests/unit/cache/test_cache_policy_cli.py index 2a646c3..6b932ad 100644 --- a/tests/unit/test_cache_policy_cli.py +++ b/tests/unit/cache/test_cache_policy_cli.py @@ -24,9 +24,11 @@ def _mk_node(tmp: Path, name: str, *, kind: str = "sql", mat: str = "table", dep Create a minimal Node and backing file so REGISTRY.load_project() isn't required. """ deps = deps or [] + models_dir = tmp / "models" + models_dir.mkdir(parents=True, exist_ok=True) suffix = ".ff.sql" if kind == "sql" else ".ff.py" - path = tmp / f"{name}{'.sql' if name.endswith('.ff') else suffix}" - path.parent.mkdir(parents=True, exist_ok=True) + filename = f"{name}{'.sql' if name.endswith('.ff') else suffix}" + path = models_dir / filename path.write_text("-- test", encoding="utf-8") n = Node(name=name, kind=kind, path=path, deps=list(deps), meta={"materialized": mat}) return n @@ -186,6 +188,7 @@ def ctor(project: Path, profile: str, engine: str): # ------------------------------ Tests ------------------------------- +@pytest.mark.unit def test_cache_rw_noop_skips_all(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): """ RW mode: when cache hits for all nodes, a no-op run builds 0 nodes (all skipped). 
@@ -220,6 +223,7 @@ def test_cache_rw_noop_skips_all(tmp_path: Path, monkeypatch: pytest.MonkeyPatch assert calls["run_py"] == [] +@pytest.mark.unit def test_cache_ro_build_no_write(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): """ RO mode: cache miss -> build models but do not write cache. @@ -252,6 +256,7 @@ def test_cache_ro_build_no_write(tmp_path: Path, monkeypatch: pytest.MonkeyPatch assert fake_cache.updated is False or fake_cache.saved is False, "RO must not persist cache" +@pytest.mark.unit def test_rebuild_selected_builds_even_on_hit(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): """ --rebuild with a selection: selected node is built even if cache matches. diff --git a/tests/unit/test_cache_skip_logic.py b/tests/unit/cache/test_cache_skip_logic.py similarity index 97% rename from tests/unit/test_cache_skip_logic.py rename to tests/unit/cache/test_cache_skip_logic.py index 738bf0f..796895e 100644 --- a/tests/unit/test_cache_skip_logic.py +++ b/tests/unit/cache/test_cache_skip_logic.py @@ -1,6 +1,8 @@ # tests/unit/test_cache_skip_logic.py from __future__ import annotations +import pytest + from fastflowtransform.cache import FingerprintCache, can_skip_node @@ -30,6 +32,7 @@ def con(self): return _DummyExec._Con(self._present) +@pytest.mark.unit def test_can_skip_node_requires_artifact_when_non_ephemeral(tmp_path): cache = FingerprintCache(tmp_path, profile="dev", engine="duckdb") cache.entries = {"users.ff": "xxx"} @@ -55,6 +58,7 @@ def test_can_skip_node_requires_artifact_when_non_ephemeral(tmp_path): ) +@pytest.mark.unit def test_ephemeral_skip_without_artifact(tmp_path): cache = FingerprintCache(tmp_path, profile="dev", engine="duckdb") cache.entries = {"ephem.ff": "yyy"} diff --git a/tests/unit/test_cache_store.py b/tests/unit/cache/test_cache_store.py similarity index 94% rename from tests/unit/test_cache_store.py rename to tests/unit/cache/test_cache_store.py index 9790f68..70016fb 100644 --- a/tests/unit/test_cache_store.py +++ 
b/tests/unit/cache/test_cache_store.py @@ -3,9 +3,12 @@ from pathlib import Path +import pytest + from fastflowtransform.cache import FingerprintCache +@pytest.mark.unit def test_cache_persist_roundtrip(tmp_path: Path): proj = tmp_path c1 = FingerprintCache(proj, profile="dev", engine="duckdb") diff --git a/tests/unit/cli/test_docs_util_units.py b/tests/unit/cli/test_docs_util_units.py new file mode 100644 index 0000000..7b2e0c2 --- /dev/null +++ b/tests/unit/cli/test_docs_util_units.py @@ -0,0 +1,282 @@ +# tests/unit/cli/test_docs_utils_unit.py +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from types import SimpleNamespace + +import pytest +import yaml + +from fastflowtransform.cli import docs_utils + + +@pytest.mark.unit +def test_resolve_dag_out_dir_with_override(tmp_path: Path): + override = tmp_path / "custom" + out = docs_utils._resolve_dag_out_dir(tmp_path, override) + assert out == override.resolve() + + +@pytest.mark.unit +def test_resolve_dag_out_dir_from_project_yml(tmp_path: Path): + proj = tmp_path + (proj / "project.yml").write_text( + yaml.safe_dump({"docs": {"dag_dir": "build/dag"}}), + encoding="utf-8", + ) + out = docs_utils._resolve_dag_out_dir(proj, None) + assert out == (proj / "build" / "dag").resolve() + + +@pytest.mark.unit +def test_resolve_dag_out_dir_fallback(tmp_path: Path): + proj = tmp_path + out = docs_utils._resolve_dag_out_dir(proj, None) + assert out == (proj / "site" / "dag").resolve() + + +@pytest.mark.unit +@pytest.mark.parametrize( + "html,expected", + [ + (None, None), + ("", None), + ("

Hello

", "Hello"), + ("

Hi you

", "Hi you"), + ("

Hi

\n

there

", "Hi there"), + ("Text with spaces", "Text with spaces"), + ], +) +def test_strip_html(html, expected): + assert docs_utils._strip_html(html) == expected + + +@pytest.mark.unit +def test_infer_sql_ref_aliases_basic(): + sql = """ + select * + from public.my_table as t + join other.tbl o on t.id = o.id + """ + aliases = docs_utils._infer_sql_ref_aliases(sql) + # keys must be the aliases + assert aliases["t"] == "public.my_table" + assert aliases["o"] == "other.tbl" + + +@pytest.mark.unit +def test_infer_sql_ref_aliases_quoted_and_backticks(): + sql = """ + SELECT * FROM "raw"."users" u + JOIN `stg`.`orders` AS o ON u.id = o.user_id + """ + aliases = docs_utils._infer_sql_ref_aliases(sql) + + # current behavior: only outer quotes/backticks are stripped, + # inner Punkte + Quotes bleiben erhalten + assert aliases["u"] == 'raw"."users' + assert aliases["o"] == "stg`.`orders" + + +# --------------------------------------------------------------------------- +# _build_docs_manifest +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_build_docs_manifest_sql_branch(monkeypatch, tmp_path: Path): + """ + Exercise the SQL-branch: executor.render_sql + lineage.infer_sql_lineage. 
+ """ + # fake project dir + project.yml + proj = tmp_path + (proj / "project.yml").write_text( + yaml.safe_dump({"name": "MyProj"}), + encoding="utf-8", + ) + + # fake nodes in REGISTRY + # node has: name, deps, kind, meta + n1 = SimpleNamespace( + name="model_a", + deps=["model_b"], + kind="sql", + meta={"materialized": "table"}, + ) + n2 = SimpleNamespace( + name="model_b", + deps=[], + kind="sql", + meta={}, + ) + fake_nodes = {"model_a": n1, "model_b": n2} + + # fake REGISTRY + fake_env = object() + + def fake_relation_for(name: str) -> str: + # simple predictable mapping + return f"public.{name}" + + monkeypatch.setattr("fastflowtransform.cli.docs_utils.REGISTRY", SimpleNamespace(env=fake_env)) + monkeypatch.setattr("fastflowtransform.cli.docs_utils.relation_for", fake_relation_for) + + # fake executor with render_sql and _resolve_ref/_resolve_source + def fake_render_sql(node, env, ref_resolver, source_resolver): + # we return a SQL that contains aliases for lineage + return """ + select a.id, b.val + from public.model_b b + join public.model_a_src a on a.id = b.id + """ + + fake_executor = SimpleNamespace( + render_sql=fake_render_sql, + _resolve_ref=lambda name, env: f"public.{name}", + _resolve_source=lambda *a, **k: "public.src", + ) + + # fake lineage: return lineage for two cols + def fake_infer_sql_lineage(rendered_sql: str, alias_map: dict[str, str]): + return { + "id": [{"source": "public.model_b", "column": "id"}], + "val": [{"source": "public.model_a_src", "column": "val"}], + } + + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils.lineage_mod.infer_sql_lineage", + fake_infer_sql_lineage, + ) + + # fake columns: _collect_columns(executor) → relation + # -> list of objects with .name/.dtype/.nullable + class Col: + def __init__(self, name: str, dtype: str = "TEXT", nullable: bool = True): + self.name = name + self.dtype = dtype + self.nullable = nullable + + def fake_collect_columns(executor): + return { + "public.model_a": [Col("id"), 
Col("val")], + "public.model_b": [Col("id")], + } + + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils._collect_columns", + fake_collect_columns, + ) + + # docs metadata - model level + def fake_read_docs_metadata(project_dir: Path): + return { + "models": { + "model_a": { + "description_html": "

Model A

", + "columns": {"id": "ID"}, + } + }, + "columns": { + "public.model_b": {"id": "

identifier

"}, + }, + } + + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils.read_docs_metadata", + fake_read_docs_metadata, + ) + + manifest = docs_utils._build_docs_manifest( + project_dir=proj, + nodes=fake_nodes, + executor=fake_executor, + env_name="dev", + ) + + # basic shape + assert manifest["project"] == "MyProj" + assert "generated_at" in manifest + # ISO-ish timestamp + datetime.fromisoformat(manifest["generated_at"].replace("Z", "+00:00")) + models = manifest["models"] + # two models + assert {m["name"] for m in models} == {"model_a", "model_b"} + + m_a = next(m for m in models if m["name"] == "model_a") + assert m_a["relation"] == "public.model_a" + assert m_a["description"] == "Model A" + # depends_on / used_by + assert m_a["depends_on"] == ["model_b"] + # model_b should list model_a as used_by + m_b = next(m for m in models if m["name"] == "model_b") + assert m_b["used_by"] == ["model_a"] + + # columns should include lineage + cols_a = {c["name"]: c for c in m_a["columns"]} + assert "id" in cols_a + assert cols_a["id"]["lineage"] # lineage present + + +@pytest.mark.unit +def test_build_docs_manifest_python_branch(monkeypatch, tmp_path: Path): + """ + Exercise the python-branch: n.kind == "python" → lineage_mod.infer_py_lineage. 
+ """ + proj = tmp_path + + # one python model + n = SimpleNamespace( + name="py_model", + deps=[], + kind="python", + meta={}, + requires=None, + ) + fake_nodes = {"py_model": n} + + # fake REGISTRY with env + py_funcs + def fake_py_func(): + return 1 + + REGISTRY_stub = SimpleNamespace( + env=object(), + py_funcs={"py_model": fake_py_func}, + ) + monkeypatch.setattr("fastflowtransform.cli.docs_utils.REGISTRY", REGISTRY_stub) + + # relation_for stub + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils.relation_for", + lambda name: f"public.{name}", + ) + + # no columns from executor + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils._collect_columns", + lambda executor: {"public.py_model": []}, + ) + + # docs metadata empty + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils.read_docs_metadata", + lambda project_dir: {}, + ) + + # fake python-lineage + monkeypatch.setattr( + "fastflowtransform.cli.docs_utils.lineage_mod.infer_py_lineage", + lambda func, requires, _: {"x": [{"source": "input.tbl", "column": "id"}]}, + ) + + manifest = docs_utils._build_docs_manifest( + project_dir=proj, + nodes=fake_nodes, + executor=SimpleNamespace(), # not used in python branch for rendering + env_name="dev", + ) + + assert manifest["project"] == proj.name + assert manifest["models"][0]["name"] == "py_model" + # it should still produce the "columns" field, even if empty + assert "columns" in manifest["models"][0] diff --git a/tests/unit/cli/test_sync_db_comments_unit.py b/tests/unit/cli/test_sync_db_comments_unit.py new file mode 100644 index 0000000..7b1e457 --- /dev/null +++ b/tests/unit/cli/test_sync_db_comments_unit.py @@ -0,0 +1,318 @@ +# tests/unit/cli/test_sync_db_comments_unit.py +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest +import typer + +import fastflowtransform.cli.sync_db_comments_cmd as mod + +# 
--------------------------------------------------------------------------- +# helper tests +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.cli +def test_strip_html_for_comment_removes_tags_and_collapses_spaces(): + html = "

Hello World


again" + out = mod._strip_html_for_comment(html) + assert out == "Hello World again" + + +@pytest.mark.unit +@pytest.mark.cli +def test_strip_html_for_comment_none(): + assert mod._strip_html_for_comment(None) == "" + + +@pytest.mark.unit +@pytest.mark.cli +def test_pg_quote_ident_escapes_quotes(): + assert mod._pg_quote_ident('my"table') == '"my""table"' + + +@pytest.mark.unit +@pytest.mark.cli +@pytest.mark.parametrize( + "schema,relation,expected", + [ + ("public", "users", '"public"."users"'), + (None, "public.users", '"public"."users"'), + (None, "users", '"users"'), + ], +) +def test_pg_fq_table(schema, relation, expected): + assert mod._pg_fq_table(schema, relation) == expected + + +@pytest.mark.unit +@pytest.mark.cli +def test_sql_literal_escapes_single_quotes(): + assert mod._sql_literal("O'Reilly") == "'O''Reilly'" + + +# --------------------------------------------------------------------------- +# _sync_comments_postgres +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_comments_postgres_dry_run(capsys): + intents = [ + {"kind": "table", "relation": "users", "text": "Users table"}, + {"kind": "column", "relation": "users", "column": "id", "text": "Primary key"}, + ] + fake_exec = SimpleNamespace() # no .engine -> dry_run only + + mod._sync_comments_postgres(fake_exec, intents, schema="public", dry_run=True) + + out = capsys.readouterr().out + assert 'COMMENT ON TABLE "public"."users" IS \'Users table\';' in out + assert 'COMMENT ON COLUMN "public"."users"."id" IS \'Primary key\';' in out + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_comments_postgres_executes_on_engine(capsys): + # fake sqlalchemy engine + fake_conn = MagicMock() + fake_engine = MagicMock() + fake_engine.begin.return_value.__enter__.return_value = fake_conn + fake_exec = SimpleNamespace(engine=fake_engine) + + intents = [ + {"kind": "table", "relation": "users", "text": "Users table"}, + ] + + 
mod._sync_comments_postgres(fake_exec, intents, schema="public", dry_run=False) + + # sollte genau 1 statement ausführen + assert fake_conn.execute.call_count == 1 + stmt_arg = fake_conn.execute.call_args[0][0] # sa_text(...) + # sqlalchemy.text hat .text oder .textual? + assert 'COMMENT ON TABLE "public"."users" IS \'Users table\';' in str(stmt_arg) + + out = capsys.readouterr().out + assert "applied: 1" in out + + +# --------------------------------------------------------------------------- +# _sync_comments_snowflake +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_comments_snowflake_dry_run(capsys): + intents = [ + {"kind": "table", "relation": "MY_DB.MY_TBL", "text": "Some table"}, + {"kind": "column", "relation": "MY_DB.MY_TBL", "column": "C1", "text": "Some column"}, + ] + fake_exec = SimpleNamespace() + + mod._sync_comments_snowflake(fake_exec, intents, schema=None, dry_run=True) + + out = capsys.readouterr().out + assert "COMMENT ON TABLE MY_DB.MY_TBL IS 'Some table';" in out + assert "COMMENT ON COLUMN MY_DB.MY_TBL.C1 IS 'Some column';" in out + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_comments_snowflake_with_session(): + fake_session = MagicMock() + fake_exec = SimpleNamespace(session=fake_session) + + intents = [ + {"kind": "table", "relation": "MY_TBL", "text": "T"}, + {"kind": "column", "relation": "MY_TBL", "column": "C1", "text": "C"}, + ] + + mod._sync_comments_snowflake(fake_exec, intents, schema="PUBLIC", dry_run=False) + + # 2 statements expected + expected_call_count = 2 + assert fake_session.sql.call_count == expected_call_count + # each should be collected + fake_session.sql.return_value.collect.assert_called() + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_comments_snowflake_with_execute_method(): + exec_mock = SimpleNamespace(execute=MagicMock()) + + intents = [ + {"kind": "table", "relation": "MY_TBL", "text": "T"}, + ] + + 
mod._sync_comments_snowflake(exec_mock, intents, schema=None, dry_run=False) + + exec_mock.execute.assert_called_once_with("COMMENT ON TABLE MY_TBL IS 'T'") + + +# --------------------------------------------------------------------------- +# sync_db_comments +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_db_comments_no_intents_exits(monkeypatch): + """ + Fall: es gibt gar keine Descriptions -> sofort Exit(0) mit gelb. + """ + # fake context + fake_ctx = SimpleNamespace( + project=Path("."), + profile=SimpleNamespace(engine="postgres", postgres=SimpleNamespace(db_schema="public")), + make_executor=lambda: (MagicMock(), None, None), + ) + # REGISTRY ohne Nodes + monkeypatch.setattr(mod, "REGISTRY", SimpleNamespace(nodes={})) + # docs metadata -> leer + monkeypatch.setattr(mod, "read_docs_metadata", lambda _: {}) + # keine Spalten gefunden + monkeypatch.setattr(mod, "_collect_columns", lambda _: {}) + + monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) + + with pytest.raises(typer.Exit) as excinfo: + mod.sync_db_comments(project=".", env_name="dev", dry_run=True) + assert excinfo.value.exit_code == 0 + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_db_comments_postgres_path(monkeypatch): + # 1) Kontext vorbereiten + fake_exec = MagicMock() + fake_ctx = SimpleNamespace( + project=Path("."), + profile=SimpleNamespace(engine="postgres", postgres=SimpleNamespace(db_schema="public")), + make_executor=lambda: (fake_exec, None, None), + ) + monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) + + # 2) Registry mit einem Node + fake_node = SimpleNamespace(name="users.ff") + monkeypatch.setattr(mod, "REGISTRY", SimpleNamespace(nodes={"users.ff": fake_node})) + + # 3) relation_for -> "users" + monkeypatch.setattr(mod, "relation_for", lambda name: "users") + + # 4) docs metadata: model-beschreibung + column-beschreibung + monkeypatch.setattr( + 
mod, + "read_docs_metadata", + lambda _: { + "models": { + "users.ff": {"description_html": "

User table

", "columns": {"id": "User id"}} + }, + "columns": {}, + }, + ) + + # 5) _collect_columns: table "users" has column "id" + col = SimpleNamespace(name="id") + monkeypatch.setattr(mod, "_collect_columns", lambda _: {"users": [col]}) + + # 6) _sync_comments_postgres beobachten + called = {} + + def fake_sync_pg(execu, intents, schema, dry_run): + called["execu"] = execu + called["intents"] = intents + called["schema"] = schema + called["dry_run"] = dry_run + + monkeypatch.setattr(mod, "_sync_comments_postgres", fake_sync_pg) + + with pytest.raises(typer.Exit) as excinfo: + mod.sync_db_comments(project=".", env_name="dev", dry_run=True) + assert excinfo.value.exit_code == 0 + + # Assertions + assert called["execu"] is fake_exec + assert called["schema"] == "public" + # wir erwarten 2 intents: table + column + kinds = {i["kind"] for i in called["intents"]} + assert kinds == {"table", "column"} + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_db_comments_snowflake_path(monkeypatch): + fake_exec = MagicMock() + fake_ctx = SimpleNamespace( + project=Path("."), + profile=SimpleNamespace( + engine="snowflake_snowpark", + snowflake_snowpark=SimpleNamespace(db_schema="PUBLIC"), + ), + make_executor=lambda: (fake_exec, None, None), + ) + monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) + + # Registry + relation_for + monkeypatch.setattr( + mod, "REGISTRY", SimpleNamespace(nodes={"users.ff": SimpleNamespace(name="users.ff")}) + ) + monkeypatch.setattr(mod, "relation_for", lambda name: "USERS") + + # docs + monkeypatch.setattr( + mod, + "read_docs_metadata", + lambda _: {"models": {"users.ff": {"description_html": "Users"}}, "columns": {}}, + ) + monkeypatch.setattr(mod, "_collect_columns", lambda _: {}) + + called = {} + + def fake_sync_sf(execu, intents, schema, dry_run): + called["intents"] = intents + called["schema"] = schema + called["dry_run"] = dry_run + + monkeypatch.setattr(mod, "_sync_comments_snowflake", fake_sync_sf) + + with 
pytest.raises(typer.Exit) as excinfo: + mod.sync_db_comments(project=".", env_name="dev", dry_run=True) + assert excinfo.value.exit_code == 0 + + assert called["schema"] == "PUBLIC" + assert called["intents"][0]["kind"] == "table" + assert called["intents"][0]["relation"] == "USERS" + + +@pytest.mark.unit +@pytest.mark.cli +def test_sync_db_comments_unsupported_engine(monkeypatch, capsys): + fake_exec = MagicMock() + fake_ctx = SimpleNamespace( + project=Path("."), + profile=SimpleNamespace(engine="duckdb"), + make_executor=lambda: (fake_exec, None, None), + ) + monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) + + # mindestens ein Node, sonst würden wir vorher returnen + monkeypatch.setattr(mod, "REGISTRY", SimpleNamespace(nodes={"n": SimpleNamespace(name="n")})) + monkeypatch.setattr(mod, "relation_for", lambda name: "N") + monkeypatch.setattr( + mod, "read_docs_metadata", lambda _: {"models": {"n": {"description_html": "hi"}}} + ) + monkeypatch.setattr(mod, "_collect_columns", lambda _: {}) + + with pytest.raises(typer.Exit) as excinfo: + mod.sync_db_comments(project=".", env_name="dev", dry_run=True) + assert excinfo.value.exit_code == 0 + + out = capsys.readouterr().out + assert "not supported for comment sync" in out diff --git a/tests/unit/executors/test_databricks_spark_exec_unit.py b/tests/unit/executors/test_databricks_spark_exec_unit.py new file mode 100644 index 0000000..c9ef934 --- /dev/null +++ b/tests/unit/executors/test_databricks_spark_exec_unit.py @@ -0,0 +1,424 @@ +# tests/unit/executors/test_databricks_spark_exec_unit.py +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import ANY, MagicMock, patch + +import pytest + +from fastflowtransform.core import REGISTRY, Node +from fastflowtransform.executors import databricks_spark_exec as mod +from fastflowtransform.executors.databricks_spark_exec import ( + _SparkConnShim, + _split_db_table, +) + + 
+@pytest.mark.unit +@pytest.mark.spark +def test_split_db_table_unit(): + assert _split_db_table("db.tbl") == ("db", "tbl") + assert _split_db_table("`db`.`tbl`") == ("db`", "`tbl") + assert _split_db_table("tbl") == (None, "tbl") + + +@pytest.mark.unit +@pytest.mark.spark +def test_q_ident_unit(exec_minimal): + assert exec_minimal._q_ident("foo") == "`foo`" + assert exec_minimal._q_ident("foo`bar") == "`foo``bar`" + assert exec_minimal._q_ident(None) == "" + + +@pytest.mark.unit +@pytest.mark.spark +def test_validate_required_single_df_unit(exec_minimal): + # Fake Spark DF + fake_df = SimpleNamespace(schema=SimpleNamespace(fieldNames=lambda: ["id", "email"])) + # Call in "multi-input" shape so the executor treats it as a dict + exec_minimal._validate_required( + "model_x", + {"users": fake_df}, + {"users": {"id", "email"}}, + ) + + +@pytest.mark.unit +@pytest.mark.spark +def test_validate_required_single_df_raises_unit(exec_minimal): + fake_df = SimpleNamespace(schema=SimpleNamespace(fieldNames=lambda: ["id"])) + with pytest.raises(ValueError): + exec_minimal._validate_required( + "model_x", + {"users": fake_df}, + {"users": {"id", "email"}}, + ) + + +@pytest.mark.unit +@pytest.mark.spark +def test_validate_required_multi_dep_unit(exec_minimal): + fake_users = SimpleNamespace(schema=SimpleNamespace(fieldNames=lambda: ["id", "email"])) + fake_orders = SimpleNamespace( + schema=SimpleNamespace(fieldNames=lambda: ["order_id", "user_id"]) + ) + inputs = {"users": fake_users, "orders": fake_orders} + exec_minimal._validate_required( + "join_model", + inputs, + { + "users": {"id", "email"}, + "orders": {"order_id", "user_id"}, + }, + ) + + +@pytest.mark.unit +@pytest.mark.spark +def test_format_source_reference_classic_unit(exec_minimal): + cfg = {"identifier": "seed_users", "schema": "staging", "catalog": "spark_catalog"} + ref = exec_minimal._format_source_reference(cfg, "raw", "users") + assert "spark_catalog" in ref + assert "staging" in ref + assert "seed_users" in 
ref + + +@pytest.mark.unit +@pytest.mark.spark +def test_format_source_reference_path_based_unit(exec_minimal): + # wir patchen spark.read.format(...) Kette + fake_df = MagicMock() + fake_spark = exec_minimal.spark + fake_spark.read.format.return_value = MagicMock( + options=MagicMock(return_value=MagicMock(load=MagicMock(return_value=fake_df))) + ) + fake_df.createOrReplaceTempView = MagicMock() + + cfg = { + "location": "/tmp/somewhere.parquet", + "format": "parquet", + "identifier": "my_alias", + "options": {"mergeSchema": "true"}, + } + ref = exec_minimal._format_source_reference(cfg, "raw", "tbl") + + assert ref == "`my_alias`" + fake_spark.read.format.assert_called_with("parquet") + # und wir sollten ein TempView angelegt haben + fake_df.createOrReplaceTempView.assert_called_with("my_alias") + + +@pytest.mark.unit +@pytest.mark.spark +def test__materialize_relation_uses_save_table_when_no_path(exec_minimal): + """Executor should call internal table-saving logic when no storage path is configured.""" + df = MagicMock() + node = Node(name="dummy", kind="python", path=Path(".")) + + # force _storage_meta to return empty dict so we hit _save_df_as_table + exec_minimal._storage_meta = MagicMock(return_value={}) + exec_minimal._save_df_as_table = MagicMock() + + exec_minimal._materialize_relation("default.unit_tbl", df, node) + + exec_minimal._save_df_as_table.assert_called_once() + exec_minimal._save_df_as_table.assert_called_with("default.unit_tbl", df, storage={}) + + +@pytest.mark.unit +@pytest.mark.spark +def test__materialize_relation_uses_write_to_storage_path(exec_minimal, tmp_path): + """Executor should delegate to _write_to_storage_path when storage meta has a path.""" + df = MagicMock() + node = Node(name="dummy", kind="python", path=Path(".")) + + exec_minimal._storage_meta = MagicMock( + return_value={"path": str(tmp_path), "format": "parquet"} + ) + exec_minimal._write_to_storage_path = MagicMock() + + 
exec_minimal._materialize_relation("default.unit_tbl", df, node) + + exec_minimal._write_to_storage_path.assert_called_once() + exec_minimal._write_to_storage_path.assert_called_with( + "default.unit_tbl", + df, + {"path": str(tmp_path), "format": "parquet"}, + ) + + +@pytest.mark.unit +@pytest.mark.spark +def test__write_to_storage_path_calls_storage_helper(exec_minimal, monkeypatch, tmp_path): + """_write_to_storage_path should just be a thin adapter to storage.spark_write_to_path.""" + called = {} + + def fake_write(spark, identifier, df, storage, default_format=None, default_options=None): + called["spark"] = spark + called["identifier"] = identifier + called["storage"] = storage + called["default_format"] = default_format + called["default_options"] = default_options + + monkeypatch.setattr(mod.storage, "spark_write_to_path", fake_write) + + df = MagicMock() + storage_meta = {"path": str(tmp_path), "format": "parquet"} + exec_minimal.spark_table_format = "parquet" + exec_minimal.spark_table_options = {"mergeSchema": "true"} + + exec_minimal._write_to_storage_path("default.tbl_x", df, storage_meta) + + assert called["identifier"] == "default.tbl_x" + assert called["storage"] == storage_meta + assert called["default_format"] == "parquet" + assert called["default_options"] == {"mergeSchema": "true"} + + +@pytest.mark.unit +@pytest.mark.spark +def test__create_view_over_table_executes_expected_sql(exec_minimal): + """_create_view_over_table should emit a simple CREATE OR REPLACE VIEW SELECT * statement.""" + exec_minimal.spark.sql = MagicMock() + + exec_minimal._create_view_over_table( + "v_users", "t_users", Node(name="n", kind="sql", path=Path(".")) + ) + + exec_minimal.spark.sql.assert_called_once() + sql = exec_minimal.spark.sql.call_args[0][0] + assert "CREATE OR REPLACE VIEW `v_users` AS SELECT * FROM `t_users`" in sql + + +@pytest.mark.unit +@pytest.mark.spark +def test_on_node_built_calls_meta_helpers(exec_minimal, monkeypatch): + """on_node_built should 
best-effort call ensure_meta_table and upsert_meta.""" + ensure_called = {} + upsert_called = {} + + def fake_ensure(executor): + ensure_called["ok"] = True + + def fake_upsert(executor, node_name, relation, fingerprint, engine): + upsert_called["args"] = (node_name, relation, fingerprint, engine) + + monkeypatch.setattr(mod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(mod, "upsert_meta", fake_upsert) + + node = Node(name="demo_node", kind="sql", path=Path("x")) + exec_minimal.on_node_built(node, "demo_tbl", "abc123") + + assert ensure_called.get("ok") is True + assert upsert_called["args"] == ("demo_node", "demo_tbl", "abc123", "databricks_spark") + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_conn_shim_execute_runs_select(monkeypatch): + """_SparkConnShim.execute should return rows collected from spark.sql.""" + fake_spark = MagicMock() + fake_spark.sql.return_value.collect.return_value = [("a",), ("b",)] + shim = _SparkConnShim(fake_spark) + + res = shim.execute("SELECT 'a'") + assert res.fetchall() == [("a",), ("b",)] + assert res.fetchone() == ("a",) + + +@pytest.mark.unit +@pytest.mark.spark +def test_read_relation_uses_spark_table(exec_minimal): + exec_minimal.spark.table.return_value = "DF" + out = exec_minimal._read_relation("users", Node(name="n", kind="sql", path=Path(".")), []) + exec_minimal.spark.table.assert_called_with("users") + assert out == "DF" + + +@pytest.mark.unit +@pytest.mark.spark +def test_validate_required_no_requires_is_noop(exec_minimal): + # should not raise + exec_minimal._validate_required("node_x", inputs=MagicMock(), requires={}) + + +@pytest.mark.unit +@pytest.mark.spark +def test_materialize_relation_rejects_non_frame(exec_minimal, monkeypatch): + # für diesen Test brauchen wir das echte Verhalten + monkeypatch.setattr(exec_minimal, "_is_frame", lambda obj: False) + node = Node(name="x", kind="python", path=Path(".")) + with pytest.raises(TypeError, match="Spark model must return a Spark DataFrame"): + 
exec_minimal._materialize_relation("tbl", object(), node) + + +@pytest.mark.unit +@pytest.mark.spark +def test_exists_relation_qualified(exec_minimal): + exec_minimal.spark.catalog._jcatalog.tableExists.return_value = True + assert exec_minimal.exists_relation("default.my_tbl") is True + exec_minimal.spark.catalog._jcatalog.tableExists.assert_called_with("default", "my_tbl") + + +@pytest.mark.unit +@pytest.mark.spark +def test_exists_relation_unqualified(exec_minimal): + exec_minimal.spark.catalog.tableExists.return_value = False + assert exec_minimal.exists_relation("my_tbl") is False + exec_minimal.spark.catalog.tableExists.assert_called_with("my_tbl") + + +@pytest.mark.unit +@pytest.mark.spark +def test_init_makes_relative_warehouse_absolute(exec_factory): + ex, _, _ = exec_factory(warehouse_dir="rel_dir") + assert ex.warehouse_dir.is_absolute() + + +@pytest.mark.unit +@pytest.mark.spark +def test_init_with_catalog_sets_config(exec_factory): + _, fake_builder, _ = exec_factory(catalog="hive_metastore") + fake_builder.config.assert_any_call("spark.sql.catalog.spark_catalog", "hive_metastore") + + +@pytest.mark.unit +@pytest.mark.spark +def test_init_with_extra_conf(exec_factory): + _, fake_builder, _ = exec_factory(extra_conf={"spark.foo": "1", "spark.bar": "2"}) + fake_builder.config.assert_any_call("spark.foo", "1") + fake_builder.config.assert_any_call("spark.bar", "2") + + +@pytest.mark.unit +@pytest.mark.spark +def test_init_with_hive_support(exec_factory): + _, fake_builder, _ = exec_factory(use_hive_metastore=True) + fake_builder.config.assert_any_call("spark.sql.catalogImplementation", "hive") + fake_builder.enableHiveSupport.assert_called_once() + + +@pytest.mark.unit +@pytest.mark.spark +def test_init_with_table_options(exec_factory): + ex, _, _ = exec_factory(table_options={"mergeSchema": True}) + assert ex.spark_table_options == {"mergeSchema": "True"} + + +@pytest.mark.unit +@pytest.mark.spark +def 
test_storage_meta_prefers_node_storage(exec_minimal): + node = Node( + name="users.ff", kind="sql", path=Path("x"), meta={"storage": {"path": "/tmp/users"}} + ) + meta = exec_minimal._storage_meta(node, "users") + assert meta == {"path": "/tmp/users"} + + +@pytest.mark.unit +@pytest.mark.spark +def test_storage_meta_uses_global_lookup_when_node_empty(exec_minimal): + with patch("fastflowtransform.executors.databricks_spark_exec.storage.get_model_storage") as gm: + gm.return_value = {"path": "/tmp/global"} + meta = exec_minimal._storage_meta(None, "some_relation") + assert meta == {"path": "/tmp/global"} + gm.assert_called() + + +@pytest.mark.unit +@pytest.mark.spark +def test_storage_meta_falls_back_to_registry_scan(exec_minimal, monkeypatch): + # 1) Fake-Node im Registry, der Storage hat + reg_node = Node( + name="orders.ff", + kind="sql", + path=Path("x"), + meta={"storage": {"path": "/tmp/orders"}}, + ) + REGISTRY.nodes = {"orders.ff": reg_node} + + # 2) relation_for(...) so patchen, dass es "orders" ergibt + with patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for: + rel_for.return_value = "orders" + + meta = exec_minimal._storage_meta(None, "orders") + + assert meta == {"path": "/tmp/orders"} + + +@pytest.mark.unit +@pytest.mark.spark +def test_storage_meta_registry_scan_then_global(exec_minimal, monkeypatch): + reg_node = Node( + name="orders.ff", + kind="sql", + path=Path("x"), + meta={}, + ) + REGISTRY.nodes = {"orders.ff": reg_node} + + with ( + patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for, + patch("fastflowtransform.executors.databricks_spark_exec.storage.get_model_storage") as gm, + ): + rel_for.return_value = "orders" + gm.return_value = {"path": "/tmp/from_global"} + + meta = exec_minimal._storage_meta(None, "orders") + + assert meta == {"path": "/tmp/from_global"} + + +@pytest.mark.unit +@pytest.mark.spark +def test_format_relation_for_ref(exec_minimal): + with 
patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for: + rel_for.return_value = "real_table" + out = exec_minimal._format_relation_for_ref("users.ff") + assert out == "`real_table`" + + +@pytest.mark.unit +@pytest.mark.spark +def test_format_source_reference_location_without_format_raises(exec_minimal): + cfg = {"location": "/tmp/data", "identifier": "x"} # no "format" + with pytest.raises(KeyError, match="requires 'format'"): + exec_minimal._format_source_reference(cfg, "raw", "events") + + +@pytest.mark.unit +@pytest.mark.spark +def test_save_df_as_table_respects_storage_path(exec_minimal): + df = MagicMock() + exec_minimal._write_to_storage_path = MagicMock() + + exec_minimal._save_df_as_table( + "my_tbl", + df, + storage={"path": "/tmp/somewhere"}, + ) + + exec_minimal._write_to_storage_path.assert_called_once() + + +@pytest.mark.unit +@pytest.mark.spark +def test_create_or_replace_table_happy_path_calls_save(exec_minimal): + # spark.sql soll NICHT werfen, sondern ein DF liefern + fake_df = MagicMock() + exec_minimal.spark.sql.return_value = fake_df + + # save beobachten + exec_minimal._save_df_as_table = MagicMock() + + node = Node(name="my_model", kind="sql", path=Path(".")) + + exec_minimal._create_or_replace_table( + "target_tbl", + "SELECT 1 AS id", + node, + ) + + exec_minimal.spark.sql.assert_called_with("SELECT 1 AS id") + exec_minimal._save_df_as_table.assert_called_once_with("target_tbl", fake_df, storage=ANY) diff --git a/tests/unit/executors/test_postgres_exec_unit.py b/tests/unit/executors/test_postgres_exec_unit.py new file mode 100644 index 0000000..b54a1a9 --- /dev/null +++ b/tests/unit/executors/test_postgres_exec_unit.py @@ -0,0 +1,567 @@ +# tests/unit/executors/test_postgres_exec_unit.py +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pandas as pd +import pytest + +# Wichtig: wir testen genau dieses Modul +import fastflowtransform.executors.postgres_exec as pgmod 
+from fastflowtransform.core import Node +from fastflowtransform.errors import ModelExecutionError, ProfileConfigError +from fastflowtransform.executors.postgres_exec import PostgresExecutor + +# --------------------------------------------------------------------------- +# Hilfs-Fakes +# --------------------------------------------------------------------------- + + +class _FakeConn: + """Simple connection mock that records executed SQL.""" + + def __init__(self, rows: list[tuple] | None = None): + self.executed: list[tuple[Any, dict[str, Any] | None]] = [] + self._rows = rows or [] + + def execute(self, stmt, params: dict[str, Any] | None = None): + """Record stmt + params and return object with .fetchall() / .fetchone().""" + self.executed.append((stmt, params)) + rows = self._rows + + class _Res: + def __init__(self, rows): + self._rows = rows + + def fetchone(self): + return self._rows[0] if self._rows else None + + def fetchall(self): + return self._rows + + # needed for "for r in con.execute(...)" style + def __iter__(self): + return iter(self._rows) + + return _Res(rows) + + # needed for "with engine.begin() as conn:" + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + +class _FakeEngine: + """Fake SQLAlchemy engine with begin().""" + + def __init__(self, conn: _FakeConn): + self._conn = conn + self.begin_called = 0 + + def begin(self): + self.begin_called += 1 + return self._conn + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def fake_engine_and_conn(monkeypatch): + """Patch create_engine → fake engine+conn, return both.""" + conn = _FakeConn() + engine = _FakeEngine(conn) + + def _fake_create_engine(dsn, future=True): + return engine + + monkeypatch.setattr(pgmod, "create_engine", _fake_create_engine) + return engine, conn + + +@pytest.fixture +def node_tmp(): + 
return Node(name="m1", kind="sql", path=Path(".")) + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_init_requires_dsn(): + with pytest.raises(ProfileConfigError): + PostgresExecutor(dsn="") + + +@pytest.mark.unit +@pytest.mark.postgres +def test_init_creates_schema_when_given(monkeypatch, fake_engine_and_conn): + _, conn = fake_engine_and_conn + # just create - should call CREATE SCHEMA IF NOT EXISTS + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + # last execute should be CREATE SCHEMA ... + assert any("CREATE SCHEMA IF NOT EXISTS" in str(c[0]) for c in conn.executed) + assert ex.schema == "public" + + +@pytest.mark.unit +@pytest.mark.postgres +def test_init_schema_creation_failure(monkeypatch): + """If CREATE SCHEMA fails, we raise ProfileConfigError.""" + bad_conn = _FakeConn() + + def bad_execute(stmt, params=None): + raise pgmod.SQLAlchemyError("boom") + + bad_conn.execute = bad_execute # type: ignore[assignment] + bad_engine = _FakeEngine(bad_conn) + + def _fake_create_engine(dsn, future=True): + return bad_engine + + monkeypatch.setattr(pgmod, "create_engine", _fake_create_engine) + + with pytest.raises(ProfileConfigError): + PostgresExecutor("postgresql+psycopg://x", schema="foo") + + +# --------------------------------------------------------------------------- +# small helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_q_ident_and_qualified(monkeypatch, fake_engine_and_conn): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + assert ex._q_ident('t"b') == '"t""b"' + assert ex._qualified("tbl") == '"public"."tbl"' + assert ex._qualified("tbl", schema="x") == '"x"."tbl"' + # with no schema + ex2 = PostgresExecutor("postgresql+psycopg://x", schema=None) 
+ assert ex2._qualified("tbl") == '"tbl"' + + +@pytest.mark.unit +@pytest.mark.postgres +@pytest.mark.parametrize( + "inp,exp", + [ + ("SELECT * FROM x", "SELECT * FROM x"), + (" select * from x ; ", "select * from x"), + ("with cte as (select 1) select * from cte;", "with cte as (select 1) select * from cte"), + (" bla bla", "bla bla"), + ], +) +def test_extract_select_like(monkeypatch, fake_engine_and_conn, inp, exp): + ex = PostgresExecutor("postgresql+psycopg://x", schema=None) + assert ex._extract_select_like(inp) == exp + + +# --------------------------------------------------------------------------- +# _read_relation +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_read_relation(monkeypatch, fake_engine_and_conn, node_tmp): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + def fake_read_sql_query(stmt, c): + # stmt is TextClause + assert "select * from" in str(stmt).lower() + return pd.DataFrame({"id": [1, 2]}) + + monkeypatch.setattr(pgmod.pd, "read_sql_query", fake_read_sql_query) + + df = ex._read_relation("my_tbl", node_tmp, deps=[]) + assert list(df["id"]) == [1, 2] + # search_path should be set + assert any("SET LOCAL search_path" in str(s[0]) for s in conn.executed) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_read_relation_propagates_programming_error(monkeypatch, fake_engine_and_conn, node_tmp): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + def bad_read_sql_query(stmt, c): + raise pgmod.ProgrammingError("nope", None, Exception("orig")) + + monkeypatch.setattr(pgmod.pd, "read_sql_query", bad_read_sql_query) + + with pytest.raises(pgmod.ProgrammingError): + ex._read_relation("x", node_tmp, deps=[]) + + +# --------------------------------------------------------------------------- +# _materialize_relation +# 
--------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_materialize_relation_ok(monkeypatch, fake_engine_and_conn, node_tmp): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + df = pd.DataFrame({"id": [1]}) + + called = {"ok": False} + + def fake_to_sql(name, engine, if_exists, index, schema, method): + called["ok"] = True + assert name == "t_out" + assert schema == "public" + + monkeypatch.setattr(df, "to_sql", fake_to_sql) + + ex._materialize_relation("t_out", df, node_tmp) + assert called["ok"] is True + + +@pytest.mark.unit +@pytest.mark.postgres +def test_materialize_relation_wraps_error(monkeypatch, fake_engine_and_conn, node_tmp): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + df = pd.DataFrame({"id": [1]}) + + def bad_to_sql(*a, **k): + raise pgmod.SQLAlchemyError("boom") + + monkeypatch.setattr(df, "to_sql", bad_to_sql) + + with pytest.raises(ModelExecutionError) as exc: + ex._materialize_relation("t_out", df, node_tmp) + + err = exc.value + assert err.node_name == "m1" + assert err.relation == '"public"."t_out"' + assert "boom" in str(err) + + +# --------------------------------------------------------------------------- +# source formatting +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_format_source_reference_path_not_supported(fake_engine_and_conn): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + with pytest.raises(NotImplementedError): + ex._format_source_reference( + {"location": "s3://x", "identifier": "foo"}, + "src", + "tbl", # should trigger + ) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_format_source_reference_with_db_and_schema(fake_engine_and_conn): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + out = ex._format_source_reference( + { + "identifier": "t_src", + "schema": "other", + 
"database": "mydb", + }, + "src", + "t", + ) + # "mydb"."other"."t_src" + assert out == '"mydb"."other"."t_src"' + + +@pytest.mark.unit +@pytest.mark.postgres +def test_format_source_reference_missing_identifier(fake_engine_and_conn): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + with pytest.raises(KeyError): + ex._format_source_reference({}, "src", "t") + + +# --------------------------------------------------------------------------- +# view / table creation +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_or_replace_view_ok(monkeypatch, fake_engine_and_conn, node_tmp): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + ex._create_or_replace_view('"public"."v_x"', "select 1 as id", node_tmp) + + # should have dropped and created + texts = [str(c[0]) for c in conn.executed] + assert any("DROP VIEW IF EXISTS" in t for t in texts) + assert any("CREATE OR REPLACE VIEW" in t for t in texts) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_or_replace_view_wraps(monkeypatch, fake_engine_and_conn, node_tmp): + _, conn = fake_engine_and_conn + + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + def bad_execute(stmt, params=None): + raise RuntimeError("db down") + + conn.execute = bad_execute + + with pytest.raises(ModelExecutionError) as exc: + ex._create_or_replace_view('"public"."v_x"', "select 1", node_tmp) + + err = exc.value + assert err.node_name == "m1" + assert err.relation == '"public"."v_x"' + assert "db down" in str(err) + assert err.sql_snippet is not None + assert "select 1" in err.sql_snippet + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_or_replace_table_ok(fake_engine_and_conn, node_tmp): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + ex._create_or_replace_table('"public"."t_x"', 
"select 1", node_tmp) + + texts = [str(c[0]) for c in conn.executed] + assert any("DROP TABLE IF EXISTS" in t for t in texts) + assert any("CREATE TABLE" in t and "select 1" in t for t in texts) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_or_replace_table_wraps(fake_engine_and_conn, node_tmp): + # Reuse fake engine/connection from fixture + engine, conn = fake_engine_and_conn + + # Bypass __init__ to avoid real DSN setup + ex = PostgresExecutor.__new__(PostgresExecutor) + ex.engine = engine + ex.schema = "public" + + # Force the DB call to fail + def bad_execute(stmt, params=None): + raise RuntimeError("nope") + + conn.execute = bad_execute # type: ignore[assignment] + + with pytest.raises(ModelExecutionError) as excinfo: + ex._create_or_replace_table('"public"."t_x"', "select 1", node_tmp) + + err = excinfo.value + assert err.node_name == node_tmp.name + assert err.relation == '"public"."t_x"' + assert err.message == "nope" + assert err.sql_snippet is not None + assert '-- target="public"."t_x"' in err.sql_snippet + assert "select 1" in err.sql_snippet + + +# --------------------------------------------------------------------------- +# on_node_built +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_on_node_built_calls_meta(monkeypatch, fake_engine_and_conn, node_tmp): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + called = {"ens": 0, "up": 0} + + def fake_ensure(executor): + called["ens"] += 1 + + def fake_upsert(executor, name, relation, fp, engine): + called["up"] += 1 + assert engine == "postgres" + + monkeypatch.setattr(pgmod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(pgmod, "upsert_meta", fake_upsert) + + ex.on_node_built(node_tmp, "public.t_x", "fp123") + assert called["ens"] == 1 + assert called["up"] == 1 + + +@pytest.mark.unit +@pytest.mark.postgres +def test_on_node_built_swallows_exceptions(monkeypatch, 
fake_engine_and_conn, node_tmp): + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + def bad_ensure(executor): + raise RuntimeError("meta fail") + + monkeypatch.setattr(pgmod, "ensure_meta_table", bad_ensure) + # should not raise + ex.on_node_built(node_tmp, "t", "fp") + + +# --------------------------------------------------------------------------- +# incremental API +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_exists_relation_true(fake_engine_and_conn): + _, conn = fake_engine_and_conn + conn._rows = [(1,)] + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + assert ex.exists_relation("tbl") is True + + +@pytest.mark.unit +@pytest.mark.postgres +def test_exists_relation_false(fake_engine_and_conn): + _, conn = fake_engine_and_conn + conn._rows = [] # no row + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + assert ex.exists_relation("tbl") is False + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_table_as(fake_engine_and_conn): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + ex.create_table_as("out_tbl", "WITH x AS (SELECT 1) SELECT * FROM x") + texts = [str(c[0]) for c in conn.executed] + assert any("create table" in t.lower() for t in texts) + # search_path + assert any("SET LOCAL search_path" in str(s[0]) for s in conn.executed) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_incremental_insert(fake_engine_and_conn): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + ex.incremental_insert("out_tbl", "select 1") + texts = [str(c[0]).lower() for c in conn.executed] + assert any("insert into" in t for t in texts) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_incremental_merge(fake_engine_and_conn): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", 
schema="public") + ex.incremental_merge("tgt_tbl", "select 1 as id, 'x' as v", ["id"]) + texts = [str(c[0]).lower() for c in conn.executed] + # temp table + assert any("create temporary table ff_stg as select 1 as id" in t for t in texts) + # delete using + assert any("delete from" in t and "using ff_stg" in t for t in texts) + # insert + assert any("insert into" in t and "select * from ff_stg" in t for t in texts) + # drop staging + assert any("drop table if exists ff_stg" in t for t in texts) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_alter_table_sync_schema_adds_missing(fake_engine_and_conn): + _, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + + # First execute: select * from (body) q limit 0 -> columns + # We'll simulate two columns: existing_col, new_col + # Second execute: information_schema.columns -> existing_col only + # Then we expect: ALTER TABLE ... ADD COLUMN "new_col" text + def exec_side_effect(stmt, params=None): + sql = str(stmt).lower() + if "select * from (select 1 as existing_col, 2 as new_col)" in sql: + # "select * from (...) 
limit 0" + return _FakeConn(rows=[("existing_col",), ("new_col",)]).execute("dummy") + if "information_schema.columns" in sql: + return _FakeConn(rows=[("existing_col",)]).execute("dummy") + # capture alter table + return _FakeConn(rows=[]).execute(stmt, params) + + conn.execute = exec_side_effect # type: ignore[assignment] + + ex.alter_table_sync_schema( + "target_tbl", + "select 1 as existing_col, 2 as new_col", + mode="append_new_columns", + ) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_or_replace_view_from_table_happy(fake_engine_and_conn): + """Ensure view over table is created with qualified names.""" + engine, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + ex.engine = engine # inject fake + node = Node(name="m1", kind="sql", path=Path("."), deps=[], meta={}) + + ex._create_or_replace_view_from_table("v_out", "src_tbl", node) + + # our fake conn records *everything*, incl. BEGIN/COMMIT → filter + stmts = [ + (str(stmt), params) + for (stmt, params) in conn.executed + # keep only the actual SQL we care about + if "SET LOCAL" in str(stmt) + or "DROP VIEW" in str(stmt) + or "CREATE OR REPLACE VIEW" in str(stmt) + ] + + # now we should have exactly the 3 we expect + expected_statement_len = 3 + assert len(stmts) == expected_statement_len + + assert 'SET LOCAL search_path = "public"' in stmts[0][0] + assert 'DROP VIEW IF EXISTS "public"."v_out" CASCADE' in stmts[1][0] + assert ( + 'CREATE OR REPLACE VIEW "public"."v_out" AS SELECT * FROM "public"."src_tbl"' in stmts[2][0] + ) + + +@pytest.mark.unit +@pytest.mark.postgres +def test_create_or_replace_view_from_table_wraps_error(fake_engine_and_conn): + """Errors during view creation should be wrapped with node + relation preserved.""" + engine, conn = fake_engine_and_conn + ex = PostgresExecutor("postgresql+psycopg://x", schema="public") + # inject fake engine so we stay in-memory + ex.engine = engine + node = Node(name="m_bad", kind="sql", 
path=Path("."), deps=[], meta={}) + + def boom(stmt, params=None): + raise RuntimeError("db down") + + # force every execute to fail + conn.execute = boom # type: ignore[assignment] + + with pytest.raises(ModelExecutionError) as exc: + ex._create_or_replace_view_from_table("v_broken", "src_tbl", node) + + err = exc.value + # message is just the original error text + assert str(err) == "db down" + # but the extra context must be present + assert err.node_name == "m_bad" + assert err.relation == '"public"."v_broken"' + # sql_snippet is not used in this method, so we expect None + assert err.sql_snippet is None diff --git a/tests/render/test_this_proxy.py b/tests/unit/render/test_this_proxy_unit.py similarity index 84% rename from tests/render/test_this_proxy.py rename to tests/unit/render/test_this_proxy_unit.py index 66ed6d7..3564ef4 100644 --- a/tests/render/test_this_proxy.py +++ b/tests/unit/render/test_this_proxy_unit.py @@ -1,12 +1,15 @@ -# tests/render/test_this_proxy.py +# tests/unit/render/test_this_proxy_unit.py from pathlib import Path +import pytest from jinja2 import Environment from fastflowtransform.core import Node from fastflowtransform.executors.duckdb_exec import DuckExecutor +@pytest.mark.unit +@pytest.mark.render def test_this_string_and_name(tmp_path: Path): p = tmp_path / "m.ff.sql" p.write_text("select '{{ this }}' as a, '{{ this.name }}' as b", encoding="utf-8") diff --git a/tests/render/test_this_relation.py b/tests/unit/render/test_this_relation_unit.py similarity index 90% rename from tests/render/test_this_relation.py rename to tests/unit/render/test_this_relation_unit.py index 42e1fca..ce7440a 100644 --- a/tests/render/test_this_relation.py +++ b/tests/unit/render/test_this_relation_unit.py @@ -1,4 +1,5 @@ -# tests/render/test_this_relation.py +# tests/unit/render/test_this_relation_unit.py +import pytest from jinja2 import Environment, FileSystemLoader, select_autoescape from fastflowtransform.core import Node @@ -14,6 +15,8 @@ def 
_env_for_tests() -> Environment: ) +@pytest.mark.unit +@pytest.mark.render def test_this_renders_physical_relation(tmp_path): # Arrange: Minimal SQL-Model, das nur `{{ this }}` rendert sql_path = tmp_path / "m.ff.sql" diff --git a/tests/schema/test_schema_loader.py b/tests/unit/schema/test_schema_loader_unit.py similarity index 95% rename from tests/schema/test_schema_loader.py rename to tests/unit/schema/test_schema_loader_unit.py index 5d4d5e0..a2ff898 100644 --- a/tests/schema/test_schema_loader.py +++ b/tests/unit/schema/test_schema_loader_unit.py @@ -1,8 +1,12 @@ from pathlib import Path +import pytest + from fastflowtransform.schema_loader import load_schema_tests +@pytest.mark.unit +@pytest.mark.schema def test_parse_schema_yaml_column_tests(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) (tmp_path / "models" / "users_enriched.yml").write_text( diff --git a/tests/selectors/test_state_modified_unit.py b/tests/unit/selectors/test_state_modified_unit.py similarity index 95% rename from tests/selectors/test_state_modified_unit.py rename to tests/unit/selectors/test_state_modified_unit.py index 758d232..985f3f3 100644 --- a/tests/selectors/test_state_modified_unit.py +++ b/tests/unit/selectors/test_state_modified_unit.py @@ -1,9 +1,12 @@ from pathlib import Path +import pytest + from fastflowtransform.cli.selectors import _downstream_closure from fastflowtransform.core import REGISTRY, Node +@pytest.mark.unit def test_downstream_closure_simple(): REGISTRY.nodes.clear() REGISTRY.nodes.update( diff --git a/tests/unit/test_cli_init.py b/tests/unit/test_cli_init.py new file mode 100644 index 0000000..36e5bde --- /dev/null +++ b/tests/unit/test_cli_init.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest +from typer.testing import CliRunner + +from fastflowtransform.cli import app + + +def _read(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def test_init_creates_minimal_skeleton(tmp_path: 
Path): + runner = CliRunner() + target = tmp_path / "warehouse" + + result = runner.invoke(app, ["init", str(target), "--engine", "duckdb"]) + assert result.exit_code == 0, result.output + + # Core directories exist + for rel in ("models", "seeds", "tests/unit", "docs"): + assert (target / rel).is_dir(), f"missing directory {rel}" + + # Configuration files contain doc references and comments + project_yaml = _read(target / "project.yml") + assert "docs/Project_Config.md" in project_yaml + assert "tests: []" in project_yaml + + profiles_yaml = _read(target / "profiles.yml") + assert "docs/Profiles.md" in profiles_yaml + assert "duckdb" in profiles_yaml + + sources_yaml = _read(target / "sources.yml") + assert "docs/Sources.md" in sources_yaml + + readme = _read(target / "README.md") + assert "docs/Quickstart.md" in readme + + models_note = _read(target / "models/README.md") + assert "Config_and_Macros.md" in models_note + + tests_note = _read(target / "tests/unit/README.md") + assert "fft utest" in tests_note + + +def test_init_refuses_existing_directory(tmp_path: Path): + runner = CliRunner() + target = tmp_path / "existing" + target.mkdir() + + result = runner.invoke(app, ["init", str(target)]) + assert result.exit_code == 1 + assert "already exists" in result.output + # No files should have been created + assert not list(target.glob("*")) + + +@pytest.mark.parametrize("engine", ["unknown", "sqlite"]) +def test_init_validates_engine(engine: str, tmp_path: Path): + runner = CliRunner() + target = tmp_path / engine + + result = runner.invoke(app, ["init", str(target), "--engine", engine]) + expected_exit_code = 2 + assert result.exit_code == expected_exit_code + assert "Unsupported engine" in result.output + assert not target.exists() diff --git a/tests/unit/test_cli_select.py b/tests/unit/test_cli_select.py index 9decff0..8c86580 100644 --- a/tests/unit/test_cli_select.py +++ b/tests/unit/test_cli_select.py @@ -16,7 +16,9 @@ # Helpers-only: predicates & parse # 
------------------------------- def _mk_node(tmp_path: Path, name: str, kind: str = "sql", mat: str = "table", tags=None) -> Node: - p = tmp_path / (name + (".sql" if kind == "sql" else ".py")) + models_dir = tmp_path / "models" + models_dir.mkdir(parents=True, exist_ok=True) + p = models_dir / (name + (".sql" if kind == "sql" else ".py")) p.write_text("-- stub\n", encoding="utf-8") n = Node(name=name, kind=kind, path=p, deps=[], meta={"materialized": mat}) if tags is not None: diff --git a/tests/unit/test_core_python_tags.py b/tests/unit/test_core_python_tags.py new file mode 100644 index 0000000..0f84375 --- /dev/null +++ b/tests/unit/test_core_python_tags.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from pathlib import Path + +from fastflowtransform.core import Registry + + +def test_python_model_tags_propagate_to_node(tmp_path: Path, monkeypatch) -> None: + project_dir = tmp_path / "proj" + models_dir = project_dir / "models" + models_dir.mkdir(parents=True) + + model_file = models_dir / "py_tagged.ff.py" + model_file.write_text( + ( + "from fastflowtransform import model\n\n" + "@model(name='py_tagged', tags=['example', 'demo'], meta={'materialized': 'view'})\n" + "def build(df=None):\n" + " return df\n" + ), + encoding="utf-8", + ) + + isolated_registry = Registry() + monkeypatch.setattr("fastflowtransform.core.REGISTRY", isolated_registry, raising=False) + monkeypatch.setattr("fastflowtransform.decorators.REGISTRY", isolated_registry, raising=False) + monkeypatch.setattr("fastflowtransform.REGISTRY", isolated_registry, raising=False) + + isolated_registry.load_project(project_dir) + + node = isolated_registry.get_node("py_tagged") + assert node.kind == "python" + assert node.meta.get("materialized") == "view" + assert set(node.meta.get("tags", [])) == {"example", "demo"} diff --git a/tests/unit/test_logging_flags.py b/tests/unit/test_logging_flags.py index 329e676..e12c14b 100644 --- a/tests/unit/test_logging_flags.py +++ 
b/tests/unit/test_logging_flags.py @@ -1,6 +1,7 @@ # tests/unit/test_logging_flags.py import importlib +from jinja2 import Environment from typer.testing import CliRunner from fastflowtransform.cli import app @@ -9,12 +10,15 @@ cli_run = importlib.import_module("fastflowtransform.cli.run") -def test_verbose_flags_wiring(monkeypatch): +def test_verbose_flags_wiring(monkeypatch, tmp_path): + models_dir = tmp_path / "models" + models_dir.mkdir(parents=True, exist_ok=True) + # stub the heavy bits so the command exits early after logging lines monkeypatch.setattr( cli_bootstrap, "_load_project_and_env", - lambda proj: (__import__("pathlib").Path("."), None), + lambda proj: (tmp_path, Environment()), ) monkeypatch.setattr( cli_bootstrap, @@ -32,16 +36,16 @@ def test_verbose_flags_wiring(monkeypatch): runner = CliRunner() # default (quiet-ish) - res = runner.invoke(app, ["run", "."]) + res = runner.invoke(app, ["run", str(tmp_path)]) assert res.exit_code == 0 # -v should show the "Profil" line (we don't assert text content here to keep it loose) - res = runner.invoke(app, ["-v", "run", "."]) + res = runner.invoke(app, ["-v", "run", str(tmp_path)]) assert res.exit_code == 0 # -vv enables SQL debug env var - res = runner.invoke(app, ["-vv", "run", "."]) + res = runner.invoke(app, ["-vv", "run", str(tmp_path)]) assert res.exit_code == 0 - res = runner.invoke(app, ["-q", "dag", "."]) + res = runner.invoke(app, ["-q", "dag", str(tmp_path)]) assert res.exit_code == 0 diff --git a/tests/unit/test_parallel_logging_error_block.py b/tests/unit/test_parallel_logging_error_block.py index 624fa8f..cc51061 100644 --- a/tests/unit/test_parallel_logging_error_block.py +++ b/tests/unit/test_parallel_logging_error_block.py @@ -14,7 +14,10 @@ cli_run = importlib.import_module("fastflowtransform.cli.run") -def test_error_block_prints_after_logs_without_interleaving(monkeypatch): +def test_error_block_prints_after_logs_without_interleaving(monkeypatch, tmp_path): + models_dir = tmp_path / 
"models" + models_dir.mkdir(parents=True, exist_ok=True) + # Minimal schedule stub: one level, one failing node. def fake_schedule(levels, **kw): # Run the real schedule but inject a failing run_node @@ -52,7 +55,7 @@ class E: monkeypatch.setattr(cli_bootstrap, "_make_executor", fake_make_executor) runner = CliRunner() - res = runner.invoke(app, ["run", ".", "--cache", "off"]) + res = runner.invoke(app, ["run", str(tmp_path), "--cache", "off"]) # Exit with error assert res.exit_code != 0 # Logs first (including ✖ line), then the error block (starts with '┌') diff --git a/tests/unit/test_seeding_unit.py b/tests/unit/test_seeding_unit.py new file mode 100644 index 0000000..e298fff --- /dev/null +++ b/tests/unit/test_seeding_unit.py @@ -0,0 +1,534 @@ +# tests/unit/test_seeding_unit.py +from __future__ import annotations + +import textwrap +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pandas as pd +import pytest + +from fastflowtransform import seeding, storage + +# --------------------------------------------------------------------------- +# File I/O helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_read_seed_file_csv(tmp_path: Path): + p = tmp_path / "users.csv" + p.write_text("id,name\n1,A\n2,B\n", encoding="utf-8") + + df = seeding._read_seed_file(p) + assert list(df.columns) == ["id", "name"] + expected_row_count = 2 + assert len(df) == expected_row_count + + +@pytest.mark.unit +def test_read_seed_file_unsupported(tmp_path: Path): + p = tmp_path / "users.txt" + p.write_text("nope", encoding="utf-8") + with pytest.raises(ValueError): + seeding._read_seed_file(p) + + +@pytest.mark.unit +def test_apply_schema_happy(): + df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "age": [10, 20]}) + schema_cfg = { + "dtypes": { + "users": { + "name": "string", + "age": "int64", + } + } + } + + out = seeding._apply_schema(df, "users", 
schema_cfg) + # 'name' should be string dtype + assert str(out.dtypes["name"]).startswith("string") + assert str(out.dtypes["age"]) in ("int64", "Int64") + + +@pytest.mark.unit +def test_apply_schema_ignores_missing_table_key(): + df = pd.DataFrame({"id": [1]}) + out = seeding._apply_schema(df, "other", {"dtypes": {"users": {"id": "int64"}}}) + # unchanged + assert out.equals(df) + + +@pytest.mark.unit +def test_apply_schema_soft_fails_on_bad_cast(): + df = pd.DataFrame({"id": ["x"]}) + # force bad cast + cfg = {"dtypes": {"t": {"id": "int64"}}} + out = seeding._apply_schema(df, "t", cfg) + # should not raise and should still have the row + assert len(out) == 1 + + +# --------------------------------------------------------------------------- +# Identifier helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_dq_quotes_and_escapes(): + assert seeding._dq('a"b') == '"a""b"' + assert seeding._dq("tbl") == '"tbl"' + + +@pytest.mark.unit +def test_is_qualified(): + assert seeding._is_qualified("raw.users") is True + assert seeding._is_qualified("users") is False + + +@pytest.mark.unit +def test_qualify_unqualified_with_schema(): + out = seeding._qualify("users", "raw") + assert out == '"raw"."users"' + + +@pytest.mark.unit +def test_qualify_already_qualified_preserves_parts(): + out = seeding._qualify("raw.users", None) + assert out == '"raw"."users"' + + +# --------------------------------------------------------------------------- +# Spark warehouse helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_warehouse_base_local(tmp_path: Path): + fake_spark = SimpleNamespace( + conf=SimpleNamespace(get=lambda key, default=None: str(tmp_path / "wh")) + ) + base = seeding._spark_warehouse_base(fake_spark) + assert base == (tmp_path / "wh") + + +@pytest.mark.unit +@pytest.mark.spark +def 
test_spark_warehouse_base_remote_scheme(): + fake_spark = SimpleNamespace(conf=SimpleNamespace(get=lambda *_: "s3://bucket/warehouse")) + assert seeding._spark_warehouse_base(fake_spark) is None + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_table_location_strips_catalog(tmp_path: Path): + # warehouse dir is local + fake_spark = SimpleNamespace(conf=SimpleNamespace(get=lambda *_: str(tmp_path / "wh"))) + parts = ["spark_catalog", "default", "mytable"] + loc = seeding._spark_table_location(parts, fake_spark) + # should resolve to /default.db/mytable + assert loc == tmp_path / "wh" / "default.db" / "mytable" + + +# --------------------------------------------------------------------------- +# Pretty helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_human_int_formats_with_spaces(): + assert seeding._human_int(1234567) == "1 234 567" + assert seeding._human_int(0) == "0" + + +@pytest.mark.unit +def test_human_bytes_formats_reasonably(): + assert seeding._human_bytes(512) == "512 B" + # just smoke tests + assert "KB" in seeding._human_bytes(2_000) + assert "MB" in seeding._human_bytes(2_000_000) + + +@pytest.mark.unit +def test_echo_seed_line(monkeypatch): + lines: list[str] = [] + + def fake_echo(msg: str) -> None: + lines.append(msg) + + monkeypatch.setattr(seeding, "echo", fake_echo) + + seeding._echo_seed_line( + full_name="raw.users", + rows=1234, + cols=5, + engine="duckdb", + ms=42, + created_schema=True, + extra="reset location", + ) + + assert len(lines) == 1 + out = lines[0] + assert "raw.users" in out + assert "1 234×5" in out # noqa RUF001 + assert "[duckdb]" in out + assert "(+schema)" in out + assert "reset location" in out + + +# --------------------------------------------------------------------------- +# Target resolution +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_engine_name_from_executor_spark(): + ex 
= SimpleNamespace(spark=object()) + assert seeding._engine_name_from_executor(ex) == "spark" + + +@pytest.mark.unit +def test_engine_name_from_executor_sqlalchemy_like(): + eng = SimpleNamespace(dialect=SimpleNamespace(name="postgres")) + ex = SimpleNamespace(engine=eng) + assert seeding._engine_name_from_executor(ex) == "postgres" + + +@pytest.mark.unit +def test_engine_name_from_executor_duckdb_like(): + ex = SimpleNamespace(con=object()) + assert seeding._engine_name_from_executor(ex) == "duckdb" + + +@pytest.mark.unit +def test_seed_id_simple(tmp_path: Path): + seeds_dir = tmp_path / "seeds" + seeds_dir.mkdir() + p = seeds_dir / "users.csv" + p.write_text("id\n1\n", encoding="utf-8") + assert seeding._seed_id(seeds_dir, p) == "users" + + +@pytest.mark.unit +def test_seed_id_nested(tmp_path: Path): + seeds_dir = tmp_path / "seeds" + (seeds_dir / "raw").mkdir(parents=True) + p = seeds_dir / "raw" / "users.csv" + p.write_text("id\n1\n", encoding="utf-8") + assert seeding._seed_id(seeds_dir, p) == "raw/users" + + +@pytest.mark.unit +def test_resolve_schema_and_table_by_cfg_priority_engine_override(): + schema_cfg = { + "targets": { + "raw/users": { + "schema": "raw", + "table": "users_final", + "schema_by_engine": { + "postgres": "pg_raw", + "duckdb": "main", + }, + } + } + } + # executor pretending to be postgres + ex = SimpleNamespace(engine=SimpleNamespace(dialect=SimpleNamespace(name="postgres"))) + + schema, table = seeding._resolve_schema_and_table_by_cfg( + seed_id="raw/users", + stem="users", + schema_cfg=schema_cfg, + executor=ex, + default_schema="public", + ) + + assert schema == "pg_raw" + assert table == "users_final" + + +@pytest.mark.unit +def test_resolve_schema_and_table_falls_back_to_default_schema(): + ex = SimpleNamespace(engine=None, con=None) + schema, table = seeding._resolve_schema_and_table_by_cfg( + seed_id="raw/users", + stem="users", + schema_cfg=None, + executor=ex, + default_schema="public", + ) + assert schema == "public" + assert 
table == "users" + + +# --------------------------------------------------------------------------- +# Handlers: DuckDB +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_handle_duckdb_returns_false_for_non_duckdb_conn(): + executor = SimpleNamespace( + con=SimpleNamespace(register=lambda *a, **k: None, execute=lambda *a, **k: None) + ) + df = pd.DataFrame({"id": [1, 2]}) + + handled = seeding._handle_duckdb("users", df, executor, schema="raw") + + assert handled is False + + +@pytest.mark.unit +def test_handle_duckdb_returns_false_if_no_con(): + executor = SimpleNamespace() + df = pd.DataFrame({"id": [1]}) + handled = seeding._handle_duckdb("users", df, executor, schema=None) + assert handled is False + + +# --------------------------------------------------------------------------- +# Handlers: SQLAlchemy +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.postgres +def test_handle_sqlalchemy_happy(monkeypatch): + calls = {} + + class FakeEngine: + __module__ = "sqlalchemy.engine" # to trigger detection + dialect = SimpleNamespace(name="postgres") + + class FakeDF(pd.DataFrame): + def to_sql(self, name, eng, if_exists, index, schema, method): + calls["name"] = name + calls["schema"] = schema + calls["if_exists"] = if_exists + + df = FakeDF({"id": [1, 2]}) + executor = SimpleNamespace(engine=FakeEngine()) + + handled = seeding._handle_sqlalchemy("seed_tbl", df, executor, schema="raw") + assert handled is True + assert calls["name"] == "seed_tbl" + assert calls["schema"] == "raw" + assert calls["if_exists"] == "replace" + + +@pytest.mark.unit +def test_handle_sqlalchemy_returns_false_if_no_engine(): + df = pd.DataFrame({"id": [1]}) + executor = SimpleNamespace() + assert seeding._handle_sqlalchemy("t", df, executor, None) is False + + +@pytest.mark.unit +def test_handle_sqlalchemy_returns_false_if_engine_not_sqlalchemy(): 
+ df = pd.DataFrame({"id": [1]}) + executor = SimpleNamespace(engine=SimpleNamespace(__module__="not.sqlalchemy")) + assert seeding._handle_sqlalchemy("t", df, executor, None) is False + + +# --------------------------------------------------------------------------- +# Handlers: Spark +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.spark +def test_handle_spark_happy_default_table(tmp_path: Path, monkeypatch): + # fake spark with local warehouse + fake_spark = MagicMock() + fake_spark.conf.get.return_value = str(tmp_path / "wh") + + # DataFrame path + fake_sdf = MagicMock() + fake_spark.createDataFrame.return_value = fake_sdf + + # writer chain + writer = MagicMock() + fake_sdf.write.mode.return_value = writer + writer.format.return_value = writer + writer.options.return_value = writer + + executor = SimpleNamespace( + spark=fake_spark, + spark_table_format="delta", + spark_table_options={"mergeSchema": "true"}, + ) + + df = pd.DataFrame({"id": [1]}) + handled = seeding._handle_spark("default.seed_tbl", df, executor, schema=None) + + assert handled is True + # drop table was attempted + fake_spark.sql.assert_any_call("DROP TABLE IF EXISTS `default`.`seed_tbl`") + # writer.saveAsTable called with identifier + writer.saveAsTable.assert_called_once_with("default.seed_tbl") + + +@pytest.mark.unit +@pytest.mark.spark +def test_handle_spark_uses_seed_storage(monkeypatch): + # storage override set to custom path + storage.set_seed_storage( + {"raw.users": {"path": "/tmp/custom", "format": "parquet", "options": {"x": "1"}}} + ) + + fake_spark = MagicMock() + fake_sdf = MagicMock() + fake_spark.createDataFrame.return_value = fake_sdf + writer = MagicMock() + fake_sdf.write.mode.return_value = writer + writer.format.return_value = writer + writer.options.return_value = writer + + executor = SimpleNamespace( + spark=fake_spark, + spark_table_format=None, + spark_table_options=None, + ) + + df = 
pd.DataFrame({"id": [1]}) + # name must match our storage key + handled = seeding._handle_spark("raw.users", df, executor, schema=None) + assert handled is True + + # since we used storage override, it should have called storage.spark_write_to_path + # easiest: monkeypatch seeding.storage.spark_write_to_path and assert + # but here we can assert spark.sql got a DROP TABLE? no, path → register only + # so instead let's just check that createDataFrame was called (path flow runs too) + fake_spark.createDataFrame.assert_called_once() + + +# --------------------------------------------------------------------------- +# Dispatcher +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_materialize_seed_tries_all_and_raises(monkeypatch): + df = pd.DataFrame({"id": [1]}) + # executor without con/engine/spark + executor = SimpleNamespace() + + with pytest.raises(RuntimeError) as exc: + seeding.materialize_seed("t", df, executor, schema=None) + assert "No compatible executor" in str(exc.value) + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_materialize_seed_stops_at_first_handler(monkeypatch): + df = pd.DataFrame({"id": [1]}) + + # first handler claims success + def h1(table, df, ex, schema): + return True + + # second handler should not be called + called = {"h2": False} + + def h2(table, df, ex, schema): + called["h2"] = True + return True + + monkeypatch.setattr(seeding, "_HANDLERS", (h1, h2)) + + seeding.materialize_seed("t", df, SimpleNamespace(), schema=None) + assert called["h2"] is False + + +# --------------------------------------------------------------------------- +# seed_project +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_seed_project_happy_duckdb(tmp_path: Path, monkeypatch): + # project structure + seeds_dir = tmp_path / "seeds" + seeds_dir.mkdir() + (seeds_dir / "raw").mkdir() + (seeds_dir / "raw" / 
"users.csv").write_text("id,name\n1,A\n", encoding="utf-8") + + # fake duckdb-like executor + exec_calls = [] + + class FakeCon: + def register(self, name, df): + exec_calls.append(("register", name)) + + def execute(self, sql): + exec_calls.append(("execute", sql)) + + def unregister(self, name): + exec_calls.append(("unregister", name)) + + executor = SimpleNamespace(con=FakeCon(), schema="public") + + # IMPORTANT: in environments where duckdb is installed, _handle_duckdb() + # does an isinstance(...) against the real DuckDB connection type. + # That would make our FakeCon fail. So we just force the handler to succeed. + def fake_handle_duckdb(table, df, ex, schema): + # simulate the real duckdb handler a bit + full_name = seeding._qualify(table, schema) + ex.con.register("_tmp", df) + ex.con.execute(f'create or replace table {full_name} as select * from "_tmp"') + ex.con.unregister("_tmp") + return True + + handlers = tuple(seeding._HANDLERS) + monkeypatch.setattr( + seeding, + "_HANDLERS", + (fake_handle_duckdb, *handlers[1:]), + ) + + count = seeding.seed_project(tmp_path, executor, default_schema=None) + assert count == 1 + # we should have a create or replace in there + assert any("create or replace table" in sql for (op, sql) in exec_calls if op == "execute") + + +@pytest.mark.unit +def test_seed_project_no_seeds_dir(tmp_path: Path): + executor = SimpleNamespace() + count = seeding.seed_project(tmp_path, executor, default_schema=None) + assert count == 0 + + +@pytest.mark.unit +def test_seed_project_ambiguous_stems_raises(tmp_path: Path): + seeds_dir = tmp_path / "seeds" + (seeds_dir / "a").mkdir(parents=True) + (seeds_dir / "b").mkdir(parents=True) + (seeds_dir / "a" / "users.csv").write_text("id\n1\n", encoding="utf-8") + (seeds_dir / "b" / "users.csv").write_text("id\n2\n", encoding="utf-8") + + # schema.yml that uses bare "users" + (seeds_dir / "schema.yml").write_text( + textwrap.dedent( + """ + targets: + users: + schema: raw + """ + ), + 
encoding="utf-8", + ) + + executor = SimpleNamespace(schema="public") + + with pytest.raises(ValueError) as exc: + seeding.seed_project(tmp_path, executor, default_schema=None) + + assert "appears multiple times" in str(exc.value) + assert "Please configure using the path-based seed ID" in str(exc.value) diff --git a/tests/unit/test_selective_run_subgraph.py b/tests/unit/test_selective_run_subgraph.py index 70931cd..d5589c8 100644 --- a/tests/unit/test_selective_run_subgraph.py +++ b/tests/unit/test_selective_run_subgraph.py @@ -8,7 +8,9 @@ def _mk_node(tmp, name, kind="sql", deps=None, mat="table", tags=None): - p = tmp / f"{name}.dummy" + models_dir = tmp / "models" + models_dir.mkdir(parents=True, exist_ok=True) + p = models_dir / f"{name}.dummy" p.write_text("--", encoding="utf-8") n = Node( name=name, diff --git a/tests/unit/test_storage_unit.py b/tests/unit/test_storage_unit.py new file mode 100644 index 0000000..5074da7 --- /dev/null +++ b/tests/unit/test_storage_unit.py @@ -0,0 +1,273 @@ +# tests/unit/test_storage_unit.py +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from fastflowtransform import storage + + +@pytest.fixture(autouse=True) +def reset_storage(): + """Reset global storage registry between tests.""" + storage.set_model_storage({}) + storage.set_seed_storage({}) + yield + storage.set_model_storage({}) + storage.set_seed_storage({}) + + +# --------------------------------------------------------------------------- +# _sanitize_key +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_sanitize_key_strips_quotes_and_spaces(): + assert storage._sanitize_key(" `foo.bar` ") == "foo.bar" + assert storage._sanitize_key(' "foo" ') == "foo" + assert storage._sanitize_key("foo") == "foo" + + +# --------------------------------------------------------------------------- +# normalize_storage_map +# 
--------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_normalize_storage_map_resolves_relative_paths(tmp_path: Path): + raw = { + "m1": { + "path": "data/out", + "format": "parquet", + "options": {"k": "v"}, + }, + # invalid entry → should be skipped + "bad": "not-a-mapping", + } + + norm = storage.normalize_storage_map(raw, project_dir=tmp_path) + + assert "m1" in norm + m1 = norm["m1"] + # path must be absolute and under project_dir + assert Path(m1["path"]).is_absolute() + assert str(tmp_path) in m1["path"] + assert m1["format"] == "parquet" + assert m1["options"] == {"k": "v"} + + # "bad" must be ignored + assert "bad" not in norm + + +@pytest.mark.unit +def test_normalize_storage_map_empty_input(tmp_path: Path): + assert storage.normalize_storage_map(None, project_dir=tmp_path) == {} + assert storage.normalize_storage_map({}, project_dir=tmp_path) == {} + + +@pytest.mark.unit +def test_normalize_storage_map_keeps_absolute_path(tmp_path: Path): + abs_dir = tmp_path / "absdir" + raw = { + "model_x": { + "path": str(abs_dir), + "format": "delta", + } + } + norm = storage.normalize_storage_map(raw, project_dir=tmp_path) + assert norm["model_x"]["path"] == str(abs_dir.resolve()) + assert norm["model_x"]["format"] == "delta" + + +# --------------------------------------------------------------------------- +# set_model_storage / get_model_storage +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_get_model_storage_exact_match(): + storage.set_model_storage({"m1": {"path": "/tmp/m1"}}) + meta = storage.get_model_storage("m1") + assert meta == {"path": "/tmp/m1"} + + +@pytest.mark.unit +def test_get_model_storage_accepts_ff_suffix(): + # registry contains name without .ff + storage.set_model_storage({"m_model": {"format": "parquet"}}) + + # ask with .ff → should find it + meta = storage.get_model_storage("m_model.ff") + assert meta == {"format": 
"parquet"} + + # ask without .ff → should also find it (because we add .ff as candidate) + meta2 = storage.get_model_storage("m_model") + assert meta2 == {"format": "parquet"} + + +@pytest.mark.unit +def test_get_model_storage_dotted_name_uses_last_part(): + # registry only knows the short name + storage.set_model_storage({"short": {"path": "/opt/data"}}) + + # caller asks with db.schema.short + meta = storage.get_model_storage("db.schema.short") + assert meta == {"path": "/opt/data"} + + +@pytest.mark.unit +def test_get_model_storage_returns_empty_if_not_found(): + storage.set_model_storage({"other": {"x": 1}}) + assert storage.get_model_storage("not-there") == {} + + +# --------------------------------------------------------------------------- +# set_seed_storage / get_seed_storage +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_get_seed_storage_exact_and_last_part(): + storage.set_seed_storage( + { + "schema.seed_tbl": {"path": "/tmp/seed1"}, + "pure": {"path": "/tmp/seed2"}, + } + ) + + # exact + assert storage.get_seed_storage("schema.seed_tbl") == {"path": "/tmp/seed1"} + + # only last part + assert storage.get_seed_storage("schema.other.pure") == {"path": "/tmp/seed2"} + + # missing + assert storage.get_seed_storage("nothing") == {} + + +# --------------------------------------------------------------------------- +# spark_write_to_path +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_write_to_path_happy(tmp_path: Path, monkeypatch): + # fake spark + df.write chain + fake_spark = MagicMock() + fake_df = MagicMock() + + writer = MagicMock() + # df.write.mode("overwrite") → writer + fake_df.write.mode.return_value = writer + # .format(...) → writer + writer.format.return_value = writer + # .options(...) 
→ writer + writer.options.return_value = writer + + # storage entry with local path + target_dir = tmp_path / "out" + storage_meta = { + "path": str(target_dir), + "format": "parquet", + "options": {"compression": "snappy"}, + } + + storage.spark_write_to_path( + fake_spark, + "db.tbl", + fake_df, + storage=storage_meta, + default_format=None, + default_options={"mergeSchema": "true"}, + ) + + # 1) DROP TABLE IF EXISTS `db`.`tbl` + fake_spark.sql.assert_any_call("DROP TABLE IF EXISTS `db`.`tbl`") + + # 2) writer must have been called with format and merged options + fake_df.write.mode.assert_called_once_with("overwrite") + writer.format.assert_called_once_with("parquet") + + # merged options: default_options + storage.options + writer.options.assert_called_once_with(mergeSchema="true", compression="snappy") + + # 3) save() called with path + writer.save.assert_called_once_with(str(target_dir)) + + # 4) create table ... location ... + # fmt is known → USING parquet + create_calls = [c.args[0] for c in fake_spark.sql.call_args_list if "CREATE TABLE" in c.args[0]] + assert len(create_calls) == 1 + assert "CREATE TABLE `db`.`tbl` USING parquet LOCATION" in create_calls[0] + assert str(target_dir) in create_calls[0] + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_write_to_path_without_format_uses_default(tmp_path: Path): + fake_spark = MagicMock() + fake_df = MagicMock() + writer = MagicMock() + fake_df.write.mode.return_value = writer + writer.format.return_value = writer + + target_dir = tmp_path / "x" + storage_meta = { + "path": str(target_dir), + # no "format" here + } + + storage.spark_write_to_path( + fake_spark, + "tbl_only", + fake_df, + storage=storage_meta, + default_format="delta", + default_options=None, + ) + + writer.format.assert_called_once_with("delta") + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_write_to_path_requires_path(): + fake_spark = MagicMock() + fake_df = MagicMock() + + with pytest.raises(ValueError) as exc: + 
storage.spark_write_to_path( + fake_spark, + "db.tbl", + fake_df, + storage={}, + default_format="parquet", + default_options=None, + ) + assert "requires 'path'" in str(exc.value) + + +@pytest.mark.unit +@pytest.mark.spark +def test_spark_write_to_path_rejects_empty_identifier(tmp_path: Path): + fake_spark = MagicMock() + fake_df = MagicMock() + writer = MagicMock() + fake_df.write.mode.return_value = writer + + storage_meta = {"path": str(tmp_path / "out")} + + with pytest.raises(ValueError) as exc: + storage.spark_write_to_path( + fake_spark, + "", + fake_df, + storage=storage_meta, + default_format=None, + default_options=None, + ) + assert "Invalid Spark identifier" in str(exc.value) diff --git a/tests/unit/test_testing_unit.py b/tests/unit/test_testing_unit.py new file mode 100644 index 0000000..96c4a70 --- /dev/null +++ b/tests/unit/test_testing_unit.py @@ -0,0 +1,601 @@ +# tests/unit/test_testing_unit.py +from __future__ import annotations + +from typing import Any + +import pytest + +from fastflowtransform.testing import ( + TestFailure, + _exec, + _fail, + _pretty_sql, + _scalar, + _sql_list, + accepted_values, + freshness, + greater_equal, + non_negative_sum, + not_null, + reconcile_coverage, + reconcile_diff_within, + reconcile_equal, + reconcile_ratio_within, + row_count_between, + unique, +) + + +class _FakeResult: + """Tiny fake fetch result for tests.""" + + def __init__(self, rows: list[tuple]): + self._rows = rows + + def fetchone(self) -> tuple | None: + return self._rows[0] if self._rows else None + + def fetchall(self) -> list[tuple]: + return self._rows + + +# --------------------------------------------------------------------------- +# _pretty_sql / _sql_list +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_pretty_sql_plain(): + assert _pretty_sql(" select 1 ") == "select 1" + + +@pytest.mark.unit +def test_pretty_sql_tuple(): + out = _pretty_sql(("select 1", {"x": 1})) + assert 
out.startswith("select 1") + assert "params={'x': 1}" in out + + +@pytest.mark.unit +def test_pretty_sql_sequence(): + out = _pretty_sql(["select 1", "select 2"]) + assert "select 1" in out + assert "select 2" in out + assert out.startswith("[") + assert out.endswith("]") + + +@pytest.mark.unit +def test_sql_list_various_types(): + assert _sql_list([1, 2, 3]) == "1, 2, 3" + assert _sql_list(["a", "b"]) == "'a', 'b'" + assert _sql_list([None, "O'Reilly"]) == "NULL, 'O''Reilly'" + + +# --------------------------------------------------------------------------- +# _exec: branch 1 - connection has .execute +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_exec_direct_non_sqlalchemy(): + calls: list[Any] = [] + + class FakeCon: + def execute(self, sql): + calls.append(sql) + return _FakeResult([(1,)]) + + con = FakeCon() + res = _exec(con, "select 1") + assert isinstance(res, _FakeResult) + assert calls == ["select 1"] + + +@pytest.mark.unit +def test_exec_direct_sqlalchemy_like_string(monkeypatch): + # simulate a SA-like connection (module name contains "sqlalchemy") + class FakeSACon: + __module__ = "sqlalchemy.engine.mock" + + def __init__(self): + self.calls: list[Any] = [] + + def execute(self, stmt, params=None): + # sqlalchemy.text(...) 
should have been called + self.calls.append((stmt, params)) + return _FakeResult([(1,)]) + + con = FakeSACon() + res = _exec(con, "select 1") + assert isinstance(res, _FakeResult) + # first arg should be a TextClause + assert len(con.calls) == 1 + assert str(con.calls[0][0]).strip().lower().startswith("select 1") + + +@pytest.mark.unit +def test_exec_direct_sqlalchemy_like_tuple_params(): + class FakeSACon: + __module__ = "sqlalchemy.engine.mock" + + def __init__(self): + self.calls: list[Any] = [] + + def execute(self, stmt, params=None): + self.calls.append((stmt, params)) + return _FakeResult([(1,)]) + + con = FakeSACon() + res = _exec(con, ("select :x", {"x": 10})) + assert isinstance(res, _FakeResult) + assert len(con.calls) == 1 + sql_obj, params = con.calls[0] + assert "select :x" in str(sql_obj).lower() + assert params == {"x": 10} + + +# --------------------------------------------------------------------------- +# _exec: branch 2 - no .execute, but .begin() (SQLAlchemy fallback) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_exec_fallback_begin_with_sequence(): + executed: list[str] = [] + + class FakeCtx: + def __init__(self, outer): + self.outer = outer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def execute(self, stmt, params=None): + # stmt may be TextClause + if hasattr(stmt, "text"): + executed.append(stmt.text) + else: + executed.append(str(stmt)) + return _FakeResult([(1,)]) + + class FakeCon: + def begin(self): + return FakeCtx(self) + + con = FakeCon() + res = _exec(con, ["select 1", "select 2"]) + assert isinstance(res, _FakeResult) + assert executed == ["select 1", "select 2"] + + +@pytest.mark.unit +def test_exec_fallback_unsupported_type_raises(): + class FakeCtx: + def __enter__(self): + """Enter.""" + return self + + def __exit__(self, exc_type, exc, tb): + """Exit.""" + return False + + def execute(self, *_a, **_k): + 
return _FakeResult([]) + + class FakeCon: + def begin(self): + return FakeCtx() + + con = FakeCon() + with pytest.raises(TypeError): + _exec(con, object()) + + +# --------------------------------------------------------------------------- +# _scalar +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_scalar_returns_first_value(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(42, "x")]) + + v = _scalar(FakeCon(), "select 42") + expected_value = 42 + assert v == expected_value + + +@pytest.mark.unit +def test_scalar_returns_none_on_empty(): + class FakeCon: + def execute(self, sql): + return _FakeResult([]) + + v = _scalar(FakeCon(), "select 42") + assert v is None + + +# --------------------------------------------------------------------------- +# accepted_values +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_accepted_values_ok(): + # first call: count(*) = 0 → ok + # second call (sample) should not be executed + class FakeCon: + def __init__(self): + self.calls = 0 + + def execute(self, sql): + self.calls += 1 + if "count(*)" in sql: + return _FakeResult([(0,)]) + return _FakeResult([]) + + con = FakeCon() + assert accepted_values(con, "tbl", "col", values=["a", "b"]) is True + assert con.calls == 1 + + +@pytest.mark.unit +def test_accepted_values_fail_collects_samples(): + class FakeCon: + def __init__(self): + self.queries: list[str] = [] + + def execute(self, sql): + self.queries.append(sql) + if "count(*)" in sql: + return _FakeResult([(3,)]) + if "distinct" in sql: + return _FakeResult([("X",), ("Y",)]) + return _FakeResult([]) + + con = FakeCon() + with pytest.raises(TestFailure) as exc: + accepted_values(con, "x.tbl", "kind", values=["A", "B"]) + msg = str(exc.value) + assert "x.tbl.kind has 3 value(s) outside accepted set" in msg + # should include sample values + assert "X" in msg or "Y" in msg + + +# 
--------------------------------------------------------------------------- +# not_null / unique +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_not_null_ok(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(0,)]) + + # should not raise + not_null(FakeCon(), "tbl", "col") + + +@pytest.mark.unit +def test_not_null_fails_on_nulls(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(2,)]) + + with pytest.raises(TestFailure) as exc: + not_null(FakeCon(), "tbl", "col") + assert "has 2 NULL-values" in str(exc.value) + + +@pytest.mark.unit +def test_not_null_wraps_db_error(): + class FakeCon: + def execute(self, sql): + raise RuntimeError("undefinedcolumn: foo HAVING") + + with pytest.raises(TestFailure) as exc: + not_null(FakeCon(), "tbl", "col") + msg = str(exc.value).lower() + assert "error in tbl.col" in msg + assert "undefinedcolumn" in msg + assert "having" in msg or "note: postgres does not permit alias usage" in msg + + +@pytest.mark.unit +def test_unique_ok(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(0,)]) + + unique(FakeCon(), "tbl", "col") + + +@pytest.mark.unit +def test_unique_fails(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(5,)]) + + with pytest.raises(TestFailure) as exc: + unique(FakeCon(), "tbl", "col") + assert "contains 5 duplicates" in str(exc.value) + + +# --------------------------------------------------------------------------- +# numeric checks +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_greater_equal_ok(): + class FakeCon: + def execute(self, sql): + # no rows with < threshold + return _FakeResult([(0,)]) + + greater_equal(FakeCon(), "tbl", "amount", threshold=10) + + +@pytest.mark.unit +def test_greater_equal_fails(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(3,)]) + + with pytest.raises(TestFailure) as exc: 
+ greater_equal(FakeCon(), "tbl", "amount", threshold=10) + assert "has 3 values < 10" in str(exc.value) + + +@pytest.mark.unit +def test_non_negative_sum_ok(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(0,)]) + + non_negative_sum(FakeCon(), "tbl", "amount") + + +@pytest.mark.unit +def test_non_negative_sum_fails(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(-5,)]) + + with pytest.raises(TestFailure) as exc: + non_negative_sum(FakeCon(), "tbl", "amount") + assert "is negative: -5" in str(exc.value) + + +@pytest.mark.unit +def test_row_count_between_ok(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(5,)]) + + row_count_between(FakeCon(), "tbl", min_rows=1, max_rows=10) + + +@pytest.mark.unit +def test_row_count_between_too_few(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(0,)]) + + with pytest.raises(TestFailure): + row_count_between(FakeCon(), "tbl", min_rows=1) + + +@pytest.mark.unit +def test_row_count_between_too_many(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(50,)]) + + with pytest.raises(TestFailure): + row_count_between(FakeCon(), "tbl", min_rows=1, max_rows=10) + + +@pytest.mark.unit +def test_freshness_ok(): + class FakeCon: + def execute(self, sql): + # pretend last update was 3 min ago + return _FakeResult([(3.0,)]) + + freshness(FakeCon(), "tbl", "ts", max_delay_minutes=5) + + +@pytest.mark.unit +def test_freshness_too_old(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(99.0,)]) + + with pytest.raises(TestFailure): + freshness(FakeCon(), "tbl", "ts", max_delay_minutes=10) + + +# --------------------------------------------------------------------------- +# reconcile_* helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_reconcile_equal_exact_ok(): + class FakeCon: + def execute(self, sql): + # both scalar_where calls will read this + return 
_FakeResult([(10,)]) + + reconcile_equal( + FakeCon(), + left={"table": "a", "expr": "sum(x)"}, + right={"table": "b", "expr": "sum(y)"}, + ) + + +@pytest.mark.unit +def test_reconcile_equal_abs_tolerance_ok(): + class FakeCon: + def __init__(self): + self.calls = 0 + + def execute(self, sql): + self.calls += 1 + if self.calls == 1: + return _FakeResult([(10.0,)]) + return _FakeResult([(11.0,)]) + + reconcile_equal( + FakeCon(), + left={"table": "a", "expr": "v"}, + right={"table": "b", "expr": "v"}, + abs_tolerance=1.5, + ) + + +@pytest.mark.unit +def test_reconcile_equal_fails(): + class FakeCon: + def __init__(self): + self.calls = 0 + + def execute(self, sql): + self.calls += 1 + if self.calls == 1: + return _FakeResult([(10.0,)]) + return _FakeResult([(20.0,)]) + + with pytest.raises(TestFailure): + reconcile_equal( + FakeCon(), + left={"table": "a", "expr": "v"}, + right={"table": "b", "expr": "v"}, + ) + + +@pytest.mark.unit +def test_reconcile_ratio_within_ok(): + class FakeCon: + def __init__(self): + self.calls = 0 + + def execute(self, sql): + self.calls += 1 + if self.calls == 1: + return _FakeResult([(100.0,)]) + return _FakeResult([(50.0,)]) + + # ratio = 100 / 50 = 2.0 + reconcile_ratio_within( + FakeCon(), + left={"table": "l", "expr": "x"}, + right={"table": "r", "expr": "y"}, + min_ratio=1.5, + max_ratio=2.5, + ) + + +@pytest.mark.unit +def test_reconcile_ratio_within_fails(): + class FakeCon: + def execute(self, sql): + if "from l" in sql: + return _FakeResult([(10.0,)]) + return _FakeResult([(100.0,)]) + + with pytest.raises(TestFailure): + reconcile_ratio_within( + FakeCon(), + left={"table": "l", "expr": "x"}, + right={"table": "r", "expr": "y"}, + min_ratio=0.5, + max_ratio=0.8, + ) + + +@pytest.mark.unit +def test_reconcile_diff_within_ok(): + class FakeCon: + def __init__(self): + self.calls = 0 + + def execute(self, sql): + self.calls += 1 + if self.calls == 1: + return _FakeResult([(50.0,)]) + return _FakeResult([(53.0,)]) + + 
reconcile_diff_within( + FakeCon(), + left={"table": "l", "expr": "x"}, + right={"table": "r", "expr": "y"}, + max_abs_diff=5.0, + ) + + +@pytest.mark.unit +def test_reconcile_diff_within_fails(): + class FakeCon: + def execute(self, sql): + if "from l" in sql: + return _FakeResult([(10.0,)]) + return _FakeResult([(25.0,)]) + + with pytest.raises(TestFailure): + reconcile_diff_within( + FakeCon(), + left={"table": "l", "expr": "x"}, + right={"table": "r", "expr": "y"}, + max_abs_diff=5.0, + ) + + +@pytest.mark.unit +def test_reconcile_coverage_ok(): + class FakeCon: + def execute(self, sql): + # anti-join count(*) == 0 + return _FakeResult([(0,)]) + + reconcile_coverage( + FakeCon(), + source={"table": "src", "key": "id"}, + target={"table": "tgt", "key": "id"}, + ) + + +@pytest.mark.unit +def test_reconcile_coverage_fails(): + class FakeCon: + def execute(self, sql): + return _FakeResult([(3,)]) + + with pytest.raises(TestFailure): + reconcile_coverage( + FakeCon(), + source={"table": "src", "key": "id"}, + target={"table": "tgt", "key": "id"}, + ) + + +# --------------------------------------------------------------------------- +# _fail +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_fail_builds_message(): + with pytest.raises(TestFailure) as exc: + _fail("check_x", "tbl", "col", "select 1", "oops") + msg = str(exc.value) + assert "[check_x] tbl.col: oops" in msg + assert "select 1" in msg diff --git a/tests/unit/test_utest_cache_flag.py b/tests/unit/test_utest_cache_flag.py index 2e86d4b..6c9d8c8 100644 --- a/tests/unit/test_utest_cache_flag.py +++ b/tests/unit/test_utest_cache_flag.py @@ -13,6 +13,8 @@ def _stub_minimal_context(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): """Stub project/profile/executor so the command can run without I/O.""" + # Make sure the minimal project skeleton passes CLI path validation. 
+ (tmp_path / "models").mkdir(parents=True, exist_ok=True) def fake_load_project_and_env(project_arg: str): # Minimal registry with one model file path (not used by utest runner) diff --git a/tests/unit/test_utest_unit.py b/tests/unit/test_utest_unit.py new file mode 100644 index 0000000..9dfb925 --- /dev/null +++ b/tests/unit/test_utest_unit.py @@ -0,0 +1,686 @@ +# tests/unit/test_utest_unit.py +from __future__ import annotations + +import json +from pathlib import Path +from types import SimpleNamespace +from typing import ClassVar, cast +from unittest.mock import MagicMock + +import pandas as pd +import pytest +import yaml + +from fastflowtransform import utest +from fastflowtransform.cache import FingerprintCache +from fastflowtransform.core import REGISTRY, Node +from fastflowtransform.utest import ( + EnvCtx, + UnitCase, + UnitSpec, + UtestCtx, + _make_env_ctx, + _maybe_skip_by_cache, + _project_root_for_spec, +) + + +def make_fake_cache() -> FingerprintCache: + fake = SimpleNamespace( + load=lambda: None, + save=lambda: None, + update_many=lambda d: None, + get=lambda *a, **k: None, + ) + return cast(FingerprintCache, fake) + + +# ------------------------------------------------------------ +# _deep_merge +# ------------------------------------------------------------ + + +@pytest.mark.unit +def test_deep_merge_merges_nested_dicts(): + base = {"a": 1, "b": {"x": 1, "y": 2}} + override = {"b": {"y": 99, "z": 3}, "c": 5} + + out = utest._deep_merge(base, override) + + assert out == { + "a": 1, + "b": {"x": 1, "y": 99, "z": 3}, + "c": 5, + } + # base sollte nicht mutiert sein + assert base == {"a": 1, "b": {"x": 1, "y": 2}} + + +@pytest.mark.unit +def test_deep_merge_lists_are_replaced(): + base = {"a": [1, 2]} + override = {"a": [9]} + out = utest._deep_merge(base, override) + assert out == {"a": [9]} + + +# ------------------------------------------------------------ +# _extract_defaults_inputs + _fingerprint_case_inputs +# 
------------------------------------------------------------ + + +@pytest.mark.unit +def test_extract_defaults_inputs_missing_returns_empty(): + spec = SimpleNamespace(defaults={}) + res = utest._extract_defaults_inputs(spec) + assert res == {} + + +@pytest.mark.unit +def test_fingerprint_case_inputs_merges_defaults_and_case(tmp_path, monkeypatch): + # wir brauchen einen existierenden CSV-Pfad für die Hash-Pfade + csv_file = tmp_path / "seed.csv" + csv_file.write_text("id,name\n1,A\n", encoding="utf-8") + + spec = SimpleNamespace( + defaults={"inputs": {"src": {"rows": [{"id": 1}]}}}, + path=tmp_path / "ut.yml", + project_dir=tmp_path, + ) + case = SimpleNamespace( + inputs={ + # überschreibt defaults.src + "src": {"rows": [{"id": 2}]}, + # neue relation via CSV + "dim": {"csv": "seed.csv"}, + } + ) + + fp = utest._fingerprint_case_inputs(spec, case) + expected_fp_len = 64 + assert isinstance(fp, str) + assert len(fp) == expected_fp_len + assert all(ch in "0123456789abcdef" for ch in fp) + + +# ------------------------------------------------------------ +# _resolve_csv_path +# ------------------------------------------------------------ + + +@pytest.mark.unit +def test_resolve_csv_path_prefers_yaml_dir(tmp_path, monkeypatch): + proj = tmp_path / "proj" + proj.mkdir() + tests_dir = proj / "tests" / "unit" + tests_dir.mkdir(parents=True) + + csv_in_yaml_dir = tests_dir / "my.csv" + csv_in_yaml_dir.write_text("id\n1\n", encoding="utf-8") + + spec = SimpleNamespace( + path=tests_dir / "case.yml", + project_dir=proj, + ) + + out = utest._resolve_csv_path(spec, "my.csv") + assert out == csv_in_yaml_dir.resolve() + + +@pytest.mark.unit +def test_resolve_csv_path_falls_back_to_project_dir(tmp_path): + proj = tmp_path / "proj" + proj.mkdir() + csv_in_proj = proj / "my.csv" + csv_in_proj.write_text("id\n1\n", encoding="utf-8") + + # spec liegt woanders, aber project_dir zeigt auf proj + spec = SimpleNamespace( + path=tmp_path / "some" / "other.yml", + project_dir=proj, + ) 
+ + out = utest._resolve_csv_path(spec, "my.csv") + assert out == csv_in_proj.resolve() + + +# ------------------------------------------------------------ +# Assertions: assert_rows_equal und Helfer +# ------------------------------------------------------------ + + +@pytest.mark.unit +def test_assert_rows_equal_exact_ok(): + actual = pd.DataFrame([{"id": 1, "name": "a"}]) + expect = [{"id": 1, "name": "a"}] + + # no raise + utest.assert_rows_equal(actual, expect) + + +@pytest.mark.unit +def test_assert_rows_equal_missing_col_raises(): + actual = pd.DataFrame([{"id": 1}]) + expect = [{"id": 1, "name": "a"}] + + with pytest.raises(utest.UnitAssertionFailure) as exc: + utest.assert_rows_equal(actual, expect) + + assert "Missing columns in actual" in str(exc.value) + + +@pytest.mark.unit +def test_assert_rows_equal_ignore_columns(): + actual = pd.DataFrame([{"id": 1, "ts": "2025-01-01"}]) + expect = [{"id": 1}] + + utest.assert_rows_equal(actual, expect, ignore_columns=["ts"]) + + +@pytest.mark.unit +def test_assert_rows_equal_any_order(): + actual = pd.DataFrame([{"id": 2}, {"id": 1}]) + expect = [{"id": 1}, {"id": 2}] + + utest.assert_rows_equal(actual, expect, any_order=True) + + +@pytest.mark.unit +def test_assert_rows_equal_subset_mode(): + actual = pd.DataFrame( + [ + {"id": 1, "val": "x"}, + {"id": 2, "val": "y"}, + ] + ) + expect = [{"id": 2, "val": "y"}] + + # subset -> ok + utest.assert_rows_equal(actual, expect, subset=True) + + +@pytest.mark.unit +def test_assert_rows_equal_subset_missing_row(): + actual = pd.DataFrame([{"id": 1, "val": "x"}]) + expect = [{"id": 2, "val": "y"}] + + with pytest.raises(utest.UnitAssertionFailure) as exc: + utest.assert_rows_equal(actual, expect, subset=True) + + assert "Expected row" in str(exc.value) + + +@pytest.mark.unit +def test_assert_rows_equal_approx_numeric(): + actual = pd.DataFrame([{"id": 1, "score": 1.005}]) + expect = [{"id": 1, "score": 1.0}] + + # tolerance 0.01 -> ok + utest.assert_rows_equal(actual, 
expect, approx={"score": 0.01}) + + +@pytest.mark.unit +def test_apply_approx_equalization_bad_tolerance_raises(): + actual = pd.DataFrame([{"x": 1}]) + expect = pd.DataFrame([{"x": 1}]) + + with pytest.raises(utest.UnitAssertionFailure): + utest._apply_approx_equalization( + actual, + expect, + {"x": "not-a-number"}, # type: ignore[arg-type] + ) + + +# ------------------------------------------------------------ +# validate_inputs_cover_deps +# ------------------------------------------------------------ + + +@pytest.mark.unit +def test_validate_inputs_cover_deps_detects_missing(): + node = Node(name="m", kind="sql", path=Path("."), deps=["src_a", "src_b"]) + expected, missing = utest.validate_inputs_cover_deps( + node, + inputs={"src_a": {"rows": []}}, + ) + + assert expected == ["src_a", "src_b"] + assert missing == ["src_b"] + + +# ------------------------------------------------------------ +# _normalize_cache_mode +# ------------------------------------------------------------ + + +@pytest.mark.unit +def test_normalize_cache_mode_accepts_strings(): + assert utest._normalize_cache_mode("off") == "off" + assert utest._normalize_cache_mode("RO") == "ro" + assert utest._normalize_cache_mode("Rw") == "rw" + + +@pytest.mark.unit +def test_normalize_cache_mode_rejects_unknown(): + with pytest.raises(ValueError): + utest._normalize_cache_mode("something-else") + + +# ------------------------------------------------------------ +# _detect_engine_name +# ------------------------------------------------------------ + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_detect_engine_name_duckdb_like(): + execu = SimpleNamespace(con=object()) + assert utest._detect_engine_name(execu) == "duckdb" + + +@pytest.mark.unit +@pytest.mark.postgres +def test_detect_engine_name_postgres_like(): + execu = SimpleNamespace(engine=object()) + assert utest._detect_engine_name(execu) == "postgres" + + +@pytest.mark.unit +def test_detect_engine_name_unknown(): + execu = SimpleNamespace() 
+ assert utest._detect_engine_name(execu) == "unknown" + + +@pytest.mark.unit +def test_discover_unit_specs_basic(tmp_path, fake_registry): + tests_dir = tmp_path / "tests" / "unit" + tests_dir.mkdir(parents=True) + spec_path = tests_dir / "a.yml" + spec_path.write_text( + yaml.safe_dump( + { + "model": "model_a", + "engine": "duckdb", + "defaults": { + "inputs": {"src1": {"rows": [{"id": 1}]}}, + "expect": {"rows": [{"id": 1}]}, + }, + "cases": [ + { + "name": "c1", + "inputs": {"src1": {"rows": [{"id": 2}]}}, + "expect": {"rows": [{"id": 2}]}, + } + ], + } + ), + encoding="utf-8", + ) + + specs = utest.discover_unit_specs(tmp_path) + assert len(specs) == 1 + s = specs[0] + assert s.model == "model_a" + assert len(s.cases) == 1 + # merge muss greifen: expect.rows aus case überschreibt defaults + assert s.cases[0].expect["rows"] == [{"id": 2}] + + +@pytest.mark.unit +def test_discover_unit_specs_only_model_filter(tmp_path, fake_registry): + tests_dir = tmp_path / "tests" / "unit" + tests_dir.mkdir(parents=True) + (tests_dir / "a.yml").write_text( + yaml.safe_dump({"model": "model_a", "cases": [{"name": "x"}]}), encoding="utf-8" + ) + (tests_dir / "b.yml").write_text( + yaml.safe_dump({"model": "other_model", "cases": [{"name": "x"}]}), encoding="utf-8" + ) + + specs = utest.discover_unit_specs(tmp_path, only_model="model_a") + assert len(specs) == 1 + assert specs[0].model == "model_a" + + +# --------------------------------------------------------------------------- +# _load_relation_from_rows (duckdb-pfad) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_load_relation_from_rows_duckdb(duckdb_executor): + rows = [{"id": 1}, {"id": 2}] + # wir lassen unregister fehlschlagen, damit der Fallback getriggert wird + duckdb_executor.con.unregister.side_effect = Exception("no unregister in this version") + + utest._load_relation_from_rows(duckdb_executor, "tmp_tbl", rows) + + # register 
mit tmp-name + assert duckdb_executor.con.register.call_count == 1 + # er muss create or replace table ... ausführen + executed_sqls = [c.args[0] for c in duckdb_executor.con.execute.call_args_list] + assert any("create or replace table" in sql.lower() for sql in executed_sqls) + # fallback drop view + assert any("drop view if exists" in sql.lower() for sql in executed_sqls) + + +# --------------------------------------------------------------------------- +# _load_relation_from_csv +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_load_relation_from_csv_calls_rows(monkeypatch, tmp_path, duckdb_executor): + csv_path = tmp_path / "data.csv" + csv_path.write_text("id,value\n1,a\n2,b\n", encoding="utf-8") + + called = {} + + def fake_rows(executor, rel, rows): + called["rel"] = rel + called["rows"] = rows + + monkeypatch.setattr(utest, "_load_relation_from_rows", fake_rows) + + utest._load_relation_from_csv(duckdb_executor, "my_rel", csv_path) + + assert called["rel"] == "my_rel" + expected_row_count = 2 + assert len(called["rows"]) == expected_row_count + assert called["rows"][0]["id"] == 1 + + +# --------------------------------------------------------------------------- +# _read_result +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_read_result_duckdb(duckdb_executor): + df = utest._read_result(duckdb_executor, "some_table") + assert isinstance(df, pd.DataFrame) + assert list(df.columns) == ["id"] + + +@pytest.mark.unit +@pytest.mark.postgres +def test_read_result_postgres(monkeypatch, postgres_executor): + # wir patchen pandas.read_sql_query, damit er keine DB braucht + fake_df = pd.DataFrame([{"x": 1}]) + + def fake_read_sql(query, conn): + return fake_df + + monkeypatch.setattr(utest.pd, "read_sql_query", fake_read_sql) + + df = utest._read_result(postgres_executor, "target_table") + assert 
isinstance(df, pd.DataFrame) + assert list(df.columns) == ["x"] + + +# --------------------------------------------------------------------------- +# _project_root_for_spec (fallback) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_project_root_for_spec_fallback(tmp_path, monkeypatch): + # make registry neutral + monkeypatch.setattr(REGISTRY, "project_dir", None, raising=False) + monkeypatch.setattr(REGISTRY, "get_project_dir", lambda: None, raising=False) + + spec_dir = tmp_path / "tests" / "unit" + spec_dir.mkdir(parents=True) + spec_path = spec_dir / "x.yml" + spec_path.write_text("model: m1\n", encoding="utf-8") + + spec = UnitSpec( + model="m1", + engine=None, + defaults={}, + cases=[UnitCase(name="c1", inputs={}, expect={})], + path=spec_path, + project_dir=tmp_path, + ) + + root = _project_root_for_spec(spec) + + assert root == spec_path.parent + + +# --------------------------------------------------------------------------- +# _extract_defaults_inputs (cases 1, 2, 3) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_extract_defaults_inputs_from_dict(): + spec = SimpleNamespace(defaults={"inputs": {"a": 1}}) + res = utest._extract_defaults_inputs(spec) + assert res == {"a": 1} + + +@pytest.mark.unit +def test_extract_defaults_inputs_from_attr(): + class D: + inputs: ClassVar[dict[str, int]] = {"b": 2} + + spec = SimpleNamespace(defaults=D()) + res = utest._extract_defaults_inputs(spec) + assert res == {"b": 2} + + +@pytest.mark.unit +def test_extract_defaults_inputs_from_get(): + class D: + def get(self, key): + if key == "inputs": + return {"c": 3} + return None + + spec = SimpleNamespace(defaults=D()) + res = utest._extract_defaults_inputs(spec) + assert res == {"c": 3} + + +# --------------------------------------------------------------------------- +# _make_env_ctx, _make_cache, _get_project_dir_safe +# 
--------------------------------------------------------------------------- + + +def test_make_env_ctx_uses_registry_sources(monkeypatch): + # fake registry sources + fake_sources = {"src1": {"tables": ["t1"]}} + monkeypatch.setattr(REGISTRY, "sources", fake_sources, raising=False) + + ctx = _make_env_ctx("duckdb") + + # jetzt auf sources_json gehen + data = json.loads(ctx.sources_json) + assert data == fake_sources + assert ctx.engine == "duckdb" + assert ctx.profile == "utest" + + +@pytest.mark.unit +def test_make_cache_none_engine(): + c = utest._make_cache(None, "duckdb") + assert c is None + + +@pytest.mark.unit +def test_make_cache_real(tmp_path): + c = utest._make_cache(tmp_path, "duckdb") + assert c is not None + + +@pytest.mark.unit +def test_get_project_dir_safe_ok(tmp_path, monkeypatch): + reg = SimpleNamespace(get_project_dir=lambda: tmp_path) + monkeypatch.setattr(utest, "REGISTRY", reg) + assert utest._get_project_dir_safe() == tmp_path + + +@pytest.mark.unit +def test_get_project_dir_safe_error(monkeypatch): + reg = SimpleNamespace(get_project_dir=lambda: (_ for _ in ()).throw(RuntimeError("boom"))) + monkeypatch.setattr(utest, "REGISTRY", reg) + assert utest._get_project_dir_safe() is None + + +# --------------------------------------------------------------------------- +# _fingerprint_case + _maybe_skip_by_cache +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_fingerprint_case_and_maybe_skip(monkeypatch, tmp_path): + # force cache hit + monkeypatch.setattr( + "fastflowtransform.utest.can_skip_node", + lambda **_: True, + ) + + env_ctx = EnvCtx( + engine="duckdb", + profile="utest", + env_vars={}, + sources_json="{}", + ) + + fake_exec = SimpleNamespace() + + ctx = UtestCtx( + executor=fake_exec, + jenv=object(), + engine_name="duckdb", + env_ctx=env_ctx, + cache=make_fake_cache(), + cache_mode="rw", # we want to test the RW branch + ) + + node = SimpleNamespace(name="m1", meta={}) + 
cand_fp = "fp-123" + + skipped = _maybe_skip_by_cache(node, cand_fp, ctx) + + assert skipped is True + assert ctx.computed_fps == {"m1": "fp-123"} + + +# --------------------------------------------------------------------------- +# _execute_and_update_cache +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_execute_and_update_cache_success(fake_registry, duckdb_executor): + env_ctx = utest._make_env_ctx("duckdb") + cache = MagicMock() + ctx = utest.UtestCtx( + executor=duckdb_executor, + jenv=MagicMock(), + engine_name="duckdb", + env_ctx=env_ctx, + cache=cache, + cache_mode="rw", + ) + node = fake_registry.nodes["model_a"] + ok = utest._execute_and_update_cache(node, "abc123", ctx) + assert ok is True + assert ctx.computed_fps["model_a"] == "abc123" + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_execute_and_update_cache_failure(fake_registry, duckdb_executor): + # wir machen executor kaputt + duckdb_executor.run_sql = MagicMock(side_effect=RuntimeError("boom")) + env_ctx = utest._make_env_ctx("duckdb") + ctx = utest.UtestCtx( + executor=duckdb_executor, + jenv=MagicMock(), + engine_name="duckdb", + env_ctx=env_ctx, + cache=None, + cache_mode="off", + ) + node = fake_registry.nodes["model_a"] + ok = utest._execute_and_update_cache(node, None, ctx) + assert ok is False + assert ctx.failures == 1 + + +# --------------------------------------------------------------------------- +# _read_and_assert +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_read_and_assert_ok(fake_registry, duckdb_executor): + env_ctx = utest._make_env_ctx("duckdb") + ctx = utest.UtestCtx( + executor=duckdb_executor, + jenv=MagicMock(), + engine_name="duckdb", + env_ctx=env_ctx, + cache=None, + cache_mode="off", + ) + spec = SimpleNamespace(model="model_a") + case = SimpleNamespace(expect={"rows": [{"id": 1}]}) + 
utest._read_and_assert(spec, case, ctx) + assert ctx.failures == 0 + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_read_and_assert_mismatch(fake_registry, duckdb_executor, monkeypatch): + # actual ist id=1, expected ist id=2 -> mismatch + env_ctx = utest._make_env_ctx("duckdb") + ctx = utest.UtestCtx( + executor=duckdb_executor, + jenv=MagicMock(), + engine_name="duckdb", + env_ctx=env_ctx, + cache=None, + cache_mode="off", + ) + spec = SimpleNamespace(model="model_a") + case = SimpleNamespace(expect={"rows": [{"id": 2}]}) + utest._read_and_assert(spec, case, ctx) + assert ctx.failures == 1 + + +# --------------------------------------------------------------------------- +# run_unit_specs (kleiner happy path) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_run_unit_specs_happy(tmp_path, fake_registry, duckdb_executor, monkeypatch): + # wir bauen uns per Hand einen spec + spec = utest.UnitSpec( + model="model_a", + engine="duckdb", + defaults={"inputs": {"src1": {"rows": [{"id": 1}]}}}, + cases=[ + utest.UnitCase( + name="c1", + inputs={"src1": {"rows": [{"id": 1}]}}, + expect={"rows": [{"id": 1}]}, + ) + ], + path=tmp_path / "tests" / "unit" / "x.yml", + project_dir=tmp_path, + ) + # jenv ist hier egal + failures = utest.run_unit_specs([spec], duckdb_executor, jenv=MagicMock(), cache_mode="off") + assert failures == 0 diff --git a/tests/unit/test_validation_unit.py b/tests/unit/test_validation_unit.py new file mode 100644 index 0000000..af7952d --- /dev/null +++ b/tests/unit/test_validation_unit.py @@ -0,0 +1,121 @@ +# tests/unit/test_validation_unit.py +from __future__ import annotations + +import pandas as pd +import pytest + +from fastflowtransform.validation import validate_required_columns + + +@pytest.mark.unit +def test_validate_required_columns_no_requires_returns(): + # nothing to validate → no exception + df = pd.DataFrame({"id": [1]}) + 
validate_required_columns("node_x", df, {}) + # if we got here: ok + + +@pytest.mark.unit +def test_single_dataframe_all_columns_present(): + df = pd.DataFrame( + { + "id": [1, 2], + "email": ["a@example.com", "b@example.com"], + } + ) + requires = {"users": {"id", "email"}} + + # should not raise + validate_required_columns("users_enriched", df, requires) + + +@pytest.mark.unit +def test_single_dataframe_missing_column_raises(): + df = pd.DataFrame( + { + "id": [1, 2], + } + ) + requires = {"users": {"id", "email"}} + + with pytest.raises(ValueError) as excinfo: + validate_required_columns("users_enriched", df, requires) + + msg = str(excinfo.value) + assert "Required columns check failed for Python model 'users_enriched'." in msg + # the detail + assert "- missing columns: ['email'] | have=['id']" in msg + assert "Hint: define/adjust `require=`" in msg + + +@pytest.mark.unit +def test_multi_inputs_all_good(): + inputs = { + "users": pd.DataFrame({"id": [1], "email": ["a@example.com"]}), + "orders": pd.DataFrame({"order_id": [10], "user_id": [1]}), + } + requires = { + "users": {"id", "email"}, + "orders": {"order_id", "user_id"}, + } + + # should not raise + validate_required_columns("mart_orders_enriched", inputs, requires) + + +@pytest.mark.unit +def test_multi_inputs_missing_dep_key(): + inputs = { + "users": pd.DataFrame({"id": [1], "email": ["a@example.com"]}), + # "orders" fehlt + } + requires = { + "users": {"id", "email"}, + "orders": {"order_id", "user_id"}, + } + + with pytest.raises(ValueError) as excinfo: + validate_required_columns("mart_orders_enriched", inputs, requires) + + msg = str(excinfo.value) + assert "- missing dependency key 'orders' in inputs dict" in msg + assert "mart_orders_enriched" in msg + + +@pytest.mark.unit +def test_multi_inputs_missing_columns_in_one_dep(): + inputs = { + "users": pd.DataFrame({"id": [1], "email": ["a@example.com"]}), + "orders": pd.DataFrame({"order_id": [10]}), # user_id fehlt + } + requires = { + "users": 
{"id", "email"}, + "orders": {"order_id", "user_id"}, + } + + with pytest.raises(ValueError) as excinfo: + validate_required_columns("mart_orders_enriched", inputs, requires) + + msg = str(excinfo.value) + assert "- [orders] missing columns: ['user_id'] | have=['order_id']" in msg + assert "Hint:" in msg + + +@pytest.mark.unit +def test_multi_inputs_multiple_errors_are_combined(): + inputs = { + # users fehlt komplett + "orders": pd.DataFrame({"order_id": [10]}), + } + requires = { + "users": {"id"}, + "orders": {"order_id", "user_id"}, + } + + with pytest.raises(ValueError) as excinfo: + validate_required_columns("mart_orders_enriched", inputs, requires) + + msg = str(excinfo.value) + # beide Fehler sollten drin stehen + assert "- missing dependency key 'users' in inputs dict" in msg + assert "- [orders] missing columns: ['user_id'] | have=['order_id']" in msg diff --git a/uv.lock b/uv.lock index 763f89d..86e4660 100644 --- a/uv.lock +++ b/uv.lock @@ -748,21 +748,21 @@ dev = [ { name = "pandas-stubs" }, { name = "pre-commit" }, { name = "pytest" }, + { name = "pytest-cov" }, { name = "ruff" }, { name = "types-pyyaml" }, ] docs = [ { name = "mkdocs" }, + { name = "mkdocs-autorefs" }, + { name = "mkdocs-gen-files" }, + { name = "mkdocs-literate-nav" }, { name = "mkdocs-material" }, + { name = "mkdocs-section-index" }, + { name = "mkdocstrings", extra = ["python"] }, { name = "pymdown-extensions" }, ] -[package.dev-dependencies] -dev = [ - { name = "coverage" }, - { name = "pytest-cov" }, -] - [package.metadata] requires-dist = [ { name = "bigframes", specifier = ">=2.24.0" }, @@ -771,7 +771,12 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "jinja2", specifier = ">=3.1" }, { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.6" }, + { name = "mkdocs-autorefs", marker = "extra == 'docs'", specifier = ">=1.0" }, + { name = "mkdocs-gen-files", marker = "extra == 'docs'", specifier = ">=0.5" }, + { name = "mkdocs-literate-nav", 
marker = "extra == 'docs'", specifier = ">=0.6" }, { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, + { name = "mkdocs-section-index", marker = "extra == 'docs'", specifier = ">=0.3" }, + { name = "mkdocstrings", extras = ["python"], marker = "extra == 'docs'", specifier = ">=0.25" }, { name = "mypy", marker = "extra == 'dev'", specifier = "==1.18.*" }, { name = "pandas", specifier = ">=2.0" }, { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.1" }, @@ -783,6 +788,7 @@ requires-dist = [ { name = "pymdown-extensions", marker = "extra == 'docs'", specifier = ">=10.0" }, { name = "pyspark", specifier = ">=4.0.1" }, { name = "pytest", marker = "extra == 'dev'", specifier = "==8.4.*" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==7.0.*" }, { name = "python-dotenv", specifier = ">=1.0" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = "==0.14.*" }, @@ -794,12 +800,6 @@ requires-dist = [ ] provides-extras = ["dev", "docs"] -[package.metadata.requires-dev] -dev = [ - { name = "coverage", specifier = ">=7.11.0" }, - { name = "pytest-cov", specifier = ">=7.0.0" }, -] - [[package]] name = "filelock" version = "3.20.0" @@ -1251,6 +1251,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] +[[package]] +name = "griffe" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ec/d7/6c09dd7ce4c7837e4cdb11dce980cb45ae3cd87677298dc3b781b6bce7d3/griffe-1.14.0.tar.gz", hash = "sha256:9d2a15c1eca966d68e00517de5d69dd1bc5c9f2335ef6c1775362ba5b8651a13", size = 424684, upload-time = 
"2025-09-05T15:02:29.167Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/b1/9ff6578d789a89812ff21e4e0f80ffae20a65d5dd84e7a17873fe3b365be/griffe-1.14.0-py3-none-any.whl", hash = "sha256:0e9d52832cccf0f7188cfe585ba962d2674b241c01916d780925df34873bceb0", size = 144439, upload-time = "2025-09-05T15:02:27.511Z" }, +] + [[package]] name = "grpc-google-iam-v1" version = "0.14.2" @@ -1748,6 +1760,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, ] +[[package]] +name = "mkdocs-autorefs" +version = "1.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/fa/9124cd63d822e2bcbea1450ae68cdc3faf3655c69b455f3a7ed36ce6c628/mkdocs_autorefs-1.4.3.tar.gz", hash = "sha256:beee715b254455c4aa93b6ef3c67579c399ca092259cc41b7d9342573ff1fc75", size = 55425, upload-time = "2025-08-26T14:23:17.223Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/4d/7123b6fa2278000688ebd338e2a06d16870aaf9eceae6ba047ea05f92df1/mkdocs_autorefs-1.4.3-py3-none-any.whl", hash = "sha256:469d85eb3114801d08e9cc55d102b3ba65917a869b893403b8987b601cf55dc9", size = 25034, upload-time = "2025-08-26T14:23:15.906Z" }, +] + +[[package]] +name = "mkdocs-gen-files" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/48/85/2d634462fd59136197d3126ca431ffb666f412e3db38fd5ce3a60566303e/mkdocs_gen_files-0.5.0.tar.gz", hash = "sha256:4c7cf256b5d67062a788f6b1d035e157fc1a9498c2399be9af5257d4ff4d19bc", size = 7539, upload-time = "2023-04-27T19:48:04.894Z" } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/e7/0f/1e55b3fd490ad2cecb6e7b31892d27cb9fc4218ec1dab780440ba8579e74/mkdocs_gen_files-0.5.0-py3-none-any.whl", hash = "sha256:7ac060096f3f40bd19039e7277dd3050be9a453c8ac578645844d4d91d7978ea", size = 8380, upload-time = "2023-04-27T19:48:07.059Z" }, +] + [[package]] name = "mkdocs-get-deps" version = "0.2.0" @@ -1762,6 +1800,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" }, ] +[[package]] +name = "mkdocs-literate-nav" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/5f/99aa379b305cd1c2084d42db3d26f6de0ea9bf2cc1d10ed17f61aff35b9a/mkdocs_literate_nav-0.6.2.tar.gz", hash = "sha256:760e1708aa4be86af81a2b56e82c739d5a8388a0eab1517ecfd8e5aa40810a75", size = 17419, upload-time = "2025-03-18T21:53:09.711Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/84/b5b14d2745e4dd1a90115186284e9ee1b4d0863104011ab46abb7355a1c3/mkdocs_literate_nav-0.6.2-py3-none-any.whl", hash = "sha256:0a6489a26ec7598477b56fa112056a5e3a6c15729f0214bea8a4dbc55bd5f630", size = 13261, upload-time = "2025-03-18T21:53:08.1Z" }, +] + [[package]] name = "mkdocs-material" version = "9.6.22" @@ -1793,6 +1843,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, ] +[[package]] +name = "mkdocs-section-index" +version = "0.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/93/40/4aa9d3cfa2ac6528b91048847a35f005b97ec293204c02b179762a85b7f2/mkdocs_section_index-0.3.10.tar.gz", hash = "sha256:a82afbda633c82c5568f0e3b008176b9b365bf4bd8b6f919d6eff09ee146b9f8", size = 14446, upload-time = "2025-04-05T20:56:45.387Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/53/76c109e6f822a6d19befb0450c87330b9a6ce52353de6a9dda7892060a1f/mkdocs_section_index-0.3.10-py3-none-any.whl", hash = "sha256:bc27c0d0dc497c0ebaee1fc72839362aed77be7318b5ec0c30628f65918e4776", size = 8796, upload-time = "2025-04-05T20:56:43.975Z" }, +] + +[[package]] +name = "mkdocstrings" +version = "0.30.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mkdocs" }, + { name = "mkdocs-autorefs" }, + { name = "pymdown-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c5/33/2fa3243439f794e685d3e694590d28469a9b8ea733af4b48c250a3ffc9a0/mkdocstrings-0.30.1.tar.gz", hash = "sha256:84a007aae9b707fb0aebfc9da23db4b26fc9ab562eb56e335e9ec480cb19744f", size = 106350, upload-time = "2025-09-19T10:49:26.446Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/2c/f0dc4e1ee7f618f5bff7e05898d20bf8b6e7fa612038f768bfa295f136a4/mkdocstrings-0.30.1-py3-none-any.whl", hash = "sha256:41bd71f284ca4d44a668816193e4025c950b002252081e387433656ae9a70a82", size = 36704, upload-time = "2025-09-19T10:49:24.805Z" }, +] + +[package.optional-dependencies] +python = [ + { name = "mkdocstrings-python" }, +] + +[[package]] +name = "mkdocstrings-python" +version = "1.18.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "griffe" }, + { name = "mkdocs-autorefs" }, + { name = "mkdocstrings" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/95/ae/58ab2bfbee2792e92a98b97e872f7c003deb903071f75d8d83aa55db28fa/mkdocstrings_python-1.18.2.tar.gz", hash = "sha256:4ad536920a07b6336f50d4c6d5603316fafb1172c5c882370cbbc954770ad323", size = 207972, upload-time = "2025-08-28T16:11:19.847Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/8f/ce008599d9adebf33ed144e7736914385e8537f5fc686fdb7cceb8c22431/mkdocstrings_python-1.18.2-py3-none-any.whl", hash = "sha256:944fe6deb8f08f33fa936d538233c4036e9f53e840994f6146e8e94eb71b600d", size = 138215, upload-time = "2025-08-28T16:11:18.176Z" }, +] + [[package]] name = "multidict" version = "6.7.0"