diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6e49bd8..7aed4e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,14 +66,35 @@ jobs: - name: Unit tests (fast) env: PYTHONWARNINGS: default - run: uv run pytest -q tests -m "not slow and not postgres" --maxfail=1 + run: uv run pytest -q tests -m unit --maxfail=1 - # ---------- smoke: examples/simple_duckdb with view + ephemeral ---------- - smoke-duckdb: + # ---------- Examples: Integration Tests ---------- + examples-matrix: runs-on: ubuntu-latest needs: checks + strategy: + fail-fast: false + matrix: + engine: [duckdb, postgres, databricks_spark] + + services: + postgres: + image: postgres:16 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: fastflowtransform + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U postgres" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 - name: Setup uv (and Python) uses: astral-sh/setup-uv@v5 @@ -81,60 +102,30 @@ jobs: python-version: "3.12" enable-cache: true - - name: Sync deps - run: uv sync - - - name: Prepare ephemeral + view models in example - shell: bash - run: | - set -euo pipefail - PROJECT="examples/simple_duckdb" - mkdir -p "${PROJECT}/models" - - cat > "${PROJECT}/models/ephemeral_ids.ff.sql" <<'SQL' - {{ config(materialized='ephemeral') }} - select id from {{ source('crm','users') }} - SQL - - cat > "${PROJECT}/models/v_users.ff.sql" <<'SQL' - {{ config(materialized='view') }} - select u.id - from {{ ref('users.ff') }} u - join {{ ref('ephemeral_ids.ff') }} e using(id) - SQL - - - name: Seed example (DuckDB file db) - env: - FF_ENGINE: duckdb - FF_DUCKDB_PATH: examples/simple_duckdb/.local/demo.duckdb - run: uv run fft seed examples/simple_duckdb --env dev + - name: Sync deps (dev) + run: uv sync --extra dev --frozen - - name: Run models (ephemeral inline + view 
materialization) - env: - FF_ENGINE: duckdb - FF_DUCKDB_PATH: examples/simple_duckdb/.local/demo.duckdb - run: uv run fft run examples/simple_duckdb --env dev + - name: Setup Java for Spark + if: matrix.engine == 'databricks_spark' + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '17' - - name: Smoke assertions (query DuckDB) - run: | - uv run python - <<'PY' - import duckdb, pathlib - db = "examples/simple_duckdb/.local/demo.duckdb" - assert pathlib.Path(db).exists(), "DuckDB file not found" - con = duckdb.connect(db) - n = con.execute("select count(*) from v_users").fetchone()[0] - assert n >= 1, f"v_users empty (count={n})" - existing = {r[0] for r in con.execute( - "select table_name from information_schema.tables where table_schema in ('main','temp')" - ).fetchall()} - assert "ephemeral_ids" not in existing, "ephemeral_ids should not be materialized" - print("✓ smoke ok: v_users present, ephemeral inlined") - PY - - - name: Build DAG (optional sanity) + - name: Run example/integration tests for engine env: - FF_ENGINE: duckdb - FF_DUCKDB_PATH: examples/simple_duckdb/.local/demo.duckdb + FF_PG_DSN: postgresql+psycopg://postgres:postgres@localhost:5432/fastflowtransform + FF_PG_SCHEMA: ci_examples run: | - uv run fft dag examples/simple_duckdb --env dev --html - test -f examples/simple_duckdb/site/dag/index.html + echo "Running integration tests for engine=${{ matrix.engine }}" + case "${{ matrix.engine }}" in + duckdb) + uv run pytest -m "integration and duckdb" --maxfail=1 -q tests + ;; + postgres) + uv run pytest -m "integration and postgres" --maxfail=1 -q tests + ;; + databricks_spark) + uv run pytest -m "integration and databricks_spark" --maxfail=1 -q tests + ;; + esac diff --git a/.gitignore b/.gitignore index b80f732..35acb1e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Envs & Secrets -.env -.env.* +.env.local +.env.*.local # Local DBs / Artifacts *.duckdb @@ -36,7 +36,7 @@ spark-warehouse metastore_db 
derby.log .fastflowtransform -Combined.md +_exports/** # Editors / IDEs .vscode/ diff --git a/Makefile.dev b/Makefile.dev index 27125ff..95c2918 100644 --- a/Makefile.dev +++ b/Makefile.dev @@ -59,3 +59,6 @@ act-commit: concat-docs: $(UV) run python _scripts/concat_docs.py -o Combined.md + +export-demo: + $(UV) run python _scripts/export_subdir_md.py examples/incremental_demo -o _exports/incremental_demo_export.md --exclude-ext html css \ No newline at end of file diff --git a/Makefile.pipeline b/Makefile.pipeline index 9390625..4b82eb8 100644 --- a/Makefile.pipeline +++ b/Makefile.pipeline @@ -5,6 +5,9 @@ FFT := FF_ENGINE=duckdb FF_DUCKDB_PATH="$(FF_DB)" fft +init: + $(UV) run fft init examples/materializations_demo + seed: $(FFT) seed "$(FF_PROJECT)" --env dev diff --git a/_scripts/concat_docs.py b/_scripts/concat_docs.py index c0ca6ff..0b04287 100644 --- a/_scripts/concat_docs.py +++ b/_scripts/concat_docs.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python3 -# concat_docs.py +# _scripts/concat_docs.py """ Concatenates all Markdown files from the docs directory into a single file. - Respects the order in mkdocs.yml (nav).
@@ -15,7 +14,6 @@ from __future__ import annotations import argparse import fnmatch -import os from pathlib import Path import re import sys diff --git a/_scripts/export_subdir_md.py b/_scripts/export_subdir_md.py new file mode 100644 index 0000000..1982525 --- /dev/null +++ b/_scripts/export_subdir_md.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +import argparse +import subprocess +from pathlib import Path + + +def get_git_root() -> Path: + """Return the root directory of the current Git repository.""" + try: + out = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True).strip() + return Path(out) + except subprocess.CalledProcessError: + raise SystemExit("Error: This script must be run inside a Git repository.") + + +def get_git_files(git_root: Path) -> list[Path]: + """ + Return all files that are not ignored by Git (tracked + untracked, minus + standard ignores). Uses NUL-separated output so Git never quotes odd names. + """ + try: + out = subprocess.check_output( + ["git", "ls-files", "-z", "--cached", "--others", "--exclude-standard"], + text=True, + cwd=git_root, + ) + except subprocess.CalledProcessError as e: + raise SystemExit(f"Error while running 'git ls-files': {e}") + paths = [git_root / name for name in out.split("\0") if name]  # -z output ends with a trailing NUL + return paths + + +def is_under_dir(path: Path, directory: Path) -> bool: + """Return True if 'path' is located under 'directory'.""" + try: + path.relative_to(directory) + return True + except ValueError: + return False + + +def is_binary_file(path: Path, chunk_size: int = 2048) -> bool: + """ + Simple heuristic to check if a file is binary. + + Reads the first 'chunk_size' bytes and checks for NUL bytes or + decoding errors when interpreting as UTF-8.
+ """ + try: + with path.open("rb") as f: + chunk = f.read(chunk_size) + # NUL byte or decode error => treat as binary + if b"\0" in chunk: + return True + try: + chunk.decode("utf-8") + except UnicodeDecodeError: + return True + return False + except OSError: + # If file cannot be read for some reason, treat it as binary + return True + + +def build_tree_structure(files: list[Path], base_dir: Path) -> str: + """ + Build a textual tree structure relative to 'base_dir'. + + 'files' should be the list of all files under 'base_dir'. + """ + # Work with paths relative to base_dir + rel_paths = [f.relative_to(base_dir) for f in files] + # Nested dict-based tree representation + tree = {} + + for rel in rel_paths: + parts = rel.parts + current = tree + for part in parts[:-1]: + current = current.setdefault(part + "/", {}) + # Store files under special key + current.setdefault("__files__", []).append(parts[-1]) + + lines = [] + root_name = base_dir.name + "/" + lines.append(root_name) + + def walk(node: dict, prefix: str = " "): + """Recursively traverse the tree and build the text representation.""" + # Files in the current directory + files_here = sorted(node.get("__files__", [])) + for fname in files_here: + lines.append(f"{prefix}{fname}") + # Subdirectories + for key in sorted(k for k in node.keys() if k != "__files__"): + lines.append(f"{prefix}{key}") + walk(node[key], prefix + " ") + + walk(tree) + return "\n".join(lines) + + +def normalize_ext_list(exts: list[str]) -> set[str]: + """ + Normalize a list of file extensions: + + - ensure each starts with a dot (.) + - convert all to lowercase + """ + norm = set() + for e in exts: + e = e.strip() + if not e: + continue + if not e.startswith("."): + e = "." + e + norm.add(e.lower()) + return norm + + +def main(): + parser = argparse.ArgumentParser( + description="Concatenate the contents of all non-ignored files in a subdirectory into a Markdown file." 
+ ) + parser.add_argument( + "subdir", help="Subdirectory inside the Git repository (relative or absolute)." + ) + parser.add_argument( + "-o", + "--output", + default="combined.md", + help="Path to the output Markdown file (default: combined.md)", + ) + parser.add_argument( + "--exclude-ext", + nargs="*", + default=[], + help="File extensions to exclude, e.g. --exclude-ext .html .css js", + ) + args = parser.parse_args() + + git_root = get_git_root() + subdir_path = Path(args.subdir).resolve() + + # Ensure that the given subdirectory is inside the Git repository + if not is_under_dir(subdir_path, git_root): + raise SystemExit( + f"Error: The given subdirectory is not inside the Git repository: {subdir_path}" + ) + + if not subdir_path.is_dir(): + raise SystemExit(f"Error: {subdir_path} is not a directory.") + + all_git_files = get_git_files(git_root) + + # Filter to files under the given subdirectory + files_in_subdir = [f for f in all_git_files if is_under_dir(f, subdir_path) and f.is_file()] + + # Normalize and apply excluded extensions + excluded_exts = normalize_ext_list(args.exclude_ext) + if excluded_exts: + files_in_subdir = [f for f in files_in_subdir if f.suffix.lower() not in excluded_exts] + + files_in_subdir = sorted(files_in_subdir) + + if not files_in_subdir: + raise SystemExit( + "No matching files found in the subdirectory (or all are excluded/ignored)." 
+ ) + + # Build directory tree for Markdown + tree_md = build_tree_structure(files_in_subdir, subdir_path) + + output_path = Path(args.output).resolve() + output_path.parent.mkdir(parents=True, exist_ok=True)  # target dir may not exist yet (e.g. git-ignored _exports/) + skipped_binary = [] + + with output_path.open("w", encoding="utf-8") as out: + # Title + out.write(f"# Export from `{subdir_path.relative_to(git_root)}`\n\n") + + # Directory structure + out.write("## Directory structure\n\n") + out.write("```text\n") + out.write(tree_md) + out.write("\n```\n\n") + + # Files + out.write("## Files\n\n") + + for file_path in files_in_subdir: + rel = file_path.relative_to(git_root) + if is_binary_file(file_path): + skipped_binary.append(rel) + continue + + out.write(f"### `{rel}`\n\n") + out.write("```text\n") + try: + content = file_path.read_text(encoding="utf-8") + except UnicodeDecodeError: + skipped_binary.append(rel) + out.write("[File could not be read as UTF-8]\n") + out.write("```\n\n") + continue + out.write(content) + if not content.endswith("\n"): + out.write("\n") + out.write("```\n\n") + + if skipped_binary: + out.write("## Skipped files (binary or not readable)\n\n") + for rel in skipped_binary: + out.write(f"- `{rel}`\n") + + print(f"Done! Output written to: {output_path}") + + +if __name__ == "__main__": + main() diff --git a/docs/Auto_Docs.md b/docs/Auto_Docs.md new file mode 100644 index 0000000..34914e4 --- /dev/null +++ b/docs/Auto_Docs.md @@ -0,0 +1,69 @@ +# Auto-Docs & Lineage + +FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) plus an optional JSON manifest for external tooling. + +## Commands + +```bash +# Classic +fft dag . --env dev --html + +# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) +fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json +``` + +Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. + +## Descriptions + +Descriptions can be provided in YAML (`project.yml`) and/or Markdown files.
Markdown has higher priority. + +YAML in `project.yml`: + +```yaml +docs: + models: + users.ff: + description: "Raw users table imported from CRM." + columns: + id: "Primary key." + email: "User email address." + users_enriched: + description: "Adds gmail flag." + columns: + is_gmail: "True if email ends with @gmail.com" +``` + +Markdown overrides YAML when present: + +``` +<project>/docs/models/<model>.md +<project>/docs/columns/<relation>/<column>.md +``` + +Optional front matter is ignored for now (title/tags may be used later). + +## Column Lineage + +- SQL models: expressions like `col`, `alias AS out`, `upper(u.email) AS email_upper` are parsed; `u` must come from a `FROM ... AS u` clause that resolves to a relation. Functions mark lineage as *transformed*. +- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. +- Override hints in YAML when the heuristic is insufficient: + +```yaml +docs: + models: + mart_orders_enriched: + lineage: + email_upper: + from: [{ table: users, column: email }] + transformed: true +``` + +## JSON Manifest + +The optional manifest (via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), and lineage per column—useful for custom doc portals or CI checks. + +## Notes + +- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. +- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. diff --git a/docs/CLI_Guide.md b/docs/CLI_Guide.md new file mode 100644 index 0000000..042e5ed --- /dev/null +++ b/docs/CLI_Guide.md @@ -0,0 +1,37 @@ +# CLI Guide + +FastFlowTransform’s CLI is the entry point for seeding data, running DAGs, generating docs, syncing metadata, and executing quality tests. This guide summarizes the day-to-day commands and how they fit together. See `src/fastflowtransform/cli.py` for Typer definitions.
+ +## Core Commands + +| Command | Purpose | +|---------|---------| +| `fft seed <project> [--env dev]` | Materialize CSV/Parquet seeds into the configured engine. | +| `fft run <project> [--env dev]` | Execute the DAG (obeys cache + parallel flags). | +| `fft dag <project> --html` | Render the DAG graph/site for quick inspection. | +| `fft docgen <project> --out site/docs` | Generate the full documentation bundle (graph + model pages + optional JSON). | +| `fft test <project> [--env dev]` | Run schema/data-quality tests defined in `project.yml` or schema YAML files. | +| `fft utest <project>` | Execute unit tests defined under `tests/unit/*.yml`. | +| `fft sync-db-comments <project>` | Push model/column descriptions into Postgres or Snowflake comments. | + +Use `--select` to scope `run`, `dag`, or `test` commands (e.g. `state:modified`, `tag:finance`, `result:error`). Environment overrides rely on the selected profile in `profiles.yml` or the `FF_*` variables. + +## HTTP/API Helpers + +Python models can make HTTP calls via `fastflowtransform.api.http`. When you need examples, head over to `docs/Api_Models.md` for `get_json`, `get_df`, pagination helpers, caching, and offline modes. + +## DAG & Documentation + +- Narrow the graph with `fft dag ... --select <pattern>` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini-site under `<project>/docs/index.html`. +- Control schema introspection via `--with-schema/--no-schema`. Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). +- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. + +## Sync Database Comments + +`fft sync-db-comments <project> --env <env>` pushes model and column descriptions from project YAML or Markdown into database comments.
The command currently supports Postgres and Snowflake Snowpark: + +- Start with `--dry-run` to review the generated `COMMENT` statements. +- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). +- Snowflake reuses the session or connection exposed by the executor. + +If no descriptions are found, the command exits without making changes. diff --git a/docs/Cache_and_Parallelism.md b/docs/Cache_and_Parallelism.md index d2b1a4b..479ccec 100644 --- a/docs/Cache_and_Parallelism.md +++ b/docs/Cache_and_Parallelism.md @@ -1,9 +1,9 @@ -### 🆕 `docs/Cache_and_Parallelism.md` +# Parallelism & Cache -````markdown -# Parallelism & Cache (FastFlowTransform v0.3) +**TL;DR:** FastFlowTransform executes models in parallel DAG levels and uses deterministic +fingerprints to skip unchanged nodes — while a separate HTTP cache accelerates API models. -FastFlowTransform 0.3 introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. +FastFlowTransform introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. --- @@ -34,7 +34,7 @@ fft run . --env dev --jobs 4 # Keep tasks in the same level running even if one fails fft run . --env dev --jobs 4 --keep-going -```` +``` --- @@ -60,6 +60,23 @@ A node is skipped iff: If the relation was dropped externally, FastFlowTransform will **rebuild** even if the fingerprint matches. +### HTTP Response Cache + +In addition to the build cache, FastFlowTransform provides an **HTTP response cache** for API models using +`fastflowtransform.api.http.get_df(...)`. 
+ +- **Purpose:** Avoid redundant API calls and support offline mode. +- **Location:** Controlled by `FF_HTTP_CACHE_DIR` (e.g. `.local/http-cache`). +- **Controls (environment):** + - `FF_HTTP_ALLOWED_DOMAINS`: comma-separated list of hosts allowed to cache. + - `FF_HTTP_MAX_RPS`, `FF_HTTP_MAX_RETRIES`, `FF_HTTP_TIMEOUT`: rate limiting & retry policy. + - `FF_HTTP_OFFLINE=1`: run in offline mode — serve only from cache, no network calls. +- **CLI visibility:** Each run writes HTTP stats (`requests`, `cache_hits`, `bytes`, `used_offline`) + to `.fastflowtransform/target/run_results.json`. +- **Makefile helpers:** see `make api-show-http` in the API demo to inspect HTTP cache usage. + +> This cache is independent from the build cache; it stores API responses, not SQL or fingerprints. + --- ## Fingerprint Formula @@ -88,6 +105,10 @@ Fingerprints are stable hashes that change on any relevant input: * Minimal change in SQL/function ⇒ different hash. * Dependency changes propagate downstream. +> **Note:** The active engine and profile name are part of the fingerprint. +> Switching from `duckdb` to `postgres` automatically invalidates the cache, so cross-engine runs +> never reuse outdated fingerprints. + --- ## Meta Table Schema diff --git a/docs/Config_and_Macros.md b/docs/Config_and_Macros.md index 6ec708d..cdcdca3 100644 --- a/docs/Config_and_Macros.md +++ b/docs/Config_and_Macros.md @@ -377,7 +377,7 @@ fft utest . --env dev fft utest . --model users_enriched --case flags_gmail ``` -See the [Technical Overview](./Technical_Overview.md#model-unit-tests-fft-utest) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). +See the [Model Unit Tests guide](./Unit_Tests.md) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). 
--- diff --git a/docs/Data_Quality_Tests.md b/docs/Data_Quality_Tests.md index d04519e..d6f5a31 100644 --- a/docs/Data_Quality_Tests.md +++ b/docs/Data_Quality_Tests.md @@ -2,6 +2,22 @@ FastFlowTransform exposes a set of built-in data quality checks that you can configure in `project.yml → tests:` and execute with `fft test`. This document lists every supported test, required parameters, and example configurations. +## Supported Test Types + +The following values are currently supported for `type`: + +- `not_null` +- `unique` +- `accepted_values` +- `greater_equal` +- `non_negative_sum` +- `row_count_between` +- `freshness` +- `reconcile_equal` +- `reconcile_ratio_within` +- `reconcile_diff_within` +- `reconcile_coverage` + ## Usage Overview ```yaml @@ -24,10 +40,24 @@ tests: values: [active, invited] severity: warn # warn keeps run green on failure + - type: greater_equal + table: orders + column: amount + threshold: 0 + + - type: non_negative_sum + table: orders + column: amount + - type: row_count_between table: users_enriched - min: 1 - max: 100000 + min_rows: 1 + max_rows: 100000 + + - type: freshness + table: events + column: event_ts + max_delay_minutes: 30 - type: reconcile_equal name: revenue_vs_bookings # optional label in summaries @@ -35,18 +65,18 @@ tests: left: { table: fct_revenue, expr: "sum(amount)" } right: { table: fct_bookings, expr: "sum(expected_amount)" } abs_tolerance: 5.0 ``` Every entry is a single dictionary describing one check. The common keys are: -| Key | Description | -|------------|-------------| -| `type` | Test kind (see tables below). | +| Key | Description | +| ---------- | ------------------------------------------------------------------------ | +| `type` | Test kind (see list above). | | `table` | Target table for table-level checks or display hint for reconciliations. | -| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | -| `severity` | `error` (default) or `warn`.
| -| `tags` | Optional list of selectors for `fft test --select tag:...`. | -| `name` | Optional identifier surfaced in summaries (useful for reconciliations). | +| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | +| `severity` | `error` (default) or `warn`. | +| `tags` | Optional list of selectors for `fft test --select tag:...`. | +| `name` | Optional identifier surfaced in summaries (useful for reconciliations). | Run all configured checks: @@ -54,7 +84,7 @@ Run all configured checks: fft test . --env dev ``` -Use `--select tag:` to restrict by tags (legacy `--select batch` reads the same tags list). Tests always execute regardless of cache settings. +Use `--select tag:` to restrict by tags (e.g. `fft test --select tag:batch`). Tests always execute regardless of cache settings. Each entry produces a summary line. Failures stop the command unless `severity: warn` is set. @@ -63,103 +93,158 @@ Each entry produces a summary line. Failures stop the command unless `severity: These checks operate on a single table (optionally filtered with `where:`). Unless noted, they require a `column` argument. ### `not_null` -- **Purpose:** Assert that a column never contains NULLs. -- **Parameters:** - - `column` *(str, required)* - - `where` *(str, optional)* — SQL predicate applied before the NULL check. -- **Failure:** Reports the number of NULL rows and shows the underlying SQL. + +* **Purpose:** Assert that a column never contains NULLs. +* **Parameters:** + + * `column` *(str, required)* + * `where` *(str, optional)* — SQL predicate applied before the NULL check. +* **Failure:** Reports the number of NULL rows and shows the underlying SQL. + +--- ### `unique` -- **Purpose:** Detect duplicates within a column. -- **Parameters:** - - `column` *(str, required)* - - `where` *(str, optional)* -- **Failure:** Indicates how many duplicate groups were found (HAVING count > 1) and shows a sample query. + +* **Purpose:** Detect duplicates within a column. 
+* **Parameters:** + + * `column` *(str, required)* + * `where` *(str, optional)* +* **Failure:** Indicates how many duplicate groups were found (HAVING `count(*) > 1`) and shows a sample query. + +--- ### `accepted_values` -- **Purpose:** Ensure every non-NULL value is inside an allowed set. -- **Parameters:** - - `column` *(str, required)* - - `values` *(list, required)* — permitted literals (strings are quoted automatically). - - `where` *(str, optional)* -- **Failure:** Shows the number of out-of-set values plus up to five sample values. + +* **Purpose:** Ensure every non-NULL value is inside an allowed set. +* **Parameters:** + + * `column` *(str, required)* + * `values` *(list, required)* — permitted literals (strings are quoted automatically). + * `where` *(str, optional)* — additional filter condition. +* **Behaviour note:** If `values` is omitted or an empty list, the check is treated as a no-op and always passes. The summary still shows the configured test. +* **Failure:** Shows the number of out-of-set values plus up to five sample values. + +--- ### `greater_equal` -- **Purpose:** Require all values to be greater than or equal to a threshold. -- **Parameters:** - - `column` *(str, required)* - - `threshold` *(number, default `0`)* -- **Failure:** Lists how many rows fell below the threshold. + +* **Purpose:** Require all values to be greater than or equal to a threshold. +* **Parameters:** + + * `column` *(str, required)* + * `threshold` *(number, default `0`)* +* **Failure:** Lists how many rows fell below the threshold. + +--- ### `non_negative_sum` -- **Purpose:** Validate that the sum of a numeric column is not negative. -- **Parameters:** - - `column` *(str, required)* -- **Failure:** Reports the signed sum when it is negative. + +* **Purpose:** Validate that the sum of a numeric column is not negative. +* **Parameters:** + + * `column` *(str, required)* +* **Failure:** Reports the signed sum when it is negative. 
+ +--- ### `row_count_between` -- **Purpose:** Guard minimum (and optional maximum) row counts for a table. -- **Parameters:** - - `min` *(int, default `1`)* - - `max` *(int, optional)* — omit for open-ended upper bounds. -- **Failure:** Indicates the observed row count when it falls outside `[min, max]`. + +* **Purpose:** Guard minimum (and optional maximum) row counts for a table. +* **Parameters:** + + * `min_rows` *(int, default `1`)* — minimum expected number of rows. + * `max_rows` *(int, optional)* — omit for open-ended upper bounds. +* **Failure:** Indicates the observed row count when it falls outside `[min_rows, max_rows]`. + +--- ### `freshness` -- **Purpose:** Warn when the latest timestamp is older than an allowed delay. -- **Parameters:** - - `column` *(str, required)* — timestamp column. - - `max_delay_minutes` *(int, required)* — permitted staleness. -- **Failure:** Reports the computed lag in minutes. Uses ANSI-style `DATE_PART` (works on DuckDB/Postgres; extend for other engines as needed). + +* **Purpose:** Warn when the latest timestamp is older than an allowed delay. +* **Parameters:** + + * `column` *(str, required)* — timestamp column. + * `max_delay_minutes` *(int, required)* — permitted staleness in whole minutes. +* **Failure:** Reports the computed lag in minutes. Uses: + + ```sql + select date_part('epoch', now() - max(column)) / 60.0 as delay_min + from + ``` + + This is straightforward for DuckDB/Postgres; other engines may need adaptations. ## Cross-Table Reconciliations -Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. +Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. The top-level `table`/`column` fields are used only for display and grouping; the actual queries are defined via the nested dictionaries. 
### `reconcile_equal` -- **Purpose:** Compare two scalar expressions with optional tolerances. -- **Parameters:** - - `left`, `right` *(dict, required)* with keys: - - `table` *(str, required)* - - `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). - - `where` *(str, optional)* - - `abs_tolerance` *(float, optional)* — maximum absolute difference. - - `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. -- **Failure:** Displays both values, absolute and relative differences. + +* **Purpose:** Compare two scalar expressions with optional tolerances. +* **Parameters:** + + * `left`, `right` *(dict, required)* with keys: + + * `table` *(str, required)* + * `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). + * `where` *(str, optional)* + * `abs_tolerance` *(float, optional)* — maximum absolute difference. + * `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. +* **Failure:** Displays both values, absolute and relative differences. If no tolerance is provided, strict equality is enforced (diff must be exactly `0.0`). + +--- ### `reconcile_ratio_within` -- **Purpose:** Constrain the ratio `left/right` within bounds. -- **Parameters:** - - `left`, `right` *(dict, required as above)* - - `min_ratio`, `max_ratio` *(float, required)* -- **Failure:** Shows the computed ratio and expected interval. + +* **Purpose:** Constrain the ratio `left/right` within bounds. +* **Parameters:** + + * `left`, `right` *(dict, required as above)* + * `min_ratio`, `max_ratio` *(float, required)* +* **Failure:** Shows the computed ratio and expected interval. + +--- ### `reconcile_diff_within` -- **Purpose:** Limit the absolute difference between two aggregates. -- **Parameters:** - - `left`, `right` *(dict, required)* - - `max_abs_diff` *(float, required)* -- **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. 
+ +* **Purpose:** Limit the absolute difference between two aggregates. +* **Parameters:** + + * `left`, `right` *(dict, required)* + * `max_abs_diff` *(float, required)* +* **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. + +--- ### `reconcile_coverage` -- **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero). -- **Parameters:** - - `source` *(dict, required)* — `table` and `key` column. - - `target` *(dict, required)* — `table` and `key` column. - - `source_where` *(str, optional)* — filter applied to the source. - - `target_where` *(str, optional)* — filter applied to the target. -- **Failure:** Reports the number of missing keys. -## Severity & Selectors +* **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero). +* **Parameters:** -- `severity: error` (default) makes failures stop the test run with exit code 1. -- `severity: warn` records the result but keeps the run successful. -- `selectors:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. + * `source` *(dict, required)* — must contain: + + * `table` *(str)* — source table. + * `key` *(str)* — key column in the source. + * `target` *(dict, required)* — must contain: + + * `table` *(str)* — target table. + * `key` *(str)* — key column in the target. + * `source_where` *(str, optional)* — filter applied to the source. + * `target_where` *(str, optional)* — filter applied to the target. +* **Failure:** Reports the number of missing keys. + +## Severity & Tags + +* `severity: error` (default) makes failures stop the test run with exit code 1. +* `severity: warn` records the result but keeps the run successful. +* `tags:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. 
## CLI Summary Output Each executed check produces a line in the summary: -``` +```text ✓ not_null users.email (3ms) ✖ accepted_values events.status values=['new', 'active'] (warn) ``` @@ -168,5 +253,5 @@ Failures include the generated SQL (where available) to simplify debugging. Use ## Further Reading -- [`docs/YAML_Tests.md`](YAML_Tests.md) – schema for YAML-defined tests and advanced scenarios. -- [`fft test --help`] — command-line switches, selectors, and cache options. +* `docs/YAML_Tests.md` – schema for YAML-defined tests and advanced scenarios. +* `fft test --help` — command-line switches, selectors, and cache options. diff --git a/docs/Incremental.md b/docs/Incremental.md index 527f25b..d298bfa 100644 --- a/docs/Incremental.md +++ b/docs/Incremental.md @@ -1,66 +1,417 @@ -# Incremental Models (R1) +# Incremental models -This guide explains how to configure incremental models, use `is_incremental()` in SQL, engine compatibility, and schema change policies. +Incremental models let you **reuse existing data** and only process **new or changed rows** instead of rebuilding a table from scratch on every run. This is essential for larger datasets or frequently running pipelines. -## Quick Start +This page explains the **concepts and configuration** of incremental models in FastFlowTransform (FFT) independently of any specific example project. -A minimal incremental model: +--- + +## Why incremental models? + +By default, a model is built with a **full refresh**: + +* Read all sources +* Recompute all transformations +* Overwrite the target table + +For small tables this is fine. For anything medium-sized or larger, this quickly becomes: + +* slow, +* expensive (especially on cloud warehouses / Spark), +* and unnecessary if only a small portion of rows changed. + +Incremental models solve this by: + +1. Reusing existing target data. +2. Processing only **new / changed** rows. +3. Applying an **incremental strategy** (append or merge). 
+ +--- + +## High-level architecture + +Incremental behaviour is coordinated between three layers: + +1. **Model configuration** + + You declare that a model is incremental and provide hints: + + * Does it append or upsert? + * What is the **unique key**? + * Which column(s) indicate freshness (e.g. `updated_at`)? + + This lives in the model’s `config(...)` (SQL) or `meta` (Python) and is validated against a strict schema. + +2. **Planner / Core** + + FFT looks at: + + * the model’s incremental config (`incremental={...}`), + * whether the physical table already exists, + * CLI flags like `--full-refresh`, + + and decides whether to: + + * run a **full rebuild**, or + * run an **incremental update** using engine hooks. + +3. **Engine executors** (DuckDB, Postgres, Databricks/Spark, …) + + Each engine implements a small incremental API: + + * `exists_relation(relation)` + * `create_table_as(relation, select_sql)` – initial full build + * `full_refresh_table(relation, select_sql)` – forced rebuild + * `incremental_insert(relation, select_sql)` – append-only + * `incremental_merge(relation, select_sql, unique_key)` – upsert / merge + * `alter_table_sync_schema(relation, select_sql, mode=...)` – optional schema evolution + + The planner calls these methods – you just configure the model. + +--- + +## Enabling incremental mode + +You enable incremental mode **per model** via the model config. 
+ +### SQL models + +Inside the Jinja `config` block you use a structured `incremental` dictionary: ```sql --- examples/r1_demo/models/fct_events_inc.ff.sql {{ config( - materialized='incremental', - unique_key=['event_id'], - on_schema_change='append_new_columns' -- or 'sync_all_columns' + materialized='incremental', + tags=['example:incremental', 'engine:duckdb'], + incremental={ + "enabled": true, + "strategy": "merge", # or "append", "insert", "full_refresh" + "unique_key": ["event_id"], + "updated_at_column": "updated_at" + } ) }} -with src as ( - select * from {{ source('app', 'events') }} - {% if is_incremental() %} - where ingested_at > (select coalesce(max(ingested_at), timestamp '1970-01-01') from {{ this.name }}) - {% endif %} -) + select event_id, - user_id, - event_type, - ingested_at, - -- evolving column: will appear later - meta_json -from src; + updated_at, + value +from some_source ```` -### `is_incremental()` +Key points: + +* `materialized='incremental'` tells FFT to use the incremental pipeline. +* `incremental.enabled: true` declares that this model supports incremental processing. +* `unique_key` declares one or more columns that uniquely identify a row in the target. +* `strategy` is a hint for how deltas should be applied (append vs merge etc.). +* `updated_at_column` (or `delta_columns`/`updated_at_columns`) tells FFT which column is used for “new vs old” comparisons (usually a timestamp or monotonically increasing surrogate key). + +There is **no extra `meta={...}` wrapper** anymore – the fields of `config(...)` are validated directly.
+ +### Python engine models + +For `@engine_model` functions you pass the same information via the `meta` parameter – but again with **top-level incremental config**, not inside another `meta` key: + +```python +from fastflowtransform import engine_model + +@engine_model( + only="duckdb", + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=["incremental", "engine:duckdb"], + meta={ + "materialized": "incremental", + "incremental": { + "enabled": True, + "strategy": "merge", + "unique_key": ["event_id"], + "updated_at_column": "updated_at", + }, + }, +) +def build(df): + # Return a frame with event_id, updated_at, value, ... + return df +``` + +The **frame you return** (pandas, Spark, etc.) is treated as the *delta dataset* for incremental processing – FFT does not care how you compute it, only about the columns and the meta. + +--- + +## Incremental strategies + +The core supports at least two conceptual strategies: + +### 1. Append / insert-only (`strategy: "append"` / `"insert"`) + +Use this when: + +* data is immutable once written, and +* new rows have strictly increasing `updated_at` / timestamp or surrogate key. + +Behaviour: + +* For the **first run**, FFT calls `create_table_as(relation, SELECT ...)`. +* For **subsequent runs**: + + * Only rows considered “new” are included in the SELECT (using your configured watermark columns). + * The executor calls `incremental_insert(relation, SELECT ...)` which typically becomes: + + ```sql + INSERT INTO target_table + SELECT ... + ``` + +Good for: + +* log/event style tables +* audit trails +* many ingestion pipelines + +### 2. Merge / upsert (`strategy: "merge"`) + +Use this when: + +* rows may change later, +* you want the target table to always reflect the **latest version** per `unique_key`. + +Behaviour: + +* For the **first run**, same as full refresh: `create_table_as`. +* For **later runs**: + + * The SELECT (or delta query, see below) produces a *delta* frame with new/updated rows. 
+ * Executor tries `incremental_merge(relation, select_sql, unique_key)`. + +Engine-specific behaviour: + +* **Databricks / Spark (Delta)** + The executor attempts a native Delta MERGE: + + ```sql + MERGE INTO target AS t + USING (SELECT ...) AS s + ON t.key1 = s.key1 AND ... + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + ``` + + If MERGE is not supported (non-Delta table), it falls back to a safe full rebuild. + +* **Other engines (DuckDB, Postgres, …)** + The executor can implement merge using either: + + * `INSERT ... ON CONFLICT ... DO UPDATE` (Postgres), or + * a **full-refresh emulation**: build a new version by combining old rows and delta rows and overwrite. + +In all cases, the `unique_key` list is used to match rows between the existing table and the delta frame. + +--- + +## Watermark / delta SQL and default behaviour -* Available in SQL templates during rendering. -* Returns `true` when the model exists and the current `materialized='incremental'` run chooses an incremental path (insert/merge) instead of full rebuild. -* Typical usage: filter the source to “new” rows only. +To decide **which rows are “new enough”** for an incremental run, FFT uses the configuration you provide (for example `updated_at_column` or `delta_columns`) plus the existing table.
-### Engine Matrix (MVP) +A typical default pattern is: -| Engine | Incremental Insert | Merge/Upsert | Schema Change Policy | -| ------------------ | ------------------ | ------------ | -------------------- | -| DuckDB | ✅ insert | 🚧 fallback* | ✅ append new cols | -| Postgres | ✅ insert | 🚧 fallback* | ✅ append new cols | -| BigQuery (classic) | ✅ insert | 🚧 fallback* | 🚧 best-effort | -| BigQuery BigFrames | ✅ insert | 🚧 fallback* | 🚧 best-effort | -| Databricks Spark | ✅ insert | 🚧 fallback* | 🚧 best-effort | -| Snowflake Snowpark | ✅ insert | 🚧 fallback* | 🚧 best-effort | +```sql +where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} +) +``` + +The exact SQL will vary by engine, but the core idea is: + +* Read the current maximum of your watermark column in the target. +* Select only rows strictly newer than that. + +### Overriding the delta logic + +If the default “`updated_at > max(updated_at)`” is not enough, you have a few options: + +1. **Additional delta columns** + + Use `delta_columns` / `updated_at_columns` in `incremental={...}` to indicate multiple fields that drive change detection (especially for Python incremental). -* Fallback strategy merges by delete-on-keys + insert (best effort) if native merge isn’t wired. +2. **Inline delta SQL (`delta_sql`)** -### Schema Change Policies + Provide a custom **delta SELECT** that FFT should use on incremental runs: -* `append_new_columns` (default): new columns appear in target if they show up in the select. -* `sync_all_columns` (planned): attempt to keep type/nullable alignment. Currently not enforced; prefer append in R1. 
+ ```sql + {{ config( + materialized='incremental', + incremental={ + "enabled": true, + "strategy": "merge", + "unique_key": ["event_id"], + "updated_at_column": "updated_at", + "delta_sql": " + with base as ( + select event_id, updated_at, value + from {{ ref('events_base.ff') }} + ) + select * + from base + where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} + ) + " + } + ) }} + ``` -### End-to-End +3. **External delta config (`delta_config`)** + + Keep the base query in the model, but put the delta SQL into a separate YAML file and reference it via `delta_config: "config/incremental/my_model.delta.yml"`. + +In all cases, FFT still delegates the **merge/insert mechanics** to the executor; you only control what qualifies as “delta”. + +--- + +## Full refresh vs incremental + +You can always force a full rebuild: ```bash -# Seeds → initial incremental build → run again with filter -fft seed examples/r1_demo --env dev -fft run examples/r1_demo --env dev --select fct_events_inc.ff -# simulate new data (re-seed or append), then: -fft run examples/r1_demo --env dev --select fct_events_inc.ff +fft run . --env dev --full-refresh +``` + +The logic is: + +* If `--full-refresh` is set → **ignore incremental** and call `full_refresh_table`. + +* Otherwise, if the model has `incremental.enabled` and the target exists: + + * attempt incremental path (`incremental_insert` / `incremental_merge`), + +* Otherwise: + + * do initial full build via `create_table_as`. + +--- + +## Schema evolution for incremental models + +Real tables evolve. To avoid incremental runs failing when the output schema changes, executors can implement: + +```python +alter_table_sync_schema(relation: str, select_sql: str, mode: str = "append_new_columns") +``` + +Typical behaviour (Spark example): + +1. Run the SELECT with `LIMIT 0` to infer the **output schema**. +2. Compare it to the existing table schema. +3. 
For any **new columns**: + + * issue `ALTER TABLE ... ADD COLUMNS (...)`, + * map complex types to reasonable SQL types (often defaulting to `STRING` in Spark for safety). + +Modes: + +* `"append_new_columns"` – only new columns are added; existing columns are left untouched. +* `"sync_all_columns"` – more aggressive sync, may also adjust types (implementation-specific). + +For DuckDB/Postgres, the simplest implementation may be a no-op initially; more advanced engines (or future versions) can support automatic `ALTER TABLE` statements. + +--- + +## Storage overrides and Delta Lake integration + +Incremental models work with both: + +1. **Managed / catalog tables**, and +2. **Storage overrides** via `project.yml` / model config, e.g.: + + ```yaml + models: + storage: + fct_events: + path: ".local/spark/fct_events" + format: delta + ``` + +The storage layer (`fastflowtransform.storage`) provides helpers like: + +* `get_model_storage(name)` – resolve per-model `path`/`format`/`options` +* `spark_write_to_path(spark, identifier, df, storage=..., default_format=...)` + +For Spark/Delta: + +* Incremental models can be backed by **Delta files** at a fixed path. + +* The executor writes the DataFrame to a temporary directory, then atomically renames it into place and wires up: + + ```sql + CREATE TABLE `db`.`tbl` + USING DELTA + LOCATION '/path/to/model' + ``` + +* Incremental MERGE (`incremental_merge`) then runs against this Delta table. + +This keeps: + +* a stable location on disk / in the lake, +* and a proper table in the metastore/catalog. + +When the Databricks/Spark executor's `table_format` (or `FF_DBR_TABLE_FORMAT`) resolves to `delta`, +FastFlowTransform automatically pulls in `delta-spark` and configures both +`spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension` and +`spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` (unless you +already provided custom values). 
Install `delta-spark >= 4.0` and you can seed/run Delta-backed +models without manually adding Spark CLI flags. + +--- + +## Interaction with metadata and DAG selection + +After each successful build, executors call: + +```python +on_node_built(node, relation, fingerprint) ``` -**Artifacts:** see `.fastflowtransform/target/{manifest.json, run_results.json, catalog.json}`. +which uses the meta helpers: + +* `ensure_meta_table(executor)` +* `upsert_meta(executor, node_name, relation, fingerprint, engine_name)` + +The `_ff_meta` table records, for each model and engine: + +* the relation name, +* the last fingerprint/hash, +* timestamps, etc. + +While this metadata is **not strictly required** for incremental mechanics, it is used for advanced features such as: + +* **state-based selection** (`--select state:modified`, etc.), +* change-aware DAG runs. + +Incremental models work together with these features: you can, for example, run only models whose source files changed and let the incremental planner update them efficiently. + +--- + +## Best practices & recommendations + +* **Always define a `unique_key`** for merge strategies. + Without a stable key, upserts can behave unpredictably. + +* **Use timestamps or monotonically increasing columns** for delta selection. + Avoid non-deterministic expressions (e.g. `now()` in your model SQL) in incremental filters. + +* **Start simple**: + + * Begin with `strategy: "append"` and a single `updated_at_column`. + * Move to `strategy: "merge"` only when you truly need updates. + +* **Test both fresh and incremental runs**: + + * First run with an empty database (initial full build). + * Then run again with new rows and verify the target grew as expected. + * Add automated tests that run the same model twice and assert row counts / contents. + +* **Use `--full-refresh` when semantics change**: + If you change the business logic of a model in a way that invalidates old rows, do a full rebuild at least once. 
diff --git a/docs/Logging.md b/docs/Logging.md new file mode 100644 index 0000000..48b3257 --- /dev/null +++ b/docs/Logging.md @@ -0,0 +1,44 @@ +# Logging & Verbosity + +FastFlowTransform exposes uniform logging controls across all CLI commands plus a dedicated SQL debug channel for tracing rendered SQL, dependency loading, and auxiliary queries. + +## CLI Flags + +- `-q` / `--quiet` → only errors (`ERROR`) +- *(default)* → concise warnings (`WARNING`) +- `-v` / `--verbose` → progress/info (`INFO`) +- `-vv` → full debug (`DEBUG`) including SQL debug output + +`-vv` automatically flips on the SQL debug channel (same effect as `FFT_SQL_DEBUG=1`). + +## SQL Debug Channel + +Enable it to inspect Python-model inputs, dependency columns, and helper SQL emitted by data-quality checks: + +```bash +# full debug (recommended) +fft run . -vv + +# equivalent using the env var (legacy behaviour retained) +FFT_SQL_DEBUG=1 fft run . +``` + +## Usage Patterns + +```bash +fft run . -q # quiet (errors only) +fft run . # default (concise) +fft run . -v # verbose progress (model names, executor info) +fft run . -vv # full debug + SQL channel +``` + +## Parallel Logging UX + +- Each node emits start/end lines with duration, truncated name, and engine abbreviation (DUCK/PG/BQ/…). +- Output remains line-stable via a thread-safe log queue; per-level summaries trail each run. +- Failures still surface the familiar “error block” per node for quick diagnosis. + +**Notes** + +- SQL debug output routes through the `fastflowtransform.sql` logger; use `-vv` or `FFT_SQL_DEBUG=1` to reveal it. +- Existing projects do not need changes: the environment variable keeps working even without `-vv`. 
diff --git a/docs/Technical_Overview.md b/docs/Technical_Overview.md index e33c1c4..7612bbf 100644 --- a/docs/Technical_Overview.md +++ b/docs/Technical_Overview.md @@ -21,23 +21,16 @@ - [Docs Navigation](#docs-navigation) - [Part I – Operational Guide](#part-i-operational-guide) - [Project Layout](#project-layout) - - [Sample Models](#sample-models) - - [Seeds & Example Data](#seeds-example-data) - - [Makefile Targets](#makefile-targets) + - [Example Projects and Seeds](#example-projects-and-seeds) - [CLI Flows](#cli-flows) - [Logging & Verbosity](#logging-verbosity) - [Model Unit Tests (`fft utest`)](#model-unit-tests-fft-utest) - [Troubleshooting](#troubleshooting) - - [Error Codes](#error-codes) - [Profiles & Environment Overrides](#profiles-environment-overrides) - - [Parallel Scheduler (v0.3)](#parallel-scheduler-v03) - - [Cache Policy (v0.3)](#cache-policy-v03) - - [Fingerprint Formula (v0.3)](#fingerprint-formula-v03) - - [Meta Table Schema (v0.3)](#meta-table-schema-v03) - - [Jinja DSL Quick Reference](#jinja-dsl-quick-reference) + - [Parallel Execution and Cache](#parallel-execution-and-cache) - [Roadmap Snapshot](#roadmap-snapshot) - [Cross-Table Reconciliations](#cross-table-reconciliations) - - [Auto-Docs & Lineage](#auto-docs-lineage) + - [Auto-Docs and Lineage](#auto-docs-and-lineage) - [Part II – Architecture & Internals](#part-ii-architecture-internals) - [Architecture Overview](#architecture-overview) - [Core Modules](#core-modules) @@ -119,460 +112,38 @@ fastflowtransform/ └── README.md ``` -### Sample Models +### Example Projects and Seeds -The demo project `examples/simple_duckdb` showcases the typical mix of SQL and Python models plus downstream marts. Use it as a template for your own projects. +Need runnable references? Start with the curated demos under `docs/examples/`: -- Batch models live under `models/` (`*.ff.sql`, `*.ff.py`). -- External tables are declared in `sources.yml`; reusable tests in `project.yml`. 
-- Seeds in `seeds/` keep demos deterministic. +- [Basic Demo](./examples/Basic_Demo.md) shows the minimum viable project (seeds, staging, marts) plus Makefile targets you can copy. +- [API Demo](./examples/API_Demo.md) focuses on HTTP-powered Python models. +- [Environment Matrix](./examples/Environment_Matrix.md) demonstrates multiple profiles talking to different engines. -> ℹ️ **Need full code samples and decorator details?** -> See [Model Fundamentals](./Config_and_Macros.md#1-model-fundamentals) in the Modeling Reference. - -### Seeds & Example Data - -`seeds/seed_users.csv` - -```csv -id,email -1,a@example.com -2,b@gmail.com -3,c@gmail.com -``` - -`seeds/seed_orders.csv` - -```csv -order_id,user_id,amount -100,1,19.9 -101,2,0 -``` - -### Makefile Targets - -```makefile -DB ?= .local/demo.duckdb -PROJECT ?= examples/simple_duckdb - -seed: - fft seed $(PROJECT) --env dev - -run: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev - -dag: - fft dag "$(PROJECT)" --env dev --html - -test: - fft test "$(PROJECT)" --env dev --select batch -``` - -Targets wrap the CLI commands showcased below. Feel free to copy the pattern into your own projects. +Each demo includes deterministic seeds (`seeds/*.csv`), schema YAML, and Makefile shortcuts, so the detailed CSV listings and commands here would be redundant. Follow the demo docs (or the [Quickstart](./Quickstart.md)) for the full walkthrough. ### CLI Flows -- CLI flags and internals are documented under [CLI Implementation](#cli-implementation). -- Automation examples appear in the [Makefile Targets](#makefile-targets). - - -#### HTTP/API in Python models -See [API calls in Python models](./Api_Models.md) for `get_json`/`get_df`, pagination, cache/offline flags. - - -#### DAG & Documentation - -- Narrow the graph with `fft dag ... --select ` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini site. 
-- Control schema introspection via `--with-schema/--no-schema`. Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). -- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. - -#### Sync Database Comments - -`fft sync-db-comments --env ` pushes model and column descriptions from project YAML or Markdown into database comments. The command currently supports Postgres and Snowflake Snowpark: - -- Start with `--dry-run` to review the generated `COMMENT` statements. -- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). -- Snowflake reuses the session or connection exposed by the executor. - -If no descriptions are found, the command exits without making changes. +Looking for command recipes, selection filters, or sync workflows? See the dedicated [CLI Guide](./CLI_Guide.md) for a task-by-task breakdown (seed/run/dag/docgen/test/utest/sync-db-comments) plus links to API-model helpers. ### Logging & Verbosity -FastFlowTransform exposes uniform logging controls across all CLI commands plus a dedicated SQL debug channel. - -#### Flags - -- `-q` / `--quiet` → only errors (`ERROR`) -- *(default)* → concise warnings (`WARNING`) -- `-v` / `--verbose` → progress/info (`INFO`) -- `-vv` → full debug (`DEBUG`), including SQL debug output - -`-vv` flips on the SQL debug channel automatically (same as setting `FFT_SQL_DEBUG=1` - -#### SQL debug channel - -Enable it to inspect Python-model inputs, dependency columns, and helper SQL emitted by data-quality checks: - -```bash -# full debug (recommended) -fft run . -vv - -# equivalent using the env var (legacy behaviour retained) -FFT_SQL_DEBUG=1 fft run . -``` - -#### Usage patterns - -```bash -fft run . -q # quiet (errors only) -fft run . # default (concise) -fft run . 
-v # verbose progress (model names, executor info) -fft run . -vv # full debug + SQL channel -``` - -#### Parallel logging UX - -- Per node: start/end lines with duration, truncated name, and engine abbrev (DUCK/PG/BQ/…). -- Output is line-stable via a thread-safe log queue; per-level summaries at the end. -- On errors, the familiar “error block” is shown per node. - -**Notes** - -- SQL debug output routes through the `fastflowtransform.sql` logger; use `-vv` or the env var to see it. -- Existing projects do not need changes: the env var continues to work even without `-vv`. +Need the exact behaviour of `-q/-v/-vv`, SQL debug output, or the parallel log queue? Head over to [Logging.md](./Logging.md) for the full matrix plus usage snippets. ### Model Unit Tests (`fft utest`) -`fft utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. - -#### Unit tests & cache - -`fft utest --cache {off|ro|rw}` (default: `off`) - -- `off`: deterministic, never skips. -- `ro`: skip on cache hit; on miss, build but **do not write** cache. -- `rw`: skip on hit; on miss, build **and write** fingerprint. - -Notes: -- UTests key the cache with `profile="utest"`. -- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. -- `--reuse-meta` is currently a reserved flag: it is exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. - - -#### Why? 
- -- Fast feedback on transformation logic without full DAG runs -- Small, reproducible fixtures (rows inline or external CSV) -- Engine-agnostic: swap DuckDB/Postgres to spot dialect differences - -#### Folder layout - -Specs live under `/tests/unit/*.yml` relative to the project root (the directory passed to the CLI that contains `models/`): - -``` -your-project/ -├── models/ -│ ├── users.ff.sql -│ ├── users_enriched.ff.py -│ └── mart_users.ff.sql -└── tests/ - └── unit/ - ├── users_enriched.yml - └── mart_users.yml -``` - -#### YAML DSL (with `defaults`) - -Each file targets one logical node (the DAG name). Defaults are deep-merged into every case so you can share inputs/expectations and override per scenario. - -```yaml -# tests/unit/users_enriched.yml -model: users_enriched - -defaults: - inputs: - users: - rows: - - {id: 1, email: "a@example.com"} - - {id: 2, email: "b@gmail.com"} - expect: - relation: users_enriched - order_by: [id] - -cases: - - name: basic_gmail_flag - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - - - name: override_inputs - inputs: - users: - rows: - - {id: 3, email: "c@hotmail.com"} - - {id: 4, email: "d@gmail.com"} - expect: - rows: - - {id: 3, email: "c@hotmail.com", is_gmail: false} - - {id: 4, email: "d@gmail.com", is_gmail: true} -``` - -SQL models use the file stem (including `.ff`) as `model`. 
Provide expected relation names that match the materialized table/view: - -```yaml -# tests/unit/mart_users.yml -model: mart_users.ff - -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - expect: - relation: mart_users - order_by: [id] - -cases: - - name: passthrough_columns - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} -``` - -For multi-dependency models, include every physical relation name (what `relation_for(dep)` returns): - -```yaml -model: mart_orders_enriched -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "x@gmail.com", is_gmail: true} - orders: - rows: - - {order_id: 10, user_id: 1, amount: 19.9} - - {order_id: 11, user_id: 1, amount: -1.0} -cases: - - name: join_and_flag - expect: - any_order: true - rows: - - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} - - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} -``` - -#### Input formats - -- `rows`: inline dictionaries per row -- `csv`: reference a CSV file (relative paths allowed) - -Keys under `inputs` are physical relations; use `relation_for('users.ff')` if unsure. - -#### Expected output & comparison - -- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`) -- Ordering: `order_by: [...]` or `any_order: true` -- Columns: `ignore_columns: [...]`, `subset: true` -- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` - (numbers can be plain `1e-9` or quoted; they are cast to float) - -#### Running utests - -```bash -fft utest . # discover all specs -fft utest . --env dev # use a specific profile -fft utest . --model users_enriched -fft utest . --model mart_orders_enriched --case join_and_flag -fft utest . 
--path tests/unit/users_enriched.yml -``` - -Override the executor for all specs (ensure credentials/DSNs are set): - -```bash -export FF_PG_DSN="postgresql+psycopg://postgres:postgres@localhost:5432/ffdb" -export FF_PG_SCHEMA="public" -fft utest . --engine postgres -``` - -Executor precedence (highest → lowest): CLI `--engine`, YAML `engine:` (optional), `profiles.yml`, environment overrides. - -#### Design notes - -- Only the target model runs; supply all upstream relations the model expects. -- `defaults` deep-merge: dicts merge, lists/scalars overwrite. -- Results compare as DataFrames with configurable order, subset, ignored columns, and numeric tolerances. -- Exit codes: `0` for success, `2` when at least one case fails (compact CSV-style diff is printed). - -**CI example (GitHub Actions)** - -```yaml -name: utests -on: [push, pull_request] -jobs: - duckdb: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: { python-version: "3.11" } - - run: pip install -e . - - run: fft utest . --env dev -``` - -(For Postgres, add a service container and run `fft utest . --engine postgres` with `FF_PG_DSN` / `FF_PG_SCHEMA`.) +The full how-to (cache modes, YAML DSL, CI snippets) moved to [Unit_Tests.md](./Unit_Tests.md). Refer to that guide whenever you need fast feedback on SQL/Python models without executing the entire DAG. ### Troubleshooting -- **DuckDB seeds not visible** → ensure `FF_DUCKDB_PATH` (or profile path) is identical for `seed`, `run`, `dag`, and `test`. -- **Postgres connection refused** → confirm `FF_PG_DSN`, container status (`docker ps`), and that port `5432` is open. -- **BigQuery permissions** → set `GOOGLE_APPLICATION_CREDENTIALS` and match dataset/location to your profile. -- **HTML docs missing** → run `fft dag --html` and open `/docs/index.html`. -- **Unexpected test failures** → inspect rendered SQL in CLI output, refine selection via `--select`, refresh seeds if needed.
-- **Dependency table not found** in utests → provide all physical upstream relations in the YAML spec. - -### Error Codes - -| Type | Class/Source | Exit | Notes | -|---------------------------|---------------------------|------|---------------------------------------------------------| -| Missing dependency | `DependencyNotFoundError` | 1 | Per-node list; tips for `ref()` / names | -| Cycle in DAG | `ModelCycleError` | 1 | "Cycle detected among nodes: ..." | -| Model execution (KeyError)| `cli.py` → formatted block| 1 | Inspect columns, use `relation_for(dep)` as keys | -| Data quality failures | `cli test` → summary | 2 | "Totals ... passed/failed"; each failure on its own line | -| Unknown/unexpected | generic | 99 | Optional trace via `FFT_TRACE=1` | - -Error types map to the classes documented in [Core Modules](#core-modules) and [CLI Implementation](#cli-implementation). +Common fixes (engines, docs generation, tests) plus the exit-code matrix live in [Troubleshooting.md](./Troubleshooting.md). Skim that doc whenever you hit connectivity issues or need to decode return codes. ### Profiles & Environment Overrides +Need to understand profile precedence, `.env` layering, or the Pydantic models that back settings? Jump to the [Profiles guide](./Profiles.md) which covers file layout, environment helpers, validation, and selection precedence in depth. -**`profiles.yml` example:** - -```yaml -default: - engine: duckdb - duckdb: { path: ":memory:" } - -stg: - engine: postgres - postgres: - dsn: postgresql+psycopg://postgres:postgres@localhost:5432/ffdb - db_schema: public - -bq: - engine: bigquery - bigquery: - project: my-gcp-proj - dataset: demo - location: EU - use_bigframes: false -``` - -**ENV overrides (examples):** - -`FF_ENGINE`, `FF_DUCKDB_PATH`, `FF_PG_DSN`, `FF_PG_SCHEMA`, `FF_BQ_DATASET`, `FF_BQ_LOCATION`, `FF_BQ_USE_BIGFRAMES=1` - -**Priority (lowest → highest):** `profiles.yml` < environment variables (`FF_*`) < CLI flags (e.g. `--engine`). 
- -For the Pydantic models and resolution flow, see [Settings Infrastructure](#settings-infrastructure). - -### Parallel Scheduler (v0.3) - -FastFlowTransform executes the DAG in **levels**. Each level contains nodes without mutual dependencies. - -- `--jobs N` limits the **maximum concurrency per level**. -- `--keep-going` keeps tasks within the current level running even if one fails; subsequent levels are not started. - -**CLI** -```bash -fft run . --env dev --jobs 4 # parallel (level-wise) -fft run . --env dev --jobs 4 --keep-going - -fft run . --select model_b --jobs 4 # Run only model_b and whatever it depends on -fft run . --rebuild-only model_b # Rebuild only model_b, even if cache hits -``` - -**Internals** -- `dag.levels(nodes)` builds level lists using indegrees. -- `run_executor.schedule(levels, jobs, fail_policy)` spawns a thread pool per level and aggregates timings. - -### Cache Policy (v0.3) - -**Modes** -``` -off – always build -rw – default; skip if fingerprint matches and relation exists; write cache after build -ro – skip on match; on miss build but do not write cache -wo – always build and write cache -``` -`--rebuild ` ignores cache for matching nodes. - -**Skip condition** -1) Fingerprint matches the stored value (file-backed cache) -2) Physical relation exists on the target engine - -**Examples** -```bash -fft run . --env dev --cache=rw -fft run . --env dev --cache=ro -fft run . --env dev --cache=rw --rebuild marts_daily.ff -``` - -### Fingerprint Formula (v0.3) +### Parallel Execution and Cache -**SQL nodes**: -`fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` - -**Python nodes**: -`fingerprint_py(node, func_src, env_ctx, dep_fps)` - -**`env_ctx` content** -- `engine` (e.g. duckdb, postgres, bigquery) -- `profile_name` (CLI `--env`) -- selected environment keys/values: all `FF_*` -- normalized excerpt of `sources.yml` (sorted dump) - -**Properties** -- Same inputs ⇒ same hash. -- Minimal change in SQL/function ⇒ different hash. 
-- Any dependency fingerprint change bubbles downstream via `dep_fps`. - -### Meta Table Schema (v0.3) - -FastFlowTransform writes a per-node audit row after successful builds: - -``` -_ff_meta ( - node_name TEXT / STRING -- logical name, e.g. "users.ff" - relation TEXT / STRING -- physical name, e.g. "users" - fingerprint TEXT / STRING - engine TEXT / STRING - built_at TIMESTAMP -) -``` - -**Backends** -- DuckDB: table `_ff_meta` in `main`. -- Postgres: table `_ff_meta` in the active schema. -- BigQuery: table `._ff_meta`. - -**Notes** -- Meta is currently used for auditing and tooling; skip logic relies on fingerprint cache + relation existence checks. - -#### Executor meta hook - -After a successful materialization the executor calls: - on_node_built(node, relation, fingerprint) - -This performs an upsert into `_ff_meta` with `(node_name, relation, fingerprint, built_at, engine)`. - -Skipped nodes do **not** touch the meta table. - - -### Jinja DSL Quick Reference - -`ref()`, `source()`, `var()`, `config()`, `this` – see details in the [Modeling Reference](./Config_and_Macros.md). +Level-wise parallelism, cache modes, fingerprint formula, and the `_ff_meta` audit table are documented in [Cache_and_Parallelism.md](./Cache_and_Parallelism.md). Use that reference for CLI examples (`--jobs`, `--cache`, `--rebuild`), skip conditions, and troubleshooting tips related to concurrency. ### Roadmap Snapshot @@ -590,149 +161,11 @@ Skipped nodes do **not** touch the meta table. ### Cross-Table Reconciliations -FastFlowTransform can compare aggregates and key coverage **across two tables** and surface drift with clear, numeric messages. These checks run via the standard `fft test` entrypoint and integrate into the DQ summary output. - -**CLI** -```bash -# only run reconciliation checks -fft test . --env dev --select reconcile -``` - -**YAML DSL** - -All checks live under `project.yml → tests:` and should carry the tag `reconcile` for easy selection. 
- -1) **Equality / Approx Equality** -```yaml -- type: reconcile_equal - name: orders_total_equals_mart - tags: [reconcile] - left: { table: orders, expr: "sum(amount)" } - right: { table: mart_orders_enriched, expr: "sum(amount)", where: "valid_amt" } - # optional tolerances: - abs_tolerance: 0.01 # |L - R| <= 0.01 - rel_tolerance_pct: 0.1 # |L - R| / max(|R|, eps) <= 0.1% (0.1) -``` - -2) **Ratio within bounds** -```yaml -- type: reconcile_ratio_within - name: orders_vs_mart_ratio - tags: [reconcile] - left: { table: orders, expr: "sum(amount)" } - right: { table: mart_orders_enriched, expr: "sum(amount)" } - min_ratio: 0.999 - max_ratio: 1.001 -``` - -3) **Absolute difference within limit** -```yaml -- type: reconcile_diff_within - name: count_stability - tags: [reconcile] - left: { table: events_raw, expr: "count(*)", where: "event_type='purchase'" } - right: { table: fct_sales, expr: "sum(txn_count)" } - max_abs_diff: 10 -``` - -4) **Coverage (anti-join = 0)** -```yaml -- type: reconcile_coverage - name: all_orders_covered - tags: [reconcile] - source: { table: orders, key: "order_id" } - target: { table: mart_orders_enriched, key: "order_id" } - # optional filters - source_where: "order_date >= current_date - interval '7 days'" - target_where: "valid_amt" -``` - -**Parameter semantics** -- `expr`: SQL snippet placed into `SELECT {expr} FROM {table}` (keep it engine-neutral: `sum(...)`, `count(*)`, simple filters). -- `where`: optional SQL appended as `WHERE {where}`. -- `abs_tolerance`: absolute tolerance on the difference. -- `rel_tolerance_pct`: relative tolerance in **percent**; denominator is `max(|right|, 1e-12)`. -- `min_ratio` / `max_ratio`: inclusive bounds for `left/right`. -- Coverage uses an anti-join (`source` minus `target` on the given key). The check passes if missing = 0. 
- -**Summary output** -Each reconciliation contributes a line in the summary with a compact scope, e.g.: -``` -✅ reconcile_equal orders ⇔ mart_orders_enriched (4ms) -✅ reconcile_coverage orders ⇒ mart_orders_enriched (3ms) -``` - -**Engine notes** -- DuckDB and Postgres are supported out-of-the-box. BigQuery works with simple aggregates/filters (expressions should avoid dialect-specific functions). -- For relative tolerances, the implementation guards against zero denominators with a small epsilon (`1e-12`). - - -### Auto-Docs & Lineage - -FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) from your project: - -```bash -# Classic -fft dag . --env dev --html - -# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) -fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json -``` - -Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. - -**Descriptions** can be provided in YAML (project.yml) and/or Markdown files. Markdown has higher priority. - -YAML in `project.yml`: - -```yaml -docs: - models: - users.ff: - description: "Raw users table imported from CRM." - columns: - id: "Primary key." - email: "User email address." - users_enriched: - description: "Adds gmail flag." - columns: - is_gmail: "True if email ends with @gmail.com" -``` - -Markdown (overrides YAML if present): - -``` -/docs/models/.md -/docs/columns//.md -``` - -Optional front matter is ignored for now (title/tags may be used later). - -**Column lineage (heuristic, best effort).** - -- SQL models: expressions like `col` / `alias AS out` / `upper(u.email) AS email_upper)` are parsed; - `u` must come from a `FROM ... AS u` that resolves to a relation. Functions mark lineage as *transformed*. -- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. 
-- You can override hints in YAML: - -```yaml -docs: - models: - mart_orders_enriched: - lineage: - email_upper: - from: [{ table: users, column: email }] - transformed: true -``` - -**JSON manifest** (optional via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), -and lineage per column. This is useful for custom doc portals or CI checks. - -Notes: -- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. -- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. +Reconciliation tests (`reconcile_equal`, `reconcile_ratio_within`, `reconcile_diff_within`, `reconcile_coverage`) are fully documented in the [Data Quality Test Reference](./Data_Quality_Tests.md#cross-table-reconciliations). Use that guide for YAML schemas, tolerance parameters, and engine notes before wiring the checks into `fft test`. +### Auto-Docs and Lineage +Rendering the DAG site, feeding project descriptions/lineage, and exporting JSON manifests are covered in [Auto_Docs.md](./Auto_Docs.md). Head there for command flags, markdown/YAML resolution, and lineage overrides. ## Part II – Architecture & Internals diff --git a/docs/Troubleshooting.md b/docs/Troubleshooting.md new file mode 100644 index 0000000..2c5f27e --- /dev/null +++ b/docs/Troubleshooting.md @@ -0,0 +1,24 @@ +# Troubleshooting & Error Codes + +Use this checklist when FastFlowTransform commands misbehave. Each item points to the quickest fix plus the relevant CLI options. + +## Quick Fixes + +- **DuckDB seeds not visible** → ensure `FF_DUCKDB_PATH` (or the profile path) is identical for `seed`, `run`, `dag`, and `test`. If you configure `FF_DUCKDB_SCHEMA` / `FF_DUCKDB_CATALOG`, keep them consistent across commands so unqualified references resolve to the right namespace. +- **Postgres connection refused** → confirm `FF_PG_DSN`, container status (`docker ps`), and that port `5432` is open. 
+- **BigQuery permissions** → set `GOOGLE_APPLICATION_CREDENTIALS` and match dataset/location to your profile. +- **HTML docs missing** → run `fft dag . --html` from the project directory and open `site/dag/index.html`. +- **Unexpected test failures** → inspect rendered SQL in CLI output, refine selection via `--select`, refresh seeds if needed. +- **Dependency table not found in utests** → provide all physical upstream relations in the YAML spec. + +## Error Codes + +| Type | Class/Source | Exit | Notes | +|---------------------------|---------------------------|------|---------------------------------------------------------| +| Missing dependency | `DependencyNotFoundError` | 1 | Per-node list; tips for `ref()` / names | +| Cycle in DAG | `ModelCycleError` | 1 | “Cycle detected among nodes: …” | +| Model execution (KeyError)| `cli.py` → formatted block| 1 | Inspect columns, use `relation_for(dep)` as keys | +| Data quality failures | `cli test` → summary | 2 | Totals section prints passed/failed counts | +| Unknown/unexpected | generic | 99 | Optional trace via `FFT_TRACE=1` | + +Error types map to the classes documented in `docs/Technical_Overview.md#core-modules` and the CLI source. diff --git a/docs/Unit_Tests.md b/docs/Unit_Tests.md new file mode 100644 index 0000000..79e89e2 --- /dev/null +++ b/docs/Unit_Tests.md @@ -0,0 +1,182 @@ +# Model Unit Tests (`fft utest`) + +`fft utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. + +## Cache Modes + +`fft utest --cache {off|ro|rw}` (default: `off`) + +- `off`: deterministic, never skips. +- `ro`: skip on cache hit; on miss, build but **do not write** cache. +- `rw`: skip on hit; on miss, build **and write** fingerprint. + +Notes: + +- UTests key the cache with `profile="utest"`. 
+- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. +- `--reuse-meta` is currently a reserved flag: exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. + +## Why Use UTests? + +- Fast feedback on transformation logic without full DAG runs. +- Small, reproducible fixtures (rows inline or external CSV). +- Engine-agnostic: swap DuckDB/Postgres to spot dialect differences. + +## Folder Layout + +Specs live under `/tests/unit/*.yml` relative to the project root (the directory passed to the CLI that contains `models/`): + +``` +your-project/ +├── models/ +│ ├── users.ff.sql +│ ├── users_enriched.ff.py +│ └── mart_users.ff.sql +└── tests/ + └── unit/ + ├── users_enriched.yml + └── mart_users.yml +``` + +## YAML DSL (with `defaults`) + +Each file targets one logical node (the DAG name). Defaults are deep-merged into every case so you can share inputs/expectations and override per scenario. + +```yaml +# tests/unit/users_enriched.yml +model: users_enriched + +defaults: + inputs: + users: + rows: + - {id: 1, email: "a@example.com"} + - {id: 2, email: "b@gmail.com"} + expect: + relation: users_enriched + order_by: [id] + +cases: + - name: basic_gmail_flag + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} + + - name: override_inputs + inputs: + users: + rows: + - {id: 3, email: "c@hotmail.com"} + - {id: 4, email: "d@gmail.com"} + expect: + rows: + - {id: 3, email: "c@hotmail.com", is_gmail: false} + - {id: 4, email: "d@gmail.com", is_gmail: true} +``` + +SQL models use the file stem (including `.ff`) as `model`. 
Provide expected relation names that match the materialized table/view: + +```yaml +# tests/unit/mart_users.yml +model: mart_users.ff + +defaults: + inputs: + users_enriched: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} + expect: + relation: mart_users + order_by: [id] + +cases: + - name: passthrough_columns + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} +``` + +For multi-dependency models, include every physical relation name (what `relation_for(dep)` returns): + +```yaml +model: mart_orders_enriched +defaults: + inputs: + users_enriched: + rows: + - {id: 1, email: "x@gmail.com", is_gmail: true} + orders: + rows: + - {order_id: 10, user_id: 1, amount: 19.9} + - {order_id: 11, user_id: 1, amount: -1.0} +cases: + - name: join_and_flag + expect: + any_order: true + rows: + - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} + - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} +``` + +## Input Formats + +- `rows`: inline dictionaries per row. +- `csv`: reference a CSV file (relative paths allowed). + +Keys under `inputs` are physical relations; use `relation_for('users.ff')` if unsure. + +## Expected Output & Comparison + +- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`). +- Ordering: `order_by: [...]` or `any_order: true`. +- Columns: `ignore_columns: [...]`, `subset: true`. +- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` + (numbers can be plain `1e-9` or quoted; they are cast to float). + +## Running UTests + +```bash +fft utest . # discover all specs +fft utest . --env dev # use a specific profile +fft utest . --model users_enriched +fft utest . --model mart_orders_enriched --case join_and_flag +fft utest . 
--path tests/unit/users_enriched.yml +``` + +Override the executor for all specs (ensure credentials/DSNs are set): + +```bash +export FF_PG_DSN="postgresql+psycopg://postgres:postgres@localhost:5432/ffdb" +export FF_PG_SCHEMA="public" +fft utest . --engine postgres +``` + +Executor precedence (highest → lowest): CLI `--engine`, YAML `engine:` (optional), `profiles.yml`, environment overrides. + +## Design Notes + +- Only the target model runs; supply all upstream relations the model expects. +- `defaults` deep-merge: dicts merge, lists/scalars overwrite. +- Results compare as DataFrames with configurable order, subsets, ignored columns, and numeric tolerances. +- Exit codes: `0` for success, `2` when at least one case fails (compact CSV-style diff is printed). + +## CI Example + +```yaml +name: utests +on: [push, pull_request] +jobs: + duckdb: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: "3.11" } + - run: pip install -e . + - run: fft utest . --env dev +``` + +For Postgres, add a service container and run `fft utest . --engine postgres` with `FF_PG_DSN` / `FF_PG_SCHEMA`. 
diff --git a/docs/_scripts/gen_api.py b/docs/_scripts/gen_api.py index 4c54fbe..b797324 100644 --- a/docs/_scripts/gen_api.py +++ b/docs/_scripts/gen_api.py @@ -2,6 +2,7 @@ from __future__ import annotations from pathlib import Path import mkdocs_gen_files +from collections import defaultdict # ------------------------------------------------------------------- # Configuration @@ -96,11 +97,54 @@ def detect_package() -> tuple[str, Path]: f.write(" filters:\n") f.write(' - "!^_"\n') -# Generate index page +# Generate index page (paths relative to the reference/ directory) index_path = "reference/index.md" with mkdocs_gen_files.open(index_path, "w") as f: f.write("# API Reference\n\n") f.write("> Auto-generated per module\n\n") for module, doc_file in generated_files: rel = Path(doc_file).relative_to("reference").as_posix() + # IMPORTANT: link without 'reference/' prefix f.write(f"- [{module}]({rel})\n") + +# ------------------------------------------------------------------- +# Generate reference/SUMMARY.md for mkdocs-literate-nav +# ------------------------------------------------------------------- +# This builds a *structured* nav. If you prefer a flat list, you can +# just copy the loop used for index.md above instead. +reference_root = Path("reference") + +# Build a tree: level-1 = top-level package (fastflowtransform), +# level-2 = immediate subpackage/module, then files underneath. +tree: dict[str, list[tuple[str, str]]] = defaultdict(list) +for module, doc_file in generated_files: + rel = Path(doc_file).relative_to("reference") + parts = module.split(".") + # Expect modules like "fastflowtransform", "fastflowtransform.api.http", ... + group = parts[1] if len(parts) > 1 else parts[0] # e.g. 
"api", "config", "executors" + tree[group].append((module, rel.as_posix())) + +# Sort groups and entries for stable nav +for k in list(tree.keys()): + tree[k].sort(key=lambda x: x[0]) + +with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as f: + f.write("# API Reference\n\n") + # Link to the overview page + f.write("- [Overview](index.md)\n") + # Grouped subsections + for group in sorted(tree.keys()): + items = tree[group] + if not items: + # Safety: never emit an empty section (would break literate-nav) + continue + if len(items) == 1: + # For single-item groups, emit a direct link (no section header) + module, rel = items[0] + f.write(f"- [{module}]({rel})\n") + continue + # Section header (no link) + f.write(f"- {group}\n") + for module, rel in items: + # Nested items MUST be indented by 4 spaces for literate-nav + f.write(f" - [{module}]({rel})\n") diff --git a/docs/examples/Cache_Demo.md b/docs/examples/Cache_Demo.md new file mode 100644 index 0000000..7f73250 --- /dev/null +++ b/docs/examples/Cache_Demo.md @@ -0,0 +1,204 @@ +# 🧠 Cache & Parallelism Demo + +This example demonstrates FastFlowTransform’s **build cache**, **fingerprint logic**, **parallel scheduler**, and **HTTP response caching**. +It’s a compact playground to visualize **when nodes are skipped**, **what triggers rebuilds**, and **how caching accelerates iterative runs**. 
+ +--- + +## 🗂 Directory Structure + +```text +cache_demo/ + .env.dev_duckdb + Makefile + profiles.yml + project.yml + sources.yml + models/ + seeds_consumers/ + stg_users.ff.sql + stg_orders.ff.sql + marts/ + mart_user_orders.ff.sql + python/ + py_constants.ff.py + http/ + http_users.ff.py + seeds/ + seed_users.csv + seed_orders.csv + README.md +``` + +--- + +## ⚙️ Overview + +This demo showcases several FastFlowTransform features: + +| Feature | Demonstrated by | +| -------------------------- | ----------------------------------------------- | +| Level-wise parallelism | Multiple models running concurrently (`--jobs`) | +| Deterministic fingerprints | Build cache skipping unchanged nodes | +| Upstream invalidation | Seed → staging → mart rebuilds | +| Environment invalidation | Any `FF_*` change triggers rebuild | +| Python model caching | Fingerprints derived from function source | +| HTTP response caching | Persistent API result cache with offline mode | + +--- + +## ⚡ Quickstart + +```bash +cd examples/cache_demo +make cache_first # builds all nodes, writes cache +make cache_second # no-op run (everything skipped) +make change_sql # touch a model -> rebuilds dependent mart +make change_seed # change seed -> rebuilds staging + mart +make change_env # set FF_* env -> invalidates cache globally +make change_py # edit py_constants.ff.py -> rebuilds that model +make run_parallel # runs entire DAG with 4 workers per level +``` + +Inspect results: + +* `.fastflowtransform/target/run_results.json` – fingerprints, results, timings, HTTP stats +* `site/dag/index.html` – DAG visualization +* `.local/http-cache/` – persisted API responses + +--- + +## 🧩 Model Summary + +| Model | Kind | Purpose | Notes | +| ------------------------- | ------ | --------------------------- | ------------------------------------ | +| `stg_users.ff.sql` | SQL | Load & normalize users seed | Rebuilds if seed changes | +| `stg_orders.ff.sql` | SQL | Load orders seed | Builds as a view | +| 
`mart_user_orders.ff.sql` | SQL | Join staging tables | Rebuilds if any staging changes | +| `py_constants.ff.py` | Python | Simple constant DataFrame | Fingerprint based on function source | +| `http_users.ff.py` | Python | HTTP fetch with cache | Uses `get_df()` and offline cache | + +--- + +## 🌐 HTTP Response Cache + +The `http_users.ff.py` model demonstrates the built-in HTTP cache: + +* **First run:** downloads `https://jsonplaceholder.typicode.com/users` +* **Subsequent runs:** reuse cached responses from `.local/http-cache` +* **Offline mode:** works with `FF_HTTP_OFFLINE=1` + +```bash +make http_first # warms HTTP cache +make http_offline # reuses cached response, no network access +make http_cache_clear # deletes cache directory +``` + +You can inspect HTTP usage in the `run_results.json` file: + +```bash +jq -r '.results[] | select(.http!=null) + | "\(.name): requests=\(.http.requests) cache_hits=\(.http.cache_hits) offline=\(.http.used_offline)"' \ + .fastflowtransform/target/run_results.json +``` + +--- + +## ⚙️ Cache Logic Recap + +FastFlowTransform caches model fingerprints and skips nodes when: + +1. **Fingerprints match** (SQL text, Python source, vars, engine, env, deps). +2. The **physical relation exists** in the database. + +Changing *any* of the following invalidates the cache: + +* SQL/Jinja content +* Python model code +* `sources.yml` +* `FF_*` environment variables +* Seed file contents +* Engine or profile name + +You can control cache behavior via CLI: + +```bash +--cache=off # always build +--cache=rw # default; skip on match; write cache +--cache=ro # read-only; skip on hit, build on miss +--cache=wo # always build, always write +``` + +--- + +## 🧮 Parallel Scheduler + +FastFlowTransform executes models **level-wise**: + +* Each level contains nodes whose dependencies are fully satisfied. +* Up to `--jobs` nodes per level run concurrently. +* Logs are serialized for clean output. + +Example: + +```bash +fft run . 
--env dev_duckdb --jobs 4 +``` + +--- + +## 🧪 Example Experiments + +| Scenario | Command | Expected behavior | +| ------------------------- | -------------------------------------- | ------------------------------- | +| First full run | `make cache_first` | All models build, cache written | +| No-op run | `make cache_second` | All skipped (no rebuilds) | +| Modify SQL | `make change_sql` | Downstream mart rebuilds | +| Add seed row | `make change_seed` | Staging + mart rebuild | +| Change env | `make change_env` | All nodes rebuild | +| Edit Python constant | `make change_py` | Only that Python model rebuilds | +| Warm & offline HTTP cache | `make http_first && make http_offline` | HTTP cache reused, no network | + +--- + +## 🧩 DAG Example + +After the first run, generate the DAG visualization: + +```bash +make dag +open site/dag/index.html +``` + +You’ll see: + +``` +seed_users → stg_users.ff +seed_orders → stg_orders.ff +(stg_users + stg_orders) → mart_user_orders.ff +py_constants +http_users +``` + +* `py_constants` runs independently (parallel) +* `mart_user_orders.ff` depends on both staging nodes + +--- + +## 🧰 Tips + +* **Inspect fingerprints:** stored in `.fastflowtransform/target/manifest.json` +* **Audit table:** `_ff_meta` table in the engine stores build metadata +* **Clear cache:** delete `.fastflowtransform/` or use `make clean` +* **Parallel debugging:** use `--keep-going` to let the remaining tasks in the current level finish after a failure (subsequent levels are not started) + +--- + +## ✅ Takeaways + +* FFT’s build cache uses stable fingerprints to skip unchanged nodes. +* Fingerprints propagate downstream, ensuring correctness. +* The HTTP cache supports deterministic, offline API pipelines. +* Parallel execution accelerates runs without breaking dependencies. + +Together, these features make iterative development **fast, reliable, and reproducible**. 
diff --git a/docs/examples/DQ_Demo.md b/docs/examples/DQ_Demo.md new file mode 100644 index 0000000..257b44a --- /dev/null +++ b/docs/examples/DQ_Demo.md @@ -0,0 +1,369 @@ +# Data Quality Demo Project + +The **Data Quality Demo** shows how to use **all built-in FFT data quality tests** on a small, understandable model: + +* Column checks: + + * `not_null` + * `unique` + * `accepted_values` + * `greater_equal` + * `non_negative_sum` + * `row_count_between` + * `freshness` +* Cross-table reconciliations: + + * `reconcile_equal` + * `reconcile_ratio_within` + * `reconcile_diff_within` + * `reconcile_coverage` + +It uses a simple **customers / orders / mart** setup so you can see exactly what each test does and how it fails when something goes wrong. + +--- + +## What this example demonstrates + +1. **Basic column checks** on staging tables + Ensure IDs are present and unique, amounts are non-negative, and status values are valid. + +2. **Freshness** on a timestamp column + Check that the most recent order in your mart is not “too old”, using `last_order_ts`. + +3. **Row count sanity checks** + Guard against empty tables and unexpectedly large row counts. + +4. **Cross-table reconciliations** between staging and mart + Verify that sums and counts match between `orders` and the aggregated `mart_orders_agg`, and that every customer has a corresponding mart row. + +5. **Tagged tests and selective execution** + All tests are tagged (e.g. `example:dq_demo`, `reconcile`) so you can run exactly the subset you care about. + +--- + +## Project layout (example) + +```text +examples/dq_demo/ + .env + .env.dev_duckdb + .env.dev_postgres + .env.dev_databricks + Makefile # optional, convenience wrapper around fft commands + profiles.yml + project.yml + sources.yml + + seeds/ + customers.csv + orders.csv + + models/ + staging/ + customers.ff.sql + orders.ff.sql + marts/ + mart_orders_agg.ff.sql +``` + +### Seeds + +* `seeds/customers.csv` + Simple customer dimension (e.g. 
`customer_id`, `name`, `status`). + +* `seeds/orders.csv` + Order fact data (e.g. `order_id`, `customer_id`, `amount`, `order_ts` as a string). + +### Models + +**1. Staging: `customers.ff.sql`** + +* Materialized as a table. +* Casts IDs and other fields into proper types. +* Used as the “clean” customer dimension for downstream checks. + +**2. Staging: `orders.ff.sql`** + +* Materialized as a table. +* Casts fields to proper types so DQ tests work reliably: + + ```sql + {{ config( + materialized='table', + tags=[ + 'example:dq_demo', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], + ) }} + + select + cast(order_id as int) as order_id, + cast(customer_id as int) as customer_id, + cast(amount as double) as amount, + cast(order_ts as timestamp) as order_ts + from {{ source('crm', 'orders') }}; + ``` + + This is important for: + + * numeric checks (`greater_equal`, `non_negative_sum`) + * timestamp-based `freshness` checks + +**3. Mart: `mart_orders_agg.ff.sql`** + +Aggregates orders per customer and prepares data for reconciliation + freshness: + +```sql +{{ config( + materialized='table', + tags=[ + 'example:dq_demo', + 'scope:mart', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} + +-- Aggregate orders per customer for DQ & reconciliation tests +with base as ( + select + o.order_id, + o.customer_id, + -- Ensure numeric and timestamp types for downstream DQ checks + cast(o.amount as double) as amount, + cast(o.order_ts as timestamp) as order_ts, + c.name as customer_name, + c.status as customer_status + from {{ ref('orders.ff') }} o + join {{ ref('customers.ff') }} c + on o.customer_id = c.customer_id +) +select + customer_id, + customer_name, + customer_status as status, + count(*) as order_count, + sum(amount) as total_amount, + min(order_ts) as first_order_ts, + max(order_ts) as last_order_ts +from base +group by customer_id, customer_name, customer_status; +``` + +The important columns 
for DQ tests are: + +* `status` → used for `accepted_values` +* `order_count` and `total_amount` → used for numeric and reconciliation tests +* `last_order_ts` → used for `freshness` + +--- + +## Data quality configuration (`project.yml`) + +All tests live under `project.yml → tests:`. +This example uses the tag `example:dq_demo` for easy selection. + +### Column-level checks + +```yaml +tests: + # 1) IDs must be present and unique + - type: not_null + table: customers + column: customer_id + tags: [example:dq_demo, batch] + + - type: unique + table: customers + column: customer_id + tags: [example:dq_demo, batch] + + # 2) Order amounts must be >= 0 + - type: greater_equal + table: orders + column: amount + threshold: 0 + tags: [example:dq_demo, batch] + + # 3) Total sum of amounts must not be negative + - type: non_negative_sum + table: orders + column: amount + tags: [example:dq_demo, batch] + + # 4) Customer status values must be within a known set + - type: accepted_values + table: mart_orders_agg + column: status + values: ["active", "churned", "prospect"] + severity: warn # show as warning, not hard failure + tags: [example:dq_demo, batch] + + # 5) Row count sanity check on mart + - type: row_count_between + table: mart_orders_agg + min_rows: 1 + max_rows: 100000 + tags: [example:dq_demo, batch] + + # 6) Freshness: last order in the mart must not be "too old" + - type: freshness + table: mart_orders_agg + column: last_order_ts + max_delay_minutes: 100000000 + tags: [example:dq_demo, batch] +``` + +### Cross-table reconciliations + +```yaml + # 7) Reconcile total revenue between orders and mart + - type: reconcile_equal + name: total_amount_orders_vs_mart + tags: [example:dq_demo, reconcile] + left: + table: orders + expr: "sum(amount)" + right: + table: mart_orders_agg + expr: "sum(total_amount)" + abs_tolerance: 0.01 + + # 8) Ratio of sums should be ~1 (within tight bounds) + - type: reconcile_ratio_within + name: total_amount_ratio + tags: [example:dq_demo, 
reconcile] + left: + table: orders + expr: "sum(amount)" + right: + table: mart_orders_agg + expr: "sum(total_amount)" + min_ratio: 0.999 + max_ratio: 1.001 + + # 9) Row count diff between orders and mart should be bounded + - type: reconcile_diff_within + name: order_count_diff + tags: [example:dq_demo, reconcile] + left: + table: orders + expr: "count(*)" + right: + table: mart_orders_agg + expr: "sum(order_count)" + max_abs_diff: 0 + + # 10) Coverage: every customer should appear in the mart + - type: reconcile_coverage + name: customers_covered_in_mart + tags: [example:dq_demo, reconcile] + source: + table: customers + key: "customer_id" + target: + table: mart_orders_agg + key: "customer_id" +``` + +This set of tests touches **all available test types** and ties directly back to the simple data model. + +--- + +## Running the demo + +Assuming you are in the repo root and using DuckDB as a starting point: + +### 1. Seed the data + +```bash +fft seed examples/dq_demo --env dev_duckdb +``` + +This reads `seeds/customers.csv` and `seeds/orders.csv` and materializes them as tables referenced by `sources.yml`. + +### 2. Run the models + +```bash +fft run examples/dq_demo --env dev_duckdb +``` + +This builds: + +* `customers` (staging) +* `orders` (staging) +* `mart_orders_agg` (mart) + +### 3. Run all DQ tests + +```bash +fft test examples/dq_demo --env dev_duckdb --select tag:example:dq_demo +``` + +You should see a summary like: + +```text +Data Quality Summary +──────────────────── +✅ not_null customers.customer_id +✅ unique customers.customer_id +✅ greater_equal orders.amount +✅ non_negative_sum orders.amount +❕ accepted_values mart_orders_agg.status +✅ row_count_between mart_orders_agg +✅ freshness mart_orders_agg.last_order_ts +✅ reconcile_equal total_amount_orders_vs_mart +✅ reconcile_ratio_within total_amount_ratio +✅ reconcile_diff_within order_count_diff +✅ reconcile_coverage customers_covered_in_mart + +Totals +────── +✓ passed: 10 +! 
warnings: 1 +``` + +(Exact output will differ, but you’ll see passed/failed/warned checks listed.) + +### 4. Run only reconciliation tests + +```bash +fft test examples/dq_demo --env dev_duckdb --select tag:reconcile +``` + +This executes just the cross-table checks, which is handy when you’re iterating on a mart. + +--- + +## Things to experiment with + +To understand the tests better, intentionally break the data and re-run `fft test`: + +* Set one `customers.customer_id` to `NULL` → watch `not_null` fail. +* Duplicate a `customer_id` → watch `unique` fail. +* Put a negative `amount` in `orders.csv` → `greater_equal` fails (and `non_negative_sum` fails too once the total drops below zero). +* Add a new `status` value (e.g. `"paused"`) → `accepted_values` warns. +* Drop a customer from `mart_orders_agg` manually (or filter it out in SQL) → `reconcile_coverage` fails. +* Change an amount in the mart only → reconciliation tests fail. + +This makes it very clear what each test guards against. + +--- + +## Summary + +The Data Quality Demo is designed to be: + +* **Small and readable** – customers, orders, and a single mart. +* **Complete** – exercises every built-in FFT DQ test type. +* **Practical** – real-world patterns like: + + * typing in staging models, + * testing freshness on a mart timestamp, + * reconciling sums and row counts across tables. + +Once you’re comfortable with this example, you can copy the patterns into your real project: start with staging-level checks, then layer in reconciliations and freshness on your most important marts. diff --git a/docs/examples/Incremental_Demo.md b/docs/examples/Incremental_Demo.md new file mode 100644 index 0000000..d974e78 --- /dev/null +++ b/docs/examples/Incremental_Demo.md @@ -0,0 +1,657 @@ +# Incremental, Delta & Iceberg Demo + +This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres and Databricks Spark (Parquet, Delta & Iceberg). 
+ + +It is intentionally small and self-contained so you can copy/paste patterns into your own project. + +--- + +## Location & Layout + +The example lives under: + +```text +examples/incremental_demo/ +```` + +Directory structure: + +```text +incremental_demo/ + .env + .env.dev_duckdb + .env.dev_postgres + .env.dev_databricks_delta + .env.dev_databricks_iceberg + Makefile + profiles.yml + project.yml + sources.yml + + seeds/ + seed_events.csv + + models/ + common/ + events_base.ff.sql + fct_events_sql_inline.ff.sql + fct_events_sql_yaml.ff.sql + engines/ + duckdb/ + fct_events_py_incremental.ff.py + postgres/ + fct_events_py_incremental.ff.py + databricks_spark/ + fct_events_py_incremental.ff.py +``` + +*Your actual filenames may differ slightly; the concepts are the same.* + +--- + +## What the demo shows + +The demo revolves around a tiny `events` dataset and three different ways to build an incremental fact table: + +1. **SQL incremental model with inline delta SQL** + + * `models/common/fct_events_sql_inline.ff.sql` + * All incremental logic (how to find “new/changed” rows) is defined directly in the model’s `config(meta=...)` block. + +2. **SQL incremental model with YAML config in `project.yml`** + + * `models/common/fct_events_sql_yaml.ff.sql` + * The base SELECT lives in the model, but all incremental hints (`incremental.enabled`, `unique_key`, `updated_at_column`, …) are configured in `project.yml → models.incremental`. + +3. **Python incremental model** + + * `models/engines/*/fct_events_py_incremental.ff.py` + * A Python model that returns a DataFrame; the executor applies incremental behaviour based on model `meta` (unique key + updated-at timestamp) and the target engine: + + * DuckDB / Postgres: incremental insert/merge in SQL + * Databricks Spark: `MERGE INTO` for Delta or Iceberg where available (Spark 4), with a fallback full-refresh strategy for other formats + +4. 
**Iceberg profile for Spark 4** + + * Optional Databricks/Spark profile that uses the built-in **Iceberg catalog**. + * Seeds and models are materialized as Iceberg tables in a local warehouse directory. + * `ref()` and `source()` automatically point to the Iceberg catalog when the `databricks_spark.table_format` is set to `iceberg`. + +--- + +## Seed data + +The demo uses a simple seed file: + +```text +examples/incremental_demo/seeds/seed_events.csv +``` + +Example contents (conceptually): + +```csv +event_id,updated_at,value +1,2024-01-01T10:00:00,10 +2,2024-01-01T10:05:00,20 +3,2024-01-01T10:10:00,30 +``` + +Running: + +```bash +fft seed examples/incremental_demo --env dev_duckdb +``` + +(or with your engine/env of choice) will materialize this seed into the warehouse (e.g. a DuckDB table or Postgres table). + +--- + +## Base model: `events_base` + +The base staging model simply exposes the events from the seed: + +```text +models/common/events_base.ff.sql +``` + +Conceptually: + +```sql +{{ config( + materialized='table', + tags=[ + 'example:incremental_demo', + 'scope:common', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + ], +) }} + +select + event_id, + updated_at, + value +from {{ source('raw', 'events') }}; +``` + +All incremental models build on top of this base table. + +--- + +## Incremental configuration (high-level) + +All three incremental models share the same core idea: + +* Mark the model as **incremental** +* Provide a **unique key** (e.g. `event_id`) +* Provide an **updated-at / timestamp column** (e.g. `updated_at`) +* Optionally specify a **delta strategy**: + + * **Inline SQL** (in the model) + * **External YAML** (referenced from the model) + * **Python** (engine-specific model that returns the delta dataset) + +There are two ways to express this in the demo: + +1. 
**Inline on the model** (used by `fct_events_sql_inline.ff.sql`), via `config(...)`: + +```jinja +{{ config( + materialized='incremental', + unique_key='event_id', + incremental={'updated_at_column': 'updated_at'}, + tags=['example:incremental_demo'], +) }} +``` + +2. **As an overlay in `project.yml`** (used by `fct_events_sql_yaml.ff.sql` and the Python model): + +```yaml +models: + incremental: + fct_events_sql_yaml.ff: + unique_key: "event_id" + incremental: + enabled: true + updated_at_column: "updated_at" + + fct_events_py_incremental.ff: + unique_key: "event_id" + incremental: + enabled: true + updated_at_column: "updated_at" +``` + +The incremental engine then uses these `meta` fields to decide whether to: + +* create the table (`create_table_as`) for the **first run** +* perform an **incremental insert** or **merge** for subsequent runs + +--- + +## 1) SQL incremental with inline delta SQL + +File: + +```text +models/common/fct_events_sql_inline.ff.sql +``` + +In this variant, both *incremental configuration* and the *delta filter* live directly in the model: + +```jinja +{{ config( + materialized='incremental', + unique_key='event_id', + incremental={'updated_at_column': 'updated_at'}, + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:incremental', + 'inc:type:inline-sql', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + ], +) }} + +with base as ( + select * + from {{ ref('events_base.ff') }} +) +select + event_id, + updated_at, + value +from base +{% if is_incremental() %} +where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} +) +{% endif %}; +``` + +On the **first run**, the engine sees no existing relation, so it materializes the full `select ... from events_base`. 
+ +On subsequent runs, the engine evaluates the `delta.sql` snippet and: + +* **DuckDB / Postgres**: inserts or merges the resulting rows into the target table +* **Databricks Spark**: tries a `MERGE INTO` (Delta) and falls back to a full-refresh if necessary + +--- + +## 2) SQL incremental with YAML delta config + +File: + +```text +models/common/fct_events_sql_yaml.ff.sql +``` + +Here the model body only defines the **canonical SELECT** and does *not* contain any incremental hints: + +```jinja +{{ config( + materialized='incremental', + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:incremental', + 'inc:type:yaml-config', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + ], +) }} + +with base as ( + select * + from {{ ref('events_base.ff') }} +) +select + event_id, + updated_at, + value +from base; +``` + +All incremental behaviour for this model is driven by `project.yml`: + +```yaml +models: + incremental: + fct_events_sql_yaml.ff: + unique_key: "event_id" + incremental: + enabled: true + updated_at_column: "updated_at" +``` + +The registry merges this overlay into the model at load time, so the incremental runtime +sees effectively the same config as for the inline model (`unique_key` + `updated_at_column`) – +only the **source of truth** is different. + +--- + +### Inline vs YAML config at a glance + +| Model | Where is incremental configured? | What lives in the SQL file? | +|----------------------------|-----------------------------------------|-----------------------------------------------| +| `fct_events_sql_inline.ff` | Inline in `config(...)` on the model | Full SELECT **+** `is_incremental()` filter | +| `fct_events_sql_yaml.ff` | `project.yml → models.incremental` | Full SELECT only (no incremental hints) | + +Both end up with the same runtime meta, only the **location of config** differs. 
+ +## 3) Python incremental model + +Files: + +```text +models/engines/duckdb/fct_events_py_incremental.ff.py +models/engines/postgres/fct_events_py_incremental.ff.py +models/engines/databricks_spark/fct_events_py_incremental.ff.py +``` + +Each engine variant uses the same logical signature: + +```python +from fastflowtransform import engine_model +import pandas as pd # or pyspark.sql.DataFrame for Databricks Spark + + +@engine_model( + only="duckdb", # or "postgres" / "databricks_spark" + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:duckdb", # or engine-specific + ], + meta={ + "incremental": True, + "unique_key": ["event_id"], + "updated_at": "updated_at", + }, +) +def build(df_events): + # 'df_events' is either a pandas.DataFrame or Spark DataFrame + # depending on the engine. + # The function returns either: + # - a full canonical result, or + # - only the delta rows, depending on your design. + # + # In the simplest version, you just return the full dataset and let the + # executor handle incremental logic based on meta. 
+ return df_events[["event_id", "updated_at", "value"]] +``` + +The executor uses the `meta.incremental` / `meta.unique_key` / `meta.updated_at` hints to run: + +* A **full-refresh** on the first run +* A **delta merge** on subsequent runs: + + * For DuckDB / Postgres: insert/merge SQL + * For Databricks Spark: + + * `MERGE INTO` for Delta tables, or + * a full-refresh fallback strategy that rewrites the table based on the union of existing + delta rows + +--- + +## Delta & Iceberg variants (Databricks / Spark) + +In addition to the “regular” incremental models, the demo also includes **Delta Lake** and **Iceberg** variants +that show how to: + +- route a model to **Delta tables** via `project.yml` +- reuse the same incremental pattern, but with a **Delta-backed** table on Databricks/Spark +- keep Parquet and Delta models side-by-side in the same project + +This is optional and only relevant for the `databricks_spark` engine. + +--- + +### Storage configuration for the Delta / Iceberg models + +In `project.yml`, the Delta variant gets its own storage entry, separate from the Parquet fact table: + +```yaml +models: + storage: + # Existing Parquet fact table + fct_events_sql_inline: + path: ".local/spark/fct_events_sql_inline" + format: parquet + + # 🔹 Delta-based fact table (Spark/Databricks only) + fct_events_sql_inline_delta: + path: ".local/spark_delta/fct_events_sql_inline" + format: delta + + # ❄️ Iceberg-based fact table (Spark 4 / Databricks only) + fct_events_sql_inline_iceberg: + # Points into the Iceberg warehouse; must match your Iceberg catalog config + path: ".local/iceberg_warehouse/incremental_demo/fct_events_sql_inline" + format: iceberg +``` + +Notes: + +* The key `fct_events_sql_inline_delta` must match the **model name**. +* `format: delta` tells the Databricks/Spark executor to create `USING DELTA LOCATION ...`. +* The path is different from the Parquet path so artifacts don’t clash.
+ +--- + +### Delta fact model + +The Delta fact model is a close sibling of `fct_events_sql_inline.ff.sql`, but: + +* is tagged only for the Databricks/Spark engine +* is configured for incremental **merge** with a `unique_key` + `updated_at` column + +Example (conceptual) model: + +```sql +-- models/common/fct_events_sql_inline_delta.ff.sql + +{{ config( + materialized='table', + tags=[ + 'example:incremental_demo', + 'kind:incremental', + 'engine:databricks_spark', + ], + meta={ + 'incremental': True, + 'unique_key': ['event_id'], + 'updated_at': 'updated_at', + 'delta': { + 'sql': " + with base as ( + select event_id, updated_at, value + from {{ ref('events_base.ff') }} + ) + select + event_id, + updated_at, + value + from base + where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} + ) + " + }, + }, +) }} + +-- canonical full-select (used for docs / full-refresh) +select + event_id, + updated_at, + value +from {{ ref('events_base.ff') }}; +``` + +What happens: + +* On the **first run**, the engine sees no existing table and does a full materialization + (a Delta table at `.local/spark_delta/fct_events_sql_inline`). +* On **subsequent runs**, the executor uses the `delta.sql` query as the **incremental delta** and: + + * attempts a `MERGE INTO` for Delta tables, or + * falls back to a full-refresh strategy if MERGE is not supported. + +--- + +### Running the Delta variant + +Once your Databricks/Spark profile is configured (e.g. `dev_databricks` in `profiles.yml` and `.env.dev_databricks`), +you can run the Delta model like any other: + +```bash +# From the repo root +cd examples/incremental_demo + +# Seed +FFT_ACTIVE_ENV=dev_databricks fft seed . + +# Run only the Delta variant +FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select fct_events_sql_inline_delta.ff \ + --select tag:engine:databricks_spark + +# Or include it in the general incremental demo selection +FFT_ACTIVE_ENV=dev_databricks fft run . 
\ + --select tag:example:incremental_demo \ + --select tag:engine:databricks_spark +``` + +Optionally, you can add a small `not_null` test to `project.yml` to verify the Delta model: + +```yaml +tests: + - type: not_null + table: fct_events_sql_inline_delta + column: event_id + tags: [batch, delta] +``` + +Then run: + +```bash +FFT_ACTIVE_ENV=dev_databricks fft test . --select tag:delta +``` + +to validate the Delta-backed incremental table specifically. + +--- + +## Running the demo + +From the project root: + +```bash +cd examples/incremental_demo +``` + +### DuckDB + +```bash +# Seed +FFT_ACTIVE_ENV=dev_duckdb fft seed . + +# Initial full run +FFT_ACTIVE_ENV=dev_duckdb fft run . \ + --select tag:example:incremental_demo --select tag:engine:duckdb + +# Incremental run (after modifying seed_events.csv to add later events) +FFT_ACTIVE_ENV=dev_duckdb fft run . \ + --select tag:example:incremental_demo --select tag:engine:duckdb \ + --cache rw + +# Data-quality tests (if configured in project.yml / schema YAML) +FFT_ACTIVE_ENV=dev_duckdb fft test . \ + --select tag:example:incremental_demo +``` + +### Postgres + +```bash +FFT_ACTIVE_ENV=dev_postgres fft seed . +FFT_ACTIVE_ENV=dev_postgres fft run . \ + --select tag:example:incremental_demo --select tag:engine:postgres +FFT_ACTIVE_ENV=dev_postgres fft test . \ + --select tag:example:incremental_demo +``` + +### Databricks Spark + +```bash +FFT_ACTIVE_ENV=dev_databricks fft seed . +FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark +FFT_ACTIVE_ENV=dev_databricks fft test . \ + --select tag:example:incremental_demo +``` + +### Databricks Spark (parquet vs Delta) + +You can run the incremental demo on Databricks/Spark against either **parquet** or **Delta** tables.
+ +FFT reads the desired table format from the `FF_DBR_TABLE_FORMAT` environment variable, which overrides +`databricks_spark.table_format` from `profiles.yml`. + +When `FF_DBR_TABLE_FORMAT=delta`, the Databricks/Spark executor automatically wires Delta Lake into the +SparkSession (downloads the Maven artifact via `delta-spark`, adds +`spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension`, and sets +`spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` unless you already +overrode those settings). No extra `spark-submit --conf` flags are needed—just ensure the +`delta-spark >= 4.0` Python package is installed. + +From the repo root: + +```bash +cd examples/incremental_demo +```` + +Run with **parquet** tables (default): + +```bash +FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft seed . +FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark +FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft test . \ + --select tag:example:incremental_demo +``` + +Run with **Delta** tables: + +```bash +FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft seed . +FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark +FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft test . \ + --select tag:example:incremental_demo +``` + +This way you can switch between parquet and Delta just by changing the `FF_DBR_TABLE_FORMAT` +environment variable, without touching the models or project.yml. + +Adjust environment names to match your `profiles.yml`. + +### Databricks Spark (Iceberg / Spark 4+) + +If you are on Spark 4 / Databricks with Iceberg support, you can also run the incremental demo +purely against Iceberg tables using a dedicated profile (for example `dev_databricks_iceberg`). 
+ +That profile typically: + +* uses `engine: databricks_spark` +* sets `databricks_spark.table_format: iceberg` +* configures an Iceberg catalog via `extra_conf`, for example: + + models: + storage: + # Example warehouse location, adjust as needed + fct_events_sql_inline_iceberg: + path: ".local/iceberg_warehouse/incremental_demo/fct_events_sql_inline" + format: iceberg + +and in the profile (profiles.yml) something like: + + dev_databricks_iceberg: + engine: databricks_spark + databricks_spark: + master: "local[*]" + app_name: "incremental_demo" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + extra_conf: + spark.sql.catalog.iceberg: org.apache.iceberg.spark.SparkCatalog + spark.sql.catalog.iceberg.type: hadoop + spark.sql.catalog.iceberg.warehouse: "file:///{{ project_dir() }}/.local/iceberg_warehouse" + +From the repo root: + + cd examples/incremental_demo + +Run seeds and models against Iceberg: + + FFT_ACTIVE_ENV=dev_databricks_iceberg fft seed . + + FFT_ACTIVE_ENV=dev_databricks_iceberg fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark + + FFT_ACTIVE_ENV=dev_databricks_iceberg fft test . \ + --select tag:example:incremental_demo + +Under this profile, all `ref()` / `source()` calls in Spark SQL and Python models are resolved +against the Iceberg catalog, so seeds and incremental models operate purely on Iceberg tables. diff --git a/docs/examples/Local_Engine_Setup.md b/docs/examples/Local_Engine_Setup.md index b64e318..70eb8c2 100644 --- a/docs/examples/Local_Engine_Setup.md +++ b/docs/examples/Local_Engine_Setup.md @@ -2,7 +2,8 @@ ### DuckDB -- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). +- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). 
+ Optionally set `FF_DUCKDB_SCHEMA` (default schema for models/seeds) and `FF_DUCKDB_CATALOG` (catalog alias) if you need to isolate namespaces. - Create the target directory once: `mkdir -p examples/api_demo/.local`. - Run `make ENGINE=duckdb seed run` to build the seeds and models inside the DuckDB file. diff --git a/docs/examples/Macros_Demo.md b/docs/examples/Macros_Demo.md new file mode 100644 index 0000000..e296473 --- /dev/null +++ b/docs/examples/Macros_Demo.md @@ -0,0 +1,264 @@ +# Macros Demo + +**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark). +You’ll see reusable SQL helpers, engine-aware SQL generation, and Python functions exposed as Jinja globals/filters. + +--- + +## Directory structure + +```text +examples/macros_demo/ + .env + .env.dev_databricks + .env.dev_duckdb + .env.dev_postgres + Makefile + profiles.yml + project.yml + sources.yml + seeds/ + seed_users.csv + seed_orders.csv + models/ + macros/ + utils.sql + star.sql + macros_py/ + helpers.py + common/ + stg_users.ff.sql + stg_orders.ff.sql + dim_users.ff.sql + fct_user_sales.ff.sql + engines/ + duckdb/ + py_example.ff.py + postgres/ + py_example.ff.py + databricks_spark/ + py_example.ff.py +``` + +--- + +## What this demo shows + +* **SQL Jinja macros** (`models/macros/*.sql`) + + * `email_domain(expr)` – derive email domain + * `safe_cast_amount(expr)` – engine-aware numeric cast + * `coalesce_any(expr, default)` – small convenience + * `default_country()` – pull a default from `project.yml → vars` + * `star_except(relation, exclude_cols)` – select all except listed columns (falls back to `*` if columns unknown) +* **Python macros** (`models/macros_py/helpers.py`) + + * `slugify(str)` – URL-friendly slug + * `mask_email(email)` – redact local part + * `csv_values(rows, cols)` – inline small lookup tables via SQL `VALUES(...)` +* **Usage from models** + + * `stg_users` uses SQL + Python macros at 
render time + * `stg_orders` uses engine-aware casting + * `dim_users` builds a tiny inline lookup via `csv_values(...)` + * `fct_user_sales` aggregates across staged models + +--- + +## Prerequisites + +* A working FFT installation (CLI `fft` available) +* For Postgres/Databricks: valid local env and drivers +* The core must expose these Jinja globals (already done in the FFT core): + + * `var(name, default)`, `env(name, default)`, `engine(default)` + (Used by profiles/macros to read vars and detect engine.) + +--- + +## Seeds + +Two tiny CSVs materialized via `fft seed`: + +* `seed_users.csv` — `id,email,country` +* `seed_orders.csv` — `order_id,customer_id,amount,order_ts` + +`profiles.yml` and `project.yml` give minimal storage and connection configs. + +--- + +## How to run + +From repo root: + +```bash +cd examples/macros_demo + +# Choose engine: duckdb (default) | postgres | databricks_spark +make ENGINE=duckdb demo +# or +make ENGINE=postgres demo +# or +make ENGINE=databricks_spark demo +``` + +The `demo` target runs: + +1. `fft seed` — loads CSVs +2. `fft run` — builds models using macros +3. `fft dag --html` — writes DAG HTML to `site/dag/index.html` +4. `fft test` — runs example tests +5. 
Prints artifact paths and tries to open the DAG + +--- + +## Key files (highlights) + +### SQL macros – `models/macros/utils.sql` + +```jinja +{%- macro email_domain(expr) -%} + lower(split_part({{ expr }}, '@', 2)) +{%- endmacro -%} + +{%- macro safe_cast_amount(expr) -%} +{%- set e = engine('duckdb') -%} +{%- if e in ['duckdb', 'postgres', 'databricks_spark'] -%} + cast({{ expr }} as double) +{%- else -%} + cast({{ expr }} as double) +{%- endif -%} +{%- endmacro -%} + +{%- macro coalesce_any(expr, default) -%} + coalesce({{ expr }}, {{ default }}) +{%- endmacro -%} + +{%- macro default_country() -%} + '{{ var("default_country", "DE") }}' +{%- endmacro -%} +``` + +### SQL macros – `models/macros/star.sql` + +```jinja +{%- macro star_except(relation, exclude_cols) -%} +{%- set excl = exclude_cols | map('lower') | list -%} +{%- set cols = adapter_columns(relation) -%} +{%- if cols and cols|length > 0 -%} + {{- (cols | reject('in', excl) | map('string') | join(', ')) -}} +{%- else -%} + * +{%- endif -%} +{%- endmacro -%} +``` + +> Note: If the executor can’t describe columns for `relation`, this macro falls back to `*`. + +### Python macros – `models/macros_py/helpers.py` + +```python +def slugify(value: str) -> str: ... +def mask_email(email: str) -> str: ... +def csv_values(rows: list[dict], cols: list[str]) -> str: ... +``` + +Exposed as Jinja globals/filters at **render time** (not runtime SQL UDFs). 
+ +--- + +## Models using macros + +### `stg_users.ff.sql` (Jinja + Python macro usage) + +* Coalesces missing country with `default_country()` +* Adds `email_domain(...)` +* Embeds a `slugify(var('site_name', ...))` literal into SQL + +```jinja +with src as ( + select + cast(id as int) as user_id, + lower(email) as email, + {{ coalesce_any("country", default_country()) }} as country + from {{ source('crm', 'users') }} +) +select + user_id, + email, + {{ email_domain("email") }} as email_domain, + country, + '{{ slugify(var("site_name", "My Site")) }}' as site_slug +from src; +``` + +### `stg_orders.ff.sql` (engine-aware types) + +```jinja +select + cast(order_id as int) as order_id, + cast(customer_id as int) as user_id, + {{ safe_cast_amount("amount") }} as amount, + cast(order_ts as timestamp) as order_ts +from {{ source('sales', 'orders') }}; +``` + +### `dim_users.ff.sql` (inline lookup via Python macro) + +```jinja +labels as ( + select * from (values {{ csv_values( + [ + {"domain":"example.com", "label":"internal"}, + {"domain":"gmail.com", "label":"consumer"}, + ], + ["domain","label"] + ) }}) as t(domain, label) +) +``` + +### `fct_user_sales.ff.sql` (final aggregation) + +Joins `stg_orders` with `dim_users` and aggregates. + +--- + +## Tests (examples) + +Declared in `project.yml`: + +* `not_null(dim_users.user_id)` +* `row_count_between(fct_user_sales, min_rows=1)` + +Run with: + +```bash +fft test examples/macros_demo --env dev_duckdb --select tag:example:macros_demo +``` + +--- + +## Troubleshooting + +* **`jinja2.exceptions.UndefinedError: 'var'/'env'/'engine' is undefined`** + Ensure your core’s Jinja environment registers these globals before loading templates: + + ```python + env.globals.update(var=..., env=..., engine=...) + ``` +* **Engine differences (types & functions):** + Always branch in macros (`engine(...)`) when types or functions differ. +* **`adapter_columns(...)` returns none:** + The `star_except` macro will fallback to `*`. 
For strict behavior, replace with static column lists per engine. + +--- + +## Extending this demo + +* Add more helpers to `helpers.py` (e.g., `render_json(obj)`, `join_csv(list)`). +* Create reusable macro libraries under `models/macros/` (date handling, SCD helpers, etc.). +* Use `var(...)` to parameterize behavior per environment or profile. + +--- + +Happy macro-ing! diff --git a/docs/examples/Materializations_Demo.md b/docs/examples/Materializations_Demo.md new file mode 100644 index 0000000..3d89a53 --- /dev/null +++ b/docs/examples/Materializations_Demo.md @@ -0,0 +1,162 @@ +# Materializations Demo + +> This example shows how different **materializations** (`view`, `table`, `incremental`, `ephemeral`) behave in FastFlowTransform. + +The demo models are located under: +``` + +examples/materializations_demo/models/ + +```` + +Each model type demonstrates how FastFlowTransform builds, caches, or executes models differently depending on its `materialized:` configuration. + +--- + +## 🧩 1. View Models + +A **view** model is always re-created from scratch each run. +It defines a virtual relation that doesn’t store data permanently — ideal for lightweight transformations. + +```sql +{{ config(materialized='view') }} + +select + order_id, + customer_id, + total_amount, + order_date +from {{ ref('stg_orders') }} +```` + +**Characteristics** + +* Rebuilt each run (no persisted data) +* Useful for staging, joins, and intermediate logic +* Fast and always up-to-date with upstreams +* Cannot store or cache incremental state + +--- + +## 🧱 2. Table Models + +A **table** model materializes into a physical table on the target engine. + +```sql +{{ config(materialized='table') }} + +select * +from {{ ref('fct_orders_view') }} +``` + +**Characteristics** + +* Fully rebuilt every run +* Good for final curated datasets or small tables +* Overwrites previous contents (atomic replace) +* Compatible with all engines (DuckDB, Postgres, BigQuery, etc.) + +--- + +## ⚡ 3. 
Incremental Models + +An **incremental** model stores state and only updates changed records on subsequent runs. + +```sql +{{ config( + materialized='incremental', + incremental={ + "enabled": true, + "unique_key": "order_id", + "updated_at_column": "updated_at", + "delta_sql": "select * from {{ ref('stg_orders') }} where updated_at > (select max(updated_at) from {{ this }})" + } +) }} +``` + +**Characteristics** + +* Persists data between runs +* Only merges new or changed rows +* Significantly faster for large tables +* Requires `unique_key` and (optionally) an `updated_at_column` +* Schema changes can be managed via: + + * `on_schema_change: "ignore"` + * `on_schema_change: "append_new_columns"` + * `on_schema_change: "sync_all_columns"` + +**Behavior example:** + +| Run | Operation | Rows affected | +| --- | ----------- | ------------- | +| 1 | full load | 10,000 | +| 2 | merge delta | 120 | +| 3 | merge delta | 45 | + +--- + +## 🧮 4. Ephemeral Models + +An **ephemeral** model exists only during query compilation. +It never creates a physical table or view — it’s inlined wherever it’s referenced. + +```sql +{{ config(materialized='ephemeral') }} + +select + order_id, + total_amount * 0.1 as tax_amount +from {{ ref('fct_orders_inc') }} +``` + +**Characteristics** + +* Inlined into parent queries +* Reduces I/O overhead (no temporary tables) +* Ideal for lightweight reusable SQL snippets +* Not visible in the warehouse after execution + +--- + +## 🔗 5. Combined Example DAG + +In the demo, these models are connected as follows: + +```text +stg_orders + ↓ +fct_orders_view (view) + ↓ +fct_orders_tbl (table) + ↓ +fct_orders_inc (incremental) + ↓ +fct_orders_ephemeral (ephemeral) +``` + +This DAG demonstrates: + +* How **data flows** between materializations +* Which ones persist or recompute data +* How incremental models can feed downstream table or ephemeral models + +--- + +## 🧭 When to Use Each Type + +| Materialization | Persists? 
| Performance | Recommended Use Case | +| --------------- | --------- | ------------------- | ----------------------------------------- | +| `view` | ❌ No | ⚡ Fast rebuild | Intermediate or temporary transformations | +| `table` | ✅ Yes | ⚖️ Moderate | Final outputs or smaller datasets | +| `incremental` | ✅ Yes | 🚀 High (on deltas) | Large, frequently updated fact tables | +| `ephemeral` | ❌ No | ⚡ Fast inline | Reusable SQL snippets or shared logic | + +--- + +## 🧠 Tips + +* You can set default materializations in `project.yml` under `models.materialized`. +* Override per model using `{{ config(materialized='...') }}`. +* For incremental models, ensure **unique keys** and **delta logic** are consistent across runs. +* Test behavior locally using the DuckDB engine before deploying to a warehouse. diff --git a/docs/index.md b/docs/index.md index eccc66f..82efcc7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,17 +9,28 @@ Welcome! This page is your starting point for FastFlowTransform docs. 
Pick the t - [User Guide](./Technical_Overview.md#part-i-operational-guide) - [Modeling Reference](./Config_and_Macros.md) - [Parallelism & Cache](./Cache_and_Parallelism.md) +- [CLI Guide](./CLI_Guide.md) +- [Logging & Verbosity](./Logging.md) - [API calls in Python models](./Api_Models.md) - [Incremental Models](./Incremental.md) - [YAML Tests (Schema-bound)](./YAML_Tests.md) +- [Model Unit Tests](./Unit_Tests.md) - [Data Quality Tests Reference](./Data_Quality_Tests.md) +- [Auto-Docs & Lineage](./Auto_Docs.md) +- [Troubleshooting & Error Codes](./Troubleshooting.md) - [Profiles & Environments](./Profiles.md) - [Sources Declaration](./Sources.md) - [Project Configuration](./Project_Config.md) - [State Selection (changed & results)](./State_Selection.md) -- [Basic Demo Overview](./examples/Basic_Demo.md) -- [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) -- [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs-lineage) +- [Basic Demo](./examples/Basic_Demo.md) +- [Materializations Demo](./examples/Materializations_Demo.md) +- [Data Quality Tests Demo](./examples/DQ_Demo.md) +- [Macros Demo](./examples/Macros_Demo.md) +- [Cache Demo](./examples/Cache_Demo.md) +- [Environment Matrix Demo](./examples/Environment_Matrix.md) +- [Incremental & Delta Demo](examples/Incremental_Demo.md) +- [Local Engine Setup](./examples/Local_Engine_Setup.md) +- [API Demo](./examples/API_Demo.md) - [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) ## Table of Contents @@ -37,15 +48,15 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t - **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. - **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, and Databricks Spark. 
-- **Understand the project layout & CLI workflow:** see *Project Layout*, *Makefile Targets*, and *CLI Flows* in the [Technical Overview](Technical_Overview.md#project-layout). -- **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles-environment-overrides). -- **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fft-utest) covers unit tests, troubleshooting tips, and exit codes. +- **Understand the project layout & CLI workflow:** start with *Project Layout* in the [Technical Overview](Technical_Overview.md#project-layout) and pair it with the [CLI Guide](CLI_Guide.md) for command patterns. +- **Configure runtimes & profiles:** review executor profiles and environment overrides in the dedicated [Profiles guide](Profiles.md) plus [Logging & Verbosity](Logging.md) for observability flags. +- **Model data quality & troubleshoot runs:** combine the [Model Unit Tests guide](Unit_Tests.md) with [Troubleshooting & Error Codes](Troubleshooting.md) to keep runs deterministic and easy to debug. - **Explore runnable demos:** start with the [Basic Demo Overview](examples/Basic_Demo.md) or browse the `examples/` directory; each subproject ships with its own README. ### 2. Extend FastFlowTransform (Developers & Contributors) - **Dive into architecture & core modules:** start with [Architecture Overview](Technical_Overview.md#architecture-overview) and [Core Modules](Technical_Overview.md#core-modules) for registry, DAG, executors, validation, and more. -- **Add tests & seeds:** see [Sample Models](Technical_Overview.md#sample-models), [Seeds & Example Data](Technical_Overview.md#seeds-example-data), and the unit test guide in [Model Unit Tests](Technical_Overview.md#model-unit-tests-fft-utest). 
+- **Add tests & seeds:** reuse the curated demos under `docs/examples/` for seeds/Makefiles and follow the [Model Unit Tests guide](Unit_Tests.md) for deterministic fixtures. - **Contribute code:** follow the workflow described in [`./Contributing.md`](./Contributing.md) and consult the module-level docs for internal APIs. - **Plan ahead:** check the roadmap snapshot in the [Technical Overview](Technical_Overview.md#roadmap-snapshot) to understand upcoming work. diff --git a/examples/api_demo/.env b/examples/api_demo/.env new file mode 100644 index 0000000..986d7dc --- /dev/null +++ b/examples/api_demo/.env @@ -0,0 +1,5 @@ +FF_HTTP_ALLOWED_DOMAINS=jsonplaceholder.typicode.com,api.github.com +FF_HTTP_CACHE_DIR=.local/http-cache +FF_HTTP_MAX_RPS=5 +FF_HTTP_MAX_RETRIES=3 +FF_HTTP_TIMEOUT=20 \ No newline at end of file diff --git a/examples/api_demo/.env.dev_databricks b/examples/api_demo/.env.dev_databricks new file mode 100644 index 0000000..4c425c0 --- /dev/null +++ b/examples/api_demo/.env.dev_databricks @@ -0,0 +1,11 @@ +# Databricks Spark profile defaults for API demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=api_demo + +# Managed table configuration (Hive-compatible Spark session) +FF_DBR_ENABLE_HIVE=1 +FF_DBR_DATABASE=api_demo +# Uncomment to switch to Delta Lake (requires delta-spark dependency) +# FF_DBR_TABLE_FORMAT=delta + +JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/api_demo/.env.dev_duckdb b/examples/api_demo/.env.dev_duckdb new file mode 100644 index 0000000..accb49b --- /dev/null +++ b/examples/api_demo/.env.dev_duckdb @@ -0,0 +1,3 @@ +# DuckDB profile for API demo (kept out of Makefile) +FF_DUCKDB_PATH=.local/api_demo.duckdb +FF_DUCKDB_SCHEMA=api_demo diff --git a/examples/api_demo/.env.dev_postgres b/examples/api_demo/.env.dev_postgres new file mode 100644 index 0000000..ff4c2a7 --- /dev/null +++ b/examples/api_demo/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for API demo (store real secrets outside VCS!) 
+FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=api_demo diff --git a/examples/api_demo/Makefile b/examples/api_demo/Makefile index 965c1d8..f6d4da5 100644 --- a/examples/api_demo/Makefile +++ b/examples/api_demo/Makefile @@ -108,7 +108,7 @@ demo-open: fi demo: clean - @echo "== 🚀 R1 Demo (DuckDB) ==" + @echo "== 🚀 API Demo (DuckDB) ==" @echo "Profile=$(PROFILE_ENV) DB=$(DB) PROJECT=$(PROJECT)" +$(MAKE) seed +$(MAKE) run @@ -128,7 +128,6 @@ demo: clean +$(MAKE) res-warn @echo @echo "✅ Demo done. Open DAG here: $(PROJECT)/site/dag/index.html" - +$(MAKE) demo-open # --- API-specific convenience targets ---------------------------------------- diff --git a/examples/api_demo/profiles.yml b/examples/api_demo/profiles.yml index d55c6b5..647fb16 100644 --- a/examples/api_demo/profiles.yml +++ b/examples/api_demo/profiles.yml @@ -3,23 +3,17 @@ dev_duckdb: engine: duckdb - vars: - api_users_model: "api_users_http" duckdb: path: "{{ env('FF_DUCKDB_PATH', '.local/api_demo.duckdb') }}" dev_postgres: engine: postgres - vars: - api_users_model: "api_users_http" postgres: dsn: "{{ env('FF_PG_DSN') }}" db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" dev_databricks: engine: databricks_spark - vars: - api_users_model: "api_users_http" databricks_spark: master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" app_name: "{{ env('FF_SPARK_APP_NAME', 'api_demo') }}" @@ -30,10 +24,3 @@ dev_databricks: spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" - -default: - engine: duckdb - vars: - api_users_model: "api_users_http" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" diff --git a/examples/api_demo/project.yml b/examples/api_demo/project.yml index c6788e5..2aa9492 100644 --- a/examples/api_demo/project.yml +++ b/examples/api_demo/project.yml @@ -25,8 +25,12 @@ 
seeds: format: parquet tests: - # Batch tables - type: not_null table: mart_users_join column: user_id tags: [batch] + - type: row_count_between + table: mart_users_join + min_rows: 3 + max_rows: 3 + tags: [batch] diff --git a/examples/basic_demo/.env.dev_databricks b/examples/basic_demo/.env.dev_databricks new file mode 100644 index 0000000..17bad51 --- /dev/null +++ b/examples/basic_demo/.env.dev_databricks @@ -0,0 +1,13 @@ +# Databricks (or local Spark) profile defaults for the basic demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=basic_demo + +# Optional overrides when using Databricks SQL warehouses or Unity Catalog +# FF_DBR_DATABASE=basic_demo +# FF_DBR_CATALOG=hive_metastore + +# Set these if you need a Hive-compatible Spark metastore +FF_DBR_ENABLE_HIVE=1 + +# Configure Java for local Spark sessions when needed +# JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/basic_demo/.env.dev_duckdb b/examples/basic_demo/.env.dev_duckdb new file mode 100644 index 0000000..1bad4d9 --- /dev/null +++ b/examples/basic_demo/.env.dev_duckdb @@ -0,0 +1,2 @@ +# DuckDB profile for the basic demo +FF_DUCKDB_PATH=.local/basic_demo.duckdb diff --git a/examples/basic_demo/.env.dev_postgres b/examples/basic_demo/.env.dev_postgres new file mode 100644 index 0000000..2000d7c --- /dev/null +++ b/examples/basic_demo/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for the basic demo (replace with your own connection string) +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=basic_demo diff --git a/examples/basic_demo/Makefile b/examples/basic_demo/Makefile index 7f7f5a9..8f5a0be 100644 --- a/examples/basic_demo/Makefile +++ b/examples/basic_demo/Makefile @@ -92,6 +92,5 @@ demo: clean +$(MAKE) run ENGINE=$(ENGINE) +$(MAKE) dag ENGINE=$(ENGINE) +$(MAKE) test ENGINE=$(ENGINE) - +$(MAKE) show ENGINE=$(ENGINE) +$(MAKE) artifacts @echo "✅ Demo complete." 
diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml index 130a7e7..514028f 100644 --- a/examples/basic_demo/profiles.yml +++ b/examples/basic_demo/profiles.yml @@ -3,38 +3,25 @@ dev_duckdb: engine: duckdb - vars: - demo_target_schema: main duckdb: path: "{{ env('FF_DUCKDB_PATH', '.local/basic_demo.duckdb') }}" dev_postgres: engine: postgres - vars: - demo_target_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" postgres: dsn: "{{ env('FF_PG_DSN') }}" db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" dev_databricks: engine: databricks_spark - vars: - demo_target_schema: "{{ env('FF_DBR_DATABASE', 'basic_demo') }}" databricks_spark: master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" app_name: "{{ env('FF_SPARK_APP_NAME', 'basic_demo') }}" warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" - schema: "{{ env('FF_DBR_DATABASE', 'basic_demo') }}" + database: "{{ env('FF_DBR_DATABASE', 'basic_demo') }}" extra_conf: spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" - -default: - engine: duckdb - vars: - demo_target_schema: main - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" diff --git a/examples/basic_demo/project.yml b/examples/basic_demo/project.yml index 3986ec5..1b380e7 100644 --- a/examples/basic_demo/project.yml +++ b/examples/basic_demo/project.yml @@ -61,3 +61,9 @@ tests: column: user_count threshold: 0 tags: [example_basic_demo] + + - type: row_count_between + table: mart_users_by_domain + min_rows: 1 + max_rows: 10 + tags: [example_basic_demo] diff --git a/examples/basic_demo/sources.yml 
b/examples/basic_demo/sources.yml index 25c9c73..d48deca 100644 --- a/examples/basic_demo/sources.yml +++ b/examples/basic_demo/sources.yml @@ -2,7 +2,6 @@ version: 2 sources: - name: crm - description: Seeded CRM-style data for the demo. tables: - name: users identifier: seed_users diff --git a/examples/cache_demo/.env.dev_duckdb b/examples/cache_demo/.env.dev_duckdb new file mode 100644 index 0000000..5243ddd --- /dev/null +++ b/examples/cache_demo/.env.dev_duckdb @@ -0,0 +1,10 @@ +# DuckDB profile for cache demo +FF_DUCKDB_PATH=.local/cache_demo.duckdb +FF_DUCKDB_SCHEMA=cache_demo + +# HTTP cache +FF_HTTP_ALLOWED_DOMAINS=jsonplaceholder.typicode.com +FF_HTTP_CACHE_DIR=.local/http-cache +FF_HTTP_MAX_RPS=5 +FF_HTTP_MAX_RETRIES=2 +FF_HTTP_TIMEOUT=10 diff --git a/examples/cache_demo/Makefile b/examples/cache_demo/Makefile new file mode 100644 index 0000000..da6aa4c --- /dev/null +++ b/examples/cache_demo/Makefile @@ -0,0 +1,101 @@ +.PHONY: seed run run_parallel cache_first cache_second \ + change_sql change_seed change_env change_py \ + http_first http_offline http_cache_clear artifacts dag clean \ + demo + +ENGINE ?= duckdb +PROFILE_ENV = dev_duckdb +PROJECT ?= . 
+UV ?= uv + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +RUN_ENV = $(BASE_ENV) + +SELECT_ALL = --select tag:example:cache_demo + +seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) + +run: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_ALL) --cache=rw + +run_parallel: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_ALL) --cache=rw --jobs 4 + +cache_first: seed run +cache_second: run + +change_sql: + # Touch SQL to change rendered output → downstream mart rebuilds + touch "$(PROJECT)/models/seeds_consumers/stg_users.ff.sql" + +$(MAKE) run + +change_seed: + # Modify seed → staging and mart rebuild + @printf "\n4,dan@example.com\n" >> "$(PROJECT)/seeds/seed_users.csv" + +$(MAKE) seed + +$(MAKE) run + +change_env: + # FF_* env affects fingerprint → everything rebuilds + env $(RUN_ENV) FF_DEMO_TOGGLE=1 $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_ALL) --cache=rw + +change_py: + # Edit the constant in py_constants.ff.py to 43 manually, then: + +$(MAKE) run + +http_first: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select http_users --cache=rw + +http_offline: + # Works only if http_first warmed cache + env $(RUN_ENV) FF_HTTP_OFFLINE=1 $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) --select http_users --cache=rw + +http_cache_clear: + rm -rf ".local/http-cache" + +artifacts: + @echo ".fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_ALL) --html + +clean: + rm -rf .local cache_demo.duckdb site .fastflowtransform + +demo: clean + @echo "== 🚀 Cache Demo ($(ENGINE)) ==" + @echo "== 1) First full build (writes cache) ==" + +$(MAKE) cache_first + @echo + @echo "== 2) No-op run (should skip everything) ==" + +$(MAKE) cache_second + @echo + @echo "== 3) Parallel run (visualize level-wise concurrency) ==" + +$(MAKE) run_parallel + @echo + 
@echo "== 4) Touch SQL model (downstream mart should rebuild) ==" + +$(MAKE) change_sql + +$(MAKE) cache_second + @echo + @echo "== 5) Change seed data (staging + mart should rebuild) ==" + +$(MAKE) change_seed + +$(MAKE) cache_second + @echo + @echo "== 6) Change FF_* env (global cache invalidation) ==" + +$(MAKE) change_env + +$(MAKE) cache_second + @echo + @echo "== 7) Edit Python model (only that one should rebuild) ==" + +$(MAKE) change_py + +$(MAKE) cache_second + @echo + @echo "== 8) Warm HTTP cache & run offline ==" + +$(MAKE) http_first + +$(MAKE) http_offline + @echo + @echo "== 9) DAG & artifacts ==" + +$(MAKE) dag + +$(MAKE) artifacts + @echo + @echo "✅ Demo complete. Open DAG at: $(PROJECT)/site/dag/index.html" diff --git a/examples/cache_demo/README.md b/examples/cache_demo/README.md new file mode 100644 index 0000000..9482057 --- /dev/null +++ b/examples/cache_demo/README.md @@ -0,0 +1,39 @@ +# Cache Demo + +This demo shows: +- Build cache skip/hit via fingerprints +- Downstream invalidation (seed → staging → mart) +- Environment-driven invalidation (only `FF_*`) +- Parallelism within levels (`--jobs`) +- HTTP response cache + offline mode + +## Quickstart + +```bash +cd examples/cache_demo +make cache_first # builds and writes cache +make cache_second # should SKIP everything +make change_sql # touch SQL → mart rebuilds +make change_seed # add a seed row → staging + mart rebuild +make change_env # FF_* env change → full rebuild +make change_py # edit constant in py_constants.ff.py → it rebuilds + +make http_first # warms HTTP cache +make http_offline # reuses HTTP cache without network +make http_cache_clear # clears HTTP response cache +``` + +Inspect: + +- `site/dag/index.html` +- `.fastflowtransform/target/run_results.json` (HTTP stats, results) + +--- + +## What this demo proves (in a minute) + +- **Cache hit/skip:** `make cache_second` should skip everything (if nothing changed).
+- **Upstream invalidation:** `make change_seed` rebuilds staging **and** the mart. +- **Env invalidation:** `make change_env` rebuilds everything (because `FF_*` env vars are part of the fingerprint). +- **Python source sensitivity:** `py_constants` rebuilds only when its code changes. +- **HTTP cache:** `http_first` fetches; `http_offline` runs fully offline using cached responses. diff --git a/examples/cache_demo/models/README.md b/examples/cache_demo/models/README.md new file mode 100644 index 0000000..32818bb --- /dev/null +++ b/examples/cache_demo/models/README.md @@ -0,0 +1,4 @@ +# Models directory + +Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here. +See docs/Config_and_Macros.md for modeling guidance and config options. diff --git a/examples/cache_demo/models/http/http_users.ff.py b/examples/cache_demo/models/http/http_users.ff.py new file mode 100644 index 0000000..bf1bf80 --- /dev/null +++ b/examples/cache_demo/models/http/http_users.ff.py @@ -0,0 +1,21 @@ +from fastflowtransform import model +from fastflowtransform.api.http import get_df +import pandas as pd + + +@model( + name="http_users", + deps=["stg_users.ff"], # just to show a dependency; not used in code + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:duckdb"], + }, +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + ) + cols = [c for c in df.columns if c in ("id", "email", "username")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/cache_demo/models/marts/mart_user_orders.ff.sql b/examples/cache_demo/models/marts/mart_user_orders.ff.sql new file mode 100644 index 0000000..406aa61 --- /dev/null +++ b/examples/cache_demo/models/marts/mart_user_orders.ff.sql @@ -0,0 +1,12 @@ +{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb']) }} +with u as ( + select user_id, email from {{ ref('stg_users.ff') }} +), +o as ( + select user_id, amount
from {{ ref('stg_orders.ff') }} +) +select u.user_id, u.email, coalesce(sum(o.amount),0) as total_amount +from u +left join o using (user_id) +group by 1,2 +order by 1; diff --git a/examples/cache_demo/models/python/py_constants.ff.py b/examples/cache_demo/models/python/py_constants.ff.py new file mode 100644 index 0000000..5e22b7a --- /dev/null +++ b/examples/cache_demo/models/python/py_constants.ff.py @@ -0,0 +1,16 @@ +from fastflowtransform import model +import pandas as pd + + +@model( + name="py_constants", + deps=[], # independent + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:duckdb"], + }, +) +def build() -> pd.DataFrame: + # Change this constant to trigger a fingerprint change for a pure Python model. + CONSTANT = 42 + return pd.DataFrame([{"k": "answer", "v": CONSTANT}]) diff --git a/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql b/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql new file mode 100644 index 0000000..9444fcc --- /dev/null +++ b/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql @@ -0,0 +1,6 @@ +{{ config(materialized='view', tags=['example:cache_demo','engine:duckdb']) }} +select + cast(order_id as int) as order_id, + cast(customer_id as int) as user_id, + cast(amount as double) as amount +from {{ source('crm', 'orders') }}; diff --git a/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql b/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql new file mode 100644 index 0000000..5d55e6a --- /dev/null +++ b/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql @@ -0,0 +1,3 @@ +{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb']) }} +select cast(id as int) as user_id, lower(email) as email +from {{ source('crm', 'users') }}; diff --git a/examples/cache_demo/profiles.yml b/examples/cache_demo/profiles.yml new file mode 100644 index 0000000..b65dc2f --- /dev/null +++ b/examples/cache_demo/profiles.yml @@ -0,0 +1,4 @@ +dev_duckdb: + 
engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/cache_demo.duckdb') }}" diff --git a/examples/cache_demo/project.yml b/examples/cache_demo/project.yml new file mode 100644 index 0000000..25b996d --- /dev/null +++ b/examples/cache_demo/project.yml @@ -0,0 +1,23 @@ +name: cache_demo +version: "0.1" + +vars: {} + +models: + storage: + stg_users.ff: { path: ".local/duck/users", format: parquet } + stg_orders.ff: { path: ".local/duck/orders", format: parquet } + mart_user_orders.ff: { path: ".local/duck/mart_user_orders", format: parquet } + py_constants: { path: ".local/duck/py_constants", format: parquet } + http_users: { path: ".local/duck/http_users", format: parquet } + +seeds: + storage: + seed_users: { path: ".local/duck/seed_users", format: parquet } + seed_orders: { path: ".local/duck/seed_orders", format: parquet } + +tests: + - type: row_count_between + table: stg_users + min_rows: 3 + tags: [batch] diff --git a/examples/cache_demo/seeds/README.md b/examples/cache_demo/seeds/README.md new file mode 100644 index 0000000..2e553ed --- /dev/null +++ b/examples/cache_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory + +Add CSV or Parquet files for reproducible seeds. +Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. 
diff --git a/examples/cache_demo/seeds/seed_orders.csv b/examples/cache_demo/seeds/seed_orders.csv new file mode 100644 index 0000000..1ae1de3 --- /dev/null +++ b/examples/cache_demo/seeds/seed_orders.csv @@ -0,0 +1,4 @@ +order_id,customer_id,amount +10,1,12.50 +20,1,2.50 +30,2,8.00 diff --git a/examples/cache_demo/seeds/seed_users.csv b/examples/cache_demo/seeds/seed_users.csv new file mode 100644 index 0000000..5559381 --- /dev/null +++ b/examples/cache_demo/seeds/seed_users.csv @@ -0,0 +1,6 @@ +id,email +1,alice@example.com +2,bob@example.com +3,carol@example.com + +4,dan@example.com diff --git a/examples/postgres/site/dag/mart_users.ff.html b/examples/cache_demo/site/dag/http_users.html similarity index 95% rename from examples/postgres/site/dag/mart_users.ff.html rename to examples/cache_demo/site/dag/http_users.html index 9d31e7a..220718b 100644 --- a/examples/postgres/site/dag/mart_users.ff.html +++ b/examples/cache_demo/site/dag/http_users.html @@ -3,7 +3,7 @@ - mart_users.ff – FastFlowTransform + http_users – FastFlowTransform + + +

← Back to overview

+ +
+
+

+ py_constants + table +

+
Model Detail • FastFlowTransform
+
+ python +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
py_constants
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/python/py_constants.ff.py + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + + + +
+ + + + \ No newline at end of file diff --git a/examples/cache_demo/site/dag/stg_orders.ff.html b/examples/cache_demo/site/dag/stg_orders.ff.html new file mode 100644 index 0000000..de0f1b9 --- /dev/null +++ b/examples/cache_demo/site/dag/stg_orders.ff.html @@ -0,0 +1,146 @@ + + + + + + stg_orders.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ stg_orders.ff + view +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
view
+ +
Relation
+
stg_orders
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + + + +
+ + + + \ No newline at end of file diff --git a/examples/postgres/site/dag/users.ff.html b/examples/cache_demo/site/dag/stg_users.ff.html similarity index 94% rename from examples/postgres/site/dag/users.ff.html rename to examples/cache_demo/site/dag/stg_users.ff.html index e018a88..3e4ee0e 100644 --- a/examples/postgres/site/dag/users.ff.html +++ b/examples/cache_demo/site/dag/stg_users.ff.html @@ -3,7 +3,7 @@ - users.ff – FastFlowTransform + stg_users.ff – FastFlowTransform + + +

← Back to overview

+ +
+
+

+ customers.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
customers
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/dq_demo/models/staging/customers.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
customer_idbigint + + yes + + + + — + + + + + ?.customer_id + + direct + + + + +
namestring + + yes + + + + — + + + + + ?.name + + direct + + + + +
statusstring + + yes + + + + — + + + + + ?.status + + direct + + + + +
created_atstring + + yes + + + + — + + + + + ?.created_at + + direct + + + + +
+ + + + + + + + + \ No newline at end of file diff --git a/examples/dq_demo/site/dag/index.html b/examples/dq_demo/site/dag/index.html new file mode 100644 index 0000000..c9a164b --- /dev/null +++ b/examples/dq_demo/site/dag/index.html @@ -0,0 +1,248 @@ + + + + + + FastFlowTransform - DAG & Mini Docs + + + + + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + customers_ff["customers.ff
(customers)"] + class customers_ff sql; + mart_orders_agg_ff["mart_orders_agg.ff
(mart_orders_agg)"] + class mart_orders_agg_ff sql; + orders_ff["orders.ff
(orders)"] + class orders_ff sql; + orders_ff --> mart_orders_agg_ff + customers_ff --> mart_orders_agg_ff +
+
+ + + +
+

Macros

+ +

No macros found.

+ +
+
+ + + + \ No newline at end of file diff --git a/examples/dq_demo/site/dag/mart_orders_agg.ff.html b/examples/dq_demo/site/dag/mart_orders_agg.ff.html new file mode 100644 index 0000000..91495af --- /dev/null +++ b/examples/dq_demo/site/dag/mart_orders_agg.ff.html @@ -0,0 +1,320 @@ + + + + + + mart_orders_agg.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ mart_orders_agg.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
mart_orders_agg
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/dq_demo/models/marts/mart_orders_agg.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
customer_idint + + yes + + + + — + + + + + ?.customer_id + + direct + + + + +
customer_namestring + + yes + + + + — + + + + + ?.name + + direct + + + + +
statusstring + + yes + + + + — + + + + unknown + +
order_countbigint + + yes + + + + — + + + + unknown + +
total_amountdecimal(20,0) + + yes + + + + — + + + + unknown + +
first_order_tstimestamp + + yes + + + + — + + + + unknown + +
last_order_tstimestamp + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/dq_demo/site/dag/orders.ff.html b/examples/dq_demo/site/dag/orders.ff.html new file mode 100644 index 0000000..40bcc68 --- /dev/null +++ b/examples/dq_demo/site/dag/orders.ff.html @@ -0,0 +1,275 @@ + + + + + + orders.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ orders.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
orders
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/dq_demo/models/staging/orders.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
order_idint + + yes + + + + — + + + + + ?.? + + transformed + + + + +
customer_idint + + yes + + + + — + + + + + ?.? + + transformed + + + + +
amountdecimal(10,0) + + yes + + + + — + + + + + ?.? + + transformed + + + + +
order_tstimestamp + + yes + + + + — + + + + + ?.? + + transformed + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/dq_demo/sources.yml b/examples/dq_demo/sources.yml new file mode 100644 index 0000000..0260a5a --- /dev/null +++ b/examples/dq_demo/sources.yml @@ -0,0 +1,9 @@ +version: 2 + +sources: + - name: crm + tables: + - name: customers + identifier: seed_customers # materialized via `fft seed` + - name: orders + identifier: seed_orders # materialized via `fft seed` diff --git a/examples/dq_demo/tests/unit/README.md b/examples/dq_demo/tests/unit/README.md new file mode 100644 index 0000000..b3c3c8d --- /dev/null +++ b/examples/dq_demo/tests/unit/README.md @@ -0,0 +1,4 @@ +# Unit tests + +Define YAML unit specs as described in docs/Config_and_Macros.md#73-model-unit-tests-fft-utest. +Invoke them with `fft utest --env `. diff --git a/examples/env_matrix/.env b/examples/env_matrix/.env new file mode 100644 index 0000000..e14bf83 --- /dev/null +++ b/examples/env_matrix/.env @@ -0,0 +1,6 @@ +# examples/env_matrix/.env +# Shared defaults (low priority). These can be safely left empty. 
+ +# Common HTTP/telemetry toggles if you like +FFT_SQL_DEBUG=0 +FFT_LOG_JSON=0 diff --git a/examples/env_matrix/.env.dev b/examples/env_matrix/.env.dev new file mode 100644 index 0000000..62795f1 --- /dev/null +++ b/examples/env_matrix/.env.dev @@ -0,0 +1,6 @@ +# examples/env_matrix/.env.dev +FFT_ACTIVE_ENV=dev + +# Dev: default to DuckDB in a local file +FF_ENGINE=duckdb +FF_DUCKDB_PATH=.local/env_matrix.dev.duckdb diff --git a/examples/env_matrix/.env.prod b/examples/env_matrix/.env.prod new file mode 100644 index 0000000..bf01f0d --- /dev/null +++ b/examples/env_matrix/.env.prod @@ -0,0 +1,5 @@ +# examples/env_matrix/.env.prod +FFT_ACTIVE_ENV=prod + +FF_ENGINE=duckdb +FF_DUCKDB_PATH=.local/env_matrix.prod.duckdb diff --git a/examples/env_matrix/.env.stg b/examples/env_matrix/.env.stg new file mode 100644 index 0000000..7d03292 --- /dev/null +++ b/examples/env_matrix/.env.stg @@ -0,0 +1,5 @@ +# examples/env_matrix/.env.stg +FFT_ACTIVE_ENV=stg + +FF_ENGINE=duckdb +FF_DUCKDB_PATH=.local/env_matrix.stg.duckdb diff --git a/examples/env_matrix/.fastflowtransform/cache/prod-duckdb.json b/examples/env_matrix/.fastflowtransform/cache/prod-duckdb.json deleted file mode 100644 index 9eacacd..0000000 --- a/examples/env_matrix/.fastflowtransform/cache/prod-duckdb.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "engine": "duckdb", - "entries": { - "env_vars.ff": "f7ef95030b8fe43c52bd748deb4fb8061c41d9f384b5b34a49ff69203cde4d82", - "users.ff": "11705124b10bc738b123f525e164f81d909972b29651b4b86153c8edba18bde1" - }, - "profile": "prod", - "version": 1 -} \ No newline at end of file diff --git a/examples/env_matrix/.fastflowtransform/target/catalog.json b/examples/env_matrix/.fastflowtransform/target/catalog.json index 743d3df..b1e4d0c 100644 --- a/examples/env_matrix/.fastflowtransform/target/catalog.json +++ b/examples/env_matrix/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-28T18:36:20+00:00", + "generated_at": 
"2025-11-11T15:53:24+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/examples/env_matrix/.fastflowtransform/target/manifest.json b/examples/env_matrix/.fastflowtransform/target/manifest.json index 473bc66..8aa6b59 100644 --- a/examples/env_matrix/.fastflowtransform/target/manifest.json +++ b/examples/env_matrix/.fastflowtransform/target/manifest.json @@ -1,7 +1,7 @@ { "macros": {}, "metadata": { - "generated_at": "2025-10-28T18:36:20+00:00", + "generated_at": "2025-11-11T15:53:24+00:00", "tool": "fastflowtransform" }, "nodes": { diff --git a/examples/env_matrix/.fastflowtransform/target/run_results.json b/examples/env_matrix/.fastflowtransform/target/run_results.json index a9adfc8..ea2db71 100644 --- a/examples/env_matrix/.fastflowtransform/target/run_results.json +++ b/examples/env_matrix/.fastflowtransform/target/run_results.json @@ -1,28 +1,36 @@ { "metadata": { - "generated_at": "2025-10-28T18:36:20+00:00", + "generated_at": "2025-11-11T15:53:24+00:00", "tool": "fastflowtransform" }, "results": [ { - "duration_ms": 5, - "finished_at": "2025-10-28T18:36:20+00:00", - "http": null, + "duration_ms": 7, + "finished_at": "2025-11-11T15:53:24+00:00", + "http": { + "bytes": 0, + "cache_hits": 0, + "content_hashes": [], + "keys": [], + "node": "env_vars.ff", + "requests": 0, + "used_offline": false + }, "message": null, "name": "env_vars.ff", - "started_at": "2025-10-28T18:36:20+00:00", + "started_at": "2025-11-11T15:53:24+00:00", "status": "success" }, { "duration_ms": 2, - "finished_at": "2025-10-28T18:36:20+00:00", + "finished_at": "2025-11-11T15:53:24+00:00", "http": null, "message": null, "name": "users.ff", - "started_at": "2025-10-28T18:36:20+00:00", + "started_at": "2025-11-11T15:53:24+00:00", "status": "success" } ], - "run_finished_at": "2025-10-28T18:36:20+00:00", - "run_started_at": "2025-10-28T18:36:20+00:00" + "run_finished_at": "2025-11-11T15:53:24+00:00", + "run_started_at": "2025-11-11T15:53:24+00:00" } diff --git 
a/examples/env_matrix/profiles.yml b/examples/env_matrix/profiles.yml index f9a04a6..abc03f1 100644 --- a/examples/env_matrix/profiles.yml +++ b/examples/env_matrix/profiles.yml @@ -1,15 +1,14 @@ -default: - dev: - engine: "{{ env('FF_ENGINE') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH') }}" +dev: + engine: "{{ env('FF_ENGINE') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH') }}" - stg: - engine: "{{ env('FF_ENGINE') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH') }}" +stg: + engine: "{{ env('FF_ENGINE') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH') }}" - prod: - engine: "{{ env('FF_ENGINE') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH') }}" +prod: + engine: "{{ env('FF_ENGINE') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH') }}" diff --git a/examples/env_matrix/project.yml b/examples/env_matrix/project.yml index 502a0b7..e3300b2 100644 --- a/examples/env_matrix/project.yml +++ b/examples/env_matrix/project.yml @@ -1,4 +1,4 @@ -name: duckdb_api_demo +name: duckdb_env_matrix_demo version: "0.1" vars: {} @@ -13,5 +13,6 @@ tests: - type: row_count_between table: users - min: 1 + min_rows: 3 + max_rows: 3 tags: [batch] diff --git a/examples/incremental_demo/.env.dev_databricks_delta b/examples/incremental_demo/.env.dev_databricks_delta new file mode 100644 index 0000000..03fa4fb --- /dev/null +++ b/examples/incremental_demo/.env.dev_databricks_delta @@ -0,0 +1,11 @@ +# Databricks Spark profile defaults for incremental demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=incremental_demo + +# Managed table configuration (Hive-compatible Spark session) +FF_DBR_ENABLE_HIVE=1 +FF_DBR_DATABASE=incremental_demo +# Optional: Delta Lake +# FF_DBR_TABLE_FORMAT=delta + +JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/incremental_demo/.env.dev_databricks_iceberg b/examples/incremental_demo/.env.dev_databricks_iceberg new file mode 100644 index 0000000..03fa4fb --- /dev/null +++ b/examples/incremental_demo/.env.dev_databricks_iceberg @@ -0,0 +1,11 @@ +# Databricks 
Spark profile defaults for incremental demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=incremental_demo + +# Managed table configuration (Hive-compatible Spark session) +FF_DBR_ENABLE_HIVE=1 +FF_DBR_DATABASE=incremental_demo +# Optional: Delta Lake +# FF_DBR_TABLE_FORMAT=delta + +JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/incremental_demo/.env.dev_duckdb b/examples/incremental_demo/.env.dev_duckdb new file mode 100644 index 0000000..b679589 --- /dev/null +++ b/examples/incremental_demo/.env.dev_duckdb @@ -0,0 +1,4 @@ +# DuckDB profile for incremental demo +FF_DUCKDB_PATH=.local/incremental_demo.duckdb +FF_DUCKDB_CATALOG=incremental_demo_catalog +FF_DUCKDB_SCHEMA=incremental_demo diff --git a/examples/incremental_demo/.env.dev_postgres b/examples/incremental_demo/.env.dev_postgres new file mode 100644 index 0000000..139ff95 --- /dev/null +++ b/examples/incremental_demo/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for incremental demo +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=incremental_demo diff --git a/examples/incremental_demo/Makefile b/examples/incremental_demo/Makefile new file mode 100644 index 0000000..4b2c883 --- /dev/null +++ b/examples/incremental_demo/Makefile @@ -0,0 +1,112 @@ +.PHONY: seed run_full run_incr dag test artifacts clean demo demo-open + +# --- Config ------------------------------------------------------------------- + +DB ?= .local/incremental_demo.duckdb +PROJECT ?= . 
+UV ?= uv + +# Engine selector (duckdb|postgres|databricks_spark) +ENGINE ?= duckdb + +# For Databricks Spark: control table format (parquet|delta|iceberg) +DBR_TABLE_FORMAT ?= parquet + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + OPENER := open +else + OPENER := xdg-open +endif + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + ENGINE_TAG = engine:databricks_spark + # Choose profile based on table format so we can have separate configs: + # - dev_databricks_delta (Delta Lake) + # - dev_databricks_iceberg (Iceberg) + # - dev_databricks (generic / parquet) + ifeq ($(DBR_TABLE_FORMAT),delta) + PROFILE_ENV = dev_databricks_delta + else ifeq ($(DBR_TABLE_FORMAT),iceberg) + PROFILE_ENV = dev_databricks_iceberg + else + PROFILE_ENV = dev_databricks_delta + endif +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +ifeq ($(ENGINE),databricks_spark) + BASE_ENV := $(BASE_ENV) FF_DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT) +endif + +RUN_ENV = $(BASE_ENV) + +SELECT_FLAGS = --select tag:example:incremental_demo --select tag:$(ENGINE_TAG) + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py + +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" --duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) +endif + +# --- Targets ------------------------------------------------------------------ + +seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env 
$(PROFILE_ENV) + +# Full refresh (first run) +run_full: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --cache rw + +# second/subsequent run: shows incremental/delta behaviour +run_incr: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --cache rw + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html + +test: + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + +clean: + $(CLEAN_CMD) + +demo-open: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + else \ + echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +demo: clean + @echo "== 🚀 Incremental Demo ($(ENGINE)) ==" + @echo "Profile=$(PROFILE_ENV) DB=$(DB) PROJECT=$(PROJECT) DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT)" + +$(MAKE) seed + +$(MAKE) run_full + @echo + @echo "== 🔁 Second run (Incremental/Delta) ==" + +$(MAKE) run_incr + +$(MAKE) dag + +$(MAKE) test + +$(MAKE) artifacts + @echo "✅ Demo done. Open DAG here: $(PROJECT)/site/dag/index.html" diff --git a/examples/incremental_demo/README.md b/examples/incremental_demo/README.md new file mode 100644 index 0000000..5e977f7 --- /dev/null +++ b/examples/incremental_demo/README.md @@ -0,0 +1,7 @@ +# FastFlowTransform project scaffold + +This project was created with `fft init`. +Next steps: +1. Update `profiles.yml` with real connection details (docs/Profiles.md). +2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). +3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). 
diff --git a/examples/incremental_demo/models/README.md b/examples/incremental_demo/models/README.md new file mode 100644 index 0000000..32818bb --- /dev/null +++ b/examples/incremental_demo/models/README.md @@ -0,0 +1,4 @@ +# Models directory + +Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here. +See docs/Config_and_Macros.md for modeling guidance and config options. diff --git a/examples/incremental_demo/models/common/events_base.ff.sql b/examples/incremental_demo/models/common/events_base.ff.sql new file mode 100644 index 0000000..b23424d --- /dev/null +++ b/examples/incremental_demo/models/common/events_base.ff.sql @@ -0,0 +1,17 @@ +{{ config( + materialized='table', + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} + +select + event_id, + cast(updated_at as timestamp) as updated_at, + value +from {{ source('raw', 'events') }}; diff --git a/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql b/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql new file mode 100644 index 0000000..4aee4d1 --- /dev/null +++ b/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql @@ -0,0 +1,32 @@ +{{ config( + materialized='incremental', + unique_key='event_id', + incremental={ + 'updated_at_column': 'updated_at' + }, + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:incremental', + 'inc:type:inline-sql', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} + +with base as ( + select * + from {{ ref('events_base.ff') }} +) +select + event_id, + updated_at, + value +from base +{% if is_incremental() %} +where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} +) +{% endif %}; diff --git a/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql b/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql new file 
mode 100644 index 0000000..2457d40 --- /dev/null +++ b/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql @@ -0,0 +1,22 @@ +{{ config( + materialized='incremental', + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:incremental', + 'inc:type:yaml-config', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} + +with base as ( + select * + from {{ ref('events_base.ff') }} +) +select + event_id, + updated_at, + value +from base; diff --git a/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py new file mode 100644 index 0000000..cd414cd --- /dev/null +++ b/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py @@ -0,0 +1,28 @@ +from fastflowtransform import engine_model +from pyspark.sql import DataFrame as SparkDataFrame +from pyspark.sql import functions as F + + +@engine_model( + only="databricks_spark", + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:databricks_spark", + "kind:python", + "kind:incremental", + ], +) +def build(events_df: SparkDataFrame) -> SparkDataFrame: + """ + Python-Incremental-Beispiel (Databricks Spark). + + Auch hier: + - Build-Snapshot im Python-Model + - Merge/Delta wird über Konfiguration gesteuert. 
+ """ + return events_df.withColumn("value_x10", F.col("value") * F.lit(10)).select( + "event_id", "updated_at", "value", "value_x10" + ) diff --git a/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py new file mode 100644 index 0000000..d1deb9e --- /dev/null +++ b/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py @@ -0,0 +1,29 @@ +from fastflowtransform import engine_model +import pandas as pd + + +@engine_model( + only="duckdb", + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:duckdb", + "kind:python", + "kind:incremental", + ], +) +def build(events_df: pd.DataFrame) -> pd.DataFrame: + """ + Python-Incremental-Beispiel (DuckDB). + + Dieses Modell baut immer einen vollständigen Snapshot: + - ggf. leichte Transformation + Die eigentliche Incremental-Logik (Merge per unique_key, Delta-Spalten etc.) + kommt aus project.yml → models.incremental.fct_events_py_incremental.ff. 
+ """ + # kleine Beispiel-Transformation: value * 10 + df = events_df.copy() + df["value_x10"] = df["value"] * 10 + return df[["event_id", "updated_at", "value", "value_x10"]] diff --git a/examples/incremental_demo/models/engines/postgres/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/postgres/fct_events_py_incremental.ff.py new file mode 100644 index 0000000..3e4661b --- /dev/null +++ b/examples/incremental_demo/models/engines/postgres/fct_events_py_incremental.ff.py @@ -0,0 +1,25 @@ +from fastflowtransform import engine_model +import pandas as pd + + +@engine_model( + only="postgres", + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:postgres", + "kind:python", + "kind:incremental", + ], +) +def build(events_df: pd.DataFrame) -> pd.DataFrame: + """ + Python-Incremental-Beispiel (Postgres). + + Gleiche Semantik wie bei DuckDB; Snapshot out, Incremental-Logik in der Engine. + """ + df = events_df.copy() + df["value_x10"] = df["value"] * 10 + return df[["event_id", "updated_at", "value", "value_x10"]] diff --git a/examples/incremental_demo/profiles.yml b/examples/incremental_demo/profiles.yml new file mode 100644 index 0000000..a982986 --- /dev/null +++ b/examples/incremental_demo/profiles.yml @@ -0,0 +1,67 @@ +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/incremental_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_databricks_delta: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'incremental_demo') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + extra_conf: + spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" + 
spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" + spark.hadoop.datanucleus.schema.autoCreateAll: "true" + spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_databricks_iceberg: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'incremental_demo_iceberg') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse_iceberg" + table_format: "iceberg" + extra_conf: + spark.jars.packages: "org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.0" + # minimal local Hadoop catalog example: + spark.sql.catalog.iceberg: "org.apache.iceberg.spark.SparkCatalog" + spark.sql.catalog.iceberg.type: "hadoop" + spark.sql.catalog.iceberg.warehouse: "file://{{ project_dir() }}/.local/iceberg_warehouse" + spark.sql.catalog.iceberg.write.metadata.version-hint.enabled: "false" + spark.sql.catalog.iceberg.read.metadata.version-hint.enabled: "false" + +dev_databricks_hudi: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'incremental_demo_hudi') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse_hudi" + + # Tell DatabricksSparkExecutor / get_spark_format_handler to use HudiFormatHandler + table_format: "hudi" + + # Default Hudi options – overridden per model via models.storage + table_options: + hoodie.datasource.write.recordkey.field: "id" + hoodie.datasource.write.precombine.field: "updated_at" + hoodie.table.name: "fct_events_hudi" + + extra_conf: + spark.jars.packages: "org.apache.hudi:hudi-spark4.0-bundle_2.13:1.0.0" + + # Core Hudi Spark wiring + spark.serializer: "org.apache.spark.serializer.KryoSerializer" + spark.sql.extensions: 
"org.apache.spark.sql.hudi.HoodieSparkSessionExtension" + spark.sql.catalog.spark_catalog: "org.apache.spark.sql.hudi.catalog.HoodieCatalog" + spark.kryo.registrator: "org.apache.spark.HoodieSparkKryoRegistrar" + + # Keep warehouse on disk under the project (matches warehouse_dir) + spark.sql.warehouse.dir: "file://{{ project_dir() }}/.local/spark_warehouse_hudi" diff --git a/examples/incremental_demo/project.yml b/examples/incremental_demo/project.yml new file mode 100644 index 0000000..aaa5a8b --- /dev/null +++ b/examples/incremental_demo/project.yml @@ -0,0 +1,60 @@ +name: incremental_demo +version: "0.1" + +vars: {} + +models: + storage: + events_base.ff: + path: ".local/spark/events_base" + # format: parquet + + fct_events_sql_inline.ff: + path: ".local/spark/fct_events_sql_inline" + # format: parquet + + fct_events_sql_yaml.ff: + path: ".local/spark/fct_events_sql_yaml" + # format: parquet + + fct_events_py_incremental.ff: + path: ".local/spark/fct_events_py_incremental" + # format: parquet + + incremental: + fct_events_sql_inline.ff: + unique_key: "event_id" + + fct_events_sql_yaml.ff: + unique_key: "event_id" + + fct_events_sql_yaml.ff: + unique_key: "event_id" # top-level shortcut + incremental: + enabled: true + updated_at_column: "updated_at" + + fct_events_sql_inline_delta.ff: + unique_key: "event_id" + +seeds: + storage: + seed_events: + path: ".local/spark/seed_events" + # format: parquet + +tests: + - type: not_null + table: fct_events_sql_inline + column: event_id + tags: [incremental] + + - type: not_null + table: fct_events_sql_yaml + column: event_id + tags: [incremental] + + - type: not_null + table: fct_events_py_incremental + column: event_id + tags: [incremental] diff --git a/examples/incremental_demo/seeds/README.md b/examples/incremental_demo/seeds/README.md new file mode 100644 index 0000000..2e553ed --- /dev/null +++ b/examples/incremental_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory + +Add CSV or Parquet files for reproducible 
seeds. +Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. diff --git a/examples/incremental_demo/seeds/seed_events.csv b/examples/incremental_demo/seeds/seed_events.csv new file mode 100644 index 0000000..7454236 --- /dev/null +++ b/examples/incremental_demo/seeds/seed_events.csv @@ -0,0 +1,4 @@ +event_id,updated_at,value +1,2024-01-01T00:00:00,10 +2,2024-01-02T00:00:00,20 +3,2024-01-03T00:00:00,30 diff --git a/examples/incremental_demo/site/dag/events_base.ff.html b/examples/incremental_demo/site/dag/events_base.ff.html new file mode 100644 index 0000000..87c952c --- /dev/null +++ b/examples/incremental_demo/site/dag/events_base.ff.html @@ -0,0 +1,150 @@ + + + + + + events_base.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ events_base.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
events_base
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/incremental_demo/models/common/events_base.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+ + +
+
+ + + + + + +
+ + + + \ No newline at end of file diff --git a/examples/postgres/site/dag/mart_orders_enriched.html b/examples/incremental_demo/site/dag/fct_events_py_incremental.html similarity index 94% rename from examples/postgres/site/dag/mart_orders_enriched.html rename to examples/incremental_demo/site/dag/fct_events_py_incremental.html index 293aee3..bb33bf9 100644 --- a/examples/postgres/site/dag/mart_orders_enriched.html +++ b/examples/incremental_demo/site/dag/fct_events_py_incremental.html @@ -3,7 +3,7 @@ - mart_orders_enriched – FastFlowTransform + fct_events_py_incremental – FastFlowTransform + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + events_base_ff["events_base.ff
(events_base)"] + class events_base_ff sql; + fct_events_py_incremental("fct_events_py_incremental
(fct_events_py_incremental)") + class fct_events_py_incremental py; + fct_events_sql_inline_ff["fct_events_sql_inline.ff
(fct_events_sql_inline)"] + class fct_events_sql_inline_ff sql; + fct_events_sql_yaml_ff["fct_events_sql_yaml.ff
(fct_events_sql_yaml)"] + class fct_events_sql_yaml_ff sql; + events_base_ff --> fct_events_sql_inline_ff + events_base_ff --> fct_events_sql_yaml_ff + events_base_ff --> fct_events_py_incremental +
+
+ + + +
+

Macros

+ +

No macros found.

+ +
+
+ + + + \ No newline at end of file diff --git a/examples/incremental_demo/sources.yml b/examples/incremental_demo/sources.yml new file mode 100644 index 0000000..6715df2 --- /dev/null +++ b/examples/incremental_demo/sources.yml @@ -0,0 +1,7 @@ +version: 2 + +sources: + - name: raw + tables: + - name: events + identifier: seed_events diff --git a/examples/incremental_demo/tests/unit/README.md b/examples/incremental_demo/tests/unit/README.md new file mode 100644 index 0000000..b3c3c8d --- /dev/null +++ b/examples/incremental_demo/tests/unit/README.md @@ -0,0 +1,4 @@ +# Unit tests + +Define YAML unit specs as described in docs/Config_and_Macros.md#73-model-unit-tests-fft-utest. +Invoke them with `fft utest --env `. diff --git a/examples/macros_demo/.env.dev_databricks b/examples/macros_demo/.env.dev_databricks new file mode 100644 index 0000000..45685d8 --- /dev/null +++ b/examples/macros_demo/.env.dev_databricks @@ -0,0 +1,6 @@ +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=macros_demo +FF_DBR_ENABLE_HIVE=1 +FF_DBR_DATABASE=macros_demo +# FF_DBR_TABLE_FORMAT=delta +JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/macros_demo/.env.dev_duckdb b/examples/macros_demo/.env.dev_duckdb new file mode 100644 index 0000000..cda9cb7 --- /dev/null +++ b/examples/macros_demo/.env.dev_duckdb @@ -0,0 +1,2 @@ +FF_DUCKDB_PATH=.local/macros_demo.duckdb +FF_DUCKDB_SCHEMA=macros_demo diff --git a/examples/macros_demo/.env.dev_postgres b/examples/macros_demo/.env.dev_postgres new file mode 100644 index 0000000..6ed9548 --- /dev/null +++ b/examples/macros_demo/.env.dev_postgres @@ -0,0 +1,2 @@ +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=macros_demo diff --git a/examples/macros_demo/Makefile b/examples/macros_demo/Makefile new file mode 100644 index 0000000..b5cd9bc --- /dev/null +++ b/examples/macros_demo/Makefile @@ -0,0 +1,77 @@ +.PHONY: demo seed run dag test artifacts clean demo-open + +DB ?= .local/macros_demo.duckdb +PROJECT ?= . 
+UV ?= uv +ENGINE ?= duckdb + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + OPENER := open +else + OPENER := xdg-open +endif + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + PROFILE_ENV = dev_databricks + ENGINE_TAG = engine:databricks_spark +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +RUN_ENV = $(BASE_ENV) +SELECT_FLAGS = --select tag:example:macros_demo --select tag:$(ENGINE_TAG) + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" --duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) +endif + +seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) + +run: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +test: + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + +clean: + $(CLEAN_CMD) + +demo-open: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + else \ + echo 
"No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +demo: clean + @echo "== 🧩 Macros Demo ($(ENGINE)) ==" + +$(MAKE) seed + +$(MAKE) run + +$(MAKE) dag + +$(MAKE) test + +$(MAKE) artifacts diff --git a/examples/macros_demo/README.md b/examples/macros_demo/README.md new file mode 100644 index 0000000..5e977f7 --- /dev/null +++ b/examples/macros_demo/README.md @@ -0,0 +1,7 @@ +# FastFlowTransform project scaffold + +This project was created with `fft init`. +Next steps: +1. Update `profiles.yml` with real connection details (docs/Profiles.md). +2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). +3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). diff --git a/examples/macros_demo/models/README.md b/examples/macros_demo/models/README.md new file mode 100644 index 0000000..32818bb --- /dev/null +++ b/examples/macros_demo/models/README.md @@ -0,0 +1,4 @@ +# Models directory + +Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here. +See docs/Config_and_Macros.md for modeling guidance and config options. 
diff --git a/examples/macros_demo/models/common/dim_users.ff.sql b/examples/macros_demo/models/common/dim_users.ff.sql new file mode 100644 index 0000000..7769d2d --- /dev/null +++ b/examples/macros_demo/models/common/dim_users.ff.sql @@ -0,0 +1,26 @@ +{{ config( + materialized='table', + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] +) }} + +with u as ( + select * from {{ ref('stg_users.ff') }} +), +labels as ( + -- Tiny lookup generated by Python render macro + select * from (values {{ csv_values( + [ + {"domain":"example.com", "label":"internal"}, + {"domain":"gmail.com", "label":"consumer"}, + ], + ["domain","label"] + ) }}) as t(domain, label) +) +select + u.user_id, + u.email, + u.email_domain, + u.country, + l.label as user_segment +from u +left join labels l on l.domain = u.email_domain; diff --git a/examples/macros_demo/models/common/fct_user_sales.ff.sql b/examples/macros_demo/models/common/fct_user_sales.ff.sql new file mode 100644 index 0000000..1f3de13 --- /dev/null +++ b/examples/macros_demo/models/common/fct_user_sales.ff.sql @@ -0,0 +1,21 @@ +{{ config( + materialized='table', + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] +) }} + +with o as ( + select * from {{ ref('stg_orders.ff') }} +), +u as ( + select * from {{ ref('dim_users.ff') }} +) +select + u.user_id, + u.user_segment, + count(*) as order_count, + sum(o.amount) as total_amount, + min(o.order_ts) as first_order_ts, + max(o.order_ts) as last_order_ts +from o +join u on u.user_id = o.user_id +group by u.user_id, u.user_segment; diff --git a/examples/macros_demo/models/common/stg_orders.ff.sql b/examples/macros_demo/models/common/stg_orders.ff.sql new file mode 100644 index 0000000..b92e9b1 --- /dev/null +++ b/examples/macros_demo/models/common/stg_orders.ff.sql @@ -0,0 +1,11 @@ +{{ config( + materialized='view', + tags=['example:macros_demo', 'scope:common', 
'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] +) }} + +select + cast(order_id as int) as order_id, + cast(customer_id as int) as user_id, + {{ safe_cast_amount("amount") }} as amount, + cast(order_ts as timestamp) as order_ts +from {{ source('sales', 'orders') }}; diff --git a/examples/macros_demo/models/common/stg_users.ff.sql b/examples/macros_demo/models/common/stg_users.ff.sql new file mode 100644 index 0000000..4a14e62 --- /dev/null +++ b/examples/macros_demo/models/common/stg_users.ff.sql @@ -0,0 +1,20 @@ +{{ config( + materialized='view', + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] +) }} + +with src as ( + select + cast(id as int) as user_id, + lower(email) as email, + {{ coalesce_any("country", default_country()) }} as country + from {{ source('crm', 'users') }} +) +select + user_id, + email, + {{ email_domain("email") }} as email_domain, + country, + -- Render-time Python macro usage (literal in SQL) + '{{ slugify(var("site_name", "My Site")) }}' as site_slug +from src; diff --git a/examples/macros_demo/models/engines/duckdb/py_exmaple.ff.py b/examples/macros_demo/models/engines/duckdb/py_exmaple.ff.py new file mode 100644 index 0000000..865dd8b --- /dev/null +++ b/examples/macros_demo/models/engines/duckdb/py_exmaple.ff.py @@ -0,0 +1,13 @@ +from fastflowtransform import engine_model +import pandas as pd + + +@engine_model( + only="duckdb", + name="py_example", + deps=["fct_user_sales.ff"], + tags=["example:macros_demo", "scope:engine", "engine:duckdb"], +) +def produce(_: pd.DataFrame) -> pd.DataFrame: + # In a real project, you might fetch extra metadata here or post-process + return pd.DataFrame([{"note": "Python model ran on DuckDB"}]) diff --git a/examples/macros_demo/models/macros/star.sql b/examples/macros_demo/models/macros/star.sql new file mode 100644 index 0000000..bd74e7c --- /dev/null +++ b/examples/macros_demo/models/macros/star.sql @@ -0,0 +1,12 @@ +{# Select 
* except some columns. Works across engines. #} +{%- macro star_except(relation, exclude_cols) -%} +{%- set excl = exclude_cols | map('lower') | list -%} +{%- set cols = adapter_columns(relation) -%} +{# adapter_columns is provided by FFT executors' catalog/describe (if available). + To keep demo simple, fall back to literal star if unknown. #} +{%- if cols and cols|length > 0 -%} + {{- (cols | reject('in', excl) | map('string') | join(', ')) -}} +{%- else -%} + * +{%- endif -%} +{%- endmacro -%} diff --git a/examples/macros_demo/models/macros/utils.sql b/examples/macros_demo/models/macros/utils.sql new file mode 100644 index 0000000..e72debe --- /dev/null +++ b/examples/macros_demo/models/macros/utils.sql @@ -0,0 +1,25 @@ +{# Reusable SQL helpers #} + +{%- macro email_domain(expr) -%} + lower(split_part({{ expr }}, '@', 2)) +{%- endmacro -%} + +{%- macro safe_cast_amount(expr) -%} +{# engine-aware numeric type #} +{%- set e = engine('duckdb') -%} +{%- if e in ['duckdb', 'postgres'] -%} + cast({{ expr }} as double) +{%- elif e == 'databricks_spark' -%} + cast({{ expr }} as double) +{%- else -%} + cast({{ expr }} as double) +{%- endif -%} +{%- endmacro -%} + +{%- macro coalesce_any(expr, default) -%} + coalesce({{ expr }}, {{ default }}) +{%- endmacro -%} + +{%- macro default_country() -%} + '{{ var("default_country", "DE") }}' +{%- endmacro -%} diff --git a/examples/macros_demo/models/macros_py/helpers.py b/examples/macros_demo/models/macros_py/helpers.py new file mode 100644 index 0000000..4780b2e --- /dev/null +++ b/examples/macros_demo/models/macros_py/helpers.py @@ -0,0 +1,44 @@ +""" +Python macros: exposed as Jinja globals & filters by FFT core. +They run at *render time* (not as SQL UDFs). 
+""" + +import re +from typing import Any + + +def slugify(value: str) -> str: + """Make a URL-friendly slug at render time.""" + value = value.strip().lower() + value = re.sub(r"[^a-z0-9]+", "-", value) + return re.sub(r"-{2,}", "-", value).strip("-") + + +def mask_email(email: str) -> str: + """Redact local part of an email (render-time).""" + if "@" not in email: + return email + local, domain = email.split("@", 1) + if not local: + return email + return f"{local[0]}***@{domain}" + + +def csv_values(rows: list[dict[str, Any]], cols: list[str]) -> str: + """ + Produce a SQL VALUES(...) list for small lookup tables at render time. + Example: csv_values([{'k':1,'v':'x'}], ['k','v']) -> "(1, 'x')" + """ + + def lit(v): + if v is None: + return "NULL" + if isinstance(v, (int, float)): + return str(v) + s = str(v).replace("'", "''") + return f"'{s}'" + + tuples = [] + for row in rows: + tuples.append("(" + ", ".join(lit(row.get(c)) for c in cols) + ")") + return ", ".join(tuples) diff --git a/examples/macros_demo/profiles.yml b/examples/macros_demo/profiles.yml new file mode 100644 index 0000000..143ee7a --- /dev/null +++ b/examples/macros_demo/profiles.yml @@ -0,0 +1,23 @@ +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/macros_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_databricks: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'macros_demo') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + extra_conf: + spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" + spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" + spark.hadoop.datanucleus.schema.autoCreateAll: "true" + 
spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" diff --git a/examples/macros_demo/project.yml b/examples/macros_demo/project.yml new file mode 100644 index 0000000..16c6435 --- /dev/null +++ b/examples/macros_demo/project.yml @@ -0,0 +1,40 @@ +name: macros_demo +version: "0.1" + +vars: + # used by macros and examples + default_country: "DE" + +models: + storage: + stg_users: + path: ".local/spark/stg_users" + format: parquet + stg_orders: + path: ".local/spark/stg_orders" + format: parquet + dim_users.ff: + path: ".local/spark/dim_users" + format: parquet + fct_user_sales.ff: + path: ".local/spark/fct_user_sales" + format: parquet + +seeds: + storage: + seed_users: + path: ".local/spark/seed_users" + format: parquet + seed_orders: + path: ".local/spark/seed_orders" + format: parquet + +tests: + - type: not_null + table: dim_users + column: user_id + tags: [batch] + - type: row_count_between + table: fct_user_sales + min_rows: 1 + tags: [batch] diff --git a/examples/macros_demo/seeds/README.md b/examples/macros_demo/seeds/README.md new file mode 100644 index 0000000..2e553ed --- /dev/null +++ b/examples/macros_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory + +Add CSV or Parquet files for reproducible seeds. +Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. 
diff --git a/examples/macros_demo/seeds/seed_orders.csv b/examples/macros_demo/seeds/seed_orders.csv new file mode 100644 index 0000000..5c50f6a --- /dev/null +++ b/examples/macros_demo/seeds/seed_orders.csv @@ -0,0 +1,4 @@ +order_id,customer_id,amount,order_ts +10,1,12.5,2025-10-01T12:00:00 +11,1,9.9,2025-10-05T09:00:00 +12,2,20.0,2025-10-07T18:15:00 diff --git a/examples/macros_demo/seeds/seed_users.csv b/examples/macros_demo/seeds/seed_users.csv new file mode 100644 index 0000000..515a678 --- /dev/null +++ b/examples/macros_demo/seeds/seed_users.csv @@ -0,0 +1,4 @@ +id,email,country +1,a@example.com,DE +2,b@gmail.com,AT +3,c@gmail.com, diff --git a/examples/postgres/site/dag/users_enriched.html b/examples/macros_demo/site/dag/dim_users.ff.html similarity index 93% rename from examples/postgres/site/dag/users_enriched.html rename to examples/macros_demo/site/dag/dim_users.ff.html index d30ac1e..724aa2c 100644 --- a/examples/postgres/site/dag/users_enriched.html +++ b/examples/macros_demo/site/dag/dim_users.ff.html @@ -3,7 +3,7 @@ - users_enriched – FastFlowTransform + dim_users.ff – FastFlowTransform + + +

← Back to overview

+ +
+
+

+ fct_user_sales.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
fct_user_sales
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/macros_demo/models/common/fct_user_sales.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + + + +
+ + + + \ No newline at end of file diff --git a/examples/simple_duckdb/site/dag/index.html b/examples/macros_demo/site/dag/index.html similarity index 70% rename from examples/simple_duckdb/site/dag/index.html rename to examples/macros_demo/site/dag/index.html index 0402a3a..9d3874d 100644 --- a/examples/simple_duckdb/site/dag/index.html +++ b/examples/macros_demo/site/dag/index.html @@ -124,30 +124,20 @@

DAG

flowchart TD classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - ephemeral_ids_ff["ephemeral_ids.ff
(ephemeral_ids)"] - class ephemeral_ids_ff sql; - mart_orders_enriched("mart_orders_enriched
(mart_orders_enriched)") - class mart_orders_enriched py; - mart_users_ff["mart_users.ff
(mart_users)"] - class mart_users_ff sql; - orders_ff["orders.ff
(orders)"] - class orders_ff sql; - users_ff["users.ff
(users)"] - class users_ff sql; - users_enriched("users_enriched
(users_enriched)") - class users_enriched py; - v_users_ff["v_users.ff
(v_users)"] - class v_users_ff sql; - v_users_enriched_ff["v_users_enriched.ff
(v_users_enriched)"] - class v_users_enriched_ff sql; - users_enriched --> v_users_enriched_ff - users_enriched --> mart_users_ff - users_ff --> ephemeral_ids_ff - users_ff --> v_users_ff - ephemeral_ids_ff --> v_users_ff - users_ff --> users_enriched - orders_ff --> mart_orders_enriched - users_enriched --> mart_orders_enriched + dim_users_ff["dim_users.ff
(dim_users)"] + class dim_users_ff sql; + fct_user_sales_ff["fct_user_sales.ff
(fct_user_sales)"] + class fct_user_sales_ff sql; + py_example("py_example
(py_example)") + class py_example py; + stg_orders_ff["stg_orders.ff
(stg_orders)"] + class stg_orders_ff sql; + stg_users_ff["stg_users.ff
(stg_users)"] + class stg_users_ff sql; + stg_orders_ff --> fct_user_sales_ff + dim_users_ff --> fct_user_sales_ff + stg_users_ff --> dim_users_ff + fct_user_sales_ff --> py_example
@@ -168,72 +158,15 @@

Models

- - ephemeral_ids.ff + + dim_users.ff sql - ephemeral_ids - users.ff - - - ephemeral - - - - - - - - - mart_orders_enriched - - - - python - mart_orders_enriched - orders.ff, users_enriched - - - table - - - - - - - - - mart_users.ff - - - - sql - mart_users - users_enriched - - - table - - - - - - - - - orders.ff - - - - sql - orders - – + dim_users + stg_users.ff table @@ -244,19 +177,15 @@

Models

- - users.ff + + fct_user_sales.ff - - Raw users table imported from CRM. - - sql - users - – + fct_user_sales + stg_orders.ff, dim_users.ff table @@ -267,19 +196,15 @@

Models

- - users_enriched + + py_example - - Adds gmail flag. - - python - users_enriched - users.ff + py_example + fct_user_sales.ff table @@ -290,15 +215,15 @@

Models

- - v_users.ff + + stg_orders.ff sql - v_users - users.ff, ephemeral_ids.ff + stg_orders + – view @@ -309,15 +234,15 @@

Models

- - v_users_enriched.ff + + stg_users.ff sql - v_users_enriched - users_enriched + stg_users + – view @@ -343,43 +268,93 @@

Macros

- sql_email_domain + Any + + + python + + + models/macros_py/helpers.py + + + + csv_values + + + python + + + models/macros_py/helpers.py + + + + mask_email + + + python + + + models/macros_py/helpers.py + + + + slugify python - models/macros_py/sql_helpers.py + models/macros_py/helpers.py + + + + coalesce_any + + + sql + + + models/macros/utils.sql + + + + default_country + + + sql + + + models/macros/utils.sql - nz + email_domain sql - models/macros/util.sql + models/macros/utils.sql - on_or_before + safe_cast_amount sql - models/macros/util.sql + models/macros/utils.sql - upper_col + star_except sql - models/macros/util.sql + models/macros/star.sql diff --git a/examples/simple_duckdb/site/dag/mart_users.ff.html b/examples/macros_demo/site/dag/py_example.html similarity index 95% rename from examples/simple_duckdb/site/dag/mart_users.ff.html rename to examples/macros_demo/site/dag/py_example.html index e05750f..be23140 100644 --- a/examples/simple_duckdb/site/dag/mart_users.ff.html +++ b/examples/macros_demo/site/dag/py_example.html @@ -3,7 +3,7 @@ - mart_users.ff – FastFlowTransform + py_example – FastFlowTransform + + +

← Back to overview

+ +
+
+

+ dim_customers.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
dim_customers
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/dim_customers.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
customer_idint + + yes + + + + — + + + + + ?.customer_id + + direct + + + + +
customer_namestring + + yes + + + + — + + + + + ?.customer_name + + direct + + + + +
customer_statusstring + + yes + + + + — + + + + + ?.customer_status + + direct + + + + +
loaded_attimestamp + + yes + + + + — + + + + + ?.current_timestamp + + direct + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/fct_orders_inc.ff.html b/examples/materializations_demo/site/dag/fct_orders_inc.ff.html new file mode 100644 index 0000000..7897e80 --- /dev/null +++ b/examples/materializations_demo/site/dag/fct_orders_inc.ff.html @@ -0,0 +1,270 @@ + + + + + + fct_orders_inc.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ fct_orders_inc.ff + incremental +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
incremental
+ +
Relation
+
fct_orders_inc
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/fct_orders_inc.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
order_idint + + yes + + + + — + + + + + ?.order_id + + direct + + + + +
customer_idint + + yes + + + + — + + + + + ?.customer_id + + direct + + + + +
amountdouble + + yes + + + + — + + + + + ?.amount + + direct + + + + +
order_tstimestamp + + yes + + + + — + + + + + ?.order_ts + + direct + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/index.html b/examples/materializations_demo/site/dag/index.html new file mode 100644 index 0000000..a630c86 --- /dev/null +++ b/examples/materializations_demo/site/dag/index.html @@ -0,0 +1,358 @@ + + + + + + FastFlowTransform - DAG & Mini Docs + + + + + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + demo_py_emit("demo_py_emit
(demo_py_emit)") + class demo_py_emit py; + dim_customers_ff["dim_customers.ff
(dim_customers)"] + class dim_customers_ff sql; + fct_orders_inc_ff["fct_orders_inc.ff
(fct_orders_inc)"] + class fct_orders_inc_ff sql; + mart_order_summary_ff["mart_order_summary.ff
(mart_order_summary)"] + class mart_order_summary_ff sql; + order_flags_ephemeral_ff["order_flags_ephemeral.ff
(order_flags_ephemeral)"] + class order_flags_ephemeral_ff sql; + stg_customers_ff["stg_customers.ff
(stg_customers)"] + class stg_customers_ff sql; + stg_orders_ff["stg_orders.ff
(stg_orders)"] + class stg_orders_ff sql; + stg_orders_ff --> order_flags_ephemeral_ff + stg_orders_ff --> mart_order_summary_ff + order_flags_ephemeral_ff --> mart_order_summary_ff + stg_customers_ff --> mart_order_summary_ff + stg_orders_ff --> fct_orders_inc_ff + stg_customers_ff --> dim_customers_ff + dim_customers_ff --> demo_py_emit +
+
+ + + +
+

Macros

+ + + + + + + + + + + + + + + + + + +
NameTypePath
dtype_double + + sql + + models/macros/types.sql.j2
+ +
+
+ + + + \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/mart_order_summary.ff.html b/examples/materializations_demo/site/dag/mart_order_summary.ff.html new file mode 100644 index 0000000..cf5e1b0 --- /dev/null +++ b/examples/materializations_demo/site/dag/mart_order_summary.ff.html @@ -0,0 +1,330 @@ + + + + + + mart_order_summary.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ mart_order_summary.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
mart_order_summary
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/mart_order_summary.ff.sql + +
+ +
Dependencies
+ + + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
customer_idint + + yes + + + + — + + + + unknown + +
customer_namestring + + yes + + + + — + + + + unknown + +
customer_statusstring + + yes + + + + — + + + + unknown + +
order_countbigint + + yes + + + + — + + + + unknown + +
big_order_countbigint + + yes + + + + — + + + + unknown + +
total_amountdouble + + yes + + + + — + + + + unknown + +
first_order_tstimestamp + + yes + + + + — + + + + unknown + +
last_order_tstimestamp + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/simple_duckdb/site/dag/ephemeral_ids.ff.html b/examples/materializations_demo/site/dag/order_flags_ephemeral.ff.html similarity index 93% rename from examples/simple_duckdb/site/dag/ephemeral_ids.ff.html rename to examples/materializations_demo/site/dag/order_flags_ephemeral.ff.html index 5ff5411..095150c 100644 --- a/examples/simple_duckdb/site/dag/ephemeral_ids.ff.html +++ b/examples/materializations_demo/site/dag/order_flags_ephemeral.ff.html @@ -3,7 +3,7 @@ - ephemeral_ids.ff – FastFlowTransform + order_flags_ephemeral.ff – FastFlowTransform + + +

← Back to overview

+ +
+
+

+ stg_orders.ff + view +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
view
+ +
Relation
+
stg_orders
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/stg_orders.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
order_idint + + yes + + + + — + + + + + ?.? + + transformed + + + + +
customer_idint + + yes + + + + — + + + + + ?.? + + transformed + + + + +
amountdouble + + yes + + + + — + + + + unknown + +
order_tstimestamp + + yes + + + + — + + + + + ?.? + + transformed + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/materializations_demo/sources.yml b/examples/materializations_demo/sources.yml new file mode 100644 index 0000000..1fa3552 --- /dev/null +++ b/examples/materializations_demo/sources.yml @@ -0,0 +1,9 @@ +version: 2 + +sources: + - name: demo + tables: + - name: customers + identifier: seed_customers # via `fft seed` + - name: orders + identifier: seed_orders # via `fft seed` diff --git a/examples/materializations_demo/tests/unit/README.md b/examples/materializations_demo/tests/unit/README.md new file mode 100644 index 0000000..b3c3c8d --- /dev/null +++ b/examples/materializations_demo/tests/unit/README.md @@ -0,0 +1,4 @@ +# Unit tests + +Define YAML unit specs as described in docs/Config_and_Macros.md#73-model-unit-tests-fft-utest. +Invoke them with `fft utest --env `. diff --git a/examples/postgres/.fastflowtransform/cache/stg-postgres.json b/examples/postgres/.fastflowtransform/cache/stg-postgres.json deleted file mode 100644 index 20a7672..0000000 --- a/examples/postgres/.fastflowtransform/cache/stg-postgres.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "engine": "postgres", - "entries": { - "mart_orders_enriched": "6599c52b248c143c13a9cf4daaab3b646685f10ba50cd9477e62791b3ae3071b", - "mart_users.ff": "68aa5a370f7fc55b669f87134e7ceaf959d3ee4d5a563e75ae83392709b085ce", - "orders.ff": "351176593a9a5e231aa860af35e1dab4e3d7070cc25cfd13a49e415214051358", - "users.ff": "78ff9f7baa9b7b5617dbb3081fdb8a61d19a637ad6a8874862a91051b31ef646", - "users_enriched": "bbbee5bef3591988a2d6cb27fd561ac6d6f719f74dec668903e28e74e85fbd63" - }, - "profile": "stg", - "version": 1 -} \ No newline at end of file diff --git a/examples/postgres/.fastflowtransform/target/catalog.json b/examples/postgres/.fastflowtransform/target/catalog.json deleted file mode 100644 index 9bf6fca..0000000 --- a/examples/postgres/.fastflowtransform/target/catalog.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "metadata": { - "generated_at": 
"2025-10-31T16:46:28+00:00", - "tool": "fastflowtransform" - }, - "relations": { - "mart_orders_enriched": { - "columns": [] - }, - "mart_users": { - "columns": [] - }, - "orders": { - "columns": [] - }, - "users": { - "columns": [] - }, - "users_enriched": { - "columns": [] - } - } -} diff --git a/examples/postgres/.fastflowtransform/target/manifest.json b/examples/postgres/.fastflowtransform/target/manifest.json deleted file mode 100644 index e41b5dc..0000000 --- a/examples/postgres/.fastflowtransform/target/manifest.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "macros": {}, - "metadata": { - "generated_at": "2025-10-31T16:46:28+00:00", - "tool": "fastflowtransform" - }, - "nodes": { - "mart_orders_enriched": { - "deps": [ - "orders.ff", - "users_enriched" - ], - "kind": "python", - "materialized": "table", - "name": "mart_orders_enriched", - "path": "models/mart_orders_enriched.ff.py", - "relation": "mart_orders_enriched" - }, - "mart_users.ff": { - "deps": [ - "users_enriched" - ], - "kind": "sql", - "materialized": "table", - "name": "mart_users.ff", - "path": "models/mart_users.ff.sql", - "relation": "mart_users" - }, - "orders.ff": { - "deps": [], - "kind": "sql", - "materialized": "table", - "name": "orders.ff", - "path": "models/orders.ff.sql", - "relation": "orders" - }, - "users.ff": { - "deps": [], - "kind": "sql", - "materialized": "table", - "name": "users.ff", - "path": "models/users.ff.sql", - "relation": "users" - }, - "users_enriched": { - "deps": [ - "users.ff" - ], - "kind": "python", - "materialized": "table", - "name": "users_enriched", - "path": "models/users_enrich.ff.py", - "relation": "users_enriched" - } - }, - "sources": { - "crm": { - "orders": { - "base": { - "catalog": null, - "database": null, - "dataset": null, - "format": null, - "identifier": "seed_orders", - "location": null, - "options": {}, - "project": null, - "schema": null - }, - "overrides": {} - }, - "users": { - "base": { - "catalog": null, - "database": null, - "dataset": 
null, - "format": null, - "identifier": "seed_users", - "location": null, - "options": {}, - "project": null, - "schema": null - }, - "overrides": {} - } - } - } -} diff --git a/examples/postgres/.fastflowtransform/target/run_results.json b/examples/postgres/.fastflowtransform/target/run_results.json deleted file mode 100644 index 07e2a05..0000000 --- a/examples/postgres/.fastflowtransform/target/run_results.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "metadata": { - "generated_at": "2025-10-31T16:46:28+00:00", - "tool": "fastflowtransform" - }, - "results": [ - { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:28+00:00", - "http": null, - "message": null, - "name": "mart_orders_enriched", - "started_at": "2025-10-31T16:46:28+00:00", - "status": "success" - }, - { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:28+00:00", - "http": null, - "message": null, - "name": "mart_users.ff", - "started_at": "2025-10-31T16:46:28+00:00", - "status": "success" - }, - { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:28+00:00", - "http": null, - "message": null, - "name": "orders.ff", - "started_at": "2025-10-31T16:46:28+00:00", - "status": "success" - }, - { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:28+00:00", - "http": null, - "message": null, - "name": "users.ff", - "started_at": "2025-10-31T16:46:28+00:00", - "status": "success" - }, - { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:28+00:00", - "http": null, - "message": null, - "name": "users_enriched", - "started_at": "2025-10-31T16:46:28+00:00", - "status": "success" - } - ], - "run_finished_at": "2025-10-31T16:46:28+00:00", - "run_started_at": "2025-10-31T16:46:28+00:00" -} diff --git a/examples/postgres/Makefile b/examples/postgres/Makefile deleted file mode 100644 index f1982d9..0000000 --- a/examples/postgres/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -.PHONY: up down seed run dag test - -PG_DSN?=postgresql+psycopg://postgres:postgres@localhost:5432/ffdb - -up: - docker run --rm 
-d --name ff-pg -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=ffdb -p 5432:5432 postgres:16 - -down: - -docker stop ff-pg - -# seed: -# psql postgresql://postgres:postgres@localhost:5432/ffdb -c "create table if not exists seed_users(id int primary key, email text);" -# psql postgresql://postgres:postgres@localhost:5432/ffdb -c "insert into seed_users values (1,'a@example.com') on conflict do nothing;" -# psql postgresql://postgres:postgres@localhost:5432/ffdb -c "insert into seed_users values (2,'b@gmail.com') on conflict do nothing;" -# psql postgresql://postgres:postgres@localhost:5432/ffdb -c "insert into seed_users values (3,'c@gmail.com') on conflict do nothing;" - -seed: - FF_ENGINE=postgres FF_PG_DSN="$(PG_DSN)" fft seed "$(PROJECT)" --env stg - -run: - FF_ENGINE=postgres FF_PG_DSN=$(PG_DSN) fft run examples/postgres --env stg - -dag: - FF_ENGINE=postgres FF_PG_DSN=$(PG_DSN) fft dag examples/postgres --env stg --html - -test: - FF_ENGINE=postgres FF_PG_DSN=$(PG_DSN) fft test examples/postgres --env stg --select batch diff --git a/examples/postgres/models/mart_orders_enriched.ff.py b/examples/postgres/models/mart_orders_enriched.ff.py deleted file mode 100644 index 7ff6aec..0000000 --- a/examples/postgres/models/mart_orders_enriched.ff.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd - -from fastflowtransform import model - - -@model( - name="mart_orders_enriched", - deps=["orders.ff", "users_enriched"], - require={ - "orders.ff": ["order_id", "user_id", "amount"], # logical name works - "users_enriched": ["id", "email", "is_gmail"], # physical relation works too - }, -) -def build(dfs: dict[str, pd.DataFrame]) -> pd.DataFrame: - orders = dfs["orders"] - users = dfs["users_enriched"] - - out = ( - orders.merge(users, left_on="user_id", right_on="id", how="left") - .assign(valid_amt=lambda x: x["amount"].fillna(0).ge(0)) - .loc[:, ["order_id", "user_id", "email", "is_gmail", "amount", "valid_amt"]] - ) - return out diff --git 
a/examples/postgres/models/mart_users.ff.sql b/examples/postgres/models/mart_users.ff.sql deleted file mode 100644 index 8e3b79f..0000000 --- a/examples/postgres/models/mart_users.ff.sql +++ /dev/null @@ -1,3 +0,0 @@ -create or replace table mart_users as -select id, email, is_gmail -from {{ ref('users_enriched') }}; diff --git a/examples/postgres/models/orders.ff.sql b/examples/postgres/models/orders.ff.sql deleted file mode 100644 index 33b7d89..0000000 --- a/examples/postgres/models/orders.ff.sql +++ /dev/null @@ -1,4 +0,0 @@ --- materialises a table "orders" -create or replace table orders as -select * -from {{ source('crm', 'orders') }}; diff --git a/examples/postgres/models/users.ff.sql b/examples/postgres/models/users.ff.sql deleted file mode 100644 index 71ff50f..0000000 --- a/examples/postgres/models/users.ff.sql +++ /dev/null @@ -1,3 +0,0 @@ -create or replace table users as -select id, email -from {{ source('crm','users') }}; diff --git a/examples/postgres/models/users_enrich.ff.py b/examples/postgres/models/users_enrich.ff.py deleted file mode 100644 index 6fd575c..0000000 --- a/examples/postgres/models/users_enrich.ff.py +++ /dev/null @@ -1,10 +0,0 @@ -import pandas as pd - -from fastflowtransform import model - - -@model(name="users_enriched", deps=["users.ff"], require={"users.ff": ["id", "email"]}) -def enrich(df: pd.DataFrame) -> pd.DataFrame: - out = df.copy() - out["is_gmail"] = out["email"].str.endswith("@gmail.com") - return out diff --git a/examples/postgres/profiles.yml b/examples/postgres/profiles.yml deleted file mode 100644 index ff52ab7..0000000 --- a/examples/postgres/profiles.yml +++ /dev/null @@ -1,5 +0,0 @@ -stg: - engine: postgres - postgres: - dsn: postgresql+psycopg://postgres:postgres@localhost:5432/ffdb - schema: public diff --git a/examples/postgres/project.yml b/examples/postgres/project.yml deleted file mode 100644 index 3b1f8eb..0000000 --- a/examples/postgres/project.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: example_postgres 
-version: 0.1 -models_dir: models - -docs: - dag_dir: site/dag - -tests: - - type: not_null - table: users - column: id - tags: [batch] - - type: unique - table: users - column: id - tags: [batch] - - # Cross-table reconciliations (FF-310) - - type: reconcile_equal - name: orders_count_equals_mart - tags: [reconcile] - left: { table: orders, expr: "count(*)" } - right: { table: mart_orders_enriched, expr: "count(*)" } - - - type: reconcile_coverage - name: orders_fully_covered_in_mart - tags: [reconcile] - source: { table: orders, key: "order_id" } - target: { table: mart_orders_enriched, key: "order_id" } diff --git a/examples/postgres/seeds/seed_orders.csv b/examples/postgres/seeds/seed_orders.csv deleted file mode 100644 index f9c5a47..0000000 --- a/examples/postgres/seeds/seed_orders.csv +++ /dev/null @@ -1,4 +0,0 @@ -order_id,user_id,amount -101,1,10.0 -102,2,9.9 -103,1, diff --git a/examples/postgres/seeds/seed_users.csv b/examples/postgres/seeds/seed_users.csv deleted file mode 100644 index 2acf25f..0000000 --- a/examples/postgres/seeds/seed_users.csv +++ /dev/null @@ -1,4 +0,0 @@ -id,email -1,a@example.com -2,b@gmail.com -3,c@gmail.com diff --git a/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json b/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json index 6826995..5bb2736 100644 --- a/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json +++ b/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json @@ -1,14 +1,14 @@ { "engine": "duckdb", "entries": { - "ephemeral_ids.ff": "f36221b3fb6961430bffa420c97ed12a5ae2e9b92ec39acd18e74086814e4868", - "mart_orders_enriched": "5b99b9c7cafe7ce175c64eccecd245ef72cd40093a63c892b2e9fcedb64d6e6f", - "mart_users.ff": "14a922bedfa7d2eaa3b7f6a8a2e9fbc624d8a29a8d14689197b917540964a74d", - "orders.ff": "ffbbec879b95932afaead84df0b5bad425ced5068168d955faa29a3b49c24306", - "users.ff": "e16d945f4def7ce8bcb110ef25cca35c30626e4de80faf7d02aa39adaf4fe759", - "users_enriched": 
"8e9d55b46133f51e2eae0d68a2c3ae8a4c787b5e562279245309b64773f3b44b", - "v_users.ff": "4ecdbcaee200abfc46568a666aad308ca206120f1f9f55b15619ddc3683fd4d5", - "v_users_enriched.ff": "e92842ccc13ae2c0c2f1d84616bc6de8b382896eafbde178f9a986ab531a8dbc" + "ephemeral_ids.ff": "56f5222b3e2469d71e63fb1f664ae43bf2740b5cd64413ed6cb66fa20189ecc5", + "mart_orders_enriched": "bb79bdd27cdd29755517a3b3a28282e27ebdb692b39557295d3f30dee6af45c3", + "mart_users.ff": "a1ac028ccbe1496f0c3d0d54f6f59af224fec0aae5e63cfb271bb62a8fb640ae", + "orders.ff": "d6c8111b8a3d35990f7ca2062e1988a61cd64862c64fc053ffd173197035db10", + "users.ff": "6c1fbf51b1d449282d68b7667923039c177b350219f5709bfa29c62ceb257069", + "users_enriched": "867768eaf110ff081e5b0e0fddb5eead4c0f27209fd40c29b5b0f99d20602608", + "v_users.ff": "63c4587f79f58540afe06b8841a5b4e2b4de5171586b4007a32486393a59c24e", + "v_users_enriched.ff": "e19645f66bb1652495b3608cbc4cb658e90a9f9662cf7f927b8d999b9bdfee2f" }, "profile": "dev", "version": 1 diff --git a/examples/simple_duckdb/.fastflowtransform/target/catalog.json b/examples/simple_duckdb/.fastflowtransform/target/catalog.json index 88b0fd6..267b6af 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/catalog.json +++ b/examples/simple_duckdb/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-31T16:46:22+00:00", + "generated_at": "2025-11-11T08:35:42+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/examples/simple_duckdb/.fastflowtransform/target/manifest.json b/examples/simple_duckdb/.fastflowtransform/target/manifest.json index c96a216..b1a7af4 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/manifest.json +++ b/examples/simple_duckdb/.fastflowtransform/target/manifest.json @@ -6,7 +6,7 @@ "upper_col": "models/macros/util.sql" }, "metadata": { - "generated_at": "2025-10-31T16:46:22+00:00", + "generated_at": "2025-11-11T08:35:42+00:00", "tool": "fastflowtransform" }, "nodes": { diff --git 
a/examples/simple_duckdb/.fastflowtransform/target/run_results.json b/examples/simple_duckdb/.fastflowtransform/target/run_results.json index 7dad55a..cee9bdf 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/run_results.json +++ b/examples/simple_duckdb/.fastflowtransform/target/run_results.json @@ -1,82 +1,82 @@ { "metadata": { - "generated_at": "2025-10-31T16:46:22+00:00", + "generated_at": "2025-11-11T08:35:42+00:00", "tool": "fastflowtransform" }, "results": [ { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 1, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "ephemeral_ids.ff", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 4, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "mart_orders_enriched", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { - "duration_ms": 1, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 2, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "mart_users.ff", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { - "duration_ms": 4, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 6, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "orders.ff", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { - "duration_ms": 1, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 2, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "users.ff", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { 
- "duration_ms": 1, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 3, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "users_enriched", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { - "duration_ms": 1, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 3, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "v_users.ff", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" }, { - "duration_ms": 0, - "finished_at": "2025-10-31T16:46:22+00:00", + "duration_ms": 1, + "finished_at": "2025-11-11T08:35:42+00:00", "http": null, "message": null, "name": "v_users_enriched.ff", - "started_at": "2025-10-31T16:46:22+00:00", + "started_at": "2025-11-11T08:35:42+00:00", "status": "success" } ], - "run_finished_at": "2025-10-31T16:46:22+00:00", - "run_started_at": "2025-10-31T16:46:22+00:00" + "run_finished_at": "2025-11-11T08:35:42+00:00", + "run_started_at": "2025-11-11T08:35:42+00:00" } diff --git a/examples/streaming_demo/events.jsonl b/examples/streaming_demo/events.jsonl deleted file mode 100644 index c8e0d4b..0000000 --- a/examples/streaming_demo/events.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{"user_id": "u1", "session_id": "s1", "source": "ads", "event_type": "page_view", "event_timestamp": "2025-09-27T10:00:01Z", "amount": null} -{"user_id": "u1", "session_id": "s1", "source": "ads", "event_type": "purchase", "event_timestamp": "2025-09-27T10:01:05Z", "amount": 29.9} -{"user_id": "u2", "session_id": "s2", "source": "organic", "event_type": "page_view", "event_timestamp": "2025-09-27T10:02:10Z", "amount": null} diff --git a/Combined.md b/exports/Combined.md similarity index 52% rename from Combined.md rename to exports/Combined.md index 33f3e76..8cf3868 100644 --- a/Combined.md +++ b/exports/Combined.md @@ -15,16 +15,28 @@ Welcome! 
This page is your starting point for FastFlowTransform docs. Pick the t - [User Guide](./Technical_Overview.md#part-i-operational-guide) - [Modeling Reference](./Config_and_Macros.md) - [Parallelism & Cache](./Cache_and_Parallelism.md) +- [CLI Guide](./CLI_Guide.md) +- [Logging & Verbosity](./Logging.md) - [API calls in Python models](./Api_Models.md) - [Incremental Models](./Incremental.md) - [YAML Tests (Schema-bound)](./YAML_Tests.md) +- [Model Unit Tests](./Unit_Tests.md) - [Data Quality Tests Reference](./Data_Quality_Tests.md) +- [Auto-Docs & Lineage](./Auto_Docs.md) +- [Troubleshooting & Error Codes](./Troubleshooting.md) - [Profiles & Environments](./Profiles.md) - [Sources Declaration](./Sources.md) - [Project Configuration](./Project_Config.md) - [State Selection (changed & results)](./State_Selection.md) -- [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) -- [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs-lineage) +- [Basic Demo](./examples/Basic_Demo.md) +- [Materializations Demo](./examples/Materializations_Demo.md) +- [Data Quality Tests Demo](./examples/DQ_Demo.md) +- [Macros Demo](./examples/Macros_Demo.md) +- [Cache Demo](./examples/Cache_Demo.md) +- [Environment Matrix Demo](./examples/Environment_Matrix.md) +- [Incremental & Delta Demo](examples/Incremental_Demo.md) +- [Local Engine Setup](./examples/Local_Engine_Setup.md) +- [API Demo](./examples/API_Demo.md) - [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) ## Table of Contents @@ -42,15 +54,15 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t - **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. - **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, and Databricks Spark. 
-- **Understand the project layout & CLI workflow:** see *Project Layout*, *Makefile Targets*, and *CLI Flows* in the [Technical Overview](Technical_Overview.md#project-layout). -- **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles-environment-overrides). -- **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fft-utest) covers unit tests, troubleshooting tips, and exit codes. -- **Explore runnable demos:** browse the `examples/` directory in the repo; each subproject comes with its own README. +- **Understand the project layout & CLI workflow:** start with *Project Layout* in the [Technical Overview](Technical_Overview.md#project-layout) and pair it with the [CLI Guide](CLI_Guide.md) for command patterns. +- **Configure runtimes & profiles:** review executor profiles and environment overrides in the dedicated [Profiles guide](Profiles.md) plus [Logging & Verbosity](Logging.md) for observability flags. +- **Model data quality & troubleshoot runs:** combine the [Model Unit Tests guide](Unit_Tests.md) with [Troubleshooting & Error Codes](Troubleshooting.md) to keep runs deterministic and easy to debug. +- **Explore runnable demos:** start with the [Basic Demo Overview](examples/Basic_Demo.md) or browse the `examples/` directory; each subproject ships with its own README. ### 2. Extend FastFlowTransform (Developers & Contributors) - **Dive into architecture & core modules:** start with [Architecture Overview](Technical_Overview.md#architecture-overview) and [Core Modules](Technical_Overview.md#core-modules) for registry, DAG, executors, validation, and more. -- **Add tests & seeds:** see [Sample Models](Technical_Overview.md#sample-models), [Seeds & Example Data](Technical_Overview.md#seeds-example-data), and the unit test guide in [Model Unit Tests](Technical_Overview.md#model-unit-tests-fft-utest). 
+- **Add tests & seeds:** reuse the curated demos under `docs/examples/` for seeds/Makefiles and follow the [Model Unit Tests guide](Unit_Tests.md) for deterministic fixtures. - **Contribute code:** follow the workflow described in [`./Contributing.md`](./Contributing.md) and consult the module-level docs for internal APIs. - **Plan ahead:** check the roadmap snapshot in the [Technical Overview](Technical_Overview.md#roadmap-snapshot) to understand upcoming work. @@ -80,6 +92,16 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t This guide walks you through creating a minimal FastFlowTransform project from scratch and running it end-to-end. +## 0. Create a skeleton (optional) + +Start with a minimal project structure: + +```bash +fft init demo_project --engine duckdb +``` + +The command is non-interactive, refuses to overwrite existing directories, and leaves inline comments that point back to the relevant docs (`Project_Config.md`, `Profiles.md`, etc.). Populate the generated files before running the steps below. + ## 1. Install & bootstrap ```bash @@ -178,23 +200,16 @@ Refer to `docs/Config_and_Macros.md` for advanced configuration options. 
- [Docs Navigation](#docs-navigation) - [Part I – Operational Guide](#part-i-operational-guide) - [Project Layout](#project-layout) - - [Sample Models](#sample-models) - - [Seeds & Example Data](#seeds-example-data) - - [Makefile Targets](#makefile-targets) + - [Example Projects and Seeds](#example-projects-and-seeds) - [CLI Flows](#cli-flows) - [Logging & Verbosity](#logging-verbosity) - [Model Unit Tests (`fft utest`)](#model-unit-tests-fft-utest) - [Troubleshooting](#troubleshooting) - - [Error Codes](#error-codes) - [Profiles & Environment Overrides](#profiles-environment-overrides) - - [Parallel Scheduler (v0.3)](#parallel-scheduler-v03) - - [Cache Policy (v0.3)](#cache-policy-v03) - - [Fingerprint Formula (v0.3)](#fingerprint-formula-v03) - - [Meta Table Schema (v0.3)](#meta-table-schema-v03) - - [Jinja DSL Quick Reference](#jinja-dsl-quick-reference) + - [Parallel Execution and Cache](#parallel-execution-and-cache) - [Roadmap Snapshot](#roadmap-snapshot) - [Cross-Table Reconciliations](#cross-table-reconciliations) - - [Auto-Docs & Lineage](#auto-docs-lineage) + - [Auto-Docs and Lineage](#auto-docs-and-lineage) - [Part II – Architecture & Internals](#part-ii-architecture-internals) - [Architecture Overview](#architecture-overview) - [Core Modules](#core-modules) @@ -276,2787 +291,4804 @@ fastflowtransform/ └── README.md ``` -### Sample Models +### Example Projects and Seeds -The demo project `examples/simple_duckdb` showcases the typical mix of SQL and Python models plus downstream marts. Use it as a template for your own projects. +Need runnable references? Start with the curated demos under `docs/examples/`: -- Batch models live under `models/` (`*.ff.sql`, `*.ff.py`). -- External tables are declared in `sources.yml`; reusable tests in `project.yml`. -- Seeds in `seeds/` keep demos deterministic. +- [Basic Demo](./examples/Basic_Demo.md) shows the minimum viable project (seeds, staging, marts) plus Makefile targets you can copy. 
+- [API Demo](./examples/API_Demo.md) focuses on HTTP-powered Python models. +- [Environment Matrix](./examples/Environment_Matrix.md) demonstrates multiple profiles talking to different engines. -> ℹ️ **Need full code samples and decorator details?** -> See [Model Fundamentals](./Config_and_Macros.md#1-model-fundamentals) in the Modeling Reference. +Each demo includes deterministic seeds (`seeds/*.csv`), schema YAML, and Makefile shortcuts, so the detailed CSV listings and commands here would be redundant. Follow the demo docs (or the [Quickstart](./Quickstart.md)) for the full walkthrough. -### Seeds & Example Data - -`seeds/seed_users.csv` +### CLI Flows -```csv -id,email -1,a@example.com -2,b@gmail.com -3,c@gmail.com -``` +Looking for command recipes, selection filters, or sync workflows? See the dedicated [CLI Guide](./CLI_Guide.md) for a task-by-task breakdown (seed/run/dag/docgen/test/utest/sync-db-comments) plus links to API-model helpers. -`seeds/seed_orders.csv` +### Logging & Verbosity -```csv -order_id,user_id,amount -100,1,19.9 -101,2,0 -``` +Need the exact behaviour of `-q/-v/-vv`, SQL debug output, or the parallel log queue? Head over to [Logging.md](./Logging.md) for the full matrix plus usage snippets. -### Makefile Targets +### Model Unit Tests (`fft utest`) -```makefile -DB ?= .local/demo.duckdb -PROJECT ?= examples/simple_duckdb +The full how-to (cache modes, YAML DSL, CI snippets) has moved to [Unit_Tests.md](./Unit_Tests.md). Keep that guide in mind whenever you need fast feedback on SQL/Python models without executing the entire DAG. -seed: - fft seed $(PROJECT) --env dev +### Troubleshooting -run: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev +Common fixes (engines, docs generation, tests) plus the exit-code matrix live in [Troubleshooting.md](./Troubleshooting.md). Skim that doc whenever you hit connectivity issues or need to decode return codes.
-dag: - fft dag "$(PROJECT)" --env dev --html +### Profiles & Environment Overrides +Need to understand profile precedence, `.env` layering, or the Pydantic models that back settings? Jump to the [Profiles guide](./Profiles.md) which covers file layout, environment helpers, validation, and selection precedence in depth. -test: - fft test "$(PROJECT)" --env dev --select batch -``` +### Parallel Execution and Cache -Targets wrap the CLI commands showcased below. Feel free to copy the pattern into your own projects. +Level-wise parallelism, cache modes, fingerprint formula, and the `_ff_meta` audit table are documented in [Cache_and_Parallelism.md](./Cache_and_Parallelism.md). Use that reference for CLI examples (`--jobs`, `--cache`, `--rebuild`), skip conditions, and troubleshooting tips related to concurrency. -### CLI Flows +### Roadmap Snapshot -- CLI flags and internals are documented under [CLI Implementation](#cli-implementation). -- Automation examples appear in the [Makefile Targets](#makefile-targets). +| Version | Content | +|---------|---------------------------------------------------| +| 0.2 | `config(materialized=...)`, Jinja macros, variables | +| 0.3 | Parallel execution, cache | +| 0.4 | Incremental models | +| 0.5 | Streaming connectors (Kafka, S3) | +| 1.0 | Stable API, plugin SDK | +> See also: feature pyramid & roadmap phases (OSS/SaaS) in the separate document. -#### HTTP/API in Python models -See [API calls in Python models](./Api_Models.md) for `get_json`/`get_df`, pagination, cache/offline flags. +--- +### Cross-Table Reconciliations -#### DAG & Documentation +Reconciliation tests (`reconcile_equal`, `reconcile_ratio_within`, `reconcile_diff_within`, `reconcile_coverage`) are fully documented in the [Data Quality Test Reference](./Data_Quality_Tests.md#cross-table-reconciliations). Use that guide for YAML schemas, tolerance parameters, and engine notes before wiring the checks into `fft test`. -- Narrow the graph with `fft dag ... 
--select ` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini site. -- Control schema introspection via `--with-schema/--no-schema`. Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). -- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. +### Auto-Docs and Lineage -#### Sync Database Comments +Rendering the DAG site, feeding project descriptions/lineage, and exporting JSON manifests are covered in [Auto_Docs.md](./Auto_Docs.md). Head there for command flags, markdown/YAML resolution, and lineage overrides. -`fft sync-db-comments --env ` pushes model and column descriptions from project YAML or Markdown into database comments. The command currently supports Postgres and Snowflake Snowpark: +## Part II – Architecture & Internals -- Start with `--dry-run` to review the generated `COMMENT` statements. -- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). -- Snowflake reuses the session or connection exposed by the executor. +### Architecture Overview -If no descriptions are found, the command exits without making changes. 
+``` +CLI (Typer) +│ +├── Registry (core.py) +│ ├── Discover models (*.ff.sql / *.ff.py) +│ ├── Load Python models (decorator) +│ ├── Parse/validate dependencies +│ └── Jinja environment + sources.yml +│ +├── DAG (dag.py) +│ ├── topo_sort (Kahn, deterministic) +│ └── mermaid() (styled + stable IDs) +│ +├── Executors (executors/*) +│ ├── BaseExecutor (SQL rendering, dependency loading, materialization, requires guard) +│ ├── DuckExecutor (DuckDB) +│ ├── PostgresExecutor (SQLAlchemy, shims) +│ ├── BigQueryExecutor (pandas) +│ ├── BigQueryBFExecutor (BigQuery DataFrames / bigframes) +│ ├── DatabricksSparkExecutor (PySpark, without pandas) +│ └── SnowflakeSnowparkExecutor (Snowpark, without pandas) +│ +├── Testing (testing.py) +│ ├── generic _exec / _scalar +│ └── Checks: not_null, unique, row_count_between, greater_equal, non_negative_sum, freshness +│ +├── Seeding (seeding.py) +│ └── Load seeds (CSV/Parquet/SQL) → engine agnostic +│ +├── Docs (docs.py + templates/) +│ ├── Mermaid + overview table (index.html) +│ └── Model detail pages (model.html) +│ +├── Settings/Profiles (settings.py) +│ └── Pydantic v2 discriminated union + ENV overrides +│ +└── Streaming (streaming/*) + ├── FileTailSource + └── StreamSessionizer +``` -### Logging & Verbosity +--- -FastFlowTransform exposes uniform logging controls across all CLI commands plus a dedicated SQL debug channel. +### Core Modules -#### Flags +#### `core.py` -- `-q` / `--quiet` → only errors (`ERROR`) -- *(default)* → concise warnings (`WARNING`) -- `-v` / `--verbose` → progress/info (`INFO`) -- `-vv` → full debug (`DEBUG`), including SQL debug output +Key data structures and the project loading process. 
-`-vv` flips on the SQL debug channel automatically (same as setting `FFT_SQL_DEBUG=1` +```python +@dataclass +class Node: + name: str # logical name (stem or @model(name=...)) + kind: str # "sql" | "python" + path: Path + deps: List[str] = field(default_factory=list) -#### SQL debug channel +class Registry: + def load_project(self, project_dir: Path) -> None: ... + def _register_node(self, node: Node) -> None: ... + def _load_py_module(self, path: Path) -> types.ModuleType: ... + def _scan_sql_deps(self, path: Path) -> List[str]: ... +``` -Enable it to inspect Python-model inputs, dependency columns, and helper SQL emitted by data-quality checks: +**Helpers & decorator:** -```bash -# full debug (recommended) -fft run . -vv +```python +def relation_for(node_name: str) -> str: ... +def ref(name: str) -> str: ... +def source(source_name: str, table_name: str) -> str: ... -# equivalent using the env var (legacy behaviour retained) -FFT_SQL_DEBUG=1 fft run . +def model(name=None, deps=None, requires=None) -> Callable[[Callable[..., Any]], Callable[..., Any]]: ... ``` -#### Usage patterns +**Python models (example):** -```bash -fft run . -q # quiet (errors only) -fft run . # default (concise) -fft run . -v # verbose progress (model names, executor info) -fft run . -vv # full debug + SQL channel +```python +@model(name="users_enriched", deps=["users.ff"], requires={"users": {"id","email"}}) +def enrich(df: pd.DataFrame) -> pd.DataFrame: ... ``` -#### Parallel logging UX +--- -- Per node: start/end lines with duration, truncated name, and engine abbrev (DUCK/PG/BQ/…). -- Output is line-stable via a thread-safe log queue; per-level summaries at the end. -- On errors, the familiar “error block” is shown per node. +#### `dag.py` -**Notes** +Deterministic topological sort plus Mermaid export. -- SQL debug output routes through the `fastflowtransform.sql` logger; use `-vv` or the env var to see it. 
-- Existing projects do not need changes: the env var continues to work even without `-vv`. +```python +def topo_sort(nodes: Dict[str, Node]) -> List[str]: ... +def mermaid(nodes: Dict[str, Node]) -> str: ... +``` -### Model Unit Tests (`fft utest`) +--- -`fft utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. +#### `errors.py` -#### Unit tests & cache +Primary error types with helpful messages. -`fft utest --cache {off|ro|rw}` (default: `off`) +```python +class FastFlowTransformError(Exception): ... +class ModuleLoadError(FastFlowTransformError): ... +class DependencyNotFoundError(FastFlowTransformError): ... +class ModelCycleError(FastFlowTransformError): ... +class TestFailureError(FastFlowTransformError): ... +``` -- `off`: deterministic, never skips. -- `ro`: skip on cache hit; on miss, build but **do not write** cache. -- `rw`: skip on hit; on miss, build **and write** fingerprint. +--- -Notes: -- UTests key the cache with `profile="utest"`. -- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. -- `--reuse-meta` is currently a reserved flag: it is exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. +#### Executors +Shared logic (`BaseExecutor`) plus engine implementations. -#### Why? +```python +class BaseExecutor(ABC): + def render_sql(self, node: Node, env: Environment, ref_resolver=None, source_resolver=None) -> str: ... + def run_python(self, node: Node) -> None: ... + @abstractmethod + def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: ... + @abstractmethod + def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: ... 
+``` -- Fast feedback on transformation logic without full DAG runs -- Small, reproducible fixtures (rows inline or external CSV) -- Engine-agnostic: swap DuckDB/Postgres to spot dialect differences +**DuckDB (`duckdb_exec.py`)** -#### Folder layout +- `run_sql(node, env)` renders Jinja (`ref/source`) and executes the SQL. +- `_read_relation` loads a table as `DataFrame`; surfaces actionable errors when a dependency is missing. +- `_materialize_relation` writes the `DataFrame` as a table (`create or replace table ...`). -Specs live under `/tests/unit/*.yml` relative to the project root (the directory passed to the CLI that contains `models/`): +**Postgres (`postgres_exec.py`)** -``` -your-project/ -├── models/ -│ ├── users.ff.sql -│ ├── users_enriched.ff.py -│ └── mart_users.ff.sql -└── tests/ - └── unit/ - ├── users_enriched.yml - └── mart_users.yml -``` +- `_SAConnShim` (compatible with `testing._exec`). +- `run_sql` renders SQL and rewrites `CREATE OR REPLACE TABLE` to `DROP + CREATE AS`. +- `_read_relation` uses pandas, handles schemas, and provides clear guidance. +- `_materialize_relation` writes via `to_sql(if_exists="replace")`. -#### YAML DSL (with `defaults`) +**BigQuery / BigQuery DataFrames / Spark / Snowpark** -Each file targets one logical node (the DAG name). Defaults are deep-merged into every case so you can share inputs/expectations and override per scenario. +- Identical signatures; IO uses the respective native dataframes (no pandas for Spark/Snowpark). -```yaml -# tests/unit/users_enriched.yml -model: users_enriched +--- -defaults: - inputs: - users: - rows: - - {id: 1, email: "a@example.com"} - - {id: 2, email: "b@gmail.com"} - expect: - relation: users_enriched - order_by: [id] +#### `validation.py` -cases: - - name: basic_gmail_flag - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} +Required-column checks for Python models (single and multi dependency). 
- - name: override_inputs - inputs: - users: - rows: - - {id: 3, email: "c@hotmail.com"} - - {id: 4, email: "d@gmail.com"} - expect: - rows: - - {id: 3, email: "c@hotmail.com", is_gmail: false} - - {id: 4, email: "d@gmail.com", is_gmail: true} +```python +class RequiredColumnsError(ValueError): ... +def validate_required_columns(node_name: str, inputs: Any, requires: dict[str, set[str]]): ... ``` -SQL models use the file stem (including `.ff`) as `model`. Provide expected relation names that match the materialized table/view: +--- -```yaml -# tests/unit/mart_users.yml -model: mart_users.ff +#### `testing.py` -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - expect: - relation: mart_users - order_by: [id] +Minimal data quality framework (engine agnostic via `_exec`). -cases: - - name: passthrough_columns - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} +**Checks:** `not_null`, `unique`, `greater_equal`, `non_negative_sum`, `row_count_between`, `freshness` + +```python +class TestFailure(Exception): ... +def _exec(con: Any, sql: Any): ... +def _scalar(con: Any, sql: Any): ... 
``` -For multi-dependency models, include every physical relation name (what `relation_for(dep)` returns): +--- -```yaml -model: mart_orders_enriched -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "x@gmail.com", is_gmail: true} - orders: - rows: - - {order_id: 10, user_id: 1, amount: 19.9} - - {order_id: 11, user_id: 1, amount: -1.0} -cases: - - name: join_and_flag - expect: - any_order: true - rows: - - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} - - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} -``` - -#### Input formats - -- `rows`: inline dictionaries per row -- `csv`: reference a CSV file (relative paths allowed) +#### `docs.py` & Templates -Keys under `inputs` are physical relations; use `relation_for('users.ff')` if unsure. +- `render_site(out_dir, nodes)` produces `index.html` plus `model.html` per model. +- Templates (`docs/templates/`) include dark mode, filters, copy buttons, legend. +- Uses `dag.mermaid(nodes)` for the graph. -#### Expected output & comparison +--- -- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`) -- Ordering: `order_by: [...]` or `any_order: true` -- Columns: `ignore_columns: [...]`, `subset: true` -- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` - (numbers can be plain `1e-9` or quoted; they are cast to float) +#### `seeding.py` -#### Running utests +Engine-agnostic seed loading (CSV/Parquet/SQL). -```bash -fft utest . # discover all specs -fft utest . --env dev # use a specific profile -fft utest . --model users_enriched -fft utest . --model mart_orders_enriched --case join_and_flag -fft utest . --path tests/unit/users_enriched.yml +```python +def seed_project(project_dir: Path, executor, schema: Optional[str] = None) -> int: ... 
``` -Override the executor for all specs (ensure credentials/DSNs are set): +--- -```bash -export FF_PG_DSN="postgresql+psycopg://postgres:postgres@localhost:5432/ffdb" -export FF_PG_SCHEMA="public" -fft utest . --engine postgres -``` +### CLI Implementation -Executor precedence (highest → lowest): CLI `--engine`, YAML `engine:` (optional), `profiles.yml`, environment overrides. +Operational usage lives in [CLI Flows](#cli-flows). This section drills into the Typer command definitions in `cli.py`. -#### Design notes +**Commands:** -- Only the target model runs; supply all upstream relations the model expects. -- `defaults` deep-merge: dicts merge, lists/scalars overwrite. -- Results compare as DataFrames with configurable order, subset, ignored columns, and numeric tolerances. -- Exit codes: `0` for success, `2` when at least one case fails (compact CSV-style diff is printed). +- `fft run [--env dev] [--engine ...]` +- `fft dag [--env dev] [--html] [--select ...] [--with-schema/--no-schema]` +- `fft docgen [--env dev] [--out dir] [--emit-json path] [--open-source]` +- `fft test [--env dev] [--select batch|streaming|tag:...]` +- `fft seed [--env dev]` +- `fft sync-db-comments [--env dev] [--dry-run]` +- `fft utest [--env dev] [--cache off|ro|rw] [--reuse-meta]` +- `fft --version` -**CI example (GitHub Actions)** +**Key components:** -```yaml -name: utests -on: [push, pull_request] -jobs: - duckdb: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: { python-version: "3.11" } - - run: pip install -e . - - run: fft utest . --env dev +```python +def _load_project_and_env(project_arg) -> tuple[Path, Environment]: ... +def _resolve_profile(env_name, engine, proj) -> tuple[EnvSettings, Profile]: ... +def _get_test_con(executor: Any) -> Any: ... ``` -(For Postgres, add a service container and run `fft utest . --engine postgres` with `FF_PG_DSN` / `FF_PG_SCHEMA`.) 
- -### Troubleshooting - -- **DuckDB seeds not visible** → ensure `FF_DUCKDB_PATH` (or profile path) is identical for `seed`, `run`, `dag`, and `test`. -- **Postgres connection refused** → confirm `FF_PG_DSN`, container status (`docker ps`), and that port `5432` is open. -- **BigQuery permissions** → set `GOOGLE_APPLICATION_CREDENTIALS` and match dataset/location to your profile. -- **HTML docs missing** → run `fft dag --html` and open `/docs/index.html`. -- **Unexpected test failures** → inspect rendered SQL in CLI output, refine selection via `--select`, refresh seeds if needed. -- **Dependency table not found** in utests → provide all physical upstream relations in the YAML spec. +**Test summary (exit 2 on failures):** -### Error Codes +``` +Data Quality Summary +──────────────────── +✅ not_null users.email (3ms) +❌ unique users.id (2ms) + ↳ users.id has 1 duplicate -| Type | Class/Source | Exit | Notes | -|---------------------------|---------------------------|------|---------------------------------------------------------| -| Missing dependency | `DependencyNotFoundError` | 1 | Per-node list; tips for `ref()` / names | -| Cycle in DAG | `ModelCycleError` | 1 | "Cycle detected among nodes: ..." | -| Model execution (KeyError)| `cli.py` → formatted block| 1 | Inspect columns, use `relation_for(dep)` as keys | -| Data quality failures | `cli test` → summary | 2 | "Totals ... passed/failed"; each failure on its own line | -| Unknown/unexpected | generic | 99 | Optional trace via `FFT_TRACE=1` | +Totals +────── +✓ passed: 1 +✗ failed: 1 +``` -Error types map to the classes documented in [Core Modules](#core-modules) and [CLI Implementation](#cli-implementation). +--- -### Profiles & Environment Overrides +### Settings Infrastructure -**`profiles.yml` example:** +`settings.py` uses a **Pydantic v2 discriminated union** (`engine` as discriminator) plus ENV overrides. 
-```yaml -default: - engine: duckdb - duckdb: { path: ":memory:" } +Profile types: +- `DuckDBProfile(engine="duckdb", duckdb: {path})` +- `PostgresProfile(engine="postgres", postgres: {dsn, db_schema})` +- `BigQueryProfile(engine="bigquery", bigquery: {project?, dataset, location?, use_bigframes?})` +- `DatabricksSparkProfile(engine="databricks_spark", ...)` +- `SnowflakeSnowparkProfile(engine="snowflake_snowpark", ...)` -stg: - engine: postgres - postgres: - dsn: postgresql+psycopg://postgres:postgres@localhost:5432/ffdb - db_schema: public +Resolver idea: -bq: - engine: bigquery - bigquery: - project: my-gcp-proj - dataset: demo - location: EU - use_bigframes: false +```python +def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profile: ... ``` -**ENV overrides (examples):** - -`FF_ENGINE`, `FF_DUCKDB_PATH`, `FF_PG_DSN`, `FF_PG_SCHEMA`, `FF_BQ_DATASET`, `FF_BQ_LOCATION`, `FF_BQ_USE_BIGFRAMES=1` +--- -**Priority (lowest → highest):** `profiles.yml` < environment variables (`FF_*`) < CLI flags (e.g. `--engine`). +### Streaming Components -For the Pydantic models and resolution flow, see [Settings Infrastructure](#settings-infrastructure). +**`streaming/sessionizer.py`** -### Parallel Scheduler (v0.3) +- Normalizes events (JSONL / batch DF) and writes `fct_sessions_streaming`. +- `process_batch(df)` aggregates sessions (start/end, pageviews, revenue). -FastFlowTransform executes the DAG in **levels**. Each level contains nodes without mutual dependencies. +**Smoke test (DuckDB):** -- `--jobs N` limits the **maximum concurrency per level**. -- `--keep-going` keeps tasks within the current level running even if one fails; subsequent levels are not started. +```python +def test_stream_sessionizer_produces_sessions(): ... +``` -**CLI** -```bash -fft run . --env dev --jobs 4 # parallel (level-wise) -fft run . --env dev --jobs 4 --keep-going +--- -fft run . --select model_b --jobs 4 # Run only model_b and whatever it depends on -fft run . 
--rebuild-only model_b # Rebuild only model_b, even if cache hits -``` +### Mini End-to-End Example (Python API) -**Internals** -- `dag.levels(nodes)` builds level lists using indegrees. -- `run_executor.schedule(levels, jobs, fail_policy)` spawns a thread pool per level and aggregates timings. +```python +from pathlib import Path +from jinja2 import Environment, FileSystemLoader +from fastflowtransform.core import REGISTRY +from fastflowtransform.dag import topo_sort +from fastflowtransform.executors.duckdb_exec import DuckExecutor -### Cache Policy (v0.3) +proj = Path("examples/simple_duckdb").resolve() +REGISTRY.load_project(proj) +env = REGISTRY.env # Jinja env from the registry load -**Modes** -``` -off – always build -rw – default; skip if fingerprint matches and relation exists; write cache after build -ro – skip on match; on miss build but do not write cache -wo – always build and write cache -``` -`--rebuild ` ignores cache for matching nodes. +order = topo_sort(REGISTRY.nodes) +ex = DuckExecutor(db_path=str(proj / ".local" / "demo.duckdb")) -**Skip condition** -1) Fingerprint matches the stored value (file-backed cache) -2) Physical relation exists on the target engine +for name in order: + node = REGISTRY.nodes[name] + if node.kind == "sql": + ex.run_sql(node, env) + else: + ex.run_python(node) -**Examples** -```bash -fft run . --env dev --cache=rw -fft run . --env dev --cache=ro -fft run . --env dev --cache=rw --rebuild marts_daily.ff +print("✓ Done") ``` -### Fingerprint Formula (v0.3) - -**SQL nodes**: -`fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` +--- -**Python nodes**: -`fingerprint_py(node, func_src, env_ctx, dep_fps)` +Need a different angle? Head back to the [Docs Hub](./index.md) or deep-dive into the [Modeling Reference](./Config_and_Macros.md). -**`env_ctx` content** -- `engine` (e.g. 
duckdb, postgres, bigquery) -- `profile_name` (CLI `--env`) -- selected environment keys/values: all `FF_*` -- normalized excerpt of `sources.yml` (sorted dump) -**Properties** -- Same inputs ⇒ same hash. -- Minimal change in SQL/function ⇒ different hash. -- Any dependency fingerprint change bubbles downstream via `dep_fps`. -### Meta Table Schema (v0.3) + -FastFlowTransform writes a per-node audit row after successful builds: +# API Calls in Python Models -``` -_ff_meta ( - node_name TEXT / STRING -- logical name, e.g. "users.ff" - relation TEXT / STRING -- physical name, e.g. "users" - fingerprint TEXT / STRING - engine TEXT / STRING - built_at TIMESTAMP -) -``` +> **Status:** Experimental but stable for demos and smaller workflows. +> **Goal:** Query HTTP APIs from Python models, return responses as DataFrames, cache and instrument them cleanly, and support reproducible offline runs. -**Backends** -- DuckDB: table `_ff_meta` in `main`. -- Postgres: table `_ff_meta` in the active schema. -- BigQuery: table `._ff_meta`. +* [Motivation](#motivation) +* [Quickstart](#quickstart) +* [Programming API](#programming-api) + * [`get_json`](#get_json) + * [`get_df`](#get_df) + * [Pagination](#pagination) + * [Context & Telemetry](#context-telemetry) +* [CLI Flags & Environment Variables](#cli-flags-environment-variables) +* [Example Model](#example-model) +* [Artifacts](#artifacts) +* [Tests & Offline Demos](#tests-offline-demos) +* [Best Practices](#best-practices) +* [Troubleshooting](#troubleshooting) +* [Security & Compliance](#security-compliance) +* [FAQ](#faq) -**Notes** -- Meta is currently used for auditing and tooling; skip logic relies on fingerprint cache + relation existence checks. +--- -#### Executor meta hook +## Motivation -After a successful materialization the executor calls: - on_node_built(node, relation, fingerprint) +Many pipelines need small, reliable API fetchers: configuration tables, miniature dimensions, feature flags, SaaS exports. 
This feature provides: -This performs an upsert into `_ff_meta` with `(node_name, relation, fingerprint, built_at, engine)`. +- Simple HTTP calls inside Python models +- File-backed cache (reproducible builds, works offline) +- Per-node telemetry (requests, hits, bytes, hashes) +- CLI switches `--offline` and `--http-cache` for reproducible runs -Skipped nodes do **not** touch the meta table. +--- +## Quickstart -### Jinja DSL Quick Reference +1. **Optionally enable flags** (recommended): -`ref()`, `source()`, `var()`, `config()`, `this` – see details in the [Modeling Reference](./Config_and_Macros.md). + ```bash + # No network - cache hits only + fft run . --env dev --offline + # Cache mode + fft run . --env dev --http-cache rw # rw|ro|off + ``` -### Roadmap Snapshot +2. **Write a Python model**: -| Version | Content | -|---------|---------------------------------------------------| -| 0.2 | `config(materialized=...)`, Jinja macros, variables | -| 0.3 | Parallel execution, cache | -| 0.4 | Incremental models | -| 0.5 | Streaming connectors (Kafka, S3) | -| 1.0 | Stable API, plugin SDK | + ```python + # models/users_from_api.ff.py + import pandas as pd + from fastflowtransform.core import model + from fastflowtransform.api.http import get_df -> See also: feature pyramid & roadmap phases (OSS/SaaS) in the separate document. + @model(name="users_from_api", deps=["users.ff"]) + def fetch(_: pd.DataFrame) -> pd.DataFrame: + df = get_df( + url="https://api.example.com/users", + params={"page": 1}, + record_path=["data"], # JSON -> list -> DataFrame + ) + return df + ``` ---- +3. **Run it**: -### Cross-Table Reconciliations + ```bash + fft run . --env dev --select users_from_api + ``` -FastFlowTransform can compare aggregates and key coverage **across two tables** and surface drift with clear, numeric messages. These checks run via the standard `fft test` entrypoint and integrate into the DQ summary output. 
+--- -**CLI** -```bash -# only run reconciliation checks -fft test . --env dev --select reconcile -``` +## Programming API -**YAML DSL** +> Module: `fastflowtransform.api.http` -All checks live under `project.yml → tests:` and should carry the tag `reconcile` for easy selection. +### `get_json` -1) **Equality / Approx Equality** -```yaml -- type: reconcile_equal - name: orders_total_equals_mart - tags: [reconcile] - left: { table: orders, expr: "sum(amount)" } - right: { table: mart_orders_enriched, expr: "sum(amount)", where: "valid_amt" } - # optional tolerances: - abs_tolerance: 0.01 # |L - R| <= 0.01 - rel_tolerance_pct: 0.1 # |L - R| / max(|R|, eps) <= 0.1% (0.1) -``` +```python +from fastflowtransform.api.http import get_json -2) **Ratio within bounds** -```yaml -- type: reconcile_ratio_within - name: orders_vs_mart_ratio - tags: [reconcile] - left: { table: orders, expr: "sum(amount)" } - right: { table: mart_orders_enriched, expr: "sum(amount)" } - min_ratio: 0.999 - max_ratio: 1.001 +data = get_json( + url="https://api.example.com/objects", + params={"page": 1}, # optional + headers={"Authorization": "Bearer ..."}, # optional + timeout=20, # optional (seconds) +) +# -> Python dict / list ``` -3) **Absolute difference within limit** -```yaml -- type: reconcile_diff_within - name: count_stability - tags: [reconcile] - left: { table: events_raw, expr: "count(*)", where: "event_type='purchase'" } - right: { table: fct_sales, expr: "sum(txn_count)" } - max_abs_diff: 10 -``` +**Behavior** -4) **Coverage (anti-join = 0)** -```yaml -- type: reconcile_coverage - name: all_orders_covered - tags: [reconcile] - source: { table: orders, key: "order_id" } - target: { table: mart_orders_enriched, key: "order_id" } - # optional filters - source_where: "order_date >= current_date - interval '7 days'" - target_where: "valid_amt" +- Reads from the local cache (when present and valid). +- Writes to the cache (`rw` mode), including the response body. 
+- Respects offline mode (no network traffic). + +### `get_df` + +```python +from fastflowtransform.api.http import get_df + +df = get_df( + url="https://api.example.com/users", + params={"page": 1}, + record_path=["data"], # path to the JSON list + normalize=True, # optional: flatten nested objects + paginator=None, # optional: pagination strategy (see below) + output="pandas", # pandas|spark (default=pandas) +) +# -> pandas.DataFrame +``` + +**Conversion** + +- Default: `record_path` points to the array payload (for example `["data"]`). +- `normalize=True` delegates to `json_normalize` for deeper structures. +- `output='spark'` (plus an optional `session=SparkSession`) converts the normalized result into a `pyspark.sql.DataFrame`. Additional backends will reuse the same parameter. + +### Pagination + +For paged APIs you can describe the next request declaratively: + +```python +def paginator(url: str, params: dict | None, json_obj: dict): + next_url = json_obj.get("next") # e.g. absolute URL + if next_url: + return {"next_request": {"url": next_url}} + return None + +df = get_df( + "https://api.example.com/users?page=1", + paginator=paginator, + record_path=["data"], +) +``` + +The paginator may return the following fields: + +- `{"next_request": {"url": "...", "params": {...}, "headers": {...}}}` + (any missing field keeps its previous value) + +### Context & Telemetry + +During a model run the executor collects telemetry per node and writes it into `run_results.json`: + +- `requests` (count) +- `cache_hits` +- `bytes` (sum of response bodies) +- `used_offline` (bool) +- `keys` (cache keys) +- `entries` (optional compact array with URL, status, content hash) + +You will find these metrics under the `http` block of each node (see [Artifacts](#artifacts)). + +--- + +## CLI Flags & Environment Variables + +**CLI** + +- `--offline` + Sets `FF_HTTP_OFFLINE=1`; network requests are blocked, **cache hits only**. 
+- `--http-cache {off|ro|rw}` + Sets `FF_HTTP_CACHE_MODE`: + + - `off`: neither read nor write. + - `ro`: read-only (hits), **no** writes. + - `rw`: read and write (default). + +**Environment (optional to set directly)** + +| Variable | Default | Effect | +| ------------------------ | ------------------------------- | ----------------------------------- | +| `FF_HTTP_OFFLINE` | `0` | `1/true/on` -> offline mode | +| `FF_HTTP_CACHE_MODE` | `rw` | `off` / `ro` / `rw` | +| `FF_HTTP_CACHE_DIR` | `.fastflowtransform/http_cache` | Cache directory | +| `FF_HTTP_TTL` | `0` | Seconds; 0 = never expires | +| `FF_HTTP_TIMEOUT` | `20` | Request timeout (seconds) | +| `FF_HTTP_MAX_RETRIES` | `3` | Basic retry count | +| `FF_HTTP_RATE_LIMIT_RPS` | `0` | Requests per second (0 = unlimited) | + +--- + +## Example Model + +```python +# models/dim_countries_from_api.ff.py +import pandas as pd +from fastflowtransform.core import model +from fastflowtransform.api.http import get_df + +@model(name="dim_countries_from_api", deps=["users.ff"]) +def countries(_: pd.DataFrame) -> pd.DataFrame: + def pager(u, p, js): + nxt = js.get("paging", {}).get("next") + return {"next_request": {"url": nxt}} if nxt else None + + df = get_df( + url="https://api.example.com/countries?page=1", + paginator=pager, + record_path=["data"], + normalize=True, + ) + # lightweight post-processing + if "code" in df.columns: + df["code"] = df["code"].str.upper() + return df +``` + +Run: + +```bash +fft run . 
--env dev --select dim_countries_from_api --http-cache ro +``` + +--- + +## Artifacts + +`<project>/.fastflowtransform/target/run_results.json` (excerpt): + +```json +{ + "results": [ + { + "name": "dim_countries_from_api", + "status": "success", + "duration_ms": 153, + "http": { + "requests": 2, + "cache_hits": 2, + "bytes": 1842, + "used_offline": true, + "keys": ["GET:https://api.example.com/countries?page=1|{}|{}", "..."], + "entries": [ + {"url": "https://api.example.com/countries?page=1", "status": 200, "content_hash": "sha256:..."}, + {"url": "https://api.example.com/countries?page=2", "status": 200, "content_hash": "sha256:..."} + ] + } + } + ] +} +``` + +> Note: When a node is **skipped** (fingerprint cache hit), no new `http` block is emitted - the model did not run. + +--- + +## Tests & Offline Demos + +- Place unit tests under `tests/api/...` and seed the cache directly (no real HTTP calls). +- Suggested scenarios: + + - **Offline hit:** set `FF_HTTP_OFFLINE=1`, seed the cache, `get_json/get_df` must succeed. + - **Cache mode `off`:** even with cache entries, **no** reads; expect a failure in offline mode. + - **`ro`:** allow read hits; **no** cache writes after a real or mocked request. + - **Pagination:** stitch several pages from offline fixtures; telemetry should count requests/hits. + +--- + +## Best Practices + +- **Stable URLs and parameter order** produce identical cache keys and reproducible builds. +- **Keep `record_path` shallow**; use `normalize=True` only when necessary (performance). +- **Never cache secrets:** provide tokens via headers; the response body and metadata are cached. +- **Use `--offline` in CI** for deterministic tests with a pre-seeded cache. +- **Set TTL intentionally** when APIs change frequently. +- **Scope engine-specific variants** with `engine_model(only=...)` so each execution backend registers only the models it can run (pair with SQL `config(engines=[...])` when duplicating logical names).
+ +--- + +## Troubleshooting + +- **“offline + cache miss”** + Seed the cache (see tests) or disable offline mode. +- **“Schema mismatch”** + Harmonize columns after `get_df` (types, missing keys). +- **“Too many requests”** + Configure `FF_HTTP_RATE_LIMIT_RPS`; make pagination more efficient (larger `page_size`). +- **“No http block”** + Was the node **skipped** (fingerprint cache)? Or did the model avoid HTTP calls altogether? + +--- + +## Security & Compliance + +- **Do not commit secrets** - use environment variables or a secret manager. +- **PII/GDPR:** verify whether the API returns personal data; minimise retention. +- **Cache directory:** keep it in `.gitignore`; encrypt or isolate it if necessary. + +--- + +## FAQ + +**Q:** Can I call other libraries (for example `requests`, `httpx`) directly? +**A:** Yes, but you lose telemetry and caching. The recommended entrypoint is `fastflowtransform.api.http`. + +**Q:** How do I add custom headers (for example OAuth)? +**A:** Pass `headers={...}`. Store sensitive values in env vars and inject them into your models. + +**Q:** Does this work for POST requests? +**A:** Release R1 focuses on GET. Please open an issue for POST/PUT support; the design can be extended. + +--- + +**See also:** + +- Technical guide: *Developer Guide – Architecture & Internals* +- Unit tests: `tests/api/test_http_*.py` +- Runtime & cache: *Parallelism & Cache (v0.3)* + + + + + +# FastFlowTransform Modeling Reference (v0.1) + +> Authoritative reference for FastFlowTransform’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. +> Works with FastFlowTransform v0.1 (T1–T11). Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. +> **Execution & Cache (v0.3) quick notes** +> - Parallelism is level-wise; use `fft run --jobs N`. +> - Use `--cache={off|ro|rw|wo}` to control skipping behavior. 
+> - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. +> - Change any of these → downstream nodes rebuild. +> - `--rebuild <model>` forces rebuilding selected models (ignores cache). + + +For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [Technical Overview](./Technical_Overview.md). This document focuses purely on how you author and test models. + +--- + +## Docs Navigation +1. [Getting Started](./index.md) +2. [User Guide](./Technical_Overview.md#part-i-operational-guide) +3. **Modeling Reference** — you are here (`Config_and_Macros.md`) +4. [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) + +--- + +## Table of Contents + +- [Docs Navigation](#docs-navigation) +- [1. Model Fundamentals](#1-model-fundamentals) + - [1.1 SQL models (`*.ff.sql`)](#11-sql-models-ffsql) + - [1.2 Python models (`*.ff.py`)](#12-python-models-ffpy) + - [1.3 Seeds, sources, and dependencies](#13-seeds-sources-and-dependencies) +- [2. `config()` options](#2-config-options) +- [3. Variables with `var()`](#3-variables-with-var) +- [4. Template context & helpers](#4-template-context-helpers) +- [5. Macros & reusable Jinja code](#5-macros-reusable-jinja-code) +- [6. Materialization semantics](#6-materialization-semantics) +- [7. Testing & quality gates](#7-testing-quality-gates) +- [8. Quick cheat sheet](#8-quick-cheat-sheet) + +--- + +## 1. Model Fundamentals + +FastFlowTransform discovers models under `<project>/models/` with two primary flavours: + +### 1.1 SQL models (`*.ff.sql`) + +- File stem defines the logical DAG node (`users.ff.sql` → `users.ff`). +- Jinja template rendered with FastFlowTransform context (helpers like `ref`, `source`, `var`, `config`, `this`). +- Output relation defaults to the stem without `.ff` (configurable via `config(alias=...)` if supported in future releases).
+ +```sql +-- models/users.ff.sql +{{ config(materialized='table', tags=['staging']) }} +create or replace table users as +select id, email +from {{ source('crm', 'users') }}; +``` + +### 1.2 Python models (`*.ff.py`) + +Use the `@model` decorator from `fastflowtransform.core` to register a callable. The decorator accepts: + +- `name` (optional) → overrides the logical name (defaults to stem). +- `deps` → list of dependency nodes (file stems or logical names). +- `requires` → column contract per dependency (validated via `validation.validate_required_columns`). +- `materialized` (optional) → `'table' | 'view' | 'ephemeral'`; mirrors `config(materialized=...)` for SQL. +- `tags` (optional) → convenience for attaching selection labels without writing `meta={"tags": ...}`. + +Dependencies determine the call signature: + +- Single dependency → function receives a single `pandas.DataFrame`. +- Multiple dependencies → function receives `dict[str, pandas.DataFrame]` keyed by physical relation name (e.g. `"users"`). + +```python +# models/users_enriched.ff.py +from fastflowtransform.core import model +import pandas as pd + +@model( + name="users_enriched", + deps=["users.ff"], + requires={"users": {"id", "email"}}, + materialized="view", +) +def enrich(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + out["is_gmail"] = out["email"].str.endswith("@gmail.com") + return out +``` + +#### Engine-scoped registration + +When the same project supports multiple execution backends, use `engine_model` to register a Python model only for specific engines. The decorator wraps `@model` but bails out early if the active engine (from `FF_ENGINE` or the selected profile) is not allowed. + +```python +from fastflowtransform import engine_model +import pandas as pd + +@engine_model( + only=("duckdb", "postgres"), + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + ... 
+``` + +Allowed values are case-insensitive strings or tuples. If the engine does not match, the function is left undecorated and no node is created, preventing duplicate registrations across engine-specific folders. + +### 1.3 Seeds, sources, and dependencies + +- Declare external tables in `sources.yml`; they become available via `source('group','table')`. +- Provide reproducible inputs with CSV/Parquet seeds in `<project>/seeds/`. +- FastFlowTransform auto-detects dependencies: + - SQL models → parse `ref()` / `source()` calls. + - Python models → use the decorator’s `deps`. + - Additional runtime dependencies can be expressed via `relation_for()`. + +> **Warning:** SQL dependency detection is static. Only literal calls such as `ref('users.ff')` are registered. When you need to gate a dependency behind a variable, materialise the options in a mapping (`{'foo': ref('foo'), 'bar': ref('bar')}`) and pick from that map at runtime; a bare `ref(variable)` will not show up in the DAG. + +- Persistence (e.g. Spark/Databricks): configure default targets under `project.yml → models.storage` (and optionally `seeds.storage`). Example: + + ```yaml + models: + storage: + api_users_http: + path: ".local/spark/api_users_http" + format: delta + options: + mergeSchema: true + + seeds: + storage: + users: + path: ".local/spark/seeds/users" + ``` + + Entries end up in `node.meta["storage"]` (keys: `path`, `format`, `options`) and are respected by the matching executor. + +```yaml +# sources.yml +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users + - name: erp + tables: + - name: orders + identifier: seed_orders +``` + +Each source can declare defaults such as `schema`, `database`, or `catalog`.
Tables may +override those defaults, add per-engine overrides, or point at files: + +```yaml + - name: raw + schema: staging + tables: + - name: seed_users + identifier: seed_users + overrides: + postgres: + schema: raw + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" +``` + +--- + +## 2. `config()` options + +Call `config()` at the top of SQL models. Python models get the same options via the `@model(..., materialized=..., tags=...)` decorator kwargs. + +```sql +{{ config( + materialized='view', + tags=['mart', 'daily'] +) }} +``` + +Supported keys (v0.1): + +| Key | Type | Description | +|----------------|-----------------|------------------------------------------------------------------------------| +| `materialized` | `"table" \| "view" \| "ephemeral"` | Controls how FastFlowTransform persists the model. See [Materialization semantics](#6-materialization-semantics). | +| `tags` | `list[str]` | Arbitrary labels surfaced in docs / selection tooling. | +| `engines` | `list[str]` or `str` | Restrict registration to the listed engines (case-insensitive). Requires the active engine to be known (profile selection or `FF_ENGINE`). | +| (future) | – | Additional metadata is stored under `node.meta[...]` if added later. | + +**Tips** + +- Place `config()` before any SQL text. +- Use tags to power custom filters in docs or to drive test selection. +- Combine `engines=[...]` with per-engine subfolders to keep one physical file per backend without name clashes. When no engine is active, FastFlowTransform raises a clear error to avoid silent skips. +- Ephemeral models inline into downstream SQL; pick `view` for shareable logic without materializing a table. + +--- + +## 3. Variables with `var()` + +Project-level variables live under `project.yml → vars:` and can be overridden from the CLI: + +```yaml +# project.yml +vars: + snapshot_day: "2000-01-01" + limit: 100 +``` + +```bash +fft run . 
--vars snapshot_day='2025-10-01' limit=50 +``` + +Usage in templates: + +```sql +select * +from {{ source('crm','users') }} +where signup_date <= '{{ var("snapshot_day", "1970-01-01") }}' +limit {{ var("limit", 1000) }} +``` + +Resolution order: CLI overrides → project vars → default argument. + +--- + +## 4. Template context & helpers + +Every model (SQL & Python) gets a rich Jinja context. Key helpers: + +| Helper | Purpose | +|--------------------|------------------------------------------------------------------------------------------| +| `this` | Object exposing `name`, `relation`, `materialized`, `schema`, `database`. | +| `ref("model")` | Resolves another model’s physical relation (or inlines ephemeral SQL). | +| `source("group","table")` | Resolves entries defined in `sources.yml`. | +| `relation_for(node)` (Python utility) | Maps logical node names to physical relations (helpful inside UDFs/tests). | +| `var("key", default)` | Retrieves project/CLI variables (see above). | + +Example: + +```sql +{{ config(materialized='view') }} +select + u.id, + u.email, + {{ var("country_column", "'US'") }} as country_code +from {{ ref('users.ff') }} as u +-- rendered relation for logging/debugging +-- {{ this.relation }} +``` + +--- + +## 5. Macros & reusable Jinja code + +Organise shared SQL snippets in `models/macros/` (all `.sql` files are auto-loaded): + +``` +models/ + macros/ + string_utils.sql + marts/ + users.ff.sql +``` + +```jinja +{# models/macros/string_utils.sql #} +{% macro safe_lower(col) -%} +lower(trim({{ col }})) +{%- endmacro %} +``` + +Use the macro anywhere within the project: + +```sql +select {{ safe_lower("email") }} as email_lower +from {{ ref('users.ff') }}; +``` + +**Best practices** + +- Keep macros idempotent and side-effect free. +- Group related macros per file (e.g., string utilities, date helpers). +- Document macros with inline comments; FastFlowTransform’s generated docs list each macro with its path. + +--- + +## 6. 
Materialization semantics + +### SQL models + +| Materialization | Behaviour | +|-----------------|-----------| +| `table` | `CREATE OR REPLACE TABLE … AS <select>` | +| `ephemeral` | No object is created; downstream `ref()` expands to a subquery. | + +**Postgres-specific:** FastFlowTransform rewrites the “create or replace” pattern into `DROP TABLE IF EXISTS …; CREATE TABLE … AS …` for compatibility. + +### Python models + +- Default → materialized as `table`. +- `materialized='view'` produces an engine-specific temporary table first, then creates/overwrites a view that selects from it. +- Ephemeral Python models are not supported in v0.1. + +--- + +## 7. Testing & quality gates + +### 7.1 Column contracts (`requires`) + +Use the decorator’s `requires` argument (Python models) to ensure upstream inputs carry expected columns. Under the hood FastFlowTransform calls `validation.validate_required_columns`, raising `RequiredColumnsError` with a descriptive diff. + +```python +@model( + deps=["orders.ff", "users_enriched"], + requires={ + "orders": {"order_id", "user_id", "amount"}, + "users_enriched": {"id", "email", "is_gmail"} + } +) +def join_orders(inputs: dict[str, pd.DataFrame]) -> pd.DataFrame: + ... +``` + +### 7.2 Data quality tests (`project.yml`) + +Declare checks under `project.yml → tests:`. Each entry maps directly to a function in `fastflowtransform.testing` (`not_null`, `unique`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`). Run them via `fft test …`. + +```yaml +tests: + - type: not_null + table: users + column: email + tags: [batch] +``` + +### 7.3 Model unit tests (`fft utest`) + +Keep transformation logic honest with small, engine-agnostic specs: + +- Place YAML files under `<project>/tests/unit/`. +- Express inputs via inline rows or CSV paths. +- Declare expected output rows plus comparison options (`order_by`, `any_order`, `ignore_columns`, `approx`).
+ +```yaml +# tests/unit/users_enriched.yml +model: users_enriched +defaults: + inputs: + users: + rows: + - {id: 1, email: "a@example.com"} + - {id: 2, email: "b@gmail.com"} + expect: + relation: users_enriched + order_by: [id] + +cases: + - name: flags_gmail + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} +``` + +Run with: + +```bash +fft utest . --env dev +fft utest . --model users_enriched --case flags_gmail +``` + +See the [Model Unit Tests guide](./Unit_Tests.md) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). + +--- + +## 8. Quick cheat sheet + +| Task | Snippet / Pointer | +|------|-------------------| +| Set materialization | `{{ config(materialized='view') }}` | +| Add tags | `{{ config(tags=['mart','daily']) }}` | +| Read project variable | `{{ var('run_date', '1970-01-01') }}` | +| Current relation name | `{{ this.relation }}` | +| Reference another model | `{{ ref('users.ff') }}` | +| Reference source | `{{ source('crm','users') }}` | +| Macro definition | `models/macros/*.sql` | +| Guarantee columns (Python) | `@model(..., requires={'users': {'id','email'}})` | +| Data-quality test | `project.yml → tests` + `fft test …` | +| Unit test | `tests/unit/*.yml` + `fft utest …` | + +--- + +Return to the [Docs Hub](./index.md) or switch to the [User/Developer Guide](./Technical_Overview.md). + + + + + +# Parallelism & Cache + +**TL;DR:** FastFlowTransform executes models in parallel DAG levels and uses deterministic +fingerprints to skip unchanged nodes — while a separate HTTP cache accelerates API models. + +FastFlowTransform introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. 
+ +--- + +## Table of Contents +- [Parallel Scheduler](#parallel-scheduler) +- [Cache Policy](#cache-policy) +- [Fingerprint Formula](#fingerprint-formula) +- [Meta Table Schema](#meta-table-schema) +- [CLI Recipes](#cli-recipes) +- [Troubleshooting & FAQ](#troubleshooting--faq) +- [Example: simple_duckdb](#example-simple_duckdb) +- [Appendix: Environment Inputs](#appendix-environment-inputs) + +--- + +## Parallel Scheduler + +FastFlowTransform splits the DAG into **levels** (all nodes that can run together without violating dependencies). Within a level, up to `--jobs` nodes execute in **parallel**. + +- Dependencies are **never** violated. +- `--keep-going`: tasks already started in a level finish; **subsequent levels won’t start** if any task in the current level fails. +- Logs are serialized through an internal queue to keep lines readable and per-node timing visible. + +**Quick start** +```bash +# Run with 4 workers per level +fft run . --env dev --jobs 4 + +# Keep tasks in the same level running even if one fails +fft run . --env dev --jobs 4 --keep-going +``` + +--- + +## Cache Policy + +The cache decides whether a node can be **skipped** when nothing relevant changed. Modes: + +``` +--cache=off # always build +--cache=rw # default; skip on match; write cache after build +--cache=ro # skip on match; on miss build but don't write cache +--cache=wo # always build and write cache +--rebuild # ignore cache for matching nodes +--no-cache # alias for --cache=off +``` + +### Skip condition + +A node is skipped iff: + +1. The current **fingerprint** matches the on-disk cache value, **and** +2. The **physical relation exists** on the target engine. + +If the relation was dropped externally, FastFlowTransform will **rebuild** even if the fingerprint matches. + +### HTTP Response Cache + +In addition to the build cache, FastFlowTransform provides an **HTTP response cache** for API models using +`fastflowtransform.api.http.get_df(...)`. 
+ +- **Purpose:** Avoid redundant API calls and support offline mode. +- **Location:** Controlled by `FF_HTTP_CACHE_DIR` (e.g. `.local/http-cache`). +- **Controls (environment):** + - `FF_HTTP_ALLOWED_DOMAINS`: comma-separated list of hosts allowed to cache. + - `FF_HTTP_MAX_RPS`, `FF_HTTP_MAX_RETRIES`, `FF_HTTP_TIMEOUT`: rate limiting & retry policy. + - `FF_HTTP_OFFLINE=1`: run in offline mode — serve only from cache, no network calls. +- **CLI visibility:** Each run writes HTTP stats (`requests`, `cache_hits`, `bytes`, `used_offline`) + to `.fastflowtransform/target/run_results.json`. +- **Makefile helpers:** see `make api-show-http` in the API demo to inspect HTTP cache usage. + +> This cache is independent from the build cache; it stores API responses, not SQL or fingerprints. + +--- + +## Fingerprint Formula + +Fingerprints are stable hashes that change on any relevant input: + +* **SQL models**: `fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` + + * Uses **rendered** SQL (after Jinja), not the raw template. +* **Python models**: `fingerprint_py(node, func_src, env_ctx, dep_fps)` + + * Uses `inspect.getsource(func)` with a **file-content fallback** if needed. + +`env_ctx` includes: + +* `engine` (e.g., `duckdb`, `postgres`, `bigquery`) +* `profile_name` (CLI `--env`) +* Selected environment entries: **all `FF_*` keys** (key + value) +* A **normalized** portion of `sources.yml` (sorted keys/dump) + +`dep_fps` are upstream fingerprints; **any upstream change** invalidates downstream fingerprints. + +**Properties** + +* Same inputs ⇒ same hash. +* Minimal change in SQL/function ⇒ different hash. +* Dependency changes propagate downstream. + +> **Note:** The active engine and profile name are part of the fingerprint. +> Switching from `duckdb` to `postgres` automatically invalidates the cache, so cross-engine runs +> never reuse outdated fingerprints. 
+--- + +## Meta Table Schema + +After a successful build, FastFlowTransform writes a per-node audit row: + +``` +_ff_meta ( + node_name TEXT/STRING, -- logical name, e.g. "users.ff" + relation TEXT/STRING, -- physical table/view, e.g. "users" + fingerprint TEXT/STRING, + engine TEXT/STRING, + built_at TIMESTAMP +) +``` + +Backends: + +* **DuckDB:** table `_ff_meta` in `main`. +* **Postgres:** table `_ff_meta` in the active schema. +* **BigQuery:** table `<dataset>._ff_meta`. + +> Note: Skip logic uses the file-backed fingerprint cache and a direct relation existence check; the meta table is for auditing and tooling. + +--- + +## CLI Recipes + +```bash +# First run — builds everything, writes cache and meta +fft run . --env dev --cache=rw + +# No-op run — should skip all nodes (if nothing changed) +fft run . --env dev --cache=rw + +# Force rebuild of a single model (ignores cache for it) +fft run . --env dev --cache=rw --rebuild marts_daily.ff + +# Read-only cache (skip on match, build on miss, no writes) +fft run . --env dev --cache=ro + +# Always build and write cache +fft run . --env dev --cache=wo + +# Disable cache entirely +fft run . --env dev --no-cache +``` + +With parallelism: + +```bash +fft run . --env dev --jobs 4 +fft run . --env dev --jobs 4 --keep-going +``` + +--- + +## Troubleshooting & FAQ + +**“Why did it skip?”** +A skip requires a fingerprint match and an existing relation. Fingerprints include: + +* rendered SQL / Python function source, +* `sources.yml` (normalized), +* engine/profile, +* **all `FF_*` environment variables**, +* upstream fingerprints. + +Any change in the above triggers a rebuild downstream. + +**“Relation missing but cache says skip?”** +We also check relation existence. If the table/view was dropped externally, FastFlowTransform will **rebuild**. + +**“My logs interleave under parallelism.”** +Logs are serialized via a queue; use `-v` / `-vv` for richer but still stable output.
Each node prints start/end and duration; levels summarize. + +**“Utest cache?”** +`fft utest --cache {off|ro|rw}` defaults to `off` for deterministic runs. With `rw`, expensive unit cases can be accelerated. Unit tests do not rely on the meta table by default. + +--- + +## Example: simple_duckdb + +The demo contains two independent staging nodes (`users.ff.sql`, `orders.ff.sql`). They run in **parallel** within the same level. + +Makefile targets: + +```makefile +run_parallel: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --jobs 4 + +cache_rw_first: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw + +cache_rw_second: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw + +cache_invalidate_env: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" FF_DEMO_TOGGLE=1 fft run "$(PROJECT)" --env dev --cache=rw +``` + +--- + +## Appendix: Environment Inputs + +Only environment variables with the `FF_` prefix affect fingerprints (keys and values). If you change one (e.g., `FF_RUN_DATE`, `FF_REGION`), fingerprints change and downstream nodes rebuild. + +```bash +# Will invalidate fingerprints and rebuild affected nodes +FF_RUN_DATE=2025-01-01 fft run . --env dev --cache=rw +``` + +```` + +--- + +### 🔗 `docs/index.md` – Link zum neuen Kapitel + +```diff +--- a/docs/index.md ++++ b/docs/index.md +@@ -10,6 +10,7 @@ + - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) + - [Modeling Reference](./Config_and_Macros.md) + - [Parallelism & Cache (v0.3)](./Cache_and_Parallelism.md) + - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) +```` + + + + + +# Incremental models + +Incremental models let you **reuse existing data** and only process **new or changed rows** instead of rebuilding a table from scratch on every run. This is essential for larger datasets or frequently running pipelines. 
+ +This page explains the **concepts and configuration** of incremental models in FastFlowTransform (FFT) independently of any specific example project. + +--- + +## Why incremental models? + +By default, a model is built with a **full refresh**: + +* Read all sources +* Recompute all transformations +* Overwrite the target table + +For small tables this is fine. For anything medium-sized or larger, this quickly becomes: + +* slow, +* expensive (especially on cloud warehouses / Spark), +* and unnecessary if only a small portion of rows changed. + +Incremental models solve this by: + +1. Reusing existing target data. +2. Processing only **new / changed** rows. +3. Applying an **incremental strategy** (append or merge). + +--- + +## High-level architecture + +Incremental behaviour is coordinated between three layers: + +1. **Model configuration** + + You declare that a model is incremental and provide hints: + + * Does it append or upsert? + * What is the **unique key**? + * Which column(s) indicate freshness (e.g. `updated_at`)? + + This lives in the model’s `config(...)` (SQL) or `meta` (Python) and is validated against a strict schema. + +2. **Planner / Core** + + FFT looks at: + + * the model’s incremental config (`incremental={...}`), + * whether the physical table already exists, + * CLI flags like `--full-refresh`, + + and decides whether to: + + * run a **full rebuild**, or + * run an **incremental update** using engine hooks. + +3. 
**Engine executors** (DuckDB, Postgres, Databricks/Spark, …) + + Each engine implements a small incremental API: + + * `exists_relation(relation)` + * `create_table_as(relation, select_sql)` – initial full build + * `full_refresh_table(relation, select_sql)` – forced rebuild + * `incremental_insert(relation, select_sql)` – append-only + * `incremental_merge(relation, select_sql, unique_key)` – upsert / merge + * `alter_table_sync_schema(relation, select_sql, mode=...)` – optional schema evolution + + The planner calls these methods – you just configure the model. + +--- + +## Enabling incremental mode + +You enable incremental mode **per model** via the model config. + +### SQL models + +Inside the Jinja `config` block you use a structured `incremental` dictionary: + +```sql +{{ config( + materialized='incremental', + tags=['example:incremental', 'engine:duckdb'], + incremental={ + "enabled": true, + "strategy": "merge", # or "append", "insert", "full_refresh" + "unique_key": ["event_id"], + "updated_at_column": "updated_at" + } +) }} + +select + event_id, + updated_at, + value +from some_source +```` + +Key points: + +* `materialized='incremental'` tells FFT to use the incremental pipeline. +* `incremental.enabled: true` declares that this model supports incremental processing. +* `unique_key` declares one or more columns that uniquely identify a row in the target. +* `strategy` is a hint for how deltas should be applied (append vs merge etc.). +* `updated_at_column` (or `delta_columns`/`updated_at_columns`) tells FFT which column is used for “new vs old” comparisons (usually a timestamp or monotonically increasing surrogate). + +There is **no extra `meta={...}` wrapper** anymore – the fields of `config(...)` are validated directly. 
+ +### Python engine models + +For `@engine_model` functions you pass the same information via the `meta` parameter – but again with **top-level incremental config**, not inside another `meta` key: + +```python +from fastflowtransform import engine_model + +@engine_model( + only="duckdb", + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=["incremental", "engine:duckdb"], + meta={ + "materialized": "incremental", + "incremental": { + "enabled": True, + "strategy": "merge", + "unique_key": ["event_id"], + "updated_at_column": "updated_at", + }, + }, +) +def build(df): + # Return a frame with event_id, updated_at, value, ... + return df +``` + +The **frame you return** (pandas, Spark, etc.) is treated as the *delta dataset* for incremental processing – FFT does not care how you compute it, only about the columns and the meta. + +--- + +## Incremental strategies + +The core supports at least two conceptual strategies: + +### 1. Append / insert-only (`strategy: "append"` / `"insert"`) + +Use this when: + +* data is immutable once written, and +* new rows have strictly increasing `updated_at` / timestamp or surrogate key. + +Behaviour: + +* For the **first run**, FFT calls `create_table_as(relation, SELECT ...)`. +* For **subsequent runs**: + + * Only rows considered “new” are included in the SELECT (using your configured watermark columns). + * The executor calls `incremental_insert(relation, SELECT ...)` which typically becomes: + + ```sql + INSERT INTO target_table + SELECT ... + ``` + +Good for: + +* log/event style tables +* audit trails +* many ingestion pipelines + +### 2. Merge / upsert (`strategy: "merge"`) + +Use this when: + +* rows may change later, +* you want the target table to always reflect the **latest version** per `unique_key`. + +Behaviour: + +* For the **first run**, same as full refresh: `create_table_as`. +* For **later runs**: + + * The SELECT (or delta query, see below) produces a *delta* frame with new/updated rows. 
+ * Executor tries `incremental_merge(relation, select_sql, unique_key)`. + +Engine-specific behaviour: + +* **Databricks / Spark (Delta)** + The executor attempts a native Delta MERGE: + + ```sql + MERGE INTO target AS t + USING (SELECT ...) AS s + ON t.key1 = s.key1 AND ... + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + ``` + + If MERGE is not supported (non-Delta table), it falls back to a safe full rebuild. + +* **Other engines (DuckDB, Postgres, …)** + The executor can implement merge using: + + * `INSERT ... ON CONFLICT ... DO UPDATE` (Postgres), + * a **full-refresh emulation**: build a new version by combining old rows and delta rows and overwrite. + +In all cases, the `unique_key` list is used to match rows between existing table and delta frame. + +--- + +## Watermark / delta SQL and default behaviour + +To decide **which rows are “new enough”** for an incremental run, FFT uses the configuration you provide (for example `updated_at_column` or `delta_columns`) plus the existing table. + +A typical default pattern is: + +```sql +where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} +) +``` + +The exact SQL will vary by engine, but the core idea is: + +* Read the current maximum of your watermark column in the target. +* Select only rows strictly newer than that. + +### Overriding the delta logic + +If the default “`updated_at > max(updated_at)`” is not enough, you have a few options: + +1. **Additional delta columns** + + Use `delta_columns` / `updated_at_columns` in `incremental={...}` to indicate multiple fields that drive change detection (especially for Python incremental). + +2. 
**Inline delta SQL (`delta_sql`)** + + Provide a custom **delta SELECT** that FFT should use on incremental runs: + + ```sql + {{ config( + materialized='incremental', + incremental={ + "enabled": true, + "strategy": "merge", + "unique_key": ["event_id"], + "updated_at_column": "updated_at", + "delta_sql": " + with base as ( + select event_id, updated_at, value + from {{ ref('events_base.ff') }} + ) + select * + from base + where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} + ) + " + } + ) }} + ``` + +3. **External delta config (`delta_config`)** + + Keep the base query in the model, but put the delta SQL into a separate YAML file and reference it via `delta_config: "config/incremental/my_model.delta.yml"`. + +In all cases, FFT still delegates the **merge/insert mechanics** to the executor; you only control what qualifies as “delta”. + +--- + +## Full refresh vs incremental + +You can always force a full rebuild: + +```bash +fft run . --env dev --full-refresh +``` + +The logic is: + +* If `--full-refresh` is set → **ignore incremental** and call `full_refresh_table`. + +* Otherwise, if the model has `incremental.enabled` and the target exists: + + * attempt incremental path (`incremental_insert` / `incremental_merge`), + +* Otherwise: + + * do initial full build via `create_table_as`. + +--- + +## Schema evolution for incremental models + +Real tables evolve. To avoid incremental runs failing when the output schema changes, executors can implement: + +```python +alter_table_sync_schema(relation: str, select_sql: str, mode: str = "append_new_columns") +``` + +Typical behaviour (Spark example): + +1. Run the SELECT with `LIMIT 0` to infer the **output schema**. +2. Compare it to the existing table schema. +3. For any **new columns**: + + * issue `ALTER TABLE ... ADD COLUMNS (...)`, + * map complex types to reasonable SQL types (often defaulting to `STRING` in Spark for safety). 
+ +Modes: + +* `"append_new_columns"` – only new columns are added; existing columns are left untouched. +* `"sync_all_columns"` – more aggressive sync, may also adjust types (implementation-specific). + +For DuckDB/Postgres, the simplest implementation may be a no-op initially; more advanced engines (or future versions) can support automatic `ALTER TABLE` statements. + +--- + +## Storage overrides and Delta Lake integration + +Incremental models work with both: + +1. **Managed / catalog tables**, and +2. **Storage overrides** via `project.yml` / model config, e.g.: + + ```yaml + models: + storage: + fct_events: + path: ".local/spark/fct_events" + format: delta + ``` + +The storage layer (`fastflowtransform.storage`) provides helpers like: + +* `get_model_storage(name)` – resolve per-model `path`/`format`/`options` +* `spark_write_to_path(spark, identifier, df, storage=..., default_format=...)` + +For Spark/Delta: + +* Incremental models can be backed by **Delta files** at a fixed path. + +* The executor writes the DataFrame to a temporary directory, then atomically renames it into place and wires up: + + ```sql + CREATE TABLE `db`.`tbl` + USING DELTA + LOCATION '/path/to/model' + ``` + +* Incremental MERGE (`incremental_merge`) then runs against this Delta table. + +This keeps: + +* a stable location on disk / in the lake, +* and a proper table in the metastore/catalog. + +When the Databricks/Spark executor's `table_format` (or `FF_DBR_TABLE_FORMAT`) resolves to `delta`, +FastFlowTransform automatically pulls in `delta-spark` and configures both +`spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension` and +`spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` (unless you +already provided custom values). Install `delta-spark >= 4.0` and you can seed/run Delta-backed +models without manually adding Spark CLI flags. 
+ +--- + +## Interaction with metadata and DAG selection + +After each successful build, executors call: + +```python +on_node_built(node, relation, fingerprint) +``` + +which uses the meta helpers: + +* `ensure_meta_table(executor)` +* `upsert_meta(executor, node_name, relation, fingerprint, engine_name)` + +The `_ff_meta` table records, for each model and engine: + +* the relation name, +* the last fingerprint/hash, +* timestamps, etc. + +While this metadata is **not strictly required** for incremental mechanics, it is used for advanced features such as: + +* **state-based selection** (`--select state:modified`, etc.), +* change-aware DAG runs. + +Incremental models work together with these features: you can, for example, run only models whose source files changed and let the incremental planner update them efficiently. + +--- + +## Best practices & recommendations + +* **Always define a `unique_key`** for merge strategies. + Without a stable key, upserts can behave unpredictably. + +* **Use timestamps or monotonically increasing columns** for delta selection. + Avoid non-deterministic expressions (e.g. `now()` in your model SQL) in incremental filters. + +* **Start simple**: + + * Begin with `strategy: "append"` and a single `updated_at_column`. + * Move to `strategy: "merge"` only when you truly need updates. + +* **Test both fresh and incremental runs**: + + * First run with an empty database (initial full build). + * Then run again with new rows and verify the target grew as expected. + * Add automated tests that run the same model twice and assert row counts / contents. + +* **Use `--full-refresh` when semantics change**: + If you change the business logic of a model in a way that invalidates old rows, do a full rebuild at least once. + + + + + +# Profiles Configuration + +FastFlowTransform uses `profiles.yml` to describe how each environment connects to the execution engine (DuckDB, Postgres, BigQuery, Databricks Spark, Snowflake Snowpark, …). 
This document covers file layout, supported features, environment overrides, and loading precedence. + +## File Location + +`profiles.yml` lives at the project root (same level as `models/`, `project.yml`). The CLI loads it whenever you run `fft` commands (seed/run/test/dag/utest/docgen …). + +``` +project/ +├── models/ +├── project.yml +└── profiles.yml ``` -**Parameter semantics** -- `expr`: SQL snippet placed into `SELECT {expr} FROM {table}` (keep it engine-neutral: `sum(...)`, `count(*)`, simple filters). -- `where`: optional SQL appended as `WHERE {where}`. -- `abs_tolerance`: absolute tolerance on the difference. -- `rel_tolerance_pct`: relative tolerance in **percent**; denominator is `max(|right|, 1e-12)`. -- `min_ratio` / `max_ratio`: inclusive bounds for `left/right`. -- Coverage uses an anti-join (`source` minus `target` on the given key). The check passes if missing = 0. +## Basic Structure + +The file is parsed as YAML after optional Jinja rendering. Top-level keys represent profile “names” (e.g. `dev`, `prod`, `dev_postgres`). Each profile must include an `engine` plus engine-specific configuration. 
+ +```yaml +dev: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" + +stg: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +prod: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET') }}" + location: EU + +default: + engine: duckdb + duckdb: + path: ":memory:" +``` + +### Engines and Sections + +Supported engines and their expected sections: + +| Engine | Section | Key Fields | +|----------------------|--------------------|---------------------------------------------------| +| `duckdb` | `duckdb` | `path` (file path or `:memory:`) | +| `postgres` | `postgres` | `dsn`, `db_schema` | +| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location` | +| `databricks_spark` | `databricks_spark` | `master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `database`, `table_format`, `table_options` | +| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `db_schema`, optional `role` | + +Each profile can define its own `vars:` block (values exposed via `var('key')` inside templates). + +## Environment Variables + +`profiles.yml` supports Jinja expressions. The helper `env('FF_VAR', 'fallback')` reads process environment variables and substitutes the default if unset. Examples: + +```yaml +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'analytics') }}" +``` + +These expressions are rendered *before* YAML parsing. If the environment variable is missing and no default is provided, the expression resolves to an empty string and validation will fail with a clear error message. + +## Loading Order & Precedence + +When running `fft` commands, `_load_dotenv_layered()` loads `.env` files in ascending precedence: + +1. `<repo>/.env` +2. `<project>/.env` +3. `<project>/.env.local` +4. `<project>/.env.<env>` +5. 
`<project>/.env.<env>.local` + +Earlier values fill defaults; later files override earlier ones *only for keys that are not already defined*. **Values set in the shell (e.g. via `FF_ENGINE=duckdb fft run …`) have highest priority**—they remain untouched, even if `.env` files define the same key. + +After `.env` loading, `profiles.yml` is rendered with Jinja (using the current `os.environ`) and parsed by Pydantic. Validation ensures required fields are present for each engine and produces human-readable errors for missing DSNs, schemas, etc. + +## Selecting Profiles + +- **Via `--env` flag**: `fft run . --env dev_postgres` +- **Via `FFT_ACTIVE_ENV`**: set in shell or `.env` to choose the active profile name. +- **Legacy `FF_ENGINE`** (overrides `engine` field post-parse): useful for quick experiments but explicit `profiles.yml` entries are preferred. + +Example Makefile snippet that switches profiles without exposing secrets: + +```make +ENGINE ?= duckdb + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres +endif + +seed: + FFT_ACTIVE_ENV=$(PROFILE_ENV) uv run fft seed . --env $(PROFILE_ENV) +``` + +## Using `.env` for Secrets + +Keep sensitive credentials out of VCS by storing them in `.env` files referenced above: + +``` +examples/api_demo/ +├── .env.dev_duckdb # FF_DUCKDB_PATH=... +├── .env.dev_postgres # FF_PG_DSN=..., FF_PG_SCHEMA=... +├── .env.dev_databricks # FF_SPARK_MASTER=..., FF_SPARK_APP_NAME=... +└── profiles.yml +``` + +These files stay out of git only when `.gitignore` matches them (this repository ignores `.env.local` and `.env.*.local`, so put real secrets in the `.local` variants), while `profiles.yml` contains only non-sensitive wiring. + +## Summary of Features + +- Multiple profiles in a single YAML file. +- Jinja templating with `env()` helper for dynamic values. +- `.env` layered loading with shell overrides taking precedence. +- Validation for engine-specific parameters (clear error messages). 
+- Profile-specific `vars` exposed to Jinja `var()` function in models. 
+- Works seamlessly across CLI commands: seed, run, dag, test, docgen, utest. + +Keep `profiles.yml` declarative, `.env` files secret, and use CLI or Makefiles to select the active profile per run. This pattern scales from local DuckDB demos to production Postgres/BigQuery/Snowflake deployments. + + + + + +# Sources Configuration + +`sources.yml` declares external tables (seeds, raw inputs, lakehouse paths) that models can reference via `{{ source('group', 'table') }}`. This document covers the schema, engine overrides, file paths, and best practices. + +## File Location + +Place `sources.yml` at your project root (same level as `models/`). Example: + +``` +project/ +├── models/ +├── sources.yml +└── seeds/ +``` + +## YAML Schema (Version 2) + +FastFlowTransform expects a dbt-style structure: + +```yaml +version: 2 +sources: + - name: raw + schema: staging # default schema for this source group + overrides: + postgres: + schema: raw_main # engine-specific default override + + tables: + - name: seed_users + identifier: seed_users # optional physical name + overrides: + duckdb: + schema: main + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" +``` + +### Fields + +| Level | Field | Description | +|----------|-------------|-------------| +| source | `name` | Logical group identifier referenced by `source('name', ...)`. | +| | `schema` | Default target schema/database for the group. | +| | `database`/`catalog` | Optional qualifiers per engine (BigQuery, Snowflake). | +| | `overrides` | Map of engine → config snippet (schema overrides, formats, locations). | +| table | `name` | Logical table name (second argument in `source()`). | +| | `identifier`| Physical name; defaults to `name` if omitted. | +| | `location` | File/path location (used with `format`). | +| | `format` | Ingestion format for engines supporting path-based sources (`delta`, `parquet`, …). | +| | `options` | Dict of format options (Spark/Databricks). 
| +| | `overrides` | Additional engine-specific settings merged with source-level overrides. | + +Engine-specific overrides follow this merge order: + +1. Source defaults (`schema`, `database`, …) +2. Source-level `overrides[engine]` +3. Table-level `overrides[engine]` + +### Engine Behavior + +- **DuckDB / Postgres / BigQuery / Snowflake**: expect `identifier` (plus `schema`/`database` where relevant). Path-based sources raise errors. +- **Databricks Spark**: supports `format` + `location`. The executor registers a temp view with optional `options` (e.g. `compression`). + +### Path-Based Sources Example + +```yaml + - name: raw_events + tables: + - name: landing + overrides: + databricks_spark: + format: json + location: "abfss://landing@storage.dfs.core.windows.net/events/*.json" + options: + multiline: true +``` + +## Referencing Sources in Models + +```sql +select id, email +from {{ source('raw', 'seed_users') }} +``` + +After rendering, the executor resolves the fully-qualified relation or path depending on the active engine. + +## Seed Integration + +When combined with `seeds/schema.yml`, you can map CSV/Parquet seeds into schemas per engine: + +```yaml +targets: + raw/users: + schema: raw + schema_by_engine: + duckdb: main + postgres: staging +``` + +## Validation & Errors + +- Missing `identifier` *and* `location` produce `KeyError` during rendering. +- Unknown source/table names raise `KeyError` with suggestions. +- Unsupported path-based sources on an engine (`location` provided but no `format`) raise descriptive `NotImplementedError`. + +Keep `sources.yml` declarative, use engine overrides for schema differences, and lean on `.env` files where credentials or URIs vary per environment. + + + + + +# Project Configuration (`project.yml`) + +`project.yml` defines global metadata, documentation, variables, and data-quality tests for a FastFlowTransform project. This reference walks through the supported sections and common patterns. 
+ +## File Location + +`project.yml` lives at the root of your project. + +``` +project/ +├── models/ +├── project.yml +└── profiles.yml +``` + +## Top-Level Keys + +```yaml +name: my_project +version: "0.1" +models_dir: models # optional, defaults to "models" + +docs: + dag_dir: site/dag # output for fft dag --html + models: + users: + description: "Raw users table" + columns: + id: "Primary key" + email: "Email address" + +vars: + snapshot_day: "2024-01-01" + default_limit: 100 -**Summary output** -Each reconciliation contributes a line in the summary with a compact scope, e.g.: +tests: + - type: not_null + table: users + column: id + tags: [batch] ``` -✅ reconcile_equal orders ⇔ mart_orders_enriched (4ms) -✅ reconcile_coverage orders ⇒ mart_orders_enriched (3ms) + +### Metadata + +| Key | Description | +|-------------|-------------| +| `name` | Project identifier (used in docs/metadata). | +| `version` | Arbitrary version string. | +| `models_dir`| Relative directory containing models (`*.ff.sql` / `*.ff.py`). | + +### Documentation (`docs`) + +- `dag_dir`: where `fft dag --html` writes the static site. +- `models`: per-model descriptions and column docs surfaced in the generated DAG/docs. + +### Variables (`vars`) + +Key/value pairs accessible via `{{ var('key', default) }}` in Jinja templates. CLI overrides (`--vars key=value`) take precedence. + +### Tests (`tests`) + +Project-wide data quality checks run by `fft test`. Each test is a dict with: + +- `type`: `not_null`, `unique`, `accepted_values`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`, or reconciliation checks (`reconcile_equal`, `reconcile_diff_within`, `reconcile_ratio_within`, `reconcile_coverage`). +- `table`: target table or relation. +- `column`: required for column-based tests. +- Optional: `tags`, `severity` (`error`/`warn`), additional parameters (e.g. `values`, `min`, `max`). 
+ +Example: + +```yaml +tests: + - type: accepted_values + table: mart_users + column: status + values: [active, invited] + severity: warn + - type: reconcile_equal + name: revenue_vs_bookings + left: { table: fct_revenue, expr: "sum(amount)" } + right: { table: fct_bookings, expr: "sum(expected_amount)" } + abs_tolerance: 5.0 ``` -**Engine notes** -- DuckDB and Postgres are supported out-of-the-box. BigQuery works with simple aggregates/filters (expressions should avoid dialect-specific functions). -- For relative tolerances, the implementation guards against zero denominators with a small epsilon (`1e-12`). +## Interaction with `.env` and Profiles + +`project.yml` does not read environment variables directly. However: + +- `vars:` can reference `var('key')` defaults overridden by CLI or `.env`. +- Tests often depend on `profiles.yml` and `sources.yml` for the actual connection details. +- Makefiles may set `FFT_ACTIVE_ENV` or other `FF_*` variables influencing runs, but `project.yml` remains static. + +## Best Practices + +- Keep `project.yml` committed to version control (no secrets). +- Use `docs/` to provide richer Markdown descriptions; reference them via `columns` or `description` fields if desired. +- Organize tests by tag (`tags: [batch]`, `tags: [reconcile]`) to support selective execution: `fft test . --select tag:reconcile`. + +Refer to `docs/Data_Quality_Tests.md` for detailed test semantics and `docs/Profiles.md` for profile/env loading behavior. + + + + + +# State Selection — R1 +Build only changed nodes or select by last run results. -### Auto-Docs & Lineage +## Changed Nodes -FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) from your project: +- `state:modified` — models that have changed since last cached fingerprint. +- `state:modified+` — the above plus all downstream dependents. ```bash -# Classic -fft dag . 
--env dev --html +# First run populates cache +fft run examples/r1_demo --env dev --cache rw +# Touch files / change SQL → next run: +fft run examples/r1_demo --env dev --cache rw --select state:modified +fft run examples/r1_demo --env dev --cache rw --select state:modified+ +```` -# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) -fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json +## Result-based Selection + +Use the last `run_results.json`: + +* `result:ok` — successful models (no warnings) +* `result:warn` — successful but with warnings +* `result:fail` — alias of `result:error` +* `result:error`— failed models + +```bash +fft run examples/r1_demo --env dev --select result:error ``` -Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. +### Artifacts + +``` +examples/r1_demo/.fastflowtransform/target/ +├── manifest.json +├── run_results.json +└── catalog.json +``` -**Descriptions** can be provided in YAML (project.yml) and/or Markdown files. Markdown has higher priority. -YAML in `project.yml`: + + + +# YAML Tests (Schema-bound) + +Schema-bound tests live in `models/*.yml` or `models/**/schema.yml` and complement (or replace) `project.yml`-based tests. + +## Example ```yaml -docs: - models: - users.ff: - description: "Raw users table imported from CRM." - columns: - id: "Primary key." - email: "User email address." - users_enriched: - description: "Adds gmail flag." - columns: - is_gmail: "True if email ends with @gmail.com" +# examples/r1_demo/models/users_enriched.yml +version: 2 +models: + - name: users_enriched + description: "Adds gmail flag" + columns: + - name: id + tests: + - not_null: { severity: error } + - unique + - name: email + tests: + - not_null + - accepted_values: + values: ["a@example.com","b@example.com","c@gmail.com"] + severity: warn +```` + +### Severities + +* `error` → contributes to failures (exit code 2). 
+* `warn` → surfaced in summary as ❕, does not affect exit code. + +### Run + +```bash +fft test examples/r1_demo --env dev +# Select only tests tagged 'reconcile' (if present) +fft test examples/r1_demo --env dev --select tag:reconcile ``` -Markdown (overrides YAML if present): +### Output (excerpt) ``` -/docs/models/.md -/docs/columns//.md +Data Quality Summary +──────────────────── +✅ not_null users.id (3ms) +❌ unique users.id (2ms) + ↳ [unique] users.id: found 1 duplicate +❕ accepted_values users_enriched.email (1ms) + +Totals +────── +✓ passed: 2 +✗ failed: 1 +! warnings: 1 +``` + + + + + +# Data Quality Test Reference + +FastFlowTransform exposes a set of built-in data quality checks that you can configure in `project.yml → tests:` and execute with `fft test`. This document lists every supported test, required parameters, and example configurations. + +## Supported Test Types + +The following values are currently supported for `type`: + +- `not_null` +- `unique` +- `accepted_values` +- `greater_equal` +- `non_negative_sum` +- `row_count_between` +- `freshness` +- `reconcile_equal` +- `reconcile_ratio_within` +- `reconcile_diff_within` +- `reconcile_coverage` + +## Usage Overview + +```yaml +# project.yml +tests: + - type: not_null + table: users + column: id + severity: error # default (omit for error) + tags: [batch] + + - type: unique + table: users + column: email + tags: [batch] + + - type: accepted_values + table: users + column: status + values: [active, invited] + severity: warn # warn keeps run green on failure + + - type: greater_equal + table: orders + column: amount + threshold: 0 + + - type: non_negative_sum + table: orders + column: amount + + - type: row_count_between + table: users_enriched + min_rows: 1 + max_rows: 100000 + + - type: freshness + table: events + column: event_ts + max_delay_minutes: 30 + + - type: reconcile_equal + name: revenue_vs_bookings # optional label in summaries + tags: [reconcile] + left: { table: fct_revenue, expr: 
"sum(amount)" } + right: { table: fct_bookings, expr: "sum(expected_amount)" } + abs_tolerance: 5.0 +```` + +Every entry is a single dictionary describing one check. The common keys are: + +| Key | Description | +| ---------- | ------------------------------------------------------------------------ | +| `type` | Test kind (see list above). | +| `table` | Target table for table-level checks or display hint for reconciliations. | +| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | +| `severity` | `error` (default) or `warn`. | +| `tags` | Optional list of selectors for `fft test --select tag:...`. | +| `name` | Optional identifier surfaced in summaries (useful for reconciliations). | + +Run all configured checks: + +```bash +fft test . --env dev ``` -Optional front matter is ignored for now (title/tags may be used later). +Use `--select tag:<name>` to restrict by tags (e.g. `fft test --select tag:batch`). Tests always execute regardless of cache settings. + +Each entry produces a summary line. Failures stop the command unless `severity: warn` is set. + +## Table-Level Checks + +These checks operate on a single table (optionally filtered with `where:`). Unless noted, they require a `column` argument. + +### `not_null` + +* **Purpose:** Assert that a column never contains NULLs. +* **Parameters:** + + * `column` *(str, required)* + * `where` *(str, optional)* — SQL predicate applied before the NULL check. +* **Failure:** Reports the number of NULL rows and shows the underlying SQL. + +--- + +### `unique` + +* **Purpose:** Detect duplicates within a column. +* **Parameters:** + + * `column` *(str, required)* + * `where` *(str, optional)* +* **Failure:** Indicates how many duplicate groups were found (HAVING `count(*) > 1`) and shows a sample query. + +--- + +### `accepted_values` + +* **Purpose:** Ensure every non-NULL value is inside an allowed set. 
+* **Parameters:** -**Column lineage (heuristic, best effort).** + * `column` *(str, required)* + * `values` *(list, required)* — permitted literals (strings are quoted automatically). + * `where` *(str, optional)* — additional filter condition. +* **Behaviour note:** If `values` is omitted or an empty list, the check is treated as a no-op and always passes. The summary still shows the configured test. +* **Failure:** Shows the number of out-of-set values plus up to five sample values. -- SQL models: expressions like `col` / `alias AS out` / `upper(u.email) AS email_upper)` are parsed; - `u` must come from a `FROM ... AS u` that resolves to a relation. Functions mark lineage as *transformed*. -- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. -- You can override hints in YAML: +--- -```yaml -docs: - models: - mart_orders_enriched: - lineage: - email_upper: - from: [{ table: users, column: email }] - transformed: true -``` +### `greater_equal` -**JSON manifest** (optional via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), -and lineage per column. This is useful for custom doc portals or CI checks. +* **Purpose:** Require all values to be greater than or equal to a threshold. +* **Parameters:** -Notes: -- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. -- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. + * `column` *(str, required)* + * `threshold` *(number, default `0`)* +* **Failure:** Lists how many rows fell below the threshold. +--- +### `non_negative_sum` -## Part II – Architecture & Internals +* **Purpose:** Validate that the sum of a numeric column is not negative. +* **Parameters:** -### Architecture Overview + * `column` *(str, required)* +* **Failure:** Reports the signed sum when it is negative. 
-``` -CLI (Typer) -│ -├── Registry (core.py) -│ ├── Discover models (*.ff.sql / *.ff.py) -│ ├── Load Python models (decorator) -│ ├── Parse/validate dependencies -│ └── Jinja environment + sources.yml -│ -├── DAG (dag.py) -│ ├── topo_sort (Kahn, deterministic) -│ └── mermaid() (styled + stable IDs) -│ -├── Executors (executors/*) -│ ├── BaseExecutor (SQL rendering, dependency loading, materialization, requires guard) -│ ├── DuckExecutor (DuckDB) -│ ├── PostgresExecutor (SQLAlchemy, shims) -│ ├── BigQueryExecutor (pandas) -│ ├── BigQueryBFExecutor (BigQuery DataFrames / bigframes) -│ ├── DatabricksSparkExecutor (PySpark, without pandas) -│ └── SnowflakeSnowparkExecutor (Snowpark, without pandas) -│ -├── Testing (testing.py) -│ ├── generic _exec / _scalar -│ └── Checks: not_null, unique, row_count_between, greater_equal, non_negative_sum, freshness -│ -├── Seeding (seeding.py) -│ └── Load seeds (CSV/Parquet/SQL) → engine agnostic -│ -├── Docs (docs.py + templates/) -│ ├── Mermaid + overview table (index.html) -│ └── Model detail pages (model.html) -│ -├── Settings/Profiles (settings.py) -│ └── Pydantic v2 discriminated union + ENV overrides -│ -└── Streaming (streaming/*) - ├── FileTailSource - └── StreamSessionizer -``` +--- + +### `row_count_between` + +* **Purpose:** Guard minimum (and optional maximum) row counts for a table. +* **Parameters:** + + * `min_rows` *(int, default `1`)* — minimum expected number of rows. + * `max_rows` *(int, optional)* — omit for open-ended upper bounds. +* **Failure:** Indicates the observed row count when it falls outside `[min_rows, max_rows]`. --- -### Core Modules +### `freshness` -#### `core.py` +* **Purpose:** Warn when the latest timestamp is older than an allowed delay. +* **Parameters:** -Key data structures and the project loading process. + * `column` *(str, required)* — timestamp column. + * `max_delay_minutes` *(int, required)* — permitted staleness in whole minutes. +* **Failure:** Reports the computed lag in minutes. 
Uses: -```python -@dataclass -class Node: - name: str # logical name (stem or @model(name=...)) - kind: str # "sql" | "python" - path: Path - deps: List[str] = field(default_factory=list) + ```sql + select date_part('epoch', now() - max(column)) / 60.0 as delay_min + from <table> + ``` -class Registry: - def load_project(self, project_dir: Path) -> None: ... - def _register_node(self, node: Node) -> None: ... - def _load_py_module(self, path: Path) -> types.ModuleType: ... - def _scan_sql_deps(self, path: Path) -> List[str]: ... -``` + This is straightforward for DuckDB/Postgres; other engines may need adaptations. -**Helpers & decorator:** +## Cross-Table Reconciliations -```python -def relation_for(node_name: str) -> str: ... -def ref(name: str) -> str: ... -def source(source_name: str, table_name: str) -> str: ... +Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. The top-level `table`/`column` fields are used only for display and grouping; the actual queries are defined via the nested dictionaries. -def model(name=None, deps=None, requires=None) -> Callable[[Callable[..., Any]], Callable[..., Any]]: ... -``` +### `reconcile_equal` -**Python models (example):** +* **Purpose:** Compare two scalar expressions with optional tolerances. +* **Parameters:** -```python -@model(name="users_enriched", deps=["users.ff"], requires={"users": {"id","email"}}) -def enrich(df: pd.DataFrame) -> pd.DataFrame: ... -``` + * `left`, `right` *(dict, required)* with keys: + + * `table` *(str, required)* + * `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). + * `where` *(str, optional)* + * `abs_tolerance` *(float, optional)* — maximum absolute difference. + * `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. +* **Failure:** Displays both values, absolute and relative differences.
If no tolerance is provided, strict equality is enforced (diff must be exactly `0.0`). --- -#### `dag.py` +### `reconcile_ratio_within` -Deterministic topological sort plus Mermaid export. +* **Purpose:** Constrain the ratio `left/right` within bounds. +* **Parameters:** -```python -def topo_sort(nodes: Dict[str, Node]) -> List[str]: ... -def mermaid(nodes: Dict[str, Node]) -> str: ... -``` + * `left`, `right` *(dict, required as above)* + * `min_ratio`, `max_ratio` *(float, required)* +* **Failure:** Shows the computed ratio and expected interval. --- -#### `errors.py` +### `reconcile_diff_within` -Primary error types with helpful messages. +* **Purpose:** Limit the absolute difference between two aggregates. +* **Parameters:** -```python -class FastFlowTransformError(Exception): ... -class ModuleLoadError(FastFlowTransformError): ... -class DependencyNotFoundError(FastFlowTransformError): ... -class ModelCycleError(FastFlowTransformError): ... -class TestFailureError(FastFlowTransformError): ... -``` + * `left`, `right` *(dict, required)* + * `max_abs_diff` *(float, required)* +* **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. --- -#### Executors +### `reconcile_coverage` -Shared logic (`BaseExecutor`) plus engine implementations. +* **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero). +* **Parameters:** -```python -class BaseExecutor(ABC): - def render_sql(self, node: Node, env: Environment, ref_resolver=None, source_resolver=None) -> str: ... - def run_python(self, node: Node) -> None: ... - @abstractmethod - def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: ... - @abstractmethod - def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: ... + * `source` *(dict, required)* — must contain: + + * `table` *(str)* — source table. + * `key` *(str)* — key column in the source. 
+ * `target` *(dict, required)* — must contain: + + * `table` *(str)* — target table. + * `key` *(str)* — key column in the target. + * `source_where` *(str, optional)* — filter applied to the source. + * `target_where` *(str, optional)* — filter applied to the target. +* **Failure:** Reports the number of missing keys. + +## Severity & Tags + +* `severity: error` (default) makes failures stop the test run with exit code 1. +* `severity: warn` records the result but keeps the run successful. +* `tags:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. + +## CLI Summary Output + +Each executed check produces a line in the summary: + +```text +✓ not_null users.email (3ms) +✖ accepted_values events.status values=['new', 'active'] (warn) ``` -**DuckDB (`duckdb_exec.py`)** +Failures include the generated SQL (where available) to simplify debugging. Use `fft test --verbose` for more detail, or `FFT_SQL_DEBUG=1` to log the underlying queries. -- `run_sql(node, env)` renders Jinja (`ref/source`) and executes the SQL. -- `_read_relation` loads a table as `DataFrame`; surfaces actionable errors when a dependency is missing. -- `_materialize_relation` writes the `DataFrame` as a table (`create or replace table ...`). +## Further Reading -**Postgres (`postgres_exec.py`)** +* `docs/YAML_Tests.md` – schema for YAML-defined tests and advanced scenarios. +* `fft test --help` — command-line switches, selectors, and cache options. -- `_SAConnShim` (compatible with `testing._exec`). -- `run_sql` renders SQL and rewrites `CREATE OR REPLACE TABLE` to `DROP + CREATE AS`. -- `_read_relation` uses pandas, handles schemas, and provides clear guidance. -- `_materialize_relation` writes via `to_sql(if_exists="replace")`. -**BigQuery / BigQuery DataFrames / Spark / Snowpark** -- Identical signatures; IO uses the respective native dataframes (no pandas for Spark/Snowpark). 
+ ---- +# CLI Guide -#### `validation.py` +FastFlowTransform’s CLI is the entry point for seeding data, running DAGs, generating docs, syncing metadata, and executing quality tests. This guide summarizes the day-to-day commands and how they fit together. See `src/fastflowtransform/cli.py` for Typer definitions. -Required-column checks for Python models (single and multi dependency). +## Core Commands -```python -class RequiredColumnsError(ValueError): ... -def validate_required_columns(node_name: str, inputs: Any, requires: dict[str, set[str]]): ... -``` +| Command | Purpose | +|---------|---------| +| `fft seed [--env dev]` | Materialize CSV/Parquet seeds into the configured engine. | +| `fft run [--env dev]` | Execute the DAG (obeys cache + parallel flags). | +| `fft dag --html` | Render the DAG graph/site for quick inspection. | +| `fft docgen --out site/docs` | Generate the full documentation bundle (graph + model pages + optional JSON). | +| `fft test [--env dev]` | Run schema/data-quality tests defined in `project.yml` or schema YAML files. | +| `fft utest ` | Execute unit tests defined under `tests/unit/*.yml`. | +| `fft sync-db-comments ` | Push model/column descriptions into Postgres or Snowflake comments. | ---- +Use `--select` to scope `run`, `dag`, or `test` commands (e.g. `state:modified`, `tag:finance`, `result:error`). Environment overrides rely on the selected profile in `profiles.yml` or the `FF_*` variables. -#### `testing.py` +## HTTP/API Helpers -Minimal data quality framework (engine agnostic via `_exec`). +Python models can make HTTP calls via `fastflowtransform.api.http`. When you need examples, head over to `docs/Api_Models.md` for `get_json`, `get_df`, pagination helpers, caching, and offline modes. -**Checks:** `not_null`, `unique`, `greater_equal`, `non_negative_sum`, `row_count_between`, `freshness` +## DAG & Documentation -```python -class TestFailure(Exception): ... -def _exec(con: Any, sql: Any): ... 
-def _scalar(con: Any, sql: Any): ... -``` +- Narrow the graph with `fft dag ... --select ` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini-site under `/docs/index.html`. +- Control schema introspection via `--with-schema/--no-schema`. Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). +- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. ---- +## Sync Database Comments -#### `docs.py` & Templates +`fft sync-db-comments --env ` pushes model and column descriptions from project YAML or Markdown into database comments. The command currently supports Postgres and Snowflake Snowpark: -- `render_site(out_dir, nodes)` produces `index.html` plus `model.html` per model. -- Templates (`docs/templates/`) include dark mode, filters, copy buttons, legend. -- Uses `dag.mermaid(nodes)` for the graph. +- Start with `--dry-run` to review the generated `COMMENT` statements. +- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). +- Snowflake reuses the session or connection exposed by the executor. ---- +If no descriptions are found, the command exits without making changes. -#### `seeding.py` -Engine-agnostic seed loading (CSV/Parquet/SQL). -```python -def seed_project(project_dir: Path, executor, schema: Optional[str] = None) -> int: ... -``` + ---- +# Auto-Docs & Lineage -### CLI Implementation +FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) plus an optional JSON manifest for external tooling. -Operational usage lives in [CLI Flows](#cli-flows). This section drills into the Typer command definitions in `cli.py`. +## Commands -**Commands:** +```bash +# Classic +fft dag . 
--env dev --html -- `fft run [--env dev] [--engine ...]` -- `fft dag [--env dev] [--html] [--select ...] [--with-schema/--no-schema]` -- `fft docgen [--env dev] [--out dir] [--emit-json path] [--open-source]` -- `fft test [--env dev] [--select batch|streaming|tag:...]` -- `fft seed [--env dev]` -- `fft sync-db-comments [--env dev] [--dry-run]` -- `fft utest [--env dev] [--cache off|ro|rw] [--reuse-meta]` -- `fft --version` +# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) +fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json +``` -**Key components:** +Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. -```python -def _load_project_and_env(project_arg) -> tuple[Path, Environment]: ... -def _resolve_profile(env_name, engine, proj) -> tuple[EnvSettings, Profile]: ... -def _get_test_con(executor: Any) -> Any: ... +## Descriptions + +Descriptions can be provided in YAML (`project.yml`) and/or Markdown files. Markdown has higher priority. + +YAML in `project.yml`: + +```yaml +docs: + models: + users.ff: + description: "Raw users table imported from CRM." + columns: + id: "Primary key." + email: "User email address." + users_enriched: + description: "Adds gmail flag." + columns: + is_gmail: "True if email ends with @gmail.com" ``` -**Test summary (exit 2 on failures):** +Markdown overrides YAML when present: ``` -Data Quality Summary -──────────────────── -✅ not_null users.email (3ms) -❌ unique users.id (2ms) - ↳ users.id has 1 duplicate +/docs/models/.md +/docs/columns//.md +``` -Totals -────── -✓ passed: 1 -✗ failed: 1 +Optional front matter is ignored for now (title/tags may be used later). + +## Column Lineage + +- SQL models: expressions like `col`, `alias AS out`, `upper(u.email) AS email_upper` are parsed; `u` must come from a `FROM ... AS u` clause that resolves to a relation. Functions mark lineage as *transformed*.
+- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. +- Override hints in YAML when the heuristic is insufficient: + +```yaml +docs: + models: + mart_orders_enriched: + lineage: + email_upper: + from: [{ table: users, column: email }] + transformed: true ``` ---- +## JSON Manifest + +The optional manifest (via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), and lineage per column—useful for custom doc portals or CI checks. + +## Notes + +- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. +- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. + -### Settings Infrastructure -`settings.py` uses a **Pydantic v2 discriminated union** (`engine` as discriminator) plus ENV overrides. + -Profile types: -- `DuckDBProfile(engine="duckdb", duckdb: {path})` -- `PostgresProfile(engine="postgres", postgres: {dsn, db_schema})` -- `BigQueryProfile(engine="bigquery", bigquery: {project?, dataset, location?, use_bigframes?})` -- `DatabricksSparkProfile(engine="databricks_spark", ...)` -- `SnowflakeSnowparkProfile(engine="snowflake_snowpark", ...)` +# Logging & Verbosity -Resolver idea: +FastFlowTransform exposes uniform logging controls across all CLI commands plus a dedicated SQL debug channel for tracing rendered SQL, dependency loading, and auxiliary queries. -```python -def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profile: ... -``` +## CLI Flags ---- +- `-q` / `--quiet` → only errors (`ERROR`) +- *(default)* → concise warnings (`WARNING`) +- `-v` / `--verbose` → progress/info (`INFO`) +- `-vv` → full debug (`DEBUG`) including SQL debug output -### Streaming Components +`-vv` automatically flips on the SQL debug channel (same effect as `FFT_SQL_DEBUG=1`). 
-**`streaming/sessionizer.py`** +## SQL Debug Channel -- Normalizes events (JSONL / batch DF) and writes `fct_sessions_streaming`. -- `process_batch(df)` aggregates sessions (start/end, pageviews, revenue). +Enable it to inspect Python-model inputs, dependency columns, and helper SQL emitted by data-quality checks: -**Smoke test (DuckDB):** +```bash +# full debug (recommended) +fft run . -vv -```python -def test_stream_sessionizer_produces_sessions(): ... +# equivalent using the env var (legacy behaviour retained) +FFT_SQL_DEBUG=1 fft run . ``` ---- - -### Mini End-to-End Example (Python API) +## Usage Patterns -```python -from pathlib import Path -from jinja2 import Environment, FileSystemLoader -from fastflowtransform.core import REGISTRY -from fastflowtransform.dag import topo_sort -from fastflowtransform.executors.duckdb_exec import DuckExecutor +```bash +fft run . -q # quiet (errors only) +fft run . # default (concise) +fft run . -v # verbose progress (model names, executor info) +fft run . -vv # full debug + SQL channel +``` -proj = Path("examples/simple_duckdb").resolve() -REGISTRY.load_project(proj) -env = REGISTRY.env # Jinja env from the registry load +## Parallel Logging UX -order = topo_sort(REGISTRY.nodes) -ex = DuckExecutor(db_path=str(proj / ".local" / "demo.duckdb")) +- Each node emits start/end lines with duration, truncated name, and engine abbreviation (DUCK/PG/BQ/…). +- Output remains line-stable via a thread-safe log queue; per-level summaries trail each run. +- Failures still surface the familiar “error block” per node for quick diagnosis. -for name in order: - node = REGISTRY.nodes[name] - if node.kind == "sql": - ex.run_sql(node, env) - else: - ex.run_python(node) +**Notes** -print("✓ Done") -``` +- SQL debug output routes through the `fastflowtransform.sql` logger; use `-vv` or `FFT_SQL_DEBUG=1` to reveal it. +- Existing projects do not need changes: the environment variable keeps working even without `-vv`. ---- -Need a different angle? 
Head back to the [Docs Hub](./index.md) or deep-dive into the [Modeling Reference](./Config_and_Macros.md). + +# Model Unit Tests (`fft utest`) - +`fft utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. -# API Calls in Python Models +## Cache Modes -> **Status:** Experimental but stable for demos and smaller workflows. -> **Goal:** Query HTTP APIs from Python models, return responses as DataFrames, cache and instrument them cleanly, and support reproducible offline runs. +`fft utest --cache {off|ro|rw}` (default: `off`) -* [Motivation](#motivation) -* [Quickstart](#quickstart) -* [Programming API](#programming-api) - * [`get_json`](#get_json) - * [`get_df`](#get_df) - * [Pagination](#pagination) - * [Context & Telemetry](#context-telemetry) -* [CLI Flags & Environment Variables](#cli-flags-environment-variables) -* [Example Model](#example-model) -* [Artifacts](#artifacts) -* [Tests & Offline Demos](#tests-offline-demos) -* [Best Practices](#best-practices) -* [Troubleshooting](#troubleshooting) -* [Security & Compliance](#security-compliance) -* [FAQ](#faq) +- `off`: deterministic, never skips. +- `ro`: skip on cache hit; on miss, build but **do not write** cache. +- `rw`: skip on hit; on miss, build **and write** fingerprint. ---- +Notes: -## Motivation +- UTests key the cache with `profile="utest"`. +- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. +- `--reuse-meta` is currently a reserved flag: exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. -Many pipelines need small, reliable API fetchers: configuration tables, miniature dimensions, feature flags, SaaS exports. This feature provides: +## Why Use UTests? 
-- Simple HTTP calls inside Python models -- File-backed cache (reproducible builds, works offline) -- Per-node telemetry (requests, hits, bytes, hashes) -- CLI switches `--offline` and `--http-cache` for reproducible runs +- Fast feedback on transformation logic without full DAG runs. +- Small, reproducible fixtures (rows inline or external CSV). +- Engine-agnostic: swap DuckDB/Postgres to spot dialect differences. ---- +## Folder Layout -## Quickstart +Specs live under `/tests/unit/*.yml` relative to the project root (the directory passed to the CLI that contains `models/`): -1. **Optionally enable flags** (recommended): +``` +your-project/ +├── models/ +│ ├── users.ff.sql +│ ├── users_enriched.ff.py +│ └── mart_users.ff.sql +└── tests/ + └── unit/ + ├── users_enriched.yml + └── mart_users.yml +``` - ```bash - # No network - cache hits only - fft run . --env dev --offline - # Cache mode - fft run . --env dev --http-cache rw # rw|ro|off - ``` +## YAML DSL (with `defaults`) -2. **Write a Python model**: +Each file targets one logical node (the DAG name). Defaults are deep-merged into every case so you can share inputs/expectations and override per scenario. - ```python - # models/users_from_api.ff.py - import pandas as pd - from fastflowtransform.core import model - from fastflowtransform.api.http import get_df +```yaml +# tests/unit/users_enriched.yml +model: users_enriched - @model(name="users_from_api", deps=["users.ff"]) - def fetch(_: pd.DataFrame) -> pd.DataFrame: - df = get_df( - url="https://api.example.com/users", - params={"page": 1}, - record_path=["data"], # JSON -> list -> DataFrame - ) - return df - ``` +defaults: + inputs: + users: + rows: + - {id: 1, email: "a@example.com"} + - {id: 2, email: "b@gmail.com"} + expect: + relation: users_enriched + order_by: [id] -3. 
**Run it**: +cases: + - name: basic_gmail_flag + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} - ```bash - fft run . --env dev --select users_from_api - ``` + - name: override_inputs + inputs: + users: + rows: + - {id: 3, email: "c@hotmail.com"} + - {id: 4, email: "d@gmail.com"} + expect: + rows: + - {id: 3, email: "c@hotmail.com", is_gmail: false} + - {id: 4, email: "d@gmail.com", is_gmail: true} +``` ---- +SQL models use the file stem (including `.ff`) as `model`. Provide expected relation names that match the materialized table/view: -## Programming API +```yaml +# tests/unit/mart_users.yml +model: mart_users.ff -> Module: `fastflowtransform.api.http` +defaults: + inputs: + users_enriched: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} + expect: + relation: mart_users + order_by: [id] -### `get_json` +cases: + - name: passthrough_columns + expect: + rows: + - {id: 1, email: "a@example.com", is_gmail: false} + - {id: 2, email: "b@gmail.com", is_gmail: true} +``` -```python -from fastflowtransform.api.http import get_json +For multi-dependency models, include every physical relation name (what `relation_for(dep)` returns): -data = get_json( - url="https://api.example.com/objects", - params={"page": 1}, # optional - headers={"Authorization": "Bearer ..."}, # optional - timeout=20, # optional (seconds) -) -# -> Python dict / list +```yaml +model: mart_orders_enriched +defaults: + inputs: + users_enriched: + rows: + - {id: 1, email: "x@gmail.com", is_gmail: true} + orders: + rows: + - {order_id: 10, user_id: 1, amount: 19.9} + - {order_id: 11, user_id: 1, amount: -1.0} +cases: + - name: join_and_flag + expect: + any_order: true + rows: + - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} + - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} 
``` -**Behavior** +## Input Formats -- Reads from the local cache (when present and valid). -- Writes to the cache (`rw` mode), including the response body. -- Respects offline mode (no network traffic). +- `rows`: inline dictionaries per row. +- `csv`: reference a CSV file (relative paths allowed). -### `get_df` +Keys under `inputs` are physical relations; use `relation_for('users.ff')` if unsure. -```python -from fastflowtransform.api.http import get_df +## Expected Output & Comparison -df = get_df( - url="https://api.example.com/users", - params={"page": 1}, - record_path=["data"], # path to the JSON list - normalize=True, # optional: flatten nested objects - paginator=None, # optional: pagination strategy (see below) - output="pandas", # pandas|spark (default=pandas) -) -# -> pandas.DataFrame +- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`). +- Ordering: `order_by: [...]` or `any_order: true`. +- Columns: `ignore_columns: [...]`, `subset: true`. +- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` + (numbers can be plain `1e-9` or quoted; they are cast to float). + +## Running UTests + +```bash +fft utest . # discover all specs +fft utest . --env dev # use a specific profile +fft utest . --model users_enriched +fft utest . --model mart_orders_enriched --case join_and_flag +fft utest . --path tests/unit/users_enriched.yml ``` -**Conversion** +Override the executor for all specs (ensure credentials/DSNs are set): -- Default: `record_path` points to the array payload (for example `["data"]`). -- `normalize=True` delegates to `json_normalize` for deeper structures. -- `output='spark'` (plus an optional `session=SparkSession`) converts the normalized result into a `pyspark.sql.DataFrame`. Additional backends will reuse the same parameter. +```bash +export FF_PG_DSN="postgresql+psycopg://postgres:postgres@localhost:5432/ffdb" +export FF_PG_SCHEMA="public" +fft utest . 
--engine postgres +``` + +Executor precedence (highest → lowest): CLI `--engine`, YAML `engine:` (optional), `profiles.yml`, environment overrides. -### Pagination +## Design Notes -For paged APIs you can describe the next request declaratively: +- Only the target model runs; supply all upstream relations the model expects. +- `defaults` deep-merge: dicts merge, lists/scalars overwrite. +- Results compare as DataFrames with configurable order, subsets, ignored columns, and numeric tolerances. +- Exit codes: `0` for success, `2` when at least one case fails (compact CSV-style diff is printed). -```python -def paginator(url: str, params: dict | None, json_obj: dict): - next_url = json_obj.get("next") # e.g. absolute URL - if next_url: - return {"next_request": {"url": next_url}} - return None +## CI Example -df = get_df( - "https://api.example.com/users?page=1", - paginator=paginator, - record_path=["data"], -) +```yaml +name: utests +on: [push, pull_request] +jobs: + duckdb: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: "3.11" } + - run: pip install -e . + - run: fft utest . --env dev ``` -The paginator may return the following fields: - -- `{"next_request": {"url": "...", "params": {...}, "headers": {...}}}` - (any missing field keeps its previous value) +For Postgres, add a service container and run `fft utest . --engine postgres` with `FF_PG_DSN` / `FF_PG_SCHEMA`. -### Context & Telemetry -During a model run the executor collects telemetry per node and writes it into `run_results.json`: -- `requests` (count) -- `cache_hits` -- `bytes` (sum of response bodies) -- `used_offline` (bool) -- `keys` (cache keys) -- `entries` (optional compact array with URL, status, content hash) + -You will find these metrics under the `http` block of each node (see [Artifacts](#artifacts)). +# Troubleshooting & Error Codes ---- +Use this checklist when FastFlowTransform commands misbehave. 
Each item points to the quickest fix plus the relevant CLI options. -## CLI Flags & Environment Variables +## Quick Fixes -**CLI** +- **DuckDB seeds not visible** → ensure `FF_DUCKDB_PATH` (or the profile path) is identical for `seed`, `run`, `dag`, and `test`. If you configure `FF_DUCKDB_SCHEMA` / `FF_DUCKDB_CATALOG`, keep them consistent across commands so unqualified references resolve to the right namespace. +- **Postgres connection refused** → confirm `FF_PG_DSN`, container status (`docker ps`), and that port `5432` is open. +- **BigQuery permissions** → set `GOOGLE_APPLICATION_CREDENTIALS` and match dataset/location to your profile. +- **HTML docs missing** → run `fft dag --html` and open `/docs/index.html`. +- **Unexpected test failures** → inspect rendered SQL in CLI output, refine selection via `--select`, refresh seeds if needed. +- **Dependency table not found in utests** → provide all physical upstream relations in the YAML spec. -- `--offline` - Sets `FF_HTTP_OFFLINE=1`; network requests are blocked, **cache hits only**. -- `--http-cache {off|ro|rw}` - Sets `FF_HTTP_CACHE_MODE`: +## Error Codes - - `off`: neither read nor write. - - `ro`: read-only (hits), **no** writes. - - `rw`: read and write (default). 
+| Type | Class/Source | Exit | Notes | +|---------------------------|---------------------------|------|---------------------------------------------------------| +| Missing dependency | `DependencyNotFoundError` | 1 | Per-node list; tips for `ref()` / names | +| Cycle in DAG | `ModelCycleError` | 1 | “Cycle detected among nodes: …” | +| Model execution (KeyError)| `cli.py` → formatted block| 1 | Inspect columns, use `relation_for(dep)` as keys | +| Data quality failures | `cli test` → summary | 2 | Totals section prints passed/failed counts | +| Unknown/unexpected | generic | 99 | Optional trace via `FFT_TRACE=1` | -**Environment (optional to set directly)** +Error types map to the classes documented in `docs/Technical_Overview.md#core-modules` and the CLI source. -| Variable | Default | Effect | -| ------------------------ | ------------------------------- | ----------------------------------- | -| `FF_HTTP_OFFLINE` | `0` | `1/true/on` -> offline mode | -| `FF_HTTP_CACHE_MODE` | `rw` | `off` / `ro` / `rw` | -| `FF_HTTP_CACHE_DIR` | `.fastflowtransform/http_cache` | Cache directory | -| `FF_HTTP_TTL` | `0` | Seconds; 0 = never expires | -| `FF_HTTP_TIMEOUT` | `20` | Request timeout (seconds) | -| `FF_HTTP_MAX_RETRIES` | `3` | Basic retry count | -| `FF_HTTP_RATE_LIMIT_RPS` | `0` | Requests per second (0 = unlimited) | ---- -## Example Model + -```python -# models/dim_countries_from_api.ff.py -import pandas as pd -from fastflowtransform.core import model -from fastflowtransform.api.http import get_df +# Basic Demo Project -@model(name="dim_countries_from_api", deps=["users.ff"]) -def countries(_: pd.DataFrame) -> pd.DataFrame: - def pager(u, p, js): - nxt = js.get("paging", {}).get("next") - return {"next_request": {"url": nxt}} if nxt else None +The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, and Databricks Spark. 
- df = get_df( - url="https://api.example.com/countries?page=1", - paginator=pager, - record_path=["data"], - normalize=True, - ) - # lightweight post-processing - if "code" in df.columns: - df["code"] = df["code"].str.upper() - return df -``` +## Why it exists +- **Start small** – demonstrate the minimum folder structure (`seeds/`, `models/`, `profiles.yml`) needed to run `fft`. +- **Engine parity** – prove that a single project can target multiple engines by swapping profiles. +- **Understand outputs** – show where documentation and manifests land after a run. -Run: +Use it as a sandbox before adding your own sources, macros, or Python models. -```bash -fft run . --env dev --select dim_countries_from_api --http-cache ro -``` +## Project layout ---- +| Path | Purpose | +|------|---------| +| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as `crm.users`. | +| `models/staging/users_clean.ff.sql` | Normalizes emails, casts types, and tags the model for all engines. | +| `models/marts/mart_users_by_domain.ff.sql` | Aggregates users per email domain and records the first/last signup dates. | +| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models (pandas for DuckDB/Postgres, PySpark for Databricks) selecting the most recent signup per domain from the staging view. | +| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, and `dev_databricks` profiles driven by environment variables. | +| `.env.dev_*` | Template environment files you can `source` per engine. | +| `Makefile` | One command (`make demo ENGINE=…`) to seed, run, document, test, and preview results. | -## Artifacts +## Running the demo -`/.fastflowtransform/target/run_results.json` (excerpt): +1. `cd examples/basic_demo` +2. Choose an engine and export its environment variables: + ```bash + set -a; source .env.dev_duckdb; set +a + # swap to .env.dev_postgres or .env.dev_databricks for other engines + ``` +3. 
Execute the full flow: + ```bash + make demo ENGINE=duckdb + ``` + The Makefile runs `fft seed`, `fft run`, `fft dag`, `fft test`, and `fft show basic_demo.mart_users_by_domain`. To preview the Python mart, run `make show ENGINE=duckdb SHOW_MODEL=mart_latest_signup` (or swap `ENGINE` as needed). +4. Inspect artifacts: + - `.fastflowtransform/target/manifest.json` and `run_results.json` + - `site/dag/index.html` for the rendered model graph + - CLI output from `fft show` displaying the aggregated mart -```json -{ - "results": [ - { - "name": "dim_countries_from_api", - "status": "success", - "duration_ms": 153, - "http": { - "requests": 2, - "cache_hits": 2, - "bytes": 1842, - "used_offline": true, - "keys": ["GET:https://api.example.com/countries?page=1|{}|{}", "..."], - "entries": [ - {"url": "https://api.example.com/countries?page=1", "status": 200, "content_hash": "sha256:..."}, - {"url": "https://api.example.com/countries?page=2", "status": 200, "content_hash": "sha256:..."} - ] - } - } - ] -} -``` +The demo also enables baseline data quality checks in `project.yml`. Running `fft test` (or `make test`) verifies that primary keys remain unique/not-null across `seed_users`, `users_clean`, `mart_users_by_domain`, and the Python mart, while ensuring aggregate metrics such as `user_count` never drop below zero and each domain appears only once in `mart_latest_signup`. -> Note: When a node is **skipped** (fingerprint cache hit), no new `http` block is emitted - the model did not run. +## Next steps ---- +- Add more CSVs under `seeds/` and declare them in `sources.yml`. +- Create additional staging models so marts can reuse normalized data. +- Introduce Python models or macros mirroring how the API demo scales up. +- Update `.env.dev_*` with real credentials once you connect to shared databases. -## Tests & Offline Demos -- Place unit tests under `tests/api/...` and seed the cache directly (no real HTTP calls). 
-- Suggested scenarios: - - **Offline hit:** set `FF_HTTP_OFFLINE=1`, seed the cache, `get_json/get_df` must succeed. - - **Cache mode `off`:** even with cache entries, **no** reads; expect a failure in offline mode. - - **`ro`:** allow read hits; **no** cache writes after a real or mocked request. - - **Pagination:** stitch several pages from offline fixtures; telemetry should count requests/hits. + ---- +# Materializations Demo -## Best Practices +> This example shows how different **materializations** (`view`, `table`, `incremental`, `ephemeral`) behave in FastFlowTransform. -- **Stable URLs and parameter order** produce identical cache keys and reproducible builds. -- **Keep `record_path` shallow**; use `normalize=True` only when necessary (performance). -- **Never cache secrets:** provide tokens via headers; the response body and metadata are cached. -- **Use `--offline` in CI** for deterministic tests with a pre-seeded cache. -- **Set TTL intentionally** when APIs change frequently. -- **Scope engine-specific variants** with `engine_model(only=...)` so each execution backend registers only the models it can run (pair with SQL `config(engines=[...])` when duplicating logical names). +The demo models are located under: +``` ---- +examples/materializations_demo/models/ -## Troubleshooting +```` -- **“offline + cache miss”** - Seed the cache (see tests) or disable offline mode. -- **“Schema mismatch”** - Harmonize columns after `get_df` (types, missing keys). -- **“Too many requests”** - Configure `FF_HTTP_RATE_LIMIT_RPS`; make pagination more efficient (larger `page_size`). -- **“No http block”** - Was the node **skipped** (fingerprint cache)? Or did the model avoid HTTP calls altogether? +Each model type demonstrates how FastFlowTransform builds, caches, or executes models differently depending on its `materialized:` configuration. --- -## Security & Compliance - -- **Do not commit secrets** - use environment variables or a secret manager. 
-- **PII/GDPR:** verify whether the API returns personal data; minimise retention. -- **Cache directory:** keep it in `.gitignore`; encrypt or isolate it if necessary. +## 🧩 1. View Models ---- +A **view** model is always re-created from scratch each run. +It defines a virtual relation that doesn’t store data permanently — ideal for lightweight transformations. -## FAQ +```sql +{{ config(materialized='view') }} -**Q:** Can I call other libraries (for example `requests`, `httpx`) directly? -**A:** Yes, but you lose telemetry and caching. The recommended entrypoint is `fastflowtransform.api.http`. +select + order_id, + customer_id, + total_amount, + order_date +from {{ ref('stg_orders') }} +```` -**Q:** How do I add custom headers (for example OAuth)? -**A:** Pass `headers={...}`. Store sensitive values in env vars and inject them into your models. +**Characteristics** -**Q:** Does this work for POST requests? -**A:** Release R1 focuses on GET. Please open an issue for POST/PUT support; the design can be extended. +* Rebuilt each run (no persisted data) +* Useful for staging, joins, and intermediate logic +* Fast and always up-to-date with upstreams +* Cannot store or cache incremental state --- -**See also:** - -- Technical guide: *Developer Guide – Architecture & Internals* -- Unit tests: `tests/api/test_http_*.py` -- Runtime & cache: *Parallelism & Cache (v0.3)* +## 🧱 2. Table Models +A **table** model materializes into a physical table on the target engine. +```sql +{{ config(materialized='table') }} - +select * +from {{ ref('fct_orders_view') }} +``` -# FastFlowTransform Modeling Reference (v0.1) +**Characteristics** -> Authoritative reference for FastFlowTransform’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. -> Works with FastFlowTransform v0.1 (T1–T11). Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. 
-> **Execution & Cache (v0.3) quick notes** -> - Parallelism is level-wise; use `fft run --jobs N`. -> - Use `--cache={off|ro|rw|wo}` to control skipping behavior. -> - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. -> - Change any of these → downstream nodes rebuild. -> - `--rebuild ` forces rebuilding selected models (ignores cache). +* Fully rebuilt every run +* Good for final curated datasets or small tables +* Overwrites previous contents (atomic replace) +* Compatible with all engines (DuckDB, Postgres, BigQuery, etc.) +--- -For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [Technical Overview](./Technical_Overview.md). This document focuses purely on how you author and test models. +## ⚡ 3. Incremental Models ---- +An **incremental** model stores state and only updates changed records on subsequent runs. -## Docs Navigation -1. [Getting Started](./index.md) -2. [User Guide](./Technical_Overview.md#part-i-operational-guide) -3. **Modeling Reference** — you are here (`Config_and_Macros.md`) -4. [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) +```sql +{{ config( + materialized='incremental', + incremental={ + "enabled": true, + "unique_key": "order_id", + "updated_at_column": "updated_at", + "delta_sql": "select * from {{ ref('stg_orders') }} where updated_at > (select max(updated_at) from {{ this }})" + } +) }} +``` ---- +**Characteristics** -## Table of Contents +* Persists data between runs +* Only merges new or changed rows +* Significantly faster for large tables +* Requires `unique_key` and (optionally) an `updated_at_column` +* Schema changes can be managed via: -- [Docs Navigation](#docs-navigation) -- [1. 
Model Fundamentals](#1-model-fundamentals) - - [1.1 SQL models (`*.ff.sql`)](#11-sql-models-ffsql) - - [1.2 Python models (`*.ff.py`)](#12-python-models-ffpy) - - [1.3 Seeds, sources, and dependencies](#13-seeds-sources-and-dependencies) -- [2. `config()` options](#2-config-options) -- [3. Variables with `var()`](#3-variables-with-var) -- [4. Template context & helpers](#4-template-context-helpers) -- [5. Macros & reusable Jinja code](#5-macros-reusable-jinja-code) -- [6. Materialization semantics](#6-materialization-semantics) -- [7. Testing & quality gates](#7-testing-quality-gates) -- [8. Quick cheat sheet](#8-quick-cheat-sheet) + * `on_schema_change: "ignore"` + * `on_schema_change: "append_new_columns"` + * `on_schema_change: "sync_all_columns"` ---- +**Behavior example:** -## 1. Model Fundamentals +| Run | Operation | Rows affected | +| --- | ----------- | ------------- | +| 1 | full load | 10,000 | +| 2 | merge delta | 120 | +| 3 | merge delta | 45 | -FastFlowTransform discovers models under `/models/` with two primary flavours: +--- -### 1.1 SQL models (`*.ff.sql`) +## 🧮 4. Ephemeral Models -- File stem defines the logical DAG node (`users.ff.sql` → `users.ff`). -- Jinja template rendered with FastFlowTransform context (helpers like `ref`, `source`, `var`, `config`, `this`). -- Output relation defaults to the stem without `.ff` (configurable via `config(alias=...)` if supported in future releases). +An **ephemeral** model exists only during query compilation. +It never creates a physical table or view — it’s inlined wherever it’s referenced. ```sql --- models/users.ff.sql -{{ config(materialized='table', tags=['staging']) }} -create or replace table users as -select id, email -from {{ source('crm', 'users') }}; -``` +{{ config(materialized='ephemeral') }} -### 1.2 Python models (`*.ff.py`) - -Use the `@model` decorator from `fastflowtransform.core` to register a callable. 
The decorator accepts: +select + order_id, + total_amount * 0.1 as tax_amount +from {{ ref('fct_orders_inc') }} +``` -- `name` (optional) → overrides the logical name (defaults to stem). -- `deps` → list of dependency nodes (file stems or logical names). -- `requires` → column contract per dependency (validated via `validation.validate_required_columns`). +**Characteristics** -Dependencies determine the call signature: +* Inlined into parent queries +* Reduces I/O overhead (no temporary tables) +* Ideal for lightweight reusable SQL snippets +* Not visible in the warehouse after execution -- Single dependency → function receives a single `pandas.DataFrame`. -- Multiple dependencies → function receives `dict[str, pandas.DataFrame]` keyed by physical relation name (e.g. `"users"`). +--- -```python -# models/users_enriched.ff.py -from fastflowtransform.core import model -import pandas as pd +## 🔗 5. Combined Example DAG -@model( - name="users_enriched", - deps=["users.ff"], - requires={"users": {"id", "email"}} -) -def enrich(df: pd.DataFrame) -> pd.DataFrame: - out = df.copy() - out["is_gmail"] = out["email"].str.endswith("@gmail.com") - return out +In the demo, these models are connected as follows: + +```text +stg_orders + ↓ +fct_orders_view (view) + ↓ +fct_orders_tbl (table) + ↓ +fct_orders_inc (incremental) + ↓ +fct_orders_ephemeral (ephemeral) ``` -#### Engine-scoped registration +This DAG demonstrates: -When the same project supports multiple execution backends, use `engine_model` to register a Python model only for specific engines. The decorator wraps `@model` but bails out early if the active engine (from `FF_ENGINE` or the selected profile) is not allowed. 
+* How **data flows** between materializations +* Which ones persist or recompute data +* How incremental models can feed downstream table or ephemeral models -```python -from fastflowtransform import engine_model -import pandas as pd +--- -@engine_model( - only=("duckdb", "postgres"), - name="api_users_requests", - deps=["users.ff"], - tags=["example:api_demo", "scope:engine"], -) -def fetch(_: pd.DataFrame) -> pd.DataFrame: - ... -``` +## 🧭 When to Use Each Type -Allowed values are case-insensitive strings or tuples. If the engine does not match, the function is left undecorated and no node is created, preventing duplicate registrations across engine-specific folders. +| Materialization | Persists? | Performance | Recommended Use Case | +| --------------- | --------- | ------------------- | ----------------------------------------- | +| `view` | ❌ No | ⚡ Fast rebuild | Intermediate or temporary transformations | +| `table` | ✅ Yes | ⚖️ Moderate | Final outputs or smaller datasets | +| `incremental` | ✅ Yes | 🚀 High (on deltas) | Large, frequently updated fact tables | +| `ephemeral` | ❌ No | ⚡ Fast inline | Reusable SQL snippets or shared logic | -### 1.3 Seeds, sources, and dependencies +--- -- Declare external tables in `sources.yml`; they become available via `source('group','table')`. -- Provide reproducible inputs with CSV/Parquet seeds in `/seeds/`. -- FastFlowTransform auto-detects dependencies: - - SQL models → parse `ref()` / `source()` calls. - - Python models → use the decorator’s `deps`. - - Additional runtime dependencies can be expressed via `relation_for()`. +## 🧠 Tips -> **Warning:** SQL dependency detection is static. Only literal calls such as `ref('users.ff')` are registered. When you need to gate a dependency behind a variable, materialise the options in a mapping (`{'foo': ref('foo'), 'bar': ref('bar')}`) and pick from that map at runtime; a bare `ref(variable)` will not show up in the DAG. 
+* You can set default materializations in `project.yml` under `models.materialized`. +* Override per model using `{{ config(materialized='...') }}`. +* For incremental models, ensure **unique keys** and **delta logic** are consistent across runs. +* Test behavior locally using the DuckDB engine before deploying to a warehouse. -- Persistence (e.g. Spark/Databricks): configure default targets under `project.yml → models.storage` (and optionally `seeds.storage`). Example: - ```yaml - models: - storage: - api_users_http: - path: ".local/spark/api_users_http" - format: delta - options: - mergeSchema: true - seeds: - storage: - users: - path: ".local/spark/seeds/users" - ``` + - Entries end up in `node.meta["storage"]` (keys: `path`, `format`, `options`) and are respected by the matching executor. +# Environment Matrix (DuckDB-only) — Example -```yaml -# sources.yml -version: 2 +This tiny project demonstrates **per-environment configuration** (dev / stg / prod) while keeping everything on **DuckDB**. +Each environment uses its **own DuckDB file**, so you can switch environments without changing code. -sources: - - name: crm - tables: - - name: users - identifier: seed_users - - name: erp - tables: - - name: orders - identifier: seed_orders -``` +It also includes a **seed step** (CSV → table) and three minimal models: -Each source can declare defaults such as `schema`, `database`, or `catalog`. 
Tables may -override those defaults, add per-engine overrides, or point at files: +* `env_vars.ff` (Python) — echoes which env is active and which DuckDB file is used +* `hello.ff` (SQL view) — shows how `{{ this.* }}` resolves from the active profile +* `users.ff` (SQL table) — reads from the seeded CSV table to prove seeding works -```yaml - - name: raw - schema: staging - tables: - - name: seed_users - identifier: seed_users - overrides: - postgres: - schema: raw - databricks_spark: - format: delta - location: "/mnt/delta/raw/seed_users" +--- + +## What this shows + +* Layered environment files: `.env.dev`, `.env.stg`, `.env.prod` (+ optional `*.local` overrides) +* `profiles.yml` that reads from `env('…')` so connection details live in env files +* All environments use **DuckDB**, but **different DB files** (e.g. `.local/dev.duckdb`, `.local/stg.duckdb`, …) +* Seeding CSV → `seed_users` table, then a simple model consuming it + +--- + +## Project layout + +``` +examples/env_matrix/ +├─ models/ +│ ├─ env_vars.ff.py # Python model: shows env + DuckDB file info +│ └─ users.ff.sql # SQL table: reads from seeded 'seed_users' +├─ seeds/ +│ └─ users.csv # sample data for seeding (-> seed_users) +├─ profiles.yml # all envs = DuckDB, different paths +├─ .env # shared defaults (optional) +├─ .env.dev # dev environment vars +├─ .env.stg # stg environment vars +├─ .env.prod # prod environment vars +├─ .env.dev.local # private overrides (gitignored; optional) +├─ .env.stg.local # private overrides (gitignored; optional) +├─ .env.prod.local # private overrides (gitignored; optional) +└─ Makefile # convenience targets (run, seed, dag) ``` --- -## 2. `config()` options +## Environment files -Call `config()` at the top of SQL models (and optionally within Python models via decorator kwargs in future versions). 
+Each env file sets a different DuckDB path: -```sql -{{ config( - materialized='view', - tags=['mart', 'daily'] -) }} -``` +* `.env.dev` -Supported keys (v0.1): + ``` + FFT_ACTIVE_ENV=dev + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.dev.duckdb + ``` -| Key | Type | Description | -|----------------|-----------------|------------------------------------------------------------------------------| -| `materialized` | `"table" \| "view" \| "ephemeral"` | Controls how FastFlowTransform persists the model. See [Materialization semantics](#6-materialization-semantics). | -| `tags` | `list[str]` | Arbitrary labels surfaced in docs / selection tooling. | -| `engines` | `list[str]` or `str` | Restrict registration to the listed engines (case-insensitive). Requires the active engine to be known (profile selection or `FF_ENGINE`). | -| (future) | – | Additional metadata is stored under `node.meta[...]` if added later. | +* `.env.stg` -**Tips** + ``` + FFT_ACTIVE_ENV=stg + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.stg.duckdb + ``` -- Place `config()` before any SQL text. -- Use tags to power custom filters in docs or to drive test selection. -- Combine `engines=[...]` with per-engine subfolders to keep one physical file per backend without name clashes. When no engine is active, FastFlowTransform raises a clear error to avoid silent skips. -- Ephemeral models inline into downstream SQL; pick `view` for shareable logic without materializing a table. +* `.env.prod` ---- + ``` + FFT_ACTIVE_ENV=prod + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.prod.duckdb + ``` -## 3. Variables with `var()` +> You can place secrets or machine-local tweaks in `.env.<env>.local` (ignored by git). 
+> Optional toggles (if you want verbose SQL logs): +> `FFT_SQL_DEBUG=1`, `FFT_LOG_JSON=1` -Project-level variables live under `project.yml → vars:` and can be overridden from the CLI: +--- + +## `profiles.yml` (DuckDB for all envs) ```yaml -# project.yml -vars: - snapshot_day: "2000-01-01" - limit: 100 -``` +default: + dev: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" -```bash -fft run . --vars snapshot_day='2025-10-01' limit=50 + stg: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" + + prod: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" ``` -Usage in templates: +--- -```sql -select * -from {{ source('crm','users') }} -where signup_date <= '{{ var("snapshot_day", "1970-01-01") }}' -limit {{ var("limit", 1000) }} -``` +## Models -Resolution order: CLI overrides → project vars → default argument. +### `models/env_vars.ff.py` (Python) ---- +Returns one row with: -## 4. Template context & helpers +* `active_env_hint` (from `.env.*`), +* `ff_engine` (should be `duckdb` here), +* `duckdb_path`, `duckdb_exists`, `duckdb_size_bytes`. -Every model (SQL & Python) gets a rich Jinja context. Key helpers: +### `models/hello.ff.sql` (SQL view) -| Helper | Purpose | -|--------------------|------------------------------------------------------------------------------------------| -| `this` | Object exposing `name`, `relation`, `materialized`, `schema`, `database`. | -| `ref("model")` | Resolves another model’s physical relation (or inlines ephemeral SQL). | -| `source("group","table")` | Resolves entries defined in `sources.yml`. | -| `relation_for(node)` (Python utility) | Maps logical node names to physical relations (helpful inside UDFs/tests). | -| `var("key", default)` | Retrieves project/CLI variables (see above). 
| +Uses `{{ this.materialized }}`, `{{ this.schema }}`, `{{ this.database }}` so you can see what the active profile provides. (The simple `SELECT` is compatible with DuckDB; if you added casts like `::text`, they’re fine in DuckDB too.) -Example: +### `models/users.ff.sql` (SQL table) + +Reads from the seeded table `seed_users`: ```sql -{{ config(materialized='view') }} +{{ config(materialized='table', tags=['demo', 'seed']) }} + select - u.id, - u.email, - {{ var("country_column", "'US'") }} as country_code -from {{ ref('users.ff') }} as u --- rendered relation for logging/debugging --- {{ this.relation }} + id, + email +from "seed_users"; ``` +> If you see an error “table seed_users does not exist”, you **haven’t run `fft seed`** for that environment yet. + --- -## 5. Macros & reusable Jinja code +## Seeds -Organise shared SQL snippets in `models/macros/` (all `.sql` files are auto-loaded): +`seeds/users.csv` is loaded by `fft seed` into a table named `seed_users`. +(That’s the default naming convention: `users.csv` → `seed_users`.) -``` -models/ - macros/ - string_utils.sql - marts/ - users.ff.sql +--- + +## Running it + +From the repo root: + +### Using `uv` directly + +**Dev** + +```bash +uv run fft seed examples/env_matrix --env dev +uv run fft run examples/env_matrix --env dev +uv run fft dag examples/env_matrix --env dev --html ``` -```jinja -{# models/macros/string_utils.sql #} -{% macro safe_lower(col) -%} -lower(trim({{ col }})) -{%- endmacro %} +**Staging** + +```bash +uv run fft seed examples/env_matrix --env stg +uv run fft run examples/env_matrix --env stg ``` -Use the macro anywhere within the project: +**Prod** -```sql -select {{ safe_lower("email") }} as email_lower -from {{ ref('users.ff') }}; +```bash +uv run fft seed examples/env_matrix --env prod +uv run fft run examples/env_matrix --env prod ``` -**Best practices** +### Using the Makefile (inside `examples/env_matrix/`) -- Keep macros idempotent and side-effect free. 
-- Group related macros per file (e.g., string utilities, date helpers). -- Document macros with inline comments; FastFlowTransform’s generated docs list each macro with its path. +```bash +make run-dev # runs the DAG on dev +make run-stg +make run-prod ---- +make seed-dev # seed only (dev) +make seed-stg +make seed-prod -## 6. Materialization semantics +make dag-dev # generate HTML DAG for dev +make clean # remove .local/, docs/, site/, .fastflowtransform/ +``` -### SQL models +> Tip: re-run `fft seed` whenever you switch environments or change `seeds/*.csv`. -| Materialization | Behaviour | -|-----------------|-----------| -| `table` | `CREATE OR REPLACE TABLE … AS ` | -| `ephemeral` | No object is created; downstream `ref()` expands to a subquery. | +--- -**Postgres-specific:** FastFlowTransform rewrites the “create or replace” pattern into `DROP TABLE IF EXISTS …; CREATE TABLE … AS …` for compatibility. +## Inspecting results -### Python models +* The **HTML DAG** (after `make dag-dev`) will be at: -- Default → materialized as `table`. -- `materialized='view'` produces an engine-specific temporary table first, then creates/overwrites a view that selects from it. -- Ephemeral Python models are not supported in v0.1. + ``` + examples/env_matrix/site/dag/index.html + ``` +* The **artifacts** are under: + + ``` + examples/env_matrix/.fastflowtransform/target/{manifest.json, run_results.json, catalog.json} + ``` +* Query the DuckDB files directly with `duckdb` CLI or `python` + `duckdb` module if you want to peek inside. --- -## 7. Testing & quality gates +## Troubleshooting -### 7.1 Column contracts (`requires`) +* **`seed_users` not found** + Run `fft seed` for the same environment: + `uv run fft seed examples/env_matrix --env dev` -Use the decorator’s `requires` argument (Python models) to ensure upstream inputs carry expected columns. 
Under the hood FastFlowTransform calls `validation.validate_required_columns`, raising `RequiredColumnsError` with a descriptive diff. +* **No logs showing** + Use `-v`/`-vv` and/or `--sql-debug` on the CLI, or set: -```python -@model( - deps=["orders.ff", "users_enriched"], - requires={ - "orders": {"order_id", "user_id", "amount"}, - "users_enriched": {"id", "email", "is_gmail"} - } -) -def join_orders(inputs: dict[str, pd.DataFrame]) -> pd.DataFrame: - ... -``` + ``` + FFT_SQL_DEBUG=1 + FFT_LOG_JSON=1 # optional JSON logs + ``` -### 7.2 Data quality tests (`project.yml`) +* **Wrong environment picked** + Double-check the `--env` flag in your CLI call and ensure the `.env.<env>` file exists. -Declare checks under `project.yml → tests:`. Each entry maps directly to a function in `fastflowtransform.testing` (`not_null`, `unique`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`). Run them via `fft test …`. +--- -```yaml -tests: - - type: not_null - table: users - column: email - tags: [batch] +## Clean up + +```bash +make clean # from examples/env_matrix/ +# or manually: +rm -rf examples/env_matrix/.local examples/env_matrix/site examples/env_matrix/docs +rm -rf examples/env_matrix/.fastflowtransform ``` -### 7.3 Model unit tests (`fft utest`) -Keep transformation logic honest with small, engine-agnostic specs: -- Place YAML files under `/tests/unit/`. -- Express inputs via inline rows or CSV paths. -- Declare expected output rows plus comparison options (`order_by`, `any_order`, `ignore_columns`, `approx`). 
+ -```yaml -# tests/unit/users_enriched.yml -model: users_enriched -defaults: - inputs: - users: - rows: - - {id: 1, email: "a@example.com"} - - {id: 2, email: "b@gmail.com"} - expect: - relation: users_enriched - order_by: [id] +# Data Quality Demo Project -cases: - - name: flags_gmail - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} -``` +The **Data Quality Demo** shows how to use **all built-in FFT data quality tests** on a small, understandable model: -Run with: +* Column checks: -```bash -fft utest . --env dev -fft utest . --model users_enriched --case flags_gmail -``` + * `not_null` + * `unique` + * `accepted_values` + * `greater_equal` + * `non_negative_sum` + * `row_count_between` + * `freshness` +* Cross-table reconciliations: + + * `reconcile_equal` + * `reconcile_ratio_within` + * `reconcile_diff_within` + * `reconcile_coverage` -See the [Technical Overview](./Technical_Overview.md#model-unit-tests-fft-utest) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). +It uses a simple **customers / orders / mart** setup so you can see exactly what each test does and how it fails when something goes wrong. --- -## 8. Quick cheat sheet +## What this example demonstrates -| Task | Snippet / Pointer | -|------|-------------------| -| Set materialization | `{{ config(materialized='view') }}` | -| Add tags | `{{ config(tags=['mart','daily']) }}` | -| Read project variable | `{{ var('run_date', '1970-01-01') }}` | -| Current relation name | `{{ this.relation }}` | -| Reference another model | `{{ ref('users.ff') }}` | -| Reference source | `{{ source('crm','users') }}` | -| Macro definition | `models/macros/*.sql` | -| Guarantee columns (Python) | `@model(..., requires={'users': {'id','email'}})` | -| Data-quality test | `project.yml → tests` + `fft test …` | -| Unit test | `tests/unit/*.yml` + `fft utest …` | +1. 
**Basic column checks** on staging tables + Ensure IDs are present and unique, amounts are non-negative, and status values are valid. ---- +2. **Freshness** on a timestamp column + Check that the most recent order in your mart is not “too old”, using `last_order_ts`. -Return to the [Docs Hub](./index.md) or switch to the [User/Developer Guide](./Technical_Overview.md). +3. **Row count sanity checks** + Guard against empty tables and unexpectedly large row counts. +4. **Cross-table reconciliations** between staging and mart + Verify that sums and counts match between `orders` and the aggregated `mart_orders_agg`, and that every customer has a corresponding mart row. +5. **Tagged tests and selective execution** + All tests are tagged (e.g. `example:dq_demo`, `reconcile`) so you can run exactly the subset you care about. - +--- -### 🆕 `docs/Cache_and_Parallelism.md` +## Project layout (example) -````markdown -# Parallelism & Cache (FastFlowTransform v0.3) +```text +examples/dq_demo/ + .env + .env.dev_duckdb + .env.dev_postgres + .env.dev_databricks + Makefile # optional, convenience wrapper around fft commands + profiles.yml + project.yml + sources.yml + + seeds/ + customers.csv + orders.csv + + models/ + staging/ + customers.ff.sql + orders.ff.sql + marts/ + mart_orders_agg.ff.sql +``` -FastFlowTransform 0.3 introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. +### Seeds ---- +* `seeds/customers.csv` + Simple customer dimension (e.g. `customer_id`, `name`, `status`). 
-## Table of Contents -- [Parallel Scheduler](#parallel-scheduler) -- [Cache Policy](#cache-policy) -- [Fingerprint Formula](#fingerprint-formula) -- [Meta Table Schema](#meta-table-schema) -- [CLI Recipes](#cli-recipes) -- [Troubleshooting & FAQ](#troubleshooting--faq) -- [Example: simple_duckdb](#example-simple_duckdb) -- [Appendix: Environment Inputs](#appendix-environment-inputs) +* `seeds/orders.csv` + Order fact data (e.g. `order_id`, `customer_id`, `amount`, `order_ts` as a string). ---- +### Models -## Parallel Scheduler +**1. Staging: `customers.ff.sql`** -FastFlowTransform splits the DAG into **levels** (all nodes that can run together without violating dependencies). Within a level, up to `--jobs` nodes execute in **parallel**. +* Materialized as a table. +* Casts IDs and other fields into proper types. +* Used as the “clean” customer dimension for downstream checks. -- Dependencies are **never** violated. -- `--keep-going`: tasks already started in a level finish; **subsequent levels won’t start** if any task in the current level fails. -- Logs are serialized through an internal queue to keep lines readable and per-node timing visible. +**2. Staging: `orders.ff.sql`** -**Quick start** -```bash -# Run with 4 workers per level -fft run . --env dev --jobs 4 +* Materialized as a table. +* Casts fields to proper types so DQ tests work reliably: -# Keep tasks in the same level running even if one fails -fft run . --env dev --jobs 4 --keep-going -```` + ```sql + {{ config( + materialized='table', + tags=[ + 'example:dq_demo', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], + ) }} ---- + select + cast(order_id as int) as order_id, + cast(customer_id as int) as customer_id, + cast(amount as double) as amount, + cast(order_ts as timestamp) as order_ts + from {{ source('crm', 'orders') }}; + ``` -## Cache Policy + This is important for: -The cache decides whether a node can be **skipped** when nothing relevant changed. 
Modes: + * numeric checks (`greater_equal`, `non_negative_sum`) + * timestamp-based `freshness` checks -``` ---cache=off # always build ---cache=rw # default; skip on match; write cache after build ---cache=ro # skip on match; on miss build but don't write cache ---cache=wo # always build and write cache ---rebuild # ignore cache for matching nodes ---no-cache # alias for --cache=off -``` +**3. Mart: `mart_orders_agg.ff.sql`** -### Skip condition +Aggregates orders per customer and prepares data for reconciliation + freshness: -A node is skipped iff: +```sql +{{ config( + materialized='table', + tags=[ + 'example:dq_demo', + 'scope:mart', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} -1. The current **fingerprint** matches the on-disk cache value, **and** -2. The **physical relation exists** on the target engine. +-- Aggregate orders per customer for DQ & reconciliation tests +with base as ( + select + o.order_id, + o.customer_id, + -- Ensure numeric and timestamp types for downstream DQ checks + cast(o.amount as double) as amount, + cast(o.order_ts as timestamp) as order_ts, + c.name as customer_name, + c.status as customer_status + from {{ ref('orders.ff') }} o + join {{ ref('customers.ff') }} c + on o.customer_id = c.customer_id +) +select + customer_id, + customer_name, + customer_status as status, + count(*) as order_count, + sum(amount) as total_amount, + min(order_ts) as first_order_ts, + max(order_ts) as last_order_ts +from base +group by customer_id, customer_name, customer_status; +``` -If the relation was dropped externally, FastFlowTransform will **rebuild** even if the fingerprint matches. 
+The important columns for DQ tests are: ---- +* `status` → used for `accepted_values` +* `order_count` and `total_amount` → used for numeric and reconciliation tests +* `last_order_ts` → used for `freshness` -## Fingerprint Formula +--- -Fingerprints are stable hashes that change on any relevant input: +## Data quality configuration (`project.yml`) -* **SQL models**: `fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` +All tests live under `project.yml → tests:`. +This example uses the tag `example:dq_demo` for easy selection. - * Uses **rendered** SQL (after Jinja), not the raw template. -* **Python models**: `fingerprint_py(node, func_src, env_ctx, dep_fps)` +### Column-level checks - * Uses `inspect.getsource(func)` with a **file-content fallback** if needed. +```yaml +tests: + # 1) IDs must be present and unique + - type: not_null + table: customers + column: customer_id + tags: [example:dq_demo, batch] -`env_ctx` includes: + - type: unique + table: customers + column: customer_id + tags: [example:dq_demo, batch] + + # 2) Order amounts must be >= 0 + - type: greater_equal + table: orders + column: amount + threshold: 0 + tags: [example:dq_demo, batch] + + # 3) Total sum of amounts must not be negative + - type: non_negative_sum + table: orders + column: amount + tags: [example:dq_demo, batch] + + # 4) Customer status values must be within a known set + - type: accepted_values + table: mart_orders_agg + column: status + values: ["active", "churned", "prospect"] + severity: warn # show as warning, not hard failure + tags: [example:dq_demo, batch] -* `engine` (e.g., `duckdb`, `postgres`, `bigquery`) -* `profile_name` (CLI `--env`) -* Selected environment entries: **all `FF_*` keys** (key + value) -* A **normalized** portion of `sources.yml` (sorted keys/dump) + # 5) Row count sanity check on mart + - type: row_count_between + table: mart_orders_agg + min_rows: 1 + max_rows: 100000 + tags: [example:dq_demo, batch] + + # 6) Freshness: last order in the mart 
must not be "too old" + - type: freshness + table: mart_orders_agg + column: last_order_ts + max_delay_minutes: 100000000 + tags: [example:dq_demo, batch] +``` -`dep_fps` are upstream fingerprints; **any upstream change** invalidates downstream fingerprints. +### Cross-table reconciliations -**Properties** +```yaml + # 7) Reconcile total revenue between orders and mart + - type: reconcile_equal + name: total_amount_orders_vs_mart + tags: [example:dq_demo, reconcile] + left: + table: orders + expr: "sum(amount)" + right: + table: mart_orders_agg + expr: "sum(total_amount)" + abs_tolerance: 0.01 + + # 8) Ratio of sums should be ~1 (within tight bounds) + - type: reconcile_ratio_within + name: total_amount_ratio + tags: [example:dq_demo, reconcile] + left: + table: orders + expr: "sum(amount)" + right: + table: mart_orders_agg + expr: "sum(total_amount)" + min_ratio: 0.999 + max_ratio: 1.001 + + # 9) Row count diff between orders and mart should be bounded + - type: reconcile_diff_within + name: order_count_diff + tags: [example:dq_demo, reconcile] + left: + table: orders + expr: "count(*)" + right: + table: mart_orders_agg + expr: "sum(order_count)" + max_abs_diff: 0 + + # 10) Coverage: every customer should appear in the mart + - type: reconcile_coverage + name: customers_covered_in_mart + tags: [example:dq_demo, reconcile] + source: + table: customers + key: "customer_id" + target: + table: mart_orders_agg + key: "customer_id" +``` -* Same inputs ⇒ same hash. -* Minimal change in SQL/function ⇒ different hash. -* Dependency changes propagate downstream. +This set of tests touches **all available test types** and ties directly back to the simple data model. --- -## Meta Table Schema - -After a successful build, FastFlowTransform writes a per-node audit row: - -``` -_ff_meta ( - node_name TEXT/STRING, -- logical name, e.g. "users.ff" - relation TEXT/STRING, -- physical table/view, e.g. 
"users" - fingerprint TEXT/STRING, - engine TEXT/STRING, - built_at TIMESTAMP -) -``` +## Running the demo -Backends: +Assuming you are in the repo root and using DuckDB as a starting point: -* **DuckDB:** table `_ff_meta` in `main`. -* **Postgres:** table `_ff_meta` in the active schema. -* **BigQuery:** table `._ff_meta`. +### 1. Seed the data -> Note: Skip logic uses the file-backed fingerprint cache and a direct relation existence check; the meta table is for auditing and tooling. +```bash +fft seed examples/dq_demo --env dev_duckdb +``` ---- +This reads `seeds/customers.csv` and `seeds/orders.csv` and materializes them as tables referenced by `sources.yml`. -## CLI Recipes +### 2. Run the models ```bash -# First run — builds everything, writes cache and meta -fft run . --env dev --cache=rw +fft run examples/dq_demo --env dev_duckdb +``` -# No-op run — should skip all nodes (if nothing changed) -fft run . --env dev --cache=rw +This builds: -# Force rebuild of a single model (ignores cache for it) -fft run . --env dev --cache=rw --rebuild marts_daily.ff +* `customers` (staging) +* `orders` (staging) +* `mart_orders_agg` (mart) -# Read-only cache (skip on match, build on miss, no writes) -fft run . --env dev --cache=ro +### 3. Run all DQ tests -# Always build and write cache -fft run . --env dev --cache=wo +```bash +fft test examples/dq_demo --env dev_duckdb --select tag:example:dq_demo +``` -# Disable cache entirely -fft run . 
--env dev --no-cache +You should see a summary like: + +```text +Data Quality Summary +──────────────────── +✅ not_null customers.customer_id +✅ unique customers.customer_id +✅ greater_equal orders.amount +✅ non_negative_sum orders.amount +❕ accepted_values mart_orders_agg.status +✅ row_count_between mart_orders_agg +✅ freshness mart_orders_agg.last_order_ts +✅ reconcile_equal total_amount_orders_vs_mart +✅ reconcile_ratio_within total_amount_ratio +✅ reconcile_diff_within order_count_diff +✅ reconcile_coverage customers_covered_in_mart + +Totals +────── +✓ passed: 10 +! warnings: 1 ``` -With parallelism: +(Exact output will differ, but you’ll see passed/failed/warned checks listed.) + +### 4. Run only reconciliation tests ```bash -fft run . --env dev --jobs 4 -fft run . --env dev --jobs 4 --keep-going +fft test examples/dq_demo --env dev_duckdb --select tag:reconcile ``` +This executes just the cross-table checks, which is handy when you’re iterating on a mart. + --- -## Troubleshooting & FAQ +## Things to experiment with -**“Why did it skip?”** -A skip requires a fingerprint match and an existing relation. Fingerprints include: +To understand the tests better, intentionally break the data and re-run `fft test`: -* rendered SQL / Python function source, -* `sources.yml` (normalized), -* engine/profile, -* **all `FF_*` environment variables**, -* upstream fingerprints. +* Set one `customers.customer_id` to `NULL` → watch `not_null` fail. +* Duplicate a `customer_id` → watch `unique` fail. +* Put a negative `amount` in `orders.csv` → `greater_equal` and `non_negative_sum` fail. +* Add a new `status` value (e.g. `"paused"`) → `accepted_values` warns. +* Drop a customer from `mart_orders_agg` manually (or filter it out in SQL) → `reconcile_coverage` fails. +* Change an amount in the mart only → reconciliation tests fail. -Any change in the above triggers a rebuild downstream. +This makes it very clear what each test guards against.
-**“Relation missing but cache says skip?”** -We also check relation existence. If the table/view was dropped externally, FastFlowTransform will **rebuild**. +--- -**“My logs interleave under parallelism.”** -Logs are serialized via a queue; use `-v` / `-vv` for richer but still stable output. Each node prints start/end and duration; levels summarize. +## Summary -**“Utest cache?”** -`fft utest --cache {off|ro|rw}` defaults to `off` for deterministic runs. With `rw`, expensive unit cases can be accelerated. Unit tests do not rely on the meta table by default. +The Data Quality Demo is designed to be: ---- +* **Small and readable** – customers, orders, and a single mart. +* **Complete** – exercises every built-in FFT DQ test type. +* **Practical** – real-world patterns like: -## Example: simple_duckdb + * typing in staging models, + * testing freshness on a mart timestamp, + * reconciling sums and row counts across tables. -The demo contains two independent staging nodes (`users.ff.sql`, `orders.ff.sql`). They run in **parallel** within the same level. +Once you’re comfortable with this example, you can copy the patterns into your real project: start with staging-level checks, then layer in reconciliations and freshness on your most important marts. -Makefile targets: -```makefile -run_parallel: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --jobs 4 -cache_rw_first: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw + -cache_rw_second: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw +# Macros Demo -cache_invalidate_env: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" FF_DEMO_TOGGLE=1 fft run "$(PROJECT)" --env dev --cache=rw -``` +**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark). 
+You’ll see reusable SQL helpers, engine-aware SQL generation, and Python functions exposed as Jinja globals/filters. --- -## Appendix: Environment Inputs - -Only environment variables with the `FF_` prefix affect fingerprints (keys and values). If you change one (e.g., `FF_RUN_DATE`, `FF_REGION`), fingerprints change and downstream nodes rebuild. +## Directory structure -```bash -# Will invalidate fingerprints and rebuild affected nodes -FF_RUN_DATE=2025-01-01 fft run . --env dev --cache=rw +```text +examples/macros_demo/ + .env + .env.dev_databricks + .env.dev_duckdb + .env.dev_postgres + Makefile + profiles.yml + project.yml + sources.yml + seeds/ + seed_users.csv + seed_orders.csv + models/ + macros/ + utils.sql + star.sql + macros_py/ + helpers.py + common/ + stg_users.ff.sql + stg_orders.ff.sql + dim_users.ff.sql + fct_user_sales.ff.sql + engines/ + duckdb/ + py_example.ff.py + postgres/ + py_example.ff.py + databricks_spark/ + py_example.ff.py ``` -```` - --- -### 🔗 `docs/index.md` – Link zum neuen Kapitel - -```diff ---- a/docs/index.md -+++ b/docs/index.md -@@ -10,6 +10,7 @@ - - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) - - [Modeling Reference](./Config_and_Macros.md) - - [Parallelism & Cache (v0.3)](./Cache_and_Parallelism.md) - - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) -```` +## What this demo shows +* **SQL Jinja macros** (`models/macros/*.sql`) + * `email_domain(expr)` – derive email domain + * `safe_cast_amount(expr)` – engine-aware numeric cast + * `coalesce_any(expr, default)` – small convenience + * `default_country()` – pull a default from `project.yml → vars` + * `star_except(relation, exclude_cols)` – select all except listed columns (falls back to `*` if columns unknown) +* **Python macros** (`models/macros_py/helpers.py`) - + * `slugify(str)` – URL-friendly slug + * `mask_email(email)` – redact local part + * `csv_values(rows, cols)` – inline 
small lookup tables via SQL `VALUES(...)` +* **Usage from models** -# Incremental Models (R1) + * `stg_users` uses SQL + Python macros at render time + * `stg_orders` uses engine-aware casting + * `dim_users` builds a tiny inline lookup via `csv_values(...)` + * `fct_user_sales` aggregates across staged models -This guide explains how to configure incremental models, use `is_incremental()` in SQL, engine compatibility, and schema change policies. +--- -## Quick Start +## Prerequisites -A minimal incremental model: +* A working FFT installation (CLI `fft` available) +* For Postgres/Databricks: valid local env and drivers +* The core must expose these Jinja globals (already done in the FFT core): -```sql --- examples/r1_demo/models/fct_events_inc.ff.sql -{{ config( - materialized='incremental', - unique_key=['event_id'], - on_schema_change='append_new_columns' -- or 'sync_all_columns' -) }} -with src as ( - select * from {{ source('app', 'events') }} - {% if is_incremental() %} - where ingested_at > (select coalesce(max(ingested_at), timestamp '1970-01-01') from {{ this.name }}) - {% endif %} -) -select - event_id, - user_id, - event_type, - ingested_at, - -- evolving column: will appear later - meta_json -from src; -```` + * `var(name, default)`, `env(name, default)`, `engine(default)` + (Used by profiles/macros to read vars and detect engine.) -### `is_incremental()` +--- -* Available in SQL templates during rendering. -* Returns `true` when the model exists and the current `materialized='incremental'` run chooses an incremental path (insert/merge) instead of full rebuild. -* Typical usage: filter the source to “new” rows only. 
+## Seeds -### Engine Matrix (MVP) +Two tiny CSVs materialized via `fft seed`: -| Engine | Incremental Insert | Merge/Upsert | Schema Change Policy | -| ------------------ | ------------------ | ------------ | -------------------- | -| DuckDB | ✅ insert | 🚧 fallback* | ✅ append new cols | -| Postgres | ✅ insert | 🚧 fallback* | ✅ append new cols | -| BigQuery (classic) | ✅ insert | 🚧 fallback* | 🚧 best-effort | -| BigQuery BigFrames | ✅ insert | 🚧 fallback* | 🚧 best-effort | -| Databricks Spark | ✅ insert | 🚧 fallback* | 🚧 best-effort | -| Snowflake Snowpark | ✅ insert | 🚧 fallback* | 🚧 best-effort | +* `seed_users.csv` — `id,email,country` +* `seed_orders.csv` — `order_id,customer_id,amount,order_ts` -* Fallback strategy merges by delete-on-keys + insert (best effort) if native merge isn’t wired. +`profiles.yml` and `project.yml` give minimal storage and connection configs. -### Schema Change Policies +--- -* `append_new_columns` (default): new columns appear in target if they show up in the select. -* `sync_all_columns` (planned): attempt to keep type/nullable alignment. Currently not enforced; prefer append in R1. +## How to run -### End-to-End +From repo root: ```bash -# Seeds → initial incremental build → run again with filter -fft seed examples/r1_demo --env dev -fft run examples/r1_demo --env dev --select fct_events_inc.ff -# simulate new data (re-seed or append), then: -fft run examples/r1_demo --env dev --select fct_events_inc.ff +cd examples/macros_demo + +# Choose engine: duckdb (default) | postgres | databricks_spark +make ENGINE=duckdb demo +# or +make ENGINE=postgres demo +# or +make ENGINE=databricks_spark demo ``` -**Artifacts:** see `.fastflowtransform/target/{manifest.json, run_results.json, catalog.json}`. +The `demo` target runs: +1. `fft seed` — loads CSVs +2. `fft run` — builds models using macros +3. `fft dag --html` — writes DAG HTML to `site/dag/index.html` +4. `fft test` — runs example tests +5. 
Prints artifact paths and tries to open the DAG +--- - - -# Profiles Configuration +## Key files (highlights) -FastFlowTransform uses `profiles.yml` to describe how each environment connects to the execution engine (DuckDB, Postgres, BigQuery, Databricks Spark, Snowflake Snowpark, …). This document covers file layout, supported features, environment overrides, and loading precedence. +### SQL macros – `models/macros/utils.sql` -## File Location +```jinja +{%- macro email_domain(expr) -%} + lower(split_part({{ expr }}, '@', 2)) +{%- endmacro -%} + +{%- macro safe_cast_amount(expr) -%} +{%- set e = engine('duckdb') -%} +{%- if e in ['duckdb', 'postgres', 'databricks_spark'] -%} + cast({{ expr }} as double) +{%- else -%} + cast({{ expr }} as double) +{%- endif -%} +{%- endmacro -%} + +{%- macro coalesce_any(expr, default) -%} + coalesce({{ expr }}, {{ default }}) +{%- endmacro -%} + +{%- macro default_country() -%} + '{{ var("default_country", "DE") }}' +{%- endmacro -%} +``` -`profiles.yml` lives at the project root (same level as `models/`, `project.yml`). The CLI loads it whenever you run `fft` commands (seed/run/test/dag/utest/docgen …). +### SQL macros – `models/macros/star.sql` -``` -project/ -├── models/ -├── project.yml -└── profiles.yml +```jinja +{%- macro star_except(relation, exclude_cols) -%} +{%- set excl = exclude_cols | map('lower') | list -%} +{%- set cols = adapter_columns(relation) -%} +{%- if cols and cols|length > 0 -%} + {{- (cols | reject('in', excl) | map('string') | join(', ')) -}} +{%- else -%} + * +{%- endif -%} +{%- endmacro -%} ``` -## Basic Structure +> Note: If the executor can’t describe columns for `relation`, this macro falls back to `*`. -The file is parsed as YAML after optional Jinja rendering. Top-level keys represent profile “names” (e.g. `dev`, `prod`, `dev_postgres`). Each profile must include an `engine` plus engine-specific configuration. 
+### Python macros – `models/macros_py/helpers.py` -```yaml -dev: - engine: duckdb - duckdb: - path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" +```python +def slugify(value: str) -> str: ... +def mask_email(email: str) -> str: ... +def csv_values(rows: list[dict], cols: list[str]) -> str: ... +``` -stg: - engine: postgres - postgres: - dsn: "{{ env('FF_PG_DSN') }}" - db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" +Exposed as Jinja globals/filters at **render time** (not runtime SQL UDFs). -prod: - engine: bigquery - bigquery: - project: "{{ env('FF_BQ_PROJECT') }}" - dataset: "{{ env('FF_BQ_DATASET') }}" - location: EU +--- -default: - engine: duckdb - duckdb: - path: ":memory:" -``` +## Models using macros -### Engines and Sections +### `stg_users.ff.sql` (Jinja + Python macro usage) -Supported engines and their expected sections: +* Coalesces missing country with `default_country()` +* Adds `email_domain(...)` +* Embeds a `slugify(var('site_name', ...))` literal into SQL -| Engine | Section | Key Fields | -|----------------------|--------------------|---------------------------------------------------| -| `duckdb` | `duckdb` | `path` (file path or `:memory:`) | -| `postgres` | `postgres` | `dsn`, `db_schema` | -| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location` | -| `databricks_spark` | `databricks_spark` | `master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `database`, `table_format`, `table_options` | -| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `db_schema`, optional `role` | +```jinja +with src as ( + select + cast(id as int) as user_id, + lower(email) as email, + {{ coalesce_any("country", default_country()) }} as country + from {{ source('crm', 'users') }} +) +select + user_id, + email, + {{ email_domain("email") }} as email_domain, + country, + '{{ slugify(var("site_name", "My Site")) }}' as site_slug +from src; +``` -Each profile can 
define its own `vars:` block (values exposed via `var('key')` inside templates). +### `stg_orders.ff.sql` (engine-aware types) -## Environment Variables +```jinja +select + cast(order_id as int) as order_id, + cast(customer_id as int) as user_id, + {{ safe_cast_amount("amount") }} as amount, + cast(order_ts as timestamp) as order_ts +from {{ source('sales', 'orders') }}; +``` -`profiles.yml` supports Jinja expressions. The helper `env('FF_VAR', 'fallback')` reads process environment variables and substitutes the default if unset. Examples: +### `dim_users.ff.sql` (inline lookup via Python macro) -```yaml -dev_postgres: - engine: postgres - postgres: - dsn: "{{ env('FF_PG_DSN') }}" - db_schema: "{{ env('FF_PG_SCHEMA', 'analytics') }}" +```jinja +labels as ( + select * from (values {{ csv_values( + [ + {"domain":"example.com", "label":"internal"}, + {"domain":"gmail.com", "label":"consumer"}, + ], + ["domain","label"] + ) }}) as t(domain, label) +) ``` -These expressions are rendered *before* YAML parsing. If the environment variable is missing and no default is provided, the expression resolves to an empty string and validation will fail with a clear error message. +### `fct_user_sales.ff.sql` (final aggregation) -## Loading Order & Precedence +Joins `stg_orders` with `dim_users` and aggregates. -When running `fft` commands, `_load_dotenv_layered()` loads `.env` files in ascending precedence: +--- -1. `/.env` -2. `/.env` -3. `/.env.local` -4. `/.env.` -5. `/.env..local` +## Tests (examples) -Earlier values fill defaults; later files override earlier ones *only for keys that are not already defined*. **Values set in the shell (e.g. via `FF_ENGINE=duckdb fft run …`) have highest priority**—they remain untouched, even if `.env` files define the same key. +Declared in `project.yml`: -After `.env` loading, `profiles.yml` is rendered with Jinja (using the current `os.environ`) and parsed by Pydantic. 
Validation ensures required fields are present for each engine and produces human-readable errors for missing DSNs, schemas, etc. +* `not_null(dim_users.user_id)` +* `row_count_between(fct_user_sales, min_rows=1)` -## Selecting Profiles +Run with: -- **Via `--env` flag**: `fft run . --env dev_postgres` -- **Via `FFT_ACTIVE_ENV`**: set in shell or `.env` to choose the active profile name. -- **Legacy `FF_ENGINE`** (overrides `engine` field post-parse): useful for quick experiments but explicit `profiles.yml` entries are preferred. +```bash +fft test examples/macros_demo --env dev_duckdb --select tag:example:macros_demo +``` -Example Makefile snippet that switches profiles without exposing secrets: +--- -```make -ENGINE ?= duckdb +## Troubleshooting -ifeq ($(ENGINE),duckdb) - PROFILE_ENV = dev_duckdb -endif -ifeq ($(ENGINE),postgres) - PROFILE_ENV = dev_postgres -endif +* **`jinja2.exceptions.UndefinedError: 'var'/'env'/'engine' is undefined`** + Ensure your core’s Jinja environment registers these globals before loading templates: -seed: - FFT_ACTIVE_ENV=$(PROFILE_ENV) uv run fft seed . --env $(PROFILE_ENV) -``` + ```python + env.globals.update(var=..., env=..., engine=...) + ``` +* **Engine differences (types & functions):** + Always branch in macros (`engine(...)`) when types or functions differ. +* **`adapter_columns(...)` returns none:** + The `star_except` macro will fall back to `*`. For strict behavior, replace with static column lists per engine. -## Using `.env` for Secrets +--- -Keep sensitive credentials out of VCS by storing them in `.env` files referenced above: +## Extending this demo -``` -examples/api_demo/ -├── .env.dev_duckdb # FF_DUCKDB_PATH=... -├── .env.dev_postgres # FF_PG_DSN=..., FF_PG_SCHEMA=... -├── .env.dev_databricks # FF_SPARK_MASTER=..., FF_SPARK_APP_NAME=... -└── profiles.yml -``` +* Add more helpers to `helpers.py` (e.g., `render_json(obj)`, `join_csv(list)`).
+* Create reusable macro libraries under `models/macros/` (date handling, SCD helpers, etc.). +* Use `var(...)` to parameterize behavior per environment or profile. -These files stay out of git (via `.gitignore`), while `profiles.yml` contains only non-sensitive wiring. +--- -## Summary of Features +Happy macro-ing! -- Multiple profiles in a single YAML file. -- Jinja templating with `env()` helper for dynamic values. -- `.env` layered loading with shell overrides taking precedence. -- Validation for engine-specific parameters (clear error messages). -- Profile-specific `vars` exposed to Jinja `var()` function in models. -- Works seamlessly across CLI commands: seed, run, dag, test, docgen, utest. -Keep `profiles.yml` declarative, `.env` files secret, and use CLI or Makefiles to select the active profile per run. This pattern scales from local DuckDB demos to production Postgres/BigQuery/Snowflake deployments. + +# 🧠 Cache & Parallelism Demo - +This example demonstrates FastFlowTransform’s **build cache**, **fingerprint logic**, **parallel scheduler**, and **HTTP response caching**. +It’s a compact playground to visualize **when nodes are skipped**, **what triggers rebuilds**, and **how caching accelerates iterative runs**. -# Sources Configuration +--- -`sources.yml` declares external tables (seeds, raw inputs, lakehouse paths) that models can reference via `{{ source('group', 'table') }}`. This document covers the schema, engine overrides, file paths, and best practices. +## 🗂 Directory Structure -## File Location +```text +cache_demo/ + .env.dev_duckdb + Makefile + profiles.yml + project.yml + sources.yml + models/ + seeds_consumers/ + stg_users.ff.sql + stg_orders.ff.sql + marts/ + mart_user_orders.ff.sql + python/ + py_constants.ff.py + http/ + http_users.ff.py + seeds/ + seed_users.csv + seed_orders.csv + README.md +``` -Place `sources.yml` at your project root (same level as `models/`). 
Example: +--- -``` -project/ -├── models/ -├── sources.yml -└── seeds/ -``` +## ⚙️ Overview -## YAML Schema (Version 2) +This demo showcases several FastFlowTransform features: -FastFlowTransform expects a dbt-style structure: +| Feature | Demonstrated by | +| -------------------------- | ----------------------------------------------- | +| Level-wise parallelism | Multiple models running concurrently (`--jobs`) | +| Deterministic fingerprints | Build cache skipping unchanged nodes | +| Upstream invalidation | Seed → staging → mart rebuilds | +| Environment invalidation | Any `FF_*` change triggers rebuild | +| Python model caching | Fingerprints derived from function source | +| HTTP response caching | Persistent API result cache with offline mode | -```yaml -version: 2 -sources: - - name: raw - schema: staging # default schema for this source group - overrides: - postgres: - schema: raw_main # engine-specific default override +--- - tables: - - name: seed_users - identifier: seed_users # optional physical name - overrides: - duckdb: - schema: main - databricks_spark: - format: delta - location: "/mnt/delta/raw/seed_users" +## ⚡ Quickstart + +```bash +cd examples/cache_demo +make cache_first # builds all nodes, writes cache +make cache_second # no-op run (everything skipped) +make change_sql # touch a model -> rebuilds dependent mart +make change_seed # change seed -> rebuilds staging + mart +make change_env # set FF_* env -> invalidates cache globally +make change_py # edit py_constants.ff.py -> rebuilds that model +make run_parallel # runs entire DAG with 4 workers per level ``` -### Fields +Inspect results: -| Level | Field | Description | -|----------|-------------|-------------| -| source | `name` | Logical group identifier referenced by `source('name', ...)`. | -| | `schema` | Default target schema/database for the group. | -| | `database`/`catalog` | Optional qualifiers per engine (BigQuery, Snowflake). 
| -| | `overrides` | Map of engine → config snippet (schema overrides, formats, locations). | -| table | `name` | Logical table name (second argument in `source()`). | -| | `identifier`| Physical name; defaults to `name` if omitted. | -| | `location` | File/path location (used with `format`). | -| | `format` | Ingestion format for engines supporting path-based sources (`delta`, `parquet`, …). | -| | `options` | Dict of format options (Spark/Databricks). | -| | `overrides` | Additional engine-specific settings merged with source-level overrides. | +* `.fastflowtransform/target/run_results.json` – fingerprints, results, timings, HTTP stats +* `site/dag/index.html` – DAG visualization +* `.local/http-cache/` – persisted API responses -Engine-specific overrides follow this merge order: +--- -1. Source defaults (`schema`, `database`, …) -2. Source-level `overrides[engine]` -3. Table-level `overrides[engine]` +## 🧩 Model Summary + +| Model | Kind | Purpose | Notes | +| ------------------------- | ------ | --------------------------- | ------------------------------------ | +| `stg_users.ff.sql` | SQL | Load & normalize users seed | Rebuilds if seed changes | +| `stg_orders.ff.sql` | SQL | Load orders seed | Builds as a view | +| `mart_user_orders.ff.sql` | SQL | Join staging tables | Rebuilds if any staging changes | +| `py_constants.ff.py` | Python | Simple constant DataFrame | Fingerprint based on function source | +| `http_users.ff.py` | Python | HTTP fetch with cache | Uses `get_df()` and offline cache | + +--- -### Engine Behavior +## 🌐 HTTP Response Cache -- **DuckDB / Postgres / BigQuery / Snowflake**: expect `identifier` (plus `schema`/`database` where relevant). Path-based sources raise errors. -- **Databricks Spark**: supports `format` + `location`. The executor registers a temp view with optional `options` (e.g. `compression`). 
+The `http_users.ff.py` model demonstrates the built-in HTTP cache: -### Path-Based Sources Example +* **First run:** downloads `https://jsonplaceholder.typicode.com/users` +* **Subsequent runs:** reuse cached responses from `.local/http-cache` +* **Offline mode:** works with `FF_HTTP_OFFLINE=1` -```yaml - - name: raw_events - tables: - - name: landing - overrides: - databricks_spark: - format: json - location: "abfss://landing@storage.dfs.core.windows.net/events/*.json" - options: - multiline: true +```bash +make http_first # warms HTTP cache +make http_offline # reuses cached response, no network access +make http_cache_clear # deletes cache directory ``` -## Referencing Sources in Models +You can inspect HTTP usage in the `run_results.json` file: -```sql -select id, email -from {{ source('raw', 'seed_users') }} +```bash +jq -r '.results[] | select(.http!=null) + | "\(.name): requests=\(.http.requests) cache_hits=\(.http.cache_hits) offline=\(.http.used_offline)"' \ + .fastflowtransform/target/run_results.json ``` -After rendering, the executor resolves the fully-qualified relation or path depending on the active engine. +--- -## Seed Integration +## ⚙️ Cache Logic Recap -When combined with `seeds/schema.yml`, you can map CSV/Parquet seeds into schemas per engine: +FastFlowTransform caches model fingerprints and skips nodes when: -```yaml -targets: - raw/users: - schema: raw - schema_by_engine: - duckdb: main - postgres: staging +1. **Fingerprints match** (SQL text, Python source, vars, engine, env, deps). +2. The **physical relation exists** in the database. 
+ +Changing *any* of the following invalidates the cache: + +* SQL/Jinja content +* Python model code +* `sources.yml` +* `FF_*` environment variables +* Seed file contents +* Engine or profile name + +You can control cache behavior via CLI: + +```bash +--cache=off # always build +--cache=rw # default; skip on match; write cache +--cache=ro # read-only; skip on hit, build on miss +--cache=wo # always build, always write ``` -## Validation & Errors +--- -- Missing `identifier` *and* `location` produce `KeyError` during rendering. -- Unknown source/table names raise `KeyError` with suggestions. -- Unsupported path-based sources on an engine (`location` provided but no `format`) raise descriptive `NotImplementedError`. +## 🧮 Parallel Scheduler -Keep `sources.yml` declarative, use engine overrides for schema differences, and lean on `.env` files where credentials or URIs vary per environment. +FastFlowTransform executes models **level-wise**: +* Each level contains nodes whose dependencies are fully satisfied. +* Up to `--jobs` nodes per level run concurrently. +* Logs are serialized for clean output. +Example: - +```bash +fft run . --env dev_duckdb --jobs 4 +``` -# Project Configuration (`project.yml`) +--- -`project.yml` defines global metadata, documentation, variables, and data-quality tests for a FastFlowTransform project. This reference walks through the supported sections and common patterns. 
+## 🧪 Example Experiments -## File Location +| Scenario | Command | Expected behavior | +| ------------------------- | -------------------------------------- | ------------------------------- | +| First full run | `make cache_first` | All models build, cache written | +| No-op run | `make cache_second` | All skipped (no rebuilds) | +| Modify SQL | `make change_sql` | Downstream mart rebuilds | +| Add seed row | `make change_seed` | Staging + mart rebuild | +| Change env | `make change_env` | All nodes rebuild | +| Edit Python constant | `make change_py` | Only that Python model rebuilds | +| Warm & offline HTTP cache | `make http_first && make http_offline` | HTTP cache reused, no network | -`project.yml` lives at the root of your project. +--- +## 🧩 DAG Example + +After the first run, generate the DAG visualization: + +```bash +make dag +open site/dag/index.html ``` -project/ -├── models/ -├── project.yml -└── profiles.yml + +You’ll see: + +``` +seed_users → stg_users.ff +seed_orders → stg_orders.ff +(stg_users + stg_orders) → mart_user_orders.ff +py_constants +http_users ``` -## Top-Level Keys +* `py_constants` runs independently (parallel) +* `mart_user_orders.ff` depends on both staging nodes -```yaml -name: my_project -version: "0.1" -models_dir: models # optional, defaults to "models" +--- -docs: - dag_dir: site/dag # output for fft dag --html - models: - users: - description: "Raw users table" - columns: - id: "Primary key" - email: "Email address" +## 🧰 Tips -vars: - snapshot_day: "2024-01-01" - default_limit: 100 +* **Inspect fingerprints:** stored in `.fastflowtransform/target/manifest.json` +* **Audit table:** `_ff_meta` table in the engine stores build metadata +* **Clear cache:** delete `.fastflowtransform/` or use `make clean` +* **Parallel debugging:** use `--keep-going` to continue unaffected levels -tests: - - type: not_null - table: users - column: id - tags: [batch] -``` +--- -### Metadata +## ✅ Takeaways -| Key | Description | 
-|-------------|-------------| -| `name` | Project identifier (used in docs/metadata). | -| `version` | Arbitrary version string. | -| `models_dir`| Relative directory containing models (`*.ff.sql` / `*.ff.py`). | +* FFT’s build cache uses stable fingerprints to skip unchanged nodes. +* Fingerprints propagate downstream, ensuring correctness. +* The HTTP cache supports deterministic, offline API pipelines. +* Parallel execution accelerates runs without breaking dependencies. -### Documentation (`docs`) +Together, these features make iterative development **fast, reliable, and reproducible**. -- `dag_dir`: where `fft dag --html` writes the static site. -- `models`: per-model descriptions and column docs surfaced in the generated DAG/docs. -### Variables (`vars`) -Key/value pairs accessible via `{{ var('key', default) }}` in Jinja templates. CLI overrides (`--vars key=value`) take precedence. + -### Tests (`tests`) + +# Incremental & Delta Demo -Project-wide data quality checks run by `fft test`. Each test is a dict with: +This example project shows how to use **incremental models** and **Delta-style merges** in FastFlowTransform across DuckDB, Postgres and Databricks Spark. -- `type`: `not_null`, `unique`, `accepted_values`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`, or reconciliation checks (`reconcile_equal`, `reconcile_diff_within`, `reconcile_ratio_within`, `reconcile_coverage`). -- `table`: target table or relation. -- `column`: required for column-based tests. -- Optional: `tags`, `severity` (`error`/`warn`), additional parameters (e.g. `values`, `min`, `max`). +It is intentionally small and self-contained so you can copy/paste patterns into your own project.
-Example: +--- -```yaml -tests: - - type: accepted_values - table: mart_users - column: status - values: [active, invited] - severity: warn - - type: reconcile_equal - name: revenue_vs_bookings - left: { table: fct_revenue, expr: "sum(amount)" } - right: { table: fct_bookings, expr: "sum(expected_amount)" } - abs_tolerance: 5.0 +## Location & Layout + +The example lives under: + +```text +examples/incremental_demo/ +``` + +Directory structure: + +```text +incremental_demo/ + .env + .env.dev_duckdb + .env.dev_postgres + .env.dev_databricks + Makefile + profiles.yml + project.yml + sources.yml + + seeds/ + seed_events.csv + + models/ + common/ + events_base.ff.sql + fct_events_sql_inline.ff.sql + fct_events_sql_yaml.ff.sql + engines/ + duckdb/ + fct_events_py_incremental.ff.py + postgres/ + fct_events_py_incremental.ff.py + databricks_spark/ + fct_events_py_incremental.ff.py ``` -## Interaction with `.env` and Profiles +*Your actual filenames may differ slightly; the concepts are the same.* -`project.yml` does not read environment variables directly. However: +--- -- `vars:` can reference `var('key')` defaults overridden by CLI or `.env`. -- Tests often depend on `profiles.yml` and `sources.yml` for the actual connection details. -- Makefiles may set `FFT_ACTIVE_ENV` or other `FF_*` variables influencing runs, but `project.yml` remains static. +## What the demo shows -## Best Practices +The demo revolves around a tiny `events` dataset and three different ways to build an incremental fact table: -- Keep `project.yml` committed to version control (no secrets). -- Use `docs/` to provide richer Markdown descriptions; reference them via `columns` or `description` fields if desired. -- Organize tests by tag (`tags: [batch]`, `tags: [reconcile]`) to support selective execution: `fft test . --select tag:reconcile`. +1.
**SQL incremental model with inline delta SQL** -Refer to `docs/Data_Quality_Tests.md` for detailed test semantics and `docs/Profiles.md` for profile/env loading behavior. + * `models/common/fct_events_sql_inline.ff.sql` + * All incremental logic (how to find “new/changed” rows) is defined directly in the model’s `config(meta=...)` block. +2. **SQL incremental model with YAML config in `project.yml`** + * `models/common/fct_events_sql_yaml.ff.sql` + * The base SELECT lives in the model, but all incremental hints (`incremental.enabled`, `unique_key`, `updated_at_column`, …) are configured in `project.yml → models.incremental`. - +3. **Python incremental model** -# State Selection — R1 + * `models/engines/*/fct_events_py_incremental.ff.py` + * A Python model that returns a DataFrame; the executor applies incremental behaviour based on model `meta` (unique key + updated-at timestamp) and the target engine: -Build only changed nodes or select by last run results. + * DuckDB / Postgres: incremental insert/merge in SQL + * Databricks Spark: `MERGE INTO` for Delta where available, with a fallback full-refresh strategy -## Changed Nodes +--- -- `state:modified` — models that have changed since last cached fingerprint. -- `state:modified+` — the above plus all downstream dependents. 
+## Seed data -```bash -# First run populates cache -fft run examples/r1_demo --env dev --cache rw -# Touch files / change SQL → next run: -fft run examples/r1_demo --env dev --cache rw --select state:modified -fft run examples/r1_demo --env dev --cache rw --select state:modified+ -```` +The demo uses a simple seed file: -## Result-based Selection +```text +examples/incremental_demo/seeds/seed_events.csv +``` -Use the last `run_results.json`: +Example contents (conceptually): -* `result:ok` — successful models (no warnings) -* `result:warn` — successful but with warnings -* `result:fail` — alias of `result:error` -* `result:error`— failed models +```csv +event_id,updated_at,value +1,2024-01-01T10:00:00,10 +2,2024-01-01T10:05:00,20 +3,2024-01-01T10:10:00,30 +``` + +Running: ```bash -fft run examples/r1_demo --env dev --select result:error +fft seed examples/incremental_demo --env dev_duckdb ``` -### Artifacts +(or with your engine/env of choice) will materialize this seed into the warehouse (e.g. a DuckDB table or Postgres table). -``` -examples/r1_demo/.fastflowtransform/target/ -├── manifest.json -├── run_results.json -└── catalog.json +--- + +## Base model: `events_base` + +The base staging model simply exposes the events from the seed: + +```text +models/common/events_base.ff.sql ``` +Conceptually: +```sql +{{ config( + materialized='table', + tags=[ + 'example:incremental_demo', + 'scope:common', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + ], +) }} - +select + event_id, + updated_at, + value +from {{ source('raw', 'events') }}; +``` -# YAML Tests (Schema-bound) +All incremental models build on top of this base table. -Schema-bound tests live in `models/*.yml` or `models/**/schema.yml` and complement (or replace) `project.yml`-based tests. 
+--- -## Example +## Incremental configuration (high-level) -```yaml -# examples/r1_demo/models/users_enriched.yml -version: 2 -models: - - name: users_enriched - description: "Adds gmail flag" - columns: - - name: id - tests: - - not_null: { severity: error } - - unique - - name: email - tests: - - not_null - - accepted_values: - values: ["a@example.com","b@example.com","c@gmail.com"] - severity: warn -```` +All three incremental models share the same core idea: -### Severities +* Mark the model as **incremental** +* Provide a **unique key** (e.g. `event_id`) +* Provide an **updated-at / timestamp column** (e.g. `updated_at`) +* Optionally specify a **delta strategy**: -* `error` → contributes to failures (exit code 2). -* `warn` → surfaced in summary as ❕, does not affect exit code. + * **Inline SQL** (in the model) + * **External YAML** (referenced from the model) + * **Python** (engine-specific model that returns the delta dataset) -### Run +There are two ways to express this in the demo: -```bash -fft test examples/r1_demo --env dev -# Select only tests tagged 'reconcile' (if present) -fft test examples/r1_demo --env dev --select tag:reconcile +1. **Inline on the model** (used by `fct_events_sql_inline.ff.sql`), via `config(...)`: + +```jinja +{{ config( + materialized='incremental', + unique_key='event_id', + incremental={'updated_at_column': 'updated_at'}, + tags=['example:incremental_demo'], +) }} ``` -### Output (excerpt) +2. 
**As an overlay in `project.yml`** (used by `fct_events_sql_yaml.ff.sql` and the Python model): +```yaml +models: + incremental: + fct_events_sql_yaml.ff: + unique_key: "event_id" + incremental: + enabled: true + updated_at_column: "updated_at" + + fct_events_py_incremental.ff: + unique_key: "event_id" + incremental: + enabled: true + updated_at_column: "updated_at" ``` -Data Quality Summary -──────────────────── -✅ not_null users.id (3ms) -❌ unique users.id (2ms) - ↳ [unique] users.id: found 1 duplicate -❕ accepted_values users_enriched.email (1ms) -Totals -────── -✓ passed: 2 -✗ failed: 1 -! warnings: 1 -``` +The incremental engine then uses these `meta` fields to decide whether to: +* create the table (`create_table_as`) for the **first run** +* perform an **incremental insert** or **merge** for subsequent runs +--- - +## 1) SQL incremental with inline delta SQL -# Data Quality Test Reference +File: -FastFlowTransform exposes a set of built-in data quality checks that you can configure in `project.yml → tests:` and execute with `fft test`. This document lists every supported test, required parameters, and example configurations. 
+```text +models/common/fct_events_sql_inline.ff.sql +``` -## Usage Overview +In this variant, both *incremental configuration* and the *delta filter* live directly in the model: -```yaml -# project.yml -tests: - - type: not_null - table: users - column: id - severity: error # default (omit for error) - tags: [batch] +```jinja +{{ config( + materialized='incremental', + unique_key='event_id', + incremental={'updated_at_column': 'updated_at'}, + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:incremental', + 'inc:type:inline-sql', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + ], +) }} - - type: unique - table: users - column: email - tags: [batch] +with base as ( + select * + from {{ ref('events_base.ff') }} +) +select + event_id, + updated_at, + value +from base +{% if is_incremental() %} +where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} +) +{% endif %}; +``` - - type: accepted_values - table: users - column: status - values: [active, invited] - severity: warn # warn keeps run green on failure +On the **first run**, the engine sees no existing relation, so it materializes the full `select ... from events_base`. - - type: row_count_between - table: users_enriched - min: 1 - max: 100000 +On subsequent runs, the engine evaluates the `delta.sql` snippet and: - - type: reconcile_equal - name: revenue_vs_bookings # optional label in summaries - tags: [reconcile] - left: { table: fct_revenue, expr: "sum(amount)" } - right: { table: fct_bookings, expr: "sum(expected_amount)" } - abs_tolerance: 5.0 -``` +* **DuckDB / Postgres**: inserts or merges the resulting rows into the target table +* **Databricks Spark**: tries a `MERGE INTO` (Delta) and falls back to a full-refresh if necessary -Every entry is a single dictionary describing one check. The common keys are: +--- -| Key | Description | -|------------|-------------| -| `type` | Test kind (see tables below). 
| -| `table` | Target table for table-level checks or display hint for reconciliations. | -| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | -| `severity` | `error` (default) or `warn`. | -| `tags` | Optional list of selectors for `fft test --select tag:...`. | -| `name` | Optional identifier surfaced in summaries (useful for reconciliations). | +## 2) SQL incremental with YAML delta config -Run all configured checks: +File: -```bash -fft test . --env dev +```text +models/common/fct_events_sql_yaml.ff.sql ``` -Use `--select tag:` to restrict by tags (legacy `--select batch` reads the same tags list). Tests always execute regardless of cache settings. +Here the model body only defines the **canonical SELECT** and does *not* contain any incremental hints: -Each entry produces a summary line. Failures stop the command unless `severity: warn` is set. +```jinja +{{ config( + materialized='incremental', + tags=[ + 'example:incremental_demo', + 'scope:common', + 'kind:incremental', + 'inc:type:yaml-config', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + ], +) }} -## Table-Level Checks +with base as ( + select * + from {{ ref('events_base.ff') }} +) +select + event_id, + updated_at, + value +from base; +``` -These checks operate on a single table (optionally filtered with `where:`). Unless noted, they require a `column` argument. +All incremental behaviour for this model is driven by `project.yml`: -### `not_null` -- **Purpose:** Assert that a column never contains NULLs. -- **Parameters:** - - `column` *(str, required)* - - `where` *(str, optional)* — SQL predicate applied before the NULL check. -- **Failure:** Reports the number of NULL rows and shows the underlying SQL. +```yaml +models: + incremental: + fct_events_sql_yaml.ff: + unique_key: "event_id" + incremental: + enabled: true + updated_at_column: "updated_at" +``` -### `unique` -- **Purpose:** Detect duplicates within a column. 
-- **Parameters:** - - `column` *(str, required)* - - `where` *(str, optional)* -- **Failure:** Indicates how many duplicate groups were found (HAVING count > 1) and shows a sample query. +The registry merges this overlay into the model at load time, so the incremental runtime +sees effectively the same config as for the inline model (`unique_key` + `updated_at_column`) – +only the **source of truth** is different. -### `accepted_values` -- **Purpose:** Ensure every non-NULL value is inside an allowed set. -- **Parameters:** - - `column` *(str, required)* - - `values` *(list, required)* — permitted literals (strings are quoted automatically). - - `where` *(str, optional)* -- **Failure:** Shows the number of out-of-set values plus up to five sample values. +--- -### `greater_equal` -- **Purpose:** Require all values to be greater than or equal to a threshold. -- **Parameters:** - - `column` *(str, required)* - - `threshold` *(number, default `0`)* -- **Failure:** Lists how many rows fell below the threshold. +### Inline vs YAML config at a glance -### `non_negative_sum` -- **Purpose:** Validate that the sum of a numeric column is not negative. -- **Parameters:** - - `column` *(str, required)* -- **Failure:** Reports the signed sum when it is negative. +| Model | Where is incremental configured? | What lives in the SQL file? | +|----------------------------|-----------------------------------------|-----------------------------------------------| +| `fct_events_sql_inline.ff` | Inline in `config(...)` on the model | Full SELECT **+** `is_incremental()` filter | +| `fct_events_sql_yaml.ff` | `project.yml → models.incremental` | Full SELECT only (no incremental hints) | -### `row_count_between` -- **Purpose:** Guard minimum (and optional maximum) row counts for a table. -- **Parameters:** - - `min` *(int, default `1`)* - - `max` *(int, optional)* — omit for open-ended upper bounds. -- **Failure:** Indicates the observed row count when it falls outside `[min, max]`. 
+Both end up with the same runtime meta, only the **location of config** differs. -### `freshness` -- **Purpose:** Warn when the latest timestamp is older than an allowed delay. -- **Parameters:** - - `column` *(str, required)* — timestamp column. - - `max_delay_minutes` *(int, required)* — permitted staleness. -- **Failure:** Reports the computed lag in minutes. Uses ANSI-style `DATE_PART` (works on DuckDB/Postgres; extend for other engines as needed). +## 3) Python incremental model -## Cross-Table Reconciliations +Files: -Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. +```text +models/engines/duckdb/fct_events_py_incremental.ff.py +models/engines/postgres/fct_events_py_incremental.ff.py +models/engines/databricks_spark/fct_events_py_incremental.ff.py +``` -### `reconcile_equal` -- **Purpose:** Compare two scalar expressions with optional tolerances. -- **Parameters:** - - `left`, `right` *(dict, required)* with keys: - - `table` *(str, required)* - - `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). - - `where` *(str, optional)* - - `abs_tolerance` *(float, optional)* — maximum absolute difference. - - `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. -- **Failure:** Displays both values, absolute and relative differences. +Each engine variant uses the same logical signature: -### `reconcile_ratio_within` -- **Purpose:** Constrain the ratio `left/right` within bounds. -- **Parameters:** - - `left`, `right` *(dict, required as above)* - - `min_ratio`, `max_ratio` *(float, required)* -- **Failure:** Shows the computed ratio and expected interval. +```python +from fastflowtransform import engine_model +import pandas as pd # or pyspark.sql.DataFrame for Databricks Spark -### `reconcile_diff_within` -- **Purpose:** Limit the absolute difference between two aggregates. 
-- **Parameters:** - - `left`, `right` *(dict, required)* - - `max_abs_diff` *(float, required)* -- **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. -### `reconcile_coverage` -- **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero). -- **Parameters:** - - `source` *(dict, required)* — `table` and `key` column. - - `target` *(dict, required)* — `table` and `key` column. - - `source_where` *(str, optional)* — filter applied to the source. - - `target_where` *(str, optional)* — filter applied to the target. -- **Failure:** Reports the number of missing keys. +@engine_model( + only="duckdb", # or "postgres" / "databricks_spark" + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:duckdb", # or engine-specific + ], + meta={ + "incremental": True, + "unique_key": ["event_id"], + "updated_at": "updated_at", + }, +) +def build(df_events): + # 'df_events' is either a pandas.DataFrame or Spark DataFrame + # depending on the engine. + # The function returns either: + # - a full canonical result, or + # - only the delta rows, depending on your design. + # + # In the simplest version, you just return the full dataset and let the + # executor handle incremental logic based on meta. + return df_events[["event_id", "updated_at", "value"]] +``` -## Severity & Selectors +The executor uses the `meta.incremental` / `meta.unique_key` / `meta.updated_at` hints to run: -- `severity: error` (default) makes failures stop the test run with exit code 1. -- `severity: warn` records the result but keeps the run successful. -- `selectors:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. 
+* A **full-refresh** on the first run +* A **delta merge** on subsequent runs: -## CLI Summary Output + * For DuckDB / Postgres: insert/merge SQL + * For Databricks Spark: -Each executed check produces a line in the summary: + * `MERGE INTO` for Delta tables, or + * a full-refresh fallback strategy that rewrites the table based on the union of existing + delta rows -``` -✓ not_null users.email (3ms) -✖ accepted_values events.status values=['new', 'active'] (warn) -``` +--- -Failures include the generated SQL (where available) to simplify debugging. Use `fft test --verbose` for more detail, or `FFT_SQL_DEBUG=1` to log the underlying queries. +## Delta variant (Databricks / Spark) -## Further Reading +In addition to the “regular” incremental models, the demo also includes a **Delta Lake variant** +that shows how to: -- [`docs/YAML_Tests.md`](YAML_Tests.md) – schema for YAML-defined tests and advanced scenarios. -- [`fft test --help`] — command-line switches, selectors, and cache options. +- route a model to **Delta tables** via `project.yml` +- reuse the same incremental pattern, but with a **Delta-backed** table on Databricks/Spark +- keep Parquet and Delta models side-by-side in the same project +This is optional and only relevant for the `databricks_spark` engine. +--- - +### Storage configuration for the Delta model -# Environment Matrix (DuckDB-only) — Example +In `project.yml`, the Delta variant gets its own storage entry, separate from the Parquet fact table: -This tiny project demonstrates **per-environment configuration** (dev / stg / prod) while keeping everything on **DuckDB**. -Each environment uses its **own DuckDB file**, so you can switch environments without changing code. 
+```yaml +models: + storage: + # Existing Parquet fact table + fct_events_sql_inline: + path: ".local/spark/fct_events_sql_inline" + format: parquet + + # 🔹 Delta-based fact table (Spark/Databricks only) + fct_events_sql_inline_delta: + path: ".local/spark_delta/fct_events_sql_inline" + format: delta +```` -It also includes a **seed step** (CSV → table) and two minimal models: +Notes: -* `env_vars.ff` (Python) — echoes which env is active and which DuckDB file is used -* `hello.ff` (SQL view) — shows how `{{ this.* }}` resolves from the active profile -* `users.ff` (SQL table) — reads from the seeded CSV table to prove seeding works +* The key `fct_events_sql_inline_delta` must match the **model name**. +* `format: delta` tells the Databricks/Spark executor to create `USING DELTA LOCATION ...`. +* The path is different from the Parquet path so artifacts don’t clash. --- -## What this shows +### Delta fact model -* Layered environment files: `.env.dev`, `.env.stg`, `.env.prod` (+ optional `*.local` overrides) -* `profiles.yml` that reads from `env('…')` so connection details live in env files -* All environments use **DuckDB**, but **different DB files** (e.g. 
`.local/dev.duckdb`, `.local/stg.duckdb`, …) -* Seeding CSV → `seed_users` table, then a simple model consuming it +The Delta fact model is a close sibling of `fct_events_sql_inline.ff.sql`, but: ---- +* is tagged only for the Databricks/Spark engine +* is configured for incremental **merge** with a `unique_key` + `updated_at` column -## Project layout +Example (conceptual) model: -``` -examples/env_matrix/ -├─ models/ -│ ├─ env_vars.ff.py # Python model: shows env + DuckDB file info -│ └─ users.ff.sql # SQL table: reads from seeded 'seed_users' -├─ seeds/ -│ └─ users.csv # sample data for seeding (-> seed_users) -├─ profiles.yml # all envs = DuckDB, different paths -├─ .env # shared defaults (optional) -├─ .env.dev # dev environment vars -├─ .env.stg # stg environment vars -├─ .env.prod # prod environment vars -├─ .env.dev.local # private overrides (gitignored; optional) -├─ .env.stg.local # private overrides (gitignored; optional) -├─ .env.prod.local # private overrides (gitignored; optional) -└─ Makefile # convenience targets (run, seed, dag) -``` +```sql +-- models/common/fct_events_sql_inline_delta.ff.sql ---- +{{ config( + materialized='table', + tags=[ + 'example:incremental_demo', + 'kind:incremental', + 'engine:databricks_spark', + ], + meta={ + 'incremental': True, + 'unique_key': ['event_id'], + 'updated_at': 'updated_at', + 'delta': { + 'sql': " + with base as ( + select event_id, updated_at, value + from {{ ref('events_base.ff') }} + ) + select + event_id, + updated_at, + value + from base + where updated_at > ( + select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') + from {{ this }} + ) + " + }, + }, +) }} -## Environment files +-- canonical full-select (used for docs / full-refresh) +select + event_id, + updated_at, + value +from {{ ref('events_base.ff') }}; +``` -Each env file sets a different DuckDB path: +What happens: -* `.env.dev` +* On the **first run**, the engine sees no existing table and does a full materialization + (a Delta 
table at `.local/spark_delta/fct_events_sql_inline`). +* On **subsequent runs**, the executor uses the `delta.sql` query as the **incremental delta** and: - ``` - FFT_ACTIVE_ENV=dev - FF_ENGINE=duckdb - FF_DUCKDB_PATH=.local/env_matrix.dev.duckdb - ``` + * attempts a `MERGE INTO` for Delta tables, or + * falls back to a full-refresh strategy if MERGE is not supported. -* `.env.stg` +--- - ``` - FFT_ACTIVE_ENV=stg - FF_ENGINE=duckdb - FF_DUCKDB_PATH=.local/env_matrix.stg.duckdb - ``` +### Running the Delta variant -* `.env.prod` +Once your Databricks/Spark profile is configured (e.g. `dev_databricks` in `profiles.yml` and `.env.dev_databricks`), +you can run the Delta model like any other: - ``` - FFT_ACTIVE_ENV=prod - FF_ENGINE=duckdb - FF_DUCKDB_PATH=.local/env_matrix.prod.duckdb - ``` +```bash +# From the repo root +cd examples/incremental_demo -> You can place secrets or machine-local tweaks in `.env..local` (ignored by git). -> Optional toggles (if you want verbose SQL logs): -> `FFT_SQL_DEBUG=1`, `FFT_LOG_JSON=1` +# Seed +FFT_ACTIVE_ENV=dev_databricks fft seed . ---- +# Run only the Delta variant +FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select fct_events_sql_inline_delta.ff \ + --select tag:engine:databricks_spark -## `profiles.yml` (DuckDB for all envs) +# Or include it in the general incremental demo selection +FFT_ACTIVE_ENV=dev_databricks fft run . 
\ + --select tag:example:incremental_demo \ + --select tag:engine:databricks_spark +``` + +Optionally, you can add a small `not_null` test to `project.yml` to verify the Delta model: ```yaml -default: - dev: - engine: "{{ env('FF_ENGINE', 'duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" +tests: + - type: not_null + table: fct_events_sql_inline_delta + column: event_id + tags: [batch, delta] +``` - stg: - engine: "{{ env('FF_ENGINE', 'duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" +Then run: - prod: - engine: "{{ env('FF_ENGINE', 'duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" +```bash +FFT_ACTIVE_ENV=dev_databricks fft test . --select tag:delta ``` +to validate the Delta-backed incremental table specifically. + --- -## Models +## Running the demo -### `models/env_vars.ff.py` (Python) +From the project root: -Returns one row with: +```bash +cd examples/incremental_demo +``` -* `active_env_hint` (from `.env.*`), -* `ff_engine` (should be `duckdb` here), -* `duckdb_path`, `duckdb_exists`, `duckdb_size_bytes`. +### DuckDB -### `models/hello.ff.sql` (SQL view) +```bash +# Seed +FFT_ACTIVE_ENV=dev_duckdb fft seed . -Uses `{{ this.materialized }}`, `{{ this.schema }}`, `{{ this.database }}` so you can see what the active profile provides. (The simple `SELECT` is compatible with DuckDB; if you added casts like `::text`, they’re fine in DuckDB too.) +# Initial full run +FFT_ACTIVE_ENV=dev_duckdb fft run . \ + --select tag:example:incremental_demo --select tag:engine:duckdb -### `models/users.ff.sql` (SQL table) +# Incremental run (after modifying seed_events.csv to add later events) +FFT_ACTIVE_ENV=dev_duckdb fft run . \ + --select tag:example:incremental_demo --select tag:engine:duckdb \ + --cache rw -Reads from the seeded table `seed_users`: +# Data-quality tests (if configured in project.yml / schema YAML) +FFT_ACTIVE_ENV=dev_duckdb fft test . 
\ + --select tag:example:incremental_demo +``` -```sql -{{ config(materialized='table', tags=['demo', 'seed']) }} +### Postgres -select - id, - email -from "seed_users"; +```bash +FFT_ACTIVE_ENV=dev_postgres fft seed . +FFT_ACTIVE_ENV=dev_postgres fft run . \ + --select tag:example:incremental_demo --select tag:engine:postgres +FFT_ACTIVE_ENV=dev_postgres fft test . \ + --select tag:example:incremental_demo ``` -> If you see an error “table seed_users does not exist”, you **haven’t run `fft seed`** for that environment yet. +I would put the note right where you already describe how to start the demo on Databricks – i.e. your current section: ---- +````markdown +### Databricks Spark -## Seeds +```bash +FFT_ACTIVE_ENV=dev_databricks fft seed . +FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark +FFT_ACTIVE_ENV=dev_databricks fft test . \ + --select tag:example:incremental_demo +```` -`seeds/users.csv` is loaded by `fft seed` into a table named `seed_users`. -(That’s the default naming convention: `users.csv` → `seed_users`.) +### Databricks Spark (parquet vs Delta) ---- +You can run the incremental demo on Databricks/Spark against either **parquet** or **Delta** tables. -## Running it +FFT reads the desired table format from the `FF_DBR_TABLE_FORMAT` environment variable, which overrides +`databricks_spark.table_format` from `profiles.yml`. + +When `FF_DBR_TABLE_FORMAT=delta`, the Databricks/Spark executor automatically wires Delta Lake into the +SparkSession (downloads the Maven artifact via `delta-spark`, adds +`spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension`, and sets +`spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` unless you already +overrode those settings). No extra `spark-submit --conf` flags are needed—just ensure the +`delta-spark >= 4.0` Python package is installed. 
From the repo root: -### Using `uv` directly +```bash +cd examples/incremental_demo +```` -**Dev** +Run with **parquet** tables (default): ```bash -uv run fft seed examples/env_matrix --env dev -uv run fft run examples/env_matrix --env dev -uv run fft dag examples/env_matrix --env dev --html +FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft seed . +FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark +FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft test . \ + --select tag:example:incremental_demo ``` -**Staging** +Run with **Delta** tables: ```bash -uv run fft seed examples/env_matrix --env stg -uv run fft run examples/env_matrix --env stg +FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft seed . +FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark +FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft test . \ + --select tag:example:incremental_demo ``` -**Prod** +This way you can switch between parquet and Delta just by changing the `FF_DBR_TABLE_FORMAT` +environment variable, without touching the models or project.yml. -```bash -uv run fft seed examples/env_matrix --env prod -uv run fft run examples/env_matrix --env prod -``` +Adjust environment names to match your `profiles.yml`. -### Using the Makefile (inside `examples/env_matrix/`) +--- -```bash -make run-dev # runs the DAG on dev -make run-stg -make run-prod +## How to link this page into your docs -make seed-dev # seed only (dev) -make seed-stg -make seed-prod +### 1. MkDocs (`mkdocs.yml`) -make dag-dev # generate HTML DAG for dev -make clean # remove .local/, docs/, site/, .fastflowtransform/ +If you use MkDocs, place this file under e.g.: + +```text +docs/examples/incremental_demo.md ``` -> Tip: re-run `fft seed` whenever you switch environments or change `seeds/*.csv`. 
+and add it to your `mkdocs.yml` nav: ---- +```yaml +nav: + - Overview: index.md + - Examples: + - API demo: examples/api_demo.md + - Incremental & Delta demo: examples/incremental_demo.md +``` -## Inspecting results +### 2. Sphinx (`index.rst` + Markdown) -* The **HTML DAG** (after `make dag-dev`) will be at: +If you use Sphinx with MyST or Markdown support, put the file under: - ``` - examples/env_matrix/site/dag/index.html - ``` -* The **artifacts** are under: +```text +docs/examples/incremental_demo.md +``` - ``` - examples/env_matrix/.fastflowtransform/target/{manifest.json, run_results.json, catalog.json} - ``` -* Query the DuckDB files directly with `duckdb` CLI or `python` + `duckdb` module if you want to peek inside. +and reference it from your main `index.rst`: ---- +```rst +Welcome to FastFlowTransform's documentation! +============================================= -## Troubleshooting +.. toctree:: + :maxdepth: 2 -* **`seed_users` not found** - Run `fft seed` for the same environment: - `uv run fft seed examples/env_matrix --env dev` + overview + examples/api_demo + examples/incremental_demo +``` -* **No logs showing** - Use `-v`/`-vv` and/or `--sql-debug` on the CLI, or set: +(Adjust paths to match your actual layout.) - ``` - FFT_SQL_DEBUG=1 - FFT_LOG_JSON=1 # optional JSON logs - ``` +### 3. Top-level `index.md` (Markdown-only docs) -* **Wrong environment picked** - Double-check the `--env` flag in your CLI call and ensure the `.env.` file exists. 
+If your docs use a pure Markdown index, just add a link: ---- +```markdown +## Examples -## Clean up +- [API demo](examples/api_demo.md) +- [Incremental & Delta demo](examples/incremental_demo.md) +``` -```bash -make clean # from examples/env_matrix/ -# or manually: -rm -rf examples/env_matrix/.local examples/env_matrix/site examples/env_matrix/docs -rm -rf examples/env_matrix/.fastflowtransform +This way, the incremental demo appears alongside your existing API demo and other examples in your global documentation navigation. + +``` ``` @@ -3188,7 +5220,8 @@ This example demonstrates multi-engine configuration, environment-driven secrets ### DuckDB -- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). +- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). + Optionally set `FF_DUCKDB_SCHEMA` (default schema for models/seeds) and `FF_DUCKDB_CATALOG` (catalog alias) if you need to isolate namespaces. - Create the target directory once: `mkdir -p examples/api_demo/.local`. - Run `make ENGINE=duckdb seed run` to build the seeds and models inside the DuckDB file. @@ -3223,4 +5256,4 @@ This example demonstrates multi-engine configuration, environment-driven secrets # License ---8<-- "License" +--8<-- "License.md" diff --git a/mkdocs.yml b/mkdocs.yml index df787ba..1d17581 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ -site_name: FastFlowTransform -site_description: Documentation for the FastFlowTransform project +site_name: "FastFlowTransform – SQL & Python data transformation framework" +site_description: "Open-source Python framework for SQL & Python data modeling, ETL pipelines, and data quality tests on DuckDB, Postgres, BigQuery, Databricks and Snowflake." 
site_url: https://MirrorsAndMisdirections.github.io/FastFlowTransform/ repo_url: https://github.com/MirrorsAndMisdirections/FastFlowTransform repo_name: MirrorsAndMisdirections/FastFlowTransform @@ -32,11 +32,21 @@ nav: - State Selection: State_Selection.md - YAML Tests: YAML_Tests.md - Data Quality Tests: Data_Quality_Tests.md - - API Reference: reference/index.md + - API Reference: reference/ + - CLI Guide: CLI_Guide.md + - Auto Docs: Auto_Docs.md + - Logging: Logging.md + - Unit Tests: Unit_Tests.md + - Troubleshooting: Troubleshooting.md - Examples: + - Basic Demo: examples/Basic_Demo.md + - Materializations Demo: examples/Materializations_Demo.md - Environment Matrix: examples/Environment_Matrix.md - - API Demo Overview: examples/API_Demo.md - - Basic Demo Overview: examples/Basic_Demo.md + - Data Quality Tests Demo: examples/DQ_Demo.md + - Macros Demo: examples/Macros_Demo.md + - Cache Demo: examples/Cache_Demo.md + - Incremental Demo: examples/Incremental_Demo.md + - API Demo: examples/API_Demo.md - Local Engine Setup: examples/Local_Engine_Setup.md - Contributing: Contributing.md - License: License.md @@ -62,6 +72,7 @@ plugins: - gen-files: scripts: - docs/_scripts/gen_api.py + - literate-nav - mkdocstrings: handlers: python: diff --git a/pyproject.toml b/pyproject.toml index ae66b6c..ff3cce9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,20 +4,33 @@ build-backend = "hatchling.build" [project] name = "fastflowtransform" -version = "0.5.1" -description = "FastFlowTransform PoC: SQL+Python Models, DAG, DuckDB executor" +version = "0.5.15" +description = "Python framework for SQL & Python data transformation, ETL pipelines, and dbt-style data modeling" readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Marko Lekic", email = "you@example.com" } ] requires-python = ">=3.12" -keywords = ["data", "etl", "elt", "analytics", "dbt", "streaming", "dag"] +keywords = [ + "etl", + "data transformation", + "sql", + "data modeling", +
"duckdb", + "postgres", + "bigquery", + "spark", + "snowflake", + "dbt alternative", +] classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Development Status :: 3 - Alpha", "Intended Audience :: Developers", + "Topic :: Database", "Topic :: Software Development :: Build Tools", + "Topic :: Software Development :: Libraries", + "Topic :: Scientific/Engineering :: Information Analysis", + "Operating System :: OS Independent", + "License :: OSI Approved :: Apache Software License", ] dependencies = [ "typer>=0.12", @@ -37,6 +50,7 @@ dependencies = [ "snowflake-snowpark-python>=1.40.0", "bigframes>=2.24.0", "httpx>=0.28.1", + "delta-spark>=4.0.0", ] [project.optional-dependencies] @@ -63,8 +77,10 @@ docs = [ [project.urls] Homepage = "https://github.com/MirrorsAndMisdirections/FastFlowTransform" -Issues = "https://github.com/MirrorsAndMisdirections/FastFlowTransform/issues" +Documentation = "https://fastflowtransform.com" Repository = "https://github.com/MirrorsAndMisdirections/FastFlowTransform.git" +Source = "https://github.com/MirrorsAndMisdirections/fastflowtransform" +Issues = "https://github.com/MirrorsAndMisdirections/fastflowtransform/issues" [project.scripts] fft = "fastflowtransform.cli:app" diff --git a/pytest.ini b/pytest.ini index c488100..407b084 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,15 +3,11 @@ addopts = -q markers = duckdb: marks tests that require DuckDB postgres: marks tests that require Postgres - spark: marks tests that require Spark + databricks_spark: marks tests that require Spark bigquery: marks tests that require BigQuery snowflake: marks tests that require Snowflake - artifacts: marks tests covering artifacts generation - render: marks tests for render-time helpers/templates - schema: marks schema loader/validation tests http: marks tests that exercise the HTTP client/API - cli: marks CLI smoke tests streaming: marks tests that 
exercise streaming functionality - slow: marks slower end-to-end scenarios unit: marks unit tests integration: integration tests + example: run the examples as tests diff --git a/src/fastflowtransform/.env b/src/fastflowtransform/.env new file mode 100644 index 0000000..5826dc9 --- /dev/null +++ b/src/fastflowtransform/.env @@ -0,0 +1,13 @@ +# # Global engine override +# FF_ENGINE=postgres + +# # DuckDB +# FF_DUCKDB_PATH=.local/demo.duckdb + +# # Postgres +# FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432/ffdb +# FF_PG_SCHEMA=public + +# # BigQuery +# FF_BQ_DATASET=my_dataset +# FF_BQ_LOCATION=EU \ No newline at end of file diff --git a/src/fastflowtransform/cache.py b/src/fastflowtransform/cache.py index c8749f8..b00fd50 100644 --- a/src/fastflowtransform/cache.py +++ b/src/fastflowtransform/cache.py @@ -15,9 +15,9 @@ import yaml from jinja2 import Environment -from .core import REGISTRY, relation_for -from .dag import topo_sort -from .meta import relation_exists as _relation_exists_engine +from fastflowtransform.core import REGISTRY, relation_for +from fastflowtransform.dag import topo_sort +from fastflowtransform.meta import relation_exists as _relation_exists_engine @dataclass diff --git a/src/fastflowtransform/cli/bootstrap.py b/src/fastflowtransform/cli/bootstrap.py index 6147196..835e2e4 100644 --- a/src/fastflowtransform/cli/bootstrap.py +++ b/src/fastflowtransform/cli/bootstrap.py @@ -24,7 +24,6 @@ ) from fastflowtransform.executors._shims import BigQueryConnShim, SAConnShim from fastflowtransform.executors.base import BaseExecutor -from fastflowtransform.incremental import run_or_dispatch from fastflowtransform.logging import echo from fastflowtransform.settings import ( EngineType, @@ -286,15 +285,19 @@ def _get_test_con(executor: Any) -> Any: def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Callable]: ex: BaseExecutor if prof.engine == "duckdb": - ex = DuckExecutor(db_path=prof.duckdb.path) - return ex, (lambda
n: run_or_dispatch(ex, n, jenv)), ex.run_python + ex = DuckExecutor( + db_path=prof.duckdb.path, + schema=getattr(prof.duckdb, "db_schema", None), + catalog=getattr(prof.duckdb, "catalog", None), + ) + return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python if prof.engine == "postgres": if prof.postgres.dsn is None: raise RuntimeError("Postgres DSN must be set") ex = PostgresExecutor(dsn=prof.postgres.dsn, schema=prof.postgres.db_schema) - return ex, (lambda n: run_or_dispatch(ex, n, jenv)), ex.run_python + return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python if prof.engine == "bigquery": if prof.bigquery.dataset is None: @@ -312,7 +315,7 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal dataset=prof.bigquery.dataset, location=prof.bigquery.location, ) - return ex, (lambda n: run_or_dispatch(ex, n, jenv)), ex.run_python + return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python if prof.engine == "databricks_spark": ex = DatabricksSparkExecutor( @@ -326,7 +329,7 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal table_format=prof.databricks_spark.table_format, table_options=prof.databricks_spark.table_options, ) - return ex, (lambda n: run_or_dispatch(ex, n, jenv)), ex.run_python + return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python if prof.engine == "snowflake_snowpark": cfg = { @@ -340,7 +343,7 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal if prof.snowflake_snowpark.role: cfg["role"] = prof.snowflake_snowpark.role ex = SnowflakeSnowparkExecutor(cfg) - return ex, (lambda n: run_or_dispatch(ex, n, jenv)), ex.run_python + return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python _die(f"Unbekannter Engine-Typ: {getattr(prof, 'engine', None)}", code=1) raise AssertionError("unreachable") diff --git a/src/fastflowtransform/cli/options.py b/src/fastflowtransform/cli/options.py index 013c781..2e6dde4 100644 --- a/src/fastflowtransform/cli/options.py +++ 
b/src/fastflowtransform/cli/options.py @@ -185,6 +185,14 @@ class UTestCacheMode(str, Enum): ), ] +SkipBuildOpt = Annotated[ + bool, + typer.Option( + "--skip-build", + help="Do not build models before running tests (use existing tables).", + ), +] + __all__ = [ "CacheMode", @@ -205,6 +213,7 @@ class UTestCacheMode(str, Enum): "RebuildOnlyOpt", "ReuseMetaOpt", "SelectOpt", + "SkipBuildOpt", "UTestCacheMode", "UTestCacheOpt", "VarsOpt", diff --git a/src/fastflowtransform/cli/run.py b/src/fastflowtransform/cli/run.py index 5c35205..ab234b4 100644 --- a/src/fastflowtransform/cli/run.py +++ b/src/fastflowtransform/cli/run.py @@ -1,3 +1,4 @@ +# fastflowtransform/cli/run.py from __future__ import annotations import os @@ -46,7 +47,6 @@ fingerprint_sql, get_function_source, ) -from fastflowtransform.incremental import run_or_dispatch as run_sql_with_incremental from fastflowtransform.log_queue import LogQueue from fastflowtransform.logging import bind_context, bound_context, clear_context, echo, warn from fastflowtransform.meta import ensure_meta_table @@ -102,12 +102,6 @@ def _get_runner(self) -> tuple[Any, Callable, Callable]: clone_needed = not (isinstance(db_path, str) and db_path.strip() == ":memory:") if clone_needed: ex = ex.clone() - - def _run_sql_duckdb(n): - # Planner: intercept incremental materializations - return run_sql_with_incremental(ex, n, self.ctx.jinja_env) - - run_sql_wrapped = _run_sql_duckdb run_py_wrapped = ex.run_python except Exception: pass @@ -145,6 +139,87 @@ def _maybe_fingerprint(self, node: Any, ex: Any) -> str | None: return None return None + def _executor_namespace(self) -> str | None: + """ + Best-effort namespace (catalog/database/schema/dataset) to enrich log output. 
+ """ + if not isinstance(self.shared, tuple) or not self.shared: + return None + executor = self.shared[0] + if executor is None: + return None + parts: list[str] = [] + for attr in ("catalog", "database"): + val = getattr(executor, attr, None) + if isinstance(val, str) and val.strip(): + parts.append(val.strip()) + for attr in ("dataset", "schema"): + val = getattr(executor, attr, None) + if isinstance(val, str) and val.strip(): + parts.append(val.strip()) + return ".".join(parts) if parts else None + + def _qualified_target(self, name: str) -> str | None: + namespace = self._executor_namespace() + if not namespace: + return None + rel = relation_for(name) + if not rel: + return None + return f"{namespace}.{rel}" + + def format_run_label(self, name: str) -> str: + """ + Build the human-facing label for run logs, e.g.: + fct_events_sql_inline.ff [delta] (catalog.schema.fct_events_sql_inline) + + The storage format is resolved from: + 1) per-model storage config (project.yml → models.storage / meta.storage), + 2) engine defaults (e.g. Databricks/Spark table_format) as a fallback. + + For database engines like DuckDB/Postgres we intentionally hide the + underlying storage format (e.g. 'parquet') to avoid confusing output. + """ + qualified = self._qualified_target(name) + engine = (self.ctx.profile.engine or "").lower() + + # 1) per-model storage.format from meta (preferred) + fmt_from_meta: str | None = None + try: + node = REGISTRY.get_node(name) + meta = getattr(node, "meta", {}) or {} + storage_cfg = meta.get("storage") or {} + if isinstance(storage_cfg, dict): + val = storage_cfg.get("format") + if isinstance(val, str) and val.strip(): + fmt_from_meta = val.strip() + except Exception: + fmt_from_meta = None + + fmt: str | None = fmt_from_meta + + # 2) engine-level default format (e.g. Spark table_format) as fallback. + # Only meaningful for Spark-like engines. 
+ if fmt is None and engine in {"databricks_spark", "spark"}: + try: + executor, _, _ = self.shared + default_fmt = getattr(executor, "spark_table_format", None) + if isinstance(default_fmt, str) and default_fmt.strip(): + fmt = default_fmt.strip() + except Exception: + fmt = None + + # For database engines (DuckDB/Postgres), we do not show a format suffix + # at all to avoid misleading '[parquet]' labels. + if engine in {"duckdb", "postgres", "postgresql"}: + fmt_suffix = "" + else: + fmt_suffix = f" [{fmt}]" if fmt else "" + + if qualified: + return f"{name}{fmt_suffix} ({qualified})" + return f"{name}{fmt_suffix}" + def run_node(self, name: str) -> None: node = REGISTRY.nodes[name] ex, run_sql_fn, run_py_fn = self._get_runner() @@ -363,7 +438,8 @@ def _run_node_with_ctx(name: str) -> None: on_error=None, logger=logq, engine_abbr=_abbr(ctx.profile.engine), - name_width=28, + name_width=100, + name_formatter=engine_.format_run_label, ) finished_at = datetime.now(UTC).isoformat(timespec="seconds") diff --git a/src/fastflowtransform/cli/test_cmd.py b/src/fastflowtransform/cli/test_cmd.py index 92d91eb..7feb493 100644 --- a/src/fastflowtransform/cli/test_cmd.py +++ b/src/fastflowtransform/cli/test_cmd.py @@ -1,8 +1,9 @@ +# fastflowtransform/cli/test_cmd.py from __future__ import annotations import os import time -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, Mapping from dataclasses import dataclass from pathlib import Path from typing import Any @@ -10,13 +11,13 @@ import typer import yaml -from fastflowtransform import testing from fastflowtransform.cli.bootstrap import _get_test_con, _prepare_context from fastflowtransform.cli.options import ( EngineOpt, EnvOpt, ProjectArg, SelectOpt, + SkipBuildOpt, VarsOpt, ) from fastflowtransform.cli.selectors import _compile_selector @@ -25,7 +26,7 @@ from fastflowtransform.errors import ModelExecutionError from fastflowtransform.logging import echo from 
fastflowtransform.schema_loader import Severity, TestSpec, load_schema_tests -from fastflowtransform.test_registry import TESTS +from fastflowtransform.testing.registry import TESTS @dataclass @@ -115,11 +116,11 @@ def _apply_legacy_tag_filter( legacy_tag = tokens[0] def has_tag(t: Any) -> bool: - # Dict (altes Format) + # Dict (old format) if isinstance(t, dict): tags = t.get("tags") or [] return (legacy_tag in tags) if isinstance(tags, list) else (legacy_tag == tags) - # TestSpec (neues Schema) + # TestSpec (new Schema) if isinstance(t, TestSpec): return legacy_tag in (t.tags or []) return False @@ -127,51 +128,114 @@ def has_tag(t: Any) -> bool: return [t for t in tests if has_tag(t)] -def _run_dq_tests(con: Any, tests: Iterable[Any]) -> list[DQResult]: - results: list[DQResult] = [] - for t in tests: - severity: Severity - if isinstance(t, TestSpec): - kind = t.type - col = t.column - severity = t.severity - params: dict[str, Any] = t.params or {} - display_table = t.table - table_for_exec = t.table +def _fmt_table(value: Any, executor: Any) -> Any: + if executor is None or not hasattr(executor, "_format_test_table"): + return value + return executor._format_test_table(value) + + +def _fmt_reconcile_side(side: Any, executor: Any) -> Any: + if not isinstance(side, dict): + return side + side_fmt = dict(side) + tbl = side_fmt.get("table") + if tbl is not None: + side_fmt["table"] = _fmt_table(tbl, executor) + return side_fmt + + +def _prepare_test_from_spec( + t: TestSpec, executor: Any +) -> tuple[str, Any, Severity, dict[str, Any], Any, Any]: + """ + Normalize a TestSpec into (kind, column, severity, params, display_table, table_for_exec) + """ + kind = t.type + col = t.column + severity: Severity = t.severity + params: dict[str, Any] = t.params or {} + + display_table = t.table + table_for_exec = _fmt_table(t.table, executor) + + if kind.startswith("reconcile_"): + params = dict(params) # copy so we don't mutate original + for key in ("left", "right", 
"source", "target"): + side = params.get(key) + if isinstance(side, dict): + params[key] = _fmt_reconcile_side(side, executor) + + return kind, col, severity, params, display_table, table_for_exec + + +def _prepare_test_from_mapping( + t: Mapping[str, Any], executor: Any +) -> tuple[str, Any, Severity, dict[str, Any], Any, Any]: + """ + Normalize a dict-like test into (kind, column, severity, params, display_table, table_for_exec) + """ + kind = t["type"] + _sev = str(t.get("severity", "error")).lower() + severity: Severity = "warn" if _sev == "warn" else "error" + + params: dict[str, Any] = dict(t) + col = t.get("column") + + if kind.startswith("reconcile_"): + if isinstance(t.get("left"), dict) and isinstance(t.get("right"), dict): + lt = (t.get("left") or {}).get("table") + rt = (t.get("right") or {}).get("table") + display_table = f"{lt} ⇔ {rt}" + elif isinstance(t.get("source"), dict) and isinstance(t.get("target"), dict): + st = (t.get("source") or {}).get("table") + tt = (t.get("target") or {}).get("table") + display_table = f"{st} ⇒ {tt}" else: - kind = t["type"] - _sev = str(t.get("severity", "error")).lower() - severity = "warn" if _sev == "warn" else "error" - params = dict(t) - col = t.get("column") - if kind.startswith("reconcile_"): - if isinstance(t.get("left"), dict) and isinstance(t.get("right"), dict): - lt = (t.get("left") or {}).get("table") - rt = (t.get("right") or {}).get("table") - display_table = f"{lt} ⇔ {rt}" - elif isinstance(t.get("source"), dict) and isinstance(t.get("target"), dict): - st = (t.get("source") or {}).get("table") - tt = (t.get("target") or {}).get("table") - display_table = f"{st} ⇒ {tt}" - else: - display_table = "" - table_for_exec = t.get("table") - else: - table_for_exec = t.get("table") - if not isinstance(table_for_exec, str) or not table_for_exec: - raise typer.BadParameter("Missing or invalid 'table' in test config") - display_table = table_for_exec - - # Dispatch via registry if available; otherwise fallback to 
legacy map + display_table = "" + + table_for_exec = _fmt_table(t.get("table"), executor) + + for key in ("left", "right", "source", "target"): + side = params.get(key) + if isinstance(side, dict): + params[key] = _fmt_reconcile_side(side, executor) + else: + table_for_exec = _fmt_table(t.get("table"), executor) + if not isinstance(table_for_exec, str) or not table_for_exec: + raise typer.BadParameter("Missing or invalid 'table' in test config") + display_table = table_for_exec + + return kind, col, severity, params, display_table, table_for_exec + + +def _prepare_test( + raw_test: Any, executor: Any +) -> tuple[str, Any, Severity, dict[str, Any], Any, Any]: + """ + Dispatcher that normalizes both TestSpec and mapping-based tests. + """ + if isinstance(raw_test, TestSpec): + return _prepare_test_from_spec(raw_test, executor) + return _prepare_test_from_mapping(raw_test, executor) + + +def _run_dq_tests(con: Any, tests: Iterable[Any], executor: Any) -> list[DQResult]: + results: list[DQResult] = [] + + for raw_test in tests: + ( + kind, + col, + severity, + params, + display_table, + table_for_exec, + ) = _prepare_test(raw_test, executor) + t0 = time.perf_counter() - if kind in TESTS: - ok, msg, example = TESTS[kind](con, table_for_exec, col, params) - else: - ok, msg = _exec_test_kind(con, kind, params, table_for_exec, col) - example = None + ok, msg, example = TESTS[kind](con, table_for_exec, col, params) ms = int((time.perf_counter() - t0) * 1000) - # Build short parameter display for the summary line param_str = _format_params_for_summary(kind, params) results.append( @@ -187,70 +251,100 @@ def _run_dq_tests(con: Any, tests: Iterable[Any]) -> list[DQResult]: example_sql=example, ) ) + return results -def _exec_test_kind(con: Any, kind: str, t: dict, table: Any, col: Any) -> tuple[bool, str | None]: - # Guard for column-required tests in legacy path - def _need_col() -> str: - if not isinstance(col, str) or not col: - raise typer.BadParameter(f"Test '{kind}' 
requires a non-empty 'column' parameter") - return col - - try_map = { - "not_null": lambda: testing.not_null(con, table, _need_col()), - "unique": lambda: testing.unique(con, table, _need_col()), - "greater_equal": lambda: testing.greater_equal( - con, table, _need_col(), t.get("threshold", 0) - ), - "non_negative_sum": lambda: testing.non_negative_sum(con, table, _need_col()), - "row_count_between": lambda: testing.row_count_between( - con, table, t.get("min", 1), t.get("max") - ), - "freshness": lambda: testing.freshness(con, table, _need_col(), t["max_delay_minutes"]), - "accepted_values": lambda: testing.accepted_values( - con, table, col, values=t.get("values", []), where=t.get("where") - ), - "reconcile_equal": lambda: testing.reconcile_equal( - con, - t["left"], - t["right"], - abs_tolerance=t.get("abs_tolerance"), - rel_tolerance_pct=t.get("rel_tolerance_pct"), - ), - "reconcile_ratio_within": lambda: testing.reconcile_ratio_within( - con, - t["left"], - t["right"], - min_ratio=t["min_ratio"], - max_ratio=t["max_ratio"], - ), - "reconcile_diff_within": lambda: testing.reconcile_diff_within( - con, - t["left"], - t["right"], - max_abs_diff=t["max_abs_diff"], - ), - "reconcile_coverage": lambda: testing.reconcile_coverage( - con, - t["source"], - t["target"], - source_where=t.get("source_where"), - target_where=t.get("target_where"), - ), - } - - fn = try_map.get(kind) - if fn is None: - raise typer.BadParameter(f"Unknown test type: {kind}") - - try: - fn() - return True, None - except testing.TestFailure as e: - return False, str(e) - except Exception as e: - return False, f"Unexpected error: {e.__class__.__name__}: {e}" +# def _run_dq_tests(con: Any, tests: Iterable[Any], executor: Any) -> list[DQResult]: +# results: list[DQResult] = [] + +# def _fmt_table(value: Any) -> Any: +# if executor is None or not hasattr(executor, "_format_test_table"): +# return value +# return executor._format_test_table(value) + +# def _fmt_reconcile_side(side: Any) -> Any: +# 
if not isinstance(side, dict): +# return side +# side_fmt = dict(side) +# tbl = side_fmt.get("table") +# if tbl is not None: +# side_fmt["table"] = _fmt_table(tbl) +# return side_fmt + +# for t in tests: +# severity: Severity +# if isinstance(t, TestSpec): +# kind = t.type +# col = t.column +# severity = t.severity +# params: dict[str, Any] = t.params or {} +# display_table = t.table +# table_for_exec = _fmt_table(t.table) +# if kind.startswith("reconcile_"): +# params = dict(params) +# if isinstance(params.get("left"), dict): +# params["left"] = _fmt_reconcile_side(params["left"]) +# if isinstance(params.get("right"), dict): +# params["right"] = _fmt_reconcile_side(params["right"]) +# if isinstance(params.get("source"), dict): +# params["source"] = _fmt_reconcile_side(params["source"]) +# if isinstance(params.get("target"), dict): +# params["target"] = _fmt_reconcile_side(params["target"]) +# else: +# kind = t["type"] +# _sev = str(t.get("severity", "error")).lower() +# severity = "warn" if _sev == "warn" else "error" +# params = dict(t) +# col = t.get("column") +# if kind.startswith("reconcile_"): +# if isinstance(t.get("left"), dict) and isinstance(t.get("right"), dict): +# lt = (t.get("left") or {}).get("table") +# rt = (t.get("right") or {}).get("table") +# display_table = f"{lt} ⇔ {rt}" +# elif isinstance(t.get("source"), dict) and isinstance(t.get("target"), dict): +# st = (t.get("source") or {}).get("table") +# tt = (t.get("target") or {}).get("table") +# display_table = f"{st} ⇒ {tt}" +# else: +# display_table = "" +# table_for_exec = _fmt_table(t.get("table")) +# if isinstance(params.get("left"), dict): +# params["left"] = _fmt_reconcile_side(params["left"]) +# if isinstance(params.get("right"), dict): +# params["right"] = _fmt_reconcile_side(params["right"]) +# if isinstance(params.get("source"), dict): +# params["source"] = _fmt_reconcile_side(params["source"]) +# if isinstance(params.get("target"), dict): +# params["target"] = 
_fmt_reconcile_side(params["target"]) +# else: +# table_for_exec = _fmt_table(t.get("table")) +# if not isinstance(table_for_exec, str) or not table_for_exec: +# raise typer.BadParameter("Missing or invalid 'table' in test config") +# display_table = table_for_exec + +# # Dispatch via registry +# t0 = time.perf_counter() +# ok, msg, example = TESTS[kind](con, table_for_exec, col, params) +# ms = int((time.perf_counter() - t0) * 1000) + +# # Build short parameter display for the summary line +# param_str = _format_params_for_summary(kind, params) + +# results.append( +# DQResult( +# kind=kind, +# table=str(display_table), +# column=col, +# ok=ok, +# msg=msg, +# ms=ms, +# severity=severity, +# param_str=param_str, +# example_sql=example, +# ) +# ) +# return results def _print_summary(results: list[DQResult]) -> None: @@ -314,8 +408,8 @@ def test( engine: EngineOpt = None, vars: VarsOpt = None, select: SelectOpt = None, + skip_build: SkipBuildOpt = False, ) -> None: - # _ensure_logging() ctx = _prepare_context(project, env_name, engine, vars) tokens, pred = _compile_selector(select) has_model_matches = any(pred(node) for node in REGISTRY.nodes.values()) @@ -327,7 +421,8 @@ def test( model_pred = (lambda _n: True) if legacy_tag_only else pred # Run models; if a model fails, show friendly error then exit(1). 
- _run_models(model_pred, run_sql, run_py) + if not skip_build: + _run_models(model_pred, run_sql, run_py) # 1) project.yml tests tests: list[Any] = _load_tests(ctx.project) @@ -339,7 +434,7 @@ def test( typer.secho("No tests configured.", fg="bright_black") raise typer.Exit(code=0) - results = _run_dq_tests(con, tests) + results = _run_dq_tests(con, tests, execu) _print_summary(results) # Exit code: count only ERROR fails diff --git a/examples/postgres/__init__.py b/src/fastflowtransform/config/__init__.py similarity index 100% rename from examples/postgres/__init__.py rename to src/fastflowtransform/config/__init__.py diff --git a/src/fastflowtransform/config/models.py b/src/fastflowtransform/config/models.py new file mode 100644 index 0000000..7c3989e --- /dev/null +++ b/src/fastflowtransform/config/models.py @@ -0,0 +1,406 @@ +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from typing import Any, Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + +# --------------------------------------------------------------------------- +# Per-model storage configuration (project.yml → models.storage, or config(storage=...)) +# --------------------------------------------------------------------------- + + +class StorageConfig(BaseModel): + """ + Per-model storage override, for example: + + {{ config( + storage={ + "path": ".local/spark/users", + "format": "parquet", + "options": {"compression": "snappy"}, + } + ) }} + + This shape is also compatible with project.yml → models.storage. 
+ """ + + model_config = ConfigDict(extra="forbid") + + path: str + format: str | None = None + options: dict[str, Any] = Field(default_factory=dict) + + +# --------------------------------------------------------------------------- +# Incremental / delta configuration (structured form) +# --------------------------------------------------------------------------- + + +class InlineDeltaConfig(BaseModel): + """ + Inline SQL delta definition, for example: + + {{ config( + incremental=True, + delta={ + "sql": "select ... from {{ ref('events_base') }} where updated_at > (...)" + }, + ) }} + """ + + model_config = ConfigDict(extra="forbid") + + sql: str + + +class IncrementalConfig(BaseModel): + """ + High-level incremental configuration used in structured form, for example: + + {{ config( + incremental={ + "enabled": true, + "unique_key": ["id"], + "updated_at_column": "updated_at", + "delta_sql": "select ... where updated_at > (...)", + "on_schema_change": "append_new_columns", + } + ) }} + + This complements the simple shorthand `incremental: true`. + """ + + model_config = ConfigDict(extra="forbid") + + # Master switch (default: enabled) + enabled: bool = True + + # Canonical business key(s) + unique_key: list[str] | None = None + + # Updated-at column (single) + updated_at_column: str | None = None + + # Optional alternative notations + updated_at_columns: list[str] | None = None + timestamp_columns: list[str] | None = None + + # Delta definitions: + # - delta_sql: inline SQL (short form) + # - delta_python: Python callable for custom merge logic + delta_sql: str | None = None + delta_python: str | None = None + + # Schema evolution behaviour; directly mapped to meta["on_schema_change"] + # and consumed by incremental._get_on_schema_change(...) 
+ on_schema_change: Literal["ignore", "append_new_columns", "sync_all_columns"] | None = None + + @field_validator("unique_key", "updated_at_columns", "timestamp_columns", mode="before") + @classmethod + def _normalize_str_or_seq(cls, v: Any) -> list[str] | None: + if v is None: + return None + if isinstance(v, str): + return [v] + if isinstance(v, Sequence) and not isinstance(v, (str, bytes)): + return [str(x) for x in v] + raise TypeError("must be a string or a sequence of strings") + + +# --------------------------------------------------------------------------- +# ModelConfig - canonical form of config(...) / decorator meta +# --------------------------------------------------------------------------- + + +class ModelConfig(BaseModel): + """ + Canonical, *flattened* model configuration for SQL and Python models. + + This represents the keys that ultimately end up in Node.meta after: + + - SQL: {{ config(...) }} in the model header + - Python: @model(..., meta={...}) + - project.yml overlays (models.incremental / models.storage) + + The schema is intentionally strict (extra="forbid") so that: + - only documented keys are allowed + - typos and unknown fields fail fast + """ + + model_config = ConfigDict(extra="forbid") + + # --- Core materialization & classification ----------------------------- + + materialized: Literal["table", "view", "incremental", "ephemeral"] | None = None + + # Optional logical kind; useful for selectors (kind:python / kind:sql / etc.) + kind: str | None = None + + # Tags for selection (tag:...); both SQL & Python models contribute here + tags: list[str] = Field(default_factory=list) + + # Engine restriction, e.g. 
engines=["duckdb", "postgres"] + engines: list[str] = Field(default_factory=list) + + # --- Storage override (per model) -------------------------------------- + + storage: StorageConfig | None = None + + # --- Incremental flags & shortcuts ------------------------------------- + + # Shortcut: + # - True → incremental enabled + # - False / None → not incremental (unless executors override) + # + # Structured: + # - { ... IncrementalConfig fields ... } + incremental: IncrementalConfig | None = None + + # Top-level shortcuts (backwards-compatible) + # These are used by existing executor logic. + unique_key: list[str] | None = None + primary_key: list[str] | None = None # alias + + # Updated-at / timestamp information + updated_at: str | None = None + updated_at_column: str | None = None + updated_at_columns: list[str] | None = None + timestamp_columns: list[str] | None = None + + # Columns used to determine delta recency (used by Python incremental logic) + delta_columns: list[str] | None = None + + # Delta definitions - shorthand, equivalent to fields on IncrementalConfig + delta: InlineDeltaConfig | None = None + delta_sql: str | None = None + delta_python: str | None = None + + # Schema evolution behaviour; consumed by incremental._get_on_schema_change(...) + on_schema_change: Literal["ignore", "append_new_columns", "sync_all_columns"] | None = None + + # --- HTTP/API extension points (optional) ------------------------------ + # These are intentionally loose to allow API models to stash config blocks + # under known keys without having to allow arbitrary extras everywhere. 
+ http: dict[str, Any] | None = None + api: dict[str, Any] | None = None + + # ---------------------------------------------------------------------- + # Normalisation helpers + # ---------------------------------------------------------------------- + + @field_validator("tags", "engines", mode="before") + @classmethod + def _normalize_tags_engines(cls, v: Any) -> list[str]: + """ + Allow: + - string: "duckdb" → ["duckdb"] + - sequence: ["duckdb", "postgres"] + """ + if v is None: + return [] + if isinstance(v, str): + return [v] + if isinstance(v, Sequence) and not isinstance(v, (str, bytes)): + return [str(x) for x in v] + raise TypeError("must be a string or a sequence of strings") + + @field_validator( + "unique_key", + "primary_key", + "updated_at_columns", + "timestamp_columns", + "delta_columns", + mode="before", + ) + @classmethod + def _normalize_key_lists(cls, v: Any) -> list[str] | None: + """ + Allow single string or list/tuple of strings. + """ + if v is None: + return None + if isinstance(v, str): + return [v] + if isinstance(v, Sequence) and not isinstance(v, (str, bytes)): + return [str(x) for x in v] + raise TypeError("must be a string or a sequence of strings") + + @model_validator(mode="after") + def _merge_incremental_overlays(self) -> ModelConfig: + """ + Backwards- and executor-compatible merge: + + - If `incremental` is an IncrementalConfig instance, mirror the + central fields onto the top-level shortcuts (unique_key, updated_at_column, delta_*). + + - If `incremental == True` but no IncrementalConfig was provided, + we simply rely on top-level fields (unique_key, updated_at, …). 
+ """ + inc = self.incremental + + if isinstance(inc, IncrementalConfig): + # unique_key + if self.unique_key is None and inc.unique_key is not None: + self.unique_key = list(inc.unique_key) + + # updated-at / updated_at_column + if self.updated_at_column is None and inc.updated_at_column is not None: + self.updated_at_column = inc.updated_at_column + + if self.updated_at is None and inc.updated_at_column is not None: + # For older code that only checks `updated_at` + self.updated_at = inc.updated_at_column + + # timestamp / updated_at columns + if self.updated_at_columns is None and inc.updated_at_columns is not None: + self.updated_at_columns = list(inc.updated_at_columns) + + if self.timestamp_columns is None and inc.timestamp_columns is not None: + self.timestamp_columns = list(inc.timestamp_columns) + + # delta hints + if self.delta_sql is None and inc.delta_sql is not None: + self.delta_sql = inc.delta_sql + if self.delta_python is None and inc.delta_python is not None: + self.delta_python = inc.delta_python + + # schema evolution + if self.on_schema_change is None and inc.on_schema_change is not None: + self.on_schema_change = inc.on_schema_change + + # If InlineDeltaConfig is used, prefer its SQL for delta_sql + if self.delta and not self.delta_sql: + self.delta_sql = self.delta.sql + + return self + + # ---------------------------------------------------------------------- + # Convenience helpers for executor code + # ---------------------------------------------------------------------- + + def is_incremental_enabled(self) -> bool: + """ + Return True if incremental mode is effectively enabled for this model. 
+ """ + if self.incremental is None: + return False + return bool(self.incremental.enabled) + + # ---------------------------------------------------------------------- + # Cross-field guardrails (fail fast with clear messages) + # ---------------------------------------------------------------------- + @model_validator(mode="after") + def _validate_incremental_requirements(self) -> ModelConfig: + """ + Enforce combinations that must hold for incremental materializations. + + Rules: + 1) If materialized == 'incremental', incremental must be effectively enabled. + 2) If incremental is enabled, at least one freshness/delta hint must exist: + - updated_at / updated_at_column / updated_at_columns / timestamp_columns + OR delta_sql OR delta_python. + 3) If both updated_at and updated_at_column are provided, they must match. + 4) (Opinionated) Require unique_key when incremental is enabled + to avoid accidental cartesian merges. Relax if your executor permits. + """ + is_mat_inc = self.materialized == "incremental" + is_inc_enabled = self.is_incremental_enabled() + + # 1) Require incremental block when materialized='incremental' + if is_mat_inc and not is_inc_enabled: + raise ValueError( + "materialized='incremental' requires an enabled incremental configuration. " + "Either set `incremental: true` or provide a " + "structured `incremental: { enabled: true, ... }`." + ) + + # 2) If incremental is enabled, ensure at least one delta/freshness hint + if is_inc_enabled: + has_time_hints = any( + [ + bool(self.updated_at), + bool(self.updated_at_column), + bool(self.updated_at_columns), + bool(self.timestamp_columns), + ] + ) + has_delta_hints = any([bool(self.delta_sql), bool(self.delta_python)]) + if not (has_time_hints or has_delta_hints): + raise ValueError( + "incremental.enabled=True but no delta/freshness hints were provided. " + "Please set one of: updated_at / updated_at_column / updated_at_columns / " + "timestamp_columns, or provide delta_sql / delta_python." 
+ ) + + # 3) If both notations are present, they must agree + if self.updated_at and self.updated_at_column and self.updated_at != self.updated_at_column: + raise ValueError( + f"updated_at ('{self.updated_at}') and " + f"updated_at_column ('{self.updated_at_column}') " + "refer to different columns. Use one or make them identical." + ) + + # 4) (Opinionated) Require unique_key when incremental is enabled + if is_inc_enabled and not (self.unique_key or self.primary_key): + raise ValueError( + "incremental.enabled=True requires a unique_key (or primary_key) to be set " + "for safe merges. Example: unique_key: ['id']" + ) + + return self + + +# --------------------------------------------------------------------------- +# Helper: validate & normalize raw meta dict +# --------------------------------------------------------------------------- + + +def validate_model_meta(meta: Mapping[str, Any] | None) -> ModelConfig: + """ + Validate a raw meta mapping coming from SQL config(...) or Python decorators + and return a strongly-typed ModelConfig instance. + + This function also normalizes shorthand forms like: + - incremental: true/false + - incremental: { ... 
} (without explicit enabled flag) + """ + data: dict[str, Any] = dict(meta or {}) + + incr = data.get("incremental") + + if isinstance(incr, bool): + # incremental: true/false → normalize to nested config + data["incremental"] = {"enabled": incr} + elif isinstance(incr, Mapping): + # ensure we can mutate it + incr_dict = dict(incr) + # default enabled=True if omitted + incr_dict.setdefault("enabled", True) + data["incremental"] = incr_dict + elif incr is not None: + raise TypeError("meta.incremental must be a bool, a mapping or null") + + return ModelConfig.model_validate(data) + + +def validate_model_meta_strict( + meta: Mapping[str, Any] | None, + *, + model_name: str | None = None, + file_path: str | None = None, +) -> ModelConfig: + """ + Like validate_model_meta(), but wraps exceptions with model/file context for clearer errors. + Callers in the loader should prefer this, so a bad config never silently disables a model. + """ + try: + return validate_model_meta(meta) + except Exception as e: + ctx = [] + if model_name: + ctx.append(f"model '{model_name}'") + if file_path: + ctx.append(f"{file_path}") + prefix = f"Invalid model config ({', '.join(ctx)})" if ctx else "Invalid model config" + raise TypeError(f"{prefix}: {e}") from e diff --git a/src/fastflowtransform/config/project.py b/src/fastflowtransform/config/project.py new file mode 100644 index 0000000..d9ff1fc --- /dev/null +++ b/src/fastflowtransform/config/project.py @@ -0,0 +1,528 @@ +from __future__ import annotations + +from collections.abc import Sequence +from pathlib import Path +from typing import Annotated, Any, Literal + +import yaml +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + +from fastflowtransform.config.models import IncrementalConfig, StorageConfig + +# --------------------------------------------------------------------------- +# Incremental overlays from project.yml → models.incremental +# 
--------------------------------------------------------------------------- + + +class IncrementalModelConfig(BaseModel): + """ + Per-model incremental overlay from project.yml, for example: + + models: + incremental: + fct_events_sql_inline: + incremental: true + unique_key: ["event_id"] + updated_at: "updated_at" + delta_sql: | + select ... + schema_sync: append_new_columns + + fct_events_py_incremental: + incremental: + enabled: true + strategy: merge + unique_key: ["event_id"] + updated_at_column: "updated_at" + + This is intentionally compatible with the fields on ModelConfig. + """ + + model_config = ConfigDict(extra="forbid") + + # Master switch / structured config + incremental: bool | IncrementalConfig | None = None + + # Shortcuts (later merged into ModelConfig) + unique_key: list[str] | None = None + primary_key: list[str] | None = None + + updated_at: str | None = None + updated_at_column: str | None = None + updated_at_columns: list[str] | None = None + timestamp_columns: list[str] | None = None + + delta_sql: str | None = None + delta_config: str | None = None + delta_python: str | None = None + + schema_sync: Literal["none", "append_new_columns", "sync_all_columns"] | None = None + + @field_validator( + "unique_key", + "primary_key", + "updated_at_columns", + "timestamp_columns", + mode="before", + ) + @classmethod + def _normalize_key_lists(cls, v: Any) -> list[str] | None: + if v is None: + return None + if isinstance(v, str): + return [v] + if isinstance(v, Sequence) and not isinstance(v, (str, bytes)): + return [str(x) for x in v] + raise TypeError("must be a string or a sequence of strings") + + +# --------------------------------------------------------------------------- +# models: block from project.yml +# --------------------------------------------------------------------------- + + +class ModelsBlock(BaseModel): + """ + project.yml: + + models: + storage: + users: + path: ".local/spark/users" + format: parquet + ... 
+ + incremental: + my_model: + incremental: true + unique_key: ["id"] + updated_at: "updated_at" + """ + + model_config = ConfigDict(extra="forbid") + + storage: dict[str, StorageConfig] = Field(default_factory=dict) + incremental: dict[str, IncrementalModelConfig] = Field(default_factory=dict) + + +# --------------------------------------------------------------------------- +# seeds: block from project.yml +# --------------------------------------------------------------------------- + + +class SeedsBlock(BaseModel): + """ + project.yml: + + seeds: + storage: + seed_users: + path: ".local/spark/seed_users" + format: parquet + """ + + model_config = ConfigDict(extra="forbid") + + storage: dict[str, StorageConfig] = Field(default_factory=dict) + + +# --------------------------------------------------------------------------- +# docs: block from project.yml +# --------------------------------------------------------------------------- + + +class DocsConfig(BaseModel): + """ + Optional documentation-related configuration. + + Example: + + docs: + dag_dir: "site/dag" + """ + + model_config = ConfigDict(extra="forbid") + + dag_dir: str | None = None + + +# --------------------------------------------------------------------------- +# Top-level tests from project.yml (in addition to schema tests) +# --------------------------------------------------------------------------- + + +class BaseProjectTestConfig(BaseModel): + """ + Common fields for all project-level tests declared in project.yml under `tests:`. + + NOTE: + - For table/column-level tests (not_null, unique, ...), `table` and/or `column` + are required in the concrete subclasses. + - For reconciliation tests, `table` and `column` are optional and used only + for display/grouping in summaries. 
+ """ + + model_config = ConfigDict(extra="forbid") + + type: str # discriminated in concrete subclasses + + severity: Literal["error", "warn"] = "error" + tags: list[str] = Field(default_factory=list) + + # Optional human-readable label, especially for reconciliations + name: str | None = None + + +class NotNullTestConfig(BaseProjectTestConfig): + """ + not_null test: assert that a column contains no NULL values. + """ + + type: Literal["not_null"] + + # required for this test + table: str + column: str + + # optional WHERE predicate + where: str | None = None + + +class UniqueTestConfig(BaseProjectTestConfig): + """ + unique test: detect duplicate values within a column. + """ + + type: Literal["unique"] + + table: str + column: str + + where: str | None = None + + +class AcceptedValuesTestConfig(BaseProjectTestConfig): + """ + accepted_values test: ensure all non-NULL values are inside an allowed set. + + Behaviour: + - If `values` is None or an empty list, the test is treated as a no-op + (always passes), but still appears in summaries. + """ + + type: Literal["accepted_values"] + + table: str + column: str + + # allowed literals (strings, numbers, ...) + values: list[Any] | None = None + where: str | None = None + + +class GreaterEqualTestConfig(BaseProjectTestConfig): + """ + greater_equal test: require all values to be >= threshold. + """ + + type: Literal["greater_equal"] + + table: str + column: str + + threshold: float = 0.0 + + +class NonNegativeSumTestConfig(BaseProjectTestConfig): + """ + non_negative_sum test: validate that SUM(column) is not negative. + """ + + type: Literal["non_negative_sum"] + + table: str + column: str + + +class RowCountBetweenTestConfig(BaseProjectTestConfig): + """ + row_count_between test: ensure row count is between [min_rows, max_rows]. + + - `min_rows` defaults to 1. + - `max_rows` is optional (open-ended upper bound). 
+ """ + + type: Literal["row_count_between"] + + table: str + + min_rows: int = 1 + max_rows: int | None = None + + @model_validator(mode="after") + def validate_bounds(self) -> RowCountBetweenTestConfig: + """ + Ensure that min_rows is less than or equal to max_rows when both are set. + """ + if self.max_rows is not None and self.min_rows > self.max_rows: + raise ValueError( + f"row_count_between: min_rows ({self.min_rows}) " + f"must be less than or equal to max_rows ({self.max_rows})." + ) + return self + + +class FreshnessTestConfig(BaseProjectTestConfig): + """ + freshness test: warn or fail when latest timestamp is older + than `max_delay_minutes`. + """ + + type: Literal["freshness"] + + table: str + column: str # timestamp column + + max_delay_minutes: int + + +class ReconcileExprSide(BaseModel): + """ + Expression-based reconciliation side (left/right): + + left/right: + table: str + expr: str + where: optional filter condition + """ + + model_config = ConfigDict(extra="forbid") + + table: str + expr: str + where: str | None = None + + +class ReconcileKeySide(BaseModel): + """ + Key-based reconciliation side for coverage checks: + + source/target: + table: str + key: str + """ + + model_config = ConfigDict(extra="forbid") + + table: str + key: str + + +class ReconcileEqualTestConfig(BaseProjectTestConfig): + """ + `reconcile_equal` test: compare two scalar expressions with optional tolerances. + + Attributes: + left (ReconcileExprSide): Left-hand expression (`table`, `expr`, optional `where`). + right (ReconcileExprSide): Right-hand expression. + abs_tolerance (float | None): Maximum absolute difference allowed. + rel_tolerance_pct (float | None): Maximum relative difference (percent). + + Notes: + The top-level `table`/`column` fields are optional and only used for display in summaries. 
+ + Example (YAML): + + ```yaml + - type: reconcile_equal + left: { table: a_tbl, expr: "sum(x)" } + right: { table: b_tbl, expr: "sum(y)", where: "dt >= current_date - interval '7 days'" } + abs_tolerance: 0.01 + rel_tolerance_pct: 1.0 + ``` + """ + + type: Literal["reconcile_equal"] + + left: ReconcileExprSide + right: ReconcileExprSide + + abs_tolerance: float | None = None + rel_tolerance_pct: float | None = None + + +class ReconcileRatioWithinTestConfig(BaseProjectTestConfig): + """ + `reconcile_ratio_within` test: constrain the ratio `left/right` within `[min_ratio, max_ratio]`. + + Attributes: + left (ReconcileExprSide): Left-hand expression. + right (ReconcileExprSide): Right-hand expression. + min_ratio (float): Minimum allowed ratio. + max_ratio (float): Maximum allowed ratio. + + Example (YAML): + + ```yaml + - type: reconcile_ratio_within + left: { table: orders, expr: "sum(amount)" } + right: { table: payments, expr: "sum(value)" } + min_ratio: 0.98 + max_ratio: 1.02 + ``` + """ + + type: Literal["reconcile_ratio_within"] + + left: ReconcileExprSide + right: ReconcileExprSide + + min_ratio: float + max_ratio: float + + +class ReconcileDiffWithinTestConfig(BaseProjectTestConfig): + """ + `reconcile_diff_within` test: limit the absolute difference between two aggregates. + + Attributes: + left (ReconcileExprSide): Left-hand expression. + right (ReconcileExprSide): Right-hand expression. + max_abs_diff (float): Maximum allowed absolute difference. + + Example (YAML): + + ```yaml + - type: reconcile_diff_within + left: { table: a, expr: "count(*)" } + right: { table: b, expr: "count(*)" } + max_abs_diff: 10 + ``` + """ + + type: Literal["reconcile_diff_within"] + + left: ReconcileExprSide + right: ReconcileExprSide + + max_abs_diff: float + + +class ReconcileCoverageTestConfig(BaseProjectTestConfig): + """ + `reconcile_coverage` test: ensure all keys from `source` exist in `target`. + + Attributes: + source (ReconcileKeySide): Source side (`table`, `key`). 
+ target (ReconcileKeySide): Target side (`table`, `key`). + source_where (str | None): Optional filter predicate applied to the source. + target_where (str | None): Optional filter predicate applied to the target. + + Example (YAML): + + ```yaml + - type: reconcile_coverage + source: { table: crm_users, key: "user_id" } + target: { table: fact_orders, key: "user_id" } + source_where: "status = 'active'" + target_where: "dt >= current_date - interval '30 days'" + ``` + """ + + type: Literal["reconcile_coverage"] + + source: ReconcileKeySide + target: ReconcileKeySide + + source_where: str | None = None + target_where: str | None = None + + +ProjectTestConfig = Annotated[ + NotNullTestConfig + | UniqueTestConfig + | AcceptedValuesTestConfig + | GreaterEqualTestConfig + | NonNegativeSumTestConfig + | RowCountBetweenTestConfig + | FreshnessTestConfig + | ReconcileEqualTestConfig + | ReconcileRatioWithinTestConfig + | ReconcileDiffWithinTestConfig + | ReconcileCoverageTestConfig, + Field(discriminator="type"), +] + + +# --------------------------------------------------------------------------- +# project.yml - top-level model +# --------------------------------------------------------------------------- + + +class ProjectConfig(BaseModel): + """ + Strict representation of project.yml. + + Example: + + name: duckdb_api_demo + version: "0.1" + + vars: {} + + models: + storage: { ... } + incremental: { ... } + + seeds: + storage: { ... 
} + + tests: + - type: not_null + table: mart_users_join + column: user_id + tags: [batch] + """ + + model_config = ConfigDict(extra="forbid") + + name: str + version: str | int + + # Models directory (in case you want this configurable) + models_dir: str = "models" + + # Arbitrary variables that can be accessed via var('key') in Jinja + vars: dict[str, Any] = Field(default_factory=dict) + + models: ModelsBlock = Field(default_factory=ModelsBlock) + seeds: SeedsBlock = Field(default_factory=SeedsBlock) + + tests: list[ProjectTestConfig] = Field(default_factory=list) + + docs: DocsConfig | None = None + + +# --------------------------------------------------------------------------- +# Helper: load & validate project.yml +# --------------------------------------------------------------------------- + + +def parse_project_yaml_config(project_dir: Path) -> ProjectConfig: + """ + Read project.yml under `project_dir` and validate it strictly using Pydantic. + + Typical usage inside core._load_project_yaml: + + from fastflowtransform.config.project import parse_project_yaml_config + + proj_cfg = parse_project_yaml_config(project_dir) + self.project_vars = dict(proj_cfg.vars or {}) + + # models.storage → storage.set_model_storage(...) + # seeds.storage → storage.set_seed_storage(...) + # models.incremental → self.incremental_models = ... 
+ """ + cfg_path = project_dir / "project.yml" + raw = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} + return ProjectConfig.model_validate(raw) diff --git a/src/fastflowtransform/config/seeds.py b/src/fastflowtransform/config/seeds.py new file mode 100644 index 0000000..437b540 --- /dev/null +++ b/src/fastflowtransform/config/seeds.py @@ -0,0 +1,128 @@ +# src/fastflowtransform/config/seeds.py +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import yaml +from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator, model_validator + +from fastflowtransform.settings import EngineType + + +class SeedTargetConfig(BaseModel): + """ + Configuration for a single seed target entry in seeds/schema.yml. + + Example: + targets: + raw/users: + schema: raw + table: seed_users + schema_by_engine: + duckdb: main + postgres: raw + """ + + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + schema_: str | None = Field(default=None, alias="schema") + table: str | None = None + schema_by_engine: dict[EngineType, str] = Field(default_factory=dict) + + @field_validator("schema_") + @classmethod + def _strip_schema(cls, value: str | None) -> str | None: + if isinstance(value, str): + value = value.strip() + return value or None + return value + + @field_validator("schema_by_engine") + @classmethod + def _strip_schema_by_engine(cls, value: dict[str, str]) -> dict[str, str]: + out: dict[str, str] = {} + for eng, sch in (value or {}).items(): + if not isinstance(sch, str): + continue + sch_clean = sch.strip() + if sch_clean: + out[eng] = sch_clean + return out + + @model_validator(mode="after") + def _allow_empty_schema(self) -> SeedTargetConfig: + # At the moment we allow targets without schema / schema_by_engine, + # so that the executor/default schema can still be used. + # If you want to enforce at least one schema, uncomment the check below. 
+ # + # if not self.schema and not self.schema_by_engine: + # raise ValueError( + # "Either 'schema' or 'schema_by_engine' must be set for a seed target" + # ) + return self + + +class SeedsSchemaConfig(BaseModel): + """ + Top-level configuration for seeds/schema.yml. + + Structure: + targets: + : + schema: ... + table: ... + schema_by_engine: { duckdb: ..., postgres: ... } + + dtypes: + : + column_a: string + column_b: int64 + """ + + model_config = ConfigDict(extra="forbid") + + targets: dict[str, SeedTargetConfig] = Field(default_factory=dict) + dtypes: dict[str, dict[str, str]] = Field(default_factory=dict) + + @field_validator("dtypes") + @classmethod + def _normalize_dtypes(cls, value: dict[str, dict[str, Any]]) -> dict[str, dict[str, str]]: + out: dict[str, dict[str, str]] = {} + for table_key, cols in (value or {}).items(): + if not isinstance(cols, dict): + continue + clean_cols: dict[str, str] = {} + for col, dtype in cols.items(): + if not isinstance(col, str) or not isinstance(dtype, str): + continue + col_clean = col.strip() + dtype_clean = dtype.strip() + if col_clean and dtype_clean: + clean_cols[col_clean] = dtype_clean + if clean_cols: + out[table_key] = clean_cols + return out + + +def load_seeds_schema(project_dir: Path) -> SeedsSchemaConfig | None: + """ + Load and validate seeds/schema.yml for a given project. + + Returns: + - SeedsSchemaConfig instance when the file exists and is valid + - None when no file is present + + Raises: + ValueError: when YAML is present but does not match the expected schema. 
+ """ + seeds_dir = project_dir / "seeds" + cfg_path = seeds_dir / "schema.yml" + if not cfg_path.exists(): + return None + + raw = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} + try: + return SeedsSchemaConfig.model_validate(raw) + except ValidationError as exc: + raise ValueError(f"Failed to parse seeds/schema.yml: {exc}") from exc diff --git a/src/fastflowtransform/config/sources.py b/src/fastflowtransform/config/sources.py new file mode 100644 index 0000000..076da14 --- /dev/null +++ b/src/fastflowtransform/config/sources.py @@ -0,0 +1,379 @@ +# fastflowtransform/config/sources.py +from __future__ import annotations + +from collections.abc import Mapping +from pathlib import Path +from typing import Any, Literal + +import yaml +from pydantic import BaseModel, ConfigDict, Field, field_validator + +# --------------------------------------------------------------------------- +# Low-level helpers +# --------------------------------------------------------------------------- + +_SOURCE_CFG_FIELDS = { + "identifier", + "schema", + "database", + "catalog", + "project", + "dataset", + "location", + "format", + "options", +} + + +def _compact_cfg(cfg: Mapping[str, Any]) -> dict[str, Any]: + cleaned: dict[str, Any] = {} + for key, value in cfg.items(): + if key == "options": + if value: + cleaned[key] = dict(value) + continue + if value is not None: + cleaned[key] = value + return cleaned + + +def _normalize_options(value: Any, *, field_path: str) -> dict[str, Any]: + if value is None: + return {} + if isinstance(value, Mapping): + return {str(k): v for k, v in value.items()} + raise ValueError(f"sources.yml → {field_path}: expected mapping, got {type(value).__name__}") + + +def _pick_source_fields( + data: Mapping[str, Any] | None, + base: Mapping[str, Any] | None, + *, + field_path: str, +) -> dict[str, Any]: + """Return a dict limited to the supported source configuration fields.""" + data = data or {} + base = base or {} + out: dict[str, Any] = {k: 
base.get(k) for k in _SOURCE_CFG_FIELDS} + for key, value in data.items(): + if key not in _SOURCE_CFG_FIELDS: + continue + if key == "options": + base_opts = out.get("options") or {} + incoming = _normalize_options(value, field_path=f"{field_path}.options") + merged = dict(base_opts) + merged.update(incoming) + out["options"] = merged + else: + out[key] = value + + if "options" not in out or out["options"] is None: + out["options"] = {} + return out + + +def _normalize_engine_overrides( + overrides: Mapping[str, Any] | None, + *, + field_path: str, +) -> dict[str, dict[str, Any]]: + if overrides is None: + return {} + if not isinstance(overrides, Mapping): + raise ValueError( + f"sources.yml → {field_path}: overrides must be a mapping of engine -> config" + ) + + normalized: dict[str, dict[str, Any]] = {} + for engine, cfg in overrides.items(): + if cfg is None: + normalized[str(engine)] = {} + continue + if not isinstance(cfg, Mapping): + raise ValueError( + f"sources.yml → {field_path}[{engine!r}]: " + f"expected mapping, got {type(cfg).__name__}" + ) + picked = _pick_source_fields(cfg, None, field_path=f"{field_path}[{engine!r}]") + normalized[str(engine)] = _compact_cfg(picked) + return normalized + + +def _merge_source_configs(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]: + merged = dict(base) + for key, value in override.items(): + if key == "options": + opts = dict(merged.get("options") or {}) + opts.update(value or {}) + merged["options"] = opts + else: + merged[key] = value + if "options" not in merged or merged["options"] is None: + merged["options"] = {} + return merged + + +def _combine_engine_overrides( + source_overrides: Mapping[str, dict[str, Any]], + table_overrides: Mapping[str, dict[str, Any]], +) -> dict[str, dict[str, Any]]: + engines = set(source_overrides) | set(table_overrides) + combined: dict[str, dict[str, Any]] = {} + for engine in engines: + combined[engine] = _merge_source_configs( + 
source_overrides.get(engine, {}), + table_overrides.get(engine, {}), + ) + return combined + + +# --------------------------------------------------------------------------- +# Pydantic models mirroring sources.yml structure +# --------------------------------------------------------------------------- + + +class SourceTableConfig(BaseModel): + """ + Schema for an individual table entry under a source group. + + We allow extra keys so that future metadata (e.g. owner) doesn't break users, + but we only *use* the known ones below when normalizing. + """ + + model_config = ConfigDict(extra="allow") + + name: str + identifier: str | None = None + + # core location fields + schema_: str | None = Field(default=None, alias="schema") + database: str | None = None + catalog: str | None = None + project: str | None = None + dataset: str | None = None + location: str | None = None + format: str | None = None + options: dict[str, Any] | None = None + + overrides: dict[str, dict[str, Any]] | None = None + + # dbt-compatible metadata (kept as-is) + description: str | None = None + columns: Any | None = None + meta: dict[str, Any] | None = None + + @field_validator("options", mode="before") + @classmethod + def _normalize_opts(cls, v: Any) -> dict[str, Any] | None: + if v is None: + return None + if isinstance(v, Mapping): + return {str(k): v for k, v in v.items()} + raise TypeError("options must be a mapping if provided") + + +class SourceGroupConfig(BaseModel): + """ + Schema for each entry under top-level `sources:` in sources.yml. 
+ """ + + model_config = ConfigDict(extra="forbid") + + name: str + + # group-level location defaults + schema_: str | None = Field(default=None, alias="schema") + database: str | None = None + catalog: str | None = None + project: str | None = None + dataset: str | None = None + location: str | None = None + format: str | None = None + options: dict[str, Any] | None = None + + overrides: dict[str, dict[str, Any]] | None = None + + tables: list[SourceTableConfig] + + @field_validator("options", mode="before") + @classmethod + def _normalize_opts(cls, v: Any) -> dict[str, Any] | None: + if v is None: + return None + if isinstance(v, Mapping): + return {str(k): v for k, v in v.items()} + raise TypeError("options must be a mapping if provided") + + +class SourcesFileConfig(BaseModel): + """ + Strict representation of sources.yml (version 2). + """ + + model_config = ConfigDict(extra="forbid") + + version: Literal[2] + sources: list[SourceGroupConfig] = Field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Normalization: Pydantic → legacy normalized dict +# --------------------------------------------------------------------------- + + +def _normalize_sources(cfg: SourcesFileConfig) -> dict[str, dict[str, dict[str, Any]]]: + """ + Convert the strongly typed config into the normalized structure currently + expected by Registry.sources and resolve_source_entry. + + Shape: + { + "": { + "": { + "base": { ...location fields... }, + "overrides": { "": { ... } }, + "description": ..., + "columns": ..., + "meta": ..., + }, + ... + }, + ... 
+ } + """ + normalized: dict[str, dict[str, dict[str, Any]]] = {} + + for s_idx, src in enumerate(cfg.sources): + if src.name in normalized: + raise ValueError(f"sources.yml: duplicate source '{src.name}'.") + + # group defaults & engine overrides + src_defaults = _pick_source_fields( + src.model_dump( + exclude={"tables", "overrides", "name"}, + exclude_none=True, + by_alias=True, + ), + None, + field_path=f"sources[{s_idx}]", + ) + src_overrides = _normalize_engine_overrides( + src.overrides, + field_path=f"sources[{s_idx}].overrides", + ) + + group: dict[str, dict[str, Any]] = {} + for t_idx, tbl in enumerate(src.tables): + if tbl.name in group: + raise ValueError( + f"sources.yml → source '{src.name}': duplicate table '{tbl.name}'." + ) + + base_cfg = _pick_source_fields( + tbl.model_dump( + include=_SOURCE_CFG_FIELDS, + exclude_none=True, + by_alias=True, + ), + src_defaults, + field_path=f"sources[{s_idx}].tables[{t_idx}]", + ) + if not base_cfg.get("identifier") and not base_cfg.get("location"): + base_cfg["identifier"] = tbl.name + + table_overrides = _normalize_engine_overrides( + tbl.overrides, + field_path=f"sources[{s_idx}].tables[{t_idx}].overrides", + ) + overrides = _combine_engine_overrides(src_overrides, table_overrides) + + entry_meta = { + "description": tbl.description, + "columns": tbl.columns, + "meta": tbl.meta, + } + + group[tbl.name] = { + "base": base_cfg, + "overrides": overrides, + **{k: v for k, v in entry_meta.items() if v is not None}, + } + + normalized[src.name] = group + + return normalized + + +# --------------------------------------------------------------------------- +# Public helper used by core.Registry +# --------------------------------------------------------------------------- + + +def load_sources_config(project_dir: Path) -> dict[str, dict[str, dict[str, Any]]]: + """ + Read `sources.yml` under `project_dir`, validate it with Pydantic, and + return the normalized dict that Registry expects. 
+ + This function is the direct analogue of `parse_project_yaml_config`. + """ + cfg_path = project_dir / "sources.yml" + raw = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} + + try: + parsed = SourcesFileConfig.model_validate(raw) + except Exception: # pydantic.ValidationError, yaml issues bubbled up earlier + # Let the caller wrap this into a friendlier "Failed to parse sources.yml" message + raise + + return _normalize_sources(parsed) + + +# --------------------------------------------------------------------------- +# Optional: resolve_source_entry helper +# --------------------------------------------------------------------------- + + +def resolve_source_entry( + entry: Mapping[str, Any], + engine: str | None, + *, + default_identifier: str | None = None, +) -> dict[str, Any]: + """ + Apply engine overrides to a normalized entry ("base" + "overrides"). + + This is unchanged from your current implementation. + """ + base = entry.get("base") if isinstance(entry, Mapping) else None + if not isinstance(base, Mapping): + base = {} + + cfg = dict(base) + cfg.setdefault("identifier", None) + cfg.setdefault("schema", None) + cfg.setdefault("database", None) + cfg.setdefault("catalog", None) + cfg.setdefault("project", None) + cfg.setdefault("dataset", None) + cfg.setdefault("location", None) + cfg.setdefault("format", None) + cfg.setdefault("options", {}) + + overrides = entry.get("overrides") if isinstance(entry, Mapping) else None + if isinstance(overrides, Mapping): + # wildcard/default overrides + for wildcard_key in ("*", "default", "any"): + if wildcard_key in overrides: + cfg = _merge_source_configs(cfg, overrides[wildcard_key]) + # engine-specific overrides + if engine and engine in overrides: + cfg = _merge_source_configs(cfg, overrides[engine]) + + ident = cfg.get("identifier") + if (ident is None or ident == "") and not cfg.get("location"): + if default_identifier: + cfg["identifier"] = default_identifier + else: + raise KeyError("Source 
configuration missing identifier or location") + + return cfg diff --git a/src/fastflowtransform/core.py b/src/fastflowtransform/core.py index 25da7cf..a2612af 100644 --- a/src/fastflowtransform/core.py +++ b/src/fastflowtransform/core.py @@ -3,6 +3,7 @@ import ast import importlib.util +import inspect import os import re import types @@ -13,99 +14,81 @@ from typing import Any import jinja2.runtime -import yaml from jinja2 import Environment, FileSystemLoader, StrictUndefined +from pydantic import ValidationError from fastflowtransform import storage -from fastflowtransform.errors import DependencyNotFoundError, ModuleLoadError +from fastflowtransform.config.models import validate_model_meta_strict +from fastflowtransform.config.project import parse_project_yaml_config +from fastflowtransform.config.sources import load_sources_config +from fastflowtransform.errors import ( + DependencyNotFoundError, + ModelConfigError, + ModuleLoadError, +) from fastflowtransform.logging import get_logger -_SOURCE_CFG_FIELDS = { - "identifier", - "schema", - "database", - "catalog", - "project", - "dataset", - "location", - "format", - "options", -} - - -def _compact_cfg(cfg: Mapping[str, Any]) -> dict[str, Any]: - cleaned: dict[str, Any] = {} - for key, value in cfg.items(): - if key == "options": - if value: - cleaned[key] = dict(value) - continue - if value is not None: - cleaned[key] = value - return cleaned - - -def _normalize_options(value: Any, *, field_path: str) -> dict[str, Any]: - if value is None: - return {} - if isinstance(value, Mapping): - return {str(k): v for k, v in value.items()} - raise ValueError(f"sources.yml → {field_path}: expected mapping, got {type(value).__name__}") - - -def _pick_source_fields( - data: Mapping[str, Any] | None, - base: Mapping[str, Any] | None, - *, - field_path: str, -) -> dict[str, Any]: - """Return a dict limited to the supported source configuration fields.""" - - data = data or {} - base = base or {} - out: dict[str, Any] = {k: 
base.get(k) for k in _SOURCE_CFG_FIELDS} - for key, value in data.items(): - if key not in _SOURCE_CFG_FIELDS: - continue - if key == "options": - base_opts = out.get("options") or {} - incoming = _normalize_options(value, field_path=f"{field_path}.options") - merged = dict(base_opts) - merged.update(incoming) - out["options"] = merged - else: - out[key] = value - - if "options" not in out or out["options"] is None: - out["options"] = {} - return out - -def _normalize_engine_overrides( - overrides: Mapping[str, Any] | None, - *, - field_path: str, -) -> dict[str, dict[str, Any]]: - if overrides is None: - return {} - if not isinstance(overrides, Mapping): - raise ValueError( - f"sources.yml → {field_path}: overrides must be a mapping of engine -> config" +def _validate_py_model_signature(func: Callable, deps: list[str], *, path: Path, name: str) -> None: + """ + Validate that a Python model function can accept the declared deps. + + Rules: + - If no deps are declared: + - Functions with 0 positional params are OK. + - Functions with *args/**kwargs are OK. + - Otherwise: error. + - If N deps are declared: + - Functions with at least N positional params are OK. + - Functions with *args are OK regardless of arity. + - Otherwise: error. 
+ """ + sig = inspect.signature(func) + params = list(sig.parameters.values()) + + # Count positional params (pos-only or pos-or-kw) + pos_params = [p for p in params if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)] + has_varargs = any(p.kind == p.VAR_POSITIONAL for p in params) # *args + has_varkw = any(p.kind == p.VAR_KEYWORD for p in params) # **kwargs + + dep_count = len(deps or []) + + # Zero deps case + if dep_count == 0: + if len(pos_params) == 0: + return # perfect match + if has_varargs or has_varkw: + return # flexible function, OK + # Too many required positional params, no *args/**kwargs + raise ModuleLoadError( + f"{path}: @model(name='{name}') declares no deps but the function defines " + f"{len(pos_params)} positional parameter(s).", + hint=( + "Strict mode is enabled: zero-dep models must not define positional parameters.\n" + "Fix one of the following:\n" + " • Remove parameters: def build(): …\n" + " • Accept varargs: def build(*_): …\n" + " • Declare explicit deps: @model(..., deps=['upstream']) " + "and def build(upstream): …" + ), + code="PY_SIG_STRICT", ) - normalized: dict[str, dict[str, Any]] = {} - for engine, cfg in overrides.items(): - if cfg is None: - normalized[str(engine)] = {} - continue - if not isinstance(cfg, Mapping): - raise ValueError( - f"sources.yml → {field_path}[{engine!r}]: " - "expected mapping, got {type(cfg).__name__}" - ) - picked = _pick_source_fields(cfg, None, field_path=f"{field_path}[{engine!r}]") - normalized[str(engine)] = _compact_cfg(picked) - return normalized + # N deps case + if len(pos_params) >= dep_count or has_varargs: + return # OK (enough positional slots, or *args present) + + # Not enough positional capacity, and no *args + expected = ", ".join(deps) + raise ModuleLoadError( + f"{path}: @model(name='{name}') declares {dep_count} dep(s) but the function " + f"accepts only {len(pos_params)} positional parameter(s) and no *args.", + hint=( + "Match parameter count/order to your deps or accept 
varargs.\n" + f"Example: def {func.__name__}({expected}): …\n" + f"Or: def {func.__name__}(*deps): …" + ), + ) def _merge_source_configs(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]: @@ -122,20 +105,6 @@ def _merge_source_configs(base: Mapping[str, Any], override: Mapping[str, Any]) return merged -def _combine_engine_overrides( - source_overrides: Mapping[str, dict[str, Any]], - table_overrides: Mapping[str, dict[str, Any]], -) -> dict[str, dict[str, Any]]: - engines = set(source_overrides) | set(table_overrides) - combined: dict[str, dict[str, Any]] = {} - for engine in engines: - combined[engine] = _merge_source_configs( - source_overrides.get(engine, {}), - table_overrides.get(engine, {}), - ) - return combined - - def resolve_source_entry( entry: Mapping[str, Any], engine: str | None, *, default_identifier: str | None = None ) -> dict[str, Any]: @@ -172,96 +141,6 @@ def resolve_source_entry( return cfg -def _parse_sources_yaml(raw: Any) -> dict[str, dict[str, dict[str, Any]]]: - if not raw: - return {} - if not isinstance(raw, Mapping): - raise ValueError("sources.yml must be a mapping with keys 'version' and 'sources'.") - - version = raw.get("version") - version_no = 2 - if version != version_no: - raise ValueError("sources.yml → version: Only '2' is supported.") - - entries = raw.get("sources") - if entries is None: - return {} - if not isinstance(entries, Iterable): - raise ValueError("sources.yml → sources: expected a list of source declarations.") - - normalized: dict[str, dict[str, dict[str, Any]]] = {} - for idx, entry in enumerate(entries): - if not isinstance(entry, Mapping): - raise ValueError( - f"sources.yml → sources[{idx}]: expected mapping, got {type(entry).__name__}." 
- ) - - src_name = entry.get("name") - if not src_name or not isinstance(src_name, str): - raise ValueError(f"sources.yml → sources[{idx}]: missing 'name'.") - - if src_name in normalized: - raise ValueError(f"sources.yml: duplicate source '{src_name}'.") - - src_defaults = _pick_source_fields(entry, None, field_path=f"sources[{idx}]") - src_overrides = _normalize_engine_overrides( - entry.get("overrides"), field_path=f"sources[{idx}].overrides" - ) - - tables = entry.get("tables") - if tables is None: - raise ValueError(f"sources.yml → sources[{idx}]: missing 'tables' list.") - if not isinstance(tables, Iterable): - raise ValueError( - f"sources.yml → sources[{idx}].tables: expected list, got {type(tables).__name__}." - ) - - group: dict[str, dict[str, Any]] = {} - for t_idx, table in enumerate(tables): - if not isinstance(table, Mapping): - raise ValueError( - f"sources.yml → sources[{idx}].tables[{t_idx}]: " - f"expected mapping, got {type(table).__name__}." - ) - - tbl_name = table.get("name") - if not tbl_name or not isinstance(tbl_name, str): - raise ValueError(f"sources.yml → sources[{idx}].tables[{t_idx}]: missing 'name'.") - - if tbl_name in group: - raise ValueError( - f"sources.yml → source '{src_name}': duplicate table '{tbl_name}'." 
- ) - - base_cfg = _pick_source_fields( - table, src_defaults, field_path=f"sources[{idx}].tables[{t_idx}]" - ) - if not base_cfg.get("identifier") and not base_cfg.get("location"): - base_cfg["identifier"] = tbl_name - - table_overrides = _normalize_engine_overrides( - table.get("overrides"), - field_path=f"sources[{idx}].tables[{t_idx}].overrides", - ) - overrides = _combine_engine_overrides(src_overrides, table_overrides) - - entry_meta = { - "description": table.get("description"), - "columns": table.get("columns"), - "meta": table.get("meta"), - } - - group[tbl_name] = { - "base": base_cfg, - "overrides": overrides, - **{k: v for k, v in entry_meta.items() if v is not None}, - } - - normalized[src_name] = group - - return normalized - - @dataclass class Node: name: str @@ -283,6 +162,7 @@ def __init__(self): self.project_vars: dict[str, Any] = {} # project.yml: vars self.cli_vars: dict[str, Any] = {} # CLI --vars overrides self.active_engine: str | None = None + self.incremental_models: dict[str, dict[str, Any]] = {} def get_project_dir(self) -> Path: """Return the project directory after load_project(), or raise if not set.""" @@ -325,6 +205,28 @@ def _lookup_storage_meta(self, node_name: str) -> dict[str, Any]: """ return storage.get_model_storage(node_name) + def _lookup_incremental_meta(self, node_name: str) -> dict[str, Any]: + """ + Return incremental metadata for a given node (from project.yml → models.incremental). + + Accepts names with or without trailing '.ff' — we try both variants and + return the first match. + """ + candidates: list[str] + if node_name.endswith(".ff"): + # e.g. "users.ff" → try "users.ff", then "users" + candidates = [node_name, node_name[:-3]] + else: + # e.g. 
"users" → try "users", then "users.ff" + candidates = [node_name, f"{node_name}.ff"] + + for key in candidates: + cfg = self.incremental_models.get(key) + if cfg: + return dict(cfg) + + return {} + def _current_engine(self) -> str | None: """ Determine the active engine in precedence order: @@ -390,118 +292,6 @@ def _should_register_for_engine(self, meta: Mapping[str, Any], *, path: Path) -> ) return current in allowed - # def load_project(self, project_dir: Path) -> None: - # self.nodes.clear() - # self.py_funcs.clear() - # self.py_requires.clear() - # self.sources = {} - # self.project_vars = {} - # self.cli_vars = {} - # self.macros.clear() - - # storage.set_model_storage({}) - # storage.set_seed_storage({}) - - # self.project_dir = project_dir - # models_dir = project_dir / "models" - # self.env = Environment( - # loader=FileSystemLoader(str(models_dir)), - # undefined=StrictUndefined, - # autoescape=False, - # trim_blocks=True, - # lstrip_blocks=True, - # ) - - # # Make sure macros are available to all templates before model discovery. 
- # self._load_macros(models_dir) - # self._load_py_macros(models_dir) - - # # load sources (version 2 schema) - # src_path = project_dir / "sources.yml" - # if src_path.exists(): - # raw_sources = yaml.safe_load(src_path.read_text(encoding="utf-8")) - # try: - # self.sources = _parse_sources_yaml(raw_sources) - # except ValueError as exc: - # raise ValueError(f"Failed to parse sources.yml: {exc}") from exc - # else: - # self.sources = {} - - # # load project.yml (vars) - # proj_path = project_dir / "project.yml" - # if proj_path.exists(): - # proj_cfg = yaml.safe_load(proj_path.read_text(encoding="utf-8")) or {} - # self.project_vars = dict(proj_cfg.get("vars", {}) or {}) - - # models_cfg = proj_cfg.get("models") if isinstance(proj_cfg, Mapping) else None - # model_storage_raw = None - # if isinstance(models_cfg, Mapping): - # candidate = models_cfg.get("storage") - # if isinstance(candidate, Mapping): - # model_storage_raw = candidate - # storage.set_model_storage( - # storage.normalize_storage_map(model_storage_raw, project_dir=project_dir) - # ) - - # seeds_cfg = proj_cfg.get("seeds") if isinstance(proj_cfg, Mapping) else None - # seed_storage_raw = None - # if isinstance(seeds_cfg, Mapping): - # candidate = seeds_cfg.get("storage") - # if isinstance(candidate, Mapping): - # seed_storage_raw = candidate - # storage.set_seed_storage( - # storage.normalize_storage_map(seed_storage_raw, project_dir=project_dir) - # ) - - # # discover models - # for p in models_dir.rglob("*.ff.sql"): - # name = p.stem - # deps = self._scan_sql_deps(p) - # meta = dict(self._parse_model_config(p)) - # storage_meta = self._lookup_storage_meta(name) - # if storage_meta: - # existing = dict(meta.get("storage") or {}) - # existing.update(storage_meta) - # meta["storage"] = existing - # if not self._should_register_for_engine(meta, path=p): - # continue - # self._add_node_or_fail(name, "sql", p, deps, meta=meta) - # for p in models_dir.rglob("*.ff.py"): - # self._load_py_module(p) - # for 
_, func in list(self.py_funcs.items()): - # func_path = Path(getattr(func, "__ff_path__", "")).resolve() - # if func_path == p.resolve(): - # name = getattr(func, "__ff_name__", func.__name__) - # deps = getattr(func, "__ff_deps__", []) - # kind = getattr(func, "__ff_kind__", "python") or "python" - - # meta = dict(getattr(func, "__ff_meta__", {}) or {}) - # storage_meta = self._lookup_storage_meta(name) - # if storage_meta: - # existing = dict(meta.get("storage") or {}) - # existing.update(storage_meta) - # meta["storage"] = existing - # tags = list(getattr(func, "__ff_tags__", []) or []) - # if tags: - # existing_tags = meta.get("tags") - # if isinstance(existing_tags, list): - # merged = existing_tags + [t for t in tags if t not in existing_tags] - # meta["tags"] = merged - # elif existing_tags is None: - # meta["tags"] = tags - # else: - # # Normalize non-list tags into a list while preserving the value - # meta["tags"] = [existing_tags, *tags] - - # self._add_node_or_fail(name, kind, p, deps, meta=meta) - - # req = getattr(func, "__ff_require__", None) - # if req: - # self.py_requires[name] = req - - # # ---- Dependency validation (early and clear) - # self._validate_dependencies() - def load_project(self, project_dir: Path) -> None: """Load a FastFlowTransform project from the given directory.""" self._reset_registry_state() @@ -533,6 +323,7 @@ def _reset_registry_state(self) -> None: self.project_vars = {} self.cli_vars = {} self.macros.clear() + self.incremental_models = {} # reset storage maps storage.set_model_storage({}) storage.set_seed_storage({}) @@ -547,6 +338,33 @@ def _init_jinja_env(self, models_dir: Path) -> None: lstrip_blocks=True, ) + # ---- Make project vars & helpers available in Jinja ---- + # Note: these callables close over `self`, so they always read the + # latest self.cli_vars / self.project_vars even after project.yml loads. 
+ def _var(key: str, default: Any | None = None) -> Any: + # CLI --vars override project vars + if isinstance(self.cli_vars, dict) and key in self.cli_vars: + return self.cli_vars[key] + if isinstance(self.project_vars, dict) and key in self.project_vars: + return self.project_vars[key] + return default + + def _engine(default: str | None = None) -> str | None: + # Current active engine (duckdb|postgres|databricks_spark|…) + return self._current_engine() or default + + # Simple env reader for templates/macros: {{ env("NAME", "fallback") }} + def _env(name: str, default: Any | None = None) -> Any: + return os.environ.get(name, default) + + # Expose helpers to Jinja + self.env.globals["var"] = _var + self.env.globals["engine"] = _engine + self.env.globals["env"] = _env + + self.env.filters["var"] = _var + self.env.filters["env"] = _env + def _load_sources_yaml(self, project_dir: Path) -> None: """Load sources.yml (version 2) if present.""" src_path = project_dir / "sources.yml" @@ -554,64 +372,114 @@ def _load_sources_yaml(self, project_dir: Path) -> None: self.sources = {} return - raw_sources = yaml.safe_load(src_path.read_text(encoding="utf-8")) try: - self.sources = _parse_sources_yaml(raw_sources) - except ValueError as exc: + self.sources = load_sources_config(project_dir) + except Exception as exc: + # pydantic.ValidationError, ValueError, etc. 
raise ValueError(f"Failed to parse sources.yml: {exc}") from exc def _load_project_yaml(self, project_dir: Path) -> None: - """Load project.yml (vars, storage blocks) if present.""" + """Load and validate project.yml (vars, storage, incremental overlays).""" proj_path = project_dir / "project.yml" if not proj_path.exists(): return - proj_cfg = yaml.safe_load(proj_path.read_text(encoding="utf-8")) or {} - self.project_vars = dict(proj_cfg.get("vars", {}) or {}) - - # models.storage - models_cfg = proj_cfg.get("models") if isinstance(proj_cfg, Mapping) else None - model_storage_raw = None - if isinstance(models_cfg, Mapping): - candidate = models_cfg.get("storage") - if isinstance(candidate, Mapping): - model_storage_raw = candidate + try: + proj_cfg = parse_project_yaml_config(project_dir) + except Exception as exc: + # Surface a clear error when project.yml is invalid + raise ValueError(f"Failed to parse project.yml: {exc}") from exc + + # Vars → available in Jinja via var("key") + self.project_vars = dict(proj_cfg.vars or {}) + + # Incremental overlays (per model) from project.yml → models.incremental + # Stored as plain dicts so the rest of the registry can treat them as before. + self.incremental_models = { + name: cfg.model_dump(exclude_none=True) + for name, cfg in proj_cfg.models.incremental.items() + } + + # models.storage → storage.set_model_storage(...) + model_storage_raw: dict[str, dict[str, Any]] = { + name: s.model_dump(exclude_none=True) for name, s in proj_cfg.models.storage.items() + } storage.set_model_storage( storage.normalize_storage_map(model_storage_raw, project_dir=project_dir) ) - # seeds.storage - seeds_cfg = proj_cfg.get("seeds") if isinstance(proj_cfg, Mapping) else None - seed_storage_raw = None - if isinstance(seeds_cfg, Mapping): - candidate = seeds_cfg.get("storage") - if isinstance(candidate, Mapping): - seed_storage_raw = candidate + # seeds.storage → storage.set_seed_storage(...) 
+ seed_storage_raw: dict[str, dict[str, Any]] = { + name: s.model_dump(exclude_none=True) for name, s in proj_cfg.seeds.storage.items() + } storage.set_seed_storage( storage.normalize_storage_map(seed_storage_raw, project_dir=project_dir) ) def _discover_sql_models(self, models_dir: Path) -> None: - """Scan *.ff.sql files, parse deps, and register nodes.""" + """Scan *.ff.sql files, parse config, validate meta, and register nodes.""" for path in models_dir.rglob("*.ff.sql"): name = path.stem deps = self._scan_sql_deps(path) - meta = dict(self._parse_model_config(path)) + + # Raw config from leading {{ config(...) }} in the SQL file + raw_meta = dict(self._parse_model_config(path)) + + # Merge project-level storage override (project.yml → models.storage) storage_meta = self._lookup_storage_meta(name) if storage_meta: - existing = dict(meta.get("storage") or {}) + existing = dict(raw_meta.get("storage") or {}) existing.update(storage_meta) - meta["storage"] = existing + raw_meta["storage"] = existing + + # Merge project-level incremental overlay (project.yml → models.incremental) + incr_meta = self._lookup_incremental_meta(name) + if incr_meta: + merged = dict(incr_meta) + merged.update(raw_meta or {}) + raw_meta = merged + + # Pydantic validation: hard fail on unknown keys / wrong types + try: + cfg = validate_model_meta_strict(raw_meta) + except ValidationError as exc: + # Reformat Pydantic errors into a compact, user-friendly message. + lines = [] + for err in exc.errors(): + loc = ".".join(str(p) for p in err.get("loc", ()) if p != "__root__") + msg = err.get("msg", "invalid value") + if loc: + lines.append(f"• {loc}: {msg}") + else: + lines.append(f"• {msg}") + details = "\n".join(lines) if lines else str(exc) + raise ModelConfigError( + f"schema validation failed:\n{details}", + path=str(path), + hint="Fix the fields listed above. 
Unknown keys are rejected (extra='forbid').", + code="CFG_SCHEMA", + ) from exc + + # Backwards-compatible default: incremental → materialized='incremental' + if cfg.is_incremental_enabled() and cfg.materialized is None: + cfg.materialized = "incremental" + + # Node.meta is kept as a plain dict + meta = cfg.model_dump(exclude_none=True) + + # Engine-filtering still works on the dict (config(engines=[...])) if not self._should_register_for_engine(meta, path=path): continue + self._add_node_or_fail(name, "sql", path, deps, meta=meta) def _discover_python_models(self, models_dir: Path) -> None: - """Scan *.ff.py files, import them, and register decorated callables.""" + """Scan *.ff.py files, import them, validate meta, and register decorated callables.""" for path in models_dir.rglob("*.ff.py"): + # Import the module so decorators can register functions self._load_py_module(path) - # we might have loaded several functions; filter by file path + # We may have loaded several functions; filter by file path for _, func in list(self.py_funcs.items()): func_path = Path(getattr(func, "__ff_path__", "")).resolve() if func_path != path.resolve(): @@ -621,27 +489,72 @@ def _discover_python_models(self, models_dir: Path) -> None: deps = getattr(func, "__ff_deps__", []) kind = getattr(func, "__ff_kind__", "python") or "python" - meta = dict(getattr(func, "__ff_meta__", {}) or {}) + # Validate function signature vs declared deps (fail fast) + _validate_py_model_signature(func, deps or [], path=path, name=name) + + # Raw meta attached by @model(..., meta={...}) + raw_meta = dict(getattr(func, "__ff_meta__", {}) or {}) + + # Merge storage override from project.yml (models.storage) storage_meta = self._lookup_storage_meta(name) if storage_meta: - existing = dict(meta.get("storage") or {}) + existing = dict(raw_meta.get("storage") or {}) existing.update(storage_meta) - meta["storage"] = existing + raw_meta["storage"] = existing + + # Merge incremental overlay from project.yml 
(models.incremental) + incr_meta = self._lookup_incremental_meta(name) + if incr_meta: + merged = dict(incr_meta) + merged.update(raw_meta or {}) + raw_meta = merged - # merge tags from decorator into model meta.tags + # Merge tags from decorator into meta.tags tags = list(getattr(func, "__ff_tags__", []) or []) if tags: - existing_tags = meta.get("tags") + existing_tags = raw_meta.get("tags") if isinstance(existing_tags, list): - merged = existing_tags + [t for t in tags if t not in existing_tags] - meta["tags"] = merged + base = existing_tags elif existing_tags is None: - meta["tags"] = tags + base = [] else: - meta["tags"] = [existing_tags, *tags] - + base = [existing_tags] + merged_tags = base + [t for t in tags if t not in base] + raw_meta["tags"] = merged_tags + + # Store kind in meta for selectors / docs (optional but handy) + raw_meta.setdefault("kind", kind) + + # Validate via Pydantic + try: + cfg = validate_model_meta_strict(raw_meta) + except ValidationError as exc: + lines = [] + for err in exc.errors(): + loc = ".".join(str(p) for p in err.get("loc", ()) if p != "__root__") + msg = err.get("msg", "invalid value") + if loc: + lines.append(f"• {loc}: {msg}") + else: + lines.append(f"• {msg}") + details = "\n".join(lines) if lines else str(exc) + raise ModelConfigError( + f"schema validation failed:\n{details}", + path=str(path), + hint="Check your @model(meta=...) dictionary.", + code="CFG_SCHEMA", + ) from exc + + # Default incremental materialization if enabled and not set explicitly + if cfg.is_incremental_enabled() and cfg.materialized is None: + cfg.materialized = "incremental" + + meta = cfg.model_dump(exclude_none=True) + + # Register node self._add_node_or_fail(name, kind, path, deps, meta=meta) + # Required-columns spec (for executors) stays as before req = getattr(func, "__ff_require__", None) if req: self.py_requires[name] = req @@ -766,8 +679,11 @@ def _scan_sql_deps(self, path: Path) -> list[str]: # -------- {{ config(...) 
}} Head-Parser -------- def _parse_model_config(self, path: Path) -> dict[str, Any]: """ - Reads the leading line {{ config(materialized='view', key=1) }}. - Safely parses via ast.literal_eval for keyword arguments. Errors → {}. + Read the leading `{{ config(...) }}` header and parse keyword arguments. + Behavior: + - If no `config(...)` block is found → return {}. + - If a `config(...)` block is found but parsing fails → RAISE ModuleLoadError. + This ensures misconfigured headers fail loudly instead of being silently ignored. """ try: head = path.read_text(encoding="utf-8", errors="ignore")[:2000] @@ -781,21 +697,43 @@ def _parse_model_config(self, path: Path) -> dict[str, Any]: args = m.group("args").strip() if not args: return {} + src = f"__CFG__({args})" try: - # parse "a=1, b='x'" as a Call and extract keywords - node = ast.parse(f"__CFG__({args})", mode="eval") + node = ast.parse(src, mode="eval") if not isinstance(node.body, ast.Call): + # Not a function-call AST; treat as empty to avoid false positives return {} - cfg: dict[str, Any] = {} - for kw in node.body.keywords: - if kw.arg is None: - # **kwargs werden (noch) ignoriert - continue - cfg[kw.arg] = ast.literal_eval(kw.value) - return cfg - except Exception: - # Robust: keine Hard-Fails beim Laden - return {} + except Exception as e: + raise ModelConfigError( + f"invalid syntax: {e}", + path=str(path), + field=None, + hint="Ensure {{ config(...) 
}} contains comma-separated key=value literals.", + ) from e + + cfg: dict[str, Any] = {} + for kw in node.body.keywords: + # Disallow **kwargs explicitly with a crisp message + if kw.arg is None: + val_src = ast.get_source_segment(src, kw.value) or "" + raise ModelConfigError( + f"unsupported **kwargs (got {val_src})", + path=str(path), + field="**kwargs", + hint="Use explicit key=value pairs; expressions are not allowed.", + ) + field = kw.arg + try: + cfg[field] = ast.literal_eval(kw.value) + except Exception as err: + val_src = ast.get_source_segment(src, kw.value) or "" + raise ModelConfigError( + f"invalid literal (quote strings, no expressions): {val_src}", + path=str(path), + field=field, + hint="All values must be JSON/Python literals (e.g. 'view', ['tag']).", + ) from err + return cfg def _validate_dependencies(self) -> None: """ diff --git a/src/fastflowtransform/errors.py b/src/fastflowtransform/errors.py index 875268d..bbc848e 100644 --- a/src/fastflowtransform/errors.py +++ b/src/fastflowtransform/errors.py @@ -78,6 +78,34 @@ class ModuleLoadError(FastFlowTransformError): pass +class ModelConfigError(FastFlowTransformError): + """ + Raised when a model's {{ config(...) }} (or @model(meta=...)) is malformed + or fails schema validation. + + Typical causes: + - Syntax errors in the config(...) 
header + - Non-literal expressions in values (must be JSON/Python literals) + - Unknown/forbidden keys + - Wrong types for documented fields + """ + + def __init__( + self, + message: str, + *, + path: str | None = None, + field: str | None = None, + hint: str | None = None, + code: str = "CFG_PARSE", + ): + prefix = f"{path}: " if path else "" + scope = f"config.{field}: " if field else "config: " + super().__init__(f"{prefix}{scope}{message}".rstrip(), code=code, hint=hint) + self.path = path + self.field = field + + class ProfileConfigError(FastFlowTransformError): """Profile/configuration error with a short, actionable hint.""" diff --git a/src/fastflowtransform/executors/base.py b/src/fastflowtransform/executors/base.py index 4c891df..c85bc36 100644 --- a/src/fastflowtransform/executors/base.py +++ b/src/fastflowtransform/executors/base.py @@ -2,22 +2,84 @@ from __future__ import annotations import contextvars +import importlib import re from abc import ABC, abstractmethod -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, Mapping from contextlib import suppress from pathlib import Path -from typing import Any, TypeVar +from typing import Any, TypeVar, cast +import pandas as pd from jinja2 import Environment from pandas import DataFrame as _PDDataFrame +from fastflowtransform import incremental as _ff_incremental from fastflowtransform.api import context as _http_ctx from fastflowtransform.core import REGISTRY, Node, relation_for, resolve_source_entry from fastflowtransform.errors import ModelExecutionError +from fastflowtransform.incremental import _normalize_unique_key from fastflowtransform.logging import echo_debug from fastflowtransform.validation import validate_required_columns + +def _python_incremental_merge_default( + df_old: _PDDataFrame, + df_new: _PDDataFrame, + unique_key: list[str], + update_cols: list[str], +) -> _PDDataFrame: + """ + Default merge for Python-Incremental: + - unique_key: key columns 
+ - update_cols: columns from which to determine delta + Strategy: + - df_old + df_new concat, + - sorted by unique_key + update_cols + - Deduplicate unique_key (keep='last'). + """ + if df_old is None or df_old.empty: + return df_new.copy() + if df_new is None or df_new.empty: + return df_old.copy() + + if not unique_key: + combined = pd.concat([df_old, df_new], ignore_index=True) + combined = combined.drop_duplicates() + return combined + + combined = pd.concat([df_old, df_new], ignore_index=True) + + # Nur Update-Spalten verwenden, die es wirklich gibt + update_cols = [c for c in update_cols if c in combined.columns] + + sort_cols = unique_key + update_cols if update_cols else unique_key + combined = combined.sort_values(sort_cols) + combined = combined.drop_duplicates(subset=unique_key, keep="last") + return combined + + +def _load_callable(path: str) -> Callable[..., Any]: + """ + Import a callable from 'pkg.mod:func' or 'pkg.mod.func'. + """ + text = path.strip() + if ":" in text: + mod_name, func_name = text.split(":", 1) + elif "." in text: + mod_name, func_name = text.rsplit(".", 1) + else: + raise ValueError( + f"Invalid callable path {path!r}; expected 'module:func' or 'module.func'." + ) + + mod = importlib.import_module(mod_name) + fn = getattr(mod, func_name, None) + if not callable(fn): + raise ValueError(f"{path!r} is not a callable") + return fn + + # Frame type (pandas.DataFrame, pyspark.sql.DataFrame, snowflake.snowpark.DataFrame, ...) TFrame = TypeVar("TFrame") @@ -98,13 +160,13 @@ def _var(key: str, default: Any = None) -> Any: env.globals["var"] = _var # ---- is_incremental() builtin - # True iff materialization is 'incremental' AND the target relation already exists. + # True iff meta marks the model as incremental AND the target relation exists. 
if "is_incremental" not in env.globals: def _is_incremental() -> bool: try: - mat = (getattr(node, "meta", {}) or {}).get("materialized", "table") - if mat != "incremental": + meta = getattr(node, "meta", {}) or {} + if not self._meta_is_incremental(meta): return False rel = relation_for(node.name) return bool(self.exists_relation(rel)) @@ -142,7 +204,7 @@ def _default_source(source_name: str, table_name: str) -> str: # expose 'this' to the template: Proxy-Objekt, das wie String wirkt this_obj = _ThisProxy( - relation_for(node.name), + self._this_identifier(node), (getattr(node, "meta", {}) or {}).get("materialized", "table"), getattr(self, "schema", None) or getattr(self, "dataset", None), getattr(self, "database", None) or getattr(self, "project", None), @@ -169,6 +231,11 @@ def run_sql(self, node: Node, env: Environment) -> None: The body is CTE-aware (keeps WITH … SELECT … intact). On failure, raise ModelExecutionError with a helpful snippet. """ + meta = getattr(node, "meta", {}) or {} + if self._meta_is_incremental(meta): + # Delegates to incremental engine: render, schema sync, merge/insert, etc. 
+ return _ff_incremental.run_or_dispatch(self, node, env) + sql_rendered = self.render_sql( node, env, @@ -330,7 +397,7 @@ def _render_ephemeral_sql(self, name: str, env: Environment) -> str: ref=lambda n: self._resolve_ref(n, env), source=self._resolve_source, this=_ThisProxy( - relation_for(node.name), + self._this_identifier(node), (getattr(node, "meta", {}) or {}).get("materialized", "table"), getattr(self, "schema", None) or getattr(self, "dataset", None), getattr(self, "database", None) or getattr(self, "project", None), @@ -343,56 +410,190 @@ def _render_ephemeral_sql(self, name: str, env: Environment) -> str: # ---------- Python models ---------- def run_python(self, node: Node) -> None: + """Execute the Python model for a given node and materialize its result.""" func = REGISTRY.py_funcs[node.name] deps = REGISTRY.nodes[node.name].deps or [] - if _http_ctx is not None: - with suppress(Exception): - _http_ctx.reset_for_node(node.name) - - # Load inputs - arg: Any - if len(deps) == 0: - arg = None - elif len(deps) == 1: - rel = relation_for(deps[0]) - df_in: TFrame = self._read_relation(rel, node, deps) - arg = df_in # TFrame - else: - frames: dict[str, TFrame] = {} - for dep in deps: - rel = relation_for(dep) - f = self._read_relation(rel, node, deps) - frames[rel] = f - arg = frames # dict[str, TFrame] - - # Validate required columns / structure (frame specific) + + self._reset_http_ctx(node) + + # arg = self._build_python_args(node, deps) + args, argmap = self._build_python_inputs(node, deps) requires = REGISTRY.py_requires.get(node.name, {}) + # if deps: + # self._validate_required(node.name, arg, requires) if deps: - self._validate_required(node.name, arg, requires) + # Required-columns check works against the mapping + self._validate_required(node.name, argmap, requires) + + # out = self._execute_python_func(func, arg, node) + out = self._execute_python_func(func, args, node) + + target = relation_for(node.name) + meta = getattr(node, "meta", {}) or {} 
+ mat = self._resolve_materialization_strategy(meta) - # Execute the model - out = func(arg) - if not self._is_frame(out): + if mat == "incremental": + self._materialize_incremental(target, out, node, meta) + elif mat == "view": + self._materialize_view(target, out, node) + else: + self._materialize_relation(target, out, node) + + self._snapshot_http_ctx(node) + + # ----------------- helpers ----------------- + + def _reset_http_ctx(self, node: Node) -> None: + """Reset HTTP context for the given node if available.""" + if _http_ctx is None: + return + with suppress(Exception): + _http_ctx.reset_for_node(node.name) + + def _build_python_inputs( + self, node: Node, deps: list[str] + ) -> tuple[list[TFrame], dict[str, TFrame]]: + """ + Load input frames for the Python model. + Returns: + - args: positional argument list in the order of `deps` + - argmap: mapping {relation_name -> frame} for validation + """ + args: list[TFrame] = [] + argmap: dict[str, TFrame] = {} + for dep in deps or []: + rel = relation_for(dep) + df = self._read_relation(rel, node, deps) + args.append(df) + argmap[rel] = df + return args, argmap + + def _execute_python_func( + self, + func: Callable[[Any], Any], + args: Any, + node: Node, + ) -> TFrame: + """Execute the Python function and ensure it returns a valid frame.""" + # raw = func(arg) + raw = func(*args) + if not self._is_frame(raw): raise TypeError( f"Python-Modell '{node.name}' muss {self._frame_name()} DataFrame zurückgeben." 
) + return cast(TFrame, raw) - # Materialize the result (table default; view supported) - target = relation_for(node.name) - mat = (getattr(node, "meta", {}) or {}).get("materialized", "table") - if mat == "view": - backing = self._py_view_backing_name(target) - self._materialize_relation(backing, out, node) - self._create_or_replace_view_from_table(target, backing, node) - else: + def _resolve_materialization_strategy(self, meta: dict[str, Any]) -> str: + """ + Determine how the Python model result should be materialized. + + Returns "table" by default, but respects: + - meta["materialized"] + - meta["incremental"] (bool or dict) as a shortcut for incremental + materialization. + """ + if self._meta_is_incremental(meta): + return "incremental" + mat = meta.get("materialized") or "table" + return str(mat) + + def _materialize_view(self, target: str, out: TFrame, node: Node) -> None: + """Materialize a Python model as a backing table and expose it as a view.""" + backing = self._py_view_backing_name(target) + self._materialize_relation(backing, out, node) + self._create_or_replace_view_from_table(target, backing, node) + + def _materialize_incremental( + self, + target: str, + out: TFrame, + node: Node, + meta: dict[str, Any], + ) -> None: + """Materialize a Python model using incremental semantics.""" + if not self._relation_exists_safely(target): + # First run -> write full table self._materialize_relation(target, out, node) + return - if _http_ctx is not None: - try: - snap = _http_ctx.snapshot() - (node.meta or {}).update({"_http_snapshot": snap}) - except Exception: - pass + if not isinstance(out, _PDDataFrame): + # Non-pandas frames: fall back to full refresh + self._materialize_relation(target, out, node) + return + + df_old = self._safe_read_existing_incremental(target, node) + if df_old is None or not isinstance(df_old, _PDDataFrame): + # Fallback: full-refresh + self._materialize_relation(target, out, node) + return + + merged = 
self._merge_incremental_frames(df_old, out, meta, node) + self._materialize_relation(target, merged, node) + + def _relation_exists_safely(self, target: str) -> bool: + """Check whether the target relation exists, swallowing backend errors.""" + try: + return bool(self.exists_relation(target)) + except Exception: + return False + + def _safe_read_existing_incremental(self, target: str, node: Node) -> Any: + """Try to read an existing incremental relation, swallowing backend errors.""" + try: + return self._read_relation(target, node, deps=[]) + except Exception: + return None + + def _merge_incremental_frames( + self, + df_old: _PDDataFrame, + df_new: _PDDataFrame, + meta: dict[str, Any], + node: Node, + ) -> TFrame: + """ + Merge existing and new frames using a custom delta function if configured, + otherwise fall back to the default incremental merge. + """ + delta_fn_ref = meta.get("delta_python") + + if isinstance(delta_fn_ref, str) and delta_fn_ref.strip(): + delta_fn = _load_callable(delta_fn_ref) + merged = delta_fn( + existing=df_old, + new=df_new, + node=node, + executor=self, + meta=meta, + ) + if not self._is_frame(merged): + raise TypeError( + f"delta_python '{delta_fn_ref}' must return a DataFrame {self._frame_name()}." 
+ ) + return cast(TFrame, merged) + + unique_key = _normalize_unique_key(meta.get("unique_key") or meta.get("primary_key")) + update_cols = _normalize_unique_key( + meta.get("delta_columns") + or meta.get("updated_at_columns") + or meta.get("updated_at") + or meta.get("timestamp_columns") + ) + merged_default = _python_incremental_merge_default(df_old, df_new, unique_key, update_cols) + return cast(TFrame, merged_default) + + def _snapshot_http_ctx(self, node: Node) -> None: + """Store an HTTP snapshot into node.meta if HTTP context is available.""" + if _http_ctx is None: + return + + try: + snap = _http_ctx.snapshot() + except Exception: + return + + with suppress(Exception): + (node.meta or {}).update({"_http_snapshot": snap}) # -------- Python model view helpers (shared) -------- def _py_view_backing_name(self, relation: str) -> str: @@ -456,6 +657,28 @@ def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node ... # ---------- Resolution helpers ---------- + def _this_identifier(self, node: Node) -> str: + """ + Physical identifier backing {{ this }} in SQL templates. + + Engines may override to inject catalog/schema qualification. + """ + return relation_for(node.name) + + def _format_test_table(self, table: str | None) -> str | None: + """ + Format table identifiers for data-quality tests (fft test). + + Default behavior normalizes '.ff' suffixes only; engines can override + to add catalog/schema qualification. 
+ """ + if not isinstance(table, str): + return table + stripped = table.strip() + if not stripped: + return stripped + return relation_for(stripped) if stripped.endswith(".ff") else stripped + def _resolve_ref(self, name: str, env: Environment) -> str: dep = REGISTRY.get_node(name) if hasattr(REGISTRY, "get_node") else REGISTRY.nodes[name] if dep.meta.get("materialized") == "ephemeral": @@ -560,6 +783,39 @@ def alter_table_sync_schema( """ return None + @staticmethod + def _meta_is_incremental(meta: Mapping[str, Any] | None) -> bool: + """ + Return True if the given meta mapping describes an incremental model. + + This mirrors the semantics of ModelConfig.is_incremental_enabled(), but + works on a plain mapping to avoid tight coupling to the Pydantic model. + """ + if not meta: + return False + + incremental_cfg = meta.get("incremental") + materialized = str(meta.get("materialized") or "").lower() + + # Explicit materialized='incremental' always wins. + if materialized == "incremental": + return True + + # incremental: true / false + if isinstance(incremental_cfg, bool): + return incremental_cfg + + # incremental: {enabled: bool, ...} + if isinstance(incremental_cfg, dict): + enabled = incremental_cfg.get("enabled") + if isinstance(enabled, bool): + return enabled + # Default: treat presence of a dict as "enabled" if no explicit flag is set. + return True + + # Fallback: any non-empty incremental value is treated as "enabled". 
+ return bool(incremental_cfg) + ENGINE_NAME: str = "generic" @property diff --git a/src/fastflowtransform/executors/databricks_spark_exec.py b/src/fastflowtransform/executors/databricks_spark_exec.py index c476ba8..a8afca6 100644 --- a/src/fastflowtransform/executors/databricks_spark_exec.py +++ b/src/fastflowtransform/executors/databricks_spark_exec.py @@ -1,22 +1,167 @@ # src/fastflowtransform/executors/databricks_spark_exec.py from __future__ import annotations -import shutil from collections.abc import Iterable from contextlib import suppress from pathlib import Path from typing import Any from urllib.parse import unquote, urlparse -from pyspark.errors.exceptions.base import AnalysisException from pyspark.sql import DataFrame as SDF, SparkSession from pyspark.sql.types import DataType +try: + # Enable Delta Lake via delta-spark when available + from delta import configure_spark_with_delta_pip +except Exception: # pragma: no cover + configure_spark_with_delta_pip = None # type: ignore[assignment] + from fastflowtransform import storage from fastflowtransform.core import REGISTRY, Node, relation_for from fastflowtransform.errors import ModelExecutionError from fastflowtransform.executors.base import BaseExecutor +from fastflowtransform.logging import echo_debug from fastflowtransform.meta import ensure_meta_table, upsert_meta +from fastflowtransform.table_formats import get_spark_format_handler +from fastflowtransform.table_formats.base import SparkFormatHandler + +_DELTA_EXTENSION = "io.delta.sql.DeltaSparkSessionExtension" +_DELTA_CATALOG = "org.apache.spark.sql.delta.catalog.DeltaCatalog" +_SPARK_DEFAULT_CATALOG = "org.apache.spark.sql.internal.CatalogImpl" # Spark's built-in + + +def _has_delta(spark: SparkSession) -> bool: + """ + Best-effort Delta availability check that works with: + * local Spark + delta-spark + * Databricks runtime + * Databricks Connect + + We first inspect Spark configuration, then fall back to checking that + delta-spark is 
importable, and finally use the old JVM heuristic for + plain local Spark. + """ + # 1) Look at Spark SQL extensions (delta-spark & Databricks both wire this) + try: + exts = spark.conf.get("spark.sql.extensions", "") + if _DELTA_EXTENSION in str(exts): + return True + except Exception: + pass + + # 2) If delta-spark is importable, we assume Delta is available + # (this covers Databricks Connect as well in practice) + try: + from delta.tables import DeltaTable # noqa PLC0415 + + _ = DeltaTable # silence linters; import succeeded + return True + except Exception: + pass + + # 3) Fallback: old JVM classpath heuristic for bare Spark installs + def _handles() -> list[Any]: + refs: list[Any] = [] + with suppress(Exception): + refs.append(getattr(spark, "_jvm", None)) + with suppress(Exception): + sc = getattr(spark, "sparkContext", None) + if sc: + gw = getattr(sc, "_gateway", None) + if gw: + refs.append(getattr(gw, "jvm", None)) + return [ref for ref in refs if ref is not None] + + def _try_for_name(jvm: Any) -> bool: + candidates: list[Any] = [] + + java_pkg = getattr(jvm, "java", None) + if java_pkg is not None: + with suppress(Exception): + candidates.append(java_pkg.lang.Class) + + lang_pkg = getattr(jvm, "lang", None) + if lang_pkg is not None: + with suppress(Exception): + candidates.append(lang_pkg.Class) + + cls = getattr(jvm, "Class", None) + if cls is not None: + candidates.append(cls) + + for target in candidates: + try: + target.forName(_DELTA_CATALOG) + return True + except Exception: + continue + return False + + return any(_try_for_name(handle) for handle in _handles()) + + +def _csv_tokens(value: str | None) -> list[str]: + if not value: + return [] + return [part.strip() for part in value.split(",") if part and part.strip()] + + +def _ensure_csv_token(value: str | None, token: str) -> tuple[str | None, bool]: + tokens = _csv_tokens(value) + if token in tokens: + return value, False + tokens.append(token) + return ",".join(tokens), True + + +def 
_as_nonempty_str(value: Any | None) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None + + +def _safe_conf(spark: SparkSession, key: str, default: str = "") -> str: + try: + return str(spark.conf.get(key, default)) + except Exception as exc: + return f"" + + +def _log_delta_capabilities( + spark: SparkSession, + *, + wants_delta: bool, + delta_ok: bool, + user_spark: SparkSession | None, + table_format: str | None, +) -> None: + """ + Debug helper: log what we know about Spark/Delta capabilities. + Useful for environments like Databricks Connect where JVM probing is tricky. + """ + lines: list[str] = [] + lines.append("=== DatabricksSparkExecutor capabilities ===") + lines.append(f"Spark version: {getattr(spark, 'version', '')}") + lines.append(f"user_spark_provided: {user_spark is not None}") + lines.append(f"table_format: {table_format!r}") + lines.append(f"wants_delta: {wants_delta}") + lines.append(f"delta_ok: {delta_ok}") + lines.append(f"spark.sql.extensions: {_safe_conf(spark, 'spark.sql.extensions')}") + lines.append( + f"spark.sql.catalog.spark_catalog: {_safe_conf(spark, 'spark.sql.catalog.spark_catalog')}" + ) + + # Check whether delta-spark is importable + try: + from delta.tables import DeltaTable # noqa PLC0415 + + _ = DeltaTable + lines.append("delta.tables.DeltaTable import: OK") + except Exception as exc: + lines.append(f"delta.tables.DeltaTable import: FAILED ({exc})") + + echo_debug("\n".join(lines)) class DatabricksSparkExecutor(BaseExecutor[SDF]): @@ -35,8 +180,13 @@ def __init__( database: str | None = None, table_format: str | None = "parquet", table_options: dict[str, Any] | None = None, + spark: SparkSession | None = None, ): + extra_conf = dict(extra_conf or {}) + self._user_spark = spark builder = SparkSession.builder.master(master).appName(app_name) + catalog_key = "spark.sql.catalog.spark_catalog" + ext_key = "spark.sql.extensions" warehouse_path: Path | None = None if warehouse_dir: @@ 
-46,8 +196,9 @@ def __init__( warehouse_path.mkdir(parents=True, exist_ok=True) builder = builder.config("spark.sql.warehouse.dir", str(warehouse_path)) - if catalog: - builder = builder.config("spark.sql.catalog.spark_catalog", catalog) + catalog_value = _as_nonempty_str(catalog) + if catalog_value: + builder = builder.config(catalog_key, catalog_value) if extra_conf: for key, value in extra_conf.items(): @@ -58,7 +209,38 @@ def __init__( builder = builder.config("spark.sql.catalogImplementation", "hive") builder = builder.enableHiveSupport() - self.spark = builder.getOrCreate() + fmt_requested = (table_format or "").strip().lower() + wants_delta = fmt_requested == "delta" + + if not wants_delta and self._user_spark is None: + catalog_overridden = bool(catalog_value) + if not catalog_overridden: + # Leave Spark catalog untouched; downstream environments may supply + # their own defaults (e.g., Unity, Glue). We only force a catalog + # when the user explicitly opts into Delta. + pass + + # Apply Delta configuration last, after all Spark configs are set. + if wants_delta and self._user_spark is None: + if configure_spark_with_delta_pip is None: + raise RuntimeError( + "Delta table_format requested for DatabricksSparkExecutor, " + "but 'delta-spark' is not installed. 
" + "Install it with: pip install delta-spark" + ) + builder = configure_spark_with_delta_pip(builder) + + ext_value = _as_nonempty_str(extra_conf.get(ext_key)) + merged_ext, changed = _ensure_csv_token(ext_value, _DELTA_EXTENSION) + if changed or ext_value is None: + builder = builder.config(ext_key, merged_ext) + + extra_catalog = _as_nonempty_str(extra_conf.get(catalog_key)) + catalog_overridden = bool(catalog_value) or bool(extra_catalog) + if not catalog_overridden: + builder = builder.config(catalog_key, _DELTA_CATALOG) + + self.spark = self._user_spark or builder.getOrCreate() # Lightweight testing shim so tests can call executor.con.execute("SQL") self.con = _SparkConnShim(self.spark) self._registered_path_sources: dict[str, dict[str, Any]] = {} @@ -66,35 +248,58 @@ def __init__( self.catalog = catalog self.database = database self.schema = database + if database: self.spark.sql(f"CREATE DATABASE IF NOT EXISTS `{database}`") with suppress(Exception): self.spark.catalog.setCurrentDatabase(database) - fmt = (table_format or "").strip().lower() - self.spark_table_format: str | None = fmt or None - if table_options: - self.spark_table_options = {str(k): str(v) for k, v in table_options.items()} - else: - self.spark_table_options = {} + self.spark_table_format: str | None = fmt_requested or None + self.spark_table_options = {str(k): str(v) for k, v in (table_options or {}).items()} + # ---- Delta availability check ---- + self._delta_ok = _has_delta(self.spark) + + # Log capabilities whenever Delta is requested or detected + if wants_delta or self._delta_ok: + _log_delta_capabilities( + self.spark, + wants_delta=wants_delta, + delta_ok=self._delta_ok, + user_spark=self._user_spark, + table_format=self.spark_table_format, + ) + + if wants_delta and not self._delta_ok and self._user_spark is None: + raise RuntimeError( + "Delta table_format requested, but the Delta Lake classes are not available. 
" + "Install delta-spark or provide a SparkSession already configured for Delta." + ) + + # Unified format handler for managed tables (Delta, Iceberg, generic Parquet/ORC/etc.) + self._format_handler: SparkFormatHandler = get_spark_format_handler( + self.spark_table_format, + self.spark, + table_options=self.spark_table_options, + ) # ---------- Frame hooks (required) ---------- def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SDF: # relation may optionally be "db.table" (via source()/ref()) - return self.spark.table(relation) + physical = self._format_handler.qualify_identifier(relation, database=self.database) + return self.spark.table(physical) def _materialize_relation(self, relation: str, df: SDF, node: Node) -> None: if not self._is_frame(df): raise TypeError("Spark model must return a Spark DataFrame") storage_meta = self._storage_meta(node, relation) - if storage_meta.get("path"): - self._write_to_storage_path(relation, df, storage_meta) - return - # write as a table in Hive/Unity/Delta environments + # Delegate managed/unmanaged handling to _save_df_as_table so Iceberg + # (or other handlers) can consistently enforce managed tables. 
self._save_df_as_table(relation, df, storage=storage_meta) def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None: - self.spark.sql(f"CREATE OR REPLACE VIEW `{view_name}` AS SELECT * FROM `{backing_table}`") + view_sql = self._sql_identifier(view_name) + backing_sql = self._sql_identifier(backing_table) + self.spark.sql(f"CREATE OR REPLACE VIEW {view_sql} AS SELECT * FROM {backing_sql}") def _validate_required( self, node_name: str, inputs: Any, requires: dict[str, set[str]] @@ -171,11 +376,10 @@ def _storage_meta(self, node: Node | None, relation: str) -> dict[str, Any]: continue return storage.get_model_storage(rel_clean) - def _write_to_storage_path( - self, relation: str, df: SDF, storage_meta: dict[str, Any] - ) -> None: # pragma: no cover + def _write_to_storage_path(self, relation: str, df: SDF, storage_meta: dict[str, Any]) -> None: parts = self._identifier_parts(relation) identifier = ".".join(parts) + storage.spark_write_to_path( self.spark, identifier, @@ -185,9 +389,27 @@ def _write_to_storage_path( default_options=self.spark_table_options, ) + path = storage_meta.get("path") + if path: + with suppress(Exception): + self.spark.catalog.refreshByPath(path) + # ---- SQL hooks ---- def _format_relation_for_ref(self, name: str) -> str: - return self._q_ident(relation_for(name)) + """ + Format a ref(...) relation for use in SQL. + + - Default: just backtick-quote the logical relation name. + - Iceberg: qualify with the Iceberg catalog so that models point at + tables in `iceberg..
`, matching the seed & incremental + write path. + """ + base = relation_for(name) + return self._sql_identifier(base) + + def _this_identifier(self, node: Node) -> str: + base = relation_for(node.name) + return self._sql_identifier(base) def _format_source_reference( self, cfg: dict[str, Any], source_name: str, table_name: str @@ -197,8 +419,8 @@ def _format_source_reference( if location: alias = identifier or f"__ff_src_{source_name}_{table_name}" - fmt = cfg.get("format") - if not fmt: + fmt_src = cfg.get("format") + if not fmt_src: raise KeyError( f"Source {source_name}.{table_name} requires 'format' when using a location" ) @@ -206,12 +428,12 @@ def _format_source_reference( options = dict(cfg.get("options") or {}) descriptor = { "location": location, - "format": fmt, + "format": fmt_src, "options": options, } existing = self._registered_path_sources.get(alias) if existing != descriptor: - reader = self.spark.read.format(fmt) + reader = self.spark.read.format(fmt_src) if options: reader = reader.options(**options) df = reader.load(location) @@ -221,13 +443,20 @@ def _format_source_reference( if not identifier: raise KeyError(f"Source {source_name}.{table_name} missing identifier") + catalog = cfg.get("catalog") + schema = cfg.get("schema") or cfg.get("database") + if catalog or schema: + logical = ".".join([p for p in (catalog, schema, identifier) if p]) + return self._sql_identifier(logical) - catalog = cfg.get("catalog") or cfg.get("database") - schema = cfg.get("schema") - parts = [p for p in (catalog, schema, identifier) if p] - if not parts: - parts = [identifier] - return ".".join(self._q_ident(str(part)) for part in parts) + fallback_db = self.database or self.spark.catalog.currentDatabase() + return self._sql_identifier(str(identifier), database=fallback_db) + + def _format_test_table(self, table: str | None) -> str | None: + formatted = super()._format_test_table(table) + if not isinstance(formatted, str): + return formatted + return 
self._format_handler.format_test_table(formatted, database=self.database) # ---- Spark table helpers ---- @staticmethod @@ -238,6 +467,14 @@ def _identifier_parts(self, identifier: str) -> list[str]: cleaned = self._strip_quotes(identifier) return [part for part in cleaned.split(".") if part] + def _physical_identifier(self, identifier: str, *, database: str | None = None) -> str: + db = database if database is not None else self.database + return self._format_handler.qualify_identifier(identifier, database=db) + + def _sql_identifier(self, identifier: str, *, database: str | None = None) -> str: + db = database if database is not None else self.database + return self._format_handler.format_identifier_for_sql(identifier, database=db) + def _warehouse_base(self) -> Path | None: try: conf_val = self.spark.conf.get("spark.sql.warehouse.dir", "spark-warehouse") @@ -291,43 +528,35 @@ def _table_location(self, parts: list[str]) -> Path | None: def _save_df_as_table( self, identifier: str, df: SDF, *, storage: dict[str, Any] | None = None ) -> None: + """ + Save a DataFrame as a (managed or unmanaged) table. + + - If storage["path"] is set -> unmanaged/path-based via storage.spark_write_to_path. + - Otherwise -> managed table via the configured format handler + (Delta, Parquet, future Iceberg, ...). + """ parts = self._identifier_parts(identifier) if not parts: raise ValueError(f"Invalid Spark table identifier: {identifier}") - storage_meta = storage or self._storage_meta(None, identifier) - if storage_meta.get("path"): + storage_meta = dict(storage or self._storage_meta(None, identifier) or {}) + + path_override = storage_meta.get("path") + if path_override and not self._format_handler.allows_unmanaged_paths(): + echo_debug( + f"Ignoring storage.path override for table '{identifier}' because " + f"format '{self._format_handler.table_format or 'default'}' " + "requires managed tables." 
+ ) + path_override = None + + if path_override: self._write_to_storage_path(identifier, df, storage_meta) return table_name = ".".join(parts) - target_location = self._table_location(parts) - - def _write() -> None: - writer = df.write.mode("overwrite") - if self.spark_table_format: - writer = writer.format(self.spark_table_format) - if self.spark_table_options: - writer = writer.options(**self.spark_table_options) - writer.saveAsTable(table_name) - - target_sql = ".".join(self._q_ident(p) for p in parts) - with suppress(Exception): - self.spark.sql(f"DROP TABLE IF EXISTS {target_sql}") - if target_location and target_location.exists(): - with suppress(Exception): - shutil.rmtree(target_location, ignore_errors=True) - - try: - _write() - except AnalysisException as exc: # pragma: no cover - requires real Spark/Delta error - message = str(exc) - if target_location and "LOCATION_ALREADY_EXISTS" in message.upper(): - with suppress(Exception): - shutil.rmtree(target_location, ignore_errors=True) - _write() - else: - raise + # Managed tables: delegate to the format handler (Delta, Parquet, Iceberg, ...) 
+ self._format_handler.save_df_as_table(table_name, df) def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None: self.spark.sql(f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}") @@ -344,7 +573,9 @@ def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node def _create_or_replace_view_from_table( self, view_name: str, backing_table: str, node: Node ) -> None: - self.spark.sql(f"CREATE OR REPLACE VIEW `{view_name}` AS SELECT * FROM `{backing_table}`") + view_sql = self._sql_identifier(view_name) + backing_sql = self._sql_identifier(backing_table) + self.spark.sql(f"CREATE OR REPLACE VIEW {view_sql} AS SELECT * FROM {backing_sql}") # ---- Meta hook ---- def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: @@ -358,43 +589,93 @@ def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: # ── Incremental API (parity) ───────────────────────────────────────── def exists_relation(self, relation: str) -> bool: """Check whether a table/view exists (optionally qualified with database).""" - db, tbl = _split_db_table(relation) - if db: - return bool(self.spark.catalog._jcatalog.tableExists(db, tbl)) - return self.spark.catalog.tableExists(tbl) + return self._format_handler.relation_exists(relation, database=self.database) def create_table_as(self, relation: str, select_sql: str) -> None: """CREATE TABLE AS with cleaned SELECT body.""" - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + df = self.spark.sql(body) + self._save_df_as_table(relation, df) + + def full_refresh_table(self, relation: str, select_sql: str) -> None: + """ + Engine-specific full refresh for incremental fallbacks. + Important: NO 'REPLACE TABLE' SQL, but DataFrame path + saveAsTable instead. 
+ """ + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + # Delegate to format handler via _save_df_as_table for managed, or storage for unmanaged df = self.spark.sql(body) self._save_df_as_table(relation, df) def incremental_insert(self, relation: str, select_sql: str) -> None: - """INSERT INTO with cleaned SELECT body.""" - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - self.spark.sql(f"INSERT INTO {relation} {body}") + """INSERT INTO with cleaned SELECT body (format-aware via handler).""" + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + self._format_handler.incremental_insert(relation, body) def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None: - """ - Try Delta MERGE (Databricks typical). If MERGE fails (non-Delta), fallback to full replace. - """ - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE" + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + + # First: let the current format handler try to do a native merge. + # - DeltaFormatHandler -> DeltaTable.merge() + # - IcebergFormatHandler -> Spark SQL MERGE INTO try: - # Use inline subquery as source; SET * / INSERT * requires Delta ≥ 1.2 / Spark ≥ 3.4. - self.spark.sql( - f""" - MERGE INTO {relation} AS t - USING ({body}) AS s - ON {pred} - WHEN MATCHED THEN UPDATE SET * - WHEN NOT MATCHED THEN INSERT * - """ - ) - except Exception: - # Fallback: Full replace is safer across lake formats - df = self.spark.sql(body) - self._save_df_as_table(relation, df) + self._format_handler.incremental_merge(relation, body, unique_key) + return + except NotImplementedError: + # Format handler doesn't support MERGE → fall back to generic Spark strategy. 
+ pass + + # Fallback for formats without native merge: + # overwrite = (existing minus keys being updated) UNION (new rows) + materialized: list[SDF] = [] + + def _materialize(df: SDF) -> SDF: + """ + Ensure the frame is realized independently of the source table so an + overwrite doesn't conflict with the read path. + """ + try: + cp = df.localCheckpoint(eager=True) + materialized.append(cp) + return cp + except Exception: + cached = df.cache() + cached.count() + materialized.append(cached) + return cached + + try: + physical = self._physical_identifier(relation) + existing = _materialize(self.spark.table(physical)) + incoming = _materialize(self.spark.sql(body)) + + if unique_key: + # ensure key columns exist on incoming + missing = [k for k in unique_key if k not in incoming.columns] + if missing: + raise ModelExecutionError( + node_name="__python_incremental__", + relation=relation, + message=( + "incremental_merge fallback: missing key columns on incoming: " + f"{missing}" + ), + ) + key_df = incoming.select(*unique_key).dropDuplicates() + # left_anti: keep only rows whose keys are NOT in incoming + kept = existing.join(key_df, on=unique_key, how="left_anti") + merged = kept.unionByName(incoming, allowMissingColumns=True) + else: + # No keys → append & deduplicate + merged = existing.unionByName(incoming, allowMissingColumns=True).dropDuplicates() + + merged = _materialize(merged) + # Full overwrite with merged result + self._save_df_as_table(relation, merged) + finally: + for handle in materialized: + with suppress(Exception): + handle.unpersist() def alter_table_sync_schema( self, relation: str, select_sql: str, *, mode: str = "append_new_columns" @@ -408,7 +689,8 @@ def alter_table_sync_schema( return # Target schema try: - target_df = self.spark.table(relation) + physical = self._physical_identifier(relation) + target_df = self.spark.table(physical) except Exception: return existing = {f.name for f in target_df.schema.fields} @@ -429,7 +711,8 @@ def 
_spark_sql_type(dt: DataType) -> str: ) cols_sql = ", ".join([f"`{f.name}` {_spark_sql_type(f.dataType)}" for f in to_add]) - self.spark.sql(f"ALTER TABLE {relation} ADD COLUMNS ({cols_sql})") + table_sql = self._sql_identifier(relation) + self.spark.sql(f"ALTER TABLE {table_sql} ADD COLUMNS ({cols_sql})") # ────────────────────────── local helpers / shim ────────────────────────── diff --git a/src/fastflowtransform/executors/duckdb_exec.py b/src/fastflowtransform/executors/duckdb_exec.py index d937dd7..c642672 100644 --- a/src/fastflowtransform/executors/duckdb_exec.py +++ b/src/fastflowtransform/executors/duckdb_exec.py @@ -22,18 +22,56 @@ def _q(ident: str) -> str: class DuckExecutor(BaseExecutor[pd.DataFrame]): ENGINE_NAME = "duckdb" - def __init__(self, db_path: str = ":memory:"): + def __init__( + self, db_path: str = ":memory:", schema: str | None = None, catalog: str | None = None + ): if db_path and db_path != ":memory:" and "://" not in db_path: with suppress(Exception): Path(db_path).parent.mkdir(parents=True, exist_ok=True) self.db_path = db_path self.con = duckdb.connect(db_path) + self.schema = schema.strip() if isinstance(schema, str) and schema.strip() else None + catalog_override = catalog.strip() if isinstance(catalog, str) and catalog.strip() else None + self.catalog = self._detect_catalog() + if catalog_override: + if self._apply_catalog_override(catalog_override): + self.catalog = catalog_override + else: + self.catalog = self._detect_catalog() + if self.schema: + safe_schema = _q(self.schema) + self.con.execute(f"create schema if not exists {safe_schema}") + self.con.execute(f"set schema '{self.schema}'") + + def _detect_catalog(self) -> str | None: + try: + rows = self.con.execute("PRAGMA database_list").fetchall() + if rows: + return str(rows[0][1]) + except Exception: + return None + return None + + def _apply_catalog_override(self, name: str) -> bool: + alias = name.strip() + if not alias: + return False + try: + if self.db_path != 
":memory:": + resolved = str(Path(self.db_path).resolve()) + with suppress(Exception): + self.con.execute(f"detach database {_q(alias)}") + self.con.execute(f"attach database '{resolved}' as {_q(alias)} (READ_ONLY FALSE)") + self.con.execute(f"set catalog '{alias}'") + return True + except Exception: + return False def clone(self) -> DuckExecutor: """ Generates a new Executor instance with own connection for Thread-Worker. """ - return DuckExecutor(self.db_path) + return DuckExecutor(self.db_path, schema=self.schema, catalog=self.catalog) def _exec_many(self, sql: str) -> None: """ @@ -47,9 +85,34 @@ def _exec_many(self, sql: str) -> None: self.con.execute(stmt) # ---- Frame hooks ---- + def _qualified(self, relation: str, *, quoted: bool = True) -> str: + """ + Return (catalog.)schema.relation if schema is set; otherwise just relation. + When quoted=False, emit bare identifiers for APIs like con.table(). + """ + rel = relation_for(relation) if relation.endswith(".ff") else relation + rel_part = _q(rel) if quoted else rel + if not self.schema: + return rel_part + parts: list[str] = [] + cat_clean = None + include_catalog = False + if isinstance(self.catalog, str): + cat_trimmed = self.catalog.strip() + if cat_trimmed and cat_trimmed.lower() == self.schema.strip().lower(): + include_catalog = True + cat_clean = cat_trimmed + if include_catalog and cat_clean is not None: + parts.append(_q(cat_clean) if quoted else cat_clean) + schema_clean = self.schema.strip() + parts.append(_q(schema_clean) if quoted else schema_clean) + parts.append(rel_part) + return ".".join(parts) + def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: try: - return self.con.table(relation).df() + target = self._qualified(relation, quoted=False) + return self.con.table(target).df() except CatalogException as e: existing = [ r[0] @@ -68,7 +131,8 @@ def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> tmp = "_ff_py_out" try: 
self.con.register(tmp, df) - self.con.execute(f'create or replace table "{relation}" as select * from "{tmp}"') + target = self._qualified(relation) + self.con.execute(f'create or replace table {target} as select * from "{tmp}"') finally: try: self.con.unregister(tmp) @@ -78,14 +142,16 @@ def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> def _create_or_replace_view_from_table( self, view_name: str, backing_table: str, node: Node ) -> None: - self.con.execute(f'create or replace view "{view_name}" as select * from "{backing_table}"') + view_target = self._qualified(view_name) + backing = self._qualified(backing_table) + self.con.execute(f"create or replace view {view_target} as select * from {backing}") def _frame_name(self) -> str: return "pandas" # ---- SQL hooks ---- def _format_relation_for_ref(self, name: str) -> str: - return _q(relation_for(name)) + return self._qualified(relation_for(name)) def _format_source_reference( self, cfg: dict[str, Any], source_name: str, table_name: str @@ -98,19 +164,30 @@ def _format_source_reference( if not identifier: raise KeyError(f"Source {source_name}.{table_name} missing identifier") - parts = [ - p - for p in ( - cfg.get("catalog") or cfg.get("database"), - cfg.get("schema"), - identifier, - ) - if p - ] - if not parts: - parts = [identifier] - - return ".".join(_q(str(part)) for part in parts) + catalog_cfg = cfg.get("catalog") or cfg.get("database") + catalog = ( + catalog_cfg.strip() if isinstance(catalog_cfg, str) and catalog_cfg.strip() else None + ) + schema_candidate = cfg.get("schema") or self.schema + schema = ( + schema_candidate.strip() + if isinstance(schema_candidate, str) and schema_candidate.strip() + else None + ) + if catalog is None and schema and isinstance(self.catalog, str): + cat_clean = self.catalog.strip() + if cat_clean and cat_clean.lower() == schema.lower(): + catalog = cat_clean + if catalog is None and schema is None and isinstance(self.catalog, str): + cat_clean 
= self.catalog.strip() + catalog = cat_clean or None + parts: list[str] = [] + if catalog: + parts.append(catalog) + if schema: + parts.append(schema) + parts.append(identifier) + return ".".join(_q(str(part)) for part in parts if part) def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None: self.con.execute(f"create or replace view {target_sql} as {select_body}") @@ -132,25 +209,32 @@ def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: # ── Incremental API ──────────────────────────────────────────────────── def exists_relation(self, relation: str) -> bool: - sql = """ - select 1 - from information_schema.tables - where table_schema in ('main','temp') - and lower(table_name) = lower(?) - limit 1 - """ - res = self.con.execute(sql, [relation]).fetchone() - return bool(res) + where_tables: list[str] = ["lower(table_name) = lower(?)"] + params: list[str] = [relation] + if self.catalog: + where_tables.append("lower(table_catalog) = lower(?)") + params.append(self.catalog) + if self.schema: + where_tables.append("lower(table_schema) = lower(?)") + params.append(self.schema) + else: + where_tables.append("table_schema in ('main','temp')") + where = " AND ".join(where_tables) + sql_tables = f"select 1 from information_schema.tables where {where} limit 1" + if self.con.execute(sql_tables, params).fetchone(): + return True + sql_views = f"select 1 from information_schema.views where {where} limit 1" + return bool(self.con.execute(sql_views, params).fetchone()) def create_table_as(self, relation: str, select_sql: str) -> None: # Use only the SELECT body and strip trailing semicolons for safety. 
- body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - self.con.execute(f"create table {relation} as {body}") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + self.con.execute(f"create table {self._qualified(relation)} as {body}") def incremental_insert(self, relation: str, select_sql: str) -> None: # Ensure the inner SELECT is clean (no trailing semicolon; SELECT body only). - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - self.con.execute(f"insert into {relation} {body}") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + self.con.execute(f"insert into {self._qualified(relation)} {body}") def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None: """ @@ -161,17 +245,17 @@ def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str statements and DuckDB won't see the CTE from the previous statement. """ # 1) clean inner SELECT - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") # 2) predicate for DELETE keys_pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE" # 3) first: delete collisions - delete_sql = f"delete from {relation} t using ({body}) s where {keys_pred}" + delete_sql = f"delete from {self._qualified(relation)} t using ({body}) s where {keys_pred}" self.con.execute(delete_sql) # 4) then: insert fresh rows - insert_sql = f"insert into {relation} select * from ({body}) src" + insert_sql = f"insert into {self._qualified(relation)} select * from ({body}) src" self.con.execute(insert_sql) def alter_table_sync_schema( @@ -189,15 +273,18 @@ def alter_table_sync_schema( r[0] for r in self.con.execute( "select column_name from information_schema.columns " - + "where lower(table_name)=lower(?)", - [relation], + + "where lower(table_name)=lower(?)" + + (" and lower(table_schema)=lower(?)" if self.schema else ""), + ([relation, 
self.schema] if self.schema else [relation]), ).fetchall() } add = [c for c in cols if c not in existing] for c in add: # Typ heuristisch: typeof aus einer CAST-Probe; fallback VARCHAR + col = _q(c) + target = self._qualified(relation) try: # Versuche Typ aus Expression abzuleiten (best effort) - self.con.execute(f"alter table {relation} add column {c} varchar") + self.con.execute(f"alter table {target} add column {col} varchar") except Exception: - self.con.execute(f"alter table {relation} add column {c} varchar") + self.con.execute(f"alter table {target} add column {col} varchar") diff --git a/src/fastflowtransform/executors/postgres_exec.py b/src/fastflowtransform/executors/postgres_exec.py index c59138d..7590d4a 100644 --- a/src/fastflowtransform/executors/postgres_exec.py +++ b/src/fastflowtransform/executors/postgres_exec.py @@ -191,6 +191,7 @@ def exists_relation(self, relation: str) -> bool: """ ) with self.engine.begin() as con: + self._set_search_path(con) return bool(con.execute(sql, {"t": relation}).fetchone()) def create_table_as(self, relation: str, select_sql: str) -> None: @@ -200,6 +201,18 @@ def create_table_as(self, relation: str, select_sql: str) -> None: self._set_search_path(con) con.execute(text(f"create table {qrel} as {body}")) + def full_refresh_table(self, relation: str, select_sql: str) -> None: + """ + Full refresh for incremental fallbacks: + DROP TABLE IF EXISTS + CREATE TABLE AS. 
+ """ + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + qrel = self._qualified(relation) + with self.engine.begin() as conn: + self._set_search_path(conn) + conn.execute(text(f"drop table if exists {qrel}")) + conn.execute(text(f"create table {qrel} as {body}")) + def incremental_insert(self, relation: str, select_sql: str) -> None: body = self._extract_select_like(select_sql) qrel = self._qualified(relation) diff --git a/src/fastflowtransform/incremental.py b/src/fastflowtransform/incremental.py index 8c2cab6..f522ec2 100644 --- a/src/fastflowtransform/incremental.py +++ b/src/fastflowtransform/incremental.py @@ -1,7 +1,7 @@ +# fastflowtransform/incremental.py from __future__ import annotations from collections.abc import Callable, Sequence -from contextlib import suppress from typing import Any from sqlalchemy import text as _sa_text @@ -25,12 +25,47 @@ def _normalize_unique_key(val: Any) -> list[str]: return [] -def _get_on_schema_change(meta: dict | None) -> str: - v = (meta or {}).get("on_schema_change") or "ignore" - v = str(v).strip().lower() - if v not in {"ignore", "append_new_columns", "sync_all_columns"}: +def _get_schema_sync_policy(meta: dict | None) -> str: + """ + Resolve schema sync / on_schema_change policy with backwards compatibility. 
+ + Priority: + 1) top-level schema_sync + 2) incremental.schema_sync (if present as nested config) + 3) legacy on_schema_change + + Normalizes: + - "none" -> "ignore" + - invalid values -> "ignore" + """ + data = meta or {} + + raw = data.get("schema_sync") + if raw is None: + incr = data.get("incremental") + if isinstance(incr, dict): + raw = incr.get("schema_sync") + if raw is None: + raw = data.get("on_schema_change") + + v = str(raw or "ignore").strip().lower() + + if v in {"none", "ignore"}: return "ignore" - return v + if v in {"append_new_columns", "sync_all_columns"}: + return v + return "ignore" + + +def _is_merge_not_supported_error(exc: Exception) -> bool: + """ + Detect engine messages where MERGE is simply not supported for the target table/catalog. + In those cases we want to gracefully fall back to a full refresh, instead of failing. + """ + msg = str(exc) + text = msg.lower() + # Databricks / Spark-style messages + return "merge into table is not supported" in text or "merge into is not supported" in text # ---------- Helper ---------- @@ -104,11 +139,25 @@ def _maybe_schema_sync(executor: Any, relation: Any, rendered_sql: str, policy: if policy in {"append_new_columns", "sync_all_columns"} and hasattr( executor, "alter_table_sync_schema" ): - with suppress(Exception): - executor.alter_table_sync_schema(relation, rendered_sql, mode=policy) + executor.alter_table_sync_schema(relation, rendered_sql, mode=policy) def _create_table_as_or_replace(executor: Any, relation: Any, rendered_sql: str) -> None: + _full_refresh_table(executor, relation, rendered_sql) + + +def _full_refresh_table(executor: Any, relation: Any, rendered_sql: str) -> None: + """ + Engine-agnostic full refresh: + - If the executor has a `full_refresh_table(...)` method, it is used. + - Otherwise: first try `create_table_as`; on failure, fall back to raw SQL + 'create or replace table ... as ...' (for DuckDB, Postgres, Snowflake, etc.). 
+ """ + full_refresh = getattr(executor, "full_refresh_table", None) + if callable(full_refresh): + full_refresh(relation, rendered_sql) + return + try: executor.create_table_as(relation, rendered_sql) except Exception: @@ -123,27 +172,43 @@ def _merge_or_insert_with_fallback( relation: Any, rendered_sql: str, unique_key: UniqueKey, + *, + fallback_sql: str | None = None, + on_full_refresh_error: Callable[[Exception], None] | None = None, ) -> None: + fallback_sql = fallback_sql or rendered_sql + + def _run_full_refresh() -> None: + try: + _full_refresh_table(executor, relation, fallback_sql) + except Exception as exc: + if on_full_refresh_error is not None: + on_full_refresh_error(exc) + else: + raise + if unique_key: - # auf Liste normalisieren if isinstance(unique_key, str): keys: list[str] = [unique_key] else: - keys = list(unique_key) # Sequence[str] -> List[str] + keys = list(unique_key) try: - # merge mit Keys executor.incremental_merge(relation, rendered_sql, keys) return - except Exception: - _exec_sql(executor, f"create or replace table {relation} as {rendered_sql}") - return - - # kein unique_key -> insert + except Exception as exc: + # 🔑 Only treat "MERGE not supported" as a soft error → fallback + if _is_merge_not_supported_error(exc): + _run_full_refresh() + return + # Any other error (e.g. UNRESOLVED_COLUMN, syntax, etc.) is a *real* failure + raise + + # no unique_key -> insert-only try: executor.incremental_insert(relation, rendered_sql) except Exception: - _exec_sql(executor, f"create or replace table {relation} as {rendered_sql}") + _run_full_refresh() # ---------- Run or dispatch ---------- @@ -151,40 +216,82 @@ def _merge_or_insert_with_fallback( def run_or_dispatch(executor: Any, node: Any, jenv: Any) -> None: """ - Wrapper für executor.run_sql(...): - - fängt materialized='incremental' ab und plant nenewu/merge/insert. - - sonst Delegation an executor. + Incremental materialization for materialized='incremental'. 
+ Method called from BaseExecutor.run_sql(...). """ meta = getattr(node, "meta", {}) or {} - materialized = meta.get("materialized", "table") - if materialized != "incremental": + materialized = meta.get("materialized") + incr_cfg = meta.get("incremental") + + # Determine if incremental is enabled + incr_enabled = False + if isinstance(incr_cfg, bool): + incr_enabled = incr_cfg + elif isinstance(incr_cfg, dict): + # default enabled=True if a dict is present unless explicitly disabled + incr_enabled = incr_cfg.get("enabled", True) + + # Decide whether to treat this as an incremental model + is_incremental_model = False + if materialized == "incremental": + # respect enabled flag if present + is_incremental_model = incr_enabled if incr_cfg is not None else True + elif materialized is None and incr_enabled: + # legacy: "incremental: true" without explicit materialized + is_incremental_model = True + + relation = relation_for(node.name) + + if not is_incremental_model: + rendered_sql = _render_sql_safe(executor, node, jenv) + wrap_and_raise = _wrap_and_raise_factory(node.name, relation, rendered_sql) try: - return executor.run_sql(node, jenv) + _create_table_as_or_replace(executor, relation, rendered_sql) except Exception as e: - rel = relation_for(node.name) - msg = f"{e.__class__.__name__}: {e}" - raise ModelExecutionError(node.name, rel, msg, sql_snippet=None) from e + wrap_and_raise(e) + return - relation = relation_for(node.name) exists = _safe_exists(executor, relation) env = _env_with_incremental(jenv, exists) - rendered_sql = _render_sql_safe(executor, node, env) - wrap_and_raise = _wrap_and_raise_factory(node.name, relation, rendered_sql) + + base_sql = _render_sql_safe(executor, node, env) + + delta_sql = meta.get("delta_sql") + if exists and isinstance(delta_sql, str) and delta_sql.strip(): + rendered_sql = delta_sql.strip() + else: + rendered_sql = base_sql + + fallback_sql = rendered_sql + if exists: + non_incr_env = _env_with_incremental(jenv, False) + 
fallback_sql = _render_sql_safe(executor, node, non_incr_env) + + wrap_incremental = _wrap_and_raise_factory(node.name, relation, rendered_sql) + wrap_full_refresh = _wrap_and_raise_factory(node.name, relation, fallback_sql) unique_key = _normalize_unique_key(meta.get("unique_key")) - schema_policy = _get_on_schema_change(meta) + schema_policy = _get_schema_sync_policy(meta) if not exists: try: - _create_table_as_or_replace(executor, relation, rendered_sql) + _create_table_as_or_replace(executor, relation, fallback_sql) except Exception as e: - wrap_and_raise(e) + wrap_full_refresh(e) return - # exists -> inkrementeller Pfad _maybe_schema_sync(executor, relation, rendered_sql, schema_policy) try: - _merge_or_insert_with_fallback(executor, relation, rendered_sql, unique_key) + _merge_or_insert_with_fallback( + executor, + relation, + rendered_sql, + unique_key, + fallback_sql=fallback_sql, + on_full_refresh_error=wrap_full_refresh, + ) + except ModelExecutionError: + raise except Exception as e: - wrap_and_raise(e) + wrap_incremental(e) diff --git a/src/fastflowtransform/logging.py b/src/fastflowtransform/logging.py index fbe3fe7..f7efb19 100644 --- a/src/fastflowtransform/logging.py +++ b/src/fastflowtransform/logging.py @@ -46,7 +46,9 @@ def _prefix_text_block(text: str) -> str: return "".join(prefixed) -def _prefix_format(fmt: str) -> str: +def _prefix_format(fmt: str | None) -> str | None: + if not fmt: + return fmt if not _prefix_enabled(): return fmt return f"{LOG_PREFIX} {fmt}" diff --git a/src/fastflowtransform/run_executor.py b/src/fastflowtransform/run_executor.py index b70b2bc..36e8cfb 100644 --- a/src/fastflowtransform/run_executor.py +++ b/src/fastflowtransform/run_executor.py @@ -99,6 +99,7 @@ def _run_level( logger: LogQueue | None, engine_abbr: str, name_width: int, + name_formatter: Callable[[str], str] | None, ) -> tuple[bool, int, int, int]: """Führt eine Ebene aus und loggt. 
Rückgabe: (had_error, ok_count, fail_count, lvl_ms).""" if not names: @@ -110,26 +111,31 @@ def _run_level( fail_in_level = 0 level_had_error = False + display_names: dict[str, str] = {} + with ThreadPoolExecutor(max_workers=max(1, int(jobs)), thread_name_prefix="ff-worker") as pool: futures: dict[Future[None], str] = {} for nm in names: - _log_start(logger, lvl_idx, engine_abbr, nm, name_width) + label = name_formatter(nm) if name_formatter else nm + display_names[nm] = label + _log_start(logger, lvl_idx, engine_abbr, label, name_width) futures[pool.submit(task, nm)] = nm for fut in as_completed(futures): nm = futures[fut] + label = display_names.get(nm, nm) try: fut.result() ok_in_level += 1 _log_end( - logger, lvl_idx, engine_abbr, nm, True, int(per_node[nm] * 1000), name_width + logger, lvl_idx, engine_abbr, label, True, int(per_node[nm] * 1000), name_width ) except BaseException as e: level_had_error = True failed[nm] = e fail_in_level += 1 ms = int((per_node.get(nm, perf_counter() - lvl_t0)) * 1000) - _log_end(logger, lvl_idx, engine_abbr, nm, False, ms, name_width) + _log_end(logger, lvl_idx, engine_abbr, label, False, ms, name_width) if fail_policy == "fail_fast": for f in futures: if not f.done(): @@ -153,6 +159,7 @@ def schedule( logger: LogQueue | None = None, engine_abbr: str = "", name_width: int = 28, + name_formatter: Callable[[str], str] | None = None, ) -> ScheduleResult: """Run levels sequentially; within a level run up to `jobs` nodes in parallel.""" per_node: dict[str, float] = {} @@ -174,6 +181,7 @@ def schedule( logger=logger, engine_abbr=engine_abbr, name_width=name_width, + name_formatter=name_formatter, ) if had_error: if on_error: diff --git a/src/fastflowtransform/schema_loader.py b/src/fastflowtransform/schema_loader.py index 2091352..99a551f 100644 --- a/src/fastflowtransform/schema_loader.py +++ b/src/fastflowtransform/schema_loader.py @@ -1,3 +1,4 @@ +# fastflowtransform/schema_loader.py from __future__ import annotations from 
dataclasses import dataclass, field diff --git a/src/fastflowtransform/seeding.py b/src/fastflowtransform/seeding.py index 9afe4c3..72a1c6e 100644 --- a/src/fastflowtransform/seeding.py +++ b/src/fastflowtransform/seeding.py @@ -8,22 +8,22 @@ from contextlib import suppress from pathlib import Path from time import perf_counter -from typing import Any, NamedTuple +from typing import Any, NamedTuple, cast from urllib.parse import unquote, urlparse import pandas as pd -import yaml +from pyspark.sql import DataFrame as SDF, SparkSession from fastflowtransform import storage +from fastflowtransform.config.seeds import SeedsSchemaConfig, load_seeds_schema from fastflowtransform.logging import echo +from fastflowtransform.settings import EngineType try: # Optional Spark dependency from pyspark.errors.exceptions.base import AnalysisException as _SparkAnalysisException except Exception: # pragma: no cover - Spark not installed _SparkAnalysisException = Exception # type: ignore -# If you use this in a CLI, your code elsewhere should provide _prepare_context. - # ----------------------------- File I/O & Schema (dtypes) ----------------------------- @@ -37,20 +37,26 @@ def _read_seed_file(path: Path) -> pd.DataFrame: raise ValueError(f"Unsupported seed file format: {path.name}") -def _apply_schema(df: pd.DataFrame, table: str, schema_cfg: dict | None) -> pd.DataFrame: +def _apply_schema( + df: pd.DataFrame, + table: str, + schema_cfg: SeedsSchemaConfig | None, +) -> pd.DataFrame: """ Apply optional pandas dtypes from seeds/schema.yml for a given table key. - Expected structure: + + The validated configuration is: + dtypes: : col_a: string col_b: int64 - Soft-fails on casting errors to avoid blocking loads. + + Casting errors are swallowed on purpose to avoid blocking seed loads. 
""" if not schema_cfg: return df - cfg = schema_cfg.get("dtypes") or {} - dtypes: dict[str, str] = cfg.get(table) or {} + dtypes: dict[str, str] = schema_cfg.dtypes.get(table) or {} if not dtypes: return df @@ -75,7 +81,7 @@ def _is_qualified(name: str) -> bool: return "." in name -def _qualify(table: str, schema: str | None) -> str: +def _qualify(table: str, schema: str | None, catalog: str | None = None) -> str: """ Return a safely quoted, optionally schema-qualified identifier. - Respects already-qualified names like raw.users or "raw"."users". @@ -83,8 +89,17 @@ def _qualify(table: str, schema: str | None) -> str: """ if _is_qualified(table): return ".".join(_dq(p) for p in table.split(".")) + catalog_part = catalog.strip() if isinstance(catalog, str) and catalog.strip() else None if schema: - return f"{_dq(schema)}.{_dq(table)}" + schema_part = schema.strip() + parts: list[str] = [] + if catalog_part: + parts.append(_dq(catalog_part)) + parts.append(_dq(schema_part)) + parts.append(_dq(table)) + return ".".join(parts) + if catalog_part: + return f"{_dq(catalog_part)}.{_dq(table)}" return _dq(table) @@ -209,16 +224,42 @@ class SeedTarget(NamedTuple): def _engine_name_from_executor(executor: Any) -> str: - """Infer a human/CFG-facing engine name from the executor object.""" + """ + Infer a canonical engine name from the executor object. 
+ + Preference: + 1) executor.engine_name (BaseExecutor-derived) + 2) Spark hint → "databricks_spark" + 3) SQLAlchemy dialect ("postgresql" → "postgres", "bigquery" → "bigquery") + 4) DuckDB heuristic (executor.con present) → "duckdb" + 5) "unknown" as last resort + """ + # 1) BaseExecutor-style engine_name + engine_name = getattr(executor, "engine_name", None) + if isinstance(engine_name, str) and engine_name.strip(): + return engine_name.strip() + + # 2) Spark-style executor if getattr(executor, "spark", None) is not None: - return "spark" + return "databricks_spark" + + # 3) SQLAlchemy-based executors eng = getattr(executor, "engine", None) if eng is not None: - name = getattr(getattr(eng, "dialect", None), "name", None) - if name: - return str(name) + dialect_name = getattr(getattr(eng, "dialect", None), "name", None) + if isinstance(dialect_name, str) and dialect_name: + low = dialect_name.lower() + if low.startswith("postgres"): + return "postgres" + if low.startswith("bigquery"): + return "bigquery" + return low + + # 4) DuckDB-ish: has a .con (DuckDBPyConnection or similar) if getattr(executor, "con", None) is not None: return "duckdb" + + # 5) Fallback return "unknown" @@ -237,7 +278,7 @@ def _seed_id(seeds_dir: Path, path: Path) -> str: def _resolve_schema_and_table_by_cfg( seed_id: str, stem: str, - schema_cfg: dict | None, + schema_cfg: SeedsSchemaConfig | None, executor: Any, default_schema: str | None, ) -> tuple[str | None, str]: @@ -261,12 +302,14 @@ def _resolve_schema_and_table_by_cfg( if not schema_cfg: return schema, table - targets: dict[str, dict] = schema_cfg.get("targets") or {} + targets = schema_cfg.targets engine = _engine_name_from_executor(executor) entry = targets.get(seed_id) if not entry: - entry = targets.get(seed_id.replace("/", ".")) # optional "raw.users" key + # Optional "raw.users" style key as a convenience + dotted_id = seed_id.replace("/", ".") + entry = targets.get(dotted_id) # stem-based only if present (uniqueness checked 
by caller) if not entry and stem in targets: @@ -275,9 +318,10 @@ def _resolve_schema_and_table_by_cfg( if not entry: return schema, table - table = entry.get("table", table) - by_engine = entry.get("schema_by_engine") or {} - schema = by_engine.get(engine, entry.get("schema", schema)) + table = entry.table or table + engine = _engine_name_from_executor(executor) + engine_key = cast(EngineType, engine) + schema = entry.schema_by_engine.get(engine_key) or entry.schema_ or schema return schema, table @@ -304,7 +348,8 @@ def _handle_duckdb(table: str, df: pd.DataFrame, executor: Any, schema: str | No if not is_duck_con: return False - full_name = _qualify(table, schema) + catalog = getattr(executor, "catalog", None) + full_name = _qualify(table, schema, catalog) created_schema = False if schema and not _is_qualified(table): con.execute(f"create schema if not exists {_dq(schema)}") @@ -361,16 +406,28 @@ def _handle_sqlalchemy(table: str, df: pd.DataFrame, executor: Any, schema: str return True -def _handle_spark(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: - """Versucht Spark/Databricks zu erkennen und zu bedienen.""" - spark = getattr(executor, "spark", None) - if spark is None: - return False +def _spark_ident(name: str) -> str: + """Return a Spark-safe identifier (escapes backticks).""" + return name.replace("`", "``") + - def _spark_ident(name: str) -> str: - return name.replace("`", "``") +def _prepare_spark_target( + table: str, + schema: str | None, + executor: Any, + spark: Any, +) -> tuple[str, str, Any, bool]: + """ + Build Spark target identifiers and detect the table location. 
+ Returns: + target_identifier: unquoted table identifier (db.table or table) + target_sql: quoted SQL identifier with backticks + target_location: filesystem location (may be None) + created_schema: whether a database schema was created implicitly + """ created_schema = False + if schema and not _is_qualified(table): spark.sql(f"CREATE DATABASE IF NOT EXISTS `{_spark_ident(schema)}`") created_schema = True @@ -382,62 +439,231 @@ def _spark_ident(name: str) -> str: target_identifier = ".".join(parts) target_sql = ".".join(f"`{_spark_ident(p)}`" for p in parts) target_location = _spark_table_location(parts, spark) + return target_identifier, target_sql, target_location, created_schema + + +def _write_spark_seed_to_path( + spark: Any, + target_identifier: str, + sdf: Any, + storage_meta: dict[str, Any], + table_format: str | None, + table_options: dict[str, Any], +) -> str: + """Write the seed via a custom storage path configuration.""" + storage.spark_write_to_path( + spark, + target_identifier, + sdf, + storage=storage_meta, + default_format=table_format, + default_options=table_options, + ) + return "custom path" - table_format = getattr(executor, "spark_table_format", None) - table_options = getattr(executor, "spark_table_options", None) or {} - - storage_meta = storage.get_seed_storage(target_identifier) - t0 = perf_counter() - sdf = spark.createDataFrame(df) - cleanup_hint = None - - if storage_meta.get("path"): - storage.spark_write_to_path( - spark, - target_identifier, - sdf, - storage=storage_meta, - default_format=table_format, - default_options=table_options, - ) - cleanup_hint = "custom path" - else: - with suppress(Exception): - spark.sql(f"DROP TABLE IF EXISTS {target_sql}") - if target_location and target_location.exists(): - with suppress(Exception): - shutil.rmtree(target_location, ignore_errors=True) - cleanup_hint = "reset location" +def _write_spark_seed_managed( + executor: Any, + spark: SparkSession, + target_identifier: str, + sdf: SDF, + 
table_format: str | None, + table_options: dict[str, Any], +) -> str | None: + """ + Write the seed as a *managed* Spark table (no custom storage path). - def _write() -> None: + For engines like DatabricksSparkExecutor this ensures that the + table_format handler (Delta / Iceberg / etc.) is used, so Iceberg + seeds become proper Iceberg tables. + """ + try: + # Prefer engine-specific helper when available (e.g. DatabricksSparkExecutor) + save_df = getattr(executor, "_save_df_as_table", None) + if callable(save_df): + # Pass a truthy storage dict with path=None so we do *not* get + # redirected back into path-based storage again. + save_df( + target_identifier, + sdf, + storage={"path": None, "options": table_options or {}}, + ) + else: + # Generic Spark fallback: may not be format-aware, but keeps behavior writer = sdf.write.mode("overwrite") if table_format: writer = writer.format(table_format) if table_options: writer = writer.options(**table_options) writer.saveAsTable(target_identifier) + except Exception as exc: + raise RuntimeError( + f"Failed to materialize Spark seed '{target_identifier}' as managed table: {exc}" + ) from exc + # No temporary path to clean up for managed tables + return None + + +def _spark_write_table( + sdf: Any, + target_identifier: str, + table_format: str | None, + table_options: dict[str, Any], +) -> None: + """Perform the actual Spark saveAsTable call with configured options.""" + writer = sdf.write.mode("overwrite") + if table_format: + writer = writer.format(table_format) + if table_options: + writer = writer.options(**table_options) + writer.saveAsTable(target_identifier) + + +def _reset_spark_table_and_location( + spark: Any, + target_sql: str, + target_location: Any, +) -> str | None: + """ + Drop the Spark table and remove the underlying location if possible. + + Returns: + A cleanup hint string (e.g. 'reset location') or None. 
+ """ + with suppress(Exception): + spark.sql(f"DROP TABLE IF EXISTS {target_sql}") + + cleanup_hint: str | None = None + if target_location and target_location.exists(): + with suppress(Exception): + shutil.rmtree(target_location, ignore_errors=True) + cleanup_hint = "reset location" + return cleanup_hint + + +def _write_spark_seed_to_table( + spark: Any, + sdf: Any, + target_identifier: str, + target_sql: str, + target_location: Any, + table_format: str | None, + table_options: dict[str, Any], +) -> str | None: + """ + Write the seed as a managed Spark table, handling common location issues. + + Returns: + A cleanup hint string describing corrective actions, or None. + """ + cleanup_hint = _reset_spark_table_and_location(spark, target_sql, target_location) + + try: + _spark_write_table(sdf, target_identifier, table_format, table_options) + return cleanup_hint + except _SparkAnalysisException as exc: + message = str(exc) + if target_location and "LOCATION_ALREADY_EXISTS" in message.upper(): + # Attempt to fix by resetting the table location and retrying once. + with suppress(Exception): + shutil.rmtree(target_location, ignore_errors=True) + cleanup_hint = "reset location" + _spark_write_table(sdf, target_identifier, table_format, table_options) + return cleanup_hint + raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc + except Exception as exc: # pragma: no cover - generic safety net + raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc + + +def _detect_spark_storage_format( + storage_meta: Any, + table_format: Any, +) -> str: + """ + Determine an effective storage format label (e.g. 'delta') from storage + metadata or executor configuration. 
+ """ + storage_format = "" + if isinstance(storage_meta, dict): + raw_fmt = storage_meta.get("format") + if isinstance(raw_fmt, str) and raw_fmt.strip(): + storage_format = raw_fmt.strip().lower() + + if not storage_format and isinstance(table_format, str) and table_format.strip(): + storage_format = table_format.strip().lower() + + return storage_format + + +def _handle_spark( + table: str, + df: pd.DataFrame, + executor: Any, + schema: str | None, +) -> bool: + """Try to detect and handle Spark/Databricks for seeding.""" + spark = getattr(executor, "spark", None) + if spark is None: + return False + target_identifier, target_sql, target_location, created_schema = _prepare_spark_target( + table=table, + schema=schema, + executor=executor, + spark=spark, + ) + + table_format = getattr(executor, "spark_table_format", None) + table_options = getattr(executor, "spark_table_options", None) or {} + format_handler = getattr(executor, "_format_handler", None) + + storage_meta = storage.get_seed_storage(target_identifier) + + t0 = perf_counter() + sdf = spark.createDataFrame(df) + + cleanup_hint: str | None = None + allows_unmanaged = bool(getattr(format_handler, "allows_unmanaged_paths", lambda: True)()) + + if storage_meta.get("path") and allows_unmanaged: + # Behavior for parquet/delta/etc: respect custom path. 
+ cleanup_hint = _write_spark_seed_to_path( + spark=spark, + target_identifier=target_identifier, + sdf=sdf, + storage_meta=storage_meta, + table_format=table_format, + table_options=table_options, + ) + else: + # Behavior when no path is configured: table-based seed via executor handler try: - _write() - except _SparkAnalysisException as exc: - message = str(exc) - if target_location and "LOCATION_ALREADY_EXISTS" in message.upper(): - with suppress(Exception): - shutil.rmtree(target_location, ignore_errors=True) - cleanup_hint = "reset location" - _write() + if hasattr(executor, "_save_df_as_table"): + executor._save_df_as_table(target_identifier, sdf, storage={"path": None}) + cleanup_hint = None else: - raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc - except Exception as exc: + cleanup_hint = _write_spark_seed_to_table( + spark=spark, + sdf=sdf, + target_identifier=target_identifier, + target_sql=target_sql, + target_location=target_location, + table_format=table_format, + table_options=table_options, + ) + except Exception as exc: # pragma: no cover raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc dt_ms = int((perf_counter() - t0) * 1000) + + storage_format = _detect_spark_storage_format(storage_meta, table_format) + engine_label = f"spark/{storage_format}" if storage_format else "spark" + _echo_seed_line( full_name=target_sql, rows=len(df), cols=df.shape[1], - engine="spark", + engine=engine_label, ms=dt_ms, created_schema=created_schema, action="replaced", @@ -483,18 +709,20 @@ def seed_project(project_dir: Path, executor: Any, default_schema: str | None = """ Load every seed file under /seeds recursively and materialize it. 
- Supports configuration in seeds/schema.yml: - - targets: - : # e.g., "raw/users" (path-based, recommended) - schema: # global target schema - table: # optional rename - schema_by_engine: # optional engine overrides - postgres: raw - duckdb: main - - dtypes: - : - column_a: string - column_b: int64 + Supports configuration in seeds/schema.yml (validated via Pydantic): + + targets: + : # e.g., "raw/users" (path-based, recommended) + schema: # global target schema + table: # optional rename + schema_by_engine: # optional engine overrides (EngineType keys) + postgres: raw + duckdb: main + + dtypes: + : + column_a: string + column_b: int64 Resolution priority for (schema, table): 1) targets[] (e.g., "raw/users") @@ -512,10 +740,8 @@ def seed_project(project_dir: Path, executor: Any, default_schema: str | None = if not seeds_dir.exists(): return 0 - schema_cfg = None - schema_file = seeds_dir / "schema.yml" - if schema_file.exists(): - schema_cfg = yaml.safe_load(schema_file.read_text(encoding="utf-8")) + # Pydantic-validated seeds/schema.yml (or None if not present) + schema_cfg = load_seeds_schema(project_dir) # Collect seed files recursively to allow folder-based schema conventions. paths: list[Path] = [ @@ -544,11 +770,7 @@ def seed_project(project_dir: Path, executor: Any, default_schema: str | None = # If schema.yml uses a bare stem while that stem exists multiple times, # force disambiguation. - if ( - schema_cfg - and (schema_cfg.get("targets") or {}).get(stem) - and stem_counts.get(stem, 0) > 1 - ): + if schema_cfg and stem in schema_cfg.targets and stem_counts.get(stem, 0) > 1: raise ValueError( f'Seed stem "{stem}" appears multiple times. 
' f"Please configure using the path-based seed ID " diff --git a/src/fastflowtransform/settings.py b/src/fastflowtransform/settings.py index 08dbdc1..ea34152 100644 --- a/src/fastflowtransform/settings.py +++ b/src/fastflowtransform/settings.py @@ -1,3 +1,4 @@ +# fastflowtransform/settings.py from __future__ import annotations import os @@ -7,7 +8,7 @@ import yaml from jinja2 import Environment, StrictUndefined -from pydantic import BaseModel, Field, TypeAdapter +from pydantic import BaseModel, ConfigDict, Field, TypeAdapter from pydantic_settings import BaseSettings, SettingsConfigDict from fastflowtransform.errors import ProfileConfigError @@ -15,23 +16,30 @@ EngineType = Literal["duckdb", "postgres", "bigquery", "databricks_spark", "snowflake_snowpark"] -class DuckDBConfig(BaseModel): +class BaseConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + +class DuckDBConfig(BaseConfig): + model_config = ConfigDict(populate_by_name=True, extra="forbid") path: str = ":memory:" # file path or ":memory:" + db_schema: str | None = Field(default=None, alias="schema") + catalog: str | None = None -class PostgresConfig(BaseModel): +class PostgresConfig(BaseConfig): dsn: str | None = None # e.g. 
postgresql+psycopg://user:pass@host:5432/db db_schema: str = "public" -class BigQueryConfig(BaseModel): +class BigQueryConfig(BaseConfig): project: str | None = None dataset: str | None = None location: str | None = None use_bigframes: bool = True -class DatabricksSparkConfig(BaseModel): +class DatabricksSparkConfig(BaseConfig): master: str = "local[*]" app_name: str = "fastflowtransform" extra_conf: dict[str, Any] | None = None @@ -43,7 +51,7 @@ class DatabricksSparkConfig(BaseModel): table_options: dict[str, Any] | None = None -class SnowflakeSnowparkConfig(BaseModel): +class SnowflakeSnowparkConfig(BaseConfig): account: str user: str password: str @@ -53,27 +61,27 @@ class SnowflakeSnowparkConfig(BaseModel): role: str | None = None -class DuckDBProfile(BaseModel): +class DuckDBProfile(BaseConfig): engine: Literal["duckdb"] duckdb: DuckDBConfig -class PostgresProfile(BaseModel): +class PostgresProfile(BaseConfig): engine: Literal["postgres"] postgres: PostgresConfig -class BigQueryProfile(BaseModel): +class BigQueryProfile(BaseConfig): engine: Literal["bigquery"] bigquery: BigQueryConfig -class DatabricksSparkProfile(BaseModel): +class DatabricksSparkProfile(BaseConfig): engine: Literal["databricks_spark"] databricks_spark: DatabricksSparkConfig -class SnowflakeSnowparkProfile(BaseModel): +class SnowflakeSnowparkProfile(BaseConfig): engine: Literal["snowflake_snowpark"] snowflake_snowpark: SnowflakeSnowparkConfig @@ -88,12 +96,6 @@ class SnowflakeSnowparkProfile(BaseModel): ] -class ProjectConfig(BaseModel): - name: str - version: str - models_dir: str = "models" - - class EnvSettings(BaseSettings): model_config = SettingsConfigDict(env_prefix="FF_", env_file=".env", extra="ignore") @@ -106,6 +108,8 @@ class EnvSettings(BaseSettings): # DuckDB DUCKDB_PATH: str | None = None + DUCKDB_SCHEMA: str | None = None + DUCKDB_CATALOG: str | None = None # Postgres PG_DSN: str | None = None @@ -143,12 +147,6 @@ class EnvSettings(BaseSettings): HTTP_ALLOWED_DOMAINS: str | 
None = None # FF_HTTP_ALLOWED_DOMAINS (csv) -def load_project_config(project_dir: Path) -> ProjectConfig: - cfg_path = project_dir / "project.yml" - data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) - return ProjectConfig(**data) - - # ---------- Loader ---------- def load_profiles(project_dir: Path) -> dict: """ @@ -184,10 +182,17 @@ def _env(name: str, default: str | None = "") -> str: # ---------- Resolver ---------- def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profile: profiles: dict[str, dict[str, Any]] = load_profiles(project_dir) + requested = profiles.get(env_name) + fallback = profiles.get("default") + + if profiles and requested is None and fallback is None: + raise ProfileConfigError( + f"Profile '{env_name}' not found " + "in profiles.yml (define it or add a 'default' profile)." + ) + raw: dict[str, Any] = ( - profiles.get(env_name) - or profiles.get("default") - or {"engine": "duckdb", "duckdb": {"path": ":memory:"}} + requested or fallback or {"engine": "duckdb", "duckdb": {"path": ":memory:"}} ) _apply_env_overrides(raw, env) @@ -223,6 +228,8 @@ def _set_if(d: dict[str, Any], key: str, value: Any | None) -> None: def _ov_duckdb(raw: dict[str, Any], env: EnvSettings) -> None: duck = raw.setdefault("duckdb", {}) _set_if(duck, "path", getattr(env, "DUCKDB_PATH", None)) + _set_if(duck, "schema", getattr(env, "DUCKDB_SCHEMA", None)) + _set_if(duck, "catalog", getattr(env, "DUCKDB_CATALOG", None)) def _ov_postgres(raw: dict[str, Any], env: EnvSettings) -> None: diff --git a/src/fastflowtransform/storage.py b/src/fastflowtransform/storage.py index a53d12f..26f9ddb 100644 --- a/src/fastflowtransform/storage.py +++ b/src/fastflowtransform/storage.py @@ -6,6 +6,7 @@ from dataclasses import dataclass, field from pathlib import Path from typing import Any +from uuid import uuid4 @dataclass @@ -123,21 +124,39 @@ def _quote(part: str) -> str: target_sql = ".".join(_quote(p) for p in parts) - spark.sql(f"DROP TABLE IF EXISTS 
{target_sql}") - - path_str = str(path) - if "://" not in path_str: - target_path = Path(path_str) - if target_path.exists(): - shutil.rmtree(target_path, ignore_errors=True) - target_path.parent.mkdir(parents=True, exist_ok=True) - writer = df.write.mode("overwrite") if fmt: writer = writer.format(fmt) if options: writer = writer.options(**options) - writer.save(path_str) + + path_str = str(path) + is_local_path = "://" not in path_str + + if is_local_path: + target_path = Path(path_str) + target_path.parent.mkdir(parents=True, exist_ok=True) + tmp_path = target_path.parent / f".ff_tmp_{target_path.name}_{uuid4().hex}" + if tmp_path.exists(): + shutil.rmtree(tmp_path, ignore_errors=True) + + try: + writer.save(str(tmp_path)) + except Exception: + shutil.rmtree(tmp_path, ignore_errors=True) + raise + + spark.sql(f"DROP TABLE IF EXISTS {target_sql}") + if target_path.exists(): + shutil.rmtree(target_path, ignore_errors=True) + try: + tmp_path.rename(target_path) + except Exception: + shutil.rmtree(tmp_path, ignore_errors=True) + raise + else: + writer.save(path_str) + spark.sql(f"DROP TABLE IF EXISTS {target_sql}") using_clause = f"USING {fmt}" if fmt else "" escaped_path = path_str.replace("'", "''") diff --git a/src/fastflowtransform/table_formats/__init__.py b/src/fastflowtransform/table_formats/__init__.py new file mode 100644 index 0000000..a1083b0 --- /dev/null +++ b/src/fastflowtransform/table_formats/__init__.py @@ -0,0 +1,59 @@ +# fastflowtransform/table_formats/__init__.py +from __future__ import annotations + +from typing import Any + +from pyspark.sql import SparkSession + +from .base import SparkFormatHandler +from .spark_default import DefaultSparkFormatHandler +from .spark_delta import DeltaFormatHandler +from .spark_hudi import HudiFormatHandler +from .spark_iceberg import IcebergFormatHandler + +# Mapping: normalized format name -> handler class +_SPARK_FORMAT_REGISTRY: dict[str, type[SparkFormatHandler]] = { + "delta": DeltaFormatHandler, + 
"iceberg": IcebergFormatHandler, + "hudi": HudiFormatHandler, +} + + +def register_spark_format( + name: str, + handler_cls: type[SparkFormatHandler], +) -> None: + """ + Register or override a Spark format handler. + + This can be used by extensions/plug-ins to add new formats without + touching core code. + """ + _SPARK_FORMAT_REGISTRY[name.lower()] = handler_cls + + +def get_spark_format_handler( + table_format: str | None, + spark: SparkSession, + *, + table_options: dict[str, Any] | None = None, +) -> SparkFormatHandler: + """ + Factory for SparkFormatHandler based on a logical format name. + + - If a specific handler is registered (delta, iceberg, ...), use it. + - Otherwise fall back to DefaultSparkFormatHandler with `table_format`. + """ + fmt = (table_format or "").lower() + handler_cls = _SPARK_FORMAT_REGISTRY.get(fmt) + + if handler_cls is not None: + # Handlers like DeltaFormatHandler/IcebergFormatHandler don't need table_format + return handler_cls(spark, table_options=table_options or {}) + + # Fallback: generic Spark format handler (parquet/orc/etc.) + return DefaultSparkFormatHandler( + spark, + table_format=fmt or None, + table_options=table_options or {}, + ) diff --git a/src/fastflowtransform/table_formats/base.py b/src/fastflowtransform/table_formats/base.py new file mode 100644 index 0000000..b251d0d --- /dev/null +++ b/src/fastflowtransform/table_formats/base.py @@ -0,0 +1,115 @@ +# fastflowtransform/table_formats/base.py +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + +from pyspark.sql import DataFrame as SDF, SparkSession + + +class SparkFormatHandler(ABC): + """ + Abstract base for Spark table format handlers (Delta, Parquet, Iceberg, ...). + + Responsibilities: + - Saving a DataFrame as a managed table. + - Incremental INSERT semantics. + - Optional incremental MERGE semantics (can raise NotImplementedError). 
+ + This is intentionally minimal so that engines (DatabricksSparkExecutor) + can: + - Delegate managed table handling to the handler. + - Still implement engine-level fallbacks for merge semantics. + """ + + def __init__( + self, + spark: SparkSession, + *, + table_format: str | None = None, + table_options: dict[str, Any] | None = None, + ) -> None: + self.spark = spark + self.table_format: str | None = (table_format or "").lower() or None + # Normalize options to strings for Spark + self.table_options: dict[str, str] = { + str(k): str(v) for k, v in (table_options or {}).items() + } + + # ---- Identifier helpers ---- + def qualify_identifier(self, table_name: str, *, database: str | None = None) -> str: + """Return the physical table identifier for Spark APIs (unquoted).""" + return (table_name or "").strip() + + def format_identifier_for_sql(self, table_name: str, *, database: str | None = None) -> str: + """Return a SQL-safe identifier (per-part quoted) for the table.""" + ident = self.qualify_identifier(table_name, database=database) + parts = [p for p in ident.split(".") if p] + if not parts: + return self._quote_part(ident) + return ".".join(self._quote_part(part) for part in parts) + + def format_test_table( + self, table_name: str | None, *, database: str | None = None + ) -> str | None: + if table_name is None: + return None + return self.format_identifier_for_sql(table_name, database=database) + + def allows_unmanaged_paths(self) -> bool: + """Whether storage.path overrides should be honored for this format.""" + return True + + def relation_exists(self, table_name: str, *, database: str | None = None) -> bool: + ident = self.qualify_identifier(table_name, database=database) + try: + return bool(self.spark.catalog.tableExists(ident)) + except Exception: + return False + + @staticmethod + def _quote_part(value: str) -> str: + inner = (value or "").replace("`", "``") + return f"`{inner}`" + + # ---- Required API ---- + @abstractmethod + def 
save_df_as_table(self, table_name: str, df: SDF) -> None: + """ + Save the given DataFrame as a (managed) table. + + The input name is the *fully-qualified* identifier Spark should use, + e.g. "db.table" or just "table". + """ + raise NotImplementedError + + # ---- Optional / defaulted API ---- + def incremental_insert(self, table_name: str, select_body_sql: str) -> None: + """ + Default incremental INSERT implementation, format-agnostic. + + `select_body_sql` must be a *SELECT-able* body (no trailing semicolon), + e.g. "SELECT ... FROM ...". + """ + body = select_body_sql.strip().rstrip(";\n\t ") + if not body.lower().startswith("select"): + # This is a guard; DatabricksSparkExecutor uses _selectable_body already. + raise ValueError(f"incremental_insert expects SELECT body, got: {body[:40]!r}") + self.spark.sql(f"INSERT INTO {table_name} {body}") + + def incremental_merge( + self, + table_name: str, + select_body_sql: str, + unique_key: list[str], + ) -> None: + """ + Optional: incremental MERGE semantics (UPSERT-like). + Subclasses may override this. Default: not supported. + + Engines using this handler MUST be prepared to handle NotImplementedError + and fall back to a more generic strategy. + """ + raise NotImplementedError( + f"incremental_merge is not implemented for format '{self.table_format or 'default'}'" + ) diff --git a/src/fastflowtransform/table_formats/spark_default.py b/src/fastflowtransform/table_formats/spark_default.py new file mode 100644 index 0000000..24b0ac3 --- /dev/null +++ b/src/fastflowtransform/table_formats/spark_default.py @@ -0,0 +1,48 @@ +# fastflowtransform/table_formats/spark_default.py +from __future__ import annotations + +from typing import Any + +from pyspark.sql import DataFrame as SDF, SparkSession + +from fastflowtransform.table_formats.base import SparkFormatHandler + + +class DefaultSparkFormatHandler(SparkFormatHandler): + """ + Default Spark format handler for non-Delta managed tables + (e.g. 
Parquet, ORC, generic catalog tables). + + Responsibilities: + - save_df_as_table() using DataFrameWriter.saveAsTable. + - incremental_insert() uses the base implementation (INSERT INTO ...). + - incremental_merge() is intentionally NOT implemented and is expected + to be handled by the executor via a generic fallback. + """ + + def __init__( + self, + spark: SparkSession, + *, + table_format: str | None = None, + table_options: dict[str, Any] | None = None, + ) -> None: + super().__init__(spark, table_format=table_format, table_options=table_options) + + def save_df_as_table(self, table_name: str, df: SDF) -> None: + """ + Save DataFrame as a managed table using Spark's built-in formats. + + - Overwrites the table content. + - Uses self.table_format (if provided) as the writer format. + - Applies self.table_options as writer options. + """ + writer = df.write.mode("overwrite") + + if self.table_format: + writer = writer.format(self.table_format) + + if self.table_options: + writer = writer.options(**self.table_options) + + writer.saveAsTable(table_name) diff --git a/src/fastflowtransform/table_formats/spark_delta.py b/src/fastflowtransform/table_formats/spark_delta.py new file mode 100644 index 0000000..66937df --- /dev/null +++ b/src/fastflowtransform/table_formats/spark_delta.py @@ -0,0 +1,110 @@ +# fastflowtransform/table_formats/spark_delta.py +from __future__ import annotations + +from typing import Any + +from delta.tables import DeltaTable +from pyspark.sql import DataFrame as SDF, SparkSession + +from fastflowtransform.table_formats.base import SparkFormatHandler + + +class DeltaFormatHandler(SparkFormatHandler): + """ + Delta Lake format handler using delta-spark's DeltaTable API. + + Responsibilities: + - save_df_as_table() with format("delta"). + - incremental_insert(): default SparkFormatHandler implementation + (INSERT INTO). + - incremental_merge(): uses DeltaTable.merge() + with whenMatchedUpdateAll / whenNotMatchedInsertAll. 
+ """ + + def __init__( + self, + spark: SparkSession, + *, + table_options: dict[str, Any] | None = None, + ) -> None: + super().__init__(spark, table_format="delta", table_options=table_options or {}) + + # ---------- Core helpers ---------- + def _delta_table_for(self, table_name: str) -> DeltaTable: + """ + Resolve a DeltaTable from a table name. + + This assumes a managed/catalog Delta table; unmanaged/path-based + tables are handled via the storage layer and *not* by this handler. + """ + try: + return DeltaTable.forName(self.spark, table_name) + except Exception as exc: # pragma: no cover - error path + raise RuntimeError( + f"Delta table '{table_name}' does not exist " + f"or is not registered as a Delta table: {exc}" + ) from exc + + # ---------- Required API ---------- + def save_df_as_table(self, table_name: str, df: SDF) -> None: + """ + Save DataFrame as a managed Delta table. + + Overwrites the table content: + - writer.format("delta") + - writer.mode("overwrite") + - options from self.table_options + """ + writer = df.write.format("delta").mode("overwrite") + + if self.table_options: + writer = writer.options(**self.table_options) + + writer.saveAsTable(table_name) + + # ---------- Incremental API ---------- + # incremental_insert: base implementation is fine: + # INSERT INTO table SELECT ... + # but we keep the signature here for clarity/override if needed. + def incremental_insert(self, table_name: str, select_body_sql: str) -> None: + super().incremental_insert(table_name, select_body_sql) + + def incremental_merge( + self, + table_name: str, + select_body_sql: str, + unique_key: list[str], + ) -> None: + """ + Delta MERGE implementation using DeltaTable.merge API. + + Semantics: + - If unique_key is empty -> falls back to insert-only semantics. 
+ - Otherwise: + MERGE INTO table AS t + USING () AS s + ON AND-joined equality on unique_key + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + """ + body = select_body_sql.strip().rstrip(";\n\t ") + if not unique_key: + # No keys -> treat this as pure append. + self.incremental_insert(table_name, body) + return + + # Materialize the source DataFrame for the merge + source_df = self.spark.sql(body) + + # Build the join predicate: t.k = s.k AND ... + condition = " AND ".join([f"t.`{k}` = s.`{k}`" for k in unique_key]) + + delta_tbl = self._delta_table_for(table_name) + + ( + delta_tbl.alias("t") + .merge(source_df.alias("s"), condition) + .whenMatchedUpdateAll() + .whenNotMatchedInsertAll() + .execute() + ) diff --git a/src/fastflowtransform/table_formats/spark_hudi.py b/src/fastflowtransform/table_formats/spark_hudi.py new file mode 100644 index 0000000..a0ad8ed --- /dev/null +++ b/src/fastflowtransform/table_formats/spark_hudi.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +from typing import Any + +from pyspark.sql import DataFrame as SDF, SparkSession + +from fastflowtransform.table_formats.base import SparkFormatHandler + + +class HudiFormatHandler(SparkFormatHandler): + """ + Hudi format handler using Spark's Hudi integration. + + Responsibilities: + - save_df_as_table() via df.write.format("hudi").saveAsTable(...) + - incremental_insert(): INSERT INTO ... SELECT ... + - incremental_merge(): MERGE INTO ... USING (...) WHEN MATCHED/NOT MATCHED ... + (Hudi's Spark MERGE support must be enabled in the cluster). 
+ """ + + def __init__( + self, + spark: SparkSession, + *, + default_database: str | None = None, + table_options: dict[str, Any] | None = None, + ) -> None: + # table_format="hudi" so the base class knows what we're dealing with + super().__init__(spark, table_format="hudi", table_options=table_options or {}) + self.default_database = default_database or spark.catalog.currentDatabase() + + # ---------- Core helpers ---------- + def _qualify_table_name(self, table_name: str, database: str | None = None) -> str: + """ + Normalize input like "seed_events" or "db.seed_events" to "db.seed_events". + + For Hudi we normally rely on the current Spark catalog / Hive metastore, + so there is no extra "catalog." prefix like in Iceberg. + """ + raw = (table_name or "").strip() + if not raw: + raise ValueError("Empty table name for HudiFormatHandler") + + parts = [p for p in raw.split(".") if p] + if len(parts) == 1: + db = database or self.default_database + return ".".join([db, parts[0]]) + # already db.table or catalog.db.table - just pass through + if len(parts) == 2: + return ".".join(parts) + return ".".join(parts) + + # ---------- Identifier overrides ---------- + def qualify_identifier(self, table_name: str, *, database: str | None = None) -> str: + # For Spark SQL we just use db.table, no extra quoting here - the caller + # can quote if needed. + return self._qualify_table_name(table_name, database=database) + + def allows_unmanaged_paths(self) -> bool: + # Hudi can work as a path-based table as well, so we allow that. + # (Your higher-level executor can still decide whether to use paths.) 
+ return True + + def relation_exists(self, table_name: str, *, database: str | None = None) -> bool: + ident = self.qualify_identifier(table_name, database=database) + try: + return self.spark.catalog.tableExists(ident) + except Exception: + return False + + # ---------- Required API ---------- + def save_df_as_table(self, table_name: str, df: SDF) -> None: + """ + Save DataFrame as a Hudi table registered in the current catalog. + + Typical Hudi options you might pass via table_options include: + - hoodie.datasource.write.recordkey.field + - hoodie.datasource.write.precombine.field + - hoodie.table.name (optional when using saveAsTable) + """ + full_name = self._qualify_table_name(table_name) + + writer = df.write.format("hudi") + for k, v in self.table_options.items(): + writer = writer.option(str(k), str(v)) + + # Full refresh semantics: overwrite the Hudi table + writer.mode("overwrite").saveAsTable(full_name) + + # ---------- Incremental API ---------- + def incremental_insert(self, table_name: str, select_body_sql: str) -> None: + """ + Append-only incremental load. + + Uses Spark SQL INSERT INTO; the Hudi connector will handle this as an + insert/upsert depending on table configuration. + """ + body = select_body_sql.strip().rstrip(";\n\t ") + if not body.lower().startswith("select"): + raise ValueError(f"incremental_insert expects SELECT body, got: {body[:40]!r}") + + full_name = self._qualify_table_name(table_name) + self.spark.sql(f"INSERT INTO {full_name} {body}") + + def incremental_merge( + self, + table_name: str, + select_body_sql: str, + unique_key: list[str], + ) -> None: + """ + Hudi MERGE implementation. + + MERGE INTO db.table AS t + USING () AS s + ON AND-joined equality on unique_key + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + + This requires Hudi's MERGE support to be enabled on your Spark cluster. 
+ """ + body = select_body_sql.strip().rstrip(";\n\t ") + if not unique_key: + # No key - fall back to simple insert + self.incremental_insert(table_name, body) + return + + full_name = self._qualify_table_name(table_name) + pred = " AND ".join([f"t.`{k}` = s.`{k}`" for k in unique_key]) + + self.spark.sql( + f""" + MERGE INTO {full_name} AS t + USING ({body}) AS s + ON {pred} + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + """ + ) diff --git a/src/fastflowtransform/table_formats/spark_iceberg.py b/src/fastflowtransform/table_formats/spark_iceberg.py new file mode 100644 index 0000000..7c6cf36 --- /dev/null +++ b/src/fastflowtransform/table_formats/spark_iceberg.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +from typing import Any + +from pyspark.sql import DataFrame as SDF, SparkSession + +from fastflowtransform.table_formats.base import SparkFormatHandler + + +class IcebergFormatHandler(SparkFormatHandler): + """ + Iceberg format handler using Spark's Iceberg integration. + + Responsibilities: + - save_df_as_table() with format("iceberg"). + - incremental_insert(): default SparkFormatHandler implementation + (INSERT INTO). + - incremental_merge(): uses Spark SQL MERGE INTO ... USING (...) syntax, + which Iceberg supports when the catalog is configured for Iceberg. + """ + + def __init__( + self, + spark: SparkSession, + *, + table_options: dict[str, Any] | None = None, + ) -> None: + super().__init__(spark, table_format="iceberg", table_options=table_options or {}) + self.catalog_name = "iceberg" + + # ---------- Core helpers ---------- + def _qualify_table_name(self, table_name: str, database: str | None = None) -> str: + """ + Normalize arbitrary input like "seed_events" or "db.seed_events" + to the fully-qualified Iceberg identifier "iceberg.db.seed_events". 
+ """ + raw = (table_name or "").strip() + if not raw: + raise ValueError("Empty table name for IcebergFormatHandler") + + parts = [p for p in raw.split(".") if p] + cat = self.catalog_name + + if len(parts) == 1: + # table → iceberg..table + db = database or self.spark.catalog.currentDatabase() + return ".".join([cat, db, parts[0]]) + if len(parts) == 2: + # db.table → iceberg.db.table + return ".".join([cat, *parts]) + # len >= 3: assume already catalog.db.table + return ".".join(parts) + + # ---------- Identifier overrides ---------- + def qualify_identifier(self, table_name: str, *, database: str | None = None) -> str: + return self._qualify_table_name(table_name, database=database) + + def allows_unmanaged_paths(self) -> bool: + return False + + def relation_exists(self, table_name: str, *, database: str | None = None) -> bool: + ident = self.qualify_identifier(table_name, database=database) + try: + self.spark.table(ident) + return True + except Exception: + return False + + # ---------- Required API ---------- + def save_df_as_table(self, table_name: str, df: SDF) -> None: + """ + Save DataFrame as an Iceberg table in the configured catalog. 
+ + Uses DataFrameWriterV2: + + df.writeTo("iceberg.db.table").using("iceberg").createOrReplace() + """ + full_name = self._qualify_table_name(table_name) + + writer = df.writeTo(full_name).using("iceberg") + for k, v in self.table_options.items(): + writer = writer.tableProperty(str(k), str(v)) + + # Upsert semantics for seeds / full-refresh + writer.createOrReplace() + + # ---------- Incremental API ---------- + def incremental_insert(self, table_name: str, select_body_sql: str) -> None: + body = select_body_sql.strip().rstrip(";\n\t ") + if not body.lower().startswith("select"): + raise ValueError(f"incremental_insert expects SELECT body, got: {body[:40]!r}") + + full_name = self._qualify_table_name(table_name) + self.spark.sql(f"INSERT INTO {full_name} {body}") + + def incremental_merge( + self, + table_name: str, + select_body_sql: str, + unique_key: list[str], + ) -> None: + """ + Iceberg MERGE implementation. + + MERGE INTO iceberg.db.table AS t + USING () AS s + ON AND-joined equality on unique_key + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + """ + body = select_body_sql.strip().rstrip(";\n\t ") + if not unique_key: + self.incremental_insert(table_name, body) + return + + full_name = self._qualify_table_name(table_name) + pred = " AND ".join([f"t.`{k}` = s.`{k}`" for k in unique_key]) + + self.spark.sql( + f""" + MERGE INTO {full_name} AS t + USING ({body}) AS s + ON {pred} + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT * + """ + ) diff --git a/src/fastflowtransform/test_registry.py b/src/fastflowtransform/test_registry.py deleted file mode 100644 index 0eb7e81..0000000 --- a/src/fastflowtransform/test_registry.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations - -from typing import Any, Protocol - -from fastflowtransform import testing - - -class Runner(Protocol): - """Callable signature for a generic test runner. - - Returns: - ok (bool): Whether the test passed. 
- message (str | None): Optional human-friendly message (usually set on failure). - example_sql (str | None): Optional example SQL (shown in summary on failure). - """ - - def __call__( - self, con: Any, table: str, column: str | None, params: dict[str, Any] - ) -> tuple[bool, str | None, str | None]: ... - - -def _sql_list(values: list[Any]) -> str: - """Render a simple SQL literal list, portable enough for DuckDB/Postgres/BigQuery.""" - - def lit(v: Any) -> str: - if v is None: - return "NULL" - if isinstance(v, (int, float)): - return str(v) - s = str(v).replace("'", "''") - return f"'{s}'" - - return ", ".join(lit(v) for v in (values or [])) - - -def run_not_null( - con: Any, table: str, column: str | None, params: dict[str, Any] -) -> tuple[bool, str | None, str | None]: - where = params.get("where") - example = f"select count(*) from {table} where {column} is null" + ( - f" and ({where})" if where else "" - ) - if column is None: - # Column is required for not_null - return False, "missing required parameter: column", example - col = column - try: - testing.not_null(con, table, col, where=where) - return True, None, example - except testing.TestFailure as e: - return False, str(e), example - - -def run_unique( - con: Any, table: str, column: str | None, params: dict[str, Any] -) -> tuple[bool, str | None, str | None]: - where = params.get("where") - example = ( - f"select {column} as key, count(*) c from {table}" - + (f" where ({where})" if where else "") - + " group by 1 having count(*) > 1 limit 5" - ) - if column is None: - return False, "missing required parameter: column", example - col = column - try: - testing.unique(con, table, col, where=where) - return True, None, example - except testing.TestFailure as e: - return False, str(e), example - - -def run_accepted_values( - con: Any, table: str, column: str | None, params: dict[str, Any] -) -> tuple[bool, str | None, str | None]: - values = params.get("values") or [] - where = params.get("where") - 
in_list = _sql_list(values) - example = ( - f"select distinct {column} from {table} " - + f"where {column} is not null and {column} not in ({in_list})" - + (f" and ({where})" if where else "") - + " limit 5" - ) - if column is None: - return False, "missing required parameter: column", example - col = column - try: - testing.accepted_values(con, table, col, values=values, where=where) - return True, None, example - except testing.TestFailure as e: - return False, str(e), example - - -# Public registry (extensible). -TESTS: dict[str, Runner] = { - "not_null": run_not_null, - "unique": run_unique, - "accepted_values": run_accepted_values, # NEW - # "relationships": run_relationships, # (later) -} diff --git a/tests/executors/__init__.py b/src/fastflowtransform/testing/__init__.py similarity index 100% rename from tests/executors/__init__.py rename to src/fastflowtransform/testing/__init__.py diff --git a/src/fastflowtransform/testing.py b/src/fastflowtransform/testing/base.py similarity index 71% rename from src/fastflowtransform/testing.py rename to src/fastflowtransform/testing/base.py index bce36fc..cac08f8 100644 --- a/src/fastflowtransform/testing.py +++ b/src/fastflowtransform/testing/base.py @@ -1,4 +1,4 @@ -# src/fastflowtransform/testing.py +# src/fastflowtransform/testing/base.py from __future__ import annotations from collections.abc import Iterable, Sequence @@ -113,8 +113,10 @@ def _pretty_sql(sql: Any) -> str: return repr(sql) -def _sql_list(values: list) -> str: - def lit(v): +def sql_list(values: list[Any] | None) -> str: + """Render a simple SQL literal list, portable enough for DuckDB/Postgres/BigQuery.""" + + def lit(v: Any) -> str: if v is None: return "NULL" if isinstance(v, (int, float)): @@ -122,33 +124,35 @@ def lit(v): s = str(v).replace("'", "''") return f"'{s}'" - return ", ".join(lit(v) for v in values) + return ", ".join(lit(v) for v in (values or [])) def accepted_values( - con: Any, table: str, column: str, *, values: list, where: str | 
None = None -) -> bool: + con: Any, table: str, column: str, *, values: list[Any], where: str | None = None +) -> None: """ - Checks that all non-NULL values of table.column are in the set 'values'. + Fail if any non-NULL value of table.column is outside the set 'values'. """ - in_list = _sql_list(values or []) + # If no values are provided, we consider the check vacuously true. + if not values: + return + + in_list = sql_list(values) + sql = f"select count(*) from {table} where {column} is not null" + sql = f"select count(*) from {table} where {column} is not null and {column} not in ({in_list})" - if where: - sql += f" and ({where})" + n = _scalar(con, sql) if int(n or 0) > 0: - # Beispielwerte zeigen - sample_sql = ( - f"select distinct {column} " - f"from {table} " - f"where {column} is not null and {column} not in ({in_list})" - ) + sample_sql = f"select distinct {column} from {table} where {column} is not null" + if in_list: + sample_sql += f" and {column} not in ({in_list})" if where: + sql += f" and ({where})" sample_sql += f" and ({where})" sample_sql += " limit 5" rows = [r[0] for r in _exec(con, sample_sql).fetchall()] raise TestFailure(f"{table}.{column} has {n} value(s) outside accepted set; e.g. {rows}") - return True # ===== Tests ============================================================== @@ -186,7 +190,6 @@ def not_null(con: Any, table: str, column: str, where: str | None = None) -> Non c = _scalar(con, sql) except Exception as e: raise _wrap_db_error("not_null", table, column, sql, e) from e - c = _scalar(con, sql) dprint("not_null:", sql, "=>", c) if c and c != 0: _fail("not_null", table, column, sql, f"has {c} NULL-values") @@ -236,13 +239,99 @@ def row_count_between(con: Any, table: str, min_rows: int = 1, max_rows: int | N def freshness(con: Any, table: str, ts_col: str, max_delay_minutes: int) -> None: - # Straightforward for DuckDB/PG; BQ would need a TIMESTAMPDIFF variant. 
- sql = f"select date_part('epoch', now() - max({ts_col})) / 60.0 as delay_min from {table}" - # Note: DuckDB has different date_diff signatures; for DuckDB-only might use: - # sql_duckdb = - # f\"\"\"select date_diff('minute', max({ts_col}), now()) as delay_min from {table}\"\"\" - delay = _scalar(con, sql) - dprint("freshness:", sql, "=>", delay) + """ + Fail if the latest timestamp in `ts_col` is older than `max_delay_minutes`. + + Behaviour: + - First, run a lightweight probe on max(ts_col) to detect clearly wrong types + (e.g. VARCHAR) and emit an actionable error instead of an engine-specific + type/binder exception. + - Then compute the delay in minutes using an engine-friendly expression: + * Postgres / DuckDB: date_part('epoch', now() - max(ts_col)) / 60.0 + * Spark / Databricks: + (unix_timestamp(current_timestamp()) - unix_timestamp(max(ts_col))) / 60.0 + + For Spark-like connections we go straight to the unix_timestamp variant so + we do not trigger noisy INVALID_EXTRACT_FIELD logs from the planner. + """ + # 1) Probe type: read max(ts_col) and inspect the Python value that comes back. + probe_sql = f"select max({ts_col}) from {table}" + try: + probe = _scalar(con, probe_sql) + except Exception as e: + # Column missing or other metadata-related DB error + raise _wrap_db_error("freshness", table, ts_col, probe_sql, e) from e + + # If max(...) comes back as a string, this is almost certainly a typed-as-VARCHAR + # timestamp column. Fail with a clear hint instead of letting the engine throw. + if probe is not None and isinstance(probe, str): + raise TestFailure( + f"[freshness] {table}.{ts_col} must be a TIMESTAMP-like column, but " + f"max({ts_col}) returned a value of type {type(probe).__name__}.\n" + f"Hint: cast the column in your model, for example:\n" + f" select ..., CAST({ts_col} AS TIMESTAMP) as {ts_col}, ...\n" + f"and then reference that column in the freshness test." + ) + + # 2) Decide which SQL to use based on the connection type. 
+ # + # We cannot rely on a formal engine flag here, but the Databricks/Spark + # test connection lives in the databricks_spark_exec module and/or wraps + # a SparkSession. We use a simple heuristic on the connection type name + # and module to detect "Spark-like" behaviour. + con_type = type(con) + mod = getattr(con_type, "__module__", "") or "" + name = getattr(con_type, "__name__", "") or "" + mod_l = mod.lower() + name_l = name.lower() + is_spark_like = any(token in mod_l or token in name_l for token in ("spark", "databricks")) + + # Primary SQL (Postgres / DuckDB style) + sql_primary = ( + f"select date_part('epoch', now() - max({ts_col})) / 60.0 as delay_min from {table}" + ) + + # Spark / Databricks: unix_timestamp over timestamps + sql_spark = ( + "select " + f"(unix_timestamp(current_timestamp()) - unix_timestamp(max({ts_col}))) / 60.0 " + f"as delay_min from {table}" + ) + + delay = None + sql_used: str + + if is_spark_like: + # For Spark-like engines we never send the date_part('epoch', ...) SQL, + # to avoid INVALID_EXTRACT_FIELD noise in the logs. + sql_used = sql_spark + try: + delay = _scalar(con, sql_spark) + except Exception as e: + raise _wrap_db_error("freshness", table, ts_col, sql_spark, e) from e + else: + # Non-Spark engines: try the Postgres/DuckDB expression first. + sql_used = sql_primary + try: + delay = _scalar(con, sql_primary) + except Exception as e: + txt = str(e).lower() + # If the engine complains about invalid extract fields / epoch, + # attempt the Spark-style expression as a fallback. 
+ if ( + "invalid_extract_field" in txt + or "cannot extract" in txt + or ("epoch" in txt and "extract" in txt) + ): + sql_used = sql_spark + try: + delay = _scalar(con, sql_spark) + except Exception as e2: + raise _wrap_db_error("freshness", table, ts_col, sql_spark, e2) from e2 + else: + raise _wrap_db_error("freshness", table, ts_col, sql_primary, e) from e + + dprint("freshness:", sql_used, "=>", delay) if delay is None or delay > max_delay_minutes: raise TestFailure( f"freshness of {table}.{ts_col} too old: {delay} min > {max_delay_minutes} min" diff --git a/src/fastflowtransform/testing/registry.py b/src/fastflowtransform/testing/registry.py new file mode 100644 index 0000000..2dc60b8 --- /dev/null +++ b/src/fastflowtransform/testing/registry.py @@ -0,0 +1,370 @@ +# fastflowtransform/testing/registry.py +from __future__ import annotations + +from typing import Any, Protocol + +from fastflowtransform.testing import base as testing + + +class Runner(Protocol): + """Callable signature for a generic test runner. + + Returns: + ok (bool): Whether the test passed. + message (str | None): Optional human-friendly message (usually set on failure). + example_sql (str | None): Optional example SQL (shown in summary on failure). + """ + + def __call__( + self, con: Any, table: str, column: str | None, params: dict[str, Any] + ) -> tuple[bool, str | None, str | None]: ... 
+ + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _example_where(where: str | None) -> str: + """Return a ' where (...)' suffix if where is provided, otherwise empty string.""" + return f" where ({where})" if where else "" + + +# --------------------------------------------------------------------------- +# Basic column-level tests +# --------------------------------------------------------------------------- + + +def run_not_null( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + where = params.get("where") + example = f"select count(*) from {table} where {column} is null" + ( + f" and ({where})" if where else "" + ) + if column is None: + # Column is required for not_null + return False, "missing required parameter: column", example + col = column + try: + testing.not_null(con, table, col, where=where) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_unique( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + where = params.get("where") + example = ( + f"select {column} as key, count(*) c from {table}" + + (f" where ({where})" if where else "") + + " group by 1 having count(*) > 1 limit 5" + ) + if column is None: + return False, "missing required parameter: column", example + col = column + try: + testing.unique(con, table, col, where=where) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_accepted_values( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.accepted_values.""" + values = params.get("values") or [] + where = params.get("where") + + if column is None: + example = "-- accepted_values: column parameter 
is required" + return False, "missing required parameter: column", example + + if not values: + # No values configured -> we treat this as a no-op check. + example = f"-- accepted_values: no values provided; check is skipped for {table}.{column}" + return True, None, example + + in_list = testing.sql_list(values) + example = ( + f"select distinct {column} from {table} " + + f"where {column} is not null and {column} not in ({in_list})" + + (f" and ({where})" if where else "") + + " limit 5" + ) + + col = column + try: + testing.accepted_values(con, table, col, values=values, where=where) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_greater_equal( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.greater_equal (column >= threshold).""" + threshold = float(params.get("threshold", 0.0)) + if column is None: + example = f"select count(*) from {table} where < {threshold}" + return False, "missing required parameter: column", example + + example = f"select count(*) from {table} where {column} < {threshold}" + col = column + try: + testing.greater_equal(con, table, col, threshold=threshold) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_non_negative_sum( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.non_negative_sum.""" + if column is None: + example = f"select coalesce(sum(), 0) from {table}" + return False, "missing required parameter: column", example + + example = f"select coalesce(sum({column}), 0) from {table}" + col = column + try: + testing.non_negative_sum(con, table, col) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +# --------------------------------------------------------------------------- +# Table-level tests +# 
--------------------------------------------------------------------------- + + +def run_row_count_between( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.row_count_between.""" + min_rows = int(params.get("min_rows", 1)) + max_rows_param = params.get("max_rows") + max_rows = int(max_rows_param) if max_rows_param is not None else None + + example = f"select count(*) from {table}" + try: + testing.row_count_between(con, table, min_rows=min_rows, max_rows=max_rows) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_freshness( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.freshness (max timestamp delay in minutes).""" + if column is None: + example = ( + f"select date_part('epoch', now() - max()) / 60.0 as delay_min from {table}" + ) + return False, "missing required parameter: column (ts_col)", example + + max_delay_raw = params.get("max_delay_minutes") + example = f"select date_part('epoch', now() - max({column})) / 60.0 as delay_min from {table}" + + if max_delay_raw is None: + return False, "missing required parameter: max_delay_minutes", example + + try: + max_delay_int = int(max_delay_raw) + except (TypeError, ValueError): + return ( + False, + f"invalid max_delay_minutes (expected integer minutes, got {max_delay_raw!r})", + example, + ) + + col = column + try: + testing.freshness(con, table, col, max_delay_minutes=max_delay_int) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +# --------------------------------------------------------------------------- +# Helpers for reconcile tests +# --------------------------------------------------------------------------- + + +def _example_scalar_side(side: dict[str, Any]) -> str: + """Render an example SELECT for a reconcile side.""" + tbl = 
side.get("table", "
") + expr = side.get("expr", "") + where = side.get("where") + return f"select {expr} from {tbl}" + (f" where {where}" if where else "") + + +def _example_coverage_sql( + source: dict[str, Any], + target: dict[str, Any], + source_where: str | None, + target_where: str | None, +) -> str: + """Render an example SQL for reconcile_coverage.""" + s_tbl, s_key = source.get("table", ""), source.get("key", "") + t_tbl, t_key = target.get("table", ""), target.get("key", "") + s_w = f" where {source_where}" if source_where else "" + t_w = f" where {target_where}" if target_where else "" + return f""" +with src as (select {s_key} as k from {s_tbl}{s_w}), + tgt as (select {t_key} as k from {t_tbl}{t_w}) +select count(*) from src s +left join tgt t on s.k = t.k +where t.k is null +""".strip() + + +# --------------------------------------------------------------------------- +# Reconcile tests +# --------------------------------------------------------------------------- + + +def run_reconcile_equal( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.reconcile_equal (left == right within tolerances).""" + left = params.get("left") + right = params.get("right") + abs_tol = params.get("abs_tolerance") + rel_tol = params.get("rel_tolerance_pct") + + if not isinstance(left, dict) or not isinstance(right, dict): + example = "-- reconcile_equal requires 'left' and 'right' dict parameters" + return False, "missing or invalid 'left'/'right' parameters", example + + example = _example_scalar_side(left) + ";\n" + _example_scalar_side(right) + + try: + testing.reconcile_equal( + con, + left=left, + right=right, + abs_tolerance=abs_tol, + rel_tolerance_pct=rel_tol, + ) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_reconcile_ratio_within( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + 
"""Runner for testing.reconcile_ratio_within (min_ratio <= L/R <= max_ratio).""" + left = params.get("left") + right = params.get("right") + min_ratio = params.get("min_ratio") + max_ratio = params.get("max_ratio") + + if not isinstance(left, dict) or not isinstance(right, dict): + example = "-- reconcile_ratio_within requires 'left' and 'right' dict parameters" + return False, "missing or invalid 'left'/'right' parameters", example + + if min_ratio is None or max_ratio is None: + example = _example_scalar_side(left) + ";\n" + _example_scalar_side(right) + return False, "missing required parameters: min_ratio / max_ratio", example + + example = _example_scalar_side(left) + ";\n" + _example_scalar_side(right) + + try: + testing.reconcile_ratio_within( + con, + left=left, + right=right, + min_ratio=float(min_ratio), + max_ratio=float(max_ratio), + ) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_reconcile_diff_within( + con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.reconcile_diff_within (|L - R| <= max_abs_diff).""" + left = params.get("left") + right = params.get("right") + max_abs_diff = params.get("max_abs_diff") + + if not isinstance(left, dict) or not isinstance(right, dict): + example = "-- reconcile_diff_within requires 'left' and 'right' dict parameters" + return False, "missing or invalid 'left'/'right' parameters", example + + if max_abs_diff is None: + example = _example_scalar_side(left) + ";\n" + _example_scalar_side(right) + return False, "missing required parameter: max_abs_diff", example + + example = _example_scalar_side(left) + ";\n" + _example_scalar_side(right) + + try: + testing.reconcile_diff_within( + con, + left=left, + right=right, + max_abs_diff=float(max_abs_diff), + ) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +def run_reconcile_coverage( 
+ con: Any, table: str, column: str | None, params: dict[str, Any] +) -> tuple[bool, str | None, str | None]: + """Runner for testing.reconcile_coverage (anti-join count == 0).""" + source = params.get("source") + target = params.get("target") + source_where = params.get("source_where") + target_where = params.get("target_where") + + if not isinstance(source, dict) or not isinstance(target, dict): + example = "-- reconcile_coverage requires 'source' and 'target' dict parameters" + return False, "missing or invalid 'source'/'target' parameters", example + + example = _example_coverage_sql(source, target, source_where, target_where) + + try: + testing.reconcile_coverage( + con, + source=source, + target=target, + source_where=source_where, + target_where=target_where, + ) + return True, None, example + except testing.TestFailure as e: + return False, str(e), example + + +# --------------------------------------------------------------------------- +# Registry +# --------------------------------------------------------------------------- + +# Public registry (extensible). 
+TESTS: dict[str, Runner] = { + "not_null": run_not_null, + "unique": run_unique, + "accepted_values": run_accepted_values, + "greater_equal": run_greater_equal, + "non_negative_sum": run_non_negative_sum, + "row_count_between": run_row_count_between, + "freshness": run_freshness, + # Reconcile tests + "reconcile_equal": run_reconcile_equal, + "reconcile_ratio_within": run_reconcile_ratio_within, + "reconcile_diff_within": run_reconcile_diff_within, + "reconcile_coverage": run_reconcile_coverage, +} diff --git a/src/fastflowtransform/validation.py b/src/fastflowtransform/validation.py index 9dd9255..4f8e4c7 100644 --- a/src/fastflowtransform/validation.py +++ b/src/fastflowtransform/validation.py @@ -7,10 +7,6 @@ def validate_required_columns(node_name: str, inputs: Any, requires: dict[str, set[str]]) -> None: - """ - inputs: entweder ein DataFrame (bei 1 Dep) oder dict[str, DataFrame] (bei >1 Deps) - requires: Mapping physische_relations_name -> set[columns] - """ if not requires: return errors = [] diff --git a/tests/.env.dev_databricks b/tests/.env.dev_databricks new file mode 100644 index 0000000..2ff082a --- /dev/null +++ b/tests/.env.dev_databricks @@ -0,0 +1,3 @@ +# Databricks Spark profile defaults for unittests +FF_SPARK_MASTER=local[*] +JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/tests/.env.dev_duckdb b/tests/.env.dev_duckdb new file mode 100644 index 0000000..e93a460 --- /dev/null +++ b/tests/.env.dev_duckdb @@ -0,0 +1,2 @@ +# DuckDB profile for unittests +FF_DUCKDB_PATH=.local/unittests.duckdb diff --git a/tests/.env.dev_postgres b/tests/.env.dev_postgres new file mode 100644 index 0000000..c374662 --- /dev/null +++ b/tests/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for unittests +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=incremental_demo diff --git a/tests/common/fixtures.py b/tests/common/fixtures.py index 4092c6a..a46c8b3 100644 --- a/tests/common/fixtures.py +++ b/tests/common/fixtures.py @@ 
-8,8 +8,11 @@ import pandas as pd import psycopg import pytest +import sqlalchemy as sa +from dotenv import load_dotenv from jinja2 import Environment, FileSystemLoader, select_autoescape from psycopg import sql +from sqlalchemy import text from fastflowtransform import utest from fastflowtransform.core import REGISTRY @@ -17,6 +20,21 @@ from tests.common.utils import ROOT, run +# ---- Load Env Variables ---- +@pytest.fixture(scope="session", autouse=True) +def load_test_env(): + candidates = [ + ROOT / "tests" / ".env", + ROOT / "tests" / ".env.dev_databricks", + ROOT / "tests" / ".env.dev_duckdb", + ROOT / "tests" / ".env.dev_postgres", + ] + + for env_file in candidates: + if env_file.is_file(): + load_dotenv(env_file, override=False) + + # ---- Jinja Env ---- @pytest.fixture(scope="session") def jinja_env(): @@ -59,17 +77,21 @@ def duckdb_env(duckdb_db_path): return {"FF_ENGINE": "duckdb", "FF_DUCKDB_PATH": str(duckdb_db_path)} -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def duckdb_seeded(duckdb_project, duckdb_env): db_path = duckdb_env.get("FF_DUCKDB_PATH") - if db_path: - db_file = Path(db_path) + db_file = Path(db_path) if db_path else None + if db_file: if db_file.exists(): db_file.unlink() - # ensure parent dir exists for fresh DB creation - db_file.parent.mkdir(parents=True, exist_ok=True) + db_file.parent.mkdir(parents=True, exist_ok=True) run(["fft", "seed", str(duckdb_project), "--env", "dev"], duckdb_env) - yield + try: + yield + finally: + if db_file: + with suppress(Exception): + db_file.unlink() # ---- Postgres ---- @@ -80,15 +102,13 @@ def pg_project(): @pytest.fixture(scope="session") def pg_env(): - # Passe DSN/Schema bei Bedarf an dein Profil an dsn = os.environ.get("FF_PG_DSN", "postgresql+psycopg://postgres:postgres@localhost:5432/ffdb") - schema = os.environ.get("FF_PG_SCHEMA", "public") # falls Profile das verwenden + schema = os.environ.get("FF_PG_SCHEMA", "public") return {"FF_ENGINE": "postgres", "FF_PG_DSN": 
dsn, "FF_PG_SCHEMA": schema} @pytest.fixture(scope="module") def pg_seeded(pg_project, pg_env): - # optional: Datenbank leeren/neu erstellen - je nach CI-Setup dsn = pg_env.get("FF_PG_DSN") schema = pg_env.get("FF_PG_SCHEMA") or "public" if dsn and schema and ("psycopg://" in dsn or "+psycopg" in dsn): @@ -126,7 +146,10 @@ def _make(**kwargs): # make .config(...) chainable fake_builder.config.return_value = fake_builder fake_builder.enableHiveSupport.return_value = fake_builder - fake_spark = MagicMock() + fake_conf = MagicMock() + fake_sc_conf = MagicMock() + fake_sc = MagicMock(getConf=MagicMock(return_value=fake_sc_conf)) + fake_spark = MagicMock(conf=fake_conf, sparkContext=fake_sc) fake_builder.getOrCreate.return_value = fake_spark ex = DatabricksSparkExecutor(**kwargs) @@ -150,10 +173,31 @@ def spark_exec(spark_tmpdir: Path) -> DatabricksSparkExecutor: ) +@pytest.fixture(scope="session") +def spark_exec_delta(spark_tmpdir): + try: + pass + except Exception: + pytest.skip("delta-spark is not installed; skipping Delta tests") + + extra_conf = { + "spark.ui.enabled": "false", + "spark.sql.shuffle.partitions": "1", + } + + return DatabricksSparkExecutor( + master="local[*]", + app_name="fft-it-delta", + warehouse_dir=str(spark_tmpdir), + database=os.getenv("FF_DBR_DATABASE", "default"), + extra_conf=extra_conf, + table_format="delta", # executor will configure Delta & verify it + ) + + # ---- utest ---- @pytest.fixture def fake_registry(tmp_path, monkeypatch): - # wir brauchen ein REGISTRY mit projekt-dir und 1 node node = SimpleNamespace(name="model_a", kind="sql", deps=["src1"]) reg = SimpleNamespace( nodes={"model_a": node}, @@ -161,7 +205,6 @@ def fake_registry(tmp_path, monkeypatch): get_project_dir=lambda: tmp_path, ) monkeypatch.setattr(utest, "REGISTRY", reg) - # relation_for -> immer schema.model monkeypatch.setattr(utest, "relation_for", lambda name: f"public.{name}") return reg @@ -169,14 +212,13 @@ def fake_registry(tmp_path, monkeypatch): 
@pytest.fixture def duckdb_executor(): """ - Fake-Executor, der dem DuckDB-Pfad ähnelt: - - hat .con + Fake-Executor: + - has .con - con.register(...) - con.execute(...) - con.table(...).df() """ con = MagicMock() - # für _read_result (duckdb) table_df = pd.DataFrame([{"id": 1}]) con.table.return_value.df.return_value = table_df @@ -184,9 +226,7 @@ class DuckEx: def __init__(self, con): self.con = con - # für _execute_node(sql) def run_sql(self, node, jenv): - # schreibt nix, simuliert nur Erfolg return None def run_python(self, node): @@ -208,3 +248,57 @@ def __init__(self, engine): self.schema = "public" return PgEx(engine) + + +# ---- Examples ---- +@pytest.fixture(scope="session") +def duckdb_engine_env(tmp_path_factory): + """Basic env for DuckDB examples.""" + db_dir = tmp_path_factory.mktemp("duckdb") + db_path = db_dir / "examples.duckdb" + return { + "FF_ENGINE": "duckdb", + "FF_DUCKDB_PATH": str(db_path), + } + + +@pytest.fixture(scope="session") +def postgres_engine_env(): + """Basic env für Postgres. Skipped if DSN is missing or DB not reachable.""" + dsn = os.environ.get( + "FF_PG_DSN", + "postgresql+psycopg://postgres:postgres@localhost:5432/ffdb", + ) + schema = os.environ.get("FF_PG_SCHEMA", "public") + + # Optional: Connectivity-Check + try: + engine = sa.create_engine(dsn) + with engine.connect() as conn: + conn.execute(text("select 1")) + except Exception as exc: + pytest.skip(f"Postgres not reachable at DSN={dsn!r}: {exc}") + + return { + "FF_ENGINE": "postgres", + "FF_PG_DSN": dsn, + "FF_PG_SCHEMA": schema, + } + + +@pytest.fixture(scope="session") +def spark_engine_env(tmp_path_factory): + """Basic env for Databricks-Spark-Executor. 
Skipped if JAVA_HOME is missing.""" + if not os.environ.get("JAVA_HOME"): + pytest.skip("JAVA_HOME not set for Spark tests") + + warehouse = tmp_path_factory.mktemp("spark_warehouse") + + return { + "FF_ENGINE": "databricks_spark", + "FF_SPARK_MASTER": "local[*]", + "FF_SPARK_APP_NAME": "fft_examples_ci", + "FF_DBR_ENABLE_HIVE": "1", + "FF_DBR_DATABASE": "ff_examples_ci", + "FF_SPARK_WAREHOUSE_DIR": str(warehouse), + } diff --git a/tests/common/mock/profiles.py b/tests/common/mock/profiles.py index 06de95c..b36fc41 100644 --- a/tests/common/mock/profiles.py +++ b/tests/common/mock/profiles.py @@ -41,6 +41,8 @@ def fake_bigquery_profile( def fake_duckdb_profile( *, path: str = ":memory:", + schema: str | None = None, + catalog: str | None = None, ) -> Profile: """ Fake DuckDB profile - just enough for _validate_profile_params and _make_executor. @@ -49,6 +51,8 @@ def fake_duckdb_profile( engine="duckdb", duckdb=SimpleNamespace( path=path, + db_schema=schema, + catalog=catalog, ), ) return cast(Profile, ns) diff --git a/tests/executors/duckdb/__init__.py b/tests/executors/duckdb/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/executors/duckdb/test_reconcile.py b/tests/executors/duckdb/test_reconcile.py deleted file mode 100644 index 5e4aae3..0000000 --- a/tests/executors/duckdb/test_reconcile.py +++ /dev/null @@ -1,13 +0,0 @@ -import pytest - -from tests.common.utils import run - - -@pytest.mark.duckdb -def test_reconcile_duckdb_smoke(duckdb_project, duckdb_env, duckdb_seeded): - # Run full test suite but select only reconcile-tagged checks - env = dict(duckdb_env) - res = run(["fft", "test", str(duckdb_project), "--env", "dev", "--select", "reconcile"], env) - # Exit code 0 or 2 is handled in run(); we assert substring in stdout to ensure checks ran - assert "orders_count_equals_mart" in res.stdout or "orders ⇔ mart_orders_enriched" in res.stdout - assert "Totals" in res.stdout diff --git a/tests/executors/duckdb/test_ref_source.py 
b/tests/executors/duckdb/test_ref_source.py deleted file mode 100644 index 15b639d..0000000 --- a/tests/executors/duckdb/test_ref_source.py +++ /dev/null @@ -1,24 +0,0 @@ -# tests/duckdb/test_ref_source.py -import duckdb -import pytest - -from tests.common.utils import ROOT, run - -PROJ = ROOT / "examples" / "simple_duckdb" -DB = PROJ / ".local" / "demo.duckdb" -ENV = {"FF_ENGINE": "duckdb", "FF_DUCKDB_PATH": str(DB)} - - -@pytest.mark.duckdb -def test_ref_and_source_duckdb(duckdb_seeded, duckdb_project, duckdb_env): - # Seeds sind durch duckdb_seeded geladen - run(["fft", "run", str(duckdb_project), "--env", "dev"], duckdb_env) - - con = duckdb.connect(duckdb_env["FF_DUCKDB_PATH"]) - orders_count = con.execute("select count(*) from orders").fetchone() - assert orders_count is not None - assert orders_count[0] >= 1 # source() - - mart_users_count = con.execute("select count(*) from mart_users").fetchone() - assert mart_users_count is not None - assert mart_users_count[0] >= 1 # ref() -> users_enriched diff --git a/tests/executors/postgres/__init__.py b/tests/executors/postgres/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/executors/postgres/test_reconcile.py b/tests/executors/postgres/test_reconcile.py deleted file mode 100644 index 53edc3f..0000000 --- a/tests/executors/postgres/test_reconcile.py +++ /dev/null @@ -1,10 +0,0 @@ -import pytest - -from tests.common.utils import run - - -@pytest.mark.postgres -def test_reconcile_postgres_smoke(pg_project, pg_env, pg_seeded): - # Same as DuckDB; relies on the example PG project mirroring objects - res = run(["fft", "test", str(pg_project), "--env", "stg", "--select", "reconcile"], pg_env) - assert "Totals" in res.stdout diff --git a/tests/executors/postgres/test_ref_source.py b/tests/executors/postgres/test_ref_source.py deleted file mode 100644 index 0efccbe..0000000 --- a/tests/executors/postgres/test_ref_source.py +++ /dev/null @@ -1,21 +0,0 @@ -# 
tests/postgres/test_ref_source_postgres.py -import pytest -from sqlalchemy import create_engine, text - -from tests.common.utils import run - - -@pytest.mark.postgres -def test_ref_and_source_pg(pg_seeded, pg_project, pg_env): - # gleiche ENV wie beim Seeding → gleiches DSN/Schema - run(["fft", "run", str(pg_project), "--env", "stg"], pg_env) - - eng = create_engine(pg_env["FF_PG_DSN"], future=True) - schema = pg_env.get("FF_PG_SCHEMA", "public") - with eng.begin() as c: - c.execute(text(f'set local search_path = "{schema}"')) - orders_count = c.execute(text("select count(*) from orders")).scalar() - assert orders_count and orders_count >= 1 - - mart_users_count = c.execute(text("select count(*) from mart_users")).scalar() - assert mart_users_count and mart_users_count >= 1 diff --git a/tests/integration/artifacts/test_catalog_duckdb_integration.py b/tests/integration/artifacts/test_catalog_duckdb_integration.py index 23d202b..00a0640 100644 --- a/tests/integration/artifacts/test_catalog_duckdb_integration.py +++ b/tests/integration/artifacts/test_catalog_duckdb_integration.py @@ -15,7 +15,6 @@ def test_catalog_duckdb(tmp_path: Path): (tmp_path / "models" / "t.ff.sql").write_text( "create or replace table t as select 1::int as id, 'x'::varchar as email", encoding="utf-8" ) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") REGISTRY.load_project(tmp_path) env = REGISTRY.get_env() ex = DuckExecutor(":memory:") diff --git a/tests/cli/test_test_cmd_schema_merge.py b/tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py similarity index 83% rename from tests/cli/test_test_cmd_schema_merge.py rename to tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py index 64c57bc..51f178c 100644 --- a/tests/cli/test_test_cmd_schema_merge.py +++ b/tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py @@ -1,17 +1,23 @@ from pathlib import Path +import pytest + from fastflowtransform.cli.test_cmd import 
_apply_legacy_tag_filter, _run_dq_tests from fastflowtransform.core import REGISTRY from fastflowtransform.executors.duckdb_exec import DuckExecutor from fastflowtransform.schema_loader import load_schema_tests +@pytest.mark.integration +@pytest.mark.duckdb def test_merge_project_yaml_and_schema_yaml(tmp_path: Path): # Projekt + Modell (tmp_path / "models").mkdir(parents=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") (tmp_path / "project.yml").write_text( """ +name: test_project +version: "0.1" + tests: - type: not_null table: users @@ -46,7 +52,6 @@ def test_merge_project_yaml_and_schema_yaml(tmp_path: Path): ex = DuckExecutor(":memory:") ex.run_sql(REGISTRY.get_node("users.ff"), env) - # Sammeln legacy = [ { "type": "not_null", @@ -58,13 +63,10 @@ def test_merge_project_yaml_and_schema_yaml(tmp_path: Path): ] schema_specs = load_schema_tests(tmp_path) - # Legacy-Tagfilter (nur 'legacy') legacy_only = _apply_legacy_tag_filter(legacy + schema_specs, ["legacy"], legacy_token=True) - res_legacy = _run_dq_tests(ex.con, legacy_only) + res_legacy = _run_dq_tests(ex.con, legacy_only, ex) assert all(r.ok for r in res_legacy) - # Schema-Tagfilter (nur 'schema') schema_only = _apply_legacy_tag_filter(legacy + schema_specs, ["schema"], legacy_token=True) - res_schema = _run_dq_tests(ex.con, schema_only) - # accepted_values mit passendem Wert -> ok (WARN-Fail wäre egal, hier aber grün) + res_schema = _run_dq_tests(ex.con, schema_only, ex) assert all(r.ok or r.severity == "warn" for r in res_schema) diff --git a/tests/integration/test_state_modified_integration.py b/tests/integration/cli/test_state_modified_integration.py similarity index 96% rename from tests/integration/test_state_modified_integration.py rename to tests/integration/cli/test_state_modified_integration.py index a15cb34..fc4dbe8 100644 --- a/tests/integration/test_state_modified_integration.py +++ b/tests/integration/cli/test_state_modified_integration.py @@ -1,14 +1,15 @@ import time 
from pathlib import Path +import pytest from typer.testing import CliRunner from fastflowtransform.cli import app +@pytest.mark.integration def test_state_modified_and_plus(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") (tmp_path / "models" / "a.ff.sql").write_text("select 1 as x", encoding="utf-8") (tmp_path / "models" / "b.ff.sql").write_text( "select * from {{ ref('a.ff') }}", encoding="utf-8" diff --git a/tests/integration/test_version.py b/tests/integration/cli/test_version.py similarity index 92% rename from tests/integration/test_version.py rename to tests/integration/cli/test_version.py index 2721798..b94b0cc 100644 --- a/tests/integration/test_version.py +++ b/tests/integration/cli/test_version.py @@ -3,7 +3,7 @@ import pytest -@pytest.mark.cli +@pytest.mark.integration def test_version_flag(): cp = subprocess.run(["fft", "--version"], check=True, stdout=subprocess.PIPE, text=True) out = cp.stdout.strip() diff --git a/tests/integration/test_buildins_var_this.py b/tests/integration/core/test_buildins_var_this_integration.py similarity index 89% rename from tests/integration/test_buildins_var_this.py rename to tests/integration/core/test_buildins_var_this_integration.py index c575f01..48089d4 100644 --- a/tests/integration/test_buildins_var_this.py +++ b/tests/integration/core/test_buildins_var_this_integration.py @@ -12,9 +12,9 @@ def test_var_overrides_and_this_object(tmp_path: Path): # Arrange: project with project.yml vars and a simple model using var() and this (tmp_path / "models").mkdir(parents=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") (tmp_path / "project.yml").write_text( - "vars:\n day: '2025-10-01'\n limit: 5\n", encoding="utf-8" + "name: test_var_this\nversion: '0.1'\nvars:\n day: '2025-10-01'\n limit: 5\n", + encoding="utf-8", ) # Model prints var('day') and this.name (tmp_path / "models" / "m.ff.sql").write_text( diff --git 
a/tests/integration/test_python_model_dependencies.py b/tests/integration/core/test_python_model_dependencies_integration.py similarity index 79% rename from tests/integration/test_python_model_dependencies.py rename to tests/integration/core/test_python_model_dependencies_integration.py index 0eb2cf6..7860d2b 100644 --- a/tests/integration/test_python_model_dependencies.py +++ b/tests/integration/core/test_python_model_dependencies_integration.py @@ -1,24 +1,26 @@ -# tests/test_py_model_dep_loading.py +# tests/integration/core/test_python_model_dependencies_integration.py import pandas as pd +import pytest from fastflowtransform.core import REGISTRY, Node from fastflowtransform.executors.duckdb_exec import DuckExecutor +@pytest.mark.integration +@pytest.mark.duckdb def test_python_model_dep_loading_single_and_multi(tmp_path): ex = DuckExecutor() con = ex.con - # seed zwei Tabellen + # seed two tables con.execute("create table users as select 1::int as id, 'a@example.com'::varchar as email") con.execute("create table orders as select 1::int as user_id, 10.0::double as order_value") - # registriere zwei Python-Modelle in REGISTRY (vereinfachtes Setup) def one(df: pd.DataFrame) -> pd.DataFrame: return df.assign(flag=True) - def multi(dfs: dict[str, pd.DataFrame]) -> pd.DataFrame: - return dfs["orders"].merge(dfs["users"], left_on="user_id", right_on="id") + def multi(orders: pd.DataFrame, users: pd.DataFrame) -> pd.DataFrame: + return orders.merge(users, left_on="user_id", right_on="id") REGISTRY.py_funcs["u1"] = one REGISTRY.py_funcs["m1"] = multi diff --git a/tests/unit/test_selective_run_subgraph.py b/tests/integration/core/test_selective_run_subgraph_integration.py similarity index 98% rename from tests/unit/test_selective_run_subgraph.py rename to tests/integration/core/test_selective_run_subgraph_integration.py index d5589c8..0e29fab 100644 --- a/tests/unit/test_selective_run_subgraph.py +++ b/tests/integration/core/test_selective_run_subgraph_integration.py @@ 
-1,5 +1,6 @@ from types import SimpleNamespace +import pytest from jinja2 import Environment from typer.testing import CliRunner @@ -22,6 +23,7 @@ def _mk_node(tmp, name, kind="sql", deps=None, mat="table", tags=None): return n +@pytest.mark.integration def test_run_select_includes_upstream_and_excludes_others(tmp_path, monkeypatch): # Graph: # users.ff -> users_enriched @@ -77,6 +79,7 @@ def run_py(node): assert calls["run_py"] == [] +@pytest.mark.integration def test_run_exclude_removes_targets_and_downstream(tmp_path, monkeypatch): # Graph: stg -> mart stg = _mk_node(tmp_path, "stg_events") @@ -126,6 +129,7 @@ def run_py(node): assert set(calls["run_sql"]) == {"stg_events"} +@pytest.mark.integration def test_run_select_conflicts_with_exclude_drops_invalid_targets(tmp_path, monkeypatch): # Graph: A -> B; select B, exclude A ⇒ nothing to run a = _mk_node(tmp_path, "users_enriched", deps=["users.ff"]) diff --git a/tests/integration/examples/config.py b/tests/integration/examples/config.py new file mode 100644 index 0000000..dd076cb --- /dev/null +++ b/tests/integration/examples/config.py @@ -0,0 +1,79 @@ +# tests/integration/examples/config.py +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from tests.common.utils import ROOT + + +@dataclass +class ExampleConfig: + name: str + path: Path + make_target: str + env_by_engine: dict[str, str] + spark_table_formats: list[str] | None = None + + +EXAMPLES: list[ExampleConfig] = [ + ExampleConfig( + name="basic_demo", + path=ROOT / "examples" / "basic_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + ), + ExampleConfig( + name="materializations_demo", + path=ROOT / "examples" / "materializations_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + ), + ExampleConfig( + 
name="dq_demo", + path=ROOT / "examples" / "dq_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + ), + ExampleConfig( + name="macros_demo", + path=ROOT / "examples" / "macros_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + }, + ), + ExampleConfig( + name="api_demo", + path=ROOT / "examples" / "api_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + ), + ExampleConfig( + name="incremental_demo", + path=ROOT / "examples" / "incremental_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + spark_table_formats=["parquet", "delta", "iceberg"], + ), +] diff --git a/tests/integration/examples/test_examples_matrix.py b/tests/integration/examples/test_examples_matrix.py new file mode 100644 index 0000000..f43d77d --- /dev/null +++ b/tests/integration/examples/test_examples_matrix.py @@ -0,0 +1,87 @@ +# tests/integration/examples/test_examples_matrix.py +from __future__ import annotations + +import json +import os +from pathlib import Path +from subprocess import CalledProcessError, run + +import pytest +from tests.integration.examples.config import EXAMPLES + + +def _run_cmd(cmd: list[str], cwd: Path, extra_env: dict[str, str] | None = None) -> None: + env = os.environ.copy() + if extra_env: + env.update(extra_env) + proc = run(cmd, check=False, cwd=str(cwd), env=env, text=True, capture_output=True) + if proc.returncode != 0: + raise CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr) + + +ENGINE_MARKS = { + "duckdb": pytest.mark.duckdb, + "postgres": pytest.mark.postgres, + "databricks_spark": pytest.mark.databricks_spark, +} + +# build only the actually-supported (example, engine) combinations +EXAMPLE_ENGINE_PARAMS = [ + pytest.param( + 
example, + engine, + id=f"{example.name}[{engine}]", + marks=ENGINE_MARKS[engine], + ) + for example in EXAMPLES + for engine in example.env_by_engine +] + +ENGINE_ENV_FIXTURE = { + "duckdb": "duckdb_engine_env", + "postgres": "postgres_engine_env", + "databricks_spark": "spark_engine_env", +} + + +@pytest.mark.integration +@pytest.mark.example +@pytest.mark.parametrize("example,engine", EXAMPLE_ENGINE_PARAMS) +def test_examples_with_all_engines(example, engine, request): + fixture_name = ENGINE_ENV_FIXTURE[engine] + engine_env: dict[str, str] = request.getfixturevalue(fixture_name) + + env = dict(engine_env) + env["FFT_ACTIVE_ENV"] = example.env_by_engine[engine] + + cmd = ["make", example.make_target, f"ENGINE={engine}"] + + if engine == "databricks_spark": + formats = example.spark_table_formats or ["parquet"] + for fmt in formats: + env_fmt = dict(env) + env_fmt["DBR_TABLE_FORMAT"] = fmt + _run_cmd( + [*cmd, f"DBR_TABLE_FORMAT={fmt}"], + cwd=example.path, + extra_env=env_fmt, + ) + else: + _run_cmd(cmd, cwd=example.path, extra_env=env) + + target_dir = example.path / ".fastflowtransform" / "target" + manifest = target_dir / "manifest.json" + run_results = target_dir / "run_results.json" + + assert manifest.exists(), f"{example.name} ({engine}): manifest.json missing" + assert run_results.exists(), f"{example.name} ({engine}): run_results.json missing" + + data = json.loads(run_results.read_text(encoding="utf-8")) + results = data.get("results") or [] + + assert results, f"{example.name} ({engine}): no results in run_results.json" + + all_status = {r.get("status") for r in results} + assert all_status != {"error"}, ( + f"{example.name} ({engine}): all models failed according to run_results.json" + ) diff --git a/tests/executors/duckdb/test_ephemeral_inlining.py b/tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py similarity index 98% rename from tests/executors/duckdb/test_ephemeral_inlining.py rename to 
tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py index b91f33d..f185132 100644 --- a/tests/executors/duckdb/test_ephemeral_inlining.py +++ b/tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py @@ -18,6 +18,8 @@ def _w(p: Path, txt: str) -> Path: return p +@pytest.mark.duckdb +@pytest.mark.integration def test_ephemeral_inlining_end_to_end(tmp_path: Path): proj = tmp_path models = proj / "models" diff --git a/tests/integration/test_executor_meta_hook_duckdb.py b/tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py similarity index 99% rename from tests/integration/test_executor_meta_hook_duckdb.py rename to tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py index 1e83384..a2a4c9f 100644 --- a/tests/integration/test_executor_meta_hook_duckdb.py +++ b/tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py @@ -42,6 +42,7 @@ def fake_resolve_profile(env_name, engine, proj): @pytest.mark.duckdb +@pytest.mark.integration def test_executor_meta_hook_duckdb_build_then_skip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): # Arrange project with a single SQL model proj = tmp_path / "proj" diff --git a/tests/unit/test_executor_meta_hook_smoke.py b/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py similarity index 89% rename from tests/unit/test_executor_meta_hook_smoke.py rename to tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py index bdc99b2..a4871e3 100644 --- a/tests/unit/test_executor_meta_hook_smoke.py +++ b/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py @@ -2,10 +2,14 @@ from pathlib import Path +import pytest + from fastflowtransform.core import Node from fastflowtransform.executors.duckdb_exec import DuckExecutor +@pytest.mark.integration +@pytest.mark.duckdb def test_duckdb_on_node_built_no_crash(tmp_path: Path): # Smoke-test: calling the hook must not raise errors (best-effort 
semantics) ex = DuckExecutor(db_path=":memory:") diff --git a/tests/executors/duckdb/test_materializations.py b/tests/integration/executors/duckdb/test_materializations_integration.py similarity index 93% rename from tests/executors/duckdb/test_materializations.py rename to tests/integration/executors/duckdb/test_materializations_integration.py index 875184a..18e8e32 100644 --- a/tests/executors/duckdb/test_materializations.py +++ b/tests/integration/executors/duckdb/test_materializations_integration.py @@ -1,4 +1,4 @@ -# tests/duckdb/test_materializations_duckdb.py +# tests/integration/executors/duckdb/test_materializations_integration.py from pathlib import Path import pytest @@ -8,12 +8,11 @@ @pytest.mark.duckdb +@pytest.mark.integration def test_materializations_and_ephemeral_inlining_duckdb(tmp_path: Path): # Arrange: minimal project with 3 models (table/view/ephemeral) + consumer models = tmp_path / "models" models.mkdir(parents=True, exist_ok=True) - # empty sources.yml so source(...) 
is available if needed - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") # Base model (default: table) (models / "base.ff.sql").write_text( diff --git a/tests/executors/duckdb/test_python_dependency_loading.py b/tests/integration/executors/duckdb/test_python_dependency_loading_integration.py similarity index 80% rename from tests/executors/duckdb/test_python_dependency_loading.py rename to tests/integration/executors/duckdb/test_python_dependency_loading_integration.py index 98de31b..95e55a6 100644 --- a/tests/executors/duckdb/test_python_dependency_loading.py +++ b/tests/integration/executors/duckdb/test_python_dependency_loading_integration.py @@ -6,6 +6,7 @@ @pytest.mark.duckdb +@pytest.mark.integration def test_duckdb_executor_dep_loading_unit(tmp_path): ex = DuckExecutor() con = ex.con @@ -15,9 +16,8 @@ def test_duckdb_executor_dep_loading_unit(tmp_path): "select 101::int as order_id, 1::int as user_id, 10.0::double as amount" ) - # registriere ein Multi-Dep Python-Modell on-the-fly - def multi(dfs: dict[str, pd.DataFrame]) -> pd.DataFrame: - return dfs["orders"].merge(dfs["users"], left_on="user_id", right_on="id") + def multi(orders: pd.DataFrame, users: pd.DataFrame) -> pd.DataFrame: + return orders.merge(users, left_on="user_id", right_on="id") REGISTRY.py_funcs["m1"] = multi REGISTRY.nodes["m1"] = Node( diff --git a/tests/executors/duckdb/test_python_model_materialized_view.py b/tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py similarity index 94% rename from tests/executors/duckdb/test_python_model_materialized_view.py rename to tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py index 35b6e24..43cf4d5 100644 --- a/tests/executors/duckdb/test_python_model_materialized_view.py +++ b/tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py @@ -1,4 +1,4 @@ -# tests/test_python_model_materialized_view.py +# 
tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py from pathlib import Path import pytest @@ -8,11 +8,11 @@ @pytest.mark.duckdb +@pytest.mark.integration def test_python_model_materialized_as_view(tmp_path: Path, monkeypatch): # Arrange: minimal project m = tmp_path / "models" m.mkdir() - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") # Seed base table via SQL model (m / "base.ff.sql").write_text("select 1 as id, 'x@gmail.com' as email;", encoding="utf-8") diff --git a/tests/integration/executors/test_databricks_spark_exec_integration.py b/tests/integration/executors/test_databricks_spark_exec_integration.py index 15ec0b6..cd5b7b3 100644 --- a/tests/integration/executors/test_databricks_spark_exec_integration.py +++ b/tests/integration/executors/test_databricks_spark_exec_integration.py @@ -13,7 +13,7 @@ @pytest.mark.integration -@pytest.mark.spark +@pytest.mark.databricks_spark def test_create_table_and_exists(spark_exec: DatabricksSparkExecutor): spark_exec.create_table_as("default.it_users", "SELECT 1 AS id, 'x' AS name") assert spark_exec.exists_relation("default.it_users") @@ -21,7 +21,7 @@ def test_create_table_and_exists(spark_exec: DatabricksSparkExecutor): @pytest.mark.integration -@pytest.mark.spark +@pytest.mark.databricks_spark def test_incremental_insert_integration(spark_exec: DatabricksSparkExecutor): spark_exec.create_table_as("it_inc", "SELECT 1 AS id") spark_exec.incremental_insert("it_inc", "SELECT 2 AS id") @@ -30,9 +30,9 @@ def test_incremental_insert_integration(spark_exec: DatabricksSparkExecutor): @pytest.mark.integration -@pytest.mark.spark -def test_incremental_merge_integration(spark_exec: DatabricksSparkExecutor): - spark_exec.create_table_as("it_merge", "SELECT 1 AS id, 'old' AS v") +@pytest.mark.databricks_spark +def test_incremental_merge_fallback_without_delta(spark_exec: DatabricksSparkExecutor): + spark_exec.create_table_as("it_merge_fallback", "SELECT 1 AS id, 'old' AS v") sql = 
""" SELECT * FROM ( SELECT 1 AS id, 'new' AS v @@ -40,13 +40,15 @@ def test_incremental_merge_integration(spark_exec: DatabricksSparkExecutor): SELECT 2 AS id, 'other' AS v ) s """ - spark_exec.incremental_merge("it_merge", sql, unique_key=["id"]) - rows = {(r["id"], r["v"]) for r in spark_exec.spark.sql("SELECT * FROM it_merge").collect()} + spark_exec.incremental_merge("it_merge_fallback", sql, unique_key=["id"]) + rows = { + (r["id"], r["v"]) for r in spark_exec.spark.sql("SELECT * FROM it_merge_fallback").collect() + } assert rows == {(1, "new"), (2, "other")} @pytest.mark.integration -@pytest.mark.spark +@pytest.mark.databricks_spark def test_alter_table_sync_schema_integration(spark_exec: DatabricksSparkExecutor): spark_exec.create_table_as("it_schema", "SELECT 1 AS id") spark_exec.alter_table_sync_schema("it_schema", "SELECT 1 AS id, 'x' AS extra") @@ -55,7 +57,7 @@ def test_alter_table_sync_schema_integration(spark_exec: DatabricksSparkExecutor @pytest.mark.integration -@pytest.mark.spark +@pytest.mark.databricks_spark def test_create_or_replace_table_wraps_error(spark_exec: DatabricksSparkExecutor): bad_sql = "SELECT * FROM not_there" node = Node(name="bad_node", kind="sql", path=Path("dummy")) @@ -64,7 +66,7 @@ def test_create_or_replace_table_wraps_error(spark_exec: DatabricksSparkExecutor @pytest.mark.integration -@pytest.mark.spark +@pytest.mark.databricks_spark def test_materialize_relation_real(spark_exec: DatabricksSparkExecutor): df = spark_exec.spark.createDataFrame([(1, "x")], ["id", "val"]) node = Node(name="it_node", kind="python", path=Path("x")) @@ -74,7 +76,7 @@ def test_materialize_relation_real(spark_exec: DatabricksSparkExecutor): @pytest.mark.integration -@pytest.mark.spark +@pytest.mark.databricks_spark def test_create_view_over_table_real(spark_exec: DatabricksSparkExecutor): """Create a table and a view over it using simple, backtick-safe names.""" # 1) create a table WITHOUT a dot in the name diff --git 
a/tests/unit/test_meta_bigquery_fake.py b/tests/integration/meta/test_meta_bigquery_fake_integration.py similarity index 95% rename from tests/unit/test_meta_bigquery_fake.py rename to tests/integration/meta/test_meta_bigquery_fake_integration.py index 671d2cf..ab631ca 100644 --- a/tests/unit/test_meta_bigquery_fake.py +++ b/tests/integration/meta/test_meta_bigquery_fake_integration.py @@ -1,9 +1,11 @@ -# tests/unit/test_meta_bigquery_fake.py +# tests/integration/test_meta_bigquery_fake_integration.py from __future__ import annotations import re from types import SimpleNamespace +import pytest + from fastflowtransform.meta import ensure_meta_table, get_meta, relation_exists, upsert_meta @@ -56,6 +58,8 @@ def _grab(field: str) -> str: return FakeBQClient._Res([]) +@pytest.mark.integration +@pytest.mark.bigquery def test_bigquery_meta_with_fake(): client = FakeBQClient() ex = SimpleNamespace(client=client, dataset="dset", project="proj") diff --git a/tests/unit/test_meta_duckdb.py b/tests/integration/meta/test_meta_duckdb_integration.py similarity index 91% rename from tests/unit/test_meta_duckdb.py rename to tests/integration/meta/test_meta_duckdb_integration.py index fe900fa..b843720 100644 --- a/tests/unit/test_meta_duckdb.py +++ b/tests/integration/meta/test_meta_duckdb_integration.py @@ -1,12 +1,16 @@ -# tests/unit/test_meta_duckdb.py +# tests/integration/meta/test_meta_duckdb_integration.py from __future__ import annotations from pathlib import Path +import pytest + from fastflowtransform.executors.duckdb_exec import DuckExecutor from fastflowtransform.meta import ensure_meta_table, get_meta, relation_exists, upsert_meta +@pytest.mark.integration +@pytest.mark.duckdb def test_duckdb_meta_roundtrip(tmp_path: Path): ex = DuckExecutor(db_path=str(tmp_path / "t.duckdb")) ensure_meta_table(ex) diff --git a/tests/integration/test_meta_postgres.py b/tests/integration/meta/test_meta_postgres.py similarity index 95% rename from tests/integration/test_meta_postgres.py 
rename to tests/integration/meta/test_meta_postgres.py index 9bc2859..970aed3 100644 --- a/tests/integration/test_meta_postgres.py +++ b/tests/integration/meta/test_meta_postgres.py @@ -13,6 +13,8 @@ pytestmark = pytest.mark.postgres # mark to opt-in in CI +@pytest.mark.postgres +@pytest.mark.integration def test_postgres_meta_roundtrip(pg_env): engine = create_engine(pg_env["FF_PG_DSN"]) ex = SimpleNamespace(engine=engine, schema=os.getenv("FF_PG_SCHEMA", "public")) diff --git a/tests/integration/test_schema_yaml_basic.py b/tests/integration/schema_loader/test_schema_yaml_basic_integration.py similarity index 92% rename from tests/integration/test_schema_yaml_basic.py rename to tests/integration/schema_loader/test_schema_yaml_basic_integration.py index a7f55d8..18a7d08 100644 --- a/tests/integration/test_schema_yaml_basic.py +++ b/tests/integration/schema_loader/test_schema_yaml_basic_integration.py @@ -8,10 +8,10 @@ from fastflowtransform.schema_loader import load_schema_tests +@pytest.mark.integration @pytest.mark.duckdb def test_schema_yaml_runs_basic_checks(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") (tmp_path / "models" / "users.ff.sql").write_text( "create or replace table users as select 1 as id, 'a@example.com' as email", encoding="utf-8", @@ -44,7 +44,7 @@ def test_schema_yaml_runs_basic_checks(tmp_path: Path): specs = load_schema_tests(tmp_path) specs = _apply_legacy_tag_filter(specs, ["batch"], legacy_token=True) - results = _run_dq_tests(ex.con, specs) + results = _run_dq_tests(ex.con, specs, ex) error_fails = [r for r in results if (not r.ok) and r.severity != "warn"] assert error_fails == [] diff --git a/tests/integration/test_schema_yaml_registry_mix.py b/tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py similarity index 92% rename from tests/integration/test_schema_yaml_registry_mix.py rename to 
tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py index 6e5888d..b8fe817 100644 --- a/tests/integration/test_schema_yaml_registry_mix.py +++ b/tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py @@ -8,10 +8,10 @@ from fastflowtransform.schema_loader import load_schema_tests +@pytest.mark.integration @pytest.mark.duckdb def test_mix_multiple_tests_per_column(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") (tmp_path / "models" / "u.ff.sql").write_text( "create or replace table u as select 1 as id, " "'x@example.com' as email union all select 1, 'bad@x.com'", @@ -39,7 +39,7 @@ def test_mix_multiple_tests_per_column(tmp_path: Path): ex = DuckExecutor(":memory:") ex.run_sql(REGISTRY.get_node("u.ff"), env) specs = load_schema_tests(tmp_path) - res = _run_dq_tests(ex.con, specs) + res = _run_dq_tests(ex.con, specs, ex) # Both should fail with error severity assert any((not r.ok) and r.kind == "unique" for r in res) assert any((not r.ok) and r.kind == "accepted_values" for r in res) diff --git a/tests/integration/test_profiles_validation.py b/tests/integration/settings/test_profiles_validation.py similarity index 86% rename from tests/integration/test_profiles_validation.py rename to tests/integration/settings/test_profiles_validation.py index 06ce44e..a52d6b0 100644 --- a/tests/integration/test_profiles_validation.py +++ b/tests/integration/settings/test_profiles_validation.py @@ -1,4 +1,4 @@ -# tests/integration/test_profiles_validation.py +# tests/integration/settings/test_profiles_validation_integration.py from __future__ import annotations import os @@ -17,6 +17,7 @@ def _write_profiles(tmp_path: Path, yaml_text: str) -> None: ) +@pytest.mark.integration @pytest.mark.parametrize( "case_name,profiles_yml,env_kwargs,expect_error,expect_substring", [ @@ -110,12 +111,24 @@ def _write_profiles(tmp_path: Path, yaml_text: str) -> None: password: "" 
warehouse: "" database: "" - schema: "" + db_schema: "" """, {}, True, "Snowflake profile missing:", ), + ( + "missing_profile_without_default", + """ + dev: + engine: duckdb + duckdb: + path: ":memory:" + """, + {}, + True, + "Profile 'prod' not found in profiles.yml", + ), ], ) def test_profiles_validation( @@ -134,17 +147,18 @@ def test_profiles_validation( monkeypatch.delenv(key, raising=False) env = EnvSettings(**env_kwargs) + requested_env = "prod" if case_name == "missing_profile_without_default" else "dev" if expect_error: with pytest.raises(ProfileConfigError) as exc: - resolve_profile(tmp_path, "dev", env) + resolve_profile(tmp_path, requested_env, env) msg = str(exc.value) # single-line hint (no embedded newlines) assert "\n" not in msg, f"{case_name}: error message must be single-line" if expect_substring: assert expect_substring in msg, f"{case_name}: expected hint not found:\n{msg}" else: - prof = resolve_profile(tmp_path, "dev", env) + prof = resolve_profile(tmp_path, requested_env, env) # sanity: returns the right engine type assert prof.engine in { "duckdb", diff --git a/tests/integration/test_smoke_streaming.py b/tests/integration/streaming/test_smoke_streaming.py similarity index 96% rename from tests/integration/test_smoke_streaming.py rename to tests/integration/streaming/test_smoke_streaming.py index 55c41f9..13216d4 100644 --- a/tests/integration/test_smoke_streaming.py +++ b/tests/integration/streaming/test_smoke_streaming.py @@ -4,12 +4,12 @@ import pandas as pd import pytest -from fastflowtransform import testing from fastflowtransform.streaming import StreamSessionizer +from fastflowtransform.testing import base as testing +@pytest.mark.integration @pytest.mark.streaming -@pytest.mark.slow def test_stream_sessionizer_produces_sessions(): delay_amt = 15 expected_rows_count = 2 diff --git a/tests/integration/test_artifacts_e2e.py b/tests/integration/test_artifacts_integration.py similarity index 89% rename from 
tests/integration/test_artifacts_e2e.py rename to tests/integration/test_artifacts_integration.py index b5dde23..5d28dff 100644 --- a/tests/integration/test_artifacts_e2e.py +++ b/tests/integration/test_artifacts_integration.py @@ -1,6 +1,8 @@ import json from pathlib import Path +import pytest + from fastflowtransform.artifacts import ( RunNodeResult, write_catalog, @@ -11,13 +13,19 @@ from fastflowtransform.executors.duckdb_exec import DuckExecutor +@pytest.mark.integration +@pytest.mark.duckdb def test_artifacts_all_written(tmp_path: Path): # Project (tmp_path / "models").mkdir(parents=True) (tmp_path / "models" / "m.ff.sql").write_text( "create or replace table m as select 1 as id", encoding="utf-8" ) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") + (tmp_path / "sources.yml").write_text( + "version: 2\nsources: []\n", + encoding="utf-8", + ) + REGISTRY.load_project(tmp_path) env = REGISTRY.get_env() ex = DuckExecutor(":memory:") diff --git a/tests/integration/test_cli_vars_flag.py b/tests/integration/test_cli_vars_flag.py deleted file mode 100644 index 603b6fa..0000000 --- a/tests/integration/test_cli_vars_flag.py +++ /dev/null @@ -1,44 +0,0 @@ -# tests/test_cli_vars_flag.py -from pathlib import Path - -import pytest -from typer.testing import CliRunner - -from fastflowtransform.cli import app -from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor - -runner = CliRunner() - - -@pytest.mark.cli -def test_cli_vars_available_in_templates(tmp_path: Path, monkeypatch): - models = tmp_path / "models" - models.mkdir(parents=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") - (tmp_path / "project.yml").write_text("vars:\n day: '2000-01-01'\n", encoding="utf-8") - - # Model uses var('day'); result materializes into table 'm' - (models / "m.ff.sql").write_text( - "select '{{ var(\"day\") }}' as d;", - encoding="utf-8", - ) - - # Run CLI with override - result = runner.invoke( - 
app, ["run", str(tmp_path), "--env", "dev", "--vars", "day='2025-10-01'"] - ) - assert result.exit_code == 0, result.output - - # Verify materialized data via a quick DuckDB read (in-memory run path may vary in your setup) - # If your 'run' command uses DuckDB by default, connect and check: - ex = DuckExecutor(":memory:") - # Re-render just to read the table in this test session: - REGISTRY.load_project(tmp_path) - REGISTRY.cli_vars = {"day": "2025-10-01"} - env = REGISTRY.get_env() - ex.run_sql(REGISTRY.nodes["m.ff"], env) - val_rows = ex.con.execute("select * from m").fetchone() - assert val_rows is not None - val = val_rows[0] - assert val == "2025-10-01" diff --git a/tests/integration/test_incremental_duckdb_smoke.py b/tests/integration/test_incremental_duckdb_smoke.py deleted file mode 100644 index 274b760..0000000 --- a/tests/integration/test_incremental_duckdb_smoke.py +++ /dev/null @@ -1,7 +0,0 @@ -from tests.common.utils import run - - -def test_incremental_smoke_duckdb(duckdb_project, duckdb_env): - # Expects model with materialized='incremental' - res = run(["fft", "run", str(duckdb_project), "--env", "dev"], duckdb_env) - assert res.returncode == 0 diff --git a/tests/integration/test_seeding.py b/tests/integration/test_seeding.py deleted file mode 100644 index f9906f7..0000000 --- a/tests/integration/test_seeding.py +++ /dev/null @@ -1,34 +0,0 @@ -from pathlib import Path - -import pytest -from jinja2 import Environment, FileSystemLoader - -from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor -from fastflowtransform.executors.postgres_exec import PostgresExecutor -from fastflowtransform.seeding import seed_project - - -def _make_env(project: Path) -> Environment: - return Environment(loader=FileSystemLoader(str(project / "models"))) - - -@pytest.mark.duckdb -def test_duckdb_unit_seeding(tmp_path, duckdb_project): - ex = DuckExecutor(db_path=":memory:") - # Load registry + Jinja so run() works 
later - REGISTRY.load_project(duckdb_project) - - # Load seeds directly (without the CLI) - n = seed_project(duckdb_project, ex) - assert n >= 1 - - # Afterwards you can execute nodes via ex.run_sql/ex.run_python - # or continue using the CLI in other tests. - - -@pytest.mark.postgres -def test_pg_unit_seeding(pg_project, pg_env): - ex = PostgresExecutor(dsn=pg_env["FF_PG_DSN"], schema=pg_env.get("FF_PG_SCHEMA")) - n = seed_project(pg_project, ex, default_schema=pg_env.get("FF_PG_SCHEMA")) - assert n >= 1 diff --git a/tests/integration/test_smoke_duckdb.py b/tests/integration/test_smoke_duckdb.py deleted file mode 100644 index b104d2d..0000000 --- a/tests/integration/test_smoke_duckdb.py +++ /dev/null @@ -1,104 +0,0 @@ -import os -import shutil -import subprocess - -import duckdb -import pytest - -from tests.common.utils import ROOT, run - -PROJECT = ROOT / "examples" / "simple_duckdb" -DOCS = PROJECT / "site" / "dag" -PROJECT_LOCAL = PROJECT / ".local" -DB = PROJECT_LOCAL / "demo.duckdb" - -ENV = os.environ.copy() -ENV.setdefault("FF_ENGINE", "duckdb") -ENV.setdefault("FF_DUCKDB_PATH", str(DB)) # erzwinge konsistenten DB-Ort - - -def setup_module(module): - # clean previous artifacts im Projektkontext - if DOCS.exists(): - shutil.rmtree(DOCS) - if DB.exists(): - DB.unlink() - PROJECT_LOCAL.mkdir(exist_ok=True, parents=True) - - # Seed via Make (im Projektordner) - try: - subprocess.run( - ["make", "-C", str(PROJECT), "seed"], - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - env=ENV, - ) - except subprocess.CalledProcessError as e: - pytest.skip(f"Seed fehlgeschlagen (DuckDB CLI fehlt?) 
- skippe Smoke-Tests.\n{e.stdout}") - - -@pytest.mark.duckdb -@pytest.mark.cli -@pytest.mark.slow -def test_run_builds_tables(duckdb_seeded, duckdb_project, duckdb_env): - run(["fft", "run", str(duckdb_project), "--env", "dev"], duckdb_env) - - -@pytest.mark.duckdb -@pytest.mark.cli -@pytest.mark.slow -def test_batch_tests_green(duckdb_seeded, duckdb_project, duckdb_env): - run(["fft", "test", str(duckdb_project), "--env", "dev", "--select", "batch"], duckdb_env) - - -@pytest.mark.duckdb -@pytest.mark.cli -@pytest.mark.slow -def test_html_dag_generated(): - run(["fft", "dag", str(PROJECT), "--env", "dev", "--html"]) - assert (DOCS / "index.html").exists(), "index.html was not created" - - -@pytest.mark.duckdb -@pytest.mark.cli -@pytest.mark.slow -def test_duckdb_end_to_end_with_multi_deps( - duckdb_seeded, duckdb_project, duckdb_env, duckdb_db_path -): - run(["fft", "run", str(duckdb_project), "--env", "dev"], duckdb_env) - con = duckdb.connect(str(DB)) - # users (seed) -> users_enriched (python, 1 dep) - users_rows = con.execute("select count(*) from users").fetchone() - assert users_rows is not None - assert users_rows[0] >= 1 - - users_enriched_rows = con.execute("select count(*) from users_enriched").fetchone() - assert users_enriched_rows is not None - assert users_enriched_rows[0] >= 1 - - cols_users_enriched = [r[0] for r in con.execute("describe users_enriched").fetchall()] - assert {"id", "email", "is_gmail"}.issubset(set(cols_users_enriched)) - - # orders (seed) - orders_rows = con.execute("select count(*) from orders").fetchone() - assert orders_rows is not None - assert orders_rows[0] >= 1 - - # mart_orders_enriched (python, >1 deps via dict[str,df]) - mart_orders_enriched_rows = con.execute("select count(*) from mart_orders_enriched").fetchone() - assert mart_orders_enriched_rows is not None - assert mart_orders_enriched_rows[0] >= 1 - - cols_moe = [r[0] for r in con.execute("describe mart_orders_enriched").fetchall()] - assert {"order_id", "user_id", 
"email", "is_gmail", "amount", "valid_amt"}.issubset( - set(cols_moe) - ) - - # einfache Qualitätschecks auf dem Multi-Dep-Ergebnis - mart_orders_enriched_neg_rows = con.execute( - "select count(*) from mart_orders_enriched where amount < 0" - ).fetchone() - assert mart_orders_enriched_neg_rows is not None - assert mart_orders_enriched_neg_rows[0] == 0 diff --git a/tests/integration/test_smoke_postgres.py b/tests/integration/test_smoke_postgres.py deleted file mode 100644 index 270bc3c..0000000 --- a/tests/integration/test_smoke_postgres.py +++ /dev/null @@ -1,78 +0,0 @@ -import os - -import pytest -from sqlalchemy import create_engine, text - -from tests.common.utils import ROOT, run - -PROJECT = ROOT / "examples" / "postgres" -DOCS = PROJECT / "site" / "dag" - -PG_DSN = os.environ.get( - "FF_PG_DSN", - "postgresql+psycopg://postgres:postgres@localhost:5432/ffdb", -) - - -@pytest.mark.postgres -@pytest.mark.cli -@pytest.mark.slow -def test_pg_html_dag_generated(pg_env): - run(["fft", "dag", str(PROJECT), "--env", "stg", "--html"], pg_env) - assert (DOCS / "index.html").exists(), "index.html was not created" - - -@pytest.mark.postgres -@pytest.mark.cli -@pytest.mark.slow -def test_pg_batch_tests_green(pg_env): - run(["fft", "test", str(PROJECT), "--env", "stg", "--select", "batch"], pg_env) - - -@pytest.mark.postgres -@pytest.mark.slow -def test_pg_result_exists(): - engine = create_engine(PG_DSN, future=True) - with engine.begin() as conn: - n = conn.execute(text("select count(*) from mart_users")).scalar() - assert n is not None and n >= 1 - - -@pytest.mark.postgres -@pytest.mark.cli -@pytest.mark.slow -def test_pg_run_builds_tables(pg_project, pg_env): - # 1) Load seeds (uses FF_ENGINE=postgres & DSN) - run(["fft", "seed", str(pg_project), "--env", "stg"], pg_env) - - # 2) Run the pipeline - run(["fft", "run", str(pg_project), "--env", "stg"], pg_env) - - -@pytest.mark.postgres -@pytest.mark.slow -def test_pg_multi_dep_model_exists(pg_env): - engine = 
create_engine(pg_env["FF_PG_DSN"], future=True) - schema = pg_env.get("FF_PG_SCHEMA", "public") - with engine.begin() as conn: - conn.execute(text(f'set local search_path = "{schema}"')) - n = conn.execute(text("select count(*) from mart_orders_enriched")).scalar() - assert n is not None and n >= 1 - - -@pytest.mark.postgres -@pytest.mark.slow -def test_pg_multi_dep_model_columns(pg_env): - engine = create_engine(PG_DSN, future=True) - with engine.begin() as conn: - cols = [ - r[0] - for r in conn.execute( - text(""" - select column_name from information_schema.columns - where table_name = 'mart_orders_enriched' - """) - ).fetchall() - ] - for c in ["order_id", "user_id", "email", "is_gmail", "amount", "valid_amt"]: - assert c in cols diff --git a/tests/integration/test_registry/test_dispatch_integration.py b/tests/integration/testing/registry/test_dispatch_integration.py similarity index 93% rename from tests/integration/test_registry/test_dispatch_integration.py rename to tests/integration/testing/registry/test_dispatch_integration.py index d87a352..f09be81 100644 --- a/tests/integration/test_registry/test_dispatch_integration.py +++ b/tests/integration/testing/registry/test_dispatch_integration.py @@ -1,7 +1,7 @@ import pytest from fastflowtransform.executors.duckdb_exec import DuckExecutor -from fastflowtransform.test_registry import TESTS +from fastflowtransform.testing.registry import TESTS @pytest.mark.integration diff --git a/tests/unit/test_utest_cache_flag.py b/tests/integration/utest/test_utest_cache_flag_integration.py similarity index 98% rename from tests/unit/test_utest_cache_flag.py rename to tests/integration/utest/test_utest_cache_flag_integration.py index 6c9d8c8..20e3e31 100644 --- a/tests/unit/test_utest_cache_flag.py +++ b/tests/integration/utest/test_utest_cache_flag_integration.py @@ -44,6 +44,7 @@ def fake_make_executor(_prof, _env): monkeypatch.setattr("fastflowtransform.cli.bootstrap._make_executor", fake_make_executor) 
+@pytest.mark.integration def test_utest_cache_default_off(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): _stub_minimal_context(monkeypatch, tmp_path) @@ -66,6 +67,7 @@ def fake_run(specs, executor, jenv, only_case=None, **kw): assert captured.get("reuse_meta") is False +@pytest.mark.integration def test_utest_cache_rw_and_reuse(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): _stub_minimal_context(monkeypatch, tmp_path) @@ -87,6 +89,7 @@ def fake_run(specs, executor, jenv, only_case=None, **kw): assert captured.get("reuse_meta") is True +@pytest.mark.integration def test_utest_cache_ro(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): _stub_minimal_context(monkeypatch, tmp_path) diff --git a/tests/unit/api/test_rate_limit_unit.py b/tests/unit/api/test_rate_limit_unit.py index 9fa8109..88ddc0b 100644 --- a/tests/unit/api/test_rate_limit_unit.py +++ b/tests/unit/api/test_rate_limit_unit.py @@ -1,233 +1,233 @@ -# tests/unit/api/test_rate_limit_unit.py -from __future__ import annotations +# # tests/unit/api/test_rate_limit_unit.py +# from __future__ import annotations -from typing import Any +# from typing import Any -import pytest +# import pytest -import fastflowtransform.api.rate_limit as rl_mod +# import fastflowtransform.api.rate_limit as rl_mod -@pytest.fixture(autouse=True) -def _reset_rate_limiter(): - """Ensure each test runs with a clean module-level state.""" - rl_mod.reset() - yield - rl_mod.reset() +# @pytest.fixture(autouse=True) +# def _reset_rate_limiter(): +# """Ensure each test runs with a clean module-level state.""" +# rl_mod.reset() +# yield +# rl_mod.reset() -@pytest.mark.unit -def test_tokenbucket_try_consume_enough_tokens(monkeypatch): - """try_consume should return True and deduct tokens when bucket has enough.""" - tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) - tb._tokens = 3.0 - monkeypatch.setattr(rl_mod, "monotonic", lambda: 100.0) +# @pytest.mark.unit +# def test_tokenbucket_try_consume_enough_tokens(monkeypatch): +# 
"""try_consume should return True and deduct tokens when bucket has enough.""" +# tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) +# tb._tokens = 3.0 +# monkeypatch.setattr(rl_mod, "monotonic", lambda: 100.0) - ok = tb.try_consume(2.0) - assert ok is True - assert tb._tokens == pytest.approx(1.0) +# ok = tb.try_consume(2.0) +# assert ok is True +# assert tb._tokens == pytest.approx(1.0) -@pytest.mark.unit -def test_tokenbucket_try_consume_not_enough_tokens(monkeypatch): - """try_consume should return False when bucket has not enough tokens.""" - tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) - tb._tokens = 0.5 - monkeypatch.setattr(rl_mod, "monotonic", lambda: 50.0) +# @pytest.mark.unit +# def test_tokenbucket_try_consume_not_enough_tokens(monkeypatch): +# """try_consume should return False when bucket has not enough tokens.""" +# tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) +# tb._tokens = 0.5 +# monkeypatch.setattr(rl_mod, "monotonic", lambda: 50.0) - ok = tb.try_consume(1.0) - assert ok is False - # token count should stay the same - assert tb._tokens == pytest.approx(0.5) +# ok = tb.try_consume(1.0) +# assert ok is False +# # token count should stay the same +# assert tb._tokens == pytest.approx(0.5) -@pytest.mark.unit -def test_tokenbucket_refills_on_try_consume(monkeypatch): - """try_consume should trigger a refill before checking tokens.""" - tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=2.0) - tb._tokens = 0.0 - tb._last_refill = 10.0 - # now - last_refill = 1s -> +2 tokens - monkeypatch.setattr(rl_mod, "monotonic", lambda: 11.0) +# @pytest.mark.unit +# def test_tokenbucket_refills_on_try_consume(monkeypatch): +# """try_consume should trigger a refill before checking tokens.""" +# tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=2.0) +# tb._tokens = 0.0 +# tb._last_refill = 10.0 +# # now - last_refill = 1s -> +2 tokens +# monkeypatch.setattr(rl_mod, "monotonic", lambda: 11.0) - ok = tb.try_consume(1.0) - assert 
ok is True - # 2 - 1 = 1 - assert tb._tokens == pytest.approx(1.0) +# ok = tb.try_consume(1.0) +# assert ok is True +# # 2 - 1 = 1 +# assert tb._tokens == pytest.approx(1.0) -@pytest.mark.unit -def test_tokenbucket_wait_does_not_block_when_enough(monkeypatch): - """wait() should return immediately if enough tokens are present.""" - # freeze time first so both _last_refill and later calls are the same - monkeypatch.setattr(rl_mod, "monotonic", lambda: 200.0) +# @pytest.mark.unit +# def test_tokenbucket_wait_does_not_block_when_enough(monkeypatch): +# """wait() should return immediately if enough tokens are present.""" +# # freeze time first so both _last_refill and later calls are the same +# monkeypatch.setattr(rl_mod, "monotonic", lambda: 200.0) - tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) - tb._tokens = 4.0 - # make sure no extra refill happens inside wait() - tb._last_refill = 200.0 +# tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) +# tb._tokens = 4.0 +# # make sure no extra refill happens inside wait() +# tb._last_refill = 200.0 - called: dict[str, bool] = {"sleep": False} +# called: dict[str, bool] = {"sleep": False} - def fake_sleep(_dur: float) -> None: - called["sleep"] = True +# def fake_sleep(_dur: float) -> None: +# called["sleep"] = True - monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) +# monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) - tb.wait(2.0) +# tb.wait(2.0) - # should not have slept - assert called["sleep"] is False - # consumed exactly 2 - assert tb._tokens == pytest.approx(2.0) +# # should not have slept +# assert called["sleep"] is False +# # consumed exactly 2 +# assert tb._tokens == pytest.approx(2.0) -@pytest.mark.unit -def test_tokenbucket_wait_blocks_once_and_consumes(monkeypatch): - """wait() should sleep exactly once when tokens are not yet available.""" - tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) - tb._tokens = 0.0 - tb._last_refill = 10.0 +# @pytest.mark.unit +# def 
test_tokenbucket_wait_blocks_once_and_consumes(monkeypatch): +# """wait() should sleep exactly once when tokens are not yet available.""" +# tb = rl_mod.TokenBucket(capacity=5.0, refill_per_sec=1.0) +# tb._tokens = 0.0 +# tb._last_refill = 10.0 - # 1st call -> 10.0 → not enough → sleep(1.0) - # 2nd call after sleep -> 11.0 → +1 token → consume - times = [10.0, 11.0] +# # 1st call -> 10.0 → not enough → sleep(1.0) +# # 2nd call after sleep -> 11.0 → +1 token → consume +# times = [10.0, 11.0] - def fake_monotonic() -> float: - return times.pop(0) +# def fake_monotonic() -> float: +# return times.pop(0) - slept_for: list[float] = [] +# slept_for: list[float] = [] - def fake_sleep(dur: float) -> None: - slept_for.append(dur) +# def fake_sleep(dur: float) -> None: +# slept_for.append(dur) - monkeypatch.setattr(rl_mod, "monotonic", fake_monotonic) - monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) +# monkeypatch.setattr(rl_mod, "monotonic", fake_monotonic) +# monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) - tb.wait(1.0) +# tb.wait(1.0) - assert len(slept_for) == 1 - assert slept_for[0] == pytest.approx(1.0) - # after consuming the freshly refilled token - assert tb._tokens == pytest.approx(0.0) +# assert len(slept_for) == 1 +# assert slept_for[0] == pytest.approx(1.0) +# # after consuming the freshly refilled token +# assert tb._tokens == pytest.approx(0.0) -@pytest.mark.unit -def test_tokenbucket_wait_disabled_does_nothing(monkeypatch): - """If capacity/rps <= 0, wait() should be a no-op.""" - tb = rl_mod.TokenBucket(capacity=0.0, refill_per_sec=1.0) - called = {"sleep": False} +# @pytest.mark.unit +# def test_tokenbucket_wait_disabled_does_nothing(monkeypatch): +# """If capacity/rps <= 0, wait() should be a no-op.""" +# tb = rl_mod.TokenBucket(capacity=0.0, refill_per_sec=1.0) +# called = {"sleep": False} - def fake_sleep(*_: float) -> None: - called["sleep"] = True +# def fake_sleep(*_: float) -> None: +# called["sleep"] = True - 
monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) +# monkeypatch.setattr(rl_mod.time, "sleep", fake_sleep) - tb.wait(10.0) +# tb.wait(10.0) - assert called["sleep"] is False +# assert called["sleep"] is False -# ---------------- module-level helpers ---------------- +# # ---------------- module-level helpers ---------------- -@pytest.mark.unit -def test_init_rate_limiter_creates_bucket(): - """init_rate_limiter should build a TokenBucket when params are positive.""" - rl_mod.init_rate_limiter(5, 2) - assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) - assert rl_mod._STATE.rl.capacity == 5 - assert rl_mod._STATE.rl.refill_per_sec == 2 +# @pytest.mark.unit +# def test_init_rate_limiter_creates_bucket(): +# """init_rate_limiter should build a TokenBucket when params are positive.""" +# rl_mod.init_rate_limiter(5, 2) +# assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) +# assert rl_mod._STATE.rl.capacity == 5 +# assert rl_mod._STATE.rl.refill_per_sec == 2 -@pytest.mark.unit -def test_init_rate_limiter_disables_on_zero(): - """init_rate_limiter should disable when params are non-positive.""" - rl_mod.init_rate_limiter(0, 10) - assert rl_mod._STATE.rl is None +# @pytest.mark.unit +# def test_init_rate_limiter_disables_on_zero(): +# """init_rate_limiter should disable when params are non-positive.""" +# rl_mod.init_rate_limiter(0, 10) +# assert rl_mod._STATE.rl is None - rl_mod.init_rate_limiter(10, 0) - assert rl_mod._STATE.rl is None +# rl_mod.init_rate_limiter(10, 0) +# assert rl_mod._STATE.rl is None -@pytest.mark.unit -def test_set_params_on_uninitialized_creates_when_both_given(): - """set_params should create a bucket if none exists and both positive values are passed.""" - assert rl_mod._STATE.rl is None - rl_mod.set_params(capacity=3, rps=1) - assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) - assert rl_mod._STATE.rl.capacity == 3 - assert rl_mod._STATE.rl.refill_per_sec == 1 +# @pytest.mark.unit +# def 
test_set_params_on_uninitialized_creates_when_both_given(): +# """set_params should create a bucket if none exists and both positive values are passed.""" +# assert rl_mod._STATE.rl is None +# rl_mod.set_params(capacity=3, rps=1) +# assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) +# assert rl_mod._STATE.rl.capacity == 3 +# assert rl_mod._STATE.rl.refill_per_sec == 1 -@pytest.mark.unit -def test_set_params_updates_existing(): - """set_params should rebuild the bucket based on existing values when some params are None.""" - rl_mod.init_rate_limiter(5, 2) - rl_mod.set_params(rps=10) - assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) - assert rl_mod._STATE.rl.capacity == 5 - assert rl_mod._STATE.rl.refill_per_sec == 10 +# @pytest.mark.unit +# def test_set_params_updates_existing(): +# """set_params should rebuild the bucket based on existing values when some params are None.""" +# rl_mod.init_rate_limiter(5, 2) +# rl_mod.set_params(rps=10) +# assert isinstance(rl_mod._STATE.rl, rl_mod.TokenBucket) +# assert rl_mod._STATE.rl.capacity == 5 +# assert rl_mod._STATE.rl.refill_per_sec == 10 -@pytest.mark.unit -def test_set_params_can_disable(): - """set_params should disable limiter when resulting params are non-positive.""" - rl_mod.init_rate_limiter(5, 2) - rl_mod.set_params(capacity=0) - assert rl_mod._STATE.rl is None +# @pytest.mark.unit +# def test_set_params_can_disable(): +# """set_params should disable limiter when resulting params are non-positive.""" +# rl_mod.init_rate_limiter(5, 2) +# rl_mod.set_params(capacity=0) +# assert rl_mod._STATE.rl is None -@pytest.mark.unit -def test_rate_limit_delegates_when_initialized(monkeypatch): - """rate_limit() should call wait() on the current bucket.""" - rl_mod.init_rate_limiter(5, 1) - bucket = rl_mod._STATE.rl - assert bucket is not None +# @pytest.mark.unit +# def test_rate_limit_delegates_when_initialized(monkeypatch): +# """rate_limit() should call wait() on the current bucket.""" +# 
rl_mod.init_rate_limiter(5, 1) +# bucket = rl_mod._STATE.rl +# assert bucket is not None - called: dict[str, Any] = {"wait": False} +# called: dict[str, Any] = {"wait": False} - def fake_wait(cost: float = 1.0) -> None: - called["wait"] = cost +# def fake_wait(cost: float = 1.0) -> None: +# called["wait"] = cost - bucket.wait = fake_wait # type: ignore[assignment] +# bucket.wait = fake_wait # type: ignore[assignment] - rl_mod.rate_limit(3.5) - assert called["wait"] == 3.5 +# rl_mod.rate_limit(3.5) +# assert called["wait"] == 3.5 -@pytest.mark.unit -def test_rate_limit_noop_when_uninitialized(): - """rate_limit() should just return when limiter is not initialized.""" - assert rl_mod._STATE.rl is None - rl_mod.rate_limit(10.0) - assert rl_mod._STATE.rl is None +# @pytest.mark.unit +# def test_rate_limit_noop_when_uninitialized(): +# """rate_limit() should just return when limiter is not initialized.""" +# assert rl_mod._STATE.rl is None +# rl_mod.rate_limit(10.0) +# assert rl_mod._STATE.rl is None -@pytest.mark.unit -def test_try_consume_noop_when_uninitialized(): - """try_consume() should return True when limiter is not initialized.""" - assert rl_mod._STATE.rl is None - assert rl_mod.try_consume(999.0) is True +# @pytest.mark.unit +# def test_try_consume_noop_when_uninitialized(): +# """try_consume() should return True when limiter is not initialized.""" +# assert rl_mod._STATE.rl is None +# assert rl_mod.try_consume(999.0) is True -@pytest.mark.unit -def test_try_consume_delegates_when_initialized(monkeypatch): - """try_consume() should delegate to bucket.try_consume().""" - rl_mod.init_rate_limiter(5, 1) - bucket = rl_mod._STATE.rl - assert bucket is not None +# @pytest.mark.unit +# def test_try_consume_delegates_when_initialized(monkeypatch): +# """try_consume() should delegate to bucket.try_consume().""" +# rl_mod.init_rate_limiter(5, 1) +# bucket = rl_mod._STATE.rl +# assert bucket is not None - monkeypatch.setattr(bucket, "try_consume", lambda cost=1.0: cost 
== 1.0) +# monkeypatch.setattr(bucket, "try_consume", lambda cost=1.0: cost == 1.0) - assert rl_mod.try_consume(1.0) is True - assert rl_mod.try_consume(2.0) is False +# assert rl_mod.try_consume(1.0) is True +# assert rl_mod.try_consume(2.0) is False -@pytest.mark.unit -def test_reset_clears_state(): - """reset() should clear the module-level bucket.""" - rl_mod.init_rate_limiter(5, 1) - assert rl_mod._STATE.rl is not None - rl_mod.reset() - assert rl_mod._STATE.rl is None +# @pytest.mark.unit +# def test_reset_clears_state(): +# """reset() should clear the module-level bucket.""" +# rl_mod.init_rate_limiter(5, 1) +# assert rl_mod._STATE.rl is not None +# rl_mod.reset() +# assert rl_mod._STATE.rl is None diff --git a/tests/unit/artifacts/test_manifest_unit.py b/tests/unit/artifacts/test_manifest_unit.py index 9397a3b..add2452 100644 --- a/tests/unit/artifacts/test_manifest_unit.py +++ b/tests/unit/artifacts/test_manifest_unit.py @@ -8,11 +8,13 @@ @pytest.mark.unit -@pytest.mark.artifacts def test_manifest_minimal(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) (tmp_path / "models" / "m.ff.sql").write_text("select 1 as x", encoding="utf-8") - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") + (tmp_path / "sources.yml").write_text( + "version: 2\nsources: []\n", + encoding="utf-8", + ) REGISTRY.load_project(tmp_path) p = write_manifest(tmp_path) @@ -21,5 +23,4 @@ def test_manifest_minimal(tmp_path: Path): n = data["nodes"]["m.ff"] assert n["relation"] == "m" assert n["path"] == "models/m.ff.sql" - # sorted & deterministic keys (spot check) assert list(sorted(data["nodes"].keys())) == list(data["nodes"].keys()) diff --git a/tests/unit/artifacts/test_run_result_unit.py b/tests/unit/artifacts/test_run_result_unit.py index 000a47b..49cced0 100644 --- a/tests/unit/artifacts/test_run_result_unit.py +++ b/tests/unit/artifacts/test_run_result_unit.py @@ -7,7 +7,6 @@ @pytest.mark.unit -@pytest.mark.artifacts def test_run_results_written(tmp_path: 
Path): started = "2025-01-01T00:00:00+00:00" finished = "2025-01-01T00:01:00+00:00" diff --git a/tests/unit/cache/test_cache_policy_cli.py b/tests/unit/cache/test_cache_policy_cli.py index 6b932ad..5b181bb 100644 --- a/tests/unit/cache/test_cache_policy_cli.py +++ b/tests/unit/cache/test_cache_policy_cli.py @@ -78,6 +78,7 @@ def schedule( logger=None, engine_abbr="", name_width=28, + name_formatter=None, ): start = time.perf_counter() per: dict[str, float] = {} @@ -92,6 +93,9 @@ def schedule( before(name, 1) except TypeError: before(name) + if name_formatter: + # exercise formatter for parity with real schedule + name_formatter(name) run_node(name) except BaseException as e: if on_error: diff --git a/tests/unit/cli/test_bootstrap_unit.py b/tests/unit/cli/test_bootstrap_unit.py index 73b7181..cadbd76 100644 --- a/tests/unit/cli/test_bootstrap_unit.py +++ b/tests/unit/cli/test_bootstrap_unit.py @@ -194,15 +194,17 @@ def run_python(self, *a, **k): @pytest.mark.unit def test_make_executor_duckdb(monkeypatch, tmp_path: Path): class _FakeDuckExec: - def __init__(self, db_path: str): + def __init__(self, db_path: str, schema: str | None = None, catalog: str | None = None): self.db_path = db_path + self.schema = schema + self.catalog = catalog def run_python(self, *a, **k): pass monkeypatch.setattr(bootstrap, "DuckExecutor", _FakeDuckExec, raising=True) - prof = fake_duckdb_profile(path=str(tmp_path / "test.duckdb")) + prof = fake_duckdb_profile(path=str(tmp_path / "test.duckdb"), schema="demo", catalog="demo") jenv = Environment() ex, run_fn, py_fn = bootstrap._make_executor(prof, jenv) diff --git a/tests/unit/test_cli_init.py b/tests/unit/cli/test_cli_init_unit.py similarity index 97% rename from tests/unit/test_cli_init.py rename to tests/unit/cli/test_cli_init_unit.py index 36e5bde..1abe819 100644 --- a/tests/unit/test_cli_init.py +++ b/tests/unit/cli/test_cli_init_unit.py @@ -12,6 +12,7 @@ def _read(path: Path) -> str: return path.read_text(encoding="utf-8") 
+@pytest.mark.unit def test_init_creates_minimal_skeleton(tmp_path: Path): runner = CliRunner() target = tmp_path / "warehouse" @@ -45,6 +46,7 @@ def test_init_creates_minimal_skeleton(tmp_path: Path): assert "fft utest" in tests_note +@pytest.mark.unit def test_init_refuses_existing_directory(tmp_path: Path): runner = CliRunner() target = tmp_path / "existing" @@ -57,6 +59,7 @@ def test_init_refuses_existing_directory(tmp_path: Path): assert not list(target.glob("*")) +@pytest.mark.unit @pytest.mark.parametrize("engine", ["unknown", "sqlite"]) def test_init_validates_engine(engine: str, tmp_path: Path): runner = CliRunner() diff --git a/tests/unit/test_cli_select.py b/tests/unit/cli/test_cli_select_unit.py similarity index 99% rename from tests/unit/test_cli_select.py rename to tests/unit/cli/test_cli_select_unit.py index 8c86580..daa0235 100644 --- a/tests/unit/test_cli_select.py +++ b/tests/unit/cli/test_cli_select_unit.py @@ -26,6 +26,7 @@ def _mk_node(tmp_path: Path, name: str, kind: str = "sql", mat: str = "table", t return n +@pytest.mark.unit def test_select_predicates_and_parse(tmp_path: Path): a = _mk_node(tmp_path, "users", mat="view", tags=["mart", "dim"]) b = _mk_node(tmp_path, "orders.ff", mat="table", tags=["fct"]) @@ -63,6 +64,7 @@ def test_select_predicates_and_parse(tmp_path: Path): # --------------------------------- # CLI-level: run / dag with Typer # --------------------------------- +@pytest.mark.unit def test_cli_run_and_dag_apply_select_filters(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): # Arrange a tiny in-memory registry a = _mk_node(tmp_path, "users", mat="view", tags=["mart"]) diff --git a/tests/cli/test_summary_params_and_sql.py b/tests/unit/cli/test_cmd/test_summary_params_and_sql_unit.py similarity index 97% rename from tests/cli/test_summary_params_and_sql.py rename to tests/unit/cli/test_cmd/test_summary_params_and_sql_unit.py index 1ffe869..1f57283 100644 --- a/tests/cli/test_summary_params_and_sql.py +++ 
b/tests/unit/cli/test_cmd/test_summary_params_and_sql_unit.py @@ -1,9 +1,12 @@ import io import sys +import pytest + from fastflowtransform.cli.test_cmd import DQResult, _print_summary +@pytest.mark.unit def test_prints_params_and_example_sql(): buf = io.StringIO() old = sys.stdout diff --git a/tests/cli/test_test_cmd_output_marks.py b/tests/unit/cli/test_cmd/test_test_cmd_output_marks_unit.py similarity index 96% rename from tests/cli/test_test_cmd_output_marks.py rename to tests/unit/cli/test_cmd/test_test_cmd_output_marks_unit.py index 9b4a830..3956384 100644 --- a/tests/cli/test_test_cmd_output_marks.py +++ b/tests/unit/cli/test_cmd/test_test_cmd_output_marks_unit.py @@ -1,9 +1,12 @@ import io import sys +import pytest + from fastflowtransform.cli.test_cmd import DQResult, _print_summary +@pytest.mark.unit def test_output_marks_and_totals(): results = [ DQResult( diff --git a/tests/unit/cli/test_sync_db_comments_unit.py b/tests/unit/cli/test_sync_db_comments_unit.py index 7b1e457..8946467 100644 --- a/tests/unit/cli/test_sync_db_comments_unit.py +++ b/tests/unit/cli/test_sync_db_comments_unit.py @@ -16,7 +16,6 @@ @pytest.mark.unit -@pytest.mark.cli def test_strip_html_for_comment_removes_tags_and_collapses_spaces(): html = "

Hello World


again" out = mod._strip_html_for_comment(html) @@ -24,19 +23,16 @@ def test_strip_html_for_comment_removes_tags_and_collapses_spaces(): @pytest.mark.unit -@pytest.mark.cli def test_strip_html_for_comment_none(): assert mod._strip_html_for_comment(None) == "" @pytest.mark.unit -@pytest.mark.cli def test_pg_quote_ident_escapes_quotes(): assert mod._pg_quote_ident('my"table') == '"my""table"' @pytest.mark.unit -@pytest.mark.cli @pytest.mark.parametrize( "schema,relation,expected", [ @@ -50,7 +46,6 @@ def test_pg_fq_table(schema, relation, expected): @pytest.mark.unit -@pytest.mark.cli def test_sql_literal_escapes_single_quotes(): assert mod._sql_literal("O'Reilly") == "'O''Reilly'" @@ -61,7 +56,6 @@ def test_sql_literal_escapes_single_quotes(): @pytest.mark.unit -@pytest.mark.cli def test_sync_comments_postgres_dry_run(capsys): intents = [ {"kind": "table", "relation": "users", "text": "Users table"}, @@ -77,7 +71,6 @@ def test_sync_comments_postgres_dry_run(capsys): @pytest.mark.unit -@pytest.mark.cli def test_sync_comments_postgres_executes_on_engine(capsys): # fake sqlalchemy engine fake_conn = MagicMock() @@ -107,7 +100,6 @@ def test_sync_comments_postgres_executes_on_engine(capsys): @pytest.mark.unit -@pytest.mark.cli def test_sync_comments_snowflake_dry_run(capsys): intents = [ {"kind": "table", "relation": "MY_DB.MY_TBL", "text": "Some table"}, @@ -123,7 +115,6 @@ def test_sync_comments_snowflake_dry_run(capsys): @pytest.mark.unit -@pytest.mark.cli def test_sync_comments_snowflake_with_session(): fake_session = MagicMock() fake_exec = SimpleNamespace(session=fake_session) @@ -143,7 +134,6 @@ def test_sync_comments_snowflake_with_session(): @pytest.mark.unit -@pytest.mark.cli def test_sync_comments_snowflake_with_execute_method(): exec_mock = SimpleNamespace(execute=MagicMock()) @@ -162,7 +152,6 @@ def test_sync_comments_snowflake_with_execute_method(): @pytest.mark.unit -@pytest.mark.cli def test_sync_db_comments_no_intents_exits(monkeypatch): """ Fall: es gibt 
gar keine Descriptions -> sofort Exit(0) mit gelb. @@ -188,7 +177,6 @@ def test_sync_db_comments_no_intents_exits(monkeypatch): @pytest.mark.unit -@pytest.mark.cli def test_sync_db_comments_postgres_path(monkeypatch): # 1) Kontext vorbereiten fake_exec = MagicMock() @@ -246,7 +234,6 @@ def fake_sync_pg(execu, intents, schema, dry_run): @pytest.mark.unit -@pytest.mark.cli def test_sync_db_comments_snowflake_path(monkeypatch): fake_exec = MagicMock() fake_ctx = SimpleNamespace( @@ -292,7 +279,6 @@ def fake_sync_sf(execu, intents, schema, dry_run): @pytest.mark.unit -@pytest.mark.cli def test_sync_db_comments_unsupported_engine(monkeypatch, capsys): fake_exec = MagicMock() fake_ctx = SimpleNamespace( diff --git a/tests/unit/test_config_hook.py b/tests/unit/config/test_config_hook.py similarity index 75% rename from tests/unit/test_config_hook.py rename to tests/unit/config/test_config_hook.py index 5dd5a32..57106ae 100644 --- a/tests/unit/test_config_hook.py +++ b/tests/unit/config/test_config_hook.py @@ -1,12 +1,17 @@ -# tests/test_config_hook.py from pathlib import Path +import pytest + from fastflowtransform.core import REGISTRY +@pytest.mark.unit def test_sql_model_config_materialized_view(tmp_path: Path): (tmp_path / "models").mkdir() - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") + (tmp_path / "sources.yml").write_text( + "version: 2\nsources: []\n", + encoding="utf-8", + ) (tmp_path / "models" / "users.ff.sql").write_text( "{{ config(materialized='view') }}\nselect 1 as id, 'x' as email;", encoding="utf-8", diff --git a/tests/unit/test_config_vars_macros.py b/tests/unit/config/test_config_vars_macros.py similarity index 99% rename from tests/unit/test_config_vars_macros.py rename to tests/unit/config/test_config_vars_macros.py index 8820f2b..b8beb32 100644 --- a/tests/unit/test_config_vars_macros.py +++ b/tests/unit/config/test_config_vars_macros.py @@ -66,6 +66,7 @@ def _setup_env(models_dir: Path) -> None: REGISTRY.cli_vars = {} 
+@pytest.mark.unit def test_config_hook_sets_materialized_view(tmp_path: Path): proj = tmp_path models = proj / "models" @@ -88,6 +89,7 @@ def test_config_hook_sets_materialized_view(tmp_path: Path): assert REGISTRY.nodes["users.ff"].meta.get("materialized") == "view" +@pytest.mark.unit def test_var_reads_project_and_cli_override(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): proj = tmp_path models = proj / "models" @@ -120,6 +122,7 @@ def test_var_reads_project_and_cli_override(tmp_path: Path, monkeypatch: pytest. assert "2000-01-01" not in body +@pytest.mark.unit def test_macros_are_loaded_and_callable(tmp_path: Path): proj = tmp_path models = proj / "models" diff --git a/tests/unit/config/test_model_unit.py b/tests/unit/config/test_model_unit.py new file mode 100644 index 0000000..135f472 --- /dev/null +++ b/tests/unit/config/test_model_unit.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from fastflowtransform.config.models import IncrementalConfig, ModelConfig, validate_model_meta + + +@pytest.mark.unit +def test_validate_model_meta_basic_storage_and_tags(): + meta = { + "materialized": "table", + "tags": ["example", "demo"], + "storage": { + "path": "/tmp/users", + "format": "parquet", + "options": {"compression": "snappy"}, + }, + } + + cfg = validate_model_meta(meta) + assert isinstance(cfg, ModelConfig) + assert cfg.materialized == "table" + assert cfg.tags == ["example", "demo"] + assert cfg.storage is not None + assert cfg.storage.path == "/tmp/users" + assert cfg.storage.format == "parquet" + assert cfg.storage.options == {"compression": "snappy"} + + +@pytest.mark.unit +def test_validate_model_meta_rejects_unknown_keys(): + # extra keys should be rejected if ModelConfig uses extra="forbid" + with pytest.raises(ValidationError): + validate_model_meta({"materialized": "table", "unknown_field": 1}) + + +@pytest.mark.unit +def test_validate_model_meta_incremental_bool_and_dict_variants(): 
+ # incremental: true + minimal hints (freshness + unique key) + cfg_bool = validate_model_meta( + { + "incremental": True, + "updated_at": "updated_at", + "unique_key": ["id"], + } + ) + assert isinstance(cfg_bool, ModelConfig) + assert cfg_bool.is_incremental_enabled() is True + assert isinstance(cfg_bool.incremental, IncrementalConfig) + assert cfg_bool.incremental.enabled is True + assert cfg_bool.updated_at == "updated_at" + assert cfg_bool.unique_key == ["id"] + + # incremental: {enabled: False, ...} → allowed without hints + cfg_dict = validate_model_meta( + { + "incremental": { + "enabled": False, + "unique_key": ["id"], + } + } + ) + assert cfg_dict.is_incremental_enabled() is False + # unique_key should be accepted and mirrored to the top-level shortcut + assert cfg_dict.unique_key == ["id"] + + +@pytest.mark.unit +def test_validate_model_meta_incremental_invalid_strategy_raises(): + with pytest.raises(ValidationError): + validate_model_meta( + { + "incremental": { + "enabled": True, + "strategy": "not_a_valid_strategy", + } + } + ) diff --git a/tests/unit/test_core_python_tags.py b/tests/unit/core/test_core_python_tags.py similarity index 78% rename from tests/unit/test_core_python_tags.py rename to tests/unit/core/test_core_python_tags.py index 95a0f94..d1ee8b2 100644 --- a/tests/unit/test_core_python_tags.py +++ b/tests/unit/core/test_core_python_tags.py @@ -2,9 +2,12 @@ from pathlib import Path +import pytest + from fastflowtransform.core import Registry +@pytest.mark.unit def test_python_model_tags_propagate_to_node(tmp_path: Path, monkeypatch) -> None: project_dir = tmp_path / "proj" models_dir = project_dir / "models" @@ -12,12 +15,11 @@ def test_python_model_tags_propagate_to_node(tmp_path: Path, monkeypatch) -> Non model_file = models_dir / "py_tagged.ff.py" model_file.write_text( - ( - "from fastflowtransform import model\n\n" - "@model(name='py_tagged', tags=['example', 'demo'], materialized='view')\n" - "def build(df=None):\n" - " return df\n" 
- ), + "from fastflowtransform import model\n" + "import pandas as pd\n\n" + "@model(name='py_tagged', tags=['example', 'demo'], materialized='view')\n" + "def build():\n" + " return pd.DataFrame()\n", encoding="utf-8", ) diff --git a/tests/unit/test_macros_loading.py b/tests/unit/core/test_macros_loading_unit.py similarity index 94% rename from tests/unit/test_macros_loading.py rename to tests/unit/core/test_macros_loading_unit.py index d71ec33..e67d44a 100644 --- a/tests/unit/test_macros_loading.py +++ b/tests/unit/core/test_macros_loading_unit.py @@ -1,16 +1,21 @@ import textwrap from pathlib import Path +import pytest from jinja2 import Environment, FileSystemLoader, StrictUndefined from fastflowtransform.core import REGISTRY # ----------------------- SQL Macros --------------------------------- +@pytest.mark.unit def test_macros_are_loaded_and_callable(tmp_path: Path): models = tmp_path / "models" / "macros" models.mkdir(parents=True, exist_ok=True) - (tmp_path / "sources.yml").write_text("{}", encoding="utf-8") + (tmp_path / "sources.yml").write_text( + "version: 2\nsources: []\n", + encoding="utf-8", + ) # A tiny macro file (models / "utils.sql").write_text( @@ -54,6 +59,7 @@ def write(p: Path, s: str): p.write_text(textwrap.dedent(s).strip() + "\n", encoding="utf-8") +@pytest.mark.unit def test_python_macros_are_loaded_and_callable(tmp_path: Path): proj = tmp_path models = proj / "models" diff --git a/tests/unit/test_relation_for.py b/tests/unit/core/test_relation_for_unit.py similarity index 87% rename from tests/unit/test_relation_for.py rename to tests/unit/core/test_relation_for_unit.py index 857f00b..eb46c14 100644 --- a/tests/unit/test_relation_for.py +++ b/tests/unit/core/test_relation_for_unit.py @@ -1,11 +1,15 @@ +import pytest + from fastflowtransform.core import relation_for +@pytest.mark.unit def test_relation_for_strips_ff_suffix(): assert relation_for("users.ff") == "users" assert relation_for("mart_users.ff") == "mart_users" 
+@pytest.mark.unit def test_relation_for_passthrough_other_names(): assert relation_for("users") == "users" assert relation_for("users_enriched") == "users_enriched" diff --git a/tests/unit/test_result_selector.py b/tests/unit/core/test_result_selector_unit.py similarity index 98% rename from tests/unit/test_result_selector.py rename to tests/unit/core/test_result_selector_unit.py index 1490765..d3dbace 100644 --- a/tests/unit/test_result_selector.py +++ b/tests/unit/core/test_result_selector_unit.py @@ -1,4 +1,4 @@ -# tests/unit/test_result_selector.py +# tests/unit/core/test_result_selector_unit.py from __future__ import annotations import json @@ -38,6 +38,7 @@ def clean_registry(): REGISTRY.nodes.clear() +@pytest.mark.unit def test_result_tokens_ok_error_fail_warn(tmp_path: Path): # Minimal project structure so selectors can resolve project dir (tmp_path / "models").mkdir(parents=True) @@ -121,6 +122,7 @@ def test_result_tokens_ok_error_fail_warn(tmp_path: Path): assert not pred(REGISTRY.nodes["warn_node"]) +@pytest.mark.unit def test_result_tokens_without_file_are_noops(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) REGISTRY.project_dir = tmp_path @@ -138,6 +140,7 @@ def test_result_tokens_without_file_are_noops(tmp_path: Path): assert not pred(REGISTRY.nodes["b"]) +@pytest.mark.unit def test_result_tokens_coexist_with_other_filters(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) REGISTRY.project_dir = tmp_path diff --git a/tests/unit/executors/test_base_unit.py b/tests/unit/executors/test_base_unit.py new file mode 100644 index 0000000..7d29807 --- /dev/null +++ b/tests/unit/executors/test_base_unit.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import pytest + +from fastflowtransform.executors.base import BaseExecutor + + +@pytest.mark.unit +def test_meta_is_incremental_simple_bool(): + assert BaseExecutor._meta_is_incremental({"incremental": True}) is True + assert BaseExecutor._meta_is_incremental({"incremental": False}) 
is False + + +@pytest.mark.unit +def test_meta_is_incremental_dict_enabled_flag(): + assert BaseExecutor._meta_is_incremental({"incremental": {"enabled": True}}) is True + assert BaseExecutor._meta_is_incremental({"incremental": {"enabled": False}}) is False + + +@pytest.mark.unit +def test_meta_is_incremental_dict_without_enabled_defaults_to_true(): + assert BaseExecutor._meta_is_incremental({"incremental": {"strategy": "merge"}}) is True + + +@pytest.mark.unit +def test_meta_is_incremental_respects_materialized_incremental(): + assert BaseExecutor._meta_is_incremental({"materialized": "incremental"}) is True + # Even if incremental is explicitly False, materialized wins + assert ( + BaseExecutor._meta_is_incremental({"materialized": "incremental", "incremental": False}) + is True + ) + + +@pytest.mark.unit +def test_meta_is_incremental_handles_empty_and_none(): + assert BaseExecutor._meta_is_incremental({}) is False + assert BaseExecutor._meta_is_incremental(None) is False diff --git a/tests/unit/executors/test_databricks_spark_exec_unit.py b/tests/unit/executors/test_databricks_spark_exec_unit.py index c9ef934..addcbe1 100644 --- a/tests/unit/executors/test_databricks_spark_exec_unit.py +++ b/tests/unit/executors/test_databricks_spark_exec_unit.py @@ -13,10 +13,35 @@ _SparkConnShim, _split_db_table, ) +from fastflowtransform.table_formats.spark_iceberg import IcebergFormatHandler + + +def _config_values(fake_builder, key: str) -> list[str]: + return [ + call.args[1] + for call in fake_builder.config.call_args_list + if call.args and call.args[0] == key + ] + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_non_delta_respects_explicit_catalog(exec_factory): + _, fake_builder, _ = exec_factory(table_format="parquet", catalog="unity_catalog") + catalog_values = _config_values(fake_builder, "spark.sql.catalog.spark_catalog") + assert catalog_values[-1] == "unity_catalog" + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def 
test_non_delta_leaves_catalog_unset(exec_factory): + _, fake_builder, _ = exec_factory(table_format="parquet") + catalog_values = _config_values(fake_builder, "spark.sql.catalog.spark_catalog") + assert catalog_values == [] @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_split_db_table_unit(): assert _split_db_table("db.tbl") == ("db", "tbl") assert _split_db_table("`db`.`tbl`") == ("db`", "`tbl") @@ -24,7 +49,7 @@ def test_split_db_table_unit(): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_q_ident_unit(exec_minimal): assert exec_minimal._q_ident("foo") == "`foo`" assert exec_minimal._q_ident("foo`bar") == "`foo``bar`" @@ -32,7 +57,83 @@ def test_q_ident_unit(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark +def test_delta_format_sets_extension_and_catalog_defaults(exec_factory): + def _passthrough(builder, extra_packages=None): + return builder + + with patch.object(mod, "configure_spark_with_delta_pip", _passthrough): + _, fake_builder, _ = exec_factory(table_format="delta") + + ext_values = _config_values(fake_builder, "spark.sql.extensions") + assert ext_values, "expected spark.sql.extensions to be configured" + assert mod._DELTA_EXTENSION in ext_values[-1] + + catalog_values = _config_values(fake_builder, "spark.sql.catalog.spark_catalog") + assert catalog_values, "expected spark.sql.catalog.spark_catalog to be configured" + assert catalog_values[-1] == mod._DELTA_CATALOG + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_delta_format_appends_existing_extension(exec_factory): + def _passthrough(builder, extra_packages=None): + return builder + + extra_conf = {"spark.sql.extensions": "com.example.Ext"} + with patch.object(mod, "configure_spark_with_delta_pip", _passthrough): + _, fake_builder, _ = exec_factory(table_format="delta", extra_conf=extra_conf) + + ext_values = _config_values(fake_builder, "spark.sql.extensions") + assert ext_values + last = 
ext_values[-1] + assert "com.example.Ext" in last + assert mod._DELTA_EXTENSION in last + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_delta_format_respects_custom_catalog(exec_factory): + def _passthrough(builder, extra_packages=None): + return builder + + with patch.object(mod, "configure_spark_with_delta_pip", _passthrough): + _, fake_builder, _ = exec_factory(table_format="delta", catalog="unity") + + catalog_values = _config_values(fake_builder, "spark.sql.catalog.spark_catalog") + assert catalog_values == ["unity"] + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_delta_format_respects_extra_conf_catalog(exec_factory): + def _passthrough(builder, extra_packages=None): + return builder + + extra_conf = {"spark.sql.catalog.spark_catalog": "ext_catalog"} + with patch.object(mod, "configure_spark_with_delta_pip", _passthrough): + _, fake_builder, _ = exec_factory(table_format="delta", extra_conf=extra_conf) + + catalog_values = _config_values(fake_builder, "spark.sql.catalog.spark_catalog") + assert catalog_values == ["ext_catalog"] + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_delta_format_errors_when_delta_missing(exec_factory): + def _passthrough(builder, extra_packages=None): + return builder + + with ( + patch.object(mod, "configure_spark_with_delta_pip", _passthrough), + patch.object(mod, "_has_delta", return_value=False), + pytest.raises(RuntimeError), + ): + exec_factory(table_format="delta") + + +@pytest.mark.unit +@pytest.mark.databricks_spark def test_validate_required_single_df_unit(exec_minimal): # Fake Spark DF fake_df = SimpleNamespace(schema=SimpleNamespace(fieldNames=lambda: ["id", "email"])) @@ -45,7 +146,7 @@ def test_validate_required_single_df_unit(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_validate_required_single_df_raises_unit(exec_minimal): fake_df = SimpleNamespace(schema=SimpleNamespace(fieldNames=lambda: ["id"])) with 
pytest.raises(ValueError): @@ -57,7 +158,7 @@ def test_validate_required_single_df_raises_unit(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_validate_required_multi_dep_unit(exec_minimal): fake_users = SimpleNamespace(schema=SimpleNamespace(fieldNames=lambda: ["id", "email"])) fake_orders = SimpleNamespace( @@ -75,7 +176,7 @@ def test_validate_required_multi_dep_unit(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_format_source_reference_classic_unit(exec_minimal): cfg = {"identifier": "seed_users", "schema": "staging", "catalog": "spark_catalog"} ref = exec_minimal._format_source_reference(cfg, "raw", "users") @@ -85,7 +186,7 @@ def test_format_source_reference_classic_unit(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_format_source_reference_path_based_unit(exec_minimal): # wir patchen spark.read.format(...) Kette fake_df = MagicMock() @@ -110,7 +211,7 @@ def test_format_source_reference_path_based_unit(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test__materialize_relation_uses_save_table_when_no_path(exec_minimal): """Executor should call internal table-saving logic when no storage path is configured.""" df = MagicMock() @@ -127,29 +228,29 @@ def test__materialize_relation_uses_save_table_when_no_path(exec_minimal): @pytest.mark.unit -@pytest.mark.spark -def test__materialize_relation_uses_write_to_storage_path(exec_minimal, tmp_path): - """Executor should delegate to _write_to_storage_path when storage meta has a path.""" +@pytest.mark.databricks_spark +def test__materialize_relation_with_path_delegates_to_save(exec_minimal, tmp_path): + """Executor should still call _save_df_as_table even if storage meta defines a path.""" df = MagicMock() node = Node(name="dummy", kind="python", path=Path(".")) + storage_meta = {"path": str(tmp_path), "format": "parquet"} - 
exec_minimal._storage_meta = MagicMock( - return_value={"path": str(tmp_path), "format": "parquet"} - ) + exec_minimal._storage_meta = MagicMock(return_value=storage_meta) + exec_minimal._save_df_as_table = MagicMock() exec_minimal._write_to_storage_path = MagicMock() exec_minimal._materialize_relation("default.unit_tbl", df, node) - exec_minimal._write_to_storage_path.assert_called_once() - exec_minimal._write_to_storage_path.assert_called_with( + exec_minimal._save_df_as_table.assert_called_once_with( "default.unit_tbl", df, - {"path": str(tmp_path), "format": "parquet"}, + storage=storage_meta, ) + exec_minimal._write_to_storage_path.assert_not_called() @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test__write_to_storage_path_calls_storage_helper(exec_minimal, monkeypatch, tmp_path): """_write_to_storage_path should just be a thin adapter to storage.spark_write_to_path.""" called = {} @@ -177,7 +278,7 @@ def fake_write(spark, identifier, df, storage, default_format=None, default_opti @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test__create_view_over_table_executes_expected_sql(exec_minimal): """_create_view_over_table should emit a simple CREATE OR REPLACE VIEW SELECT * statement.""" exec_minimal.spark.sql = MagicMock() @@ -192,7 +293,7 @@ def test__create_view_over_table_executes_expected_sql(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_on_node_built_calls_meta_helpers(exec_minimal, monkeypatch): """on_node_built should best-effort call ensure_meta_table and upsert_meta.""" ensure_called = {} @@ -215,7 +316,7 @@ def fake_upsert(executor, node_name, relation, fingerprint, engine): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_conn_shim_execute_runs_select(monkeypatch): """_SparkConnShim.execute should return rows collected from spark.sql.""" fake_spark = MagicMock() @@ -228,7 +329,7 @@ def 
test_spark_conn_shim_execute_runs_select(monkeypatch): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_read_relation_uses_spark_table(exec_minimal): exec_minimal.spark.table.return_value = "DF" out = exec_minimal._read_relation("users", Node(name="n", kind="sql", path=Path(".")), []) @@ -237,14 +338,14 @@ def test_read_relation_uses_spark_table(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_validate_required_no_requires_is_noop(exec_minimal): # should not raise exec_minimal._validate_required("node_x", inputs=MagicMock(), requires={}) @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_materialize_relation_rejects_non_frame(exec_minimal, monkeypatch): # für diesen Test brauchen wir das echte Verhalten monkeypatch.setattr(exec_minimal, "_is_frame", lambda obj: False) @@ -254,15 +355,7 @@ def test_materialize_relation_rejects_non_frame(exec_minimal, monkeypatch): @pytest.mark.unit -@pytest.mark.spark -def test_exists_relation_qualified(exec_minimal): - exec_minimal.spark.catalog._jcatalog.tableExists.return_value = True - assert exec_minimal.exists_relation("default.my_tbl") is True - exec_minimal.spark.catalog._jcatalog.tableExists.assert_called_with("default", "my_tbl") - - -@pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_exists_relation_unqualified(exec_minimal): exec_minimal.spark.catalog.tableExists.return_value = False assert exec_minimal.exists_relation("my_tbl") is False @@ -270,21 +363,21 @@ def test_exists_relation_unqualified(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_init_makes_relative_warehouse_absolute(exec_factory): ex, _, _ = exec_factory(warehouse_dir="rel_dir") assert ex.warehouse_dir.is_absolute() @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_init_with_catalog_sets_config(exec_factory): _, fake_builder, _ = 
exec_factory(catalog="hive_metastore") fake_builder.config.assert_any_call("spark.sql.catalog.spark_catalog", "hive_metastore") @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_init_with_extra_conf(exec_factory): _, fake_builder, _ = exec_factory(extra_conf={"spark.foo": "1", "spark.bar": "2"}) fake_builder.config.assert_any_call("spark.foo", "1") @@ -292,7 +385,7 @@ def test_init_with_extra_conf(exec_factory): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_init_with_hive_support(exec_factory): _, fake_builder, _ = exec_factory(use_hive_metastore=True) fake_builder.config.assert_any_call("spark.sql.catalogImplementation", "hive") @@ -300,14 +393,14 @@ def test_init_with_hive_support(exec_factory): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_init_with_table_options(exec_factory): ex, _, _ = exec_factory(table_options={"mergeSchema": True}) assert ex.spark_table_options == {"mergeSchema": "True"} @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_storage_meta_prefers_node_storage(exec_minimal): node = Node( name="users.ff", kind="sql", path=Path("x"), meta={"storage": {"path": "/tmp/users"}} @@ -317,7 +410,7 @@ def test_storage_meta_prefers_node_storage(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_storage_meta_uses_global_lookup_when_node_empty(exec_minimal): with patch("fastflowtransform.executors.databricks_spark_exec.storage.get_model_storage") as gm: gm.return_value = {"path": "/tmp/global"} @@ -327,7 +420,7 @@ def test_storage_meta_uses_global_lookup_when_node_empty(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_storage_meta_falls_back_to_registry_scan(exec_minimal, monkeypatch): # 1) Fake-Node im Registry, der Storage hat reg_node = Node( @@ -348,7 +441,7 @@ def test_storage_meta_falls_back_to_registry_scan(exec_minimal, monkeypatch): 
@pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_storage_meta_registry_scan_then_global(exec_minimal, monkeypatch): reg_node = Node( name="orders.ff", @@ -371,16 +464,36 @@ def test_storage_meta_registry_scan_then_global(exec_minimal, monkeypatch): @pytest.mark.unit -@pytest.mark.spark -def test_format_relation_for_ref(exec_minimal): +@pytest.mark.databricks_spark +def test_format_relation_for_ref_iceberg(exec_minimal): + exec_minimal.spark_table_format = "iceberg" + exec_minimal.database = "demo" + exec_minimal.spark.catalog.currentDatabase.return_value = "demo" + exec_minimal._format_handler = IcebergFormatHandler(exec_minimal.spark) + with patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for: - rel_for.return_value = "real_table" - out = exec_minimal._format_relation_for_ref("users.ff") - assert out == "`real_table`" + rel_for.return_value = "events_base" + out = exec_minimal._format_relation_for_ref("events_base.ff") + + assert out == "`iceberg`.`demo`.`events_base`" + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_this_identifier_iceberg(exec_minimal): + exec_minimal.spark_table_format = "iceberg" + exec_minimal.database = "demo" + exec_minimal.spark.catalog.currentDatabase.return_value = "demo" + exec_minimal._format_handler = IcebergFormatHandler(exec_minimal.spark) + + node = Node(name="fct_events_sql_inline.ff", kind="sql", path=Path(".")) + ident = exec_minimal._this_identifier(node) + + assert ident == "`iceberg`.`demo`.`fct_events_sql_inline`" @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_format_source_reference_location_without_format_raises(exec_minimal): cfg = {"location": "/tmp/data", "identifier": "x"} # no "format" with pytest.raises(KeyError, match="requires 'format'"): @@ -388,7 +501,7 @@ def test_format_source_reference_location_without_format_raises(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def 
test_save_df_as_table_respects_storage_path(exec_minimal): df = MagicMock() exec_minimal._write_to_storage_path = MagicMock() @@ -403,7 +516,28 @@ def test_save_df_as_table_respects_storage_path(exec_minimal): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark +def test_save_df_as_table_iceberg_ignores_storage_path(exec_minimal): + df = MagicMock() + exec_minimal.spark_table_format = "iceberg" + handler = MagicMock() + handler.table_format = "iceberg" + handler.allows_unmanaged_paths.return_value = False + exec_minimal._format_handler = handler + exec_minimal._write_to_storage_path = MagicMock() + + exec_minimal._save_df_as_table( + "ice_tbl", + df, + storage={"path": "/tmp/ignored"}, + ) + + exec_minimal._write_to_storage_path.assert_not_called() + handler.save_df_as_table.assert_called_once_with("ice_tbl", df) + + +@pytest.mark.unit +@pytest.mark.databricks_spark def test_create_or_replace_table_happy_path_calls_save(exec_minimal): # spark.sql soll NICHT werfen, sondern ein DF liefern fake_df = MagicMock() @@ -422,3 +556,17 @@ def test_create_or_replace_table_happy_path_calls_save(exec_minimal): exec_minimal.spark.sql.assert_called_with("SELECT 1 AS id") exec_minimal._save_df_as_table.assert_called_once_with("target_tbl", fake_df, storage=ANY) + + +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_read_relation_iceberg_qualifies(exec_minimal): + exec_minimal.spark_table_format = "iceberg" + exec_minimal.database = "demo" + exec_minimal.spark.catalog.currentDatabase.return_value = "demo" + exec_minimal._format_handler = IcebergFormatHandler(exec_minimal.spark) + + node = Node(name="model", kind="sql", path=Path(".")) + exec_minimal._read_relation("events_base", node, deps=[]) + + exec_minimal.spark.table.assert_called_with("iceberg.demo.events_base") diff --git a/tests/unit/executors/test_duckdb_exec_unit.py b/tests/unit/executors/test_duckdb_exec_unit.py index 6aa8475..e8446d1 100644 --- a/tests/unit/executors/test_duckdb_exec_unit.py 
+++ b/tests/unit/executors/test_duckdb_exec_unit.py @@ -16,6 +16,11 @@ def duck_exec() -> DuckExecutor: return DuckExecutor(":memory:") +@pytest.fixture +def duck_exec_schema() -> DuckExecutor: + return DuckExecutor(":memory:", schema="demo_schema") + + def _node(name: str = "m", kind: str = "python") -> Node: return Node(name=name, kind=kind, path=Path(".")) @@ -99,6 +104,21 @@ def test_format_relation_for_ref(duck_exec: DuckExecutor): assert rel == _q("my_model") +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_relation_for_ref_with_schema(duck_exec_schema: DuckExecutor): + rel = duck_exec_schema._format_relation_for_ref("my_model") + assert rel == f'"demo_schema".{_q("my_model")}' + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_relation_for_ref_with_catalog_collision(): + exec_catalog = DuckExecutor(":memory:", schema="memory") + rel = exec_catalog._format_relation_for_ref("foo") + assert rel == '"memory"."memory"."foo"' + + @pytest.mark.unit @pytest.mark.duckdb def test_format_source_reference_ok(duck_exec: DuckExecutor): @@ -132,6 +152,22 @@ def test_format_source_reference_path_not_supported(duck_exec: DuckExecutor): duck_exec._format_source_reference(cfg, "src", "tbl") +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_source_reference_injects_executor_schema(): + exec_schema = DuckExecutor(":memory:", schema="demo_schema") + ref = exec_schema._format_source_reference({"identifier": "src_tbl"}, "src", "tbl") + assert ref == '"demo_schema"."src_tbl"' + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_source_reference_injects_catalog_when_matches_schema(): + exec_catalog = DuckExecutor(":memory:", schema="memory") + ref = exec_catalog._format_source_reference({"identifier": "src_tbl"}, "src", "tbl") + assert ref == '"memory"."memory"."src_tbl"' + + # --------------------------------------------------------------------------- # on_node_built - best effort # 
--------------------------------------------------------------------------- @@ -178,6 +214,13 @@ def test_exists_relation_false(duck_exec: DuckExecutor): assert duck_exec.exists_relation("nope") is False +@pytest.mark.unit +@pytest.mark.duckdb +def test_exists_relation_with_schema(duck_exec_schema: DuckExecutor): + duck_exec_schema.con.execute('create table "demo_schema"."t_s" (id int)') + assert duck_exec_schema.exists_relation("t_s") is True + + # --------------------------------------------------------------------------- # create_table_as / incremental_insert / incremental_merge # --------------------------------------------------------------------------- @@ -242,3 +285,12 @@ def test_alter_table_sync_schema_adds_missing_columns(duck_exec: DuckExecutor): col_names = [r[1] for r in info] assert "id" in col_names assert "new_col" in col_names + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_read_relation_respects_schema(duck_exec_schema: DuckExecutor): + duck_exec_schema.con.execute('create table "demo_schema"."t_in_schema" (id int)') + duck_exec_schema.con.execute('insert into "demo_schema"."t_in_schema" values (5)') + df = duck_exec_schema._read_relation("t_in_schema", _node(), deps=[]) + assert df.to_dict(orient="records") == [{"id": 5}] diff --git a/tests/unit/test_incremental_planner.py b/tests/unit/incremental/test_incremental_planner_unit.py similarity index 100% rename from tests/unit/test_incremental_planner.py rename to tests/unit/incremental/test_incremental_planner_unit.py diff --git a/tests/unit/test_lineage_py.py b/tests/unit/lineage/test_lineage_py_unit.py similarity index 93% rename from tests/unit/test_lineage_py.py rename to tests/unit/lineage/test_lineage_py_unit.py index cf6028e..dc9083f 100644 --- a/tests/unit/test_lineage_py.py +++ b/tests/unit/lineage/test_lineage_py_unit.py @@ -1,13 +1,10 @@ import pandas as pd +import pytest from fastflowtransform.lineage import infer_py_lineage -def _dummy(): - # docstring to keep parser calm - pass 
- - +@pytest.mark.unit def test_pandas_rename_and_assign(): """Basic pandas lineage patterns: rename + new column from existing.""" diff --git a/tests/unit/test_lineage_sql.py b/tests/unit/lineage/test_lineage_sql_unit.py similarity index 96% rename from tests/unit/test_lineage_sql.py rename to tests/unit/lineage/test_lineage_sql_unit.py index 52ad67b..ce4debe 100644 --- a/tests/unit/test_lineage_sql.py +++ b/tests/unit/lineage/test_lineage_sql_unit.py @@ -1,6 +1,9 @@ +import pytest + from fastflowtransform.lineage import infer_sql_lineage +@pytest.mark.unit def test_simple_alias_and_function_transformed(): """ Heuristics: @@ -32,6 +35,7 @@ def test_simple_alias_and_function_transformed(): ) +@pytest.mark.unit def test_passthrough_without_alias(): """SELECT u.email -> output column name 'email' inferred as direct.""" sql = "select u.email from users u" diff --git a/tests/unit/test_logging_flags.py b/tests/unit/logging/test_logging_flags_unit.py similarity index 95% rename from tests/unit/test_logging_flags.py rename to tests/unit/logging/test_logging_flags_unit.py index e12c14b..4e41d07 100644 --- a/tests/unit/test_logging_flags.py +++ b/tests/unit/logging/test_logging_flags_unit.py @@ -1,6 +1,7 @@ -# tests/unit/test_logging_flags.py +# tests/unit/logging/test_logging_flags_unit.py import importlib +import pytest from jinja2 import Environment from typer.testing import CliRunner @@ -10,6 +11,7 @@ cli_run = importlib.import_module("fastflowtransform.cli.run") +@pytest.mark.unit def test_verbose_flags_wiring(monkeypatch, tmp_path): models_dir = tmp_path / "models" models_dir.mkdir(parents=True, exist_ok=True) diff --git a/tests/unit/test_parallel_logging_error_block.py b/tests/unit/logging/test_parallel_logging_error_block_unit.py similarity index 98% rename from tests/unit/test_parallel_logging_error_block.py rename to tests/unit/logging/test_parallel_logging_error_block_unit.py index cc51061..dc4326d 100644 --- a/tests/unit/test_parallel_logging_error_block.py +++ 
b/tests/unit/logging/test_parallel_logging_error_block_unit.py @@ -3,6 +3,7 @@ import importlib from types import SimpleNamespace +import pytest from jinja2 import Environment from typer.testing import CliRunner @@ -14,6 +15,7 @@ cli_run = importlib.import_module("fastflowtransform.cli.run") +@pytest.mark.unit def test_error_block_prints_after_logs_without_interleaving(monkeypatch, tmp_path): models_dir = tmp_path / "models" models_dir.mkdir(parents=True, exist_ok=True) diff --git a/tests/unit/test_parallel_logging_snapshot.py b/tests/unit/logging/test_parallel_logging_snapshot_unit.py similarity index 97% rename from tests/unit/test_parallel_logging_snapshot.py rename to tests/unit/logging/test_parallel_logging_snapshot_unit.py index c5e1219..36fe852 100644 --- a/tests/unit/test_parallel_logging_snapshot.py +++ b/tests/unit/logging/test_parallel_logging_snapshot_unit.py @@ -3,6 +3,8 @@ import re import time +import pytest + from fastflowtransform.log_queue import LogQueue from fastflowtransform.run_executor import ScheduleResult, schedule @@ -16,6 +18,7 @@ def _normalize(lines: list[str]) -> list[str]: return out +@pytest.mark.unit def test_logging_snapshot_single_level_order_and_summary(): levels = [["a", "b"]] logq = LogQueue() @@ -47,6 +50,7 @@ def run_node(name: str) -> None: assert any(line.startswith("✓ L01 [DUCK] a") for line in lines) +@pytest.mark.unit def test_logging_snapshot_multi_level_and_long_names(): levels = [["very_long_model_name_exceeding_width.ff"], ["next"]] logq = LogQueue() diff --git a/tests/unit/render/test_this_proxy_unit.py b/tests/unit/render/test_this_proxy_unit.py index 3564ef4..ea575c4 100644 --- a/tests/unit/render/test_this_proxy_unit.py +++ b/tests/unit/render/test_this_proxy_unit.py @@ -9,7 +9,6 @@ @pytest.mark.unit -@pytest.mark.render def test_this_string_and_name(tmp_path: Path): p = tmp_path / "m.ff.sql" p.write_text("select '{{ this }}' as a, '{{ this.name }}' as b", encoding="utf-8") diff --git 
a/tests/unit/render/test_this_relation_unit.py b/tests/unit/render/test_this_relation_unit.py index ce7440a..3536756 100644 --- a/tests/unit/render/test_this_relation_unit.py +++ b/tests/unit/render/test_this_relation_unit.py @@ -16,7 +16,6 @@ def _env_for_tests() -> Environment: @pytest.mark.unit -@pytest.mark.render def test_this_renders_physical_relation(tmp_path): # Arrange: Minimal SQL-Model, das nur `{{ this }}` rendert sql_path = tmp_path / "m.ff.sql" diff --git a/tests/unit/schema/test_schema_loader_unit.py b/tests/unit/schema/test_schema_loader_unit.py index a2ff898..0153019 100644 --- a/tests/unit/schema/test_schema_loader_unit.py +++ b/tests/unit/schema/test_schema_loader_unit.py @@ -6,7 +6,6 @@ @pytest.mark.unit -@pytest.mark.schema def test_parse_schema_yaml_column_tests(tmp_path: Path): (tmp_path / "models").mkdir(parents=True) (tmp_path / "models" / "users_enriched.yml").write_text( diff --git a/tests/unit/test_docgen_site.py b/tests/unit/test_docgen_site.py deleted file mode 100644 index ea8d721..0000000 --- a/tests/unit/test_docgen_site.py +++ /dev/null @@ -1,80 +0,0 @@ -import json -import shutil -from pathlib import Path - -import yaml - -from tests.common.utils import ROOT, run - - -def _write(p: Path, text: str) -> None: - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(text, encoding="utf-8") - - -def test_docgen_generates_descriptions_and_lineage(tmp_path): - """End-to-end: docgen outputs HTML with Description/Columns/Lineage and a JSON manifest.""" - # Copy example project to an isolated temp dir - src_proj = ROOT / "examples" / "simple_duckdb" - proj = tmp_path / "proj" - shutil.copytree(src_proj, proj) - - # Extend project.yml with docs YAML and a very simple SQL model that guarantees clear lineage - proj_yml = yaml.safe_load((proj / "project.yml").read_text(encoding="utf-8")) - proj_yml.setdefault("docs", {}).setdefault("models", {}) - proj_yml["docs"]["models"]["users_enriched"] = { - "description": "YAML users_enriched 
description", - "columns": {"email": "Original email", "is_gmail": "Gmail flag"}, - } - (proj / "project.yml").write_text(yaml.safe_dump(proj_yml), encoding="utf-8") - - # Markdown override for model description (should win over YAML) - _write(proj / "docs" / "models" / "users_enriched.md", "MD model description") - # Markdown column description override (should win over YAML) - _write(proj / "docs" / "columns" / "users_enriched" / "email.md", "MD column description") - - # Run build first so schema introspection sees physical tables - env = {"FF_ENGINE": "duckdb", "FF_DUCKDB_PATH": str(proj / ".local" / "demo.duckdb")} - run(["fft", "seed", str(proj), "--env", "dev"], env) - run(["fft", "run", str(proj), "--env", "dev"], env) - - out_dir = proj / "site" / "docs" - manifest_path = out_dir / "docs_manifest.json" - res = run( - [ - "fft", - "docgen", - str(proj), - "--env", - "dev", - "--out", - str(out_dir), - "--emit-json", - str(manifest_path), - ], - env, - ) - assert res.returncode == 0, res.stdout - - # HTML page exists and includes description + columns + lineage heading - index_html = (out_dir / "index.html").read_text(encoding="utf-8") - assert "FastFlowTransform - DAG & Mini Docs" in index_html - model_html = (out_dir / "users_enriched.html").read_text(encoding="utf-8") - assert "Description" in model_html - assert "Columns" in model_html - assert "Lineage" in model_html - # Markdown model description took precedence - assert "MD model description" in model_html - - # JSON manifest exists and has at least one lineage entry somewhere - data = json.loads(manifest_path.read_text(encoding="utf-8")) - assert "models" in data and isinstance(data["models"], list) and len(data["models"]) >= 1 - any_lineage = False - for m in data["models"]: - for c in m.get("columns", []): - if c.get("lineage"): - any_lineage = True - break - if any_lineage: - break - assert any_lineage, "Expected at least one lineage entry in manifest" diff --git a/tests/unit/test_fingerprint.py 
b/tests/unit/test_fingerprint_unit.py similarity index 84% rename from tests/unit/test_fingerprint.py rename to tests/unit/test_fingerprint_unit.py index 7decb83..da5e92d 100644 --- a/tests/unit/test_fingerprint.py +++ b/tests/unit/test_fingerprint_unit.py @@ -3,9 +3,9 @@ from pathlib import Path -import yaml +import pytest -from fastflowtransform.core import Node, _parse_sources_yaml +from fastflowtransform.core import Node from fastflowtransform.fingerprint import ( EnvCtx, build_env_ctx, @@ -13,26 +13,10 @@ fingerprint_sql, get_function_source, inspect, - normalized_sources_blob, ) -def test_sources_normalization_stable(): - doc = """version: 2 - -sources: - - name: crm - tables: - - name: users - identifier: seed_users - - name: orders - identifier: seed_orders -""" - parsed = _parse_sources_yaml(yaml.safe_load(doc)) - reordered = {"crm": {"orders": parsed["crm"]["orders"], "users": parsed["crm"]["users"]}} - assert normalized_sources_blob(parsed) == normalized_sources_blob(reordered) - - +@pytest.mark.unit def test_env_ctx_respects_selected_env_keys(monkeypatch): monkeypatch.setenv("FF_ENGINE", "duckdb") monkeypatch.setenv("SECRET_TOKEN", "shh") @@ -46,6 +30,7 @@ def test_env_ctx_respects_selected_env_keys(monkeypatch): assert ctx1.to_payload() == ctx3.to_payload() +@pytest.mark.unit def test_fingerprint_sql_changes_on_small_sql_edit(): node = Node(name="users.ff", kind="sql", path=Path(__file__)) ctx = EnvCtx(engine="duckdb", profile="dev", env_vars={}, sources_json="{}") @@ -54,6 +39,7 @@ def test_fingerprint_sql_changes_on_small_sql_edit(): assert fp1 != fp2 +@pytest.mark.unit def test_fingerprint_sql_dep_cascade(): node = Node(name="mart.ff", kind="sql", path=Path(__file__), deps=["users.ff"]) ctx = EnvCtx(engine="duckdb", profile="dev", env_vars={}, sources_json="{}") @@ -76,6 +62,7 @@ def _dummy_func_b(x): return x + 2 +@pytest.mark.unit def test_get_function_source_is_stable_and_different_per_change(): src_a = get_function_source(_dummy_func_a) src_b = 
get_function_source(_dummy_func_b) @@ -83,6 +70,7 @@ def test_get_function_source_is_stable_and_different_per_change(): assert src_a != src_b +@pytest.mark.unit def test_fingerprint_py_changes_with_source_and_deps(): node = Node(name="py_model", kind="python", path=Path(__file__), deps=["users.ff"]) ctx = EnvCtx(engine="duckdb", profile="dev", env_vars={}, sources_json="{}") @@ -92,6 +80,7 @@ def test_fingerprint_py_changes_with_source_and_deps(): assert fp1 != fp2 +@pytest.mark.unit def test_get_function_source_fallback(monkeypatch): # Force inspect.getsource to fail to exercise fallback path def boom(_): diff --git a/tests/unit/test_seeding_unit.py b/tests/unit/test_seeding_unit.py index e298fff..ecec419 100644 --- a/tests/unit/test_seeding_unit.py +++ b/tests/unit/test_seeding_unit.py @@ -4,6 +4,7 @@ import textwrap from pathlib import Path from types import SimpleNamespace +from typing import Any from unittest.mock import MagicMock import pandas as pd @@ -38,7 +39,7 @@ def test_read_seed_file_unsupported(tmp_path: Path): @pytest.mark.unit def test_apply_schema_happy(): df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "age": [10, 20]}) - schema_cfg = { + cfg_raw = { "dtypes": { "users": { "name": "string", @@ -46,9 +47,9 @@ def test_apply_schema_happy(): } } } + schema_cfg = seeding.SeedsSchemaConfig.model_validate(cfg_raw) out = seeding._apply_schema(df, "users", schema_cfg) - # 'name' should be string dtype assert str(out.dtypes["name"]).startswith("string") assert str(out.dtypes["age"]) in ("int64", "Int64") @@ -56,8 +57,9 @@ def test_apply_schema_happy(): @pytest.mark.unit def test_apply_schema_ignores_missing_table_key(): df = pd.DataFrame({"id": [1]}) - out = seeding._apply_schema(df, "other", {"dtypes": {"users": {"id": "int64"}}}) - # unchanged + cfg_raw = {"dtypes": {"users": {"id": "int64"}}} + schema_cfg = seeding.SeedsSchemaConfig.model_validate(cfg_raw) + out = seeding._apply_schema(df, "other", schema_cfg) assert out.equals(df) @@ -65,9 +67,9 
@@ def test_apply_schema_ignores_missing_table_key(): def test_apply_schema_soft_fails_on_bad_cast(): df = pd.DataFrame({"id": ["x"]}) # force bad cast - cfg = {"dtypes": {"t": {"id": "int64"}}} - out = seeding._apply_schema(df, "t", cfg) - # should not raise and should still have the row + cfg_raw = {"dtypes": {"t": {"id": "int64"}}} + schema_cfg = seeding.SeedsSchemaConfig.model_validate(cfg_raw) + out = seeding._apply_schema(df, "t", schema_cfg) assert len(out) == 1 @@ -94,6 +96,18 @@ def test_qualify_unqualified_with_schema(): assert out == '"raw"."users"' +@pytest.mark.unit +def test_qualify_with_schema_and_catalog(): + out = seeding._qualify("users", "raw", "cat") + assert out == '"cat"."raw"."users"' + + +@pytest.mark.unit +def test_qualify_with_catalog_only(): + out = seeding._qualify("users", None, "cat") + assert out == '"cat"."users"' + + @pytest.mark.unit def test_qualify_already_qualified_preserves_parts(): out = seeding._qualify("raw.users", None) @@ -106,7 +120,7 @@ def test_qualify_already_qualified_preserves_parts(): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_warehouse_base_local(tmp_path: Path): fake_spark = SimpleNamespace( conf=SimpleNamespace(get=lambda key, default=None: str(tmp_path / "wh")) @@ -116,14 +130,14 @@ def test_spark_warehouse_base_local(tmp_path: Path): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_warehouse_base_remote_scheme(): fake_spark = SimpleNamespace(conf=SimpleNamespace(get=lambda *_: "s3://bucket/warehouse")) assert seeding._spark_warehouse_base(fake_spark) is None @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_table_location_strips_catalog(tmp_path: Path): # warehouse dir is local fake_spark = SimpleNamespace(conf=SimpleNamespace(get=lambda *_: str(tmp_path / "wh"))) @@ -188,7 +202,7 @@ def fake_echo(msg: str) -> None: @pytest.mark.unit def test_engine_name_from_executor_spark(): ex = 
SimpleNamespace(spark=object()) - assert seeding._engine_name_from_executor(ex) == "spark" + assert seeding._engine_name_from_executor(ex) == "databricks_spark" @pytest.mark.unit @@ -224,7 +238,7 @@ def test_seed_id_nested(tmp_path: Path): @pytest.mark.unit def test_resolve_schema_and_table_by_cfg_priority_engine_override(): - schema_cfg = { + schema_cfg_raw = { "targets": { "raw/users": { "schema": "raw", @@ -239,6 +253,8 @@ def test_resolve_schema_and_table_by_cfg_priority_engine_override(): # executor pretending to be postgres ex = SimpleNamespace(engine=SimpleNamespace(dialect=SimpleNamespace(name="postgres"))) + schema_cfg = seeding.SeedsSchemaConfig.model_validate(schema_cfg_raw) + schema, table = seeding._resolve_schema_and_table_by_cfg( seed_id="raw/users", stem="users", @@ -341,7 +357,7 @@ def test_handle_sqlalchemy_returns_false_if_engine_not_sqlalchemy(): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_handle_spark_happy_default_table(tmp_path: Path, monkeypatch): # fake spark with local warehouse fake_spark = MagicMock() @@ -374,21 +390,39 @@ def test_handle_spark_happy_default_table(tmp_path: Path, monkeypatch): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_handle_spark_uses_seed_storage(monkeypatch): - # storage override set to custom path + # 1) Seed-Storage Override setzen storage.set_seed_storage( {"raw.users": {"path": "/tmp/custom", "format": "parquet", "options": {"x": "1"}}} ) + # 2) Fake Spark + DataFrame fake_spark = MagicMock() fake_sdf = MagicMock() fake_spark.createDataFrame.return_value = fake_sdf + writer = MagicMock() fake_sdf.write.mode.return_value = writer writer.format.return_value = writer writer.options.return_value = writer + # 3) spark_write_to_path stubben, damit kein echtes FS angefasst wird + called: dict[str, Any] = {} + + def _fake_write_to_path( + spark, identifier, df, *, storage: dict, default_format, default_options + ): + called["spark"] = spark + 
called["identifier"] = identifier + called["df"] = df + called["storage"] = storage + called["default_format"] = default_format + called["default_options"] = default_options + + monkeypatch.setattr(seeding.storage, "spark_write_to_path", _fake_write_to_path) + + # 4) Executor-Stub executor = SimpleNamespace( spark=fake_spark, spark_table_format=None, @@ -396,15 +430,17 @@ def test_handle_spark_uses_seed_storage(monkeypatch): ) df = pd.DataFrame({"id": [1]}) - # name must match our storage key + + # 5) Aufruf handled = seeding._handle_spark("raw.users", df, executor, schema=None) assert handled is True - # since we used storage override, it should have called storage.spark_write_to_path - # easiest: monkeypatch seeding.storage.spark_write_to_path and assert - # but here we can assert spark.sql got a DROP TABLE? no, path → register only - # so instead let's just check that createDataFrame was called (path flow runs too) - fake_spark.createDataFrame.assert_called_once() + # 6) Asserts + fake_spark.createDataFrame.assert_called_once_with(df) + assert called["spark"] is fake_spark + assert called["identifier"] == "raw.users" + assert called["storage"]["path"] == "/tmp/custom" + assert called["storage"]["format"] == "parquet" # --------------------------------------------------------------------------- diff --git a/tests/unit/test_sources_parser.py b/tests/unit/test_sources_parser.py deleted file mode 100644 index 3ecf1ed..0000000 --- a/tests/unit/test_sources_parser.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations - -import yaml - -from fastflowtransform.core import _parse_sources_yaml, resolve_source_entry - - -def test_parse_sources_and_overrides_merge(): - doc = """version: 2 - -sources: - - name: raw - schema: staging - overrides: - databricks_spark: - schema: bronze - tables: - - name: users - identifier: seed_users - overrides: - databricks_spark: - format: delta - location: "/mnt/delta/raw/users" -""" - - parsed = 
_parse_sources_yaml(yaml.safe_load(doc)) - entry = parsed["raw"]["users"] - - default_cfg = resolve_source_entry(entry, "duckdb") - assert default_cfg["identifier"] == "seed_users" - assert default_cfg["schema"] == "staging" - assert default_cfg["location"] is None - - spark_cfg = resolve_source_entry(entry, "databricks_spark") - assert spark_cfg["schema"] == "bronze" - assert spark_cfg["format"] == "delta" - assert spark_cfg["location"] == "/mnt/delta/raw/users" - - -def test_wildcard_override_applied_before_engine_specific(): - doc = """version: 2 - -sources: - - name: crm - tables: - - name: users - identifier: seed_users - overrides: - "*": - schema: shared - postgres: - schema: analytics -""" - - parsed = _parse_sources_yaml(yaml.safe_load(doc)) - entry = parsed["crm"]["users"] - - duck_cfg = resolve_source_entry(entry, "duckdb") - assert duck_cfg["schema"] == "shared" - - pg_cfg = resolve_source_entry(entry, "postgres") - assert pg_cfg["schema"] == "analytics" - - -def test_missing_identifier_falls_back_to_table_name(): - doc = """version: 2 - -sources: - - name: ext - tables: - - name: events - overrides: - postgres: - schema: external -""" - - parsed = _parse_sources_yaml(yaml.safe_load(doc)) - entry = parsed["ext"]["events"] - - pg_cfg = resolve_source_entry(entry, "postgres", default_identifier="events") - assert pg_cfg["identifier"] == "events" diff --git a/tests/unit/test_storage_unit.py b/tests/unit/test_storage_unit.py index 5074da7..c9f54c9 100644 --- a/tests/unit/test_storage_unit.py +++ b/tests/unit/test_storage_unit.py @@ -154,7 +154,7 @@ def test_get_seed_storage_exact_and_last_part(): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_write_to_path_happy(tmp_path: Path, monkeypatch): # fake spark + df.write chain fake_spark = MagicMock() @@ -168,6 +168,12 @@ def test_spark_write_to_path_happy(tmp_path: Path, monkeypatch): # .options(...) 
→ writer writer.options.return_value = writer + def _save_side_effect(path_str: str): + p = Path(path_str) + p.mkdir(parents=True, exist_ok=True) + + writer.save.side_effect = _save_side_effect + # storage entry with local path target_dir = tmp_path / "out" storage_meta = { @@ -196,7 +202,12 @@ def test_spark_write_to_path_happy(tmp_path: Path, monkeypatch): writer.options.assert_called_once_with(mergeSchema="true", compression="snappy") # 3) save() called with path - writer.save.assert_called_once_with(str(target_dir)) + assert writer.save.call_count == 1 + tmp_save_path = writer.save.call_args[0][0] + assert ".ff_tmp_" in tmp_save_path + assert str(target_dir.parent) in tmp_save_path + + assert target_dir.exists() # 4) create table ... location ... # fmt is known → USING parquet @@ -207,7 +218,7 @@ def test_spark_write_to_path_happy(tmp_path: Path, monkeypatch): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_write_to_path_without_format_uses_default(tmp_path: Path): fake_spark = MagicMock() fake_df = MagicMock() @@ -215,6 +226,8 @@ def test_spark_write_to_path_without_format_uses_default(tmp_path: Path): fake_df.write.mode.return_value = writer writer.format.return_value = writer + writer.save.side_effect = lambda p: Path(p).mkdir(parents=True, exist_ok=True) + target_dir = tmp_path / "x" storage_meta = { "path": str(target_dir), @@ -234,7 +247,7 @@ def test_spark_write_to_path_without_format_uses_default(tmp_path: Path): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_write_to_path_requires_path(): fake_spark = MagicMock() fake_df = MagicMock() @@ -252,7 +265,7 @@ def test_spark_write_to_path_requires_path(): @pytest.mark.unit -@pytest.mark.spark +@pytest.mark.databricks_spark def test_spark_write_to_path_rejects_empty_identifier(tmp_path: Path): fake_spark = MagicMock() fake_df = MagicMock() diff --git a/tests/unit/test_testing_unit.py b/tests/unit/test_testing_unit.py index 
96c4a70..f4330e0 100644 --- a/tests/unit/test_testing_unit.py +++ b/tests/unit/test_testing_unit.py @@ -5,13 +5,12 @@ import pytest -from fastflowtransform.testing import ( +from fastflowtransform.testing.base import ( TestFailure, _exec, _fail, _pretty_sql, _scalar, - _sql_list, accepted_values, freshness, greater_equal, @@ -22,6 +21,7 @@ reconcile_equal, reconcile_ratio_within, row_count_between, + sql_list, unique, ) @@ -67,9 +67,9 @@ def test_pretty_sql_sequence(): @pytest.mark.unit def test_sql_list_various_types(): - assert _sql_list([1, 2, 3]) == "1, 2, 3" - assert _sql_list(["a", "b"]) == "'a', 'b'" - assert _sql_list([None, "O'Reilly"]) == "NULL, 'O''Reilly'" + assert sql_list([1, 2, 3]) == "1, 2, 3" + assert sql_list(["a", "b"]) == "'a', 'b'" + assert sql_list([None, "O'Reilly"]) == "NULL, 'O''Reilly'" # --------------------------------------------------------------------------- @@ -241,7 +241,7 @@ def execute(self, sql): return _FakeResult([]) con = FakeCon() - assert accepted_values(con, "tbl", "col", values=["a", "b"]) is True + accepted_values(con, "tbl", "col", values=["a", "b"]) assert con.calls == 1 diff --git a/tests/testing/test_accepted_values_unit.py b/tests/unit/testing/test_accepted_values_unit.py similarity index 52% rename from tests/testing/test_accepted_values_unit.py rename to tests/unit/testing/test_accepted_values_unit.py index 4de3492..4e94e4e 100644 --- a/tests/testing/test_accepted_values_unit.py +++ b/tests/unit/testing/test_accepted_values_unit.py @@ -1,17 +1,17 @@ import pytest from fastflowtransform.executors.duckdb_exec import DuckExecutor -from fastflowtransform.testing import TestFailure, accepted_values +from fastflowtransform.testing.base import TestFailure, accepted_values +@pytest.mark.unit def test_accepted_values_pass_and_fail(): ex = DuckExecutor(":memory:") ex.con.execute("create table t(id int, email varchar)") ex.con.execute("insert into t values (1,'a@example.com'),(2,'b@example.com')") # Pass - assert 
accepted_values(ex.con, "t", "email", values=["a@example.com", "b@example.com"]) is True + accepted_values(ex.con, "t", "email", values=["a@example.com", "b@example.com"]) + # Fail - ex.con.execute("insert into t values (3,'bad@example.com')") - with pytest.raises(TestFailure) as e: - accepted_values(ex.con, "t", "email", values=["a@example.com", "b@example.com"]) - assert "outside accepted set" in str(e.value) + with pytest.raises(TestFailure): + accepted_values(ex.con, "t", "email", values=["a@example.com"]) diff --git a/uv.lock b/uv.lock index 86e4660..9e1b983 100644 --- a/uv.lock +++ b/uv.lock @@ -680,6 +680,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, ] +[[package]] +name = "delta-spark" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "pyspark" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/fd/37a0c2ee6fbf4dcbf034178dcb978ba586d37dc9c95214b90416d1042735/delta-spark-4.0.0.tar.gz", hash = "sha256:39325d76d6e5d8b5fea9d47827c59f5e0674844cb95f97b2cc613bfc5209c7ec", size = 35026, upload-time = "2025-06-06T01:41:46.286Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/d8/265a93d22ae79262cdff701496a6f5676926a342153f3855ae6060430660/delta_spark-4.0.0-py3-none-any.whl", hash = "sha256:4e4ded07bb9ee4f6a0df45606d84395239d4b82001e765a627fecc1e914f3029", size = 39756, upload-time = "2025-06-06T01:41:44.815Z" }, +] + [[package]] name = "distlib" version = "0.4.0" @@ -720,10 +733,11 @@ wheels = [ [[package]] name = "fastflowtransform" -version = "0.5.1" +version = "0.5.15" source = { editable = "." 
} dependencies = [ { name = "bigframes" }, + { name = "delta-spark" }, { name = "duckdb" }, { name = "google-cloud-bigquery" }, { name = "httpx" }, @@ -766,6 +780,7 @@ docs = [ [package.metadata] requires-dist = [ { name = "bigframes", specifier = ">=2.24.0" }, + { name = "delta-spark", specifier = ">=4.0.0" }, { name = "duckdb", specifier = ">=1.0" }, { name = "google-cloud-bigquery", specifier = ">=3.25" }, { name = "httpx", specifier = ">=0.28.1" }, @@ -1232,6 +1247,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -1241,6 +1258,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -1248,6 +1267,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = 
"sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -1396,6 +1417,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 
70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, +] + [[package]] name = "iniconfig" version = "2.1.0" @@ -3617,3 +3650,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/b7/503c98092fb3b344a179579f55814b613c1fbb1c23b3ec14a7b008a66a6e/yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1", size = 85171, upload-time = "2025-10-06T14:12:16.935Z" }, { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" }, ] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +]