diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7aed4e5..1941c2a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,17 +46,6 @@ jobs: git rev-parse --short HEAD git log -1 --stat - - name: "Debug: ensure pyproject present" - run: | - test -f pyproject.toml || (echo "pyproject.toml fehlt!" && exit 1) - ls -la - - - name: "Debug: Ruff version & scope" - run: | - uv run ruff --version - echo "---- Ruff will lint these files (no cache) ----" - uv run ruff check src tests --no-cache --show-files - - name: Ruff (lint) run: uv run ruff check src tests --no-cache --output-format=github @@ -66,7 +55,54 @@ jobs: - name: Unit tests (fast) env: PYTHONWARNINGS: default - run: uv run pytest -q tests -m unit --maxfail=1 + run: uv run pytest -q tests -m "unit and not (postgres or databricks_spark or bigquery or snowflake)" --maxfail=1 + + # ---------- Engine-specific unit slices (require optional extras) ---------- + unit-matrix: + runs-on: ubuntu-latest + needs: checks + strategy: + fail-fast: false + matrix: + include: + - name: postgres + extra: postgres + marker: "unit and postgres" + - name: databricks_spark + extra: spark + marker: "unit and databricks_spark" + java: true + - name: bigquery + extra: bigquery_bf + marker: "unit and bigquery" + - name: snowflake + extra: snowflake + marker: "unit and snowflake" + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup uv (and Python) + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.12" + enable-cache: true + + - name: Sync deps (dev + extra) + run: uv sync --extra dev --extra ${{ matrix.extra }} --frozen + + - name: Setup Java for Spark + if: matrix.java == true + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '17' + + - name: Run unit tests for engine + env: + PYTHONWARNINGS: default + run: uv run pytest -q tests -m "${{ matrix.marker }}" --maxfail=1 # ---------- Examples: Integration Tests ---------- examples-matrix: @@ 
-75,7 +111,101 @@ jobs: strategy: fail-fast: false matrix: - engine: [duckdb, postgres, databricks_spark] + include: + # DuckDB examples + - engine: duckdb + extra: "" + example: api_demo + env_file: examples/api_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: basic_demo + env_file: examples/basic_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: cache_demo + env_file: examples/cache_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: dq_demo + env_file: examples/dq_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: incremental_demo + env_file: examples/incremental_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: macros_demo + env_file: examples/macros_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: materializations_demo + env_file: examples/materializations_demo/.env.dev_duckdb + # Postgres examples + - engine: postgres + extra: "postgres" + example: api_demo + env_file: examples/api_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: basic_demo + env_file: examples/basic_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: cache_demo + env_file: examples/cache_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: dq_demo + env_file: examples/dq_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: incremental_demo + env_file: examples/incremental_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: macros_demo + env_file: examples/macros_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: materializations_demo + env_file: examples/materializations_demo/.env.dev_postgres + # Spark examples + - engine: databricks_spark + extra: "spark" + example: api_demo + java: true + env_file: examples/api_demo/.env.dev_databricks + - engine: databricks_spark + extra: "spark" + example: basic_demo + java: true + env_file: examples/basic_demo/.env.dev_databricks + - 
engine: databricks_spark + extra: "spark" + example: cache_demo + java: true + env_file: examples/cache_demo/.env.dev_databricks + - engine: databricks_spark + extra: "spark" + example: dq_demo + java: true + env_file: examples/dq_demo/.env.dev_databricks + - engine: databricks_spark + extra: "spark" + example: incremental_demo + java: true + env_file: examples/incremental_demo/.env.dev_databricks + - engine: databricks_spark + extra: "spark" + example: macros_demo + java: true + env_file: examples/macros_demo/.env.dev_databricks + - engine: databricks_spark + extra: "spark" + example: materializations_demo + java: true + env_file: examples/materializations_demo/.env.dev_databricks services: postgres: @@ -102,8 +232,13 @@ jobs: python-version: "3.12" enable-cache: true - - name: Sync deps (dev) - run: uv sync --extra dev --frozen + - name: Sync deps (dev + extra) + run: | + extras="--extra dev" + if [ -n "${{ matrix.extra }}" ]; then + extras="$extras --extra ${{ matrix.extra }}" + fi + uv sync $extras --frozen - name: Setup Java for Spark if: matrix.engine == 'databricks_spark' @@ -113,19 +248,19 @@ jobs: java-version: '17' - name: Run example/integration tests for engine - env: - FF_PG_DSN: postgresql+psycopg://postgres:postgres@localhost:5432/fastflowtransform - FF_PG_SCHEMA: ci_examples run: | - echo "Running integration tests for engine=${{ matrix.engine }}" - case "${{ matrix.engine }}" in - duckdb) - uv run pytest -m "integration and duckdb" --maxfail=1 -q tests - ;; - postgres) - uv run pytest -m "integration and postgres" --maxfail=1 -q tests - ;; - databricks_spark) - uv run pytest -m "integration and databricks_spark" --maxfail=1 -q tests - ;; - esac + echo "Running integration tests for engine=${{ matrix.engine }} example=${{ matrix.example }}" + set -a + unset FF_PG_DSN FF_PG_SCHEMA + if [ -f "${{ matrix.env_file }}" ]; then + source "${{ matrix.env_file }}" + fi + if [ "${{ matrix.engine }}" = "postgres" ]; then + export FF_PG_SCHEMA="${{ 
matrix.example }}" + export FF_PG_DSN="${FF_PG_DSN:-postgresql+psycopg://postgres:postgres@localhost:5432/fastflowtransform}" + fi + set +a + uv run pytest -m "integration and ${{ matrix.engine }}" \ + -vv --show-capture=all --log-cli-level=INFO \ + -k "${{ matrix.example }} and ${{ matrix.engine }}" \ + --maxfail=1 -q tests/integration/examples/test_examples_matrix.py diff --git a/.gitignore b/.gitignore index 35acb1e..7049ad9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Envs & Secrets .env.local .env.*.local +secrets/ # Local DBs / Artifacts *.duckdb @@ -19,6 +20,7 @@ __pycache__/ *.egg-info/ .build/ .eggs/ +.uv-cache/ .DS_Store # Tooling Caches @@ -31,7 +33,8 @@ htmlcov/ # Build Artifacts build/ dist/ -site/dag +**/site/dag/ +**/site/dag/** spark-warehouse metastore_db derby.log diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5658f47..e188b15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,6 @@ repos: pass_filenames: false types_or: [python] - # Optional: dieselben Checks auch beim Push fΓΌrs β€žCI-GefΓΌhlβ€œ - id: prepush-ruff name: ruff (pre-push) entry: uv run ruff check src tests diff --git a/Contributing.md b/Contributing.md index ea84ae2..d892432 100644 --- a/Contributing.md +++ b/Contributing.md @@ -61,6 +61,18 @@ pytest -q make demo ``` +For engines behind optional extras, run targeted installs/tests in a matrix (local or CI) to catch import/runtime gaps without pulling every dependency: + +```bash +uv pip install -e .[duckdb] # core +uv pip install -e .[postgres] +uv pip install -e .[bigquery] +uv pip install -e .[bigquery_bf] +uv pip install -e .[spark] +uv pip install -e .[snowflake] +# or uv pip install -e .[full] for an all-in-one sweep +``` + --- ## πŸ§‘β€πŸ€β€πŸ§‘ Code of Conduct diff --git a/Makefile b/Makefile index 3c59f7f..7e9b8cb 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ SHELL := /bin/bash -# Defaults (per CLI ΓΌberschreibbar): make FF_PROJECT=examples/postgres 
FF_ENV=stg FF_PROJECT ?= examples/simple_duckdb FF_DB ?= $(FF_PROJECT)/.local/demo.duckdb FF_ENV ?= dev diff --git a/Makefile.pipeline b/Makefile.pipeline index 4b82eb8..a628cfe 100644 --- a/Makefile.pipeline +++ b/Makefile.pipeline @@ -43,7 +43,7 @@ demo: seed run dag demo-open test clean: rm -rf .local "$(FF_PROJECT)/docs" dist build *.egg-info -# --- Cache demos (v0.3) --- +# --- Cache demos --- cache_rw_first: # first run writes cache and meta diff --git a/README.md b/README.md index b0d43e9..71d8392 100644 --- a/README.md +++ b/README.md @@ -1,215 +1,48 @@ -# FastFlowTransform (PoC 0.5.1) +# FastFlowTransform -[![CI](https://github.com///actions/workflows/ci.yml/badge.svg)](https://github.com///actions/workflows/ci.yml) +[![CI](https://github.com/MirrorsAndMisdirections/FastFlowTransform/actions/workflows/ci.yml/badge.svg)](https://github.com/MirrorsAndMisdirections/FastFlowTransform/actions/workflows/ci.yml) [![PyPI version](https://img.shields.io/pypi/v/fastflowtransform.svg)](https://pypi.org/project/fastflowtransform/) -> ⚠️ **Project status:** early proof-of-concept. Stable enough for demos and smaller workflows. Public APIs may still change. +FastFlowTransform (FFT) is a SQL + Python data modeling engine with a deterministic DAG, level-wise parallelism, optional caching, incremental builds, auto-docs, and built-in data-quality tests. Projects are plain directories containing models, seeds, and YAML config; the `fft` CLI handles compilation, execution, docs, and validation across multiple execution engines. ---- - -## Table of Contents - -- [Overview](#overview) -- [Key Features](#key-features) -- [Requirements](#requirements) -- [Installation](#installation) -- [Quickstart](#quickstart) -- [Documentation](#documentation) -- [Contributing](#contributing) -- [License](#license) - ---- - -## Overview - -FastFlowTransform combines SQL and Python models in a lightweight DAG engine. 
A project is simply a directory with models, optional seeds, and configuration. The CLI renders SQL, runs Python models, materialises results, generates HTML documentation, and executes data-quality checks against multiple execution backends. - -> ℹ️ **Project layout & CLI overview** -> Curious about the full folder structure, Makefile targets, or example models? See the *Project Layout* and related sections in the [User Guide](docs/Technical_Overview.md#project-layout). - ---- - -## Key Features - -- **Polyglot modelling:** build transformation nodes in SQL (`*.ff.sql`) or Python (`*.ff.py`) and wire them together with `ref()`/`source()` and `deps=[...]`. -- **Multiple executors:** DuckDB (local default), Postgres, BigQuery (classic + BigFrames), Databricks Spark, and Snowflake Snowpark are supported via pluggable executors. -- **Deterministic DAG:** dependencies are resolved statically; `fft dag` renders either Mermaid source or a ready-to-view HTML mini site. -- **Data quality built in:** configure checks such as `not_null`, `unique`, `row_count_between`, `greater_equal`, `non_negative_sum`, and `freshness` in `project.yml`. -- **Environment-aware configuration:** `profiles.yml` plus environment variables (`FF_*`) drive executor settings; CLI flags can override at runtime. -- **Seeds & docs:** `fft seed` loads CSV/Parquet seeds, and `fft dag --html` produces browsable documentation for every model. - ---- +## Highlights +- SQL or Python models (`*.ff.sql` / `*.ff.py`) wired with `ref()` / `source()` / `deps=[...]`. +- Executors for DuckDB, Postgres, BigQuery (pandas + BigFrames), Databricks/Spark, and Snowflake Snowpark. +- Level-wise parallel scheduler with cache fingerprints, rebuild flags, and state/result selectors. +- Incremental and materialized models with engine-specific merge/append hooks. +- Tests everywhere: schema/YAML checks, reconciliation rules, and fast model unit tests (`fft utest`). 
+- Docs on demand: `fft dag --html` and `fft docgen` generate a browsable site plus JSON artifacts; optional `sync-db-comments` to push descriptions to Postgres/Snowflake. +- HTTP helpers for Python models (`fastflowtransform.api.http`) and Jinja macros/config for templating. ## Requirements - -- Python **3.12+** -- Optional client libraries per executor (e.g. `google-cloud-bigquery`, `snowflake-snowpark-python`, `pyspark`, appropriate database drivers). Install only what you need for your chosen backend. - ---- - -## Installation - -```bash -python -m pip install --upgrade pip -pip install -e . -# Optional: install pre-commit hooks -pip install pre-commit -pre-commit install -``` - -You can also bootstrap everything with the provided Makefile: - -```bash -make install # upgrades pip + installs FastFlowTransform in editable mode -``` - ---- - -## Quickstart - -### Project skeleton (optional) - -```bash -fft init ./demo_project --engine duckdb -``` - -`fft init` generates a non-interactive skeleton (no demo models) and adds inline comments pointing to the relevant documentation pages. - -> πŸ“š **Read more… CLI-Details** -> For flag referencees, automatization and backgrounds see [`docs/Technical_Overview.md`](docs/Technical_Overview.md#cli-flows). - -Run the end-to-end DuckDB demo (seed β†’ run β†’ docs β†’ tests) in under a minute: - -```bash -make demo -``` - -The target project lives in `examples/simple_duckdb`. After the demo finishes you'll find the rendered DAG at `examples/simple_duckdb/site/dag/index.html`. 
Open it via: - -```bash -open examples/simple_duckdb/site/dag/index.html # macOS -xdg-open examples/simple_duckdb/site/dag/index.html # Linux -``` - -If you prefer manual control: - -```bash -fft seed examples/simple_duckdb --env dev -fft run examples/simple_duckdb --env dev -fft dag examples/simple_duckdb --env dev --html -fft test examples/simple_duckdb --env dev --select batch -``` - ---- - -> For a deep dive into the v0.3 features, see **[Parallelism & Cache](docs/Cache_and_Parallelism.md)**. - -## Parallelism & Cache (v0.3) - -FastFlowTransform 0.3 adds a level-wise parallel scheduler and an opt-in build cache. - -### Parallel execution -- DAG is split into **levels** (all nodes with the same maximum distance from sources). -- Within a level, up to `--jobs` nodes run **in parallel**. Dependencies are never violated. -- `--keep-going`: tasks already started in a level run to completion, but **subsequent levels won’t start** if any task in the current level fails. - -**Examples** -```bash -# run with 4 workers per level -fft run examples/simple_duckdb --env dev --jobs 4 - -# keep tasks in the current level running even if one fails -fft run examples/simple_duckdb --env dev --jobs 4 --keep-going -``` - -### Cache modes -The cache decides whether a node can be **skipped** when nothing relevant changed. 
- -``` ---cache=off # always build ---cache=rw # default: skip on match; write cache after build ---cache=ro # skip on match; build on miss, but don't write cache ---cache=wo # always build and write cache ---rebuild # ignore cache for selected nodes ---no-cache # alias for --cache=off -``` - -**When is a node skipped?** -FastFlowTransform computes a **fingerprint** from: -- SQL/Python source (rendered SQL or function source) -- environment context (engine, profile name, selected `FF_*` env vars, normalized `sources.yml`) -- **dependency fingerprints** (change upstream ⇒ downstream fingerprint changes) -The node is skipped if the fingerprint matches the on-disk cache **and** the physical relation exists. - -**Examples** -```bash -# first run (build + cache write) -fft run . --env dev --cache=rw - -# second run (no-op if nothing changed) -fft run . --env dev --cache=rw - -# force rebuild of a specific model -fft run . --env dev --cache=rw --rebuild marts_daily.ff - -# diagnose a surprising skip: change an FF_* env var to invalidate fingerprints -FF_DEMO_TOGGLE=1 fft run . --env dev --cache=rw -``` - -**Troubleshooting** -- *“Why did it skip?”* → Compare your last changes: SQL/Python code, `sources.yml`, `FF_*` env vars, profile/engine. Any change alters the fingerprint. -- *“Relation missing but cache says skip”* → FastFlowTransform also checks relation existence; if it was dropped externally, it will **rebuild**. -- *“Parallel tasks interleave logs”* → Logs are serialized via an internal queue to keep lines readable; use `-v`/`-vv` for more detail. - ---- - -## Selective runs - -Use patterns to run only a subgraph. - -- `--select `: builds only targets that match **and their dependencies**. -- `--exclude `: excludes matching targets from the build (deps remain if still required). - -Examples: - fft run . --select marts_daily.ff - fft run .
--exclude 'mart_*' - ---- - -## Rebuild controls - -- `--rebuild` → rebuild **all selected** nodes (ignore cache). -- `--rebuild-only NAME …` → rebuild only the specified nodes (ignore cache). - -These flags compose with `--select/--exclude`. - -Examples: - # Rebuild everything that matches --select - fft run . --select marts_daily.ff --rebuild - - # Rebuild only a specific node - fft run . --rebuild-only marts_daily.ff - ---- - -## Documentation - -- **Documentation hub:** choose your path (operators vs contributors) — see [`docs/index.md`](docs/index.md). -- **User & operator guide:** project layout, CLI usage, troubleshooting tips — see [`docs/Technical_Overview.md`](docs/Technical_Overview.md). -- **Docgen shortcut:** append `--open-source` to `fft docgen ...` to launch the freshly rendered `index.html` immediately; use `--no-schema` when column introspection should be skipped. -- **Modeling reference:** configuration, Jinja helpers, macros — see [`docs/Config_and_Macros.md`](docs/Config_and_Macros.md). -- **API calls in Python models:** [`docs/API_Models.md`](docs/API_Models.md) -- **Database comments sync:** preview database comment updates with `fft sync-db-comments . --env dev --dry-run` before applying them to Postgres or Snowflake. -- **Examples:** runnable demo projects live under `examples/`; - ---- +- Python 3.12+ +- Engine extras installed only as needed (e.g. BigQuery, Snowflake, Spark/Delta, Postgres drivers). The core DuckDB path works out of the box.
+ +## Install & Quickstart +- Pick the engine extras you need (combine as `pkg[a,b]`): + - DuckDB/core: `pip install fastflowtransform` + - Postgres: `pip install fastflowtransform[postgres]` + - BigQuery (pandas): `pip install fastflowtransform[bigquery]` + - BigQuery (BigFrames): `pip install fastflowtransform[bigquery_bf]` + - Databricks/Spark + Delta: `pip install fastflowtransform[spark]` + - Snowflake Snowpark: `pip install fastflowtransform[snowflake]` + - Everything: `pip install fastflowtransform[full]` +- Installation and first run: see `docs/Quickstart.md` (venv + editable install, DuckDB demo, and init walkthrough). +- CLI usage and flags: see `docs/CLI_Guide.md`. +- Makefile shortcut: `make demo` runs the simple DuckDB example end-to-end and opens the DAG (`examples/simple_duckdb`). + +## Docs & examples +- Docs hub: `docs/index.md` or https://fastflowtransform.com. +- Operational guide & architecture: `docs/Technical_Overview.md`. +- Modeling reference & macros: `docs/Config_and_Macros.md`. +- Parallelism, cache, and state selection: `docs/Cache_and_Parallelism.md`, `docs/State_Selection.md`. +- Incremental models: `docs/Incremental.md`. +- Data-quality + YAML tests: `docs/Data_Quality_Tests.md`, `docs/YAML_Tests.md`, `docs/Unit_Tests.md`. +- CLI details and troubleshooting: `docs/CLI_Guide.md`, `docs/Troubleshooting.md`. +- Runnable demos live under `examples/` (basic, materializations, incremental, DQ, macros, cache, env matrix, API, events). ## Contributing - -Issues and pull requests are welcome! Please read [`Contributing.md`](./Contributing.md) for guidelines, development setup, and testing instructions. Sharing minimal reproduction steps plus `fft --version` output greatly speeds up reviews. - ---- +Issues and PRs are welcome. See `Contributing.md` for development setup, testing (`make demo`, `uv run pytest`, `fft utest`), and code-style guidelines. ## License - -FastFlowTransform is licensed under the [Apache License 2.0](./License). 
+Apache 2.0 — see `License.md`. diff --git a/_scripts/concat_docs.py b/_scripts/concat_docs.py index 0b04287..fe59d5c 100644 --- a/_scripts/concat_docs.py +++ b/_scripts/concat_docs.py @@ -131,31 +131,31 @@ def main(): "-d", "--docs-dir", default=DOCS_DIR_DEFAULT, - help="Pfad zum docs-Verzeichnis (Default: docs)", + help="Path to docs directory (Default: docs)", ) - parser.add_argument("-o", "--output", required=True, help="Ausgabedatei (z. B. Combined.md)") + parser.add_argument("-o", "--output", required=True, help="Output file (e.g. Combined.md)") parser.add_argument( "--demote", action="store_true", - help="Headings ab der zweiten Datei um eine Ebene demoten (# -> ##, usw.)", + help="Demote headings starting with the second file by one level (# -> ##, etc.)", ) parser.add_argument( "--exclude", action="append", default=[], - help="Glob-Pattern zum Ausschließen (z. B. 'reference/**'). Mehrfach nutzbar.", + help="Glob pattern to exclude (e.g. 'reference/**'). Can be used multiple times.", ) parser.add_argument( "--no-nav", action="store_true", - help="mkdocs.yml ignorieren und alphabetisch alle .md zusammenfügen", + help="Ignore mkdocs.yml and concatenate all .md alphabetically", ) args = parser.parse_args() project_root = Path(".").resolve() docs_dir = (project_root / args.docs_dir).resolve() if not docs_dir.exists(): - print(f"Fehler: docs-Verzeichnis nicht gefunden: {docs_dir}", file=sys.stderr) + print(f"Error: docs directory not found: {docs_dir}", file=sys.stderr) sys.exit(1) # 1) Order from mkdocs.yml (if not disabled / available) @@ -176,7 +176,7 @@ def main(): seen.add(rel.as_posix()) if not ordered: - print("Keine Markdown-Dateien gefunden.", file=sys.stderr) + print("No Markdown files found.", file=sys.stderr) sys.exit(2) out_path = Path(args.output).resolve() @@ -194,7 +194,7 @@ def main(): out_text = f"# Combined Documentation\n\n" + "\n".join(parts) out_path.write_text(out_text, encoding="utf-8") - print(f"✔️ {len(ordered)} Dateien
zusammengeführt → {out_path}") + print(f"✔️ {len(ordered)} files merged → {out_path}") if __name__ == "__main__": diff --git a/docs/Api_Models.md b/docs/Api_Models.md index 0d93861..8e3e1b6 100644 --- a/docs/Api_Models.md +++ b/docs/Api_Models.md @@ -306,4 +306,4 @@ fft run . --env dev --select dim_countries_from_api --http-cache ro - Technical guide: *Developer Guide – Architecture & Internals* - Unit tests: `tests/api/test_http_*.py` -- Runtime & cache: *Parallelism & Cache (v0.3)* +- Runtime & cache: *Parallelism & Cache* diff --git a/docs/CLI_Guide.md b/docs/CLI_Guide.md index 042e5ed..3c6f327 100644 --- a/docs/CLI_Guide.md +++ b/docs/CLI_Guide.md @@ -9,7 +9,7 @@ FastFlowTransform’s CLI is the entry point for seeding data, running DAGs, gen | `fft seed [--env dev]` | Materialize CSV/Parquet seeds into the configured engine. | | `fft run [--env dev]` | Execute the DAG (obeys cache + parallel flags). | | `fft dag --html` | Render the DAG graph/site for quick inspection. | -| `fft docgen --out site/docs` | Generate the full documentation bundle (graph + model pages + optional JSON). | +| `fft docgen [--out site/docs] [--emit-json path] [--open-source]` | Generate the full documentation bundle (graph + model pages + optional JSON). Default output is `/site/docs`. | | `fft test [--env dev]` | Run schema/data-quality tests defined in `project.yml` or schema YAML files. | | `fft utest ` | Execute unit tests defined under `tests/unit/*.yml`. | | `fft sync-db-comments ` | Push model/column descriptions into Postgres or Snowflake comments.
| diff --git a/docs/Cache_and_Parallelism.md b/docs/Cache_and_Parallelism.md index 479ccec..b8f6439 100644 --- a/docs/Cache_and_Parallelism.md +++ b/docs/Cache_and_Parallelism.md @@ -13,7 +13,7 @@ FastFlowTransform introduces a level-wise parallel scheduler and a build cache d - [Fingerprint Formula](#fingerprint-formula) - [Meta Table Schema](#meta-table-schema) - [CLI Recipes](#cli-recipes) -- [Troubleshooting & FAQ](#troubleshooting--faq) +- [Troubleshooting & FAQ](#troubleshooting-faq) - [Example: simple_duckdb](#example-simple_duckdb) - [Appendix: Environment Inputs](#appendix-environment-inputs) @@ -233,6 +233,6 @@ FF_RUN_DATE=2025-01-01 fft run . --env dev --cache=rw @@ -10,6 +10,7 @@ - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) - [Modeling Reference](./Config_and_Macros.md) - - [Parallelism & Cache (v0.3)](./Cache_and_Parallelism.md) +- [Parallelism & Cache](./Cache_and_Parallelism.md) - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) ```` diff --git a/docs/Config_and_Macros.md b/docs/Config_and_Macros.md index cdcdca3..2ceeac7 100644 --- a/docs/Config_and_Macros.md +++ b/docs/Config_and_Macros.md @@ -1,8 +1,8 @@ -# FastFlowTransform Modeling Reference (v0.1) +# FastFlowTransform Modeling Reference > Authoritative reference for FastFlowTransform’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. -> Works with FastFlowTransform v0.1 (T1–T11). Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. -> **Execution & Cache (v0.3) quick notes** +> Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. +> **Execution & Cache quick notes** > - Parallelism is level-wise; use `fft run --jobs N`. > - Use `--cache={off|ro|rw|wo}` to control skipping behavior. 
> - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. @@ -184,7 +184,7 @@ Call `config()` at the top of SQL models. Python models get the same options via ) }} ``` -Supported keys (v0.1): +Supported keys: | Key | Type | Description | |----------------|-----------------|------------------------------------------------------------------------------| @@ -307,7 +307,7 @@ from {{ ref('users.ff') }}; - Default β†’ materialized as `table`. - `materialized='view'` produces an engine-specific temporary table first, then creates/overwrites a view that selects from it. -- Ephemeral Python models are not supported in v0.1. +- Ephemeral Python models are not supported. --- diff --git a/docs/Quickstart.md b/docs/Quickstart.md index 2973c8b..a06407f 100644 --- a/docs/Quickstart.md +++ b/docs/Quickstart.md @@ -15,12 +15,33 @@ The command is non-interactive, refuses to overwrite existing directories, and l ## 1. Install & bootstrap ```bash -python -m venv .venv -. .venv/bin/activate -pip install -e ./fastflowtransform +python3 -m venv .venv +. .venv/bin/activate # or source .venv/bin/activate +pip install --upgrade pip +pip install -e . # run from the repo root; use `uv pip install --editable .` if you prefer uv fft --help ``` +Choose extras if you target other engines (combine as needed): + +```bash +# Postgres +pip install -e .[postgres] + +# BigQuery (pandas) or BigFrames +pip install -e .[bigquery] # pandas +pip install -e .[bigquery_bf] # BigFrames + +# Databricks/Spark + Delta +pip install -e .[spark] + +# Snowflake Snowpark +pip install -e .[snowflake] + +# Everything +pip install -e .[full] +``` + ## 2. Create project layout ```bash @@ -47,12 +68,19 @@ cat <<'SQL' > demo/models/users.ff.sql select id, email from {{ source('raw', 'users') }} SQL + +cat <<'YAML' > demo/profiles.yml +dev: + engine: duckdb + duckdb: + path: ".local/demo.duckdb" +YAML ``` ## 3. 
Seed static inputs ```bash -fft seed demo --profile dev +fft seed demo --env dev ``` This materializes the CSV into the configured engine (DuckDB by default) using `seed_users` as the physical table. @@ -60,7 +88,7 @@ This materializes the CSV into the configured engine (DuckDB by default) using ` ## 4. Run the pipeline ```bash -fft run demo --cache off +fft run demo --env dev --cache off ``` You should see log lines similar to `βœ“ L01 [DUCK] users.ff`. The resulting table lives in the target schema (`staging` in this example). @@ -80,5 +108,6 @@ You should see log lines similar to `βœ“ L01 [DUCK] users.ff`. The resulting tab - Add `project.yml` for reusable `vars:` and metadata - Explore `fft docs` to generate HTML documentation - Use engine profiles under `profiles.yml` to target Postgres, BigQuery, or Databricks (path-based sources supported via `format` + `location` overrides) +- Render the DAG site for this project: `fft dag demo --env dev --html` (find it under `demo/site/dag/index.html`) Refer to `docs/Config_and_Macros.md` for advanced configuration options. diff --git a/docs/Technical_Overview.md b/docs/Technical_Overview.md index 7612bbf..2da99e8 100644 --- a/docs/Technical_Overview.md +++ b/docs/Technical_Overview.md @@ -1,4 +1,4 @@ -# 🧭 FastFlowTransform – Technical Developer Documentation (v0.4) +# 🧭 FastFlowTransform – Technical Developer Documentation > Status: latest updates from your context dump. This document consolidates project structure, architecture, core APIs, error handling, CLI, examples, and roadmap into a print/git-friendly Markdown. 
> @@ -53,64 +53,7 @@ ### Project Layout -```text -fastflowtransform/ -β”œβ”€β”€ pyproject.toml -β”œβ”€β”€ src/ -β”‚ └── fastflowtransform/ -β”‚ β”œβ”€β”€ __init__.py -β”‚ β”œβ”€β”€ cli.py -β”‚ β”œβ”€β”€ core.py -β”‚ β”œβ”€β”€ dag.py -β”‚ β”œβ”€β”€ docs.py -β”‚ β”œβ”€β”€ errors.py -β”‚ β”œβ”€β”€ settings.py -β”‚ β”œβ”€β”€ seeding.py -β”‚ β”œβ”€β”€ testing.py -β”‚ β”œβ”€β”€ validation.py -β”‚ β”œβ”€β”€ decorators.py # optional, if not kept in core.py -β”‚ β”œβ”€β”€ docs/ -β”‚ β”‚ └── templates/ -β”‚ β”‚ β”œβ”€β”€ index.html.j2 -β”‚ β”‚ └── model.html.j2 -β”‚ β”œβ”€β”€ executors/ -β”‚ β”‚ β”œβ”€β”€ __init__.py -β”‚ β”‚ β”œβ”€β”€ base.py -β”‚ β”‚ β”œβ”€β”€ duckdb_exec.py -β”‚ β”‚ β”œβ”€β”€ postgres_exec.py -β”‚ β”‚ β”œβ”€β”€ bigquery_exec.py # pandas + BigQuery client -β”‚ β”‚ β”œβ”€β”€ bigquery_bf_exec.py # BigQuery DataFrames (bigframes) -β”‚ β”‚ β”œβ”€β”€ databricks_spark_exec.py # PySpark (without pandas) -β”‚ β”‚ └── snowflake_snowpark_exec.py# Snowpark (without pandas) -β”‚ └── streaming/ -β”‚ β”œβ”€β”€ __init__.py -β”‚ β”œβ”€β”€ file_tail.py -β”‚ └── sessionizer.py -β”‚ -β”œβ”€β”€ examples/ -β”‚ β”œβ”€β”€ simple_duckdb/ -β”‚ β”‚ β”œβ”€β”€ models/ -β”‚ β”‚ β”‚ β”œβ”€β”€ users.ff.sql -β”‚ β”‚ β”‚ β”œβ”€β”€ users_enriched.ff.py -β”‚ β”‚ β”‚ β”œβ”€β”€ orders.ff.sql -β”‚ β”‚ β”‚ β”œβ”€β”€ mart_orders_enriched.ff.py -β”‚ β”‚ β”‚ └── mart_users.ff.sql -β”‚ β”‚ β”œβ”€β”€ seeds/ -β”‚ β”‚ β”‚ β”œβ”€β”€ seed_users.csv -β”‚ β”‚ β”‚ └── seed_orders.csv -β”‚ β”‚ β”œβ”€β”€ sources.yml -β”‚ β”‚ β”œβ”€β”€ project.yml -β”‚ β”‚ β”œβ”€β”€ Makefile -β”‚ β”‚ └── .local/demo.duckdb (after make seed/run) -β”‚ └── postgres/ # similar structure if needed -β”‚ -β”œβ”€β”€ tests/ -β”‚ β”œβ”€β”€ conftest.py -β”‚ β”œβ”€β”€ duckdb/ … # end-to-end + unit -β”‚ β”œβ”€β”€ postgres/ … -β”‚ └── streaming/ … -└── README.md -``` +For an up-to-date view, browse the repository tree or run `find . -maxdepth 2` from the root; all examples live under `examples/` with their own READMEs. 
### Example Projects and Seeds @@ -147,17 +90,12 @@ Level-wise parallelism, cache modes, fingerprint formula, and the `_ff_meta` aud ### Roadmap Snapshot -| Version | Content | -|---------|---------------------------------------------------| -| 0.2 | `config(materialized=...)`, Jinja macros, variables | -| 0.3 | Parallel execution, cache | -| 0.4 | Incremental models | -| 0.5 | Streaming connectors (Kafka, S3) | -| 1.0 | Stable API, plugin SDK | - -> See also: feature pyramid & roadmap phases (OSS/SaaS) in the separate document. +Lightweight glance at near-term priorities: ---- +- **Docs & UX:** tighten CLI help/autocomplete, add more cookbook-style recipes. +- **Engines:** polish Databricks/Spark parity (Unity Catalog, Delta) and Snowpark SQL coverage. +- **Reliability:** concurrency/caching hardening and better error surfacing in auto-docs. +- **DX:** richer typing for Python models plus template improvements for examples/macros. ### Cross-Table Reconciliations @@ -293,13 +231,13 @@ class BaseExecutor(ABC): def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: ... ``` -**DuckDB (`duckdb_exec.py`)** +**DuckDB (`duckdb.py`)** - `run_sql(node, env)` renders Jinja (`ref/source`) and executes the SQL. - `_read_relation` loads a table as `DataFrame`; surfaces actionable errors when a dependency is missing. - `_materialize_relation` writes the `DataFrame` as a table (`create or replace table ...`). -**Postgres (`postgres_exec.py`)** +**Postgres (`postgres.py`)** - `_SAConnShim` (compatible with `testing._exec`). - `run_sql` renders SQL and rewrites `CREATE OR REPLACE TABLE` to `DROP + CREATE AS`. @@ -357,41 +295,7 @@ def seed_project(project_dir: Path, executor, schema: Optional[str] = None) -> i ### CLI Implementation -Operational usage lives in [CLI Flows](#cli-flows). This section drills into the Typer command definitions in `cli.py`. 
- -**Commands:** - -- `fft run [--env dev] [--engine ...]` -- `fft dag [--env dev] [--html] [--select ...] [--with-schema/--no-schema]` -- `fft docgen [--env dev] [--out dir] [--emit-json path] [--open-source]` -- `fft test [--env dev] [--select batch|streaming|tag:...]` -- `fft seed [--env dev]` -- `fft sync-db-comments [--env dev] [--dry-run]` -- `fft utest [--env dev] [--cache off|ro|rw] [--reuse-meta]` -- `fft --version` - -**Key components:** - -```python -def _load_project_and_env(project_arg) -> tuple[Path, Environment]: ... -def _resolve_profile(env_name, engine, proj) -> tuple[EnvSettings, Profile]: ... -def _get_test_con(executor: Any) -> Any: ... -``` - -**Test summary (exit 2 on failures):** - -``` -Data Quality Summary -──────────────────── -βœ… not_null users.email (3ms) -❌ unique users.id (2ms) - ↳ users.id has 1 duplicate - -Totals -────── -βœ“ passed: 1 -βœ— failed: 1 -``` +Operational usage lives in [CLI Flows](#cli-flows) and the dedicated [CLI Guide](CLI_Guide.md). For implementation details, see the Typer commands in `src/fastflowtransform/cli/`. --- @@ -436,7 +340,7 @@ from pathlib import Path from jinja2 import Environment, FileSystemLoader from fastflowtransform.core import REGISTRY from fastflowtransform.dag import topo_sort -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor proj = Path("examples/simple_duckdb").resolve() REGISTRY.load_project(proj) diff --git a/docs/examples/API_Demo.md b/docs/examples/API_Demo.md index 1a9d6d3..104c43b 100644 --- a/docs/examples/API_Demo.md +++ b/docs/examples/API_Demo.md @@ -3,7 +3,7 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local data, external APIs, and multiple execution engines. It highlights: - **Hybrid data model**: joins a local seed (`crm.users`) with live user data from JSONPlaceholder. 
-- **Multiple environments**: switch between DuckDB, Postgres, and Databricks Spark using `profiles.yml` + `.env.*`. +- **Multiple environments**: switch between DuckDB, Postgres, Databricks Spark, and BigQuery (pandas or BigFrames client) using `profiles.yml` + `.env.*`. - **HTTP integration**: compare the built-in FastFlowTransform HTTP client (`api_users_http`) with a plain `requests` implementation (`api_users_requests`). - **Offline caching & telemetry**: inspect HTTP snapshots via `run_results.json`. - **Engine-aware registration**: scope Python models via `engine_model` and SQL models via `config(engines=[...])` so only the active engine’s nodes load. @@ -20,7 +20,8 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local 'kind:seed-consumer', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ] ) }} select id, email @@ -28,14 +29,14 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local ``` Consumes `sources.yml β†’ crm.users` (seeded from `seeds/seed_users.csv`). -2. **API enrichment** – two Python implementations under `models/engines/duckdb/`: +2. **API enrichment** – engine-specific Python implementations under `models/engines//`: - `api_users_http.ff.py` uses the built-in HTTP wrapper (`fastflowtransform.api.http.get_df`) with cache/offline support. - `api_users_requests.ff.py` uses raw `requests` for maximum flexibility. - - Wrap engine-specific callables with `engine_model(only="duckdb", ...)` to skip registration when another engine is selected. + - Engine-specific callables are scoped with `engine_model(only=...)` (DuckDB/Postgres/Spark) or `env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": ...}` (BigQuery pandas/BigFrames) to stay isolated per engine. 3. 
**Mart join** – `models/common/mart_users_join.ff.sql` ```sql - {{ config(engines=['duckdb','postgres','databricks_spark']) }} + {{ config(engines=['duckdb','postgres','databricks_spark','bigquery']) }} {% set api_users_model = var('api_users_model', 'api_users_http') %} {% set api_users_refs = { 'api_users_http': ref('api_users_http'), @@ -69,14 +70,28 @@ dev_postgres: postgres: dsn: "{{ env('FF_PG_DSN') }}" db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'api_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true ``` `.env.dev_*` files supply the actual values. `_load_dotenv_layered()` loads them in priority order: repo `.env` β†’ project `.env` β†’ `.env.` β†’ shell overrides (highest priority). Secrets stay out of version control. +### BigQuery specifics + +- Set `ENGINE=bigquery` in the Makefile targets and choose a client via `BQ_FRAME=pandas` or `BQ_FRAME=bigframes` (default). +- Required env vars: `FF_BQ_PROJECT`, `FF_BQ_DATASET` (defaults to `api_demo`), and optionally `FF_BQ_LOCATION`. Uncomment `allow_create_dataset` in `profiles.yml` for first-run convenience. +- BigFrames variants ingest the HTTP payload into a pandas DataFrame, then wrap it as a BigFrames DataFrame (FFT’s `get_df(..., output="bigframes")` is not implemented yet). + ## Makefile Workflow -`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`) and wraps the main commands: +`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`/`bigquery`) and wraps the main commands. For BigQuery, set `BQ_FRAME=pandas|bigframes`: ```make ENGINE ?= duckdb @@ -85,6 +100,14 @@ ifeq ($(ENGINE),duckdb) PROFILE_ENV = dev_duckdb endif ... 
+ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif seed: uv run fft seed "$(PROJECT)" --env $(PROFILE_ENV) @@ -98,6 +121,7 @@ Common targets: |--------------------------|-------------| | `make ENGINE=duckdb seed`| Materialize seeds into DuckDB. | | `make ENGINE=postgres run`| Execute the full pipeline against Postgres. | +| `make ENGINE=bigquery run BQ_FRAME=bigframes`| Run against BigQuery (default BigFrames client; set `BQ_FRAME=pandas` to switch). | | `make dag` | Render documentation (`site/dag/`). | | `make api-run` | Run only API models (uses HTTP cache). | | `make api-offline` | Force offline mode (`FF_HTTP_OFFLINE=1`). | @@ -107,7 +131,7 @@ HTTP tuning parameters (`FF_HTTP_ALLOWED_DOMAINS`, cache dir, timeouts) live in ## End-to-End Demo -1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres` or `ENGINE=databricks_spark` to switch. +1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres`, `ENGINE=databricks_spark`, or `ENGINE=bigquery BQ_FRAME=` to switch. 2. **Seed data**: `make seed` 3. **Run pipeline**: `make run` 4. **Explore docs**: `make dag` β†’ open `examples/api_demo/site/dag/index.html` diff --git a/docs/examples/Basic_Demo.md b/docs/examples/Basic_Demo.md index baea5de..7ba73e2 100644 --- a/docs/examples/Basic_Demo.md +++ b/docs/examples/Basic_Demo.md @@ -1,10 +1,12 @@ # Basic Demo Project -The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, and Databricks Spark. +The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, Databricks Spark, and BigQuery. 
## Why it exists + - **Start small** – demonstrate the minimum folder structure (`seeds/`, `models/`, `profiles.yml`) needed to run `fft`. - **Engine parity** – prove that a single project can target multiple engines by swapping profiles. +- **Cloud & local** – show that the same project runs both on local engines (DuckDB/Postgres/Spark) and in a cloud warehouse (BigQuery). - **Understand outputs** – show where documentation and manifests land after a run. Use it as a sandbox before adding your own sources, macros, or Python models. @@ -13,12 +15,12 @@ Use it as a sandbox before adding your own sources, macros, or Python models. | Path | Purpose | |------|---------| -| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as `crm.users`. | +| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as a physical `seed_users` table in the active engine (schema/dataset depends on the profile). | | `models/staging/users_clean.ff.sql` | Normalizes emails, casts types, and tags the model for all engines. | | `models/marts/mart_users_by_domain.ff.sql` | Aggregates users per email domain and records the first/last signup dates. | -| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models (pandas for DuckDB/Postgres, PySpark for Databricks) selecting the most recent signup per domain from the staging view. | -| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, and `dev_databricks` profiles driven by environment variables. | -| `.env.dev_*` | Template environment files you can `source` per engine. | +| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models selecting the most recent signup per domain from the staging view:
• pandas for DuckDB/Postgres
• PySpark for Databricks
• BigQuery DataFrames (BigFrames) for BigQuery. | +| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, `dev_databricks`, and `dev_bigquery` profiles driven by environment variables. | +| `.env.dev_*` | Template environment files you can `source` per engine (`.env.dev_duckdb`, `.env.dev_postgres`, `.env.dev_databricks`, `.env.dev_bigquery`). | | `Makefile` | One command (`make demo ENGINE=…`) to seed, run, document, test, and preview results. | ## Running the demo @@ -26,24 +28,61 @@ Use it as a sandbox before adding your own sources, macros, or Python models. 1. `cd examples/basic_demo` 2. Choose an engine and export its environment variables: ```bash + # DuckDB set -a; source .env.dev_duckdb; set +a - # swap to .env.dev_postgres or .env.dev_databricks for other engines + + # Postgres + # set -a; source .env.dev_postgres; set +a + + # Databricks Spark + # set -a; source .env.dev_databricks; set +a + + # BigQuery (choose one) + # set -a; source .env.dev_bigquery_pandas; set +a # pandas client + # set -a; source .env.dev_bigquery_bigframes; set +a # BigFrames ``` -3. Execute the full flow: + +3. Execute the full flow for the selected engine: + ```bash + # DuckDB / Postgres / Databricks make demo ENGINE=duckdb + # make demo ENGINE=postgres + # make demo ENGINE=databricks_spark + + # BigQuery (set BQ_FRAME to choose pandas vs bigframes) + # builds into <project>.<dataset>.* + # requires a GCP project, dataset, and credentials (see BigQuery setup docs) + # set profiles.yml → bigquery.allow_create_dataset: true if the dataset should be auto-created + # make demo ENGINE=bigquery BQ_FRAME=bigframes + # make demo ENGINE=bigquery BQ_FRAME=pandas + ``` + + The Makefile runs `fft seed`, `fft run`, `fft dag`, and `fft test`. + + To open the rendered DAG site after a run: + + ```bash + make show ENGINE=duckdb + make show ENGINE=bigquery ``` - The Makefile runs `fft seed`, `fft run`, `fft dag`, `fft test`, and `fft show basic_demo.mart_users_by_domain`.
To preview the Python mart, run `make show ENGINE=duckdb SHOW_MODEL=mart_latest_signup` (or swap `ENGINE` as needed). 4. Inspect artifacts: - - `.fastflowtransform/target/manifest.json` and `run_results.json` - - `site/dag/index.html` for the rendered model graph - - CLI output from `fft show` displaying the aggregated mart -The demo also enables baseline data quality checks in `project.yml`. Running `fft test` (or `make test`) verifies that primary keys remain unique/not-null across `seed_users`, `users_clean`, `mart_users_by_domain`, and the Python mart, while ensuring aggregate metrics such as `user_count` never drop below zero and each domain appears only once in `mart_latest_signup`. + * `.fastflowtransform/target/manifest.json` and `run_results.json` + * `site/dag/index.html` for the rendered model graph + * Use your engine’s client (or `fft run` logs) to inspect the mart outputs + +## Data quality tests + +The demo enables baseline data quality checks in `project.yml`. Running `fft test` (or `make test ENGINE=…`) verifies that: + +* Primary keys remain unique/not-null across: -## Next steps + * `seed_users` + * `users_clean` + * `mart_users_by_domain` + * the Python mart `mart_latest_signup` +* Aggregate metrics such as `user_count` never drop below zero. +* Each email domain appears only once in `mart_latest_signup`. -- Add more CSVs under `seeds/` and declare them in `sources.yml`. -- Create additional staging models so marts can reuse normalized data. -- Introduce Python models or macros mirroring how the API demo scales up. -- Update `.env.dev_*` with real credentials once you connect to shared databases. +These tests run against whatever engine/profile is active β€” including BigQuery, where they execute as standard SQL queries on the configured dataset. 
diff --git a/docs/examples/Cache_Demo.md b/docs/examples/Cache_Demo.md index 7f73250..e6fc7e3 100644 --- a/docs/examples/Cache_Demo.md +++ b/docs/examples/Cache_Demo.md @@ -54,12 +54,15 @@ cd examples/cache_demo make cache_first # builds all nodes, writes cache make cache_second # no-op run (everything skipped) make change_sql # touch a model -> rebuilds dependent mart -make change_seed # change seed -> rebuilds staging + mart +make change_seed # use patches/seed_users_patch.csv -> rebuilds staging + mart (no tracked edits) make change_env # set FF_* env -> invalidates cache globally make change_py # edit py_constants.ff.py -> rebuilds that model make run_parallel # runs entire DAG with 4 workers per level ``` +Seeds stay immutable: `change_seed` assembles a temporary combined copy in `.local/seeds` using +`patches/seed_users_patch.csv`, so the repo stays clean while fingerprints still change. + Inspect results: * `.fastflowtransform/target/run_results.json` – fingerprints, results, timings, HTTP stats @@ -154,7 +157,7 @@ fft run . 
--env dev_duckdb --jobs 4 | First full run | `make cache_first` | All models build, cache written | | No-op run | `make cache_second` | All skipped (no rebuilds) | | Modify SQL | `make change_sql` | Downstream mart rebuilds | -| Add seed row | `make change_seed` | Staging + mart rebuild | +| Add seed row | `make change_seed` | Staging + mart rebuild (temp combined seed from patches/) | | Change env | `make change_env` | All nodes rebuild | | Edit Python constant | `make change_py` | Only that Python model rebuilds | | Warm & offline HTTP cache | `make http_first && make http_offline` | HTTP cache reused, no network | diff --git a/docs/examples/DQ_Demo.md b/docs/examples/DQ_Demo.md index 257b44a..8bbbde0 100644 --- a/docs/examples/DQ_Demo.md +++ b/docs/examples/DQ_Demo.md @@ -49,6 +49,8 @@ examples/dq_demo/ .env.dev_duckdb .env.dev_postgres .env.dev_databricks + .env.dev_bigquery_pandas + .env.dev_bigquery_bigframes Makefile # optional, convenience wrapper around fft commands profiles.yml project.yml @@ -339,6 +341,24 @@ This executes just the cross-table checks, which is handy when you’re iteratin --- +## BigQuery variant (pandas or BigFrames) + +To run the same demo on BigQuery: + +1. Copy `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes` to `.env` and fill in: + ```bash + FF_BQ_PROJECT= + FF_BQ_DATASET=dq_demo + FF_BQ_LOCATION= # e.g., EU or US + GOOGLE_APPLICATION_CREDENTIALS=../secrets/.json # or rely on gcloud / WIF + ``` +2. Run via the Makefile from `examples/dq_demo`: + ```bash + make demo ENGINE=bigquery BQ_FRAME=pandas # or bigframes + ``` + +Both profiles accept `allow_create_dataset` in `profiles.yml` if you want the example to create the dataset automatically. 
+ ## Things to experiment with To understand the tests better, intentionally break the data and re-run `fft test`: diff --git a/docs/examples/Incremental_Demo.md b/docs/examples/Incremental_Demo.md index d974e78..3005b1a 100644 --- a/docs/examples/Incremental_Demo.md +++ b/docs/examples/Incremental_Demo.md @@ -1,6 +1,6 @@ # Incremental, Delta & Iceberg Demo -This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres and Databricks Spark (Parquet, Delta & Iceberg). +This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres, Databricks Spark (Parquet, Delta & Iceberg), and BigQuery (pandas or BigFrames). It is intentionally small and self-contained so you can copy/paste patterns into your own project. @@ -24,6 +24,8 @@ incremental_demo/ .env.dev_postgres .env.dev_databricks_delta .env.dev_databricks_iceberg + .env.dev_bigquery_pandas + .env.dev_bigquery_bigframes Makefile profiles.yml project.yml @@ -44,6 +46,11 @@ incremental_demo/ fct_events_py_incremental.ff.py databricks_spark/ fct_events_py_incremental.ff.py + bigquery/ + pandas/ + fct_events_py_incremental.ff.py + bigframes/ + fct_events_py_incremental.ff.py ``` *Your actual filenames may differ slightly; the concepts are the same.* @@ -71,6 +78,7 @@ The demo revolves around a tiny `events` dataset and three different ways to bui * DuckDB / Postgres: incremental insert/merge in SQL * Databricks Spark: `MERGE INTO` for Delta or Iceberg where available (Spark 4), with a fallback full-refresh strategy for other formats + * BigQuery: pandas- or BigFrames-backed DataFrame models with incremental merge logic handled by the BigQuery executor 4. 
**Iceberg profile for Spark 4** @@ -240,6 +248,7 @@ On subsequent runs, the engine evaluates the `delta.sql` snippet and: * **DuckDB / Postgres**: inserts or merges the resulting rows into the target table * **Databricks Spark**: tries a `MERGE INTO` (Delta) and falls back to a full-refresh if necessary +* **BigQuery**: applies incremental insert/merge logic in SQL via the BigQuery executor --- @@ -313,6 +322,8 @@ Files: models/engines/duckdb/fct_events_py_incremental.ff.py models/engines/postgres/fct_events_py_incremental.ff.py models/engines/databricks_spark/fct_events_py_incremental.ff.py +models/engines/bigquery/pandas/fct_events_py_incremental.ff.py +models/engines/bigquery/bigframes/fct_events_py_incremental.ff.py ``` Each engine variant uses the same logical signature: @@ -550,9 +561,26 @@ FFT_ACTIVE_ENV=dev_postgres fft test . \ --select tag:example:incremental_demo ``` -Packen wΓΌrde ich den Hinweis direkt an die Stelle, wo du schon beschreibst, wie man die Demo auf Databricks startet – also deine aktuelle Sektion: +### BigQuery + +```bash +# pandas +FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft seed . +FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft run . \ + --select tag:example:incremental_demo --select tag:engine:bigquery --cache rw +FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft test . \ + --select tag:example:incremental_demo + +# BigFrames +FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft seed . +FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft run . \ + --select tag:example:incremental_demo --select tag:engine:bigquery --cache rw +FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft test . 
\ + --select tag:example:incremental_demo +``` + +Ensure the service account credentials pointed to by `GOOGLE_APPLICATION_CREDENTIALS` can create/drop tables in the target dataset. -````markdown ### Databricks Spark ```bash diff --git a/docs/examples/Local_Engine_Setup.md b/docs/examples/Local_Engine_Setup.md index 70eb8c2..c75da91 100644 --- a/docs/examples/Local_Engine_Setup.md +++ b/docs/examples/Local_Engine_Setup.md @@ -25,3 +25,185 @@ `FF_DBR_ENABLE_HIVE=1`, `FF_DBR_WAREHOUSE_DIR=examples/api_demo/spark-warehouse`, `FF_DBR_DATABASE=api_demo`. - Switch the physical format by setting `FF_DBR_TABLE_FORMAT` (e.g. `delta`, requires the Delta Lake runtime); extra writer options can be supplied via `profiles.yml → databricks_spark.table_options`. - Ensure your shell loads `.env.dev_databricks` (via `make`, `direnv`, or manual export) and run `make ENGINE=databricks_spark seed run`. + + +The BigQuery section below explains how to prepare a Google Cloud project for the example projects. + +It covers the one-time GCP setup, local configuration (env file and `profiles.yml`), running seeds, models, and tests, and common pitfalls: + +### BigQuery + +#### 1. One-time setup in Google Cloud + +You only need to do this once per project / environment. + +1. **Create (or pick) a GCP project** + + - Go to the *Google Cloud Console* → **IAM & Admin → Create project**. + - Give it a name, e.g. `FFT Basic Demo`, and note the **Project ID**, e.g. `fft-basic-demo`. + - All further steps refer to this project id. + +2. **Enable the BigQuery API** + + - In the console, go to **APIs & Services → Library**. + - Search for **“BigQuery API”** and click **Enable**. + - (Optional but recommended) Also enable **BigQuery Storage API** for faster reads. + +3. **Create a BigQuery dataset** + + - Go to **BigQuery** in the console (left sidebar). + - Make sure your project `fft-basic-demo` is selected. + - Click **“+ Create dataset”**: + - **Dataset ID**: e.g.
`basic_demo` + - **Location type**: choose a **multi-region**, e.g.: + - `EU` or `US` + - Click **Create dataset**. + + ⚠️ **Important:** The dataset **location must match** the location you use in your env (`FF_BQ_LOCATION`). + - If your dataset is in `EU` (multi-region), then `FF_BQ_LOCATION=EU`. + - If the dataset is in a single region like `europe-west3`, use that exact region name. + +4. **Create a service account (for CI / non-interactive use)** + + For local dev you can use your own user credentials (see below), but for CI/CD or shared environments + a service account is better. + + - Go to **IAM & Admin β†’ Service Accounts β†’ Create service account**. + - Name it e.g. `fft-runner`. + - On the **Roles** step, add roles with BigQuery write access, for example: + - `BigQuery Job User` + - `BigQuery Data Editor` + - (Optionally) Restrict to dataset level later if you want stricter permissions. + + Then create a key: + + - Click your service account β†’ **Keys β†’ Add key β†’ Create new key**. + - Select **JSON**, download the file, and store it somewhere safe (e.g. `~/.config/gcloud/fft-sa.json`). + +5. **Authentication options** + + You have two ways to authenticate locally: + + **A) Application Default Credentials via gcloud (easy for dev)** + + ```bash + gcloud auth application-default login + ``` + +This opens a browser, you log in, and Google stores your ADC in +`~/.config/gcloud/application_default_credentials.json`. + +The BigQuery client in `fastflowtransform` will pick this up automatically **as long as** +`FF_BQ_PROJECT` points to a project you have access to. + +**B) Service account key (good for CI)** + +* Put the downloaded JSON key (from step 4) somewhere on disk. 
+ +* Set the environment variable before running `fft`: + + ```bash + export GOOGLE_APPLICATION_CREDENTIALS=/path/to/fft-sa.json + ``` + +* Make sure the service account has at least: + + * `BigQuery Job User` + * `BigQuery Data Editor` + +* Optionally grant `BigQuery Data Viewer` if you’re only reading some tables. + +--- + +#### 2. Local configuration (env + profiles) + +1. **Environment file (`.env.dev_bigquery`)** + + ```env + # BigQuery connection + FF_BQ_PROJECT=fft-basic-demo # your GCP project id + FF_BQ_DATASET=basic_demo # dataset from step 3 + FF_BQ_LOCATION=EU # or europe-west3, US, etc. MUST match dataset location + + # Active fft environment name (must match profiles.yml) + FFT_ACTIVE_ENV=dev_bigquery + ``` + + Load this via `direnv`, `make`, or manual `export`. + +2. **profiles.yml** + + ```yaml + dev_bigquery: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET') }}" + location: "{{ env('FF_BQ_LOCATION') }}" + use_bigframes: true # Python models use BigQuery DataFrames (BigFrames) + ``` + +--- + +#### 3. Running seeds, models, and tests + +* **Seed BigQuery from `seeds/`:** + + ```bash + make ENGINE=bigquery seed + ``` + + This writes all `seeds/*.csv|parquet` to tables under + `${FF_BQ_PROJECT}.${FF_BQ_DATASET}.*`. + +* **Build models:** + + ```bash + make ENGINE=bigquery run + ``` + + * SQL models are executed as BigQuery queries. + * Python models with `only="bigquery"` run via `BigQueryBFExecutor` (BigQuery DataFrames) + and are written back into the same dataset. + +* **Run data-quality tests:** + + ```bash + make ENGINE=bigquery test + ``` + + `fft test` uses the BigQuery shim to run checks like `not_null`, `unique`, + `row_count_between`, `greater_equal`, etc. against + `${FF_BQ_PROJECT}.${FF_BQ_DATASET}.<table>`. + +--- + +#### 4.
Common BigQuery gotchas + +* **Location mismatch** + + * Error like `Location basic_demo does not support this operation` or `Not found: Dataset ...`: + + * Check the **dataset location** in the BigQuery UI. + * Make sure `FF_BQ_LOCATION` is exactly that value (`EU`, `US`, `europe-west3`, …). + * Ensure the executor is initialized with the same location (via `profiles.yml` β†’ `location`). + +* **Permission issues** + + * If you see `accessDenied` or `Permission denied`: + + * Confirm you authenticated (ADC or service account). + * Ensure your user / service account has at least: + + * `BigQuery Job User` + * `BigQuery Data Editor` on the project or dataset. + +* **Dataset not found** + + * Error `Not found: Dataset fft-basic-demo:basic_demo`: + + * Check that the dataset id matches exactly: + + * Project: `fft-basic-demo` + * Dataset: `basic_demo` + * Verify it exists and is in the same project you set in `FF_BQ_PROJECT`. diff --git a/docs/examples/Macros_Demo.md b/docs/examples/Macros_Demo.md index e296473..8dee673 100644 --- a/docs/examples/Macros_Demo.md +++ b/docs/examples/Macros_Demo.md @@ -1,6 +1,6 @@ # Macros Demo -**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark). +**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark, BigQuery). You’ll see reusable SQL helpers, engine-aware SQL generation, and Python functions exposed as Jinja globals/filters. 
--- @@ -20,24 +20,29 @@ examples/macros_demo/ seeds/ seed_users.csv seed_orders.csv - models/ - macros/ - utils.sql - star.sql - macros_py/ - helpers.py - common/ - stg_users.ff.sql - stg_orders.ff.sql - dim_users.ff.sql - fct_user_sales.ff.sql - engines/ - duckdb/ - py_example.ff.py - postgres/ - py_example.ff.py - databricks_spark/ - py_example.ff.py + models/ + macros/ + utils.sql + star.sql + macros_py/ + helpers.py + common/ + stg_users.ff.sql + stg_orders.ff.sql + dim_users.ff.sql + fct_user_sales.ff.sql + engines/ + duckdb/ + py_example.ff.py + postgres/ + py_example.ff.py + databricks_spark/ + py_example.ff.py + bigquery/ + bigframes/ + py_example.ff.py + pandas/ + py_example.ff.py ``` --- diff --git a/examples/_scripts/cleanup_env.py b/examples/_scripts/cleanup_env.py index fe7ae7c..90e7281 100644 --- a/examples/_scripts/cleanup_env.py +++ b/examples/_scripts/cleanup_env.py @@ -89,6 +89,58 @@ def cleanup_postgres(*, dsn: str | None, schema: str | None, dry_run: bool) -> N conn.execute(text(f'CREATE SCHEMA "{schema}"')) +def cleanup_bigquery( + *, + project_id: str | None, + dataset: str | None, + location: str | None, + dry_run: bool, +) -> None: + """ + Reset a BigQuery demo dataset by dropping and recreating it. + + Reads project/dataset/location from args/env/profile; this is meant for + isolated demo datasets (like fft-basic-demo.basic_demo), not shared prod. 
+ """ + if not project_id: + raise ValueError("BigQuery cleanup requires FF_BQ_PROJECT or --bq-project") + if not dataset: + raise ValueError("BigQuery cleanup requires FF_BQ_DATASET or --bq-dataset") + + from google.cloud import bigquery # local import so other engines don't require it + + client = bigquery.Client(project=project_id, location=location) + full_id = f"{project_id}.{dataset}" + + if dry_run: + _log(f"[dry-run] Would delete and recreate BigQuery dataset {full_id}") + return + + # Try to preserve existing location if not explicitly given + ds_location = location + try: + ds = client.get_dataset(full_id) + if not ds_location: + ds_location = ds.location + except Exception: + # Dataset may not exist yet – that's fine, we'll just create it below. + pass + + _log(f"Deleting BigQuery dataset {full_id} (if exists, with contents)") + client.delete_dataset( + full_id, + delete_contents=True, + not_found_ok=True, + ) + + ds_obj = bigquery.Dataset(full_id) + if ds_location: + ds_obj.location = ds_location + + _log(f"Recreating BigQuery dataset {full_id} (location={ds_location or 'default'})") + client.create_dataset(ds_obj, exists_ok=True) + + def _env_flag(name: str, default: bool = False) -> bool: val = os.getenv(name) if val is None: @@ -234,7 +286,7 @@ def _load_profile(project: Path, env_name: str, engine: str | None): def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Reset FastFlowTransform example environments.") parser.add_argument( - "--engine", required=True, choices=["duckdb", "postgres", "databricks_spark"] + "--engine", required=True, choices=["duckdb", "postgres", "databricks_spark", "bigquery"] ) parser.add_argument("--project", default=".") parser.add_argument("--env", help="Profile environment name (e.g. dev_duckdb).") @@ -249,6 +301,9 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument( "--spark-use-hive", action="store_true", help="Force Hive metastore enablement for cleanup." 
) + parser.add_argument("--bq-project", help="Override BigQuery project ID (FF_BQ_PROJECT).") + parser.add_argument("--bq-dataset", help="Override BigQuery dataset (FF_BQ_DATASET).") + parser.add_argument("--bq-location", help="Override BigQuery location (FF_BQ_LOCATION).") parser.add_argument("--dry-run", action="store_true") parser.add_argument( "--skip-artifacts", @@ -262,7 +317,11 @@ def main(argv: list[str] | None = None) -> int: env_name = ( args.env or os.getenv("FFT_ACTIVE_ENV") - or ("dev_" + args.engine if args.engine in {"duckdb", "postgres"} else "dev") + or ( + "dev_" + args.engine + if args.engine in {"duckdb", "postgres", "databricks_spark", "bigquery"} + else "dev" + ) ) os.environ["FFT_ACTIVE_ENV"] = env_name @@ -314,6 +373,22 @@ def main(argv: list[str] | None = None) -> int: use_hive=args.spark_use_hive or bool(profile_use_hive), dry_run=args.dry_run, ) + elif args.engine == "bigquery": + profile_bq = getattr(profile, "bigquery", None) if profile else None + profile_project = getattr(profile_bq, "project", None) if profile_bq else None + profile_dataset = getattr(profile_bq, "dataset", None) if profile_bq else None + profile_location = getattr(profile_bq, "location", None) if profile_bq else None + + project_id = args.bq_project or os.getenv("FF_BQ_PROJECT") or profile_project + dataset = args.bq_dataset or os.getenv("FF_BQ_DATASET") or profile_dataset + location = args.bq_location or os.getenv("FF_BQ_LOCATION") or profile_location + + cleanup_bigquery( + project_id=project_id, + dataset=dataset, + location=location, + dry_run=args.dry_run, + ) except Exception as exc: _log(f"Cleanup failed: {exc}") return 1 diff --git a/examples/api_demo/.env.dev_bigquery_bigframes b/examples/api_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..e3f91f7 --- /dev/null +++ b/examples/api_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,7 @@ +# BigQuery profile for the basic demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=api_demo +FF_BQ_LOCATION=EU 
+ +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/api_demo/.env.dev_bigquery_pandas b/examples/api_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..e3f91f7 --- /dev/null +++ b/examples/api_demo/.env.dev_bigquery_pandas @@ -0,0 +1,7 @@ +# BigQuery profile for the API demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=api_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/api_demo/.env.dev_databricks b/examples/api_demo/.env.dev_databricks index 4c425c0..1d8876d 100644 --- a/examples/api_demo/.env.dev_databricks +++ b/examples/api_demo/.env.dev_databricks @@ -8,4 +8,7 @@ FF_DBR_DATABASE=api_demo # Uncomment to switch to Delta Lake (requires delta-spark dependency) # FF_DBR_TABLE_FORMAT=delta -JAVA_HOME=/opt/homebrew/opt/openjdk@17 +# Prefer an existing JAVA_HOME (e.g., in CI); fall back to the macOS brew path for local use.
+if [ -z "${JAVA_HOME:-}" ] && [ -d "/opt/homebrew/opt/openjdk@17" ]; then + JAVA_HOME=/opt/homebrew/opt/openjdk@17 +fi diff --git a/examples/api_demo/Makefile b/examples/api_demo/Makefile index f6d4da5..ada8869 100644 --- a/examples/api_demo/Makefile +++ b/examples/api_demo/Makefile @@ -12,6 +12,9 @@ UV ?= uv # Engine selector (duckdb|postgres|databricks_spark) ENGINE ?= duckdb +# BigQuery frame type selector (pandas | bigframes) +BQ_FRAME ?= bigframes + # HTTP wrapper defaults (override per call if needed) # Allowed domains are comma-separated (no https://) FF_HTTP_ALLOWED_DOMAINS ?= jsonplaceholder.typicode.com,api.github.com @@ -41,9 +44,20 @@ ifeq ($(ENGINE),databricks_spark) PROFILE_ENV = dev_databricks ENGINE_TAG = engine:databricks_spark endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE=$(ENGINE) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif RUN_ENV = $(BASE_ENV) # Select only API demo models for the active engine (common models carry all engine tags) @@ -57,6 +71,8 @@ else ifeq ($(ENGINE),postgres) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif @@ -108,7 +124,7 @@ demo-open: fi demo: clean - @echo "== πŸš€ API Demo (DuckDB) ==" + @echo "== πŸš€ API Demo ($(ENGINE)) ==" @echo "Profile=$(PROFILE_ENV) DB=$(DB) PROJECT=$(PROJECT)" +$(MAKE) seed +$(MAKE) run 
@@ -153,7 +169,7 @@ api-show-http: fi api-demo: clean - @echo "== 🌐 API Demo (DuckDB) ==" + @echo "== 🌐 API Demo ($(ENGINE)) ==" @echo "Profile=$(PROFILE_ENV) DB=$(DB) PROJECT=$(PROJECT)" +$(MAKE) run +$(MAKE) dag diff --git a/examples/api_demo/models/common/mart_users_join.ff.sql b/examples/api_demo/models/common/mart_users_join.ff.sql index 0197db7..ad6feef 100644 --- a/examples/api_demo/models/common/mart_users_join.ff.sql +++ b/examples/api_demo/models/common/mart_users_join.ff.sql @@ -5,7 +5,8 @@ 'scope:common', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/api_demo/models/common/users.ff.sql b/examples/api_demo/models/common/users.ff.sql index ec68605..b170c5a 100644 --- a/examples/api_demo/models/common/users.ff.sql +++ b/examples/api_demo/models/common/users.ff.sql @@ -6,7 +6,8 @@ 'kind:seed-consumer', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} -- Simple staging table from seed diff --git a/examples/api_demo/models/engines/bigquery/bigframes/api_users_http.ff.py b/examples/api_demo/models/engines/bigquery/bigframes/api_users_http.ff.py new file mode 100644 index 0000000..6fae179 --- /dev/null +++ b/examples/api_demo/models/engines/bigquery/bigframes/api_users_http.ff.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. 
Install fastflowtransform[bigquery_bf]." + ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="api_users_http", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:bigquery"], +) +def fetch(_: BFDataFrame) -> BFDataFrame: + """ + Fetch users via the FFT HTTP helper and return a BigFrames DataFrame. + """ + _get_bigframes() + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + output="bigframes", + ) + return df.loc[:, ["id", "email", "username", "name"]].rename(columns={"id": "api_user_id"}) # type: ignore[arg-type] diff --git a/examples/api_demo/models/engines/bigquery/bigframes/api_users_requests.ff.py b/examples/api_demo/models/engines/bigquery/bigframes/api_users_requests.ff.py new file mode 100644 index 0000000..c098900 --- /dev/null +++ b/examples/api_demo/models/engines/bigquery/bigframes/api_users_requests.ff.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." 
+ ) from exc + return bpd_mod + + +try: + import httpx +except Exception as _e: # pragma: no cover + raise RuntimeError("Please install 'httpx' to run this model") from _e + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:bigquery"], +) +def fetch(_: BFDataFrame) -> BFDataFrame: + """Fetch users via plain httpx and return a BigFrames DataFrame.""" + bpd_mod = _get_bigframes() + resp = httpx.get("https://jsonplaceholder.typicode.com/users", timeout=30.0) + resp.raise_for_status() + df = bpd_mod.DataFrame(resp.json()) # accepts a JSON-serialisable list of dicts + return df.loc[:, ["id", "email", "username", "name"]].rename( # type: ignore[arg-type] + columns={"id": "api_user_id"} + ) diff --git a/examples/api_demo/models/engines/bigquery/pandas/api_users_http.ff.py b/examples/api_demo/models/engines/bigquery/pandas/api_users_http.ff.py new file mode 100644 index 0000000..cd28db5 --- /dev/null +++ b/examples/api_demo/models/engines/bigquery/pandas/api_users_http.ff.py @@ -0,0 +1,26 @@ +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df +import pandas as pd + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="api_users_http", + deps=["users.ff"], # at least one dependency is required by the executor contract + tags=["example:api_demo", "scope:engine", "engine:bigquery"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + """ + Fetch users from a public demo API using the built-in HTTP wrapper (pandas client). 
+ """ + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, # the outer JSON is already a list + normalize=True, # flatten objects to columns (address.*, company.*) + ) + + cols = [c for c in df.columns if c in ("id", "email", "username", "name")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/api_demo/models/engines/bigquery/pandas/api_users_requests.ff.py b/examples/api_demo/models/engines/bigquery/pandas/api_users_requests.ff.py new file mode 100644 index 0000000..aa460d7 --- /dev/null +++ b/examples/api_demo/models/engines/bigquery/pandas/api_users_requests.ff.py @@ -0,0 +1,28 @@ +from fastflowtransform import engine_model +import pandas as pd + +try: + import httpx +except Exception as _e: # pragma: no cover + raise RuntimeError("Please install 'httpx' to run this model") from _e + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:bigquery"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + """Fetch users via plain httpx (pandas client).""" + url = "https://jsonplaceholder.typicode.com/users" + resp = httpx.get(url, timeout=30.0) + resp.raise_for_status() + data = resp.json() + + df = pd.DataFrame(data) + cols = [c for c in df.columns if c in ("id", "email", "username", "name")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py b/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py index d3276e7..e31b3f8 100644 --- a/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py +++ b/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py @@ -1,7 +1,34 @@ +from typing import TYPE_CHECKING, Any + from fastflowtransform import engine_model from fastflowtransform.api.http import get_df -from pyspark.sql import DataFrame as 
SparkDataFrame -from pyspark.sql import SparkSession + +if TYPE_CHECKING: + from pyspark.sql import DataFrame as SparkDataFrame + from pyspark.sql import SparkSession +else: + + class SparkDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + class SparkSession: # pragma: no cover - placeholder for runtime type hints + ... + + +def _ensure_spark_session(users_df: Any) -> "SparkSession": + try: + from pyspark.sql import SparkSession as _SparkSession + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." + ) from exc + + session: _SparkSession | None = getattr(users_df, "sparkSession", None) + if session is None: + session = _SparkSession.getActiveSession() + if session is None: + session = _SparkSession.builder.getOrCreate() + return session @engine_model( @@ -15,14 +42,7 @@ def fetch(users_df: SparkDataFrame) -> SparkDataFrame: Fetch demo users via the FFT HTTP helper and return a Spark DataFrame. Leverages get_df(..., output='spark') to stay entirely in Spark. 
""" - spark = ( - users_df.sparkSession - if isinstance(users_df, SparkDataFrame) - else SparkSession.getActiveSession() - ) - if spark is None: - spark = SparkSession.builder.getOrCreate() - + spark = _ensure_spark_session(users_df) df = get_df( url="https://jsonplaceholder.typicode.com/users", record_path=None, diff --git a/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py b/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py index ad36b18..7ea58df 100644 --- a/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py +++ b/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py @@ -1,11 +1,39 @@ +from typing import TYPE_CHECKING, Any + from fastflowtransform import engine_model -from pyspark.sql import DataFrame as SparkDataFrame -from pyspark.sql import SparkSession + +if TYPE_CHECKING: + from pyspark.sql import DataFrame as SparkDataFrame + from pyspark.sql import SparkSession +else: + + class SparkDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + class SparkSession: # pragma: no cover - placeholder for runtime type hints + ... + + +def _ensure_spark_session(users_df: Any) -> "SparkSession": + try: + from pyspark.sql import SparkSession as _SparkSession + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." 
+ ) from exc + + session: _SparkSession | None = getattr(users_df, "sparkSession", None) + if session is None: + session = _SparkSession.getActiveSession() + if session is None: + session = _SparkSession.builder.getOrCreate() + return session + try: - import requests + import httpx except Exception as _e: # pragma: no cover - raise RuntimeError("Please install 'requests' to run this model") from _e + raise RuntimeError("Please install 'httpx' to run this model") from _e @engine_model( @@ -19,15 +47,8 @@ def fetch(users_df: SparkDataFrame) -> SparkDataFrame: Plain requests-based HTTP fetch that returns a Spark DataFrame. Useful when you need full control over authentication, retries, etc. """ - spark = ( - users_df.sparkSession - if isinstance(users_df, SparkDataFrame) - else SparkSession.getActiveSession() - ) - if spark is None: - spark = SparkSession.builder.getOrCreate() - - resp = requests.get("https://jsonplaceholder.typicode.com/users", timeout=30) + spark = _ensure_spark_session(users_df) + resp = httpx.get("https://jsonplaceholder.typicode.com/users", timeout=30.0) resp.raise_for_status() rows = resp.json() diff --git a/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py b/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py index ec5254e..ddcfeae 100644 --- a/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py +++ b/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py @@ -1,11 +1,11 @@ -# NOTE: Plain Python variant (requests/httpx). No built-in FFT telemetry or HTTP cache here. +# NOTE: Plain Python variant (httpx). No built-in FFT telemetry or HTTP cache here. 
from fastflowtransform import engine_model import pandas as pd try: - import requests # you can swap this with httpx if you prefer + import httpx except Exception as _e: # pragma: no cover - raise RuntimeError("Please install 'requests' to run this model") from _e + raise RuntimeError("Please install 'httpx' to run this model") from _e @engine_model( @@ -25,7 +25,7 @@ def fetch(_: pd.DataFrame) -> pd.DataFrame: # Add your auth headers here if needed: # "Authorization": f"Bearer {os.getenv('MY_TOKEN')}", } - resp = requests.get(url, headers=headers, timeout=30) + resp = httpx.get(url, headers=headers, timeout=30.0) resp.raise_for_status() data = resp.json() # list[dict] diff --git a/examples/api_demo/models/engines/postgres/api_users_requests.ff.py b/examples/api_demo/models/engines/postgres/api_users_requests.ff.py index f67caec..9ebe61d 100644 --- a/examples/api_demo/models/engines/postgres/api_users_requests.ff.py +++ b/examples/api_demo/models/engines/postgres/api_users_requests.ff.py @@ -1,11 +1,11 @@ -# NOTE: Plain Python variant (requests/httpx). No built-in FFT telemetry or HTTP cache here. +# NOTE: Plain Python variant (httpx). No built-in FFT telemetry or HTTP cache here. 
from fastflowtransform import engine_model import pandas as pd try: - import requests # you can swap this with httpx if you prefer + import httpx except Exception as _e: # pragma: no cover - raise RuntimeError("Please install 'requests' to run this model") from _e + raise RuntimeError("Please install 'httpx' to run this model") from _e @engine_model( @@ -25,7 +25,7 @@ def fetch(_: pd.DataFrame) -> pd.DataFrame: # Add your auth headers here if needed: # "Authorization": f"Bearer {os.getenv('MY_TOKEN')}", } - resp = requests.get(url, headers=headers, timeout=30) + resp = httpx.get(url, headers=headers, timeout=30.0) resp.raise_for_status() data = resp.json() # list[dict] diff --git a/examples/api_demo/profiles.yml b/examples/api_demo/profiles.yml index 647fb16..88d7cd5 100644 --- a/examples/api_demo/profiles.yml +++ b/examples/api_demo/profiles.yml @@ -24,3 +24,21 @@ dev_databricks: spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'api_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'api_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + # allow_create_dataset: true # uncomment to auto-create dataset on first run diff --git a/examples/api_demo/site/dag/api_users_http.html b/examples/api_demo/site/dag/api_users_http.html deleted file mode 100644 index ef69b8d..0000000 --- a/examples/api_demo/site/dag/api_users_http.html +++ /dev/null @@ -1,255 +0,0 @@ - - - - - - api_users_http 
– FastFlowTransform - - - -

← Back to overview

- -
-
-

- api_users_http - table -

-
Model Detail • FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
api_users_http
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/engines/databricks_spark/api_users_http.ff.py - -
- -
Dependencies
-
- - - -
- - -
Referenced by
-
- -
- -
-
- - - - -
-

Columns

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
api_user_idbigint - - yes - - - - — - - - - unknown - -
emailstring - - yes - - - - — - - - - unknown - -
usernamestring - - yes - - - - — - - - - unknown - -
namestring - - yes - - - - — - - - - unknown - -
- - - - - - - - - \ No newline at end of file diff --git a/examples/api_demo/site/dag/api_users_requests.html b/examples/api_demo/site/dag/api_users_requests.html deleted file mode 100644 index 429b215..0000000 --- a/examples/api_demo/site/dag/api_users_requests.html +++ /dev/null @@ -1,255 +0,0 @@ - - - - - - api_users_requests – FastFlowTransform - - - -

← Back to overview

- -
-
-

- api_users_requests - table -

-
Model Detail • FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
api_users_requests
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/engines/databricks_spark/api_users_requests.ff.py - -
- -
Dependencies
-
- - - -
- - -
Referenced by
-
- -
- -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
api_user_idbigint - - yes - - - - — - - - - unknown - -
emailstring - - yes - - - - — - - - - unknown - -
namestring - - yes - - - - — - - - - unknown - -
usernamestring - - yes - - - - — - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/api_demo/site/dag/index.html b/examples/api_demo/site/dag/index.html deleted file mode 100644 index 5e75b72..0000000 --- a/examples/api_demo/site/dag/index.html +++ /dev/null @@ -1,272 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - • - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - api_users_http("api_users_http
(api_users_http)") - class api_users_http py; - api_users_requests("api_users_requests
(api_users_requests)") - class api_users_requests py; - mart_users_join_ff["mart_users_join.ff
(mart_users_join)"] - class mart_users_join_ff sql; - users_ff["users.ff
(users)"] - class users_ff sql; - api_users_http --> mart_users_join_ff - api_users_requests --> mart_users_join_ff - users_ff --> mart_users_join_ff - users_ff --> api_users_requests - users_ff --> api_users_http -
-
- - - -
-

Macros

- -

No macros found.

- -
-
- - - - \ No newline at end of file diff --git a/examples/api_demo/site/dag/mart_users_join.ff.html b/examples/api_demo/site/dag/mart_users_join.ff.html deleted file mode 100644 index 6562004..0000000 --- a/examples/api_demo/site/dag/mart_users_join.ff.html +++ /dev/null @@ -1,282 +0,0 @@ - - - - - - mart_users_join.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- mart_users_join.ff - table -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
mart_users_join
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/common/mart_users_join.ff.sql - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
user_idbigint - - yes - - - - — - - - - - ?.id - - direct - - - - -
emailstring - - yes - - - - — - - - - - ?.email - - direct - - - - -
api_user_idbigint - - yes - - - - — - - - - unknown - -
usernamestring - - yes - - - - — - - - - unknown - -
namestring - - yes - - - - — - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/api_demo/site/dag/users.ff.html b/examples/api_demo/site/dag/users.ff.html deleted file mode 100644 index 2915cde..0000000 --- a/examples/api_demo/site/dag/users.ff.html +++ /dev/null @@ -1,227 +0,0 @@ - - - - - - users.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- users.ff - table -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/api_demo/models/common/users.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
idbigint - - yes - - - - — - - - - - ?.id - - direct - - - - -
emailstring - - yes - - - - — - - - - - ?.email - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/basic_demo/.env.dev_bigquery_bigframes b/examples/basic_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..73a3d3e --- /dev/null +++ b/examples/basic_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,7 @@ +# BigQuery profile for the basic demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=basic_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/basic_demo/.env.dev_bigquery_pandas b/examples/basic_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..73a3d3e --- /dev/null +++ b/examples/basic_demo/.env.dev_bigquery_pandas @@ -0,0 +1,7 @@ +# BigQuery profile for the basic demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=basic_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/basic_demo/Makefile b/examples/basic_demo/Makefile index 8f5a0be..7c4c675 100644 --- a/examples/basic_demo/Makefile +++ b/examples/basic_demo/Makefile @@ -9,6 +9,9 @@ UV ?= uv # Engine selector (duckdb|postgres|databricks_spark) ENGINE ?= duckdb +# BigQuery frame type selector (pandas | bigframes) +BQ_FRAME ?= bigframes + # Resolve profile and tags per engine ifeq ($(ENGINE),duckdb) PROFILE_ENV = dev_duckdb @@ -22,8 +25,22 @@ ifeq ($(ENGINE),databricks_spark) PROFILE_ENV = dev_databricks ENGINE_TAG = engine:databricks_spark endif +ifeq ($(ENGINE),bigquery) + # Default tags / profile for BigQuery + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) + +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif -BASE_ENV = 
FFT_ACTIVE_ENV=$(PROFILE_ENV) RUN_ENV = $(BASE_ENV) SELECT_FLAGS = --select tag:example:basic_demo --select tag:$(ENGINE_TAG) @@ -38,6 +55,8 @@ else ifeq ($(ENGINE),postgres) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" else $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark) endif @@ -55,7 +74,7 @@ help: @echo " make demo ENGINE=$(ENGINE)" @echo " make clean ENGINE=$(ENGINE)" @echo - @echo "Variables: DB=$(DB) PROJECT=$(PROJECT) UV=$(UV)" + @echo "Variables: DB=$(DB) PROJECT=$(PROJECT) UV=$(UV) ENGINE=$(ENGINE) BQ_FRAME=$(BQ_FRAME)" seed: env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) diff --git a/examples/basic_demo/README.md b/examples/basic_demo/README.md index 106a814..adf92ec 100644 --- a/examples/basic_demo/README.md +++ b/examples/basic_demo/README.md @@ -1,52 +1,8 @@ # Basic demo -This project is a minimal FastFlowTransform pipeline that works unchanged on DuckDB, Postgres, and Databricks Spark. It ships with: -- `seeds/seed_users.csv` – three sample users that bootstrap the project. -- `models/staging/users_clean.ff.sql` – normalizes emails and signup timestamps. -- `models/marts/mart_users_by_domain.ff.sql` – aggregates users by email domain. -- `models/engines/*/mart_latest_signup.ff.py` – engine-scoped Python models (pandas for DuckDB/Postgres, PySpark for Databricks) that grab the latest signup per domain from the staging view. +Minimal FFT pipeline that runs unchanged on DuckDB, Postgres, Databricks Spark, and BigQuery. -## Quickstart - -1. 
Install the package and CLI (see repository root instructions). -2. `cd examples/basic_demo` (this folder) so relative paths line up. -3. Load one of the provided engine environments, then seed and run the project. - -> ⚠️ `make clean` (or direct calls to `cleanup_env.py`) rely on the same environment variables as the run commands. Always export the `.env.dev_*` file for the engine you are cleaning so paths, schemas, and credentials are available. - -### DuckDB - -```bash -cp .env.dev_duckdb .env.local # optional convenience copy -set -a; source .env.dev_duckdb; set +a # export FF_DUCKDB_PATH -fft seed basic_demo --env dev_duckdb -fft run basic_demo --env dev_duckdb -fft show basic_demo.mart_users_by_domain --env dev_duckdb -fft show basic_demo.mart_latest_signup --env dev_duckdb -``` - -### Postgres - -```bash -cp .env.dev_postgres .env.local # fill in FF_PG_DSN with your credentials -set -a; source .env.dev_postgres; set +a -fft seed basic_demo --env dev_postgres -fft run basic_demo --env dev_postgres -fft show basic_demo.mart_users_by_domain --env dev_postgres -fft show basic_demo.mart_latest_signup --env dev_postgres -``` - -### Databricks Spark (local or hosted) - -```bash -cp .env.dev_databricks .env.local # adjust Spark master / credentials as needed -set -a; source .env.dev_databricks; set +a -fft seed basic_demo --env dev_databricks -fft run basic_demo --env dev_databricks -fft show basic_demo.mart_users_by_domain --env dev_databricks -fft show basic_demo.mart_latest_signup --env dev_databricks -``` - -The resulting tables report user counts per email domain and spotlight the most recent signup per domain. Extend any of the CSV, SQL, or Python assets to explore more complex scenarios. - -Further background is documented in [`docs/examples/Basic_Demo.md`](../../docs/examples/Basic_Demo.md). +## How to use +- See the full walkthrough (env setup, Makefile targets, engine notes, DQ tests) in `docs/examples/Basic_Demo.md`. 
+- From this directory: set the desired `.env.dev_*` (for BigQuery choose `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes`), then run `make demo ENGINE=` (set `BQ_FRAME` to switch BigQuery client) to seed β†’ run β†’ dag β†’ test. +- To inspect results, open `site/dag/index.html` after a run or query the mart tables via your engine client. diff --git a/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py new file mode 100644 index 0000000..bdaa876 --- /dev/null +++ b/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." 
+ ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="mart_latest_signup", + materialized="table", + tags=[ + "example:basic_demo", + "scope:mart", + "engine:bigquery", + ], + deps=["users_clean.ff"], + require={"users_clean.ff": ["user_id", "email", "email_domain", "signup_date"]}, +) +def build(users_clean: BFDataFrame) -> BFDataFrame: + _get_bigframes() + latest = ( + users_clean.sort_values("signup_date", ascending=False) + .drop_duplicates(subset="email_domain") + .loc[:, ["email_domain", "user_id", "email", "signup_date"]] + .rename( + columns={ + "user_id": "latest_user_id", + "email": "latest_email", + "signup_date": "latest_signup_date", + } + ) # type: ignore[arg-type] + ) + return latest diff --git a/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py new file mode 100644 index 0000000..263aba9 --- /dev/null +++ b/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py @@ -0,0 +1,36 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="mart_latest_signup", + materialized="table", + tags=[ + "example:basic_demo", + "scope:mart", + "engine:bigquery", + ], + deps=["users_clean.ff"], + require={"users_clean.ff": ["user_id", "email", "email_domain", "signup_date"]}, +) +def build(users_clean: pd.DataFrame) -> pd.DataFrame: + """Return the latest signup per email domain using pandas (BigQuery).""" + latest = ( + users_clean.sort_values("signup_date", ascending=False) + .drop_duplicates("email_domain") + .loc[:, ["email_domain", "user_id", "email", "signup_date"]] + .rename( + columns={ + "user_id": "latest_user_id", + "email": "latest_email", + "signup_date": "latest_signup_date", + } + ) + .reset_index(drop=True) + ) + return latest 
diff --git a/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py index 0b1a77a..84ec4cb 100644 --- a/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py @@ -1,9 +1,27 @@ -from pyspark.sql import DataFrame -from pyspark.sql import Window -from pyspark.sql import functions as F +from typing import TYPE_CHECKING, Any from fastflowtransform import engine_model +if TYPE_CHECKING: + from pyspark.sql import DataFrame + from pyspark.sql import WindowSpec + from pyspark.sql import functions as F +else: + DataFrame = Any + WindowSpec = Any + F = Any + + +def _get_spark_utils() -> tuple[Any, Any]: + try: + from pyspark.sql import Window as _Window + from pyspark.sql import functions as _F + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." 
+ ) from exc + return _Window, _F + @engine_model( only="databricks_spark", @@ -19,6 +37,7 @@ ) def build(users_clean: DataFrame) -> DataFrame: """Return the latest signup per email domain using PySpark DataFrame operations.""" + Window, F = _get_spark_utils() window = Window.partitionBy("email_domain").orderBy(F.col("signup_date").desc()) latest = ( diff --git a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql index d74c06d..170632c 100644 --- a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql +++ b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql @@ -5,7 +5,8 @@ 'scope:mart', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/basic_demo/models/staging/users_clean.ff.sql b/examples/basic_demo/models/staging/users_clean.ff.sql index 3795e73..b91afbb 100644 --- a/examples/basic_demo/models/staging/users_clean.ff.sql +++ b/examples/basic_demo/models/staging/users_clean.ff.sql @@ -5,7 +5,8 @@ 'scope:staging', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml index 514028f..f0d88bc 100644 --- a/examples/basic_demo/profiles.yml +++ b/examples/basic_demo/profiles.yml @@ -25,3 +25,21 @@ dev_databricks: spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'basic_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + # allow_create_dataset: true # uncomment to auto-create dataset on first run + 
+dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'basic_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + # allow_create_dataset: true # uncomment to auto-create dataset on first run diff --git a/examples/basic_demo/site/dag/index.html b/examples/basic_demo/site/dag/index.html deleted file mode 100644 index 19e7a44..0000000 --- a/examples/basic_demo/site/dag/index.html +++ /dev/null @@ -1,248 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - β€’ - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - mart_latest_signup("mart_latest_signup
(mart_latest_signup)") - class mart_latest_signup py; - mart_users_by_domain_ff["mart_users_by_domain.ff
(mart_users_by_domain)"] - class mart_users_by_domain_ff sql; - users_clean_ff["users_clean.ff
(users_clean)"] - class users_clean_ff sql; - users_clean_ff --> mart_users_by_domain_ff - users_clean_ff --> mart_latest_signup -
-
- - - -
-

Macros

- -

No macros found.

- -
-
- - - - \ No newline at end of file diff --git a/examples/basic_demo/site/dag/mart_latest_signup.html b/examples/basic_demo/site/dag/mart_latest_signup.html deleted file mode 100644 index f6561dc..0000000 --- a/examples/basic_demo/site/dag/mart_latest_signup.html +++ /dev/null @@ -1,246 +0,0 @@ - - - - - - mart_latest_signup – FastFlowTransform - - - -

← Back to overview

- -
-
-

- mart_latest_signup - table -

-
Model Detail β€’ FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
mart_latest_signup
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
email_domainstring - - yes - - - - β€” - - - - unknown - -
latest_user_idint - - yes - - - - β€” - - - - unknown - -
latest_emailstring - - yes - - - - β€” - - - - unknown - -
latest_signup_datedate - - yes - - - - β€” - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/basic_demo/site/dag/mart_users_by_domain.ff.html b/examples/basic_demo/site/dag/mart_users_by_domain.ff.html deleted file mode 100644 index 4cccafe..0000000 --- a/examples/basic_demo/site/dag/mart_users_by_domain.ff.html +++ /dev/null @@ -1,252 +0,0 @@ - - - - - - mart_users_by_domain.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- mart_users_by_domain.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
mart_users_by_domain
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
email_domainstring - - yes - - - - β€” - - - - - ?.email_domain - - direct - - - - -
user_countbigint - - yes - - - - β€” - - - - unknown - -
first_signupdate - - yes - - - - β€” - - - - unknown - -
last_signupdate - - yes - - - - β€” - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/basic_demo/site/dag/users_clean.ff.html b/examples/basic_demo/site/dag/users_clean.ff.html deleted file mode 100644 index ba353fd..0000000 --- a/examples/basic_demo/site/dag/users_clean.ff.html +++ /dev/null @@ -1,271 +0,0 @@ - - - - - - users_clean.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- users_clean.ff - view -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
view
- -
Relation
-
users_clean
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/basic_demo/models/staging/users_clean.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
user_idint - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
emailstring - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
email_domainstring - - yes - - - - β€” - - - - unknown - -
signup_datedate - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/cache_demo/.env.dev_bigquery_bigframes b/examples/cache_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..cc90396 --- /dev/null +++ b/examples/cache_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,14 @@ +# BigQuery profile for the cache demo (BigFrames client) +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=cache_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json + +# HTTP cache knobs (optional; mirror duckdb defaults) +FF_HTTP_ALLOWED_DOMAINS=jsonplaceholder.typicode.com +FF_HTTP_CACHE_DIR=.local/http-cache +FF_HTTP_MAX_RPS=5 +FF_HTTP_MAX_RETRIES=2 +FF_HTTP_TIMEOUT=10 diff --git a/examples/cache_demo/.env.dev_bigquery_pandas b/examples/cache_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..cc94b83 --- /dev/null +++ b/examples/cache_demo/.env.dev_bigquery_pandas @@ -0,0 +1,14 @@ +# BigQuery profile for the cache demo (pandas client) +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=cache_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json + +# HTTP cache knobs (optional; mirror duckdb defaults) +FF_HTTP_ALLOWED_DOMAINS=jsonplaceholder.typicode.com +FF_HTTP_CACHE_DIR=.local/http-cache +FF_HTTP_MAX_RPS=5 +FF_HTTP_MAX_RETRIES=2 +FF_HTTP_TIMEOUT=10 diff --git a/examples/cache_demo/.env.dev_databricks b/examples/cache_demo/.env.dev_databricks new file mode 100644 index 0000000..3b1fbc4 --- /dev/null +++ b/examples/cache_demo/.env.dev_databricks @@ -0,0 +1,7 @@ +# Databricks/Spark profile for the cache demo (local Spark defaults; adjust for real DBR) +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=cache_demo +# FF_DBR_DATABASE=cache_demo +# FF_DBR_WAREHOUSE_DIR=.local/spark_warehouse + +FF_DBR_ENABLE_HIVE=1 \ No newline at end of file diff --git 
a/examples/cache_demo/.env.dev_postgres b/examples/cache_demo/.env.dev_postgres new file mode 100644 index 0000000..df0bb72 --- /dev/null +++ b/examples/cache_demo/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for the cache demo (replace with your own connection string) +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=cache_demo diff --git a/examples/cache_demo/Makefile b/examples/cache_demo/Makefile index da6aa4c..c2cf2f9 100644 --- a/examples/cache_demo/Makefile +++ b/examples/cache_demo/Makefile @@ -3,15 +3,58 @@ http_first http_offline http_cache_clear artifacts dag clean \ demo -ENGINE ?= duckdb -PROFILE_ENV = dev_duckdb +ENGINE ?= duckdb # duckdb | postgres | databricks_spark | bigquery +# BigQuery frame selector (pandas | bigframes) +BQ_FRAME ?= bigframes PROJECT ?= . UV ?= uv - -BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +DB ?= .local/cache_demo.duckdb + +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + PROFILE_ENV = dev_databricks + ENGINE_TAG = engine:databricks_spark +endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif +ifndef PROFILE_ENV + $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery) +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif RUN_ENV = $(BASE_ENV) -SELECT_ALL = --select tag:example:cache_demo +SELECT_ALL = --select tag:example:cache_demo --select tag:$(ENGINE_TAG) + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py + +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" 
--duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery) +endif seed: env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) @@ -31,9 +74,15 @@ change_sql: +$(MAKE) run change_seed: - # Modify seed β†’ staging and mart rebuild - @printf "\n4,dan@example.com\n" >> "$(PROJECT)/seeds/seed_users.csv" - +$(MAKE) seed + # Build a combined seed in .local without touching tracked files + @mkdir -p "$(PROJECT)/.local/seeds" + @cp "$(PROJECT)/seeds/seed_orders.csv" "$(PROJECT)/.local/seeds/seed_orders.csv" + @cp "$(PROJECT)/seeds/seed_users.csv" "$(PROJECT)/.local/seeds/seed_users.csv" + @# If present, keep schema.yml aligned with the temporary seeds dir + @test -f "$(PROJECT)/seeds/schema.yml" && cp "$(PROJECT)/seeds/schema.yml" "$(PROJECT)/.local/seeds/schema.yml" || true + @# Append the patch rows (skip header) to the temporary copy + @tail -n +2 "$(PROJECT)/patches/seed_users_patch.csv" >> "$(PROJECT)/.local/seeds/seed_users.csv" + env $(BASE_ENV) FFT_SEEDS_DIR="$(PROJECT)/.local/seeds" $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) +$(MAKE) run change_env: @@ -61,6 +110,7 @@ dag: env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_ALL) --html clean: + $(CLEAN_CMD) rm -rf .local cache_demo.duckdb site .fastflowtransform demo: clean diff --git a/examples/cache_demo/README.md b/examples/cache_demo/README.md index 9482057..419ea60 100644 --- a/examples/cache_demo/README.md +++ 
b/examples/cache_demo/README.md @@ -10,17 +10,26 @@ This demo shows: ## Quickstart ```bash +# pick your engine (duckdb, postgres, databricks_spark, or bigquery); defaults to duckdb +cp .env.dev_duckdb .env +# or: cp .env.dev_postgres .env (then edit DSN/schema) +# or: cp .env.dev_databricks .env +# or: cp .env.dev_bigquery_pandas .env # or .env.dev_bigquery_bigframes + cd examples/cache_demo -make cache_first # builds and writes cache -make cache_second # should SKIP everything -make change_sql # touch SQL β†’ mart rebuilds -make change_seed # add a seed row β†’ staging + mart rebuild -make change_env # FF_* env change β†’ full rebuild -make change_py # edit constant in py_constants.ff.py β†’ it rebuilds - -make http_first # warms HTTP cache -make http_offline # reuses HTTP cache without network -make http_cache_clear # clears HTTP response cache +make cache_first ENGINE=duckdb # builds and writes cache +make cache_second ENGINE=duckdb # should SKIP everything +make change_sql ENGINE=duckdb # touch SQL β†’ mart rebuilds +make change_seed ENGINE=duckdb # seed with base + patches/seed_users_patch.csv (no tracked edits) +make change_env ENGINE=duckdb # FF_* env change β†’ full rebuild +make change_py ENGINE=duckdb # edit constant in py_constants.ff.py β†’ it rebuilds + +make http_first ENGINE=duckdb # warms HTTP cache +make http_offline ENGINE=duckdb # reuses HTTP cache without network +make http_cache_clear # clears HTTP response cache +# +# Seeds stay immutable: change_seed builds a temporary combined copy in .local/seeds using +# patches/seed_users_patch.csv so the repo doesn’t become dirty. Inspect: site/dag/index.html @@ -32,6 +41,10 @@ Code kopieren --- +To run everything on Postgres, set `ENGINE=postgres` and copy/edit `.env.dev_postgres`, e.g. `make demo ENGINE=postgres`. +To run on Databricks/Spark locally, set `ENGINE=databricks_spark` and copy/edit `.env.dev_databricks`, e.g. `make demo ENGINE=databricks_spark`. 
+To run on BigQuery, set `ENGINE=bigquery` and copy/edit `.env.dev_bigquery_pandas` (or `.env.dev_bigquery_bigframes`), e.g. `make demo ENGINE=bigquery BQ_FRAME=bigframes` (default) or `BQ_FRAME=pandas`. + ## What this demo proves (in a minute) - **Cache hit/skip:** `make cache_second` should skip everything (if nothing changed). diff --git a/examples/cache_demo/models/engines/bigquery/bigframes/http_users.ff.py b/examples/cache_demo/models/engines/bigquery/bigframes/http_users.ff.py new file mode 100644 index 0000000..5d9e216 --- /dev/null +++ b/examples/cache_demo/models/engines/bigquery/bigframes/http_users.ff.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." 
+ ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="http_users", + deps=["stg_users.ff"], # dependency for cache invalidation symmetry + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:bigquery"], + }, +) +def fetch(_: BFDataFrame) -> BFDataFrame: + _get_bigframes() + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + output="bigframes", + ) + return df.loc[:, ["id", "email", "username"]].rename(columns={"id": "api_user_id"}) # type: ignore[arg-type] diff --git a/examples/cache_demo/models/engines/bigquery/bigframes/py_constants.ff.py b/examples/cache_demo/models/engines/bigquery/bigframes/py_constants.ff.py new file mode 100644 index 0000000..d2239cd --- /dev/null +++ b/examples/cache_demo/models/engines/bigquery/bigframes/py_constants.ff.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." 
+ ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="py_constants", + materialized="table", + tags=[ + "example:cache_demo", + "engine:bigquery", + ], +) +def build() -> BFDataFrame: + """BigQuery (BigFrames) version returning a BigFrames DataFrame.""" + bpd_mod = _get_bigframes() + return bpd_mod.DataFrame([{"k": "answer", "v": 42}]) diff --git a/examples/cache_demo/models/engines/bigquery/pandas/http_users.ff.py b/examples/cache_demo/models/engines/bigquery/pandas/http_users.ff.py new file mode 100644 index 0000000..28f3671 --- /dev/null +++ b/examples/cache_demo/models/engines/bigquery/pandas/http_users.ff.py @@ -0,0 +1,25 @@ +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df +import pandas as pd + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="http_users", + deps=["stg_users.ff"], # dependency for cache invalidation symmetry + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:bigquery"], + }, +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + ) + cols = [c for c in df.columns if c in ("id", "email", "username")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/cache_demo/models/engines/bigquery/pandas/py_constants.ff.py b/examples/cache_demo/models/engines/bigquery/pandas/py_constants.ff.py new file mode 100644 index 0000000..367507d --- /dev/null +++ b/examples/cache_demo/models/engines/bigquery/pandas/py_constants.ff.py @@ -0,0 +1,20 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="py_constants", + materialized="table", + tags=[ + "example:cache_demo", + "engine:bigquery", + ], +) +def build() -> 
pd.DataFrame: + """BigQuery (pandas) version returning a pandas DataFrame.""" + return pd.DataFrame([{"k": "answer", "v": 42}]) diff --git a/examples/cache_demo/models/engines/databricks_spark/http_users.ff.py b/examples/cache_demo/models/engines/databricks_spark/http_users.ff.py new file mode 100644 index 0000000..ad36cb7 --- /dev/null +++ b/examples/cache_demo/models/engines/databricks_spark/http_users.ff.py @@ -0,0 +1,42 @@ +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from pyspark.sql import DataFrame + from pyspark.sql.functions import col +else: + DataFrame = Any + + def col(*args: Any, **kwargs: Any): # pragma: no cover - placeholder + ... + + +def _get_col(): + try: + from pyspark.sql.functions import col as _col + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." + ) from exc + return _col + + +@engine_model( + only="databricks_spark", + name="http_users", + deps=["stg_users.ff"], # just to show a dependency; not used in code + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:databricks_spark"], + }, +) +def fetch(_: DataFrame) -> DataFrame: + col_fn = _get_col() + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + output="spark", + ) + return df.select(col_fn("id").alias("api_user_id"), col_fn("email"), col_fn("username")) diff --git a/examples/cache_demo/models/engines/databricks_spark/py_constants.ff.py b/examples/cache_demo/models/engines/databricks_spark/py_constants.ff.py new file mode 100644 index 0000000..5a11d65 --- /dev/null +++ b/examples/cache_demo/models/engines/databricks_spark/py_constants.ff.py @@ -0,0 +1,34 @@ +from fastflowtransform import engine_model +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from pyspark.sql import 
SparkSession +else: + + class SparkSession: # pragma: no cover - placeholder for runtime type hints + ... + + +def _ensure_spark_session() -> "SparkSession": + try: + from pyspark.sql import SparkSession as _SparkSession + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." + ) from exc + return _SparkSession.getActiveSession() or _SparkSession.builder.getOrCreate() + + +@engine_model( + only="databricks_spark", + name="py_constants", + materialized="table", + tags=[ + "example:cache_demo", + "engine:databricks_spark", + ], +) +def build(): + """Spark version returning a Spark DataFrame.""" + spark = _ensure_spark_session() + return spark.createDataFrame([{"k": "answer", "v": 42}]) diff --git a/examples/cache_demo/models/http/http_users.ff.py b/examples/cache_demo/models/engines/duckdb/http_users.ff.py similarity index 88% rename from examples/cache_demo/models/http/http_users.ff.py rename to examples/cache_demo/models/engines/duckdb/http_users.ff.py index bf1bf80..d835814 100644 --- a/examples/cache_demo/models/http/http_users.ff.py +++ b/examples/cache_demo/models/engines/duckdb/http_users.ff.py @@ -1,9 +1,10 @@ -from fastflowtransform import model +from fastflowtransform import engine_model from fastflowtransform.api.http import get_df import pandas as pd -@model( +@engine_model( + only="duckdb", name="http_users", deps=["stg_users.ff"], # just to show a dependency; not used in code meta={ diff --git a/examples/cache_demo/models/engines/duckdb/py_constants.ff.py b/examples/cache_demo/models/engines/duckdb/py_constants.ff.py new file mode 100644 index 0000000..999a763 --- /dev/null +++ b/examples/cache_demo/models/engines/duckdb/py_constants.ff.py @@ -0,0 +1,17 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + only="duckdb", + name="py_constants", + materialized="table", + tags=[ + "example:cache_demo", + 
"engine:duckdb", + ], +) +def build() -> pd.DataFrame: + """DuckDB/Postgres-friendly version using pandas.""" + return pd.DataFrame([{"k": "answer", "v": 42}]) diff --git a/examples/cache_demo/models/engines/postgres/http_users.ff.py b/examples/cache_demo/models/engines/postgres/http_users.ff.py new file mode 100644 index 0000000..df33a90 --- /dev/null +++ b/examples/cache_demo/models/engines/postgres/http_users.ff.py @@ -0,0 +1,22 @@ +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df +import pandas as pd + + +@engine_model( + only="postgres", + name="http_users", + deps=["stg_users.ff"], # just to show a dependency; not used in code + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:postgres"], + }, +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + ) + cols = [c for c in df.columns if c in ("id", "email", "username")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/cache_demo/models/engines/postgres/py_constants.ff.py b/examples/cache_demo/models/engines/postgres/py_constants.ff.py new file mode 100644 index 0000000..89bb2b1 --- /dev/null +++ b/examples/cache_demo/models/engines/postgres/py_constants.ff.py @@ -0,0 +1,17 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + only="postgres", + name="py_constants", + materialized="table", + tags=[ + "example:cache_demo", + "engine:postgres", + ], +) +def build() -> pd.DataFrame: + """Postgres version using pandas.""" + return pd.DataFrame([{"k": "answer", "v": 42}]) diff --git a/examples/cache_demo/models/marts/mart_user_orders.ff.sql b/examples/cache_demo/models/marts/mart_user_orders.ff.sql index 406aa61..b5aa71b 100644 --- a/examples/cache_demo/models/marts/mart_user_orders.ff.sql +++ b/examples/cache_demo/models/marts/mart_user_orders.ff.sql @@ -1,4 +1,4 @@ -{{ 
config(materialized='table', tags=['example:cache_demo','engine:duckdb']) }} +{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery']) }} with u as ( select user_id, email from {{ ref('stg_users.ff') }} ), diff --git a/examples/cache_demo/models/python/py_constants.ff.py b/examples/cache_demo/models/python/py_constants.ff.py deleted file mode 100644 index 5e22b7a..0000000 --- a/examples/cache_demo/models/python/py_constants.ff.py +++ /dev/null @@ -1,16 +0,0 @@ -from fastflowtransform import model -import pandas as pd - - -@model( - name="py_constants", - deps=[], # independent - meta={ - "materialized": "table", - "tags": ["example:cache_demo", "engine:duckdb"], - }, -) -def build() -> pd.DataFrame: - # Change this constant to trigger a fingerprint change for a pure Python model. - CONSTANT = 42 - return pd.DataFrame([{"k": "answer", "v": CONSTANT}]) diff --git a/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql b/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql index 9444fcc..156f624 100644 --- a/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql +++ b/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql @@ -1,6 +1,6 @@ -{{ config(materialized='view', tags=['example:cache_demo','engine:duckdb']) }} +{{ config(materialized='view', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery']) }} select cast(order_id as int) as order_id, cast(customer_id as int) as user_id, - cast(amount as double) as amount + cast(amount as decimal) as amount from {{ source('crm', 'orders') }}; diff --git a/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql b/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql index 5d55e6a..7862f7b 100644 --- a/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql +++ b/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql @@ -1,3 +1,3 @@ -{{ 
config(materialized='table', tags=['example:cache_demo','engine:duckdb']) }} +{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery']) }} select cast(id as int) as user_id, lower(email) as email from {{ source('crm', 'users') }}; diff --git a/examples/cache_demo/patches/seed_users_patch.csv b/examples/cache_demo/patches/seed_users_patch.csv new file mode 100644 index 0000000..393621d --- /dev/null +++ b/examples/cache_demo/patches/seed_users_patch.csv @@ -0,0 +1,2 @@ +id,email +4,dan@example.com diff --git a/examples/cache_demo/profiles.yml b/examples/cache_demo/profiles.yml index b65dc2f..fccfda5 100644 --- a/examples/cache_demo/profiles.yml +++ b/examples/cache_demo/profiles.yml @@ -2,3 +2,38 @@ dev_duckdb: engine: duckdb duckdb: path: "{{ env('FF_DUCKDB_PATH', '.local/cache_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_databricks: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'cache_demo') }}" + warehouse_dir: "{{ project_dir() }}/{{ env('FF_DBR_WAREHOUSE_DIR', '.local/spark_warehouse') }}" + database: "{{ env('FF_DBR_DATABASE', 'cache_demo') }}" + extra_conf: + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" + spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'cache_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'cache_demo') }}" + 
location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + # allow_create_dataset: true # uncomment to auto-create dataset on first run diff --git a/examples/cache_demo/project.yml b/examples/cache_demo/project.yml index 25b996d..fda1476 100644 --- a/examples/cache_demo/project.yml +++ b/examples/cache_demo/project.yml @@ -5,16 +5,23 @@ vars: {} models: storage: - stg_users.ff: { path: ".local/duck/users", format: parquet } - stg_orders.ff: { path: ".local/duck/orders", format: parquet } - mart_user_orders.ff: { path: ".local/duck/mart_user_orders", format: parquet } - py_constants: { path: ".local/duck/py_constants", format: parquet } - http_users: { path: ".local/duck/http_users", format: parquet } + stg_users.ff: + path: ".local/spark/stg_users" + stg_orders.ff: + path: ".local/spark/stg_orders" + mart_user_orders.ff: + path: ".local/spark/mart_user_orders" + py_constants: + path: ".local/spark/py_constants" + http_users: + path: ".local/spark/http_users" seeds: storage: - seed_users: { path: ".local/duck/seed_users", format: parquet } - seed_orders: { path: ".local/duck/seed_orders", format: parquet } + seed_users: + path: ".local/spark/seed_users" + seed_orders: + path: ".local/spark/seed_orders" tests: - type: row_count_between diff --git a/examples/cache_demo/seeds/seed_users.csv b/examples/cache_demo/seeds/seed_users.csv index 5559381..dc9b5c0 100644 --- a/examples/cache_demo/seeds/seed_users.csv +++ b/examples/cache_demo/seeds/seed_users.csv @@ -2,5 +2,3 @@ id,email 1,alice@example.com 2,bob@example.com 3,carol@example.com - -4,dan@example.com diff --git a/examples/cache_demo/site/dag/http_users.html b/examples/cache_demo/site/dag/http_users.html deleted file mode 100644 index 220718b..0000000 --- a/examples/cache_demo/site/dag/http_users.html +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - http_users – FastFlowTransform - - - -

← Back to overview

- -
-
-

- http_users - table -

-
Model Detail β€’ FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
http_users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/http/http_users.ff.py - -
- -
Dependencies
-
- - - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/cache_demo/site/dag/index.html b/examples/cache_demo/site/dag/index.html deleted file mode 100644 index bcabd2c..0000000 --- a/examples/cache_demo/site/dag/index.html +++ /dev/null @@ -1,291 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - β€’ - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - http_users("http_users
(http_users)") - class http_users py; - mart_user_orders_ff["mart_user_orders.ff
(mart_user_orders)"] - class mart_user_orders_ff sql; - py_constants("py_constants
(py_constants)") - class py_constants py; - stg_orders_ff["stg_orders.ff
(stg_orders)"] - class stg_orders_ff sql; - stg_users_ff["stg_users.ff
(stg_users)"] - class stg_users_ff sql; - stg_users_ff --> mart_user_orders_ff - stg_orders_ff --> mart_user_orders_ff - stg_users_ff --> http_users -
-
- - - -
-

Macros

- -

No macros found.

- -
-
- - - - \ No newline at end of file diff --git a/examples/cache_demo/site/dag/mart_user_orders.ff.html b/examples/cache_demo/site/dag/mart_user_orders.ff.html deleted file mode 100644 index 96f9452..0000000 --- a/examples/cache_demo/site/dag/mart_user_orders.ff.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - mart_user_orders.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- mart_user_orders.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
mart_user_orders
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/marts/mart_user_orders.ff.sql - -
- -
Dependencies
-
- - - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/cache_demo/site/dag/py_constants.html b/examples/cache_demo/site/dag/py_constants.html deleted file mode 100644 index 98cdb7a..0000000 --- a/examples/cache_demo/site/dag/py_constants.html +++ /dev/null @@ -1,137 +0,0 @@ - - - - - - py_constants – FastFlowTransform - - - -

← Back to overview

- -
-
-

- py_constants - table -

-
Model Detail β€’ FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
py_constants
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/python/py_constants.ff.py - -
- -
Dependencies
-
- - – - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/cache_demo/site/dag/stg_orders.ff.html b/examples/cache_demo/site/dag/stg_orders.ff.html deleted file mode 100644 index de0f1b9..0000000 --- a/examples/cache_demo/site/dag/stg_orders.ff.html +++ /dev/null @@ -1,146 +0,0 @@ - - - - - - stg_orders.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- stg_orders.ff - view -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
view
- -
Relation
-
stg_orders
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
-
- -
- -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/cache_demo/site/dag/stg_users.ff.html b/examples/cache_demo/site/dag/stg_users.ff.html deleted file mode 100644 index 3e4ee0e..0000000 --- a/examples/cache_demo/site/dag/stg_users.ff.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - stg_users.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- stg_users.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
stg_users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/cache_demo/sources.yml b/examples/cache_demo/sources.yml index 1348091..0490edc 100644 --- a/examples/cache_demo/sources.yml +++ b/examples/cache_demo/sources.yml @@ -1,6 +1,7 @@ version: 2 sources: - name: crm + schema: cache_demo tables: - name: users identifier: seed_users diff --git a/examples/dq_demo/.env.dev_bigquery_bigframes b/examples/dq_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..d91d23f --- /dev/null +++ b/examples/dq_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,7 @@ +# BigQuery profile for the dq demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=dq_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/dq_demo/.env.dev_bigquery_pandas b/examples/dq_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..d91d23f --- /dev/null +++ b/examples/dq_demo/.env.dev_bigquery_pandas @@ -0,0 +1,7 @@ +# BigQuery profile for the dq demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=dq_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/dq_demo/.env.dev_databricks b/examples/dq_demo/.env.dev_databricks index add68a2..a883c0b 100644 --- a/examples/dq_demo/.env.dev_databricks +++ b/examples/dq_demo/.env.dev_databricks @@ -8,5 +8,7 @@ FF_DBR_DATABASE=dq_demo # Uncomment to switch to Delta Lake (requires delta-spark dependency) # FF_DBR_TABLE_FORMAT=delta -# Adjust to your JDK installation if needed -JAVA_HOME=/opt/homebrew/opt/openjdk@17 +# Prefer an existing JAVA_HOME (e.g., in CI); fall back to the macOS brew path for local use. 
+if [ -z "${JAVA_HOME:-}" ] && [ -d "/opt/homebrew/opt/openjdk@17" ]; then + JAVA_HOME=/opt/homebrew/opt/openjdk@17 +fi diff --git a/examples/dq_demo/Makefile b/examples/dq_demo/Makefile index 8306f84..ce2fca4 100644 --- a/examples/dq_demo/Makefile +++ b/examples/dq_demo/Makefile @@ -8,6 +8,9 @@ UV ?= uv # Engine selector (duckdb|postgres|databricks_spark) ENGINE ?= duckdb +# BigQuery frame selector (pandas|bigframes) +BQ_FRAME ?= bigframes + # Detect OS opener (macOS: open, Linux: xdg-open) UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) @@ -29,8 +32,20 @@ ifeq ($(ENGINE),databricks_spark) PROFILE_ENV = dev_databricks ENGINE_TAG = engine:databricks_spark endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif -BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) + +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif RUN_ENV = $(BASE_ENV) # Select only dq_demo models for the active engine @@ -44,6 +59,8 @@ else ifeq ($(ENGINE),postgres) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/dq_demo/README.md b/examples/dq_demo/README.md index 5e977f7..68319e8 100644 --- a/examples/dq_demo/README.md +++ b/examples/dq_demo/README.md @@ -1,7 +1,25 @@ -# FastFlowTransform project scaffold +# Data Quality Demo -This project was created with `fft init`. 
-Next steps: -1. Update `profiles.yml` with real connection details (docs/Profiles.md). -2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). -3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). +Run the complete DQ demo (seeds β†’ models β†’ DAG β†’ tests) on DuckDB, Postgres, Databricks Spark, or BigQuery (pandas or BigFrames). + +## Quickstart +From this directory: + +1) Pick an engine and copy the matching `.env.dev_*` to `.env` (edit project/dataset if needed): + - DuckDB: `.env.dev_duckdb` + - Postgres: `.env.dev_postgres` + - Databricks Spark: `.env.dev_databricks` + - BigQuery (pandas): `.env.dev_bigquery_pandas` + - BigQuery (BigFrames): `.env.dev_bigquery_bigframes` + +2) Run the demo (set `BQ_FRAME` when using BigQuery): + ```sh + make demo ENGINE=duckdb + make demo ENGINE=postgres + make demo ENGINE=databricks_spark + make demo ENGINE=bigquery BQ_FRAME=pandas # or bigframes + ``` + +Artifacts: +- Target metadata: `.fastflowtransform/target/{manifest.json,run_results.json,catalog.xml}` +- DAG HTML: `site/dag/index.html` diff --git a/examples/dq_demo/models/engines/bigquery/bigframes/mart_orders_agg.ff.py b/examples/dq_demo/models/engines/bigquery/bigframes/mart_orders_agg.ff.py new file mode 100644 index 0000000..b52ada0 --- /dev/null +++ b/examples/dq_demo/models/engines/bigquery/bigframes/mart_orders_agg.ff.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. 
Install fastflowtransform[bigquery_bf]." + ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="mart_orders_agg", + materialized="table", + tags=[ + "example:dq_demo", + "scope:mart", + "engine:bigquery", + ], + deps=["orders.ff", "customers.ff"], + require={ + "orders.ff": ["order_id", "customer_id", "amount", "order_ts"], + "customers.ff": ["customer_id", "name", "status"], + }, +) +def build(orders: BFDataFrame, customers: BFDataFrame) -> BFDataFrame: + _get_bigframes() + base = orders.merge(customers, on="customer_id", how="inner", suffixes=("", "_cust")) + + grouped = ( + base.groupby(["customer_id", "name", "status"], dropna=False) + .agg( + order_count=("order_id", "count"), + total_amount=("amount", "sum"), + first_order_ts=("order_ts", "min"), + last_order_ts=("order_ts", "max"), + ) + .reset_index() + ) + + grouped = grouped.rename(columns={"name": "customer_name"}) # type: ignore[arg-type] + return grouped[ + [ + "customer_id", + "customer_name", + "status", + "order_count", + "total_amount", + "first_order_ts", + "last_order_ts", + ] + ] diff --git a/examples/dq_demo/models/engines/bigquery/pandas/mart_orders_agg.ff.py b/examples/dq_demo/models/engines/bigquery/pandas/mart_orders_agg.ff.py new file mode 100644 index 0000000..0da4018 --- /dev/null +++ b/examples/dq_demo/models/engines/bigquery/pandas/mart_orders_agg.ff.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="mart_orders_agg", + materialized="table", + tags=[ + "example:dq_demo", + "scope:mart", + "engine:bigquery", + ], + deps=["orders.ff", "customers.ff"], + require={ + "orders.ff": ["order_id", "customer_id", "amount", "order_ts"], + "customers.ff": ["customer_id", "name", "status"], + }, +) +def build(orders: pd.DataFrame, 
customers: pd.DataFrame) -> pd.DataFrame: + """Aggregate orders per customer for reconciliation checks.""" + base = orders.merge(customers, on="customer_id", how="inner", suffixes=("", "_cust")) + + grouped = ( + base.groupby(["customer_id", "name", "status"], dropna=False) + .agg( + order_count=("order_id", "count"), + total_amount=("amount", "sum"), + first_order_ts=("order_ts", "min"), + last_order_ts=("order_ts", "max"), + ) + .reset_index() + ) + + grouped.rename( + columns={ + "name": "customer_name", + }, + inplace=True, + ) + return grouped[ + [ + "customer_id", + "customer_name", + "status", + "order_count", + "total_amount", + "first_order_ts", + "last_order_ts", + ] + ] diff --git a/examples/dq_demo/models/marts/mart_orders_agg.ff.sql b/examples/dq_demo/models/marts/mart_orders_agg.ff.sql index 5ad5386..1a9d686 100644 --- a/examples/dq_demo/models/marts/mart_orders_agg.ff.sql +++ b/examples/dq_demo/models/marts/mart_orders_agg.ff.sql @@ -5,7 +5,8 @@ 'scope:mart', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/dq_demo/models/staging/customers.ff.sql b/examples/dq_demo/models/staging/customers.ff.sql index 0fb2a31..d1e2583 100644 --- a/examples/dq_demo/models/staging/customers.ff.sql +++ b/examples/dq_demo/models/staging/customers.ff.sql @@ -5,7 +5,8 @@ 'scope:staging', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/dq_demo/models/staging/orders.ff.sql b/examples/dq_demo/models/staging/orders.ff.sql index e864093..3a2c511 100644 --- a/examples/dq_demo/models/staging/orders.ff.sql +++ b/examples/dq_demo/models/staging/orders.ff.sql @@ -5,7 +5,8 @@ 'scope:staging', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/dq_demo/profiles.yml b/examples/dq_demo/profiles.yml index 
363ab8c..8df61e7 100644 --- a/examples/dq_demo/profiles.yml +++ b/examples/dq_demo/profiles.yml @@ -24,3 +24,21 @@ dev_databricks: spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'dq_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + # allow_create_dataset: true + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'dq_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + # allow_create_dataset: true diff --git a/examples/dq_demo/site/dag/customers.ff.html b/examples/dq_demo/site/dag/customers.ff.html deleted file mode 100644 index 05cae2e..0000000 --- a/examples/dq_demo/site/dag/customers.ff.html +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - customers.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- customers.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
customers
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/dq_demo/models/staging/customers.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
-
- -
- -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
customer_idbigint - - yes - - - - β€” - - - - - ?.customer_id - - direct - - - - -
namestring - - yes - - - - β€” - - - - - ?.name - - direct - - - - -
statusstring - - yes - - - - β€” - - - - - ?.status - - direct - - - - -
created_atstring - - yes - - - - β€” - - - - - ?.created_at - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/dq_demo/site/dag/index.html b/examples/dq_demo/site/dag/index.html deleted file mode 100644 index c9a164b..0000000 --- a/examples/dq_demo/site/dag/index.html +++ /dev/null @@ -1,248 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - β€’ - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - customers_ff["customers.ff
(customers)"] - class customers_ff sql; - mart_orders_agg_ff["mart_orders_agg.ff
(mart_orders_agg)"] - class mart_orders_agg_ff sql; - orders_ff["orders.ff
(orders)"] - class orders_ff sql; - orders_ff --> mart_orders_agg_ff - customers_ff --> mart_orders_agg_ff -
-
- - - -
-

Macros

- -

No macros found.

- -
-
- - - - \ No newline at end of file diff --git a/examples/dq_demo/site/dag/mart_orders_agg.ff.html b/examples/dq_demo/site/dag/mart_orders_agg.ff.html deleted file mode 100644 index 91495af..0000000 --- a/examples/dq_demo/site/dag/mart_orders_agg.ff.html +++ /dev/null @@ -1,320 +0,0 @@ - - - - - - mart_orders_agg.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- mart_orders_agg.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
mart_orders_agg
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/dq_demo/models/marts/mart_orders_agg.ff.sql - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
customer_idint - - yes - - - - β€” - - - - - ?.customer_id - - direct - - - - -
customer_namestring - - yes - - - - β€” - - - - - ?.name - - direct - - - - -
statusstring - - yes - - - - β€” - - - - unknown - -
order_countbigint - - yes - - - - β€” - - - - unknown - -
total_amountdecimal(20,0) - - yes - - - - β€” - - - - unknown - -
first_order_tstimestamp - - yes - - - - β€” - - - - unknown - -
last_order_tstimestamp - - yes - - - - β€” - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/dq_demo/site/dag/orders.ff.html b/examples/dq_demo/site/dag/orders.ff.html deleted file mode 100644 index 40bcc68..0000000 --- a/examples/dq_demo/site/dag/orders.ff.html +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - orders.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- orders.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
orders
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/dq_demo/models/staging/orders.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
-
- -
- -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
order_idint - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
customer_idint - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
amountdecimal(10,0) - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
order_tstimestamp - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/env_matrix/site/dag/env_vars.ff.html b/examples/env_matrix/site/dag/env_vars.ff.html deleted file mode 100644 index 5605883..0000000 --- a/examples/env_matrix/site/dag/env_vars.ff.html +++ /dev/null @@ -1,262 +0,0 @@ - - - - - - env_vars.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- env_vars.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
env_vars
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/env_matrix/models/env_vars.ff.py - -
- -
Dependencies
-
- - – - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
active_env_hintVARCHAR - - yes - - - - β€” - - - - unknown - -
ff_engineVARCHAR - - yes - - - - β€” - - - - unknown - -
duckdb_pathVARCHAR - - yes - - - - β€” - - - - unknown - -
duckdb_existsBOOLEAN - - yes - - - - β€” - - - - unknown - -
duckdb_size_bytesBIGINT - - yes - - - - β€” - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/env_matrix/site/dag/index.html b/examples/env_matrix/site/dag/index.html deleted file mode 100644 index ef240f3..0000000 --- a/examples/env_matrix/site/dag/index.html +++ /dev/null @@ -1,225 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - β€’ - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - env_vars_ff("env_vars.ff
(env_vars)") - class env_vars_ff py; - users_ff["users.ff
(users)"] - class users_ff sql; -
-
- - - -
-

Macros

- -

No macros found.

- -
-
- - - - \ No newline at end of file diff --git a/examples/env_matrix/site/dag/users.ff.html b/examples/env_matrix/site/dag/users.ff.html deleted file mode 100644 index 7a9026f..0000000 --- a/examples/env_matrix/site/dag/users.ff.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - users.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- users.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/env_matrix/models/users.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
idBIGINT - - yes - - - - β€” - - - - - ?.id - - direct - - - - -
emailVARCHAR - - yes - - - - β€” - - - - - ?.email - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/events_users_duckdb/.fastflowtransform/cache/dev-duckdb.json b/examples/events_users_duckdb/.fastflowtransform/cache/dev-duckdb.json deleted file mode 100644 index 8319263..0000000 --- a/examples/events_users_duckdb/.fastflowtransform/cache/dev-duckdb.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "engine": "duckdb", - "entries": { - "fct_events_inc.ff": "9b48a28e3100c719dedfe953a49ae7ba9b88a7f7d983a4a2dc7065cf0b7124d9", - "users.ff": "dbd191f9ad5ada230f5099832f5f8c91b4c905a7a800330d282ab854777c4622", - "users_enriched": "670800d172c85bb405ed1bab65ffc8e723fd98f442ebe64bd9c92e17fc19cb8f" - }, - "profile": "dev", - "version": 1 -} \ No newline at end of file diff --git a/examples/events_users_duckdb/.fastflowtransform/target/catalog.json b/examples/events_users_duckdb/.fastflowtransform/target/catalog.json deleted file mode 100644 index 10194f5..0000000 --- a/examples/events_users_duckdb/.fastflowtransform/target/catalog.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "metadata": { - "generated_at": "2025-10-27T16:43:12+00:00", - "tool": "fastflowtransform" - }, - "relations": { - "fct_events_inc": { - "columns": [ - { - "dtype": "BIGINT", - "name": "event_id", - "nullable": true - }, - { - "dtype": "BIGINT", - "name": "user_id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "event_type", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "ingested_at", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "meta_json", - "nullable": true - } - ] - }, - "users": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - } - ] - }, - "users_enriched": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": "BOOLEAN", - "name": "is_gmail", - "nullable": true - } - ] - } - } -} diff --git 
a/examples/events_users_duckdb/.fastflowtransform/target/manifest.json b/examples/events_users_duckdb/.fastflowtransform/target/manifest.json deleted file mode 100644 index 0f84982..0000000 --- a/examples/events_users_duckdb/.fastflowtransform/target/manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "macros": {}, - "metadata": { - "generated_at": "2025-10-27T16:43:12+00:00", - "tool": "fastflowtransform" - }, - "nodes": { - "fct_events_inc.ff": { - "deps": [], - "kind": "sql", - "materialized": "incremental", - "name": "fct_events_inc.ff", - "path": "models/fct_events_inc.ff.sql", - "relation": "fct_events_inc" - }, - "users.ff": { - "deps": [], - "kind": "sql", - "materialized": "table", - "name": "users.ff", - "path": "models/users.ff.sql", - "relation": "users" - }, - "users_enriched": { - "deps": [ - "users.ff" - ], - "kind": "python", - "materialized": "table", - "name": "users_enriched", - "path": "models/users_enriched.ff.py", - "relation": "users_enriched" - } - }, - "sources": { - "app": { - "events": { - "identifier": "seed_events_initial" - }, - "users": { - "identifier": "seed_users" - } - } - } -} diff --git a/examples/events_users_duckdb/.fastflowtransform/target/run_results.json b/examples/events_users_duckdb/.fastflowtransform/target/run_results.json deleted file mode 100644 index 9a8cdd6..0000000 --- a/examples/events_users_duckdb/.fastflowtransform/target/run_results.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "metadata": { - "generated_at": "2025-10-27T16:43:12+00:00", - "tool": "fastflowtransform" - }, - "results": [ - { - "duration_ms": 4, - "finished_at": "2025-10-27T16:43:12+00:00", - "message": null, - "name": "fct_events_inc.ff", - "started_at": "2025-10-27T16:43:12+00:00", - "status": "success" - }, - { - "duration_ms": 1, - "finished_at": "2025-10-27T16:43:12+00:00", - "message": null, - "name": "users.ff", - "started_at": "2025-10-27T16:43:12+00:00", - "status": "success" - }, - { - "duration_ms": 0, - "finished_at": 
"2025-10-27T16:43:12+00:00", - "message": null, - "name": "users_enriched", - "started_at": "2025-10-27T16:43:12+00:00", - "status": "success" - } - ], - "run_finished_at": "2025-10-27T16:43:12+00:00", - "run_started_at": "2025-10-27T16:43:12+00:00" -} diff --git a/examples/events_users_duckdb/Makefile b/examples/events_users_duckdb/Makefile deleted file mode 100644 index dec2227..0000000 --- a/examples/events_users_duckdb/Makefile +++ /dev/null @@ -1,86 +0,0 @@ -.PHONY: demo seed run dag test artifacts incr state-mod state-mod-plus res-error res-warn clean - -DB ?= .local/demo.duckdb -PROJECT ?= . -UV ?= uv - -# Detect OS opener (macOS: open, Linux: xdg-open) -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Darwin) - OPENER := open -else - OPENER := xdg-open -endif - -seed: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft seed "$(PROJECT)" --env dev - -run: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev - -test: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft test "$(PROJECT)" --env dev - -dag: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft dag "$(PROJECT)" --env dev --html - -artifacts: - @echo - @echo "== πŸ“¦ Artifacts ==" - @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" - @echo " DAG HTML: $(PROJECT)/site/dag/index.html" - -incr: - $(UV) run fft run "$(PROJECT)" --env dev --select fct_events_inc.ff --cache rw || true - -state-mod: - @if [ -f "$(PROJECT)/models/users.ff.sql" ]; then touch "$(PROJECT)/models/users.ff.sql"; fi - FF_ENGINE=duckdb $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified - -state-mod-plus: - FF_ENGINE=duckdb $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified+ - -res-error: - FF_ENGINE=duckdb $(UV) run fft run "$(PROJECT)" --env dev --select result:error || true - -res-warn: - FF_ENGINE=duckdb $(UV) run fft run "$(PROJECT)" --env dev --select result:warn || true - -pg-seed: - 
FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" $(UV) run fft seed "$(PROJECT)" --env stg - -pg-run: - FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" $(UV) run fft run "$(PROJECT)" --env stg - -clean: - rm -rf .local "$(PROJECT)/docs" dist build *.egg-info .fastflowtransform - -demo-open: - @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ - $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ - else \ - echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ - fi - -demo: clean - @echo "== πŸš€ R1 Demo (DuckDB) ==" - @echo "DB=$(DB) PROJECT=$(PROJECT)" - +$(MAKE) seed - +$(MAKE) run - +$(MAKE) dag - +$(MAKE) test - +$(MAKE) artifacts - @echo - @echo "== πŸ” Incremental Model ==" - +$(MAKE) incr - @echo - @echo "== 🧠 State Selection (changed only) ==" - +$(MAKE) state-mod - +$(MAKE) state-mod-plus - @echo - @echo "== πŸ§ͺ Result Selection (from last run_results.json) ==" - +$(MAKE) res-error - +$(MAKE) res-warn - @echo - @echo "βœ… Demo done. 
Open DAG here: $(PROJECT)/site/dag/index.html" - +$(MAKE) demo-open diff --git a/examples/events_users_duckdb/README.md b/examples/events_users_duckdb/README.md deleted file mode 100644 index 6e079dd..0000000 --- a/examples/events_users_duckdb/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# R1 Demo - -Minimal project showing: -- Incremental model (`fct_events_inc.ff.sql`) -- YAML tests (`models/users_enriched.yml`) -- State selection (`state:modified`, `result:*`) - -## DuckDB (local) - -```bash -make -C examples/r1_demo seed -make -C examples/r1_demo run -make -C examples/r1_demo dag -```` - -Incremental-only: - -```bash -make -C examples/r1_demo inc -``` - -## Postgres (optional) - -Set `FF_PG_DSN` and `FF_PG_SCHEMA`, then: - -```bash -make -C examples/r1_demo pg-seed -make -C examples/r1_demo pg-run -``` - -## Expected Artifacts - -``` -examples/r1_demo/.fastflowtransform/target/ -β”œβ”€β”€ manifest.json -β”œβ”€β”€ run_results.json -└── catalog.json -``` - -## Sample Output (excerpt) - -``` -βœ” L00 [DUCK] users.ff (120ms) -βœ” L01 [DUCK] users_enriched (35ms) -βœ” L01 [DUCK] fct_events_inc.ff (41ms) - -Data Quality Summary -──────────────────── -βœ… not_null users.email (2ms) -❕ accepted_values users_enriched.email (1ms) \ No newline at end of file diff --git a/examples/events_users_duckdb/models/fct_events_inc.ff.sql b/examples/events_users_duckdb/models/fct_events_inc.ff.sql deleted file mode 100644 index 02916fb..0000000 --- a/examples/events_users_duckdb/models/fct_events_inc.ff.sql +++ /dev/null @@ -1,24 +0,0 @@ -{{ config( - materialized='incremental', - unique_key=['event_id'], - on_schema_change='append_new_columns', - tags=['fact','incremental'] -) }} -with base as ( - select * - from {{ source('app','events') }} - {% if is_incremental() %} - where cast(ingested_at as timestamp) - > coalesce( - (select max(cast(ingested_at as timestamp)) from {{ this }}), - timestamp '1970-01-01' - ) - {% endif %} -) -select - event_id, - user_id, - event_type, - 
ingested_at, - meta_json -from base; \ No newline at end of file diff --git a/examples/events_users_duckdb/models/users.ff.sql b/examples/events_users_duckdb/models/users.ff.sql deleted file mode 100644 index 5283037..0000000 --- a/examples/events_users_duckdb/models/users.ff.sql +++ /dev/null @@ -1,3 +0,0 @@ -{{ config(materialized='table', tags=['staging']) }} -select id, email -from {{ source('app','users') }}; \ No newline at end of file diff --git a/examples/events_users_duckdb/models/users_enriched.ff.py b/examples/events_users_duckdb/models/users_enriched.ff.py deleted file mode 100644 index 383e3fc..0000000 --- a/examples/events_users_duckdb/models/users_enriched.ff.py +++ /dev/null @@ -1,13 +0,0 @@ -from fastflowtransform import model -import pandas as pd - - -@model( - name="users_enriched", - deps=["users.ff"], - require={"users": {"id", "email"}}, -) -def enrich(df: pd.DataFrame) -> pd.DataFrame: - out = df.copy() - out["is_gmail"] = out["email"].str.endswith("@gmail.com") - return out diff --git a/examples/events_users_duckdb/models/users_enriched.yml b/examples/events_users_duckdb/models/users_enriched.yml deleted file mode 100644 index a113bbe..0000000 --- a/examples/events_users_duckdb/models/users_enriched.yml +++ /dev/null @@ -1,15 +0,0 @@ -version: 2 -models: - - name: users_enriched - description: "Adds gmail flag" - columns: - - name: id - tests: - - not_null: { severity: error } - - unique - - name: email - tests: - - not_null - - accepted_values: - values: ["a@example.com","b@gmail.com","c@gmail.com"] - severity: warn \ No newline at end of file diff --git a/examples/events_users_duckdb/profiles.yml b/examples/events_users_duckdb/profiles.yml deleted file mode 100644 index 50de7b1..0000000 --- a/examples/events_users_duckdb/profiles.yml +++ /dev/null @@ -1,10 +0,0 @@ -dev: - engine: duckdb - duckdb: - path: .local/demo.duckdb - -stg: - engine: postgres - postgres: - dsn: postgresql+psycopg://postgres:postgres@localhost:5432/ffdb - db_schema: 
public \ No newline at end of file diff --git a/examples/events_users_duckdb/project.yml b/examples/events_users_duckdb/project.yml deleted file mode 100644 index 99a4f07..0000000 --- a/examples/events_users_duckdb/project.yml +++ /dev/null @@ -1,8 +0,0 @@ -vars: - run_date: "2025-01-01" - -tests: - - type: not_null - table: users - column: email - tags: [batch] \ No newline at end of file diff --git a/examples/events_users_duckdb/seeds/seed_events_initial.csv b/examples/events_users_duckdb/seeds/seed_events_initial.csv deleted file mode 100644 index 87c320f..0000000 --- a/examples/events_users_duckdb/seeds/seed_events_initial.csv +++ /dev/null @@ -1,3 +0,0 @@ -event_id,user_id,event_type,ingested_at,meta_json -100,1,signup,2025-01-01T00:00:00Z,"{}" -101,2,purchase,2025-01-01T00:05:00Z,"{}" \ No newline at end of file diff --git a/examples/events_users_duckdb/seeds/seed_users.csv b/examples/events_users_duckdb/seeds/seed_users.csv deleted file mode 100644 index f423042..0000000 --- a/examples/events_users_duckdb/seeds/seed_users.csv +++ /dev/null @@ -1,4 +0,0 @@ -id,email -1,a@example.com -2,b@gmail.com -3,c@gmail.com \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/fct_events_inc.ff.html b/examples/events_users_duckdb/site/dag/fct_events_inc.ff.html deleted file mode 100644 index bb2b563..0000000 --- a/examples/events_users_duckdb/site/dag/fct_events_inc.ff.html +++ /dev/null @@ -1,262 +0,0 @@ - - - - - - fct_events_inc.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- fct_events_inc.ff - incremental -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
incremental
- -
Relation
-
fct_events_inc
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/events_users_duckdb/models/fct_events_inc.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
event_idBIGINT - - yes - - - - — - - - - unknown -
user_idBIGINT - - yes - - - - — - - - - unknown -
event_typeVARCHAR - - yes - - - - — - - - - unknown -
ingested_atVARCHAR - - yes - - - - — - - - - unknown -
meta_jsonVARCHAR - - yes - - - - — - - - - unknown -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/index.html b/examples/events_users_duckdb/site/dag/index.html deleted file mode 100644 index e438d5a..0000000 --- a/examples/events_users_duckdb/site/dag/index.html +++ /dev/null @@ -1,247 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - • - Materialization: - - table - - view - - ephemeral - - incremental -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - fct_events_inc_ff["fct_events_inc.ff
(fct_events_inc)"] - class fct_events_inc_ff sql; - users_ff["users.ff
(users)"] - class users_ff sql; - users_enriched("users_enriched
(users_enriched)") - class users_enriched py; - users_ff --> users_enriched -
-
- - - -
-

Macros

- -

No macros found.

- -
-
- - - - \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/users.ff.html b/examples/events_users_duckdb/site/dag/users.ff.html deleted file mode 100644 index 83e6158..0000000 --- a/examples/events_users_duckdb/site/dag/users.ff.html +++ /dev/null @@ -1,223 +0,0 @@ - - - - - - users.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- users.ff - table -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/events_users_duckdb/models/users.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
-
- -
- -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
idBIGINT - - yes - - - - — - - - - - ?.id - - direct - - - - -
emailVARCHAR - - yes - - - - — - - - - - ?.email - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/users_enriched.html b/examples/events_users_duckdb/site/dag/users_enriched.html deleted file mode 100644 index 8fca7e0..0000000 --- a/examples/events_users_duckdb/site/dag/users_enriched.html +++ /dev/null @@ -1,232 +0,0 @@ - - - - - - users_enriched – FastFlowTransform - - - -

← Back to overview

- -
-
-

- users_enriched - table -

-
Model Detail • FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
users_enriched
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/events_users_duckdb/models/users_enriched.ff.py - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
idBIGINT - - yes - - - - — - - - - unknown -
emailVARCHAR - - yes - - - - — - - - - unknown -
is_gmailBOOLEAN - - yes - - - - — - - - - - ?.email - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/events_users_duckdb/sources.yml b/examples/events_users_duckdb/sources.yml deleted file mode 100644 index 296aa75..0000000 --- a/examples/events_users_duckdb/sources.yml +++ /dev/null @@ -1,9 +0,0 @@ -version: 2 - -sources: - - name: app - tables: - - name: users - identifier: seed_users - - name: events - identifier: seed_events_initial diff --git a/examples/incremental_demo/.env.dev_bigquery_bigframes b/examples/incremental_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..3e29f4a --- /dev/null +++ b/examples/incremental_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,7 @@ +# BigQuery profile for the incremental demo (BigFrames client) +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=incremental_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/incremental_demo/.env.dev_bigquery_pandas b/examples/incremental_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..ee224c5 --- /dev/null +++ b/examples/incremental_demo/.env.dev_bigquery_pandas @@ -0,0 +1,7 @@ +# BigQuery profile for the incremental demo (pandas client) +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=incremental_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/incremental_demo/Makefile b/examples/incremental_demo/Makefile index 4b2c883..03b97d1 100644 --- a/examples/incremental_demo/Makefile +++ b/examples/incremental_demo/Makefile @@ -9,6 +9,9 @@ UV ?= uv # Engine selector (duckdb|postgres|databricks_spark) ENGINE ?= duckdb +# BigQuery frame type selector (pandas | bigframes) +BQ_FRAME ?= bigframes + # For Databricks Spark: control table format (parquet|delta|iceberg) DBR_TABLE_FORMAT ?= parquet @@ -41,11 +44,22 @@ ifeq 
($(ENGINE),databricks_spark) PROFILE_ENV = dev_databricks_delta endif endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) ifeq ($(ENGINE),databricks_spark) BASE_ENV := $(BASE_ENV) FF_DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT) endif +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE=$(ENGINE) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif RUN_ENV = $(BASE_ENV) @@ -59,6 +73,8 @@ else ifeq ($(ENGINE),postgres) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/incremental_demo/README.md b/examples/incremental_demo/README.md index 5e977f7..c61aa84 100644 --- a/examples/incremental_demo/README.md +++ b/examples/incremental_demo/README.md @@ -1,7 +1,10 @@ -# FastFlowTransform project scaffold +# Incremental demo -This project was created with `fft init`. -Next steps: -1. Update `profiles.yml` with real connection details (docs/Profiles.md). -2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). -3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). +Small FFT example that showcases incremental models and Delta/Iceberg-style merges +across DuckDB, Postgres, Databricks Spark, and BigQuery (pandas or BigFrames). + +## How to use +- Fill an `.env.dev_*` for your engine (DuckDB/Postgres/Databricks/BigQuery). 
For BigQuery use `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes` plus a service-account key in `secrets/`. +- From this directory run `make demo ENGINE=` (set `BQ_FRAME` to switch BigQuery client; set `DBR_TABLE_FORMAT` for Spark). +- Artifacts: DAG HTML in `site/dag/index.html`, FFT metadata in `.fastflowtransform/target/`. +- See `docs/examples/Incremental_Demo.md` for a full walkthrough of the models and incremental configs. diff --git a/examples/incremental_demo/models/common/events_base.ff.sql b/examples/incremental_demo/models/common/events_base.ff.sql index b23424d..68ea352 100644 --- a/examples/incremental_demo/models/common/events_base.ff.sql +++ b/examples/incremental_demo/models/common/events_base.ff.sql @@ -6,7 +6,8 @@ 'kind:staging', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql b/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql index 4aee4d1..5bee73f 100644 --- a/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql +++ b/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql @@ -11,7 +11,8 @@ 'inc:type:inline-sql', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql b/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql index 2457d40..34ee345 100644 --- a/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql +++ b/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql @@ -7,7 +7,8 @@ 'inc:type:yaml-config', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ], ) }} diff --git a/examples/incremental_demo/models/engines/bigquery/bigframes/fct_events_py_incremental.ff.py 
b/examples/incremental_demo/models/engines/bigquery/bigframes/fct_events_py_incremental.ff.py new file mode 100644 index 0000000..1d41c9c --- /dev/null +++ b/examples/incremental_demo/models/engines/bigquery/bigframes/fct_events_py_incremental.ff.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." + ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:bigquery", + "kind:python", + "kind:incremental", + ], +) +def build(events_df: BFDataFrame) -> BFDataFrame: + """ + Python incremental example for BigQuery (BigFrames client). + + Applies a small transformation; incremental merge/insert behaviour is + governed by the project.yml incremental config. 
+ """ + _get_bigframes() + df = events_df.copy() + df["value_x10"] = df["value"] * 10 + return df[["event_id", "updated_at", "value", "value_x10"]] diff --git a/examples/incremental_demo/models/engines/bigquery/pandas/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/bigquery/pandas/fct_events_py_incremental.ff.py new file mode 100644 index 0000000..8408ba7 --- /dev/null +++ b/examples/incremental_demo/models/engines/bigquery/pandas/fct_events_py_incremental.ff.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import pandas as pd +from fastflowtransform import engine_model + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:bigquery", + "kind:python", + "kind:incremental", + ], +) +def build(events_df: pd.DataFrame) -> pd.DataFrame: + """ + Python incremental example for BigQuery (pandas client). + + Computes the delta dataset; merge/update behaviour is configured via + project.yml β†’ models.incremental.fct_events_py_incremental.ff. 
+ """ + df = events_df.copy() + df["value_x10"] = df["value"] * 10 + return df[["event_id", "updated_at", "value", "value_x10"]] diff --git a/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py index cd414cd..4eb369a 100644 --- a/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py +++ b/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py @@ -1,6 +1,23 @@ +from typing import TYPE_CHECKING, Any + from fastflowtransform import engine_model -from pyspark.sql import DataFrame as SparkDataFrame -from pyspark.sql import functions as F + +if TYPE_CHECKING: + from pyspark.sql import DataFrame as SparkDataFrame + from pyspark.sql import functions as F +else: + SparkDataFrame = Any + F = Any + + +def _get_functions() -> Any: + try: + from pyspark.sql import functions as _F + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." + ) from exc + return _F @engine_model( @@ -17,12 +34,9 @@ ) def build(events_df: SparkDataFrame) -> SparkDataFrame: """ - Python-Incremental-Beispiel (Databricks Spark). - - Auch hier: - - Build-Snapshot im Python-Model - - Merge/Delta wird ΓΌber Konfiguration gesteuert. + Python-Incremental-Example (Databricks Spark). 
""" - return events_df.withColumn("value_x10", F.col("value") * F.lit(10)).select( + _F = _get_functions() + return events_df.withColumn("value_x10", _F.col("value") * _F.lit(10)).select( "event_id", "updated_at", "value", "value_x10" ) diff --git a/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py index d1deb9e..cdff7e2 100644 --- a/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py +++ b/examples/incremental_demo/models/engines/duckdb/fct_events_py_incremental.ff.py @@ -15,15 +15,6 @@ ], ) def build(events_df: pd.DataFrame) -> pd.DataFrame: - """ - Python-Incremental-Beispiel (DuckDB). - - Dieses Modell baut immer einen vollstΓ€ndigen Snapshot: - - ggf. leichte Transformation - Die eigentliche Incremental-Logik (Merge per unique_key, Delta-Spalten etc.) - kommt aus project.yml β†’ models.incremental.fct_events_py_incremental.ff. - """ - # kleine Beispiel-Transformation: value * 10 df = events_df.copy() df["value_x10"] = df["value"] * 10 return df[["event_id", "updated_at", "value", "value_x10"]] diff --git a/examples/incremental_demo/profiles.yml b/examples/incremental_demo/profiles.yml index a982986..8ab139b 100644 --- a/examples/incremental_demo/profiles.yml +++ b/examples/incremental_demo/profiles.yml @@ -65,3 +65,21 @@ dev_databricks_hudi: # Keep warehouse on disk under the project (matches warehouse_dir) spark.sql.warehouse.dir: "file://{{ project_dir() }}/.local/spark_warehouse_hudi" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'incremental_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 
'incremental_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + # allow_create_dataset: true # uncomment to auto-create dataset on first run diff --git a/examples/macros_demo/.env.dev_bigquery_bigframes b/examples/macros_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..d0b6ceb --- /dev/null +++ b/examples/macros_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,7 @@ +# BigQuery profile for the macros demo (BigFrames) +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=macros_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/macros_demo/.env.dev_bigquery_pandas b/examples/macros_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..22cfb93 --- /dev/null +++ b/examples/macros_demo/.env.dev_bigquery_pandas @@ -0,0 +1,7 @@ +# BigQuery profile for the macros demo (pandas) +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=macros_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/macros_demo/.env.dev_databricks b/examples/macros_demo/.env.dev_databricks index 45685d8..9362667 100644 --- a/examples/macros_demo/.env.dev_databricks +++ b/examples/macros_demo/.env.dev_databricks @@ -3,4 +3,8 @@ FF_SPARK_APP_NAME=macros_demo FF_DBR_ENABLE_HIVE=1 FF_DBR_DATABASE=macros_demo # FF_DBR_TABLE_FORMAT=delta -JAVA_HOME=/opt/homebrew/opt/openjdk@17 + +# Prefer an existing JAVA_HOME (e.g., in CI); fall back to the macOS brew path for local use. 
+if [ -z "${JAVA_HOME:-}" ] && [ -d "/opt/homebrew/opt/openjdk@17" ]; then + JAVA_HOME=/opt/homebrew/opt/openjdk@17 +fi diff --git a/examples/macros_demo/Makefile b/examples/macros_demo/Makefile index b5cd9bc..fbc7b75 100644 --- a/examples/macros_demo/Makefile +++ b/examples/macros_demo/Makefile @@ -4,6 +4,7 @@ DB ?= .local/macros_demo.duckdb PROJECT ?= . UV ?= uv ENGINE ?= duckdb +BQ_FRAME ?= bigframes UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) @@ -24,8 +25,21 @@ ifeq ($(ENGINE),databricks_spark) PROFILE_ENV = dev_databricks ENGINE_TAG = engine:databricks_spark endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) + +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif -BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) RUN_ENV = $(BASE_ENV) SELECT_FLAGS = --select tag:example:macros_demo --select tag:$(ENGINE_TAG) @@ -36,6 +50,8 @@ else ifeq ($(ENGINE),postgres) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/macros_demo/README.md b/examples/macros_demo/README.md index 5e977f7..95ca70c 100644 --- a/examples/macros_demo/README.md +++ b/examples/macros_demo/README.md @@ -5,3 +5,9 @@ Next steps: 1. Update `profiles.yml` with real connection details (docs/Profiles.md). 2. 
Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). 3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). + +## Engines + +- DuckDB/Postgres/Databricks Spark are pre-wired. Use `make demo ENGINE=duckdb|postgres|databricks_spark`. +- BigQuery (pandas or BigFrames) mirrors the basic demo setup. Set `ENGINE=bigquery` and optionally `BQ_FRAME=pandas|bigframes` (default bigframes), then run `make demo ENGINE=bigquery BQ_FRAME=bigframes`. +- Sample env files: `.env.dev_bigquery_bigframes` and `.env.dev_bigquery_pandas` contain the required `FF_BQ_*` variables and `GOOGLE_APPLICATION_CREDENTIALS` hint. diff --git a/examples/macros_demo/models/common/dim_users.ff.sql b/examples/macros_demo/models/common/dim_users.ff.sql index 7769d2d..30bc807 100644 --- a/examples/macros_demo/models/common/dim_users.ff.sql +++ b/examples/macros_demo/models/common/dim_users.ff.sql @@ -1,20 +1,30 @@ {{ config( materialized='table', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] ) }} with u as ( select * from {{ ref('stg_users.ff') }} ), labels as ( - -- Tiny lookup generated by Python render macro - select * from (values {{ csv_values( - [ + -- Tiny lookup generated at render time (engine-aware) + {% set labels = [ {"domain":"example.com", "label":"internal"}, {"domain":"gmail.com", "label":"consumer"}, - ], - ["domain","label"] - ) }}) as t(domain, label) + ] %} + {%- if engine('duckdb') == 'bigquery' -%} + select * from unnest([ + {%- for row in labels -%} + struct('{{ row['domain'] }}' as domain, '{{ row['label'] }}' as label){% if not loop.last %},{% endif %} + {%- endfor -%} + ]) + {%- else -%} + select * from (values + {%- for row in labels -%} + ('{{ row['domain'] }}', '{{ row['label'] }}'){% if not loop.last %},{% 
endif %} + {%- endfor -%} + ) as t(domain, label) + {%- endif -%} ) select u.user_id, diff --git a/examples/macros_demo/models/common/fct_user_sales.ff.sql b/examples/macros_demo/models/common/fct_user_sales.ff.sql index 1f3de13..c3e8fa2 100644 --- a/examples/macros_demo/models/common/fct_user_sales.ff.sql +++ b/examples/macros_demo/models/common/fct_user_sales.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='table', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] ) }} with o as ( diff --git a/examples/macros_demo/models/common/stg_orders.ff.sql b/examples/macros_demo/models/common/stg_orders.ff.sql index b92e9b1..f872e6c 100644 --- a/examples/macros_demo/models/common/stg_orders.ff.sql +++ b/examples/macros_demo/models/common/stg_orders.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='view', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] ) }} select diff --git a/examples/macros_demo/models/common/stg_users.ff.sql b/examples/macros_demo/models/common/stg_users.ff.sql index 4a14e62..1da7408 100644 --- a/examples/macros_demo/models/common/stg_users.ff.sql +++ b/examples/macros_demo/models/common/stg_users.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='view', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] ) }} with src as ( diff --git a/examples/macros_demo/models/engines/bigquery/bigframes/py_exmaple.ff.py b/examples/macros_demo/models/engines/bigquery/bigframes/py_exmaple.ff.py new file mode 100644 
index 0000000..027e377 --- /dev/null +++ b/examples/macros_demo/models/engines/bigquery/bigframes/py_exmaple.ff.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + import bigframes.pandas as bpd + from bigframes.pandas import DataFrame as BFDataFrame +else: + bpd: Any = None + + class BFDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _get_bigframes() -> Any: + try: + import bigframes.pandas as bpd_mod + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." + ) from exc + return bpd_mod + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "bigframes", + }, + name="py_example", + deps=["fct_user_sales.ff"], + tags=["example:macros_demo", "scope:engine", "engine:bigquery"], +) +def produce(_: BFDataFrame) -> BFDataFrame: + # In a real project, you might fetch extra metadata here or post-process + bpd_mod = _get_bigframes() + return bpd_mod.DataFrame([{"note": "Python model ran on BigQuery (BigFrames)"}]) diff --git a/examples/macros_demo/models/engines/bigquery/pandas/py_exmaple.ff.py b/examples/macros_demo/models/engines/bigquery/pandas/py_exmaple.ff.py new file mode 100644 index 0000000..eadea80 --- /dev/null +++ b/examples/macros_demo/models/engines/bigquery/pandas/py_exmaple.ff.py @@ -0,0 +1,17 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + env_match={ + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": "pandas", + }, + name="py_example", + deps=["fct_user_sales.ff"], + tags=["example:macros_demo", "scope:engine", "engine:bigquery"], +) +def produce(_: pd.DataFrame) -> pd.DataFrame: + # In a real project, you might fetch extra metadata here or post-process + return pd.DataFrame([{"note": "Python model ran on BigQuery (pandas)"}]) 
diff --git a/examples/macros_demo/models/engines/databricks_spark/py_exmaple.ff.py b/examples/macros_demo/models/engines/databricks_spark/py_exmaple.ff.py new file mode 100644 index 0000000..e3ce072 --- /dev/null +++ b/examples/macros_demo/models/engines/databricks_spark/py_exmaple.ff.py @@ -0,0 +1,38 @@ +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + from pyspark.sql import DataFrame as SparkDataFrame +else: + + class SparkDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _ensure_spark_session(df: Any): + try: + from pyspark.sql import SparkSession as _SparkSession + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." + ) from exc + + session = getattr(df, "sparkSession", None) + if session is None: + session = _SparkSession.getActiveSession() + if session is None: + session = _SparkSession.builder.getOrCreate() + return session + + +@engine_model( + only="databricks_spark", + name="py_example", + deps=["fct_user_sales.ff"], + tags=["example:macros_demo", "scope:engine", "engine:databricks_spark"], +) +def produce(df: SparkDataFrame) -> SparkDataFrame: + # Use the incoming Spark session to return a simple marker row + spark = _ensure_spark_session(df) + return spark.createDataFrame([{"note": "Python model ran on Databricks Spark"}]) diff --git a/examples/macros_demo/models/engines/postgres/py_exmaple.ff.py b/examples/macros_demo/models/engines/postgres/py_exmaple.ff.py new file mode 100644 index 0000000..01989e8 --- /dev/null +++ b/examples/macros_demo/models/engines/postgres/py_exmaple.ff.py @@ -0,0 +1,14 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + only="postgres", + name="py_example", + deps=["fct_user_sales.ff"], + tags=["example:macros_demo", "scope:engine", "engine:postgres"], +) +def produce(_: pd.DataFrame) -> 
pd.DataFrame: + # In a real project, you might fetch extra metadata here or post-process + return pd.DataFrame([{"note": "Python model ran on Postgres"}]) diff --git a/examples/macros_demo/models/macros/utils.sql b/examples/macros_demo/models/macros/utils.sql index e72debe..eb0020b 100644 --- a/examples/macros_demo/models/macros/utils.sql +++ b/examples/macros_demo/models/macros/utils.sql @@ -1,16 +1,25 @@ {# Reusable SQL helpers #} {%- macro email_domain(expr) -%} +{%- set e = engine('duckdb') -%} +{%- if e == 'bigquery' -%} + lower(split({{ expr }}, '@')[SAFE_OFFSET(1)]) +{%- else -%} lower(split_part({{ expr }}, '@', 2)) +{%- endif -%} {%- endmacro -%} {%- macro safe_cast_amount(expr) -%} {# engine-aware numeric type #} {%- set e = engine('duckdb') -%} -{%- if e in ['duckdb', 'postgres'] -%} +{%- if e == 'duckdb' -%} cast({{ expr }} as double) +{%- elif e == 'postgres' -%} + cast({{ expr }} as double precision) {%- elif e == 'databricks_spark' -%} cast({{ expr }} as double) +{%- elif e == 'bigquery' -%} + cast({{ expr }} as float64) {%- else -%} cast({{ expr }} as double) {%- endif -%} diff --git a/examples/macros_demo/profiles.yml b/examples/macros_demo/profiles.yml index 143ee7a..3da5cee 100644 --- a/examples/macros_demo/profiles.yml +++ b/examples/macros_demo/profiles.yml @@ -21,3 +21,21 @@ dev_databricks: spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'macros_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + allow_create_dataset: true + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'macros_demo') }}" + location: "{{ 
env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + allow_create_dataset: true diff --git a/examples/macros_demo/site/dag/dim_users.ff.html b/examples/macros_demo/site/dag/dim_users.ff.html deleted file mode 100644 index 724aa2c..0000000 --- a/examples/macros_demo/site/dag/dim_users.ff.html +++ /dev/null @@ -1,150 +0,0 @@ - - - - - - dim_users.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- dim_users.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
dim_users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/macros_demo/models/common/dim_users.ff.sql - -
- -
Dependencies
-
- - - -
- - -
Referenced by
-
- -
- -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/macros_demo/site/dag/fct_user_sales.ff.html b/examples/macros_demo/site/dag/fct_user_sales.ff.html deleted file mode 100644 index 2b6b987..0000000 --- a/examples/macros_demo/site/dag/fct_user_sales.ff.html +++ /dev/null @@ -1,152 +0,0 @@ - - - - - - fct_user_sales.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- fct_user_sales.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
fct_user_sales
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/macros_demo/models/common/fct_user_sales.ff.sql - -
- -
Dependencies
-
- - - -
- - -
Referenced by
-
- -
- -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/macros_demo/site/dag/index.html b/examples/macros_demo/site/dag/index.html deleted file mode 100644 index 9d3874d..0000000 --- a/examples/macros_demo/site/dag/index.html +++ /dev/null @@ -1,393 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - β€’ - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - dim_users_ff["dim_users.ff
(dim_users)"] - class dim_users_ff sql; - fct_user_sales_ff["fct_user_sales.ff
(fct_user_sales)"] - class fct_user_sales_ff sql; - py_example("py_example
(py_example)") - class py_example py; - stg_orders_ff["stg_orders.ff
(stg_orders)"] - class stg_orders_ff sql; - stg_users_ff["stg_users.ff
(stg_users)"] - class stg_users_ff sql; - stg_orders_ff --> fct_user_sales_ff - dim_users_ff --> fct_user_sales_ff - stg_users_ff --> dim_users_ff - fct_user_sales_ff --> py_example -
-
- - - -
-

Macros

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypePath
Any - - python - - models/macros_py/helpers.py
csv_values - - python - - models/macros_py/helpers.py
mask_email - - python - - models/macros_py/helpers.py
slugify - - python - - models/macros_py/helpers.py
coalesce_any - - sql - - models/macros/utils.sql
default_country - - sql - - models/macros/utils.sql
email_domain - - sql - - models/macros/utils.sql
safe_cast_amount - - sql - - models/macros/utils.sql
star_except - - sql - - models/macros/star.sql
- -
-
- - - - \ No newline at end of file diff --git a/examples/macros_demo/site/dag/py_example.html b/examples/macros_demo/site/dag/py_example.html deleted file mode 100644 index be23140..0000000 --- a/examples/macros_demo/site/dag/py_example.html +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - py_example – FastFlowTransform - - - -

← Back to overview

- -
-
-

- py_example - table -

-
Model Detail β€’ FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
py_example
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/macros_demo/models/engines/duckdb/py_exmaple.ff.py - -
- -
Dependencies
-
- - - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/macros_demo/site/dag/stg_orders.ff.html b/examples/macros_demo/site/dag/stg_orders.ff.html deleted file mode 100644 index 700ea1e..0000000 --- a/examples/macros_demo/site/dag/stg_orders.ff.html +++ /dev/null @@ -1,146 +0,0 @@ - - - - - - stg_orders.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- stg_orders.ff - view -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
view
- -
Relation
-
stg_orders
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/macros_demo/models/common/stg_orders.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
-
- -
- -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/macros_demo/site/dag/stg_users.ff.html b/examples/macros_demo/site/dag/stg_users.ff.html deleted file mode 100644 index 4d79b86..0000000 --- a/examples/macros_demo/site/dag/stg_users.ff.html +++ /dev/null @@ -1,146 +0,0 @@ - - - - - - stg_users.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- stg_users.ff - view -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
view
- -
Relation
-
stg_users
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/macros_demo/models/common/stg_users.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
-
- -
- -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/.env.dev_databricks b/examples/materializations_demo/.env.dev_databricks index c7b2d16..6d5ac7f 100644 --- a/examples/materializations_demo/.env.dev_databricks +++ b/examples/materializations_demo/.env.dev_databricks @@ -8,4 +8,7 @@ FF_DBR_DATABASE=materializations_demo # Optional: Delta Lake # FF_DBR_TABLE_FORMAT=delta -JAVA_HOME=/opt/homebrew/opt/openjdk@17 +# Prefer an existing JAVA_HOME (e.g., in CI); fall back to the macOS brew path for local use. +if [ -z "${JAVA_HOME:-}" ] && [ -d "/opt/homebrew/opt/openjdk@17" ]; then + JAVA_HOME=/opt/homebrew/opt/openjdk@17 +fi diff --git a/examples/materializations_demo/models/engines/databricks_spark/demo_py_emit.ff.py b/examples/materializations_demo/models/engines/databricks_spark/demo_py_emit.ff.py index d921837..2b554ca 100644 --- a/examples/materializations_demo/models/engines/databricks_spark/demo_py_emit.ff.py +++ b/examples/materializations_demo/models/engines/databricks_spark/demo_py_emit.ff.py @@ -1,6 +1,27 @@ +from typing import TYPE_CHECKING, Any + from fastflowtransform import engine_model -from pyspark.sql import SparkSession -from pyspark.sql import DataFrame as SparkDataFrame + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + from pyspark.sql import DataFrame as SparkDataFrame +else: + + class SparkSession: # pragma: no cover - placeholder for runtime type hints + ... + + class SparkDataFrame: # pragma: no cover - placeholder for runtime type hints + ... + + +def _ensure_spark_session() -> "SparkSession": + try: + from pyspark.sql import SparkSession as _SparkSession + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "pyspark is required for this model. Install fastflowtransform[spark]." 
+ ) from exc + return _SparkSession.getActiveSession() or _SparkSession.builder.getOrCreate() @engine_model( @@ -10,7 +31,7 @@ tags=["example:materializations_demo", "scope:python", "engine:databricks_spark"], ) def fetch(_: SparkDataFrame) -> SparkDataFrame: - spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate() + spark = _ensure_spark_session() return spark.createDataFrame( [{"note": "hello from python", "emitted_at": "2020-01-01T00:00:00Z"}] ) diff --git a/examples/materializations_demo/site/dag/demo_py_emit.html b/examples/materializations_demo/site/dag/demo_py_emit.html deleted file mode 100644 index c888e6d..0000000 --- a/examples/materializations_demo/site/dag/demo_py_emit.html +++ /dev/null @@ -1,206 +0,0 @@ - - - - - - demo_py_emit – FastFlowTransform - - - -

← Back to overview

- -
-
-

- demo_py_emit - table -

-
Model Detail β€’ FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
demo_py_emit
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/engines/databricks_spark/demo_py_emit.ff.py - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
emitted_atstring - - yes - - - - β€” - - - - unknown - -
notestring - - yes - - - - β€” - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/dim_customers.ff.html b/examples/materializations_demo/site/dag/dim_customers.ff.html deleted file mode 100644 index 3d89ad5..0000000 --- a/examples/materializations_demo/site/dag/dim_customers.ff.html +++ /dev/null @@ -1,279 +0,0 @@ - - - - - - dim_customers.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- dim_customers.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
dim_customers
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/dim_customers.ff.sql - -
- -
Dependencies
-
- - - -
- - -
Referenced by
-
- -
- -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
customer_idint - - yes - - - - β€” - - - - - ?.customer_id - - direct - - - - -
customer_namestring - - yes - - - - β€” - - - - - ?.customer_name - - direct - - - - -
customer_statusstring - - yes - - - - β€” - - - - - ?.customer_status - - direct - - - - -
loaded_attimestamp - - yes - - - - β€” - - - - - ?.current_timestamp - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/fct_orders_inc.ff.html b/examples/materializations_demo/site/dag/fct_orders_inc.ff.html deleted file mode 100644 index 7897e80..0000000 --- a/examples/materializations_demo/site/dag/fct_orders_inc.ff.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - fct_orders_inc.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- fct_orders_inc.ff - incremental -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
incremental
- -
Relation
-
fct_orders_inc
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/fct_orders_inc.ff.sql - -
- -
Dependencies
-
- - - -
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
order_idint - - yes - - - - β€” - - - - - ?.order_id - - direct - - - - -
customer_idint - - yes - - - - β€” - - - - - ?.customer_id - - direct - - - - -
amountdouble - - yes - - - - β€” - - - - - ?.amount - - direct - - - - -
order_tstimestamp - - yes - - - - β€” - - - - - ?.order_ts - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/index.html b/examples/materializations_demo/site/dag/index.html deleted file mode 100644 index a630c86..0000000 --- a/examples/materializations_demo/site/dag/index.html +++ /dev/null @@ -1,358 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - β€’ - Materialization: - - table - - view - - ephemeral - - incremental - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - demo_py_emit("demo_py_emit
(demo_py_emit)") - class demo_py_emit py; - dim_customers_ff["dim_customers.ff
(dim_customers)"] - class dim_customers_ff sql; - fct_orders_inc_ff["fct_orders_inc.ff
(fct_orders_inc)"] - class fct_orders_inc_ff sql; - mart_order_summary_ff["mart_order_summary.ff
(mart_order_summary)"] - class mart_order_summary_ff sql; - order_flags_ephemeral_ff["order_flags_ephemeral.ff
(order_flags_ephemeral)"] - class order_flags_ephemeral_ff sql; - stg_customers_ff["stg_customers.ff
(stg_customers)"] - class stg_customers_ff sql; - stg_orders_ff["stg_orders.ff
(stg_orders)"] - class stg_orders_ff sql; - stg_orders_ff --> order_flags_ephemeral_ff - stg_orders_ff --> mart_order_summary_ff - order_flags_ephemeral_ff --> mart_order_summary_ff - stg_customers_ff --> mart_order_summary_ff - stg_orders_ff --> fct_orders_inc_ff - stg_customers_ff --> dim_customers_ff - dim_customers_ff --> demo_py_emit -
-
- - - -
-

Macros

- - - - - - - - - - - - - - - - - - -
NameTypePath
dtype_double - - sql - - models/macros/types.sql.j2
- -
-
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/mart_order_summary.ff.html b/examples/materializations_demo/site/dag/mart_order_summary.ff.html deleted file mode 100644 index cf5e1b0..0000000 --- a/examples/materializations_demo/site/dag/mart_order_summary.ff.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - mart_order_summary.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- mart_order_summary.ff - table -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
mart_order_summary
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/mart_order_summary.ff.sql - -
- -
Dependencies
- - - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
customer_idint - - yes - - - - β€” - - - - unknown - -
customer_namestring - - yes - - - - β€” - - - - unknown - -
customer_statusstring - - yes - - - - β€” - - - - unknown - -
order_countbigint - - yes - - - - β€” - - - - unknown - -
big_order_countbigint - - yes - - - - β€” - - - - unknown - -
total_amountdouble - - yes - - - - β€” - - - - unknown - -
first_order_tstimestamp - - yes - - - - β€” - - - - unknown - -
last_order_tstimestamp - - yes - - - - β€” - - - - unknown - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/order_flags_ephemeral.ff.html b/examples/materializations_demo/site/dag/order_flags_ephemeral.ff.html deleted file mode 100644 index 095150c..0000000 --- a/examples/materializations_demo/site/dag/order_flags_ephemeral.ff.html +++ /dev/null @@ -1,150 +0,0 @@ - - - - - - order_flags_ephemeral.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- order_flags_ephemeral.ff - ephemeral -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
ephemeral
- -
Relation
-
order_flags_ephemeral
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/order_flags_ephemeral.ff.sql - -
- -
Dependencies
-
- - - -
- - -
Referenced by
-
- -
- -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/stg_customers.ff.html b/examples/materializations_demo/site/dag/stg_customers.ff.html deleted file mode 100644 index 64ef95b..0000000 --- a/examples/materializations_demo/site/dag/stg_customers.ff.html +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - stg_customers.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- stg_customers.ff - view -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
view
- -
Relation
-
stg_customers
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/stg_customers.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
customer_idint - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
customer_namestring - - yes - - - - β€” - - - - - ?.name - - direct - - - - -
customer_statusstring - - yes - - - - β€” - - - - - ?.status - - direct - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/materializations_demo/site/dag/stg_orders.ff.html b/examples/materializations_demo/site/dag/stg_orders.ff.html deleted file mode 100644 index 959ffd5..0000000 --- a/examples/materializations_demo/site/dag/stg_orders.ff.html +++ /dev/null @@ -1,273 +0,0 @@ - - - - - - stg_orders.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- stg_orders.ff - view -

-
Model Detail β€’ FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
view
- -
Relation
-
stg_orders
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/materializations_demo/models/common/stg_orders.ff.sql - -
- -
Dependencies
-
- - – - -
- - -
Referenced by
- - -
-
- - - - -
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
order_idint - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
customer_idint - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
amountdouble - - yes - - - - β€” - - - - unknown - -
order_tstimestamp - - yes - - - - β€” - - - - - ?.? - - transformed - - - - -
-
- - - -
- - - - \ No newline at end of file diff --git a/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json b/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json deleted file mode 100644 index 5bb2736..0000000 --- a/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "engine": "duckdb", - "entries": { - "ephemeral_ids.ff": "56f5222b3e2469d71e63fb1f664ae43bf2740b5cd64413ed6cb66fa20189ecc5", - "mart_orders_enriched": "bb79bdd27cdd29755517a3b3a28282e27ebdb692b39557295d3f30dee6af45c3", - "mart_users.ff": "a1ac028ccbe1496f0c3d0d54f6f59af224fec0aae5e63cfb271bb62a8fb640ae", - "orders.ff": "d6c8111b8a3d35990f7ca2062e1988a61cd64862c64fc053ffd173197035db10", - "users.ff": "6c1fbf51b1d449282d68b7667923039c177b350219f5709bfa29c62ceb257069", - "users_enriched": "867768eaf110ff081e5b0e0fddb5eead4c0f27209fd40c29b5b0f99d20602608", - "v_users.ff": "63c4587f79f58540afe06b8841a5b4e2b4de5171586b4007a32486393a59c24e", - "v_users_enriched.ff": "e19645f66bb1652495b3608cbc4cb658e90a9f9662cf7f927b8d999b9bdfee2f" - }, - "profile": "dev", - "version": 1 -} \ No newline at end of file diff --git a/examples/simple_duckdb/.fastflowtransform/target/catalog.json b/examples/simple_duckdb/.fastflowtransform/target/catalog.json deleted file mode 100644 index 267b6af..0000000 --- a/examples/simple_duckdb/.fastflowtransform/target/catalog.json +++ /dev/null @@ -1,179 +0,0 @@ -{ - "metadata": { - "generated_at": "2025-11-11T08:35:42+00:00", - "tool": "fastflowtransform" - }, - "relations": { - "ephemeral_ids": { - "columns": [] - }, - "mart_orders_enriched": { - "columns": [ - { - "dtype": "BIGINT", - "name": "order_id", - "nullable": true - }, - { - "dtype": "BIGINT", - "name": "user_id", - "nullable": true - }, - { - "dtype": "DOUBLE", - "name": "amount", - "nullable": true - }, - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": 
"TIMESTAMP", - "name": "signup_ts", - "nullable": true - }, - { - "dtype": "BOOLEAN", - "name": "is_gmail", - "nullable": true - }, - { - "dtype": "BOOLEAN", - "name": "valid_amt", - "nullable": true - } - ] - }, - "mart_users": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": "BOOLEAN", - "name": "is_gmail", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email_domain", - "nullable": true - } - ] - }, - "orders": { - "columns": [ - { - "dtype": "BIGINT", - "name": "order_id", - "nullable": true - }, - { - "dtype": "BIGINT", - "name": "user_id", - "nullable": true - }, - { - "dtype": "DOUBLE", - "name": "amount", - "nullable": true - } - ] - }, - "users": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": "DATE", - "name": "signup_ts", - "nullable": true - } - ] - }, - "users_enriched": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": "TIMESTAMP", - "name": "signup_ts", - "nullable": true - }, - { - "dtype": "BOOLEAN", - "name": "is_gmail", - "nullable": true - } - ] - }, - "v_users": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email_upper", - "nullable": true - } - ] - }, - "v_users_enriched": { - "columns": [ - { - "dtype": "BIGINT", - "name": "id", - "nullable": true - }, - { - "dtype": "VARCHAR", - "name": "email", - "nullable": true - }, - { - "dtype": "BOOLEAN", - "name": "is_gmail", - "nullable": true - } - ] - } - } -} diff --git a/examples/simple_duckdb/.fastflowtransform/target/manifest.json b/examples/simple_duckdb/.fastflowtransform/target/manifest.json 
deleted file mode 100644 index b1a7af4..0000000 --- a/examples/simple_duckdb/.fastflowtransform/target/manifest.json +++ /dev/null @@ -1,126 +0,0 @@ -{ - "macros": { - "nz": "models/macros/util.sql", - "on_or_before": "models/macros/util.sql", - "sql_email_domain": "models/macros_py/sql_helpers.py", - "upper_col": "models/macros/util.sql" - }, - "metadata": { - "generated_at": "2025-11-11T08:35:42+00:00", - "tool": "fastflowtransform" - }, - "nodes": { - "ephemeral_ids.ff": { - "deps": [ - "users.ff" - ], - "kind": "sql", - "materialized": "ephemeral", - "name": "ephemeral_ids.ff", - "path": "models/ephemeral_ids.ff.sql", - "relation": "ephemeral_ids" - }, - "mart_orders_enriched": { - "deps": [ - "orders.ff", - "users_enriched" - ], - "kind": "python", - "materialized": "table", - "name": "mart_orders_enriched", - "path": "models/mart_orders_enriched.ff.py", - "relation": "mart_orders_enriched" - }, - "mart_users.ff": { - "deps": [ - "users_enriched" - ], - "kind": "sql", - "materialized": "table", - "name": "mart_users.ff", - "path": "models/mart_users.ff.sql", - "relation": "mart_users" - }, - "orders.ff": { - "deps": [], - "kind": "sql", - "materialized": "table", - "name": "orders.ff", - "path": "models/orders.ff.sql", - "relation": "orders" - }, - "users.ff": { - "deps": [], - "kind": "sql", - "materialized": "table", - "name": "users.ff", - "path": "models/users.ff.sql", - "relation": "users" - }, - "users_enriched": { - "deps": [ - "users.ff" - ], - "kind": "python", - "materialized": "table", - "name": "users_enriched", - "path": "models/users_enriched.ff.py", - "relation": "users_enriched" - }, - "v_users.ff": { - "deps": [ - "ephemeral_ids.ff", - "users.ff" - ], - "kind": "sql", - "materialized": "view", - "name": "v_users.ff", - "path": "models/v_users.ff.sql", - "relation": "v_users" - }, - "v_users_enriched.ff": { - "deps": [ - "users_enriched" - ], - "kind": "sql", - "materialized": "view", - "name": "v_users_enriched.ff", - "path": 
"models/v_users_enriched.ff.sql", - "relation": "v_users_enriched" - } - }, - "sources": { - "crm": { - "users": { - "base": { - "catalog": null, - "database": null, - "dataset": null, - "format": null, - "identifier": "seed_users", - "location": null, - "options": {}, - "project": null, - "schema": null - }, - "overrides": {} - } - }, - "erp": { - "orders": { - "base": { - "catalog": null, - "database": null, - "dataset": null, - "format": null, - "identifier": "seed_orders", - "location": null, - "options": {}, - "project": null, - "schema": null - }, - "overrides": {} - } - } - } -} diff --git a/examples/simple_duckdb/.fastflowtransform/target/run_results.json b/examples/simple_duckdb/.fastflowtransform/target/run_results.json deleted file mode 100644 index cee9bdf..0000000 --- a/examples/simple_duckdb/.fastflowtransform/target/run_results.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "metadata": { - "generated_at": "2025-11-11T08:35:42+00:00", - "tool": "fastflowtransform" - }, - "results": [ - { - "duration_ms": 1, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "ephemeral_ids.ff", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 4, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "mart_orders_enriched", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 2, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "mart_users.ff", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 6, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "orders.ff", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 2, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "users.ff", - "started_at": 
"2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 3, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "users_enriched", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 3, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "v_users.ff", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - }, - { - "duration_ms": 1, - "finished_at": "2025-11-11T08:35:42+00:00", - "http": null, - "message": null, - "name": "v_users_enriched.ff", - "started_at": "2025-11-11T08:35:42+00:00", - "status": "success" - } - ], - "run_finished_at": "2025-11-11T08:35:42+00:00", - "run_started_at": "2025-11-11T08:35:42+00:00" -} diff --git a/examples/simple_duckdb/Makefile b/examples/simple_duckdb/Makefile deleted file mode 100644 index f333327..0000000 --- a/examples/simple_duckdb/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# examples/simple_duckdb/Makefile - -DB ?= .local/demo.duckdb -PROJECT ?= . - -.PHONY: seed run dag test clean - -seed: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft seed "$(PROJECT)" --env dev - -run: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run $(PROJECT) --env dev - -dag: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft dag $(PROJECT) --env dev --html - -test: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft test $(PROJECT) --env dev --select batch - -clean: - rm -rf .local $(PROJECT)/docs - -utest: - fft utest . --model mart_orders_enriched --case join_and_flag - fft utest . --path tests/unit/users_enriched.yml - -utest-all: - fft utest . 
diff --git a/examples/simple_duckdb/__init__.py b/examples/simple_duckdb/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/examples/simple_duckdb/models/ephemeral_ids.ff.sql b/examples/simple_duckdb/models/ephemeral_ids.ff.sql deleted file mode 100644 index 3fe0968..0000000 --- a/examples/simple_duckdb/models/ephemeral_ids.ff.sql +++ /dev/null @@ -1,7 +0,0 @@ -{{ config(materialized='ephemeral', tags=['helper']) }} - --- Use var('day') and macro on_or_before() --- In this demo we just return all user ids; tweak to your data shape if you add a date column later. -select id -from {{ ref('users.ff') }} -where {{ on_or_before("signup_ts", "'" ~ var('day','2000-01-01') ~ "'") }} diff --git a/examples/simple_duckdb/models/macros/util.sql b/examples/simple_duckdb/models/macros/util.sql deleted file mode 100644 index 65d1c60..0000000 --- a/examples/simple_duckdb/models/macros/util.sql +++ /dev/null @@ -1,14 +0,0 @@ -{# String/date helpers #} -{% macro upper_col(c) -%} -upper({{ c }}) -{%- endmacro %} - -{% macro on_or_before(ts_col, day_var) -%} --- day_var is a string like 'YYYY-MM-DD' -{{ ts_col }} <= cast({{ day_var }} as date) -{%- endmacro %} - -{# Tiny quality helper: nullable->coalesce #} -{% macro nz(expr, fallback) -%} -coalesce({{ expr }}, {{ fallback }}) -{%- endmacro %} diff --git a/examples/simple_duckdb/models/macros_py/sql_helpers.py b/examples/simple_duckdb/models/macros_py/sql_helpers.py deleted file mode 100644 index 5458702..0000000 --- a/examples/simple_duckdb/models/macros_py/sql_helpers.py +++ /dev/null @@ -1,8 +0,0 @@ -def sql_email_domain(col: str, engine: str = "duckdb") -> str: - """ - Return a SQL snippet that extracts the email domain from a column/expression. - Intended for DuckDB/Postgres. 
Use like: {{ sql_email_domain("u.email") }} AS email_domain - """ - col = col.strip() - base = f"coalesce({col}, '')" - return f"lower(split_part({base}, '@', 2))" diff --git a/examples/simple_duckdb/models/mart_orders_enriched.ff.py b/examples/simple_duckdb/models/mart_orders_enriched.ff.py deleted file mode 100644 index cd6aef2..0000000 --- a/examples/simple_duckdb/models/mart_orders_enriched.ff.py +++ /dev/null @@ -1,23 +0,0 @@ -# examples/simple_duckdb/models/mart_orders_enriched.ff.py - -import pandas as pd - -from fastflowtransform import model - - -@model( - name="mart_orders_enriched", - deps=["orders.ff", "users_enriched"], - require={ - "orders.ff": ["order_id", "user_id", "amount"], # logical name works - "users_enriched": ["id", "email", "is_gmail"], # physical relation works too - }, -) -def build(dfs: dict[str, pd.DataFrame]) -> pd.DataFrame: - # Keys already resolve to physical relations via relation_for(): "orders", "users_enriched" - orders = dfs["orders"] - users = dfs["users_enriched"] - out = orders.merge(users, left_on="user_id", right_on="id", how="left").assign( - valid_amt=lambda x: x["amount"] >= 0 - ) - return out diff --git a/examples/simple_duckdb/models/mart_users.ff.sql b/examples/simple_duckdb/models/mart_users.ff.sql deleted file mode 100644 index d646fde..0000000 --- a/examples/simple_duckdb/models/mart_users.ff.sql +++ /dev/null @@ -1,8 +0,0 @@ --- materialized table from a view (keeps example simple) -create or replace table mart_users as -select - id, - email, - is_gmail, - {{ sql_email_domain("email") }} as email_domain -from {{ ref('users_enriched') }}; diff --git a/examples/simple_duckdb/models/orders.ff.sql b/examples/simple_duckdb/models/orders.ff.sql deleted file mode 100644 index d8aeb46..0000000 --- a/examples/simple_duckdb/models/orders.ff.sql +++ /dev/null @@ -1,4 +0,0 @@ --- Independent of users.ff β†’ eligible for parallel execution within the same level (v0.3). 
-create or replace table orders as -select order_id, user_id, amount -from {{ source('erp','orders') }}; diff --git a/examples/simple_duckdb/models/users.ff.sql b/examples/simple_duckdb/models/users.ff.sql deleted file mode 100644 index c2e3fc7..0000000 --- a/examples/simple_duckdb/models/users.ff.sql +++ /dev/null @@ -1,8 +0,0 @@ --- This staging node has no upstream model dependencies and can run in parallel --- with other independent nodes (v0.3 parallel scheduler). -create or replace table users as -select - id, - email, - cast(signup_ts as date) as signup_ts -from {{ source('crm','users') }}; diff --git a/examples/simple_duckdb/models/users_enriched.ff.py b/examples/simple_duckdb/models/users_enriched.ff.py deleted file mode 100644 index 6fd575c..0000000 --- a/examples/simple_duckdb/models/users_enriched.ff.py +++ /dev/null @@ -1,10 +0,0 @@ -import pandas as pd - -from fastflowtransform import model - - -@model(name="users_enriched", deps=["users.ff"], require={"users.ff": ["id", "email"]}) -def enrich(df: pd.DataFrame) -> pd.DataFrame: - out = df.copy() - out["is_gmail"] = out["email"].str.endswith("@gmail.com") - return out diff --git a/examples/simple_duckdb/models/v_users.ff.sql b/examples/simple_duckdb/models/v_users.ff.sql deleted file mode 100644 index 7fd5168..0000000 --- a/examples/simple_duckdb/models/v_users.ff.sql +++ /dev/null @@ -1,7 +0,0 @@ -{{ config(materialized='view', tags=['mart']) }} -select - u.id, - u.email, - {{ upper_col("u.email") }} as email_upper -from {{ ref('users.ff') }} u -join {{ ref('ephemeral_ids.ff') }} e using(id) diff --git a/examples/simple_duckdb/models/v_users_enriched.ff.sql b/examples/simple_duckdb/models/v_users_enriched.ff.sql deleted file mode 100644 index 8c9f09c..0000000 --- a/examples/simple_duckdb/models/v_users_enriched.ff.sql +++ /dev/null @@ -1,3 +0,0 @@ -{{ config(materialized='view', tags=['mart']) }} -select id, email, is_gmail -from {{ ref('users_enriched') }} diff --git 
a/examples/simple_duckdb/profiles.yml b/examples/simple_duckdb/profiles.yml deleted file mode 100644 index 2876e7a..0000000 --- a/examples/simple_duckdb/profiles.yml +++ /dev/null @@ -1,20 +0,0 @@ -dev: - engine: duckdb - duckdb: - path: ":memory:" - -stg: - engine: postgres - postgres: - dsn: postgresql+psycopg://user:pass@localhost:5432/ffdb - schema: analytics - -prod: - engine: bigquery - bigquery: - dataset: my_company_analytics - location: EU - -# Fallback -default: - engine: duckdb diff --git a/examples/simple_duckdb/project.yml b/examples/simple_duckdb/project.yml deleted file mode 100644 index c5e73e1..0000000 --- a/examples/simple_duckdb/project.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: simple_duckdb -version: 0.1 -models_dir: models - -docs: - dag_dir: site/dag - models: - users.ff: - description: "Raw users table imported from CRM." - columns: - id: "Primary key." - email: "User email address." - users_enriched: - description: "Adds gmail flag." - columns: - is_gmail: "True if email ends with @gmail.com" - -# variables exposed to var('key') -vars: - day: "2000-01-01" - min_amount: 0 - -tests: - # Batch‑Tabellen - - type: not_null - table: users - column: id - tags: [batch] - - - type: unique - table: users - column: id - tags: [batch] - - - type: row_count_between - table: users_enriched - min: 1 - tags: [batch] - - # Streaming‑Tabelle (aus streaming.py) - - type: greater_equal - table: fct_sessions_streaming - column: revenue - threshold: 0 - tags: [streaming] - - - type: greater_equal - table: mart_orders_enriched - column: valid_amt - threshold: 0 - tags: [batch] - - - type: non_negative_sum - table: mart_orders_enriched - column: valid_amt - tags: [batch] - - - type: non_negative_sum - table: fct_sessions_streaming - column: revenue - tags: [streaming] - - - type: freshness - table: fct_sessions_streaming - column: session_end - max_delay_minutes: 15 - tags: [streaming] - - # ── Cross-table reconciliations (FF-310) ───────────────────────────────── 
- - type: reconcile_equal - name: orders_count_equals_mart - tags: [reconcile] - left: { table: orders, expr: "count(*)" } - right: { table: mart_orders_enriched, expr: "count(*)" } - # exact equality on counts (no tolerance) - - - type: reconcile_coverage - name: orders_fully_covered_in_mart - tags: [reconcile] - source: { table: orders, key: "order_id" } - target: { table: mart_orders_enriched, key: "order_id" } diff --git a/examples/simple_duckdb/seeds/seed_orders.csv b/examples/simple_duckdb/seeds/seed_orders.csv deleted file mode 100644 index f9c5a47..0000000 --- a/examples/simple_duckdb/seeds/seed_orders.csv +++ /dev/null @@ -1,4 +0,0 @@ -order_id,user_id,amount -101,1,10.0 -102,2,9.9 -103,1, diff --git a/examples/simple_duckdb/seeds/seed_users.csv b/examples/simple_duckdb/seeds/seed_users.csv deleted file mode 100644 index 53332f3..0000000 --- a/examples/simple_duckdb/seeds/seed_users.csv +++ /dev/null @@ -1,4 +0,0 @@ -id,email,signup_ts -1,a@example.com,2025-09-28 -2,b@gmail.com,2025-10-01 -3,c@gmail.com,2025-10-05 diff --git a/examples/simple_duckdb/sources.yml b/examples/simple_duckdb/sources.yml deleted file mode 100644 index 40975cf..0000000 --- a/examples/simple_duckdb/sources.yml +++ /dev/null @@ -1,11 +0,0 @@ -version: 2 - -sources: - - name: crm - tables: - - name: users - identifier: seed_users - - name: erp - tables: - - name: orders - identifier: seed_orders diff --git a/examples/simple_duckdb/tests/fixtures/orders_small.csv b/examples/simple_duckdb/tests/fixtures/orders_small.csv deleted file mode 100644 index 7addcb6..0000000 --- a/examples/simple_duckdb/tests/fixtures/orders_small.csv +++ /dev/null @@ -1,4 +0,0 @@ -order_id,user_id,amount -10,1,19.9 -11,1,-1.0 -12,2,0.0 diff --git a/examples/simple_duckdb/tests/fixtures/users_enriched_small.csv b/examples/simple_duckdb/tests/fixtures/users_enriched_small.csv deleted file mode 100644 index be3b450..0000000 --- a/examples/simple_duckdb/tests/fixtures/users_enriched_small.csv +++ /dev/null @@ 
-1,3 +0,0 @@ -id,email,is_gmail -1,x@gmail.com,true -2,a@example.com,false diff --git a/examples/simple_duckdb/tests/unit/mart_orders_enriched.yml b/examples/simple_duckdb/tests/unit/mart_orders_enriched.yml deleted file mode 100644 index f58e103..0000000 --- a/examples/simple_duckdb/tests/unit/mart_orders_enriched.yml +++ /dev/null @@ -1,32 +0,0 @@ -model: mart_orders_enriched - -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "x@gmail.com", is_gmail: true} - orders: - rows: - - {order_id: 10, user_id: 1, amount: 19.9} - - {order_id: 11, user_id: 1, amount: -1.0} - expect: - any_order: true - ignore_columns: [] # e.g. technical columns - approx: - amount: 1e-9 - -cases: - - name: join_and_flag - expect: - rows: - - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} - - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} - - - name: override_amount - inputs: - orders: - rows: - - {order_id: 12, user_id: 1, amount: 0.001} - expect: - rows: - - {order_id: 12, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 0.001, valid_amt: true} diff --git a/examples/simple_duckdb/tests/unit/mart_orders_enriched_csv.yml b/examples/simple_duckdb/tests/unit/mart_orders_enriched_csv.yml deleted file mode 100644 index 21c7722..0000000 --- a/examples/simple_duckdb/tests/unit/mart_orders_enriched_csv.yml +++ /dev/null @@ -1,18 +0,0 @@ -model: mart_orders_enriched - -defaults: - inputs: - orders: - csv: tests/fixtures/orders_small.csv - users_enriched: - csv: tests/fixtures/users_enriched_small.csv - expect: - any_order: true - -cases: - - name: join_and_flag_from_csv - expect: - rows: - - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} - - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} - - {order_id: 12, user_id: 2, email: "a@example.com", is_gmail: false, amount: 0.0, 
valid_amt: true} diff --git a/examples/simple_duckdb/tests/unit/marts_daily.yml b/examples/simple_duckdb/tests/unit/marts_daily.yml deleted file mode 100644 index 55999c1..0000000 --- a/examples/simple_duckdb/tests/unit/marts_daily.yml +++ /dev/null @@ -1,29 +0,0 @@ -model: mart_users.ff # node name (filename including .ff) - -defaults: - inputs: - users_enriched: # physical relation (ref('users_enriched') -> "users_enriched") - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - expect: - relation: mart_users # important because the SQL materialises this table - order_by: [id] - any_order: false - ignore_columns: [] - -cases: - - name: passthrough_columns - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - - - name: override_upstream - inputs: - users_enriched: - rows: - - {id: 10, email: "x@gmail.com", is_gmail: true} - expect: - rows: - - {id: 10, email: "x@gmail.com", is_gmail: true} diff --git a/examples/simple_duckdb/tests/unit/users_enriched.yml b/examples/simple_duckdb/tests/unit/users_enriched.yml deleted file mode 100644 index a5617bf..0000000 --- a/examples/simple_duckdb/tests/unit/users_enriched.yml +++ /dev/null @@ -1,32 +0,0 @@ -model: users_enriched # logical model name (registry entry) -# engine: duckdb # optional: overrides profiles/--engine - -# Defaults – deep-merged; overridden values win per case -defaults: - inputs: - users: # physical relation (relation_for('users.ff') -> "users") - rows: - - {id: 1, email: "a@example.com"} - - {id: 2, email: "b@gmail.com"} - expect: - relation: users_enriched # default target relation (otherwise relation_for(model)) - order_by: [id] # sort before comparing - any_order: false # individual cases can set true - ignore_columns: [] # additional comparison options below - -cases: - - name: basic # relies entirely on defaults - inputs: {} # nothing else needed here - expect: - rows: - - 
{id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - - - name: new_user_overrides - inputs: - users: - rows: - - {id: 3, email: "c@hotmail.com"} # overrides defaults.inputs.users.rows - expect: - rows: - - {id: 3, email: "c@hotmail.com", is_gmail: false} diff --git a/exports/Combined.md b/exports/Combined.md index 8cf3868..f68b473 100644 --- a/exports/Combined.md +++ b/exports/Combined.md @@ -105,9 +105,10 @@ The command is non-interactive, refuses to overwrite existing directories, and l ## 1. Install & bootstrap ```bash -python -m venv .venv -. .venv/bin/activate -pip install -e ./fastflowtransform +python3 -m venv .venv +. .venv/bin/activate # or source .venv/bin/activate +pip install --upgrade pip +pip install -e . # run from the repo root; use `uv pip install --editable .` if you prefer uv fft --help ``` @@ -137,12 +138,19 @@ cat <<'SQL' > demo/models/users.ff.sql select id, email from {{ source('raw', 'users') }} SQL + +cat <<'YAML' > demo/profiles.yml +dev: + engine: duckdb + duckdb: + path: ".local/demo.duckdb" +YAML ``` ## 3. Seed static inputs ```bash -fft seed demo --profile dev +fft seed demo --env dev ``` This materializes the CSV into the configured engine (DuckDB by default) using `seed_users` as the physical table. @@ -150,7 +158,7 @@ This materializes the CSV into the configured engine (DuckDB by default) using ` ## 4. Run the pipeline ```bash -fft run demo --cache off +fft run demo --env dev --cache off ``` You should see log lines similar to `βœ“ L01 [DUCK] users.ff`. The resulting table lives in the target schema (`staging` in this example). @@ -170,6 +178,7 @@ You should see log lines similar to `βœ“ L01 [DUCK] users.ff`. 
The resulting tab - Add `project.yml` for reusable `vars:` and metadata - Explore `fft docs` to generate HTML documentation - Use engine profiles under `profiles.yml` to target Postgres, BigQuery, or Databricks (path-based sources supported via `format` + `location` overrides) +- Render the DAG site for this project: `fft dag demo --env dev --html` (find it under `demo/site/dag/index.html`) Refer to `docs/Config_and_Macros.md` for advanced configuration options. @@ -177,7 +186,7 @@ Refer to `docs/Config_and_Macros.md` for advanced configuration options. -# 🧭 FastFlowTransform – Technical Developer Documentation (v0.4) +# 🧭 FastFlowTransform – Technical Developer Documentation > Status: latest updates from your context dump. This document consolidates project structure, architecture, core APIs, error handling, CLI, examples, and roadmap into a print/git-friendly Markdown. > @@ -232,64 +241,7 @@ Refer to `docs/Config_and_Macros.md` for advanced configuration options. ### Project Layout -```text -fastflowtransform/ -β”œβ”€β”€ pyproject.toml -β”œβ”€β”€ src/ -β”‚ └── fastflowtransform/ -β”‚ β”œβ”€β”€ __init__.py -β”‚ β”œβ”€β”€ cli.py -β”‚ β”œβ”€β”€ core.py -β”‚ β”œβ”€β”€ dag.py -β”‚ β”œβ”€β”€ docs.py -β”‚ β”œβ”€β”€ errors.py -β”‚ β”œβ”€β”€ settings.py -β”‚ β”œβ”€β”€ seeding.py -β”‚ β”œβ”€β”€ testing.py -β”‚ β”œβ”€β”€ validation.py -β”‚ β”œβ”€β”€ decorators.py # optional, if not kept in core.py -β”‚ β”œβ”€β”€ docs/ -β”‚ β”‚ └── templates/ -β”‚ β”‚ β”œβ”€β”€ index.html.j2 -β”‚ β”‚ └── model.html.j2 -β”‚ β”œβ”€β”€ executors/ -β”‚ β”‚ β”œβ”€β”€ __init__.py -β”‚ β”‚ β”œβ”€β”€ base.py -β”‚ β”‚ β”œβ”€β”€ duckdb_exec.py -β”‚ β”‚ β”œβ”€β”€ postgres_exec.py -β”‚ β”‚ β”œβ”€β”€ bigquery_exec.py # pandas + BigQuery client -β”‚ β”‚ β”œβ”€β”€ bigquery_bf_exec.py # BigQuery DataFrames (bigframes) -β”‚ β”‚ β”œβ”€β”€ databricks_spark_exec.py # PySpark (without pandas) -β”‚ β”‚ └── snowflake_snowpark_exec.py# Snowpark (without pandas) -β”‚ └── streaming/ -β”‚ β”œβ”€β”€ __init__.py -β”‚ 
β”œβ”€β”€ file_tail.py -β”‚ └── sessionizer.py -β”‚ -β”œβ”€β”€ examples/ -β”‚ β”œβ”€β”€ simple_duckdb/ -β”‚ β”‚ β”œβ”€β”€ models/ -β”‚ β”‚ β”‚ β”œβ”€β”€ users.ff.sql -β”‚ β”‚ β”‚ β”œβ”€β”€ users_enriched.ff.py -β”‚ β”‚ β”‚ β”œβ”€β”€ orders.ff.sql -β”‚ β”‚ β”‚ β”œβ”€β”€ mart_orders_enriched.ff.py -β”‚ β”‚ β”‚ └── mart_users.ff.sql -β”‚ β”‚ β”œβ”€β”€ seeds/ -β”‚ β”‚ β”‚ β”œβ”€β”€ seed_users.csv -β”‚ β”‚ β”‚ └── seed_orders.csv -β”‚ β”‚ β”œβ”€β”€ sources.yml -β”‚ β”‚ β”œβ”€β”€ project.yml -β”‚ β”‚ β”œβ”€β”€ Makefile -β”‚ β”‚ └── .local/demo.duckdb (after make seed/run) -β”‚ └── postgres/ # similar structure if needed -β”‚ -β”œβ”€β”€ tests/ -β”‚ β”œβ”€β”€ conftest.py -β”‚ β”œβ”€β”€ duckdb/ … # end-to-end + unit -β”‚ β”œβ”€β”€ postgres/ … -β”‚ └── streaming/ … -└── README.md -``` +For an up-to-date view, browse the repository tree or run `find . -maxdepth 2` from the root; all examples live under `examples/` with their own READMEs. ### Example Projects and Seeds @@ -324,20 +276,6 @@ Need to understand profile precedence, `.env` layering, or the Pydantic models t Level-wise parallelism, cache modes, fingerprint formula, and the `_ff_meta` audit table are documented in [Cache_and_Parallelism.md](./Cache_and_Parallelism.md). Use that reference for CLI examples (`--jobs`, `--cache`, `--rebuild`), skip conditions, and troubleshooting tips related to concurrency. -### Roadmap Snapshot - -| Version | Content | -|---------|---------------------------------------------------| -| 0.2 | `config(materialized=...)`, Jinja macros, variables | -| 0.3 | Parallel execution, cache | -| 0.4 | Incremental models | -| 0.5 | Streaming connectors (Kafka, S3) | -| 1.0 | Stable API, plugin SDK | - -> See also: feature pyramid & roadmap phases (OSS/SaaS) in the separate document. 
- ---- - ### Cross-Table Reconciliations Reconciliation tests (`reconcile_equal`, `reconcile_ratio_within`, `reconcile_diff_within`, `reconcile_coverage`) are fully documented in the [Data Quality Test Reference](./Data_Quality_Tests.md#cross-table-reconciliations). Use that guide for YAML schemas, tolerance parameters, and engine notes before wiring the checks into `fft test`. @@ -472,13 +410,13 @@ class BaseExecutor(ABC): def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: ... ``` -**DuckDB (`duckdb_exec.py`)** +**DuckDB (`duckdb.py`)** - `run_sql(node, env)` renders Jinja (`ref/source`) and executes the SQL. - `_read_relation` loads a table as `DataFrame`; surfaces actionable errors when a dependency is missing. - `_materialize_relation` writes the `DataFrame` as a table (`create or replace table ...`). -**Postgres (`postgres_exec.py`)** +**Postgres (`postgres.py`)** - `_SAConnShim` (compatible with `testing._exec`). - `run_sql` renders SQL and rewrites `CREATE OR REPLACE TABLE` to `DROP + CREATE AS`. @@ -536,41 +474,7 @@ def seed_project(project_dir: Path, executor, schema: Optional[str] = None) -> i ### CLI Implementation -Operational usage lives in [CLI Flows](#cli-flows). This section drills into the Typer command definitions in `cli.py`. - -**Commands:** - -- `fft run [--env dev] [--engine ...]` -- `fft dag [--env dev] [--html] [--select ...] [--with-schema/--no-schema]` -- `fft docgen [--env dev] [--out dir] [--emit-json path] [--open-source]` -- `fft test [--env dev] [--select batch|streaming|tag:...]` -- `fft seed [--env dev]` -- `fft sync-db-comments [--env dev] [--dry-run]` -- `fft utest [--env dev] [--cache off|ro|rw] [--reuse-meta]` -- `fft --version` - -**Key components:** - -```python -def _load_project_and_env(project_arg) -> tuple[Path, Environment]: ... -def _resolve_profile(env_name, engine, proj) -> tuple[EnvSettings, Profile]: ... -def _get_test_con(executor: Any) -> Any: ... 
-``` - -**Test summary (exit 2 on failures):** - -``` -Data Quality Summary -──────────────────── -βœ… not_null users.email (3ms) -❌ unique users.id (2ms) - ↳ users.id has 1 duplicate - -Totals -────── -βœ“ passed: 1 -βœ— failed: 1 -``` +Operational usage lives in [CLI Flows](#cli-flows) and the dedicated [CLI Guide](CLI_Guide.md). For implementation details, see the Typer commands in `src/fastflowtransform/cli/`. --- @@ -615,7 +519,7 @@ from pathlib import Path from jinja2 import Environment, FileSystemLoader from fastflowtransform.core import REGISTRY from fastflowtransform.dag import topo_sort -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor proj = Path("examples/simple_duckdb").resolve() REGISTRY.load_project(proj) @@ -950,17 +854,17 @@ fft run . --env dev --select dim_countries_from_api --http-cache ro - Technical guide: *Developer Guide – Architecture & Internals* - Unit tests: `tests/api/test_http_*.py` -- Runtime & cache: *Parallelism & Cache (v0.3)* +- Runtime & cache: *Parallelism & Cache* -# FastFlowTransform Modeling Reference (v0.1) +# FastFlowTransform Modeling Reference > Authoritative reference for FastFlowTransform’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. -> Works with FastFlowTransform v0.1 (T1–T11). Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. -> **Execution & Cache (v0.3) quick notes** +> Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. +> **Execution & Cache quick notes** > - Parallelism is level-wise; use `fft run --jobs N`. > - Use `--cache={off|ro|rw|wo}` to control skipping behavior. > - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. @@ -1142,7 +1046,7 @@ Call `config()` at the top of SQL models. 
Python models get the same options via ) }} ``` -Supported keys (v0.1): +Supported keys: | Key | Type | Description | |----------------|-----------------|------------------------------------------------------------------------------| @@ -1265,7 +1169,7 @@ from {{ ref('users.ff') }}; - Default β†’ materialized as `table`. - `materialized='view'` produces an engine-specific temporary table first, then creates/overwrites a view that selects from it. -- Ephemeral Python models are not supported in v0.1. +- Ephemeral Python models are not supported. --- @@ -1597,7 +1501,7 @@ FF_RUN_DATE=2025-01-01 fft run . --env dev --cache=rw @@ -10,6 +10,7 @@ - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) - [Modeling Reference](./Config_and_Macros.md) - - [Parallelism & Cache (v0.3)](./Cache_and_Parallelism.md) +- [Parallelism & Cache](./Cache_and_Parallelism.md) - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) ```` @@ -2756,7 +2660,7 @@ FastFlowTransform’s CLI is the entry point for seeding data, running DAGs, gen | `fft seed [--env dev]` | Materialize CSV/Parquet seeds into the configured engine. | | `fft run [--env dev]` | Execute the DAG (obeys cache + parallel flags). | | `fft dag --html` | Render the DAG graph/site for quick inspection. | -| `fft docgen --out site/docs` | Generate the full documentation bundle (graph + model pages + optional JSON). | +| `fft docgen [--out site/docs] [--emit-json path] [--open-source]` | Generate the full documentation bundle (graph + model pages + optional JSON). Default output is `/site/docs`. | | `fft test [--env dev]` | Run schema/data-quality tests defined in `project.yml` or schema YAML files. | | `fft utest ` | Execute unit tests defined under `tests/unit/*.yml`. | | `fft sync-db-comments ` | Push model/column descriptions into Postgres or Snowflake comments. 
| @@ -3128,11 +3032,13 @@ Error types map to the classes documented in `docs/Technical_Overview.md#core-mo # Basic Demo Project -The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, and Databricks Spark. +The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, Databricks Spark, and BigQuery. ## Why it exists + - **Start small** – demonstrate the minimum folder structure (`seeds/`, `models/`, `profiles.yml`) needed to run `fft`. - **Engine parity** – prove that a single project can target multiple engines by swapping profiles. +- **Cloud & local** – show that the same project runs both on local engines (DuckDB/Postgres/Spark) and in a cloud warehouse (BigQuery). - **Understand outputs** – show where documentation and manifests land after a run. Use it as a sandbox before adding your own sources, macros, or Python models. @@ -3141,12 +3047,12 @@ Use it as a sandbox before adding your own sources, macros, or Python models. | Path | Purpose | |------|---------| -| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as `crm.users`. | +| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as a physical `seed_users` table in the active engine (schema/dataset depends on the profile). | | `models/staging/users_clean.ff.sql` | Normalizes emails, casts types, and tags the model for all engines. | | `models/marts/mart_users_by_domain.ff.sql` | Aggregates users per email domain and records the first/last signup dates. | -| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models (pandas for DuckDB/Postgres, PySpark for Databricks) selecting the most recent signup per domain from the staging view. 
| -| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, and `dev_databricks` profiles driven by environment variables. | -| `.env.dev_*` | Template environment files you can `source` per engine. | +| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models selecting the most recent signup per domain from the staging view:
• pandas for DuckDB/Postgres
• PySpark for Databricks
β€’ BigQuery DataFrames (BigFrames) for BigQuery. | +| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, `dev_databricks`, and `dev_bigquery` profiles driven by environment variables. | +| `.env.dev_*` | Template environment files you can `source` per engine (`.env.dev_duckdb`, `.env.dev_postgres`, `.env.dev_databricks`, `.env.dev_bigquery`). | | `Makefile` | One command (`make demo ENGINE=…`) to seed, run, document, test, and preview results. | ## Running the demo @@ -3154,27 +3060,64 @@ Use it as a sandbox before adding your own sources, macros, or Python models. 1. `cd examples/basic_demo` 2. Choose an engine and export its environment variables: ```bash + # DuckDB set -a; source .env.dev_duckdb; set +a - # swap to .env.dev_postgres or .env.dev_databricks for other engines + + # Postgres + # set -a; source .env.dev_postgres; set +a + + # Databricks Spark + # set -a; source .env.dev_databricks; set +a + + # BigQuery (choose one) + # set -a; source .env.dev_bigquery_pandas; set +a # pandas client + # set -a; source .env.dev_bigquery_bigframes; set +a # BigFrames ``` -3. Execute the full flow: + +3. Execute the full flow for the selected engine: + ```bash + # DuckDB / Postgres / Databricks make demo ENGINE=duckdb + # make demo ENGINE=postgres + # make demo ENGINE=databricks_spark + + # BigQuery (set BQ_FRAME to choose pandas vs bigframes) + # builds into ..* + # requires a GCP project, dataset, and credentials (see BigQuery setup docs) + # set profiles.yml β†’ bigquery.allow_create_dataset: true if the dataset should be auto-created + # make demo ENGINE=bigquery BQ_FRAME=bigframes + # make demo ENGINE=bigquery BQ_FRAME=pandas + ``` + + The Makefile runs `fft seed`, `fft run`, `fft dag`, and `fft test`. + + To open the rendered DAG site after a run: + + ```bash + make show ENGINE=duckdb + make show ENGINE=bigquery ``` - The Makefile runs `fft seed`, `fft run`, `fft dag`, `fft test`, and `fft show basic_demo.mart_users_by_domain`. 
To preview the Python mart, run `make show ENGINE=duckdb SHOW_MODEL=mart_latest_signup` (or swap `ENGINE` as needed). 4. Inspect artifacts: - - `.fastflowtransform/target/manifest.json` and `run_results.json` - - `site/dag/index.html` for the rendered model graph - - CLI output from `fft show` displaying the aggregated mart -The demo also enables baseline data quality checks in `project.yml`. Running `fft test` (or `make test`) verifies that primary keys remain unique/not-null across `seed_users`, `users_clean`, `mart_users_by_domain`, and the Python mart, while ensuring aggregate metrics such as `user_count` never drop below zero and each domain appears only once in `mart_latest_signup`. + * `.fastflowtransform/target/manifest.json` and `run_results.json` + * `site/dag/index.html` for the rendered model graph + * Use your engine’s client (or `fft run` logs) to inspect the mart outputs + +## Data quality tests + +The demo enables baseline data quality checks in `project.yml`. Running `fft test` (or `make test ENGINE=…`) verifies that: -## Next steps +* Primary keys remain unique/not-null across: -- Add more CSVs under `seeds/` and declare them in `sources.yml`. -- Create additional staging models so marts can reuse normalized data. -- Introduce Python models or macros mirroring how the API demo scales up. -- Update `.env.dev_*` with real credentials once you connect to shared databases. + * `seed_users` + * `users_clean` + * `mart_users_by_domain` + * the Python mart `mart_latest_signup` +* Aggregate metrics such as `user_count` never drop below zero. +* Each email domain appears only once in `mart_latest_signup`. + +These tests run against whatever engine/profile is active β€” including BigQuery, where they execute as standard SQL queries on the configured dataset. 
@@ -4432,10 +4375,10 @@ Together, these features make iterative development **fast, reliable, and reprod -````markdown -# Incremental & Delta Demo +# Incremental, Delta & Iceberg Demo + +This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres, Databricks Spark (Parquet, Delta & Iceberg), and BigQuery (pandas or BigFrames). -This example project shows how to use **incremental models** and **Delta-style merges** in FastFlowTransform across DuckDB, Postgres and Databricks Spark. It is intentionally small and self-contained so you can copy/paste patterns into your own project. @@ -4456,7 +4399,10 @@ incremental_demo/ .env .env.dev_duckdb .env.dev_postgres - .env.dev_databricks + .env.dev_databricks_delta + .env.dev_databricks_iceberg + .env.dev_bigquery_pandas + .env.dev_bigquery_bigframes Makefile profiles.yml project.yml @@ -4477,6 +4423,11 @@ incremental_demo/ fct_events_py_incremental.ff.py databricks_spark/ fct_events_py_incremental.ff.py + bigquery/ + pandas/ + fct_events_py_incremental.ff.py + bigframes/ + fct_events_py_incremental.ff.py ``` *Your actual filenames may differ slightly; the concepts are the same.* @@ -4503,7 +4454,14 @@ The demo revolves around a tiny `events` dataset and three different ways to bui * A Python model that returns a DataFrame; the executor applies incremental behaviour based on model `meta` (unique key + updated-at timestamp) and the target engine: * DuckDB / Postgres: incremental insert/merge in SQL - * Databricks Spark: `MERGE INTO` for Delta where available, with a fallback full-refresh strategy + * Databricks Spark: `MERGE INTO` for Delta or Iceberg where available (Spark 4), with a fallback full-refresh strategy for other formats + * BigQuery: pandas- or BigFrames-backed DataFrame models with incremental merge logic handled by the BigQuery executor + +4. 
**Iceberg profile for Spark 4** + + * Optional Databricks/Spark profile that uses the built-in **Iceberg catalog**. + * Seeds and models are materialized as Iceberg tables in a local warehouse directory. + * `ref()` and `source()` automatically point to the Iceberg catalog when the `databricks_spark.table_format` is set to `iceberg`. --- @@ -4667,6 +4625,7 @@ On subsequent runs, the engine evaluates the `delta.sql` snippet and: * **DuckDB / Postgres**: inserts or merges the resulting rows into the target table * **Databricks Spark**: tries a `MERGE INTO` (Delta) and falls back to a full-refresh if necessary +* **BigQuery**: applies incremental insert/merge logic in SQL via the BigQuery executor --- @@ -4740,6 +4699,8 @@ Files: models/engines/duckdb/fct_events_py_incremental.ff.py models/engines/postgres/fct_events_py_incremental.ff.py models/engines/databricks_spark/fct_events_py_incremental.ff.py +models/engines/bigquery/pandas/fct_events_py_incremental.ff.py +models/engines/bigquery/bigframes/fct_events_py_incremental.ff.py ``` Each engine variant uses the same logical signature: @@ -4789,9 +4750,9 @@ The executor uses the `meta.incremental` / `meta.unique_key` / `meta.updated_at` --- -## Delta variant (Databricks / Spark) +## Delta & Iceberg variants (Databricks / Spark) -In addition to the β€œregular” incremental models, the demo also includes a **Delta Lake variant** +In addition to the β€œregular” incremental models, the demo also includes **Delta Lake** and **Iceberg** variants that shows how to: - route a model to **Delta tables** via `project.yml` @@ -4802,7 +4763,7 @@ This is optional and only relevant for the `databricks_spark` engine. 
--- -### Storage configuration for the Delta model +### Storage configuration for the Delta / Iceberg models In `project.yml`, the Delta variant gets its own storage entry, separate from the Parquet fact table: @@ -4818,6 +4779,12 @@ models: fct_events_sql_inline_delta: path: ".local/spark_delta/fct_events_sql_inline" format: delta + + # ❄️ Iceberg-based fact table (Spark 4 / Databricks only) + fct_events_sql_inline_iceberg: + # Points into the Iceberg warehouse; must match your Iceberg catalog config + path: ".local/iceberg_warehouse/incremental_demo/fct_events_sql_inline" + format: iceberg ```` Notes: @@ -4971,9 +4938,26 @@ FFT_ACTIVE_ENV=dev_postgres fft test . \ --select tag:example:incremental_demo ``` -Packen wΓΌrde ich den Hinweis direkt an die Stelle, wo du schon beschreibst, wie man die Demo auf Databricks startet – also deine aktuelle Sektion: +### BigQuery + +```bash +# pandas +FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft seed . +FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft run . \ + --select tag:example:incremental_demo --select tag:engine:bigquery --cache rw +FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft test . \ + --select tag:example:incremental_demo + +# BigFrames +FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft seed . +FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft run . \ + --select tag:example:incremental_demo --select tag:engine:bigquery --cache rw +FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft test . \ + --select tag:example:incremental_demo +``` + +Ensure the service account credentials pointed to by `GOOGLE_APPLICATION_CREDENTIALS` can create/drop tables in the target dataset. -````markdown ### Databricks Spark ```bash @@ -5029,67 +5013,53 @@ environment variable, without touching the models or project.yml. 
Adjust environment names to match your `profiles.yml`. ---- - -## How to link this page into your docs - -### 1. MkDocs (`mkdocs.yml`) - -If you use MkDocs, place this file under e.g.: +### Databricks Spark (Iceberg / Spark 4+) -```text -docs/examples/incremental_demo.md -``` - -and add it to your `mkdocs.yml` nav: - -```yaml -nav: - - Overview: index.md - - Examples: - - API demo: examples/api_demo.md - - Incremental & Delta demo: examples/incremental_demo.md -``` - -### 2. Sphinx (`index.rst` + Markdown) - -If you use Sphinx with MyST or Markdown support, put the file under: - -```text -docs/examples/incremental_demo.md -``` - -and reference it from your main `index.rst`: +If you are on Spark 4 / Databricks with Iceberg support, you can also run the incremental demo +purely against Iceberg tables using a dedicated profile (for example `dev_databricks_iceberg`). -```rst -Welcome to FastFlowTransform's documentation! -============================================= +That profile typically: -.. toctree:: - :maxdepth: 2 +* uses `engine: databricks_spark` +* sets `databricks_spark.table_format: iceberg` +* configures an Iceberg catalog via `extra_conf`, for example: - overview - examples/api_demo - examples/incremental_demo -``` + models: + storage: + # Example warehouse location, adjust as needed + fct_events_sql_inline_iceberg: + path: ".local/iceberg_warehouse/incremental_demo/fct_events_sql_inline" + format: iceberg + +and in the profile (profiles.yml) something like: + + dev_databricks_iceberg: + engine: databricks_spark + databricks_spark: + master: "local[*]" + app_name: "incremental_demo" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + extra_conf: + spark.sql.catalog.iceberg: org.apache.iceberg.spark.SparkCatalog + spark.sql.catalog.iceberg.type: hadoop + spark.sql.catalog.iceberg.warehouse: "file:///{{ project_dir() }}/.local/iceberg_warehouse" -(Adjust paths to match your actual layout.) +From the repo root: -### 3. 
Top-level `index.md` (Markdown-only docs) + cd examples/incremental_demo -If your docs use a pure Markdown index, just add a link: +Run seeds and models against Iceberg: -```markdown -## Examples + FFT_ACTIVE_ENV=dev_databricks_iceberg fft seed . -- [API demo](examples/api_demo.md) -- [Incremental & Delta demo](examples/incremental_demo.md) -``` + FFT_ACTIVE_ENV=dev_databricks_iceberg fft run . \ + --select tag:example:incremental_demo --select tag:engine:databricks_spark -This way, the incremental demo appears alongside your existing API demo and other examples in your global documentation navigation. + FFT_ACTIVE_ENV=dev_databricks_iceberg fft test . \ + --select tag:example:incremental_demo -``` -``` +Under this profile, all `ref()` / `source()` calls in Spark SQL and Python models are resolved +against the Iceberg catalog, so seeds and incremental models operate purely on Iceberg tables. @@ -5100,7 +5070,7 @@ This way, the incremental demo appears alongside your existing API demo and othe The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local data, external APIs, and multiple execution engines. It highlights: - **Hybrid data model**: joins a local seed (`crm.users`) with live user data from JSONPlaceholder. -- **Multiple environments**: switch between DuckDB, Postgres, and Databricks Spark using `profiles.yml` + `.env.*`. +- **Multiple environments**: switch between DuckDB, Postgres, Databricks Spark, and BigQuery (pandas or BigFrames client) using `profiles.yml` + `.env.*`. - **HTTP integration**: compare the built-in FastFlowTransform HTTP client (`api_users_http`) with a plain `requests` implementation (`api_users_requests`). - **Offline caching & telemetry**: inspect HTTP snapshots via `run_results.json`. - **Engine-aware registration**: scope Python models via `engine_model` and SQL models via `config(engines=[...])` so only the active engine’s nodes load. 
@@ -5117,7 +5087,8 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local 'kind:seed-consumer', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery' ] ) }} select id, email @@ -5125,14 +5096,14 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local ``` Consumes `sources.yml → crm.users` (seeded from `seeds/seed_users.csv`). -2. **API enrichment** – two Python implementations under `models/engines/duckdb/`: +2. **API enrichment** – engine-specific Python implementations under `models/engines/<engine>/`: - `api_users_http.ff.py` uses the built-in HTTP wrapper (`fastflowtransform.api.http.get_df`) with cache/offline support. - `api_users_requests.ff.py` uses raw `requests` for maximum flexibility. - - Wrap engine-specific callables with `engine_model(only="duckdb", ...)` to skip registration when another engine is selected. + - Engine-specific callables are scoped with `engine_model(only=...)` (DuckDB/Postgres/Spark) or `engine_model(env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": ...})` (BigQuery pandas/BigFrames) to stay isolated per engine. 3. **Mart join** – `models/common/mart_users_join.ff.sql` ```sql - {{ config(engines=['duckdb','postgres','databricks_spark']) }} + {{ config(engines=['duckdb','postgres','databricks_spark','bigquery']) }} {% set api_users_model = var('api_users_model', 'api_users_http') %} {% set api_users_refs = { 'api_users_http': ref('api_users_http'), @@ -5166,14 +5137,28 @@ dev_postgres: postgres: dsn: "{{ env('FF_PG_DSN') }}" db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'api_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true ``` `.env.dev_*` files supply the actual values.
`_load_dotenv_layered()` loads them in priority order: repo `.env` → project `.env` → `.env.<env>` → shell overrides (highest priority). Secrets stay out of version control. +### BigQuery specifics + +- Set `ENGINE=bigquery` in the Makefile targets and choose a client via `BQ_FRAME=pandas` or `BQ_FRAME=bigframes` (default). +- Required env vars: `FF_BQ_PROJECT`, `FF_BQ_DATASET` (defaults to `api_demo`), and optionally `FF_BQ_LOCATION`. Uncomment `allow_create_dataset` in `profiles.yml` for first-run convenience. +- BigFrames variants ingest the HTTP payload into a pandas DataFrame, then wrap it as a BigFrames DataFrame (FFT’s `get_df(..., output="bigframes")` performs the same conversion and requires the `bigframes` package). + ## Makefile Workflow -`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`) and wraps the main commands: +`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`/`bigquery`) and wraps the main commands. For BigQuery, set `BQ_FRAME=pandas|bigframes`: ```make ENGINE ?= duckdb @@ -5182,6 +5167,14 @@ ifeq ($(ENGINE),duckdb) PROFILE_ENV = dev_duckdb endif ... +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif seed: uv run fft seed "$(PROJECT)" --env $(PROFILE_ENV) @@ -5195,6 +5188,7 @@ Common targets: |--------------------------|-------------| | `make ENGINE=duckdb seed`| Materialize seeds into DuckDB. | | `make ENGINE=postgres run`| Execute the full pipeline against Postgres. | +| `make ENGINE=bigquery run BQ_FRAME=bigframes`| Run against BigQuery (default BigFrames client; set `BQ_FRAME=pandas` to switch). | | `make dag` | Render documentation (`site/dag/`). | | `make api-run` | Run only API models (uses HTTP cache). | | `make api-offline` | Force offline mode (`FF_HTTP_OFFLINE=1`).
| @@ -5204,7 +5198,7 @@ HTTP tuning parameters (`FF_HTTP_ALLOWED_DOMAINS`, cache dir, timeouts) live in ## End-to-End Demo -1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres` or `ENGINE=databricks_spark` to switch. +1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres`, `ENGINE=databricks_spark`, or `ENGINE=bigquery BQ_FRAME=<pandas|bigframes>` to switch. 2. **Seed data**: `make seed` 3. **Run pipeline**: `make run` 4. **Explore docs**: `make dag` → open `examples/api_demo/site/dag/index.html` @@ -5245,6 +5239,188 @@ This example demonstrates multi-engine configuration, environment-driven secrets - Ensure your shell loads `.env.dev_databricks` (via `make`, `direnv`, or manual export) and run `make ENGINE=databricks_spark seed run`. +The following section explains how to set up BigQuery in Google Cloud for the FastFlowTransform examples. + +It extends the BigQuery notes above with one-time GCP setup, local configuration, run commands, and common pitfalls. + +### BigQuery + +#### 1. One-time setup in Google Cloud + +You only need to do this once per project / environment. + +1. **Create (or pick) a GCP project** + + - Go to the *Google Cloud Console* → **IAM & Admin → Create project**. + - Give it a name, e.g. `FFT Basic Demo`, and note the **Project ID**, e.g. `fft-basic-demo`. + - All further steps refer to this project id. + +2. **Enable the BigQuery API** + + - In the console, go to **APIs & Services → Library**. + - Search for **“BigQuery API”** and click **Enable**. + - (Optional but recommended) Also enable **BigQuery Storage API** for faster reads. + +3. **Create a BigQuery dataset** + + - Go to **BigQuery** in the console (left sidebar). + - Make sure your project `fft-basic-demo` is selected. + - Click **“+ Create dataset”**: + - **Dataset ID**: e.g. `basic_demo` + - **Location type**: choose a **multi-region**, e.g.: + - `EU` or `US` + - Click **Create dataset**.
+ + ⚠️ **Important:** The dataset **location must match** the location you use in your env (`FF_BQ_LOCATION`). + - If your dataset is in `EU` (multi-region), then `FF_BQ_LOCATION=EU`. + - If the dataset is in a single region like `europe-west3`, use that exact region name. + +4. **Create a service account (for CI / non-interactive use)** + + For local dev you can use your own user credentials (see below), but for CI/CD or shared environments + a service account is better. + + - Go to **IAM & Admin → Service Accounts → Create service account**. + - Name it e.g. `fft-runner`. + - On the **Roles** step, add roles with BigQuery write access, for example: + - `BigQuery Job User` + - `BigQuery Data Editor` + - (Optionally) Restrict to dataset level later if you want stricter permissions. + + Then create a key: + + - Click your service account → **Keys → Add key → Create new key**. + - Select **JSON**, download the file, and store it somewhere safe (e.g. `~/.config/gcloud/fft-sa.json`). + +5. **Authentication options** + + You have two ways to authenticate locally: + + **A) Application Default Credentials via gcloud (easy for dev)** + + ```bash + gcloud auth application-default login + ``` + +This opens a browser, you log in, and Google stores your ADC in +`~/.config/gcloud/application_default_credentials.json`. + +The BigQuery client in `fastflowtransform` will pick this up automatically **as long as** +`FF_BQ_PROJECT` points to a project you have access to. + +**B) Service account key (good for CI)** + +* Put the downloaded JSON key (from step 4) somewhere on disk. + +* Set the environment variable before running `fft`: + + ```bash + export GOOGLE_APPLICATION_CREDENTIALS=/path/to/fft-sa.json + ``` + +* Make sure the service account has at least: + + * `BigQuery Job User` + * `BigQuery Data Editor` + +* Optionally grant `BigQuery Data Viewer` if you’re only reading some tables. + +--- + +#### 2. Local configuration (env + profiles) + +1.
**Environment file (`.env.dev_bigquery`)** + + ```env + # BigQuery connection + FF_BQ_PROJECT=fft-basic-demo # your GCP project id + FF_BQ_DATASET=basic_demo # dataset from step 3 + FF_BQ_LOCATION=EU # or europe-west3, US, etc. MUST match dataset location + + # Active fft environment name (must match profiles.yml) + FFT_ACTIVE_ENV=dev_bigquery + ``` + + Load this via `direnv`, `make`, or manual `export`. + +2. **profiles.yml** + + ```yaml + dev_bigquery: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET') }}" + location: "{{ env('FF_BQ_LOCATION') }}" + use_bigframes: true # Python models use BigQuery DataFrames (BigFrames) + ``` + +--- + +#### 3. Running seeds, models, and tests + +* **Seed BigQuery from `seeds/`:** + + ```bash + make ENGINE=bigquery seed + ``` + + This writes all `seeds/*.csv|parquet` to tables under + `${FF_BQ_PROJECT}.${FF_BQ_DATASET}.*`. + +* **Build models:** + + ```bash + make ENGINE=bigquery run + ``` + + * SQL models are executed as BigQuery queries. + * Python models with `only="bigquery"` run via `BigQueryBFExecutor` (BigQuery DataFrames) + and are written back into the same dataset. + +* **Run data-quality tests:** + + ```bash + make ENGINE=bigquery test + ``` + + `fft test` uses the BigQuery shim to run checks like `not_null`, `unique`, + `row_count_between`, `greater_equal`, etc. against + `${FF_BQ_PROJECT}.${FF_BQ_DATASET}.<table>`. + +--- + +#### 4. Common BigQuery gotchas + +* **Location mismatch** + + * Error like `Location basic_demo does not support this operation` or `Not found: Dataset ...`: + + * Check the **dataset location** in the BigQuery UI. + * Make sure `FF_BQ_LOCATION` is exactly that value (`EU`, `US`, `europe-west3`, …). + * Ensure the executor is initialized with the same location (via `profiles.yml` → `location`). + +* **Permission issues** + + * If you see `accessDenied` or `Permission denied`: + + * Confirm you authenticated (ADC or service account).
+ * Ensure your user / service account has at least: + + * `BigQuery Job User` + * `BigQuery Data Editor` on the project or dataset. + +* **Dataset not found** + + * Error `Not found: Dataset fft-basic-demo:basic_demo`: + + * Check that the dataset id matches exactly: + + * Project: `fft-basic-demo` + * Dataset: `basic_demo` + * Verify it exists and is in the same project you set in `FF_BQ_PROJECT`. + + diff --git a/pyproject.toml b/pyproject.toml index ff3cce9..dd39980 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,22 +38,49 @@ dependencies = [ "duckdb>=1.0", "pandas>=2.0", "pyyaml>=6.0", - "google-cloud-bigquery>=3.25", "sqlalchemy>=2.0", - "psycopg[binary]>=3.1", - "psycopg2-binary>=2.9", "pydantic>=2.8", "pydantic-settings>=2.4", "python-dotenv>=1.0", + "httpx>=0.28.1", +] + +[project.optional-dependencies] +postgres = [ + "psycopg[binary]>=3.1", + "psycopg2-binary>=2.9", +] + +bigquery = [ + "google-cloud-bigquery>=3.25", +] + +bigquery_bf = [ + "bigframes>=2.24.0", + "google-cloud-bigquery>=3.25", +] + +spark = [ "pyspark>=4.0.1", + "delta-spark>=4.0.0", +] + +snowflake = [ "snowflake>=1.8.0", "snowflake-snowpark-python>=1.40.0", +] + +full = [ + "psycopg[binary]>=3.1", + "psycopg2-binary>=2.9", + "google-cloud-bigquery>=3.25", "bigframes>=2.24.0", - "httpx>=0.28.1", + "pyspark>=4.0.1", "delta-spark>=4.0.0", + "snowflake>=1.8.0", + "snowflake-snowpark-python>=1.40.0", ] -[project.optional-dependencies] dev = [ "pytest==8.4.*", "pytest-cov==7.0.*", diff --git a/src/fastflowtransform/api/http.py b/src/fastflowtransform/api/http.py index 7f51bea..9f53773 100644 --- a/src/fastflowtransform/api/http.py +++ b/src/fastflowtransform/api/http.py @@ -304,7 +304,7 @@ def get_df( Controls the returned frame type. "pandas" (default) yields a pandas DataFrame. "spark" materialises a pyspark.sql.DataFrame using the provided session (or an active/builder session). - "bigframes" is reserved for future integration and currently raises NotImplementedError. 
+ "bigframes" returns a BigFrames DataFrame (requires `bigframes`). session : Any | None Optional backend handle. For Spark, pass a SparkSession; otherwise the active session or a new one is used. @@ -374,10 +374,13 @@ def _finalize(pdf: pd.DataFrame) -> Any: spark = SparkSession.builder.getOrCreate() return spark.createDataFrame(pdf) if mode == "bigframes": - raise NotImplementedError( - "get_df(..., output='bigframes') is not implemented yet. " - "Open an issue if you need this backend." - ) + try: + import bigframes.pandas as bpd # noqa: PLC0415 + except Exception as exc: # pragma: no cover - bigframes optional dependency + raise RuntimeError( + "get_df(..., output='bigframes') requires the 'bigframes' package." + ) from exc + return bpd.DataFrame(pdf) raise ValueError( f"Unsupported output backend '{output}' (expected pandas|spark|bigframes)." ) diff --git a/src/fastflowtransform/cli/bootstrap.py b/src/fastflowtransform/cli/bootstrap.py index 835e2e4..5a641b5 100644 --- a/src/fastflowtransform/cli/bootstrap.py +++ b/src/fastflowtransform/cli/bootstrap.py @@ -1,6 +1,7 @@ # fastflowtransform/cli/bootstrap.py from __future__ import annotations +import importlib import os from collections.abc import Callable from dataclasses import dataclass @@ -14,14 +15,6 @@ from fastflowtransform.core import REGISTRY from fastflowtransform.errors import DependencyNotFoundError -from fastflowtransform.executors import ( - BigQueryBFExecutor, - BigQueryExecutor, - DatabricksSparkExecutor, - DuckExecutor, - PostgresExecutor, - SnowflakeSnowparkExecutor, -) from fastflowtransform.executors._shims import BigQueryConnShim, SAConnShim from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.logging import echo @@ -75,6 +68,19 @@ def _die(msg: str, code: int = 1) -> NoReturn: raise typer.Exit(code) +def _import_optional(module_path: str, attr: str, *, extra: str | None = None) -> Any: + try: + module = importlib.import_module(module_path) + return getattr(module, 
attr) + except ModuleNotFoundError as exc: # pragma: no cover - import guard + if extra: + _die( + f"Optional dependency for '{attr}' not found ({exc.name}).\n" + f"Install it with `pip install fastflowtransform[{extra}]`." + ) + raise + + def _load_project_and_env(project_arg: str) -> tuple[Path, Environment]: proj = _resolve_project_path(project_arg) try: @@ -285,6 +291,9 @@ def _get_test_con(executor: Any) -> Any: def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Callable]: ex: BaseExecutor if prof.engine == "duckdb": + DuckExecutor = _import_optional( + "fastflowtransform.executors.duckdb", "DuckExecutor", extra=None + ) ex = DuckExecutor( db_path=prof.duckdb.path, schema=getattr(prof.duckdb, "db_schema", None), @@ -296,6 +305,9 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal if prof.postgres.dsn is None: raise RuntimeError("Postgres DSN must be set") + PostgresExecutor = _import_optional( + "fastflowtransform.executors.postgres", "PostgresExecutor", extra="postgres" + ) ex = PostgresExecutor(dsn=prof.postgres.dsn, schema=prof.postgres.db_schema) return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python @@ -304,20 +316,37 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal raise RuntimeError("BigQuery dataset must be set") if prof.bigquery.use_bigframes: + BigQueryBFExecutor = _import_optional( + "fastflowtransform.executors.bigquery.bigframes", + "BigQueryBFExecutor", + extra="bigquery_bf", + ) ex = BigQueryBFExecutor( project=prof.bigquery.project or "", dataset=prof.bigquery.dataset, location=prof.bigquery.location, + allow_create_dataset=prof.bigquery.allow_create_dataset, ) else: + BigQueryExecutor = _import_optional( + "fastflowtransform.executors.bigquery.pandas", + "BigQueryExecutor", + extra="bigquery", + ) ex = BigQueryExecutor( project=prof.bigquery.project or "", dataset=prof.bigquery.dataset, location=prof.bigquery.location, + 
allow_create_dataset=prof.bigquery.allow_create_dataset, ) return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python if prof.engine == "databricks_spark": + DatabricksSparkExecutor = _import_optional( + "fastflowtransform.executors.databricks_spark", + "DatabricksSparkExecutor", + extra="spark", + ) ex = DatabricksSparkExecutor( master=prof.databricks_spark.master, app_name=prof.databricks_spark.app_name, @@ -342,6 +371,11 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal } if prof.snowflake_snowpark.role: cfg["role"] = prof.snowflake_snowpark.role + SnowflakeSnowparkExecutor = _import_optional( + "fastflowtransform.executors.snowflake_snowpark", + "SnowflakeSnowparkExecutor", + extra="snowflake", + ) ex = SnowflakeSnowparkExecutor(cfg) return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python diff --git a/src/fastflowtransform/cli/options.py b/src/fastflowtransform/cli/options.py index 2e6dde4..14145f9 100644 --- a/src/fastflowtransform/cli/options.py +++ b/src/fastflowtransform/cli/options.py @@ -113,7 +113,7 @@ class HttpCacheMode(str, Enum): HttpCacheMode | None, typer.Option( "--http-cache", - help="HTTP-Cache-Modus fΓΌr API-Modelle: off | ro | rw.", + help="HTTP-Cache-Mode for API models: off | ro | rw.", case_sensitive=False, ), ] diff --git a/src/fastflowtransform/cli/run.py b/src/fastflowtransform/cli/run.py index ab234b4..0429d49 100644 --- a/src/fastflowtransform/cli/run.py +++ b/src/fastflowtransform/cli/run.py @@ -209,9 +209,10 @@ def format_run_label(self, name: str) -> str: except Exception: fmt = None - # For database engines (DuckDB/Postgres), we do not show a format suffix - # at all to avoid misleading '[parquet]' labels. - if engine in {"duckdb", "postgres", "postgresql"}: + # For database engines (DuckDB/Postgres/BigQuery), we do not show a format suffix + # at all to avoid misleading '[parquet]' labels (these engines don't expose + # a user-selectable table file format in FFT). 
+ if engine in {"duckdb", "postgres", "postgresql", "bigquery"}: fmt_suffix = "" else: fmt_suffix = f" [{fmt}]" if fmt else "" diff --git a/src/fastflowtransform/config/project.py b/src/fastflowtransform/config/project.py index d9ff1fc..f12a710 100644 --- a/src/fastflowtransform/config/project.py +++ b/src/fastflowtransform/config/project.py @@ -1,3 +1,4 @@ +# fastflowtransform/config/project.py from __future__ import annotations from collections.abc import Sequence diff --git a/src/fastflowtransform/config/seeds.py b/src/fastflowtransform/config/seeds.py index 437b540..a36a9e4 100644 --- a/src/fastflowtransform/config/seeds.py +++ b/src/fastflowtransform/config/seeds.py @@ -105,7 +105,7 @@ def _normalize_dtypes(cls, value: dict[str, dict[str, Any]]) -> dict[str, dict[s return out -def load_seeds_schema(project_dir: Path) -> SeedsSchemaConfig | None: +def load_seeds_schema(project_dir: Path, seeds_dir: Path | None = None) -> SeedsSchemaConfig | None: """ Load and validate seeds/schema.yml for a given project. @@ -116,7 +116,7 @@ def load_seeds_schema(project_dir: Path) -> SeedsSchemaConfig | None: Raises: ValueError: when YAML is present but does not match the expected schema. 
""" - seeds_dir = project_dir / "seeds" + seeds_dir = seeds_dir or project_dir / "seeds" cfg_path = seeds_dir / "schema.yml" if not cfg_path.exists(): return None diff --git a/src/fastflowtransform/core.py b/src/fastflowtransform/core.py index a2612af..b2c9d79 100644 --- a/src/fastflowtransform/core.py +++ b/src/fastflowtransform/core.py @@ -806,11 +806,9 @@ def _iter_public_attrs(obj: object) -> Iterable[tuple[str, object]]: def _is_jinja_macro(obj: object) -> bool: if obj is None: return False - # 1) Klassenname-Match (funktioniert ohne direkten Import) cls = getattr(obj, "__class__", None) if getattr(cls, "__name__", "") == "Macro": return True - # 2) isinstance gegen jinja2.runtime.Macro (falls vorhanden) MacroClass = getattr(jinja2.runtime, "Macro", None) if MacroClass is not None: with suppress(Exception, TypeError): diff --git a/src/fastflowtransform/dag.py b/src/fastflowtransform/dag.py index b6ff558..fc06819 100644 --- a/src/fastflowtransform/dag.py +++ b/src/fastflowtransform/dag.py @@ -92,7 +92,6 @@ def _mm_id(name: str) -> str: def _quote_label(s: str) -> str: - # Nur fΓΌr Mermaid-Label:
ist ok (mit securityLevel 'loose') s = s.replace("\\", "\\\\").replace('"', '\\"') return f'"{s}"' diff --git a/src/fastflowtransform/decorators.py b/src/fastflowtransform/decorators.py index 1f35f54..e0f62ad 100644 --- a/src/fastflowtransform/decorators.py +++ b/src/fastflowtransform/decorators.py @@ -55,6 +55,7 @@ def model( name: str | None = None, deps: Sequence[str] | None = None, require: Any | None = None, + requires: Any | None = None, *, tags: Sequence[str] | None = None, kind: str = "python", @@ -67,7 +68,8 @@ def model( Args: name: Logical node name in the DAG (defaults to function name). deps: Upstream node names (e.g., ['users.ff']). - require: + require: Required columns per dependency; accepted shapes mirror `requires`. + requires: Alias for `require` (only one of require/requires may be set). - Single dependency: Iterable[str] of required columns from that dependency. - Multiple dependencies: Mapping[dep_name, Iterable[str]] (dep_name = logical name or physical relation). @@ -76,6 +78,11 @@ def model( materialized: Shorthand for meta['materialized']; mirrors config(materialized='...'). meta: Arbitrary metadata for executors/docs (merged with materialized if provided). 
""" + # Normalize the alias: allow only one of require/requires + if require is not None and requires is not None: + raise TypeError("Pass at most one of 'require' or 'requires', not both") + + effective_require = require if require is not None else requires def deco(func: Callable[P, R_co]) -> HasFFMeta[P, R_co]: f_any = cast(Any, func) @@ -88,7 +95,7 @@ def deco(func: Callable[P, R_co]) -> HasFFMeta[P, R_co]: f_any.__ff_deps__ = fdeps # Normalize require and mirror it on the function and inside the registry - req_norm = _normalize_require(fdeps, require) + req_norm = _normalize_require(fdeps, effective_require) f_any.__ff_require__ = req_norm # useful for tooling/loaders REGISTRY.py_requires[fname] = req_norm # executors read this directly @@ -120,14 +127,52 @@ def deco(func: Callable[P, R_co]) -> HasFFMeta[P, R_co]: def engine_model( - *, only: str | tuple[str, ...], **model_kwargs: Any -) -> Callable[[Callable[P, R_co]], HasFFMeta[P, R_co]]: - allowed = {only} if isinstance(only, str) else {e.lower() for e in only} + *, + only: str | Iterable[str] | None = None, + env_match: Mapping[str, str] | None = None, + **model_kwargs: Any, +) -> Callable[[Callable[P, R_co]], HasFFMeta[P, R_co] | Callable[P, R_co]]: + """ + Env-aware decorator to register a Python model only when the current + environment matches. + + Args: + only: + Backwards compatible engine filter based on FF_ENGINE + (e.g. only="bigquery" or only=("duckdb", "postgres")). 
+ env_match: + Arbitrary environment match, e.g.: + env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": "bigframes"} + """ - def deco(fn): - current = os.getenv("FF_ENGINE", "").lower() - if current in allowed: + # Normalize "only" β†’ allowed engine names (lowercased) + allowed_engines: set[str] | None = None + if only is not None: + if isinstance(only, str): + allowed_engines = {only.lower()} + else: + allowed_engines = {str(e).lower() for e in only} + + def should_register() -> bool: + # 1) Check env_match if provided + if env_match: + for key, expected in env_match.items(): + if os.getenv(key) != expected: + return False + + # 2) Check FF_ENGINE against "only" if provided + if allowed_engines is not None: + current = os.getenv("FF_ENGINE", "").lower() + if current not in allowed_engines: + return False + + return True + + def deco(fn: Callable[P, R_co]) -> HasFFMeta[P, R_co] | Callable[P, R_co]: + if should_register(): + # Register in REGISTRY and attach __ff_* metadata return model(**model_kwargs)(fn) - return fn # stays undecorated β†’ no registry entry + # No registration in this env β†’ return the plain function + return fn return deco diff --git a/src/fastflowtransform/executors/__init__.py b/src/fastflowtransform/executors/__init__.py index f592235..44b1bd3 100644 --- a/src/fastflowtransform/executors/__init__.py +++ b/src/fastflowtransform/executors/__init__.py @@ -1,16 +1,55 @@ # src/fastflowtransform/executors/__init__.py -from .bigquery_bf_exec import BigQueryBFExecutor -from .bigquery_exec import BigQueryExecutor -from .databricks_spark_exec import DatabricksSparkExecutor -from .duckdb_exec import DuckExecutor -from .postgres_exec import PostgresExecutor -from .snowflake_snowpark_exec import SnowflakeSnowparkExecutor - -__all__ = [ - "BigQueryBFExecutor", - "BigQueryExecutor", - "DatabricksSparkExecutor", - "DuckExecutor", - "PostgresExecutor", - "SnowflakeSnowparkExecutor", -] +from __future__ import annotations + +import importlib +from typing 
import Any + +_EXECUTORS: dict[str, tuple[str, str, str | None]] = { + # name: (module path, attribute, extra) + "DuckExecutor": ("fastflowtransform.executors.duckdb", "DuckExecutor", None), + "PostgresExecutor": ( + "fastflowtransform.executors.postgres", + "PostgresExecutor", + "postgres", + ), + "DatabricksSparkExecutor": ( + "fastflowtransform.executors.databricks_spark", + "DatabricksSparkExecutor", + "spark", + ), + "SnowflakeSnowparkExecutor": ( + "fastflowtransform.executors.snowflake_snowpark", + "SnowflakeSnowparkExecutor", + "snowflake", + ), +} + +__all__: list[str] = list(_EXECUTORS.keys()) # pyright: ignore[reportUnsupportedDunderAll] + + +def _load_executor(name: str) -> Any: + module_path, attr, extra = _EXECUTORS[name] + try: + module = importlib.import_module(module_path) + except ModuleNotFoundError as exc: # pragma: no cover - import guard + if ( + extra + and exc.name + and exc.name.split(".")[0] in {extra, "psycopg", "pyspark", "snowflake"} + ): + raise ImportError( + f"{name} requires the optional dependency set '{extra}'. " + f"Install it with `pip install fastflowtransform[{extra}]`." 
+ ) from exc + raise + return getattr(module, attr) + + +def __getattr__(name: str) -> Any: # pragma: no cover - import guard + if name in _EXECUTORS: + return _load_executor(name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: # pragma: no cover - import guard + return sorted(list(globals().keys()) + list(_EXECUTORS.keys())) diff --git a/src/fastflowtransform/executors/_shims.py b/src/fastflowtransform/executors/_shims.py index 1cc0e28..f0a0d84 100644 --- a/src/fastflowtransform/executors/_shims.py +++ b/src/fastflowtransform/executors/_shims.py @@ -1,14 +1,16 @@ +# fastflowtransform/executors/_shims.py from __future__ import annotations import re from collections.abc import Iterable, Sequence from typing import Any -from google.cloud.bigquery import Client from sqlalchemy import text from sqlalchemy.engine import Engine from sqlalchemy.sql.elements import ClauseElement +from fastflowtransform.typing import Client + class BigQueryConnShim: """ @@ -30,17 +32,37 @@ def __init__( self.project = project self.dataset = dataset + class _ResultWrapper: + """ + Minimal wrapper around a BigQuery RowIterator so that testing helpers + can call .fetchone() like on a DB-API cursor. + """ + + def __init__(self, row_iter: Any): + self._iter = iter(row_iter) + + def fetchone(self): + try: + return next(self._iter) + except StopIteration: + return None + def execute(self, sql_or_stmts: Any) -> Any: if isinstance(sql_or_stmts, str): - return self.client.query(sql_or_stmts, location=self.location) + # Execute the query and return a cursor-like wrapper with .fetchone() + job = self.client.query(sql_or_stmts) + rows = job.result() + return BigQueryConnShim._ResultWrapper(rows) + if isinstance(sql_or_stmts, Sequence) and not isinstance( sql_or_stmts, (bytes, bytearray, str) ): - job = None + # Execute a sequence of statements; return wrapper for the last result. 
+ last_rows: Any = None for stmt in sql_or_stmts: - job = self.client.query(str(stmt), location=self.location) - job.result() - return job + job = self.client.query(str(stmt)) + last_rows = job.result() + return BigQueryConnShim._ResultWrapper(last_rows or []) raise TypeError(f"Unsupported sql argument type for BigQuery shim: {type(sql_or_stmts)}") diff --git a/src/fastflowtransform/executors/base.py b/src/fastflowtransform/executors/base.py index c85bc36..7ef7672 100644 --- a/src/fastflowtransform/executors/base.py +++ b/src/fastflowtransform/executors/base.py @@ -86,14 +86,14 @@ def _load_callable(path: str) -> Callable[..., Any]: class _ThisProxy: """ - Jinja-kompatibler Proxy fΓΌr {{ this }}: - - Als String verwendbar ({{ this }}) -> physischer Relationsname. - - Attribute verfΓΌgbar ({{ this.name }}, {{ this.materialized }}, ...) + Jinja compatible proxy for {{ this }}: + - Use as string ({{ this }}) -> physical relation name. + - attributes available ({{ this.name }}, {{ this.materialized }}, ...) """ def __init__(self, relation: str, materialized: str, schema: str | None, database: str | None): self.name = relation # Back-compat: {{ this.name }} - self.relation = relation # Alias, falls jemand {{ this.relation }} nutzt + self.relation = relation # alias, if someone uses {{ this.relation }} self.materialized = materialized self.schema = schema self.database = database @@ -138,9 +138,9 @@ def get_render_cfg() -> dict[str, Any]: return cfg def _config_hook(**kwargs: Any) -> str: - cfg = get_render_cfg() # garantiert ein Dict - cfg.update(kwargs) # gleiche Referenz, kein erneutes set() nΓΆtig - return "" # nichts in SQL emittieren + cfg = get_render_cfg() + cfg.update(kwargs) + return "" if "config" not in env.globals: env.globals["config"] = _config_hook @@ -479,7 +479,7 @@ def _execute_python_func( raw = func(*args) if not self._is_frame(raw): raise TypeError( - f"Python-Modell '{node.name}' muss {self._frame_name()} DataFrame zurΓΌckgeben." 
+ f"Python model '{node.name}' must return {self._frame_name()} DataFrame." ) return cast(TFrame, raw) diff --git a/src/fastflowtransform/executors/bigquery/__init__.py b/src/fastflowtransform/executors/bigquery/__init__.py new file mode 100644 index 0000000..4f128fa --- /dev/null +++ b/src/fastflowtransform/executors/bigquery/__init__.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import importlib +from typing import Any + +_EXECUTORS: dict[str, tuple[str, str, str]] = { + "BigQueryBaseExecutor": ( + "fastflowtransform.executors.bigquery.base", + "BigQueryBaseExecutor", + "bigquery", + ), + "BigQueryExecutor": ( + "fastflowtransform.executors.bigquery.pandas", + "BigQueryExecutor", + "bigquery", + ), + "BigQueryBFExecutor": ( + "fastflowtransform.executors.bigquery.bigframes", + "BigQueryBFExecutor", + "bigquery_bf", + ), +} + +__all__: list[str] = list(_EXECUTORS.keys()) # pyright: ignore[reportUnsupportedDunderAll] + + +def _load_executor(name: str) -> Any: + module_path, attr, extra = _EXECUTORS[name] + try: + module = importlib.import_module(module_path) + except ModuleNotFoundError as exc: # pragma: no cover - import guard + if exc.name and exc.name.split(".")[0] in {"google", "bigframes"}: + raise ImportError( + f"{name} requires the optional dependency set '{extra}'. " + f"Install it with `pip install fastflowtransform[{extra}]`." 
+ ) from exc + raise + return getattr(module, attr) + + +def __getattr__(name: str) -> Any: # pragma: no cover - import guard + if name in _EXECUTORS: + return _load_executor(name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: # pragma: no cover - import guard + return sorted(list(globals().keys()) + list(_EXECUTORS.keys())) diff --git a/src/fastflowtransform/executors/_bigquery_mixin.py b/src/fastflowtransform/executors/bigquery/_bigquery_mixin.py similarity index 67% rename from src/fastflowtransform/executors/_bigquery_mixin.py rename to src/fastflowtransform/executors/bigquery/_bigquery_mixin.py index d43a056..3da7441 100644 --- a/src/fastflowtransform/executors/_bigquery_mixin.py +++ b/src/fastflowtransform/executors/bigquery/_bigquery_mixin.py @@ -1,8 +1,7 @@ -# Shared helpers for BigQuery-based executors. +# fastflowtransform/executors/_bigquery_mixin.py from __future__ import annotations -from google.api_core.exceptions import NotFound -from google.cloud import bigquery +from fastflowtransform.typing import NotFound, bigquery class BigQueryIdentifierMixin: @@ -30,5 +29,12 @@ def _ensure_dataset(self) -> None: ds_id = f"{self.project}.{self.dataset}" try: self.client.get_dataset(ds_id) + return except NotFound: - self.client.create_dataset(bigquery.Dataset(ds_id)) + if not getattr(self, "allow_create_dataset", False): + raise + + ds_obj = bigquery.Dataset(ds_id) + if getattr(self, "location", None): + ds_obj.location = self.location # type: ignore[attr-defined] + self.client.create_dataset(ds_obj, exists_ok=True) diff --git a/src/fastflowtransform/executors/bigquery_exec.py b/src/fastflowtransform/executors/bigquery/base.py similarity index 61% rename from src/fastflowtransform/executors/bigquery_exec.py rename to src/fastflowtransform/executors/bigquery/base.py index 57a058d..cfb630e 100644 --- a/src/fastflowtransform/executors/bigquery_exec.py +++ b/src/fastflowtransform/executors/bigquery/base.py 
@@ -1,99 +1,84 @@ -# src/fastflowtransform/executors/bigquery_exec.py +# fastflowtransform/executors/bigquery/base.py from __future__ import annotations -from collections.abc import Iterable -from typing import Any - -import pandas as pd -from google.api_core.exceptions import BadRequest, NotFound -from google.cloud import bigquery -from google.cloud.bigquery import Client, LoadJobConfig +from typing import Any, TypeVar from fastflowtransform.core import Node, relation_for -from fastflowtransform.executors._bigquery_mixin import BigQueryIdentifierMixin from fastflowtransform.executors._shims import BigQueryConnShim from fastflowtransform.executors.base import BaseExecutor +from fastflowtransform.executors.bigquery._bigquery_mixin import BigQueryIdentifierMixin from fastflowtransform.meta import ensure_meta_table, upsert_meta +from fastflowtransform.typing import BadRequest, Client, NotFound, bigquery + +TFrame = TypeVar("TFrame") -# ---- Executor -------------------------------------------------------------- -class BigQueryExecutor(BigQueryIdentifierMixin, BaseExecutor[pd.DataFrame]): - ENGINE_NAME = "bigquery" +class BigQueryBaseExecutor(BigQueryIdentifierMixin, BaseExecutor[TFrame]): """ - BigQuery executor (pandas DataFrames). - ENV/Profiles typically use: - - FF_BQ_PROJECT - - FF_BQ_DATASET - - FF_BQ_LOCATION (optional) + Shared BigQuery executor logic (SQL, incremental, meta, DQ helpers). + + Subclasses are responsible for: + - frame type (pandas / BigFrames / ...) + - _read_relation() + - _materialize_relation() + - _is_frame() + - _frame_name() """ + # Subclasses override ENGINE_NAME ("bigquery", "bigquery_batch", ...) 
+ ENGINE_NAME = "bigquery_base" + def __init__( self, project: str, dataset: str, location: str | None = None, client: Client | None = None, + allow_create_dataset: bool = False, ): self.project = project self.dataset = dataset self.location = location + self.allow_create_dataset = allow_create_dataset self.client: Client = client or bigquery.Client( - project=self.project, location=self.location + project=self.project, + location=self.location, ) # Testing-API: con.execute(...) self.con = BigQueryConnShim( - self.client, location=self.location, project=self.project, dataset=self.dataset - ) - - # ---------- Helpers ---------- - # ---------- Python (Frames) ---------- - def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: - q = f"SELECT * FROM {self._qualified_identifier(relation)}" - try: - job = self.client.query(q, location=self.location) - return job.result().to_dataframe(create_bqstorage_client=True) - except NotFound as e: - # list existing tables to aid debugging - tables = list(self.client.list_tables(f"{self.project}.{self.dataset}")) - existing = [t.table_id for t in tables] - raise RuntimeError( - f"Dependency table not found: {self.project}.{self.dataset}.{relation}\n" - f"Deps: {list(deps)}\nExisting in dataset: {existing}\n" - "Hinweis: Seeds/Upstream-Modelle erzeugt? DATASET korrekt?" - ) from e - - def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: - self._ensure_dataset() - table_id = f"{self.project}.{self.dataset}.{relation}" - job_config = LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) - # Optionally extend dtype mapping here (NUMERIC/STRING etc.) 
- try: - job = self.client.load_table_from_dataframe( - df, table_id, job_config=job_config, location=self.location - ) - job.result() - except BadRequest as e: - raise RuntimeError(f"BigQuery write failed: {table_id}\n{e}") from e - - def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None: - view_id = self._qualified_identifier(view_name) - back_id = self._qualified_identifier(backing_table) - self._ensure_dataset() - job = self.client.query( - f"CREATE OR REPLACE VIEW {view_id} AS SELECT * FROM {back_id}", + self.client, location=self.location, + project=self.project, + dataset=self.dataset, ) - job.result() - def _frame_name(self) -> str: - return "pandas" + # ---- DQ test table formatting (fft test) ---- + def _format_test_table(self, table: str | None) -> str | None: + """ + Ensure tests use fully-qualified BigQuery identifiers in fft test. + """ + table = super()._format_test_table(table) + if not isinstance(table, str) or not table.strip(): + return table + return self._qualified_identifier(table.strip()) # ---- SQL hooks ---- def _format_relation_for_ref(self, name: str) -> str: return self._qualified_identifier(relation_for(name)) + def _this_identifier(self, node: Node) -> str: + """ + Ensure {{ this }} renders as a fully-qualified identifier so BigQuery + incremental SQL (e.g., subqueries against {{ this }}) includes project + and dataset. 
+ """ + return self._qualified_identifier(relation_for(node.name)) + def _format_source_reference( - self, cfg: dict[str, Any], source_name: str, table_name: str + self, + cfg: dict[str, Any], + source_name: str, + table_name: str, ) -> str: if cfg.get("location"): raise NotImplementedError("BigQuery executor does not support path-based sources.") @@ -107,7 +92,11 @@ def _format_source_reference( return self._qualified_identifier(ident, project=proj, dataset=dset) def _apply_sql_materialization( - self, node: Node, target_sql: str, select_body: str, materialization: str + self, + node: Node, + target_sql: str, + select_body: str, + materialization: str, ) -> None: self._ensure_dataset() try: @@ -118,42 +107,45 @@ def _apply_sql_materialization( ) from e def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None: - job = self.client.query( + self.client.query( f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}", location=self.location, - ) - job.result() + ).result() def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None: - job = self.client.query( + self.client.query( f"CREATE OR REPLACE TABLE {target_sql} AS {select_body}", location=self.location, - ) - job.result() + ).result() def _create_or_replace_view_from_table( - self, view_name: str, backing_table: str, node: Node + self, + view_name: str, + backing_table: str, + node: Node, ) -> None: view_id = self._qualified_identifier(view_name) back_id = self._qualified_identifier(backing_table) self._ensure_dataset() - job = self.client.query( + self.client.query( f"CREATE OR REPLACE VIEW {view_id} AS SELECT * FROM {back_id}", location=self.location, - ) - job.result() + ).result() + # ---- Meta hook ---- def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: """ Write/update dataset._ff_meta after a successful build. + Both pandas + BigFrames executors use the logical engine key 'bigquery'. 
""" try: ensure_meta_table(self) upsert_meta(self, node.name, relation, fingerprint, "bigquery") except Exception: + # Best-effort: meta must not break the run pass - # ── Incremental API (parity with DuckDB/PG) ─────────────────────────── + # ── Incremental API (shared across BigQuery executors) ─────────────── def exists_relation(self, relation: str) -> bool: """ Check presence in INFORMATION_SCHEMA for tables/views. @@ -185,8 +177,12 @@ def create_table_as(self, relation: str, select_sql: str) -> None: CREATE TABLE AS with cleaned SELECT body (no trailing semicolons). """ self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + target = self._qualified_identifier( + relation, + project=self.project, + dataset=self.dataset, + ) self.client.query( f"CREATE TABLE {target} AS {body}", location=self.location, @@ -197,8 +193,12 @@ def incremental_insert(self, relation: str, select_sql: str) -> None: INSERT INTO with cleaned SELECT body. 
""" self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + target = self._qualified_identifier( + relation, + project=self.project, + dataset=self.dataset, + ) self.client.query( f"INSERT INTO {target} {body}", location=self.location, @@ -211,8 +211,12 @@ def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str - INSERT new rows from the same body """ self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + target = self._qualified_identifier( + relation, + project=self.project, + dataset=self.dataset, + ) pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE" delete_sql = f""" @@ -225,7 +229,11 @@ def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str self.client.query(insert_sql, location=self.location).result() def alter_table_sync_schema( - self, relation: str, select_sql: str, *, mode: str = "append_new_columns" + self, + relation: str, + select_sql: str, + *, + mode: str = "append_new_columns", ) -> None: """ Best-effort additive schema sync: @@ -236,7 +244,8 @@ def alter_table_sync_schema( return self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + # Infer schema using a no-row query (lets BigQuery type the expressions) probe = self.client.query( f"SELECT * FROM ({body}) WHERE 1=0", @@ -257,7 +266,12 @@ def alter_table_sync_schema( to_add = [name for name in out_fields if name not in existing_cols] if not to_add: return - target = self._qualified_identifier(relation, project=self.project, 
dataset=self.dataset) + + target = self._qualified_identifier( + relation, + project=self.project, + dataset=self.dataset, + ) for col in to_add: f = out_fields[col] typ = str(f.field_type) if hasattr(f, "field_type") else "STRING" diff --git a/src/fastflowtransform/executors/bigquery/bigframes.py b/src/fastflowtransform/executors/bigquery/bigframes.py new file mode 100644 index 0000000..6dfb549 --- /dev/null +++ b/src/fastflowtransform/executors/bigquery/bigframes.py @@ -0,0 +1,158 @@ +# fastflowtransform/executors/bigquery/bigframes.py +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +from fastflowtransform.core import Node +from fastflowtransform.executors.bigquery.base import BigQueryBaseExecutor +from fastflowtransform.typing import ( + BFDataFrame, + BigQueryOptions, + NotFound, + bf_global_session, + bigframes, +) + + +class BigQueryBFExecutor(BigQueryBaseExecutor[BFDataFrame]): + ENGINE_NAME = "bigquery_batch" + + def __init__( + self, + project: str, + dataset: str, + location: str | None = None, + allow_create_dataset: bool = False, + ): + if not project: + raise RuntimeError("BigFrames executor requires FF_BQ_PROJECT to be set.") + if not location: + raise RuntimeError( + "BigFrames executor requires FF_BQ_LOCATION to be set. " + "Use the dataset's region (e.g., EU or US)." + ) + super().__init__( + project=project, + dataset=dataset, + location=location, + allow_create_dataset=allow_create_dataset, + ) + + try: + ctx = BigQueryOptions( + project=project, + location=location, + ) + self.session = bigframes.Session(context=ctx) + except Exception as exc: + raise RuntimeError( + "Failed to initialize BigFrames session. Verify FF_BQ_PROJECT, " + "FF_BQ_DATASET, and FF_BQ_LOCATION are set for the active profile." + ) from exc + + def run_python(self, node: Node) -> None: + """ + Execute Python models with a session scoped to this executor. 
+ + We avoid mutating the process-wide default session; instead we + temporarily set the executor session as the active global session so + model code using bpd.DataFrame(...) picks up the configured location, + then restore afterward. + """ + ctx = bf_global_session._GlobalSessionContext(self.session) + with ctx: + super().run_python(node) + + # ---------- Python (Frames) ---------- + def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> BFDataFrame: + table_id = f"{self.project}.{self.dataset}.{relation}" + try: + return self.session.read_gbq(table_id) + except NotFound as e: + existing = [ + t.table_id for t in self.client.list_tables(f"{self.project}.{self.dataset}") + ] + raise RuntimeError( + f"Dependency table not found: {table_id}\n" + f"Deps: {list(deps)}\nExisting in dataset: {existing}\n" + "Hinweis: Seeds/Upstream-Modelle erzeugt? DATASET korrekt?" + ) from e + + def _materialize_relation(self, relation: str, df: BFDataFrame, node: Node) -> None: + table_id = f"{self.project}.{self.dataset}.{relation}" + + to_gbq = getattr(df, "to_gbq", None) + if callable(to_gbq): + to_gbq(table_id, if_exists="replace") + return + + # Fallback only when it is truly a method (not a column name!) + mat = getattr(df, "materialize", None) + if callable(mat): + mat(table=table_id, mode="overwrite") + return + + raise RuntimeError( + "BigQuery DataFrames: Ergebnis nicht materialisierbar. " + "Erwarte df.to_gbq(...) oder df.materialize(...)." 
+ ) + + # ---- Required-columns validation tuned for BigFrames ---- + def _validate_required( + self, + node_name: str, + inputs: Any, + requires: dict[str, set[str]], + ) -> None: + if not requires: + return + + def cols(bf_df: BFDataFrame) -> set[str]: + if hasattr(bf_df, "columns"): + return set(map(str, list(bf_df.columns))) + if hasattr(bf_df, "schema") and hasattr(bf_df.schema, "names"): + return set(bf_df.schema.names) + return set() + + errs: list[str] = [] + if self._is_frame(inputs): + # Single input frame case + need = next(iter(requires.values()), set()) + miss = need - cols(inputs) + if miss: + errs.append(f"- missing columns: {sorted(miss)}") + else: + # Mapping {rel -> frame} + for rel, need in requires.items(): + if rel not in inputs: + errs.append(f"- missing dependency key '{rel}'") + continue + miss = need - cols(inputs[rel]) + if miss: + errs.append(f"- [{rel}] missing: {sorted(miss)}") + + if errs: + raise ValueError( + f"Required columns check failed for BigQuery DataFrames model '{node_name}'.\n" + + "\n".join(errs) + ) + + def _columns_of(self, frame: BFDataFrame) -> list[str]: + if hasattr(frame, "columns"): + return [str(c) for c in list(frame.columns)] + if hasattr(frame, "schema") and hasattr(frame.schema, "names"): + return list(frame.schema.names) + return [] + + def _is_frame(self, obj: Any) -> bool: + if obj is None: + return False + return ( + callable(getattr(obj, "to_gbq", None)) + or callable(getattr(obj, "materialize", None)) + or hasattr(obj, "columns") + ) + + def _frame_name(self) -> str: + return "BigQuery DataFrame (BigFrames)" diff --git a/src/fastflowtransform/executors/bigquery/pandas.py b/src/fastflowtransform/executors/bigquery/pandas.py new file mode 100644 index 0000000..83b0d49 --- /dev/null +++ b/src/fastflowtransform/executors/bigquery/pandas.py @@ -0,0 +1,79 @@ +# fastflowtransform/executors/bigquery/pandas.py +from __future__ import annotations + +from collections.abc import Iterable + +import pandas as pd + 
+from fastflowtransform.core import Node +from fastflowtransform.executors.bigquery.base import BigQueryBaseExecutor +from fastflowtransform.typing import BadRequest, Client, LoadJobConfig, NotFound, bigquery + + +class BigQueryExecutor(BigQueryBaseExecutor[pd.DataFrame]): + ENGINE_NAME = "bigquery" + """ + BigQuery executor (pandas DataFrames). + ENV/Profiles typically use: + - FF_BQ_PROJECT + - FF_BQ_DATASET + - FF_BQ_LOCATION (optional) + """ + + def __init__( + self, + project: str, + dataset: str, + location: str | None = None, + client: Client | None = None, + allow_create_dataset: bool = False, + ): + super().__init__( + project=project, + dataset=dataset, + location=location, + client=client, + allow_create_dataset=allow_create_dataset, + ) + + # ---------- Python (Frames) ---------- + def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: + q = f"SELECT * FROM {self._qualified_identifier(relation)}" + try: + job = self.client.query(q, location=self.location) + return job.result().to_dataframe(create_bqstorage_client=True) + except NotFound as e: + # list existing tables to aid debugging + tables = list(self.client.list_tables(f"{self.project}.{self.dataset}")) + existing = [t.table_id for t in tables] + raise RuntimeError( + f"Dependency table not found: {self.project}.{self.dataset}.{relation}\n" + f"Deps: {list(deps)}\nExisting in dataset: {existing}\n" + "Hinweis: Seeds/Upstream-Modelle erzeugt? DATASET korrekt?" + ) from e + + def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: + self._ensure_dataset() + table_id = f"{self.project}.{self.dataset}.{relation}" + job_config = LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) + # Optionally extend dtype mapping here (NUMERIC/STRING etc.) 
+ try: + job = self.client.load_table_from_dataframe( + df, + table_id, + job_config=job_config, + location=self.location, + ) + job.result() + except BadRequest as e: + raise RuntimeError(f"BigQuery write failed: {table_id}\n{e}") from e + + def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None: + """ + Convenience helper for a simple view on top of a backing table. + """ + # Delegate to the shared base implementation + self._create_or_replace_view_from_table(view_name, backing_table, node) + + def _frame_name(self) -> str: + return "pandas" diff --git a/src/fastflowtransform/executors/bigquery_bf_exec.py b/src/fastflowtransform/executors/bigquery_bf_exec.py deleted file mode 100644 index e7175aa..0000000 --- a/src/fastflowtransform/executors/bigquery_bf_exec.py +++ /dev/null @@ -1,302 +0,0 @@ -# src/fastflowtransform/executors/bigquery_bf_exec.py -from __future__ import annotations - -from collections.abc import Iterable -from typing import TYPE_CHECKING, Any - -import bigframes # Package: google-cloud-bigquery-dataframes -from bigframes._config.bigquery_options import BigQueryOptions -from google.api_core.exceptions import BadRequest, NotFound -from google.cloud import bigquery - -from fastflowtransform.core import Node, relation_for -from fastflowtransform.executors._bigquery_mixin import BigQueryIdentifierMixin -from fastflowtransform.executors._shims import BigQueryConnShim -from fastflowtransform.executors.base import BaseExecutor -from fastflowtransform.meta import ensure_meta_table, upsert_meta - -if TYPE_CHECKING: - from bigframes.dataframe import DataFrame as BFDataFrame -else: - BFDataFrame = Any - - -class BigQueryBFExecutor(BigQueryIdentifierMixin, BaseExecutor[BFDataFrame]): - ENGINE_NAME = "bigquery_batch" - - def __init__(self, project: str, dataset: str, location: str | None = None): - self.project = project - self.dataset = dataset - self.location = location - self.client = bigquery.Client(project=project, 
location=location) - - try: - ctx = BigQueryOptions( - project=project, - # default_dataset=dataset, - location=location, - ) - self.session = bigframes.Session(context=ctx) - except Exception: - # Fallback: session without explicit context (ADC/default project), - # though you typically use fully qualified table IDs anyway. - self.session = bigframes.Session() - - self.con = BigQueryConnShim(self.client, location=self.location) - - def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> BFDataFrame: - table_id = f"{self.project}.{self.dataset}.{relation}" - try: - return self.session.read_gbq(table_id) - except NotFound as e: - existing = [ - t.table_id for t in self.client.list_tables(f"{self.project}.{self.dataset}") - ] - raise RuntimeError( - f"Dependency table not found: {table_id}\n" - f"Deps: {list(deps)}\nExisting in dataset: {existing}\n" - "Hinweis: Seeds/Upstream-Modelle erzeugt? DATASET korrekt?" - ) from e - - def _materialize_relation(self, relation: str, df: BFDataFrame, node: Node) -> None: - table_id = f"{self.project}.{self.dataset}.{relation}" - - to_gbq = getattr(df, "to_gbq", None) - if callable(to_gbq): - to_gbq(table_id, if_exists="replace") - return - - # Fallback only when it is truly a method (not a column name!) - mat = getattr(df, "materialize", None) - if callable(mat): - mat(table=table_id, mode="overwrite") - return - - raise RuntimeError( - "BigQuery DataFrames: Ergebnis nicht materialisierbar. " - "Erwarte df.to_gbq(...) oder df.materialize(...)." 
- ) - - # ---- Meta hook ---- - def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: - """Mirror DuckDB/PG: write/update _ff_meta after successful build.""" - try: - ensure_meta_table(self) - upsert_meta(self, node.name, relation, fingerprint, "bigquery") - except Exception: - # Best-effort: meta must not break the run - pass - - def _validate_required( - self, node_name: str, inputs: Any, requires: dict[str, set[str]] - ) -> None: - if not requires: - return - - def cols(bf_df: BFDataFrame) -> set[str]: - if hasattr(bf_df, "columns"): - return set(map(str, list(bf_df.columns))) - if hasattr(bf_df, "schema") and hasattr(bf_df.schema, "names"): - return set(bf_df.schema.names) - return set() - - errs: list[str] = [] - if self._is_frame(inputs): - need = next(iter(requires.values()), set()) - miss = need - cols(inputs) - if miss: - errs.append(f"- missing columns: {sorted(miss)}") - else: - for rel, need in requires.items(): - if rel not in inputs: - errs.append(f"- missing dependency key '{rel}'") - continue - miss = need - cols(inputs[rel]) - if miss: - errs.append(f"- [{rel}] missing: {sorted(miss)}") - if errs: - raise ValueError( - f"Required columns check failed for BigQuery DataFrames model '{node_name}'.\n" - + "\n".join(errs) - ) - - def _columns_of(self, frame: BFDataFrame) -> list[str]: - if hasattr(frame, "columns"): - return [str(c) for c in list(frame.columns)] - if hasattr(frame, "schema") and hasattr(frame.schema, "names"): - return list(frame.schema.names) - return [] - - def _is_frame(self, obj: Any) -> bool: - return bool(obj) and ( - callable(getattr(obj, "to_gbq", None)) - or callable(getattr(obj, "materialize", None)) - or hasattr(obj, "columns") - ) - - def _frame_name(self) -> str: - return "BigQuery DataFrame (BigFrames)" - - # ---- Helpers ---- - # ---- SQL hooks ---- - def _format_relation_for_ref(self, name: str) -> str: - return self._qualified_identifier(relation_for(name)) - - def _format_source_reference( - 
self, cfg: dict[str, Any], source_name: str, table_name: str - ) -> str: - if cfg.get("location"): - raise NotImplementedError("BigQuery executor does not support path-based sources.") - - ident = cfg.get("identifier") - if not ident: - raise KeyError(f"Source {source_name}.{table_name} missing identifier") - - proj = cfg.get("project") or cfg.get("database") or cfg.get("catalog") or self.project - dset = cfg.get("dataset") or cfg.get("schema") or self.dataset - return self._qualified_identifier(ident, project=proj, dataset=dset) - - def _apply_sql_materialization( - self, node: Node, target_sql: str, select_body: str, materialization: str - ) -> None: - self._ensure_dataset() - try: - super()._apply_sql_materialization(node, target_sql, select_body, materialization) - except BadRequest as e: - raise RuntimeError( - f"BigQuery SQL failed for {target_sql}:\n{select_body}\n\n{e}" - ) from e - - def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None: - self.client.query( - f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}", - location=self.location, - ).result() - - def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None: - self.client.query( - f"CREATE OR REPLACE TABLE {target_sql} AS {select_body}", - location=self.location, - ).result() - - def _create_or_replace_view_from_table( - self, view_name: str, backing_table: str, node: Node - ) -> None: - view_id = self._qualified_identifier(view_name) - back_id = self._qualified_identifier(backing_table) - self.client.query( - f"CREATE OR REPLACE VIEW {view_id} AS SELECT * FROM {back_id}", - location=self.location, - ).result() - - # ── Incremental API (feature parity with DuckDB/PG) ────────────────── - def exists_relation(self, relation: str) -> bool: - """Check presence in TABLES or VIEWS information schema.""" - proj = self.project - dset = self.dataset - rel = relation - q = f""" - SELECT 1 - FROM `{proj}.{dset}.INFORMATION_SCHEMA.TABLES` - 
WHERE LOWER(table_name)=LOWER(@rel) - UNION ALL - SELECT 1 - FROM `{proj}.{dset}.INFORMATION_SCHEMA.VIEWS` - WHERE LOWER(table_name)=LOWER(@rel) - LIMIT 1 - """ - job = self.client.query( - q, - job_config=bigquery.QueryJobConfig( - query_parameters=[bigquery.ScalarQueryParameter("rel", "STRING", rel)] - ), - location=self.location, - ) - return bool(list(job.result())) - - def create_table_as(self, relation: str, select_sql: str) -> None: - """CTAS with cleaned SELECT body (no trailing semicolons).""" - self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) - self.client.query( - f"CREATE TABLE {target} AS {body}", - location=self.location, - ).result() - - def incremental_insert(self, relation: str, select_sql: str) -> None: - """INSERT INTO with cleaned SELECT body.""" - self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) - self.client.query( - f"INSERT INTO {target} {body}", - location=self.location, - ).result() - - def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None: - """ - Portable fallback in BigQuery (without full MERGE): - - DELETE collisions via WHERE EXISTS against the cleaned SELECT body - - INSERT all rows from the body - Executed as two statements to keep error surfaces clean. 
- """ - self._ensure_dataset() - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) - pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE" - - # DELETE … WHERE EXISTS (SELECT 1 FROM (body) s WHERE pred) - delete_sql = f""" - DELETE FROM {target} t - WHERE EXISTS (SELECT 1 FROM ({body}) s WHERE {pred}) - """ - self.client.query(delete_sql, location=self.location).result() - - # INSERT new rows - insert_sql = f"INSERT INTO {target} SELECT * FROM ({body})" - self.client.query(insert_sql, location=self.location).result() - - def alter_table_sync_schema( - self, relation: str, select_sql: str, *, mode: str = "append_new_columns" - ) -> None: - """ - Best-effort additive schema sync: - - infer select schema via dry-run (schema on QueryJob) - - add missing columns as NULLABLE with inferred type - """ - if mode not in {"append_new_columns", "sync_all_columns"}: - return - - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - # Infer target schema from the query (no data read) - probe_job = self.client.query( - f"SELECT * FROM ({body}) WHERE 1=0", - job_config=bigquery.QueryJobConfig(dry_run=False, use_query_cache=False), - location=self.location, - ) - probe_job.result() - select_fields = {f.name: f for f in (probe_job.schema or [])} - - # Existing table schema - table_ref = f"{self.project}.{self.dataset}.{relation}" - try: - tbl = self.client.get_table(table_ref) - except NotFound: - return - existing_cols = {f.name for f in (tbl.schema or [])} - - to_add = [name for name in select_fields if name not in existing_cols] - if not to_add: - return - - target = self._qualified_identifier(relation, project=self.project, dataset=self.dataset) - for col in to_add: - bf = select_fields[col] - # Use BigQuery standard SQL type string (e.g., STRING, INT64, BOOL, FLOAT64, …) - typ = str(bf.field_type) if hasattr(bf, "field_type") else "STRING" 
- # Nullable by default - self.client.query( - f"ALTER TABLE {target} ADD COLUMN {col} {typ}", - location=self.location, - ).result() diff --git a/src/fastflowtransform/executors/databricks_spark_exec.py b/src/fastflowtransform/executors/databricks_spark.py similarity index 98% rename from src/fastflowtransform/executors/databricks_spark_exec.py rename to src/fastflowtransform/executors/databricks_spark.py index a8afca6..1c9aaae 100644 --- a/src/fastflowtransform/executors/databricks_spark_exec.py +++ b/src/fastflowtransform/executors/databricks_spark.py @@ -1,21 +1,12 @@ -# src/fastflowtransform/executors/databricks_spark_exec.py +# src/fastflowtransform/executors/databricks_spark.py from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Callable, Iterable from contextlib import suppress from pathlib import Path from typing import Any from urllib.parse import unquote, urlparse -from pyspark.sql import DataFrame as SDF, SparkSession -from pyspark.sql.types import DataType - -try: - # Enable Delta Lake via delta-spark when available - from delta import configure_spark_with_delta_pip -except Exception: # pragma: no cover - configure_spark_with_delta_pip = None # type: ignore[assignment] - from fastflowtransform import storage from fastflowtransform.core import REGISTRY, Node, relation_for from fastflowtransform.errors import ModelExecutionError @@ -24,6 +15,16 @@ from fastflowtransform.meta import ensure_meta_table, upsert_meta from fastflowtransform.table_formats import get_spark_format_handler from fastflowtransform.table_formats.base import SparkFormatHandler +from fastflowtransform.typing import SDF, DataType, SparkSession + +# Enable Delta Lake via delta-spark when available +configure_spark_with_delta_pip: Callable[..., Any] | None +try: + from delta import configure_spark_with_delta_pip as _configure_spark_with_delta_pip + + configure_spark_with_delta_pip = _configure_spark_with_delta_pip +except Exception: # 
pragma: no cover + configure_spark_with_delta_pip = None _DELTA_EXTENSION = "io.delta.sql.DeltaSparkSessionExtension" _DELTA_CATALOG = "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/src/fastflowtransform/executors/duckdb_exec.py b/src/fastflowtransform/executors/duckdb.py similarity index 98% rename from src/fastflowtransform/executors/duckdb_exec.py rename to src/fastflowtransform/executors/duckdb.py index c642672..f1c13ec 100644 --- a/src/fastflowtransform/executors/duckdb_exec.py +++ b/src/fastflowtransform/executors/duckdb.py @@ -1,4 +1,4 @@ -# fastflowtransform/executors/duckdb_exec.py +# fastflowtransform/executors/duckdb.py from __future__ import annotations from collections.abc import Iterable @@ -124,7 +124,7 @@ def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.D raise RuntimeError( f"Dependency table not found: '{relation}'\n" f"Deps: {list(deps)}\nExisting tables: {existing}\n" - "Hinweis: gleiche Datei-DB/Connection fΓΌr Seeding & Run verwenden." + "Note: Use same File-DB/Connection for Seeding & Run." 
) from e def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: diff --git a/src/fastflowtransform/executors/postgres_exec.py b/src/fastflowtransform/executors/postgres.py similarity index 99% rename from src/fastflowtransform/executors/postgres_exec.py rename to src/fastflowtransform/executors/postgres.py index 7590d4a..9eac580 100644 --- a/src/fastflowtransform/executors/postgres_exec.py +++ b/src/fastflowtransform/executors/postgres.py @@ -1,4 +1,4 @@ -# fastflowtransform/executors/postgres_exec.py +# fastflowtransform/executors/postgres.py from collections.abc import Iterable from typing import Any diff --git a/src/fastflowtransform/executors/snowflake_snowpark_exec.py b/src/fastflowtransform/executors/snowflake_snowpark.py similarity index 95% rename from src/fastflowtransform/executors/snowflake_snowpark_exec.py rename to src/fastflowtransform/executors/snowflake_snowpark.py index b63b21e..8e7c3a9 100644 --- a/src/fastflowtransform/executors/snowflake_snowpark_exec.py +++ b/src/fastflowtransform/executors/snowflake_snowpark.py @@ -1,14 +1,13 @@ -# src/fastflowtransform/executors/snowflake_snowpark_exec.py +# src/fastflowtransform/executors/snowflake_snowpark.py from __future__ import annotations from collections.abc import Iterable from typing import Any -from snowflake.snowpark import DataFrame as SNDF, Session - from fastflowtransform.core import Node, relation_for from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.meta import ensure_meta_table, upsert_meta +from fastflowtransform.typing import SNDF, SnowparkSession as Session class SnowflakeSnowparkExecutor(BaseExecutor[SNDF]): @@ -84,7 +83,13 @@ def _columns_of(self, frame: SNDF) -> list[str]: return list(frame.schema.names) def _is_frame(self, obj: Any) -> bool: - return isinstance(obj, SNDF) + # Accept real Snowpark DataFrames and test doubles with a compatible surface. 
+ schema = getattr(obj, "schema", None) + return isinstance(obj, SNDF) or ( + schema is not None + and hasattr(schema, "names") + and callable(getattr(obj, "collect", None)) + ) def _frame_name(self) -> str: return "Snowpark" diff --git a/src/fastflowtransform/incremental.py b/src/fastflowtransform/incremental.py index f522ec2..bc00d5a 100644 --- a/src/fastflowtransform/incremental.py +++ b/src/fastflowtransform/incremental.py @@ -158,10 +158,24 @@ def _full_refresh_table(executor: Any, relation: Any, rendered_sql: str) -> None full_refresh(relation, rendered_sql) return + # Best-effort qualified identifier for engines that expose it (e.g. BigQuery) + target = relation + qualify = getattr(executor, "_qualified_identifier", None) + if callable(qualify): + try: + proj = getattr(executor, "project", None) + dset = getattr(executor, "dataset", None) or getattr(executor, "schema", None) + if proj is not None or dset is not None: + target = qualify(relation, project=proj, dataset=dset) + else: + target = qualify(relation) + except Exception: + target = relation + try: executor.create_table_as(relation, rendered_sql) except Exception: - _exec_sql(executor, f"create or replace table {relation} as {rendered_sql}") + _exec_sql(executor, f"create or replace table {target} as {rendered_sql}") UniqueKey = str | Sequence[str] | None diff --git a/src/fastflowtransform/logging.py b/src/fastflowtransform/logging.py index f7efb19..4bfb8c2 100644 --- a/src/fastflowtransform/logging.py +++ b/src/fastflowtransform/logging.py @@ -208,7 +208,6 @@ def setup( if h.get_name() == "ff_console": root.removeHandler(h) except Exception: - # get_name gibt es ab 3.8 - fallback ΓΌber .name if getattr(h, "name", None) == "ff_console": root.removeHandler(h) diff --git a/src/fastflowtransform/run_executor.py b/src/fastflowtransform/run_executor.py index 36e8cfb..cd2ba49 100644 --- a/src/fastflowtransform/run_executor.py +++ b/src/fastflowtransform/run_executor.py @@ -101,7 +101,7 @@ def _run_level( 
name_width: int, name_formatter: Callable[[str], str] | None, ) -> tuple[bool, int, int, int]: - """Führt eine Ebene aus und loggt. Rückgabe: (had_error, ok_count, fail_count, lvl_ms).""" + """Executes one level and logs. Returns: (had_error, ok_count, fail_count, lvl_ms).""" if not names: return False, 0, 0, 0 diff --git a/src/fastflowtransform/schema_loader.py b/src/fastflowtransform/schema_loader.py index 99a551f..02520aa 100644 --- a/src/fastflowtransform/schema_loader.py +++ b/src/fastflowtransform/schema_loader.py @@ -15,8 +15,8 @@ @dataclass(frozen=True) class TestSpec: """ - Normalisierte Test-Spezifikation für die CLI. - Beispiel: not_null(users_enriched.email), unique(users_enriched.id), + Normalized test spec for CLI. + Example: not_null(users_enriched.email), unique(users_enriched.id), accepted_values(users_enriched.email, values=[...]) … """ @@ -39,8 +39,8 @@ def __post_init__(self): def load_schema_tests(project_dir: Path) -> list[TestSpec]: """ - Lädt dbt-ähnliche Schema-YAMLs (version: 2) unter models/**.yml (& schema.yml), - und gibt normalisierte TestSpec-Objekte zurück. + Loads schema yamls (version: 2) in models/**.yml (& schema.yml), + and returns normalized TestSpec objects. 
""" project_dir = Path(project_dir) models_dir = project_dir / "models" diff --git a/src/fastflowtransform/seeding.py b/src/fastflowtransform/seeding.py index 72a1c6e..071c7be 100644 --- a/src/fastflowtransform/seeding.py +++ b/src/fastflowtransform/seeding.py @@ -2,6 +2,7 @@ from __future__ import annotations import math +import os import shutil import uuid from collections.abc import Callable, Iterable @@ -12,18 +13,12 @@ from urllib.parse import unquote, urlparse import pandas as pd -from pyspark.sql import DataFrame as SDF, SparkSession from fastflowtransform import storage from fastflowtransform.config.seeds import SeedsSchemaConfig, load_seeds_schema from fastflowtransform.logging import echo from fastflowtransform.settings import EngineType - -try: # Optional Spark dependency - from pyspark.errors.exceptions.base import AnalysisException as _SparkAnalysisException -except Exception: # pragma: no cover - Spark not installed - _SparkAnalysisException = Exception # type: ignore - +from fastflowtransform.typing import SDF, SparkAnalysisException, SparkSession # ----------------------------- File I/O & Schema (dtypes) ----------------------------- @@ -333,7 +328,6 @@ def _resolve_schema_and_table_by_cfg( def _handle_duckdb(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: - """Versucht DuckDB zu erkennen und zu bedienen. 
Gibt True zurΓΌck, wenn ausgefΓΌhrt.""" con = getattr(executor, "con", None) if con is None: return False @@ -380,22 +374,27 @@ def _handle_duckdb(table: str, df: pd.DataFrame, executor: Any, schema: str | No def _handle_sqlalchemy(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: - """Versucht SQLAlchemy-Engine/-Connection zu erkennen und zu bedienen.""" eng = getattr(executor, "engine", None) if eng is None: return False - # heuristik: viele SQLAlchemy-Engines haben 'sqlalchemy' im Modulpfad der Klasse if "sqlalchemy" not in getattr(eng.__class__, "__module__", ""): return False + full_name = _qualify(table, schema) + dialect_name = getattr(getattr(eng, "dialect", None), "name", "") or "" + if dialect_name.lower() == "postgresql": + # Postgres blocks DROP TABLE when dependent views exist (e.g. stg_* views). + drop_sql = f"DROP TABLE IF EXISTS {full_name} CASCADE" + with eng.begin() as conn: + conn.exec_driver_sql(drop_sql) + t0 = perf_counter() - # pandas ΓΌbernimmt die DDL/DML β€” replace-Semantik wie im Original df.to_sql(table, eng, if_exists="replace", index=False, schema=schema, method="multi") dt_ms = int((perf_counter() - t0) * 1000) - dialect = getattr(getattr(eng, "dialect", None), "name", "sqlalchemy") + dialect = dialect_name or getattr(getattr(eng, "dialect", None), "name", "sqlalchemy") _echo_seed_line( - full_name=_qualify(table, schema), + full_name=full_name, rows=len(df), cols=df.shape[1], engine=dialect, @@ -406,6 +405,80 @@ def _handle_sqlalchemy(table: str, df: pd.DataFrame, executor: Any, schema: str return True +def _handle_bigquery(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool: + """ + Handle seeding for the BigQuery executor using the official client. + + We detect BigQuery by the presence of an attribute named ``client`` + that behaves like ``google.cloud.bigquery.Client``. 
The target dataset + is resolved as: + + 1) the provided ``schema`` argument (preferred; allows seeds/schema.yml + to control datasets explicitly), or + 2) an executor attribute such as ``dataset`` / ``dataset_id``. + + Notes: + - The dataset must already exist; this function does not create it. + - We use WRITE_TRUNCATE semantics (replace) to mirror the behavior of + the DuckDB / SQLAlchemy handlers. + """ + client = getattr(executor, "client", None) + if client is None: + return False + + # Prefer explicit schema from the caller / seeds/schema.yml. + dataset_id = ( + schema or getattr(executor, "dataset", None) or getattr(executor, "dataset_id", None) + ) + if not isinstance(dataset_id, str) or not dataset_id.strip(): + # Not a BigQuery executor we know how to handle. + return False + + dataset_id = dataset_id.strip() + + # Project: executor may expose it explicitly; otherwise fall back to the + # client project (Application Default Credentials, etc.). + project_id = getattr(executor, "project", None) + if not isinstance(project_id, str) or not project_id.strip(): + project_id = getattr(client, "project", None) + + if isinstance(project_id, str) and project_id.strip(): + table_id = f"{project_id.strip()}.{dataset_id}.{table}" + full_name = table_id + else: + # Dataset-qualified ID still works if a default project is set on the client. + table_id = f"{dataset_id}.{table}" + full_name = table_id + + try: + from google.cloud import bigquery # noqa PLC0415 type: ignore # pragma: no cover + except Exception as exc: # pragma: no cover - missing optional dependency + raise RuntimeError( + "google-cloud-bigquery is required for seeding into BigQuery, " + "but it is not installed. Install the BigQuery extras for " + "FastFlowTransform or add google-cloud-bigquery to your environment." 
+ ) from exc + + job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) + + t0 = perf_counter() + # Let the BigQuery client infer the schema from the pandas DataFrame. + load_job = client.load_table_from_dataframe(df, table_id, job_config=job_config) + load_job.result() # Wait for completion + dt_ms = int((perf_counter() - t0) * 1000) + + _echo_seed_line( + full_name=full_name, + rows=len(df), + cols=df.shape[1], + engine="bigquery", + ms=dt_ms, + created_schema=False, + action="replaced", + ) + return True + + def _spark_ident(name: str) -> str: """Return a Spark-safe identifier (escapes backticks).""" return name.replace("`", "``") @@ -561,7 +634,7 @@ def _write_spark_seed_to_table( try: _spark_write_table(sdf, target_identifier, table_format, table_options) return cleanup_hint - except _SparkAnalysisException as exc: + except SparkAnalysisException as exc: message = str(exc) if target_location and "LOCATION_ALREADY_EXISTS" in message.upper(): # Attempt to fix by resetting the table location and retrying once. @@ -678,11 +751,7 @@ def _handle_spark( Handler = Callable[[str, pd.DataFrame, Any, str | None], bool] -_HANDLERS: Iterable[Handler] = ( - _handle_duckdb, - _handle_sqlalchemy, - _handle_spark, -) +_HANDLERS: Iterable[Handler] = (_handle_duckdb, _handle_sqlalchemy, _handle_spark, _handle_bigquery) def materialize_seed( @@ -690,10 +759,6 @@ def materialize_seed( ) -> None: """ Materialize a DataFrame as a database table across engines. - - Engine-spezifische Logik ist in dedizierten Handlern gekapselt - (_handle_duckdb/_handle_sqlalchemy/_handle_spark). Der Dispatcher - ruft sie der Reihe nach auf, bis einer ΓΌbernimmt. 
""" for handler in _HANDLERS: if handler(table, df, executor, schema): @@ -705,6 +770,20 @@ def materialize_seed( # ----------------------------------- Seeding runner ----------------------------------- +def _resolve_seeds_dir(project_dir: Path) -> Path: + """ + Allow overriding the seeds directory via FFT_SEEDS_DIR, falling back to /seeds. + Relative overrides are resolved against the project directory. + """ + override = os.getenv("FFT_SEEDS_DIR") + if override: + path = Path(override) + if not path.is_absolute(): + path = project_dir / path + return path + return project_dir / "seeds" + + def seed_project(project_dir: Path, executor: Any, default_schema: str | None = None) -> int: """ Load every seed file under /seeds recursively and materialize it. @@ -736,12 +815,12 @@ def seed_project(project_dir: Path, executor: Any, default_schema: str | None = Raises: ValueError: if schema.yml uses a plain stem key while multiple files share that stem. """ - seeds_dir = project_dir / "seeds" + seeds_dir = _resolve_seeds_dir(project_dir) if not seeds_dir.exists(): return 0 # Pydantic-validated seeds/schema.yml (or None if not present) - schema_cfg = load_seeds_schema(project_dir) + schema_cfg = load_seeds_schema(project_dir, seeds_dir=seeds_dir) # Collect seed files recursively to allow folder-based schema conventions. 
paths: list[Path] = [ diff --git a/src/fastflowtransform/settings.py b/src/fastflowtransform/settings.py index ea34152..0436154 100644 --- a/src/fastflowtransform/settings.py +++ b/src/fastflowtransform/settings.py @@ -37,6 +37,7 @@ class BigQueryConfig(BaseConfig): dataset: str | None = None location: str | None = None use_bigframes: bool = True + allow_create_dataset: bool = False class DatabricksSparkConfig(BaseConfig): @@ -119,6 +120,7 @@ class EnvSettings(BaseSettings): BQ_PROJECT: str | None = None BQ_DATASET: str | None = None BQ_LOCATION: str | None = None + BQ_ALLOW_CREATE_DATASET: int | None = None # databricks spark DBR_MASTER: str | None = None @@ -249,6 +251,13 @@ def _ov_bigquery(raw: dict[str, Any], env: EnvSettings) -> None: if uf is not None: bq["use_bigframes"] = uf.lower() in ("1", "true", "yes", "on") + acd = getattr(env, "BQ_ALLOW_CREATE_DATASET", None) + if acd is not None: + if isinstance(acd, str): + bq["allow_create_dataset"] = acd.strip().lower() in {"1", "true", "yes", "on"} + else: + bq["allow_create_dataset"] = bool(acd) + def _ov_databricks_spark(raw: dict[str, Any], env: EnvSettings) -> None: dbr = raw.setdefault("databricks_spark", {}) @@ -269,7 +278,6 @@ def _ov_databricks_spark(raw: dict[str, Any], env: EnvSettings) -> None: def _ov_snowflake_snowpark(raw: dict[str, Any], env: EnvSettings) -> None: sf = raw.setdefault("snowflake_snowpark", {}) - # Feld heißt ΓΌberall "schema" _set_if(sf, "account", getattr(env, "SF_ACCOUNT", None)) _set_if(sf, "user", getattr(env, "SF_USER", None)) _set_if(sf, "password", getattr(env, "SF_PASSWORD", None)) diff --git a/src/fastflowtransform/table_formats/__init__.py b/src/fastflowtransform/table_formats/__init__.py index a1083b0..06dbf2b 100644 --- a/src/fastflowtransform/table_formats/__init__.py +++ b/src/fastflowtransform/table_formats/__init__.py @@ -3,7 +3,7 @@ from typing import Any -from pyspark.sql import SparkSession +from fastflowtransform.typing import SparkSession from .base import 
SparkFormatHandler from .spark_default import DefaultSparkFormatHandler diff --git a/src/fastflowtransform/table_formats/base.py b/src/fastflowtransform/table_formats/base.py index b251d0d..5b3b7a7 100644 --- a/src/fastflowtransform/table_formats/base.py +++ b/src/fastflowtransform/table_formats/base.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from typing import Any -from pyspark.sql import DataFrame as SDF, SparkSession +from fastflowtransform.typing import SDF, SparkSession class SparkFormatHandler(ABC): diff --git a/src/fastflowtransform/table_formats/spark_default.py b/src/fastflowtransform/table_formats/spark_default.py index 24b0ac3..f1735d4 100644 --- a/src/fastflowtransform/table_formats/spark_default.py +++ b/src/fastflowtransform/table_formats/spark_default.py @@ -3,9 +3,8 @@ from typing import Any -from pyspark.sql import DataFrame as SDF, SparkSession - from fastflowtransform.table_formats.base import SparkFormatHandler +from fastflowtransform.typing import SDF, SparkSession class DefaultSparkFormatHandler(SparkFormatHandler): diff --git a/src/fastflowtransform/table_formats/spark_delta.py b/src/fastflowtransform/table_formats/spark_delta.py index 66937df..756a2e5 100644 --- a/src/fastflowtransform/table_formats/spark_delta.py +++ b/src/fastflowtransform/table_formats/spark_delta.py @@ -1,12 +1,26 @@ # fastflowtransform/table_formats/spark_delta.py from __future__ import annotations -from typing import Any - -from delta.tables import DeltaTable -from pyspark.sql import DataFrame as SDF, SparkSession +from typing import TYPE_CHECKING, Any from fastflowtransform.table_formats.base import SparkFormatHandler +from fastflowtransform.typing import SDF, SparkSession + +if TYPE_CHECKING: # pragma: no cover - typing only + from delta.tables import DeltaTable +else: # pragma: no cover - runtime import + try: + from delta.tables import DeltaTable # type: ignore + except Exception: + + class DeltaTable: # type: ignore[misc] + """Fallback stub when 
delta-spark is unavailable.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + raise ImportError( + "delta-spark is required for DeltaFormatHandler. " + "Install fastflowtransform[spark] or delta-spark." + ) class DeltaFormatHandler(SparkFormatHandler): diff --git a/src/fastflowtransform/table_formats/spark_hudi.py b/src/fastflowtransform/table_formats/spark_hudi.py index a0ad8ed..318b9fd 100644 --- a/src/fastflowtransform/table_formats/spark_hudi.py +++ b/src/fastflowtransform/table_formats/spark_hudi.py @@ -2,9 +2,8 @@ from typing import Any -from pyspark.sql import DataFrame as SDF, SparkSession - from fastflowtransform.table_formats.base import SparkFormatHandler +from fastflowtransform.typing import SDF, SparkSession class HudiFormatHandler(SparkFormatHandler): diff --git a/src/fastflowtransform/table_formats/spark_iceberg.py b/src/fastflowtransform/table_formats/spark_iceberg.py index 7c6cf36..9175139 100644 --- a/src/fastflowtransform/table_formats/spark_iceberg.py +++ b/src/fastflowtransform/table_formats/spark_iceberg.py @@ -2,9 +2,8 @@ from typing import Any -from pyspark.sql import DataFrame as SDF, SparkSession - from fastflowtransform.table_formats.base import SparkFormatHandler +from fastflowtransform.typing import SDF, SparkSession class IcebergFormatHandler(SparkFormatHandler): diff --git a/src/fastflowtransform/testing/__init__.py b/src/fastflowtransform/testing/__init__.py index e69de29..1eb3ee2 100644 --- a/src/fastflowtransform/testing/__init__.py +++ b/src/fastflowtransform/testing/__init__.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from .registry import TESTS, Runner, register_test + +__all__ = [ + "TESTS", + "Runner", + "register_test", +] diff --git a/src/fastflowtransform/testing/base.py b/src/fastflowtransform/testing/base.py index cac08f8..95cb34b 100644 --- a/src/fastflowtransform/testing/base.py +++ b/src/fastflowtransform/testing/base.py @@ -276,7 +276,7 @@ def freshness(con: Any, table: str, ts_col: str, 
max_delay_minutes: int) -> None # 2) Decide which SQL to use based on the connection type. # # We cannot rely on a formal engine flag here, but the Databricks/Spark - # test connection lives in the databricks_spark_exec module and/or wraps + # test connection lives in the databricks_spark module and/or wraps # a SparkSession. We use a simple heuristic on the connection type name # and module to detect "Spark-like" behaviour. con_type = type(con) @@ -285,6 +285,11 @@ def freshness(con: Any, table: str, ts_col: str, max_delay_minutes: int) -> None mod_l = mod.lower() name_l = name.lower() is_spark_like = any(token in mod_l or token in name_l for token in ("spark", "databricks")) + is_bigquery = ( + "bigquery" in mod_l + or "bigquery" in name_l + or str(getattr(con, "marker", "")).upper() == "BQ_SHIM" + ) # Primary SQL (Postgres / DuckDB style) sql_primary = ( @@ -298,6 +303,12 @@ def freshness(con: Any, table: str, ts_col: str, max_delay_minutes: int) -> None f"as delay_min from {table}" ) + # BigQuery: TIMESTAMP_DIFF returns integer minutes; keep float compatibility + sql_bigquery = ( + f"select cast(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), max({ts_col}), MINUTE) as float64) " + f"as delay_min from {table}" + ) + delay = None sql_used: str @@ -309,6 +320,13 @@ def freshness(con: Any, table: str, ts_col: str, max_delay_minutes: int) -> None delay = _scalar(con, sql_spark) except Exception as e: raise _wrap_db_error("freshness", table, ts_col, sql_spark, e) from e + elif is_bigquery: + sql_used = sql_bigquery + try: + delay = _scalar(con, sql_bigquery) + except Exception as e: + # BigQuery error messages don't mention EXTRACT/EPOCH; surface directly. + raise _wrap_db_error("freshness", table, ts_col, sql_bigquery, e) from e else: # Non-Spark engines: try the Postgres/DuckDB expression first. 
sql_used = sql_primary diff --git a/src/fastflowtransform/testing/registry.py b/src/fastflowtransform/testing/registry.py index 2dc60b8..28fd88b 100644 --- a/src/fastflowtransform/testing/registry.py +++ b/src/fastflowtransform/testing/registry.py @@ -368,3 +368,49 @@ def run_reconcile_coverage( "reconcile_diff_within": run_reconcile_diff_within, "reconcile_coverage": run_reconcile_coverage, } + + +# --------------------------------------------------------------------------- +# Public registration API +# --------------------------------------------------------------------------- + + +def register_test(name: str, runner: Runner, *, overwrite: bool = False) -> None: + """ + Register (or override) a data-quality test runner. + + Usage: + + from fastflowtransform.testing import register_test + + def my_runner(con, table, column, params): + ... + return True, None, None + + register_test("my_custom_test", my_runner) + + Args: + name: Name of the test as used in project.yml / schema.yml (`type:` field). + runner: Callable implementing the Runner protocol. + overwrite: If False (default), attempting to override an existing name + raises ValueError. Set True to replace built-ins or earlier + registrations. + + Raises: + ValueError: If name is empty or already registered (and overwrite=False). + TypeError: If runner is not callable. + """ + if not isinstance(name, (str, bytes)) or not str(name).strip(): + raise ValueError("Test name must be a non-empty string") + + if not callable(runner): + raise TypeError("runner must be callable") + + key = str(name).strip() + if key in TESTS and not overwrite: + raise ValueError( + f"Test '{key}' is already registered. " + "Pass overwrite=True to replace the existing runner." 
+ ) + + TESTS[key] = runner diff --git a/src/fastflowtransform/typing.py b/src/fastflowtransform/typing.py new file mode 100644 index 0000000..d3efbfa --- /dev/null +++ b/src/fastflowtransform/typing.py @@ -0,0 +1,168 @@ +# Centralized optional-dependency imports for typing and runtime hints. +# Provides best-effort imports with lightweight fallbacks so modules can +# reference these names without duplicating TYPE_CHECKING blocks. +from __future__ import annotations + +from types import SimpleNamespace +from typing import TYPE_CHECKING, Any, cast + +__all__ = [ + "SDF", + "SNDF", + "BFDataFrame", + "BadRequest", + "BigQueryOptions", + "Client", + "DataType", + "LoadJobConfig", + "NotFound", + "SnowparkSession", + "SparkAnalysisException", + "SparkSession", + "WriteDisposition", + "bf_global_session", + "bigframes", + "bigquery", +] + +# --- Google client + exceptions --- +if TYPE_CHECKING: # pragma: no cover - typing only + from google.api_core.exceptions import BadRequest, NotFound +else: # pragma: no cover - runtime import + try: + from google.api_core.exceptions import BadRequest, NotFound # type: ignore + except Exception: + + class BadRequest(Exception): + """Fallback when google.api_core is unavailable.""" + + class NotFound(Exception): + """Fallback when google.api_core is unavailable.""" + + +if TYPE_CHECKING: # pragma: no cover - typing only + from google.cloud import bigquery + from google.cloud.bigquery import Client, LoadJobConfig, WriteDisposition +else: # pragma: no cover - runtime import + try: + from google.cloud import bigquery # type: ignore + from google.cloud.bigquery import Client, LoadJobConfig, WriteDisposition # type: ignore + except Exception: + # Minimal stubs so imports don't fail without google installed. 
+ class _DatasetStub: + def __init__(self, dataset_id: str): + self.dataset_id = dataset_id + self.location: str | None = None + + class _WriteDispositionStub: + WRITE_TRUNCATE = "WRITE_TRUNCATE" + + class _QueryJobConfigStub: + def __init__(self, **kwargs: Any): + self.kwargs = kwargs + + class _ScalarQueryParameterStub: + def __init__(self, name: str, typ: str, val: Any): + self.name = name + self.type_ = typ + self.value = val + + bigquery = cast( + Any, + SimpleNamespace( + Dataset=_DatasetStub, + WriteDisposition=_WriteDispositionStub, + QueryJobConfig=_QueryJobConfigStub, + ScalarQueryParameter=_ScalarQueryParameterStub, + LoadJobConfig=lambda **kwargs: SimpleNamespace(**kwargs), + ), + ) + Client = Any + LoadJobConfig = Any + WriteDisposition = _WriteDispositionStub + +# --- BigFrames (BigQuery DataFrames) --- +if TYPE_CHECKING: # pragma: no cover - typing only + import bigframes # Package: google-cloud-bigquery-dataframes + import bigframes.core.global_session as bf_global_session + from bigframes._config.bigquery_options import BigQueryOptions + from bigframes.dataframe import DataFrame as BFDataFrame +else: # pragma: no cover - runtime import + try: + import bigframes # type: ignore + import bigframes.core.global_session as bf_global_session # type: ignore + from bigframes._config.bigquery_options import ( # type: ignore + BigQueryOptions as _BFBigQueryOptions, + ) + except Exception: + # Provide minimal stubs so imports succeed; tests monkeypatch these. 
+ class _BFBigQueryOptions: + def __init__(self, *a: Any, **kw: Any): + self.args = (a, kw) + + class _SessionStub: + def __init__(self, *a: Any, **kw: Any): + self.args = (a, kw) + + class _GlobalSessionContext: + def __init__(self, session: Any): + self.session = session + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + bigframes = cast(Any, SimpleNamespace(Session=_SessionStub)) + bf_global_session = cast(Any, SimpleNamespace(_GlobalSessionContext=_GlobalSessionContext)) + + BigQueryOptions = _BFBigQueryOptions # type: ignore[assignment] + + try: + from bigframes.dataframe import DataFrame as BFDataFrame # type: ignore + except Exception: + BFDataFrame = Any # type: ignore[assignment] + +# --- Spark --- +if TYPE_CHECKING: # pragma: no cover - typing only + from pyspark.errors.exceptions.base import AnalysisException as SparkAnalysisException + from pyspark.sql import DataFrame as SDF, SparkSession + from pyspark.sql.types import DataType +else: # pragma: no cover - runtime import + try: + from pyspark.sql import DataFrame as SDF, SparkSession # type: ignore + except Exception: + SDF = Any # type: ignore[assignment] + SparkSession = Any # type: ignore[assignment] + + try: + from pyspark.sql.types import DataType # type: ignore + except Exception: + DataType = Any # type: ignore[assignment] + + try: + from pyspark.errors.exceptions.base import ( # type: ignore + AnalysisException as SparkAnalysisException, + ) + except Exception: + + class SparkAnalysisException(Exception): + """Fallback if pyspark is unavailable.""" + + +# --- Snowflake Snowpark --- +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SNDF, Session as SnowparkSession +else: # pragma: no cover - runtime import + try: + from snowflake.snowpark import DataFrame as SNDF, Session as SnowparkSession # type: ignore + except Exception: + + class SnowparkSession: # type: ignore[misc] + """Fallback stub when 
snowflake.snowpark is unavailable.""" + + builder = SimpleNamespace(configs=lambda cfg: SimpleNamespace(create=lambda: None)) + + class SNDF: # type: ignore[misc] + """Fallback Snowpark DataFrame stub.""" diff --git a/src/fastflowtransform/utest.py b/src/fastflowtransform/utest.py index 0b50db2..5e46c2f 100644 --- a/src/fastflowtransform/utest.py +++ b/src/fastflowtransform/utest.py @@ -420,7 +420,6 @@ def _apply_approx_equalization( f"expected={e_num[bad].tolist()} vs actual={a_num[bad].tolist()}" ) - # Gleichziehen, damit equals() spΓ€ter nicht stolpert actual_df[col] = exp[col] checked.append(col) @@ -649,22 +648,17 @@ def run_unit_specs( cand_fp = _fingerprint_case(node, spec, case, ctx) - # Inputs laden/prΓΌfen (zΓ€hlt failures selbst) ctx.failures += _load_inputs_for_case(executor, spec, case, node) - # ggf. Skip if _maybe_skip_by_cache(node, cand_fp, ctx): _read_and_assert(spec, case, ctx) continue - # ausfΓΌhren + ggf. Cache aktualisieren if not _execute_and_update_cache(node, cand_fp, ctx): continue - # Ergebnis prΓΌfen _read_and_assert(spec, case, ctx) - # Cache persistieren (nur rw) if ctx.cache and ctx.computed_fps and ctx.cache_mode == "rw": # pragma: no cover ctx.cache.update_many(ctx.computed_fps) ctx.cache.save() diff --git a/tests/.env.dev_postgres b/tests/.env.dev_postgres index c374662..88493d4 100644 --- a/tests/.env.dev_postgres +++ b/tests/.env.dev_postgres @@ -1,3 +1,2 @@ # Postgres profile for unittests FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 -FF_PG_SCHEMA=incremental_demo diff --git a/tests/common/fixtures.py b/tests/common/fixtures.py index a46c8b3..a3021f3 100644 --- a/tests/common/fixtures.py +++ b/tests/common/fixtures.py @@ -1,24 +1,51 @@ # tests/common/fixtures.py +from __future__ import annotations + import os from contextlib import suppress from pathlib import Path from types import SimpleNamespace +from typing import TYPE_CHECKING, Any from unittest.mock import MagicMock, patch import pandas as pd -import 
psycopg import pytest import sqlalchemy as sa from dotenv import load_dotenv from jinja2 import Environment, FileSystemLoader, select_autoescape -from psycopg import sql from sqlalchemy import text +if TYPE_CHECKING: # pragma: no cover - typing only + import psycopg + from psycopg import sql + + from fastflowtransform.executors.databricks_spark import ( + DatabricksSparkExecutor as DatabricksSparkExecutorType, + ) +else: + try: + import psycopg # type: ignore + from psycopg import sql # type: ignore + except ModuleNotFoundError: + psycopg = None # type: ignore + sql = None # type: ignore + + DatabricksSparkExecutorType = Any + +try: # Optional: Spark deps may not be installed in core runs + from fastflowtransform.executors.databricks_spark import DatabricksSparkExecutor +except ModuleNotFoundError: # pragma: no cover - import guard + DatabricksSparkExecutor = None # type: ignore + from fastflowtransform import utest from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.databricks_spark_exec import DatabricksSparkExecutor from tests.common.utils import ROOT, run +try: # Optional: Spark deps may not be installed in core runs + from fastflowtransform.executors.databricks_spark import DatabricksSparkExecutor +except ModuleNotFoundError: # pragma: no cover - import guard + DatabricksSparkExecutor = None # type: ignore + # ---- Load Env Variables ---- @pytest.fixture(scope="session", autouse=True) @@ -111,6 +138,8 @@ def pg_env(): def pg_seeded(pg_project, pg_env): dsn = pg_env.get("FF_PG_DSN") schema = pg_env.get("FF_PG_SCHEMA") or "public" + if psycopg is None or sql is None: + pytest.skip("psycopg not installed; install fastflowtransform[postgres] to run PG fixtures") if dsn and schema and ("psycopg://" in dsn or "+psycopg" in dsn): with suppress(Exception), psycopg.connect(dsn) as conn: conn.execute(sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(sql.Identifier(schema))) @@ -123,7 +152,11 @@ def pg_seeded(pg_project, pg_env): # ---- Spark 
---- @pytest.fixture def exec_minimal(monkeypatch): - with patch("fastflowtransform.executors.databricks_spark_exec.SparkSession") as SP: + if DatabricksSparkExecutor is None: + pytest.skip( + "pyspark/delta not installed; install fastflowtransform[spark] to run Spark tests" + ) + with patch("fastflowtransform.executors.databricks_spark.SparkSession") as SP: fake_spark = MagicMock() SP.builder.master.return_value.appName.return_value.getOrCreate.return_value = fake_spark ex = DatabricksSparkExecutor() @@ -140,8 +173,12 @@ def exec_factory(): Returns (executor, fake_builder, fake_spark). """ - def _make(**kwargs): - with patch("fastflowtransform.executors.databricks_spark_exec.SparkSession") as SP: + def _make(**kwargs) -> tuple[DatabricksSparkExecutorType, Any, MagicMock]: + if DatabricksSparkExecutor is None: + pytest.skip( + "pyspark/delta not installed; install fastflowtransform[spark] to run Spark tests" + ) + with patch("fastflowtransform.executors.databricks_spark.SparkSession") as SP: fake_builder = SP.builder.master.return_value.appName.return_value # make .config(...) 
chainable fake_builder.config.return_value = fake_builder @@ -164,7 +201,11 @@ def spark_tmpdir(tmp_path_factory: pytest.TempPathFactory) -> Path: @pytest.fixture(scope="session") -def spark_exec(spark_tmpdir: Path) -> DatabricksSparkExecutor: +def spark_exec(spark_tmpdir: Path) -> DatabricksSparkExecutorType: + if DatabricksSparkExecutor is None: + pytest.skip( + "pyspark/delta not installed; install fastflowtransform[spark] to run Spark tests" + ) return DatabricksSparkExecutor( master="local[*]", app_name="fft-it", @@ -175,10 +216,10 @@ def spark_exec(spark_tmpdir: Path) -> DatabricksSparkExecutor: @pytest.fixture(scope="session") def spark_exec_delta(spark_tmpdir): - try: - pass - except Exception: - pytest.skip("delta-spark is not installed; skipping Delta tests") + if DatabricksSparkExecutor is None: + pytest.skip( + "pyspark/delta not installed; install fastflowtransform[spark] to run Spark tests" + ) extra_conf = { "spark.ui.enabled": "false", @@ -210,7 +251,7 @@ def fake_registry(tmp_path, monkeypatch): @pytest.fixture -def duckdb_executor(): +def duckdbutor(): """ Fake-Executor: - has .con @@ -236,9 +277,9 @@ def run_python(self, node): @pytest.fixture -def postgres_executor(): +def postgresutor(): """ - Fake-Executor fΓΌr den Postgres-Zweig in _read_result. + Fake-Executor for Postgres in _read_result. """ engine = MagicMock() @@ -264,7 +305,7 @@ def duckdb_engine_env(tmp_path_factory): @pytest.fixture(scope="session") def postgres_engine_env(): - """Basic env fΓΌr Postgres. Skipped if DSN is missing or DB not reachable.""" + """Basic env for Postgres. 
Skipped if DSN is missing or DB not reachable.""" dsn = os.environ.get( "FF_PG_DSN", "postgresql+psycopg://postgres:postgres@localhost:5432/ffdb", @@ -297,8 +338,33 @@ def spark_engine_env(tmp_path_factory): return { "FF_ENGINE": "databricks_spark", "FF_SPARK_MASTER": "local[*]", - "FF_SPARK_APP_NAME": "fft_examples_ci", "FF_DBR_ENABLE_HIVE": "1", - "FF_DBR_DATABASE": "ff_examples_ci", "FF_SPARK_WAREHOUSE_DIR": str(warehouse), } + + +@pytest.fixture(scope="session") +def bigquery_engine_env(): + """ + Basic env for BigQuery examples. Skips if required env vars are missing. + """ + project = os.environ.get("FF_BQ_PROJECT") + dataset = os.environ.get("FF_BQ_DATASET") + location = os.environ.get("FF_BQ_LOCATION") + + if not (project and dataset and location): + pytest.skip("FF_BQ_PROJECT/FF_BQ_DATASET/FF_BQ_LOCATION not set for BigQuery tests") + + env = { + "FF_ENGINE": "bigquery", + "FF_ENGINE_VARIANT": os.environ.get("FF_ENGINE_VARIANT", "bigframes"), + "FF_BQ_PROJECT": project, + "FF_BQ_DATASET": dataset, + "FF_BQ_LOCATION": location, + } + + creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + if creds: + env["GOOGLE_APPLICATION_CREDENTIALS"] = creds + + return env diff --git a/tests/common/mock/bigquery.py b/tests/common/mock/bigquery.py index b27bdb3..49d8ace 100644 --- a/tests/common/mock/bigquery.py +++ b/tests/common/mock/bigquery.py @@ -1,4 +1,4 @@ -# tests/helpers/fake_bigquery.py +# tests/common/mock/bigquery.py from __future__ import annotations import sys @@ -8,6 +8,17 @@ import pandas as pd +# Optional dependency: provide lightweight fallbacks when google libs are absent. 
+try: + from google.api_core.exceptions import BadRequest, NotFound +except Exception: # pragma: no cover - when google is not installed + + class BadRequest(Exception): + pass + + class NotFound(Exception): + pass + # --------------------------------------------------------------------------- # Fake types # --------------------------------------------------------------------------- @@ -68,25 +79,29 @@ def __init__(self, dataset_id: str): self.location: str | None = None -class FakeBadRequest(Exception): +class FakeBadRequest(BadRequest): + """Test helper that behaves like a BadRequest for our wrappers.""" + pass -class FakeNotFound(Exception): +class FakeNotFound(NotFound): + """Same idea for NotFound, if you need it.""" + pass class FakeClient: """ - Gemeinsamer Client fΓΌr beide Executor-Tests. - Kann: + Common Client for both Executor-Tests. + Can: - query(...) - list_tables(...) - get_table(...) - get_dataset(...) - create_dataset(...) - - load_table_from_dataframe(...) (fΓΌr pandas-Executor) - und hat: + - load_table_from_dataframe(...) 
(for pandas-Executor) + and has: - _datasets: set[str] - _tables: dict[str, list[Any]] """ @@ -95,12 +110,12 @@ def __init__(self, project: str, location: str | None = None): self.project = project self.location = location self.queries: list[tuple[str, str | None, Any | None]] = [] - self._datasets: set[str] = set() + self._datasets: dict[str, FakeDataset] = {} self._tables: dict[str, list[Any]] = {} # ---- Test helper ---- def add_dataset(self, ds_id: str) -> None: - self._datasets.add(ds_id) + self._datasets.setdefault(ds_id, FakeDataset(ds_id)) def add_table(self, dataset_id: str, table_id: str) -> None: self._tables.setdefault(dataset_id, []).append(SimpleNamespace(table_id=table_id)) @@ -140,15 +155,18 @@ def get_table(self, table_ref: str): raise FakeNotFound(f"table {table_ref} not found") def get_dataset(self, ds_id: str): - if ds_id not in self._datasets: + ds = self._datasets.get(ds_id) + if ds is None: raise FakeNotFound(f"dataset {ds_id} not found") - return FakeDataset(ds_id) + return ds - def create_dataset(self, ds_obj: Any): + def create_dataset(self, ds_obj: Any, exists_ok: bool | None = None): ds_id = getattr(ds_obj, "dataset_id", ds_obj) - self._datasets.add(ds_id) - ds = FakeDataset(ds_id) + ds = self._datasets.get(ds_id) + if ds is None or not exists_ok: + ds = FakeDataset(ds_id) ds.location = getattr(ds_obj, "location", None) + self._datasets[ds_id] = ds return ds def load_table_from_dataframe(self, df, table_id: str, job_config: Any, location: str | None): @@ -183,17 +201,34 @@ def make_fake_bigquery_module() -> types.ModuleType: def install_fake_bigquery(monkeypatch, target_modules: list[types.ModuleType]) -> types.ModuleType: """ - Installiert unser Fake-bigquery sowohl in sys.modules als auch in den angegebenen - Zielmodulen (per monkeypatch.setattr(mod, "bigquery", ...)). - Gibt das Fake-Modul zurΓΌck. + Install the fake BigQuery module into sys.modules and optionally patch + target modules that expose a top-level ``bigquery`` attribute. 
+ + This ensures that imports like ``from google.cloud import bigquery`` and + ``import google.cloud.bigquery as bigquery`` see the fake implementation + during tests. + + Args: + monkeypatch: pytest's monkeypatch fixture. + target_modules: Modules that may reference a top-level ``bigquery`` + symbol. For each module that actually has such an attribute, it + will be replaced with the fake BigQuery module. + + Returns: + The fake BigQuery module that was installed. """ fake_bq = make_fake_bigquery_module() + # Make the fake visible as google.cloud.bigquery gc_mod = sys.modules.setdefault("google.cloud", types.ModuleType("google.cloud")) gc_mod.bigquery = fake_bq # type: ignore[attr-defined] sys.modules["google.cloud.bigquery"] = fake_bq + # For backwards compatibility, only patch modules that actually expose + # a top-level `bigquery` attribute. After the executor refactor, not + # every BigQuery-related module has that symbol anymore. for m in target_modules: - monkeypatch.setattr(m, "bigquery", fake_bq, raising=True) + if hasattr(m, "bigquery"): + monkeypatch.setattr(m, "bigquery", fake_bq, raising=True) return fake_bq diff --git a/tests/common/mock/profiles.py b/tests/common/mock/profiles.py index b36fc41..5b46496 100644 --- a/tests/common/mock/profiles.py +++ b/tests/common/mock/profiles.py @@ -4,7 +4,6 @@ from types import SimpleNamespace from typing import cast -# wir brauchen nur den Typ fΓΌr das cast from fastflowtransform.settings import ( Profile, ) @@ -16,6 +15,7 @@ def fake_bigquery_profile( dataset: str = "ds1", location: str | None = "EU", use_bigframes: bool = False, + allow_create_dataset: bool = False, ) -> Profile: """ Return a shape-compatible fake of a BigQuery profile. 
@@ -32,6 +32,7 @@ def fake_bigquery_profile( dataset=dataset, location=location, use_bigframes=use_bigframes, + allow_create_dataset=allow_create_dataset, ), ) # tell the type checker: "this is good enough to be treated as Profile" diff --git a/tests/integration/artifacts/test_catalog_duckdb_integration.py b/tests/integration/artifacts/test_catalog_duckdb_integration.py index 00a0640..cdf3aa3 100644 --- a/tests/integration/artifacts/test_catalog_duckdb_integration.py +++ b/tests/integration/artifacts/test_catalog_duckdb_integration.py @@ -5,7 +5,7 @@ from fastflowtransform.artifacts import write_catalog from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.integration diff --git a/tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py b/tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py index 51f178c..1941ac5 100644 --- a/tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py +++ b/tests/integration/cli/test_cmd/test_test_cmd_schema_merge_integration.py @@ -4,7 +4,7 @@ from fastflowtransform.cli.test_cmd import _apply_legacy_tag_filter, _run_dq_tests from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.schema_loader import load_schema_tests diff --git a/tests/integration/core/test_buildins_var_this_integration.py b/tests/integration/core/test_buildins_var_this_integration.py index 48089d4..2f9c3e8 100644 --- a/tests/integration/core/test_buildins_var_this_integration.py +++ b/tests/integration/core/test_buildins_var_this_integration.py @@ -4,7 +4,7 @@ import pytest from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor 
@pytest.mark.duckdb diff --git a/tests/integration/core/test_python_model_dependencies_integration.py b/tests/integration/core/test_python_model_dependencies_integration.py index 7860d2b..ffb4f9e 100644 --- a/tests/integration/core/test_python_model_dependencies_integration.py +++ b/tests/integration/core/test_python_model_dependencies_integration.py @@ -3,7 +3,7 @@ import pytest from fastflowtransform.core import REGISTRY, Node -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.integration diff --git a/tests/integration/examples/config.py b/tests/integration/examples/config.py index dd076cb..4734ffe 100644 --- a/tests/integration/examples/config.py +++ b/tests/integration/examples/config.py @@ -17,6 +17,16 @@ class ExampleConfig: EXAMPLES: list[ExampleConfig] = [ + ExampleConfig( + name="api_demo", + path=ROOT / "examples" / "api_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + ), ExampleConfig( name="basic_demo", path=ROOT / "examples" / "basic_demo", @@ -28,8 +38,8 @@ class ExampleConfig: }, ), ExampleConfig( - name="materializations_demo", - path=ROOT / "examples" / "materializations_demo", + name="cache_demo", + path=ROOT / "examples" / "cache_demo", make_target="demo", env_by_engine={ "duckdb": "dev_duckdb", @@ -48,16 +58,19 @@ class ExampleConfig: }, ), ExampleConfig( - name="macros_demo", - path=ROOT / "examples" / "macros_demo", + name="incremental_demo", + path=ROOT / "examples" / "incremental_demo", make_target="demo", env_by_engine={ "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", }, + spark_table_formats=["parquet", "delta", "iceberg"], ), ExampleConfig( - name="api_demo", - path=ROOT / "examples" / "api_demo", + name="macros_demo", + path=ROOT / "examples" / "macros_demo", make_target="demo", env_by_engine={ "duckdb": 
"dev_duckdb", @@ -66,14 +79,13 @@ class ExampleConfig: }, ), ExampleConfig( - name="incremental_demo", - path=ROOT / "examples" / "incremental_demo", + name="materializations_demo", + path=ROOT / "examples" / "materializations_demo", make_target="demo", env_by_engine={ "duckdb": "dev_duckdb", "postgres": "dev_postgres", "databricks_spark": "dev_databricks", }, - spark_table_formats=["parquet", "delta", "iceberg"], ), ] diff --git a/tests/integration/examples/test_examples_matrix.py b/tests/integration/examples/test_examples_matrix.py index f43d77d..32df40e 100644 --- a/tests/integration/examples/test_examples_matrix.py +++ b/tests/integration/examples/test_examples_matrix.py @@ -16,6 +16,16 @@ def _run_cmd(cmd: list[str], cwd: Path, extra_env: dict[str, str] | None = None) env.update(extra_env) proc = run(cmd, check=False, cwd=str(cwd), env=env, text=True, capture_output=True) if proc.returncode != 0: + # Echo outputs to aid CI debugging before raising. + out = proc.stdout or "" + err = proc.stderr or "" + print(f"\n--- Command failed: {' '.join(cmd)} (cwd={cwd}) ---") + if out: + print("STDOUT:") + print(out) + if err: + print("STDERR:") + print(err) raise CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr) diff --git a/tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py b/tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py index f185132..16f95f9 100644 --- a/tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py +++ b/tests/integration/executors/duckdb/test_ephemeral_inlining_integration.py @@ -6,7 +6,7 @@ from fastflowtransform.core import REGISTRY from fastflowtransform.dag import topo_sort -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.seeding import seed_project pytestmark = pytest.mark.duckdb # uses DuckDB diff --git 
a/tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py b/tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py index a2a4c9f..a66085a 100644 --- a/tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py +++ b/tests/integration/executors/duckdb/test_executor_meta_hook_duckdb.py @@ -10,7 +10,7 @@ from fastflowtransform.cli import app from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.meta import get_meta diff --git a/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py b/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py index a4871e3..f2e5a39 100644 --- a/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py +++ b/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py @@ -5,7 +5,7 @@ import pytest from fastflowtransform.core import Node -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.integration diff --git a/tests/integration/executors/duckdb/test_materializations_integration.py b/tests/integration/executors/duckdb/test_materializations_integration.py index 18e8e32..f449c65 100644 --- a/tests/integration/executors/duckdb/test_materializations_integration.py +++ b/tests/integration/executors/duckdb/test_materializations_integration.py @@ -4,7 +4,7 @@ import pytest from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.duckdb diff --git a/tests/integration/executors/duckdb/test_python_dependency_loading_integration.py b/tests/integration/executors/duckdb/test_python_dependency_loading_integration.py index 95e55a6..eb5bf56 100644 --- 
a/tests/integration/executors/duckdb/test_python_dependency_loading_integration.py +++ b/tests/integration/executors/duckdb/test_python_dependency_loading_integration.py @@ -2,12 +2,12 @@ import pytest from fastflowtransform.core import REGISTRY, Node -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.duckdb @pytest.mark.integration -def test_duckdb_executor_dep_loading_unit(tmp_path): +def test_duckdbutor_dep_loading_unit(tmp_path): ex = DuckExecutor() con = ex.con con.execute("create table users as select 1::int as id, 'a@example.com'::varchar as email") diff --git a/tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py b/tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py index 43cf4d5..d43b115 100644 --- a/tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py +++ b/tests/integration/executors/duckdb/test_python_model_materialized_view_integration.py @@ -4,7 +4,7 @@ import pytest from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.duckdb diff --git a/tests/integration/executors/test_databricks_spark_exec_integration.py b/tests/integration/executors/test_databricks_spark_exec_integration.py index cd5b7b3..d9e6016 100644 --- a/tests/integration/executors/test_databricks_spark_exec_integration.py +++ b/tests/integration/executors/test_databricks_spark_exec_integration.py @@ -1,4 +1,4 @@ -# tests/integration/executors/test_databricks_spark_exec_integration.py +# tests/integration/executors/test_databricks_spark_integration.py from __future__ import annotations from pathlib import Path @@ -9,7 +9,7 @@ from fastflowtransform.core import Node # noqa: E402 from fastflowtransform.errors import ModelExecutionError # noqa: E402 -from 
fastflowtransform.executors.databricks_spark_exec import DatabricksSparkExecutor # noqa: E402 +from fastflowtransform.executors.databricks_spark import DatabricksSparkExecutor # noqa: E402 @pytest.mark.integration diff --git a/tests/integration/meta/test_meta_duckdb_integration.py b/tests/integration/meta/test_meta_duckdb_integration.py index b843720..7ce1188 100644 --- a/tests/integration/meta/test_meta_duckdb_integration.py +++ b/tests/integration/meta/test_meta_duckdb_integration.py @@ -5,7 +5,7 @@ import pytest -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.meta import ensure_meta_table, get_meta, relation_exists, upsert_meta diff --git a/tests/integration/schema_loader/test_schema_yaml_basic_integration.py b/tests/integration/schema_loader/test_schema_yaml_basic_integration.py index 18a7d08..3c491d3 100644 --- a/tests/integration/schema_loader/test_schema_yaml_basic_integration.py +++ b/tests/integration/schema_loader/test_schema_yaml_basic_integration.py @@ -4,7 +4,7 @@ from fastflowtransform.cli.test_cmd import _apply_legacy_tag_filter, _run_dq_tests from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.schema_loader import load_schema_tests diff --git a/tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py b/tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py index b8fe817..d80d454 100644 --- a/tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py +++ b/tests/integration/schema_loader/test_schema_yaml_registry_mix_integration.py @@ -4,7 +4,7 @@ from fastflowtransform.cli.test_cmd import _run_dq_tests from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb 
import DuckExecutor from fastflowtransform.schema_loader import load_schema_tests diff --git a/tests/integration/test_artifacts_integration.py b/tests/integration/test_artifacts_integration.py index 5d28dff..5f78c82 100644 --- a/tests/integration/test_artifacts_integration.py +++ b/tests/integration/test_artifacts_integration.py @@ -10,7 +10,7 @@ write_run_results, ) from fastflowtransform.core import REGISTRY -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.integration diff --git a/tests/integration/testing/registry/test_dispatch_integration.py b/tests/integration/testing/registry/test_dispatch_integration.py index f09be81..edf350f 100644 --- a/tests/integration/testing/registry/test_dispatch_integration.py +++ b/tests/integration/testing/registry/test_dispatch_integration.py @@ -1,6 +1,6 @@ import pytest -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.testing.registry import TESTS diff --git a/tests/unit/cli/test_bootstrap_unit.py b/tests/unit/cli/test_bootstrap_unit.py index cadbd76..ef79b62 100644 --- a/tests/unit/cli/test_bootstrap_unit.py +++ b/tests/unit/cli/test_bootstrap_unit.py @@ -177,9 +177,14 @@ def __init__(self, *a, **k): def run_python(self, *a, **k): pass - # patch BOTH BF and normal - code branches on use_bigframes - monkeypatch.setattr(bootstrap, "BigQueryExecutor", _FakeBQExec, raising=True) - monkeypatch.setattr(bootstrap, "BigQueryBFExecutor", _FakeBQExec, raising=True) + original_import = bootstrap._import_optional + + def _fake_import(module_path: str, attr: str, *, extra: str | None = None): + if attr in {"BigQueryExecutor", "BigQueryBFExecutor"}: + return _FakeBQExec + return original_import(module_path, attr, extra=extra) + + monkeypatch.setattr(bootstrap, "_import_optional", _fake_import, raising=True) prof = 
fake_bigquery_profile(use_bigframes=False) jenv = Environment() @@ -202,7 +207,14 @@ def __init__(self, db_path: str, schema: str | None = None, catalog: str | None def run_python(self, *a, **k): pass - monkeypatch.setattr(bootstrap, "DuckExecutor", _FakeDuckExec, raising=True) + original_import = bootstrap._import_optional + + def _fake_import(module_path: str, attr: str, *, extra: str | None = None): + if attr == "DuckExecutor": + return _FakeDuckExec + return original_import(module_path, attr, extra=extra) + + monkeypatch.setattr(bootstrap, "_import_optional", _fake_import, raising=True) prof = fake_duckdb_profile(path=str(tmp_path / "test.duckdb"), schema="demo", catalog="demo") jenv = Environment() diff --git a/tests/unit/cli/test_sync_db_comments_unit.py b/tests/unit/cli/test_sync_db_comments_unit.py index 8946467..fc79ee8 100644 --- a/tests/unit/cli/test_sync_db_comments_unit.py +++ b/tests/unit/cli/test_sync_db_comments_unit.py @@ -71,7 +71,7 @@ def test_sync_comments_postgres_dry_run(capsys): @pytest.mark.unit -def test_sync_comments_postgres_executes_on_engine(capsys): +def test_sync_comments_postgresutes_on_engine(capsys): # fake sqlalchemy engine fake_conn = MagicMock() fake_engine = MagicMock() @@ -84,10 +84,8 @@ def test_sync_comments_postgres_executes_on_engine(capsys): mod._sync_comments_postgres(fake_exec, intents, schema="public", dry_run=False) - # sollte genau 1 statement ausfΓΌhren assert fake_conn.execute.call_count == 1 - stmt_arg = fake_conn.execute.call_args[0][0] # sa_text(...) - # sqlalchemy.text hat .text oder .textual? + stmt_arg = fake_conn.execute.call_args[0][0] assert 'COMMENT ON TABLE "public"."users" IS \'Users table\';' in str(stmt_arg) out = capsys.readouterr().out @@ -153,20 +151,13 @@ def test_sync_comments_snowflake_with_execute_method(): @pytest.mark.unit def test_sync_db_comments_no_intents_exits(monkeypatch): - """ - Fall: es gibt gar keine Descriptions -> sofort Exit(0) mit gelb. 
- """ - # fake context fake_ctx = SimpleNamespace( project=Path("."), profile=SimpleNamespace(engine="postgres", postgres=SimpleNamespace(db_schema="public")), make_executor=lambda: (MagicMock(), None, None), ) - # REGISTRY ohne Nodes monkeypatch.setattr(mod, "REGISTRY", SimpleNamespace(nodes={})) - # docs metadata -> leer monkeypatch.setattr(mod, "read_docs_metadata", lambda _: {}) - # keine Spalten gefunden monkeypatch.setattr(mod, "_collect_columns", lambda _: {}) monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) @@ -178,7 +169,6 @@ def test_sync_db_comments_no_intents_exits(monkeypatch): @pytest.mark.unit def test_sync_db_comments_postgres_path(monkeypatch): - # 1) Kontext vorbereiten fake_exec = MagicMock() fake_ctx = SimpleNamespace( project=Path("."), @@ -187,14 +177,11 @@ def test_sync_db_comments_postgres_path(monkeypatch): ) monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) - # 2) Registry mit einem Node fake_node = SimpleNamespace(name="users.ff") monkeypatch.setattr(mod, "REGISTRY", SimpleNamespace(nodes={"users.ff": fake_node})) - # 3) relation_for -> "users" monkeypatch.setattr(mod, "relation_for", lambda name: "users") - # 4) docs metadata: model-beschreibung + column-beschreibung monkeypatch.setattr( mod, "read_docs_metadata", @@ -206,11 +193,9 @@ def test_sync_db_comments_postgres_path(monkeypatch): }, ) - # 5) _collect_columns: table "users" has column "id" col = SimpleNamespace(name="id") monkeypatch.setattr(mod, "_collect_columns", lambda _: {"users": [col]}) - # 6) _sync_comments_postgres beobachten called = {} def fake_sync_pg(execu, intents, schema, dry_run): @@ -228,7 +213,6 @@ def fake_sync_pg(execu, intents, schema, dry_run): # Assertions assert called["execu"] is fake_exec assert called["schema"] == "public" - # wir erwarten 2 intents: table + column kinds = {i["kind"] for i in called["intents"]} assert kinds == {"table", "column"} @@ -288,7 +272,6 @@ def 
test_sync_db_comments_unsupported_engine(monkeypatch, capsys): ) monkeypatch.setattr(mod, "_prepare_context", lambda *a, **k: fake_ctx) - # mindestens ein Node, sonst wΓΌrden wir vorher returnen monkeypatch.setattr(mod, "REGISTRY", SimpleNamespace(nodes={"n": SimpleNamespace(name="n")})) monkeypatch.setattr(mod, "relation_for", lambda name: "N") monkeypatch.setattr( diff --git a/tests/unit/docs/test_docs_unit.py b/tests/unit/docs/test_docs_unit.py index 1ad9cff..4eef8a1 100644 --- a/tests/unit/docs/test_docs_unit.py +++ b/tests/unit/docs/test_docs_unit.py @@ -407,7 +407,6 @@ def begin(self): @pytest.mark.unit def test_columns_snowflake_collects_from_session(): - # snowflake .collect() liefert list[Row], aber wir kΓΆnnen dicts nehmen class FakeDF: def __init__(self, rows): self._rows = rows diff --git a/tests/unit/executors/test_bigquery_bf_exec_unit.py b/tests/unit/executors/test_bigquery_bf_exec_unit.py index 9275053..74b6189 100644 --- a/tests/unit/executors/test_bigquery_bf_exec_unit.py +++ b/tests/unit/executors/test_bigquery_bf_exec_unit.py @@ -5,7 +5,7 @@ import types from pathlib import Path from types import SimpleNamespace -from typing import Any, ClassVar +from typing import Any, ClassVar, cast import pytest from tests.common.mock.bigquery import ( @@ -17,9 +17,11 @@ install_fake_bigquery, ) -import fastflowtransform.executors._bigquery_mixin as bq_mix_mod -import fastflowtransform.executors.bigquery_bf_exec as bq_exec_mod +import fastflowtransform.executors.bigquery._bigquery_mixin as bq_mix_mod +import fastflowtransform.executors.bigquery.base as bq_base_mod +import fastflowtransform.executors.bigquery.bigframes as bq_exec_mod from fastflowtransform.core import Node +from fastflowtransform.executors.base import BaseExecutor # ---------------------- BigFrames-Fakes ------------------------------------ @@ -45,7 +47,7 @@ def read_gbq(self, table_id: str) -> Any: @pytest.fixture def bq_exec(monkeypatch): - _ = install_fake_bigquery(monkeypatch, 
[bq_exec_mod, bq_mix_mod]) + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod, bq_base_mod]) fake_bigframes = types.ModuleType("bigframes") fake_conf = types.ModuleType("bigframes._config") @@ -67,9 +69,6 @@ def bq_exec(monkeypatch): ex.client.add_dataset("p1.ds1") - monkeypatch.setattr(bq_exec_mod, "NotFound", FakeNotFound, raising=True) - monkeypatch.setattr(bq_exec_mod, "BadRequest", FakeBadRequest, raising=True) - return ex @@ -123,6 +122,65 @@ def to_gbq(self, table_id, if_exists="replace"): assert called["if_exists"] == "replace" +@pytest.mark.unit +@pytest.mark.bigquery +def test_ensure_dataset_respects_flag(monkeypatch): + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod, bq_base_mod]) + + fake_bigframes = types.ModuleType("bigframes") + fake_conf = types.ModuleType("bigframes._config") + fake_conf_bq = types.ModuleType("bigframes._config.bigquery_options") + fake_conf_bq.BigQueryOptions = _FakeBigQueryOptions # type: ignore[attr-defined] + fake_bigframes.Session = _FakeBFSession # type: ignore[attr-defined] + sys.modules.setdefault("bigframes", fake_bigframes) + sys.modules.setdefault("bigframes._config", fake_conf) + sys.modules["bigframes._config.bigquery_options"] = fake_conf_bq + monkeypatch.setattr(bq_exec_mod, "bigframes", fake_bigframes, raising=True) + + fake_client = FakeClient(project="p1", location="EU") + ex = bq_exec_mod.BigQueryBFExecutor( + project="p1", + dataset="ds_missing", + location="EU", + allow_create_dataset=False, + ) + # inject the fake client after construction (session uses fake bigframes) + ex.client = cast(Any, fake_client) + + with pytest.raises(FakeNotFound): + ex._ensure_dataset() + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_ensure_dataset_creates_when_allowed(monkeypatch): + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod, bq_base_mod]) + + fake_bigframes = types.ModuleType("bigframes") + fake_conf = types.ModuleType("bigframes._config") + fake_conf_bq = 
types.ModuleType("bigframes._config.bigquery_options") + fake_conf_bq.BigQueryOptions = _FakeBigQueryOptions # type: ignore[attr-defined] + fake_bigframes.Session = _FakeBFSession # type: ignore[attr-defined] + sys.modules.setdefault("bigframes", fake_bigframes) + sys.modules.setdefault("bigframes._config", fake_conf) + sys.modules["bigframes._config.bigquery_options"] = fake_conf_bq + monkeypatch.setattr(bq_exec_mod, "bigframes", fake_bigframes, raising=True) + + fake_client = FakeClient(project="p1", location="EU") + ex = bq_exec_mod.BigQueryBFExecutor( + project="p1", + dataset="ds_new", + location="EU", + allow_create_dataset=True, + ) + ex.client = cast(Any, fake_client) + + ex._ensure_dataset() + ds_id = "p1.ds_new" + assert ds_id in fake_client._datasets + assert fake_client.get_dataset(ds_id).location == "EU" + + @pytest.mark.unit @pytest.mark.bigquery def test_materialize_relation_fallback_to_materialize(bq_exec): @@ -341,8 +399,8 @@ def fake_ensure(ex): def fake_upsert(ex, name, rel, fp, eng): called["upsert"] += 1 - monkeypatch.setattr(bq_exec_mod, "ensure_meta_table", fake_ensure) - monkeypatch.setattr(bq_exec_mod, "upsert_meta", fake_upsert) + monkeypatch.setattr(bq_base_mod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(bq_base_mod, "upsert_meta", fake_upsert) bq_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), "p1.ds1.m", "fp123") @@ -355,12 +413,10 @@ def fake_upsert(ex, name, rel, fp, eng): def test_bf_apply_sql_materialization_calls_super(monkeypatch, bq_exec): monkeypatch.setattr(bq_exec, "_ensure_dataset", lambda: None, raising=True) - import fastflowtransform.executors.bigquery_bf_exec as bq_bf_mod # noqa PLC0415 - called: dict[str, str] = {} monkeypatch.setattr( - bq_bf_mod.BaseExecutor, + BaseExecutor, "_apply_sql_materialization", lambda self, node, target_sql, select_body, materialization: called.update( { @@ -389,7 +445,7 @@ def test_bf_apply_sql_materialization_calls_super(monkeypatch, bq_exec): 
@pytest.mark.bigquery def test_apply_sql_materialization_wraps_badrequest(monkeypatch, bq_exec): monkeypatch.setattr( - bq_exec_mod.BaseExecutor, + BaseExecutor, "_apply_sql_materialization", lambda *a, **k: (_ for _ in ()).throw(FakeBadRequest("bad SQL")), raising=True, diff --git a/tests/unit/executors/test_bigquery_exec_unit.py b/tests/unit/executors/test_bigquery_exec_unit.py index f0d5455..a4dc33d 100644 --- a/tests/unit/executors/test_bigquery_exec_unit.py +++ b/tests/unit/executors/test_bigquery_exec_unit.py @@ -17,9 +17,11 @@ install_fake_bigquery, ) -import fastflowtransform.executors._bigquery_mixin as bq_mix_mod -import fastflowtransform.executors.bigquery_exec as bq_exec_mod +import fastflowtransform.executors.bigquery._bigquery_mixin as bq_mix_mod +import fastflowtransform.executors.bigquery.base as bq_base_mod +import fastflowtransform.executors.bigquery.pandas as bq_exec_mod from fastflowtransform.core import Node +from fastflowtransform.executors.base import BaseExecutor @pytest.fixture @@ -167,17 +169,53 @@ def test_format_source_reference(bq_exec): assert "src_tbl" in ref +@pytest.mark.unit +@pytest.mark.bigquery +def test_ensure_dataset_respects_flag(monkeypatch): + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod]) + fake_client = FakeClient(project="p1", location="EU") + + ex = bq_exec_mod.BigQueryExecutor( + project="p1", + dataset="ds_missing", + location="EU", + client=cast(Any, fake_client), + allow_create_dataset=False, + ) + + with pytest.raises(FakeNotFound): + ex._ensure_dataset() + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_ensure_dataset_creates_when_allowed(monkeypatch): + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod]) + fake_client = FakeClient(project="p1", location="EU") + + ex = bq_exec_mod.BigQueryExecutor( + project="p1", + dataset="ds_new", + location="EU", + client=cast(Any, fake_client), + allow_create_dataset=True, + ) + + ex._ensure_dataset() + ds_id = "p1.ds_new" + assert ds_id 
in fake_client._datasets + assert fake_client.get_dataset(ds_id).location == "EU" + + @pytest.mark.unit @pytest.mark.bigquery def test_apply_sql_materialization_calls_super_and_ensures_dataset(monkeypatch, bq_exec): monkeypatch.setattr(bq_exec, "_ensure_dataset", lambda: None, raising=True) - import fastflowtransform.executors.bigquery_exec as bq_exec_mod # noqa PLC0415 - called: dict[str, str] = {} monkeypatch.setattr( - bq_exec_mod.BaseExecutor, + BaseExecutor, "_apply_sql_materialization", lambda self, node, target_sql, select_body, materialization: called.update( { @@ -208,7 +246,7 @@ def test_apply_sql_materialization_calls_super_and_ensures_dataset(monkeypatch, @pytest.mark.bigquery def test_apply_sql_materialization_wraps_badrequest(monkeypatch, bq_exec): monkeypatch.setattr( - bq_exec_mod.BaseExecutor, + BaseExecutor, "_apply_sql_materialization", lambda *a, **k: (_ for _ in ()).throw(FakeBadRequest("bq exploded")), raising=True, @@ -269,8 +307,8 @@ def fake_ensure(ex): def fake_upsert(ex, name, rel, fp, eng): called["upsert"] += 1 - monkeypatch.setattr(bq_exec_mod, "ensure_meta_table", fake_ensure) - monkeypatch.setattr(bq_exec_mod, "upsert_meta", fake_upsert) + monkeypatch.setattr(bq_base_mod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(bq_base_mod, "upsert_meta", fake_upsert) bq_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), "p1.ds1.m", "fp123") diff --git a/tests/unit/executors/test_databricks_spark_exec_unit.py b/tests/unit/executors/test_databricks_spark_exec_unit.py index addcbe1..54dea8b 100644 --- a/tests/unit/executors/test_databricks_spark_exec_unit.py +++ b/tests/unit/executors/test_databricks_spark_exec_unit.py @@ -1,4 +1,4 @@ -# tests/unit/executors/test_databricks_spark_exec_unit.py +# tests/unit/executors/test_databricks_spark_unit.py from __future__ import annotations from pathlib import Path @@ -8,8 +8,8 @@ import pytest from fastflowtransform.core import REGISTRY, Node -from fastflowtransform.executors import 
databricks_spark_exec as mod -from fastflowtransform.executors.databricks_spark_exec import ( +from fastflowtransform.executors import databricks_spark as mod +from fastflowtransform.executors.databricks_spark import ( _SparkConnShim, _split_db_table, ) @@ -347,7 +347,6 @@ def test_validate_required_no_requires_is_noop(exec_minimal): @pytest.mark.unit @pytest.mark.databricks_spark def test_materialize_relation_rejects_non_frame(exec_minimal, monkeypatch): - # fΓΌr diesen Test brauchen wir das echte Verhalten monkeypatch.setattr(exec_minimal, "_is_frame", lambda obj: False) node = Node(name="x", kind="python", path=Path(".")) with pytest.raises(TypeError, match="Spark model must return a Spark DataFrame"): @@ -412,7 +411,7 @@ def test_storage_meta_prefers_node_storage(exec_minimal): @pytest.mark.unit @pytest.mark.databricks_spark def test_storage_meta_uses_global_lookup_when_node_empty(exec_minimal): - with patch("fastflowtransform.executors.databricks_spark_exec.storage.get_model_storage") as gm: + with patch("fastflowtransform.executors.databricks_spark.storage.get_model_storage") as gm: gm.return_value = {"path": "/tmp/global"} meta = exec_minimal._storage_meta(None, "some_relation") assert meta == {"path": "/tmp/global"} @@ -432,7 +431,7 @@ def test_storage_meta_falls_back_to_registry_scan(exec_minimal, monkeypatch): REGISTRY.nodes = {"orders.ff": reg_node} # 2) relation_for(...) 
so patchen, dass es "orders" ergibt - with patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for: + with patch("fastflowtransform.executors.databricks_spark.relation_for") as rel_for: rel_for.return_value = "orders" meta = exec_minimal._storage_meta(None, "orders") @@ -452,8 +451,8 @@ def test_storage_meta_registry_scan_then_global(exec_minimal, monkeypatch): REGISTRY.nodes = {"orders.ff": reg_node} with ( - patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for, - patch("fastflowtransform.executors.databricks_spark_exec.storage.get_model_storage") as gm, + patch("fastflowtransform.executors.databricks_spark.relation_for") as rel_for, + patch("fastflowtransform.executors.databricks_spark.storage.get_model_storage") as gm, ): rel_for.return_value = "orders" gm.return_value = {"path": "/tmp/from_global"} @@ -471,7 +470,7 @@ def test_format_relation_for_ref_iceberg(exec_minimal): exec_minimal.spark.catalog.currentDatabase.return_value = "demo" exec_minimal._format_handler = IcebergFormatHandler(exec_minimal.spark) - with patch("fastflowtransform.executors.databricks_spark_exec.relation_for") as rel_for: + with patch("fastflowtransform.executors.databricks_spark.relation_for") as rel_for: rel_for.return_value = "events_base" out = exec_minimal._format_relation_for_ref("events_base.ff") diff --git a/tests/unit/executors/test_duckdb_exec_unit.py b/tests/unit/executors/test_duckdb_exec_unit.py index e8446d1..cff4e89 100644 --- a/tests/unit/executors/test_duckdb_exec_unit.py +++ b/tests/unit/executors/test_duckdb_exec_unit.py @@ -7,7 +7,7 @@ import pytest from fastflowtransform.core import Node -from fastflowtransform.executors.duckdb_exec import DuckExecutor, _q +from fastflowtransform.executors.duckdb import DuckExecutor, _q @pytest.fixture @@ -185,7 +185,7 @@ def fake_upsert(ex: Any, name: str, rel: str, fp: str, eng: str) -> None: called["upsert"] += 1 # patch the functions used in on_node_built - import 
fastflowtransform.executors.duckdb_exec as duck_mod # noqa PLC0415 + import fastflowtransform.executors.duckdb as duck_mod # noqa PLC0415 monkeypatch.setattr(duck_mod, "ensure_meta_table", fake_ensure, raising=True) monkeypatch.setattr(duck_mod, "upsert_meta", fake_upsert, raising=True) diff --git a/tests/unit/executors/test_postgres_exec_unit.py b/tests/unit/executors/test_postgres_exec_unit.py index b54a1a9..37e09a8 100644 --- a/tests/unit/executors/test_postgres_exec_unit.py +++ b/tests/unit/executors/test_postgres_exec_unit.py @@ -1,4 +1,4 @@ -# tests/unit/executors/test_postgres_exec_unit.py +# tests/unit/executors/test_postgres_unit.py from __future__ import annotations from pathlib import Path @@ -8,10 +8,10 @@ import pytest # Wichtig: wir testen genau dieses Modul -import fastflowtransform.executors.postgres_exec as pgmod +import fastflowtransform.executors.postgres as pgmod from fastflowtransform.core import Node from fastflowtransform.errors import ModelExecutionError, ProfileConfigError -from fastflowtransform.executors.postgres_exec import PostgresExecutor +from fastflowtransform.executors.postgres import PostgresExecutor # --------------------------------------------------------------------------- # Hilfs-Fakes diff --git a/tests/unit/executors/test_shims_unit.py b/tests/unit/executors/test_shims_unit.py index 79b880a..e8549da 100644 --- a/tests/unit/executors/test_shims_unit.py +++ b/tests/unit/executors/test_shims_unit.py @@ -6,7 +6,6 @@ from typing import Any, cast import pytest -from google.cloud.bigquery import Client from sqlalchemy import text as sa_text from sqlalchemy.engine import Engine @@ -15,6 +14,7 @@ SAConnShim, _rewrite_pg_create_or_replace_table, ) +from fastflowtransform.typing import Client # --------------------------------------------------------------------------- # Fakes / helpers @@ -180,19 +180,33 @@ def test_sa_shim_executes_sqlalchemy_clauseelement(): def test_bq_shim_executes_single_sql(): calls: dict[str, Any] = {} + class 
FakeJob: + def __init__(self) -> None: + self.result_called = False + + def result(self): + self.result_called = True + # Simulate a RowIterator; list is fine for the wrapper. + return ["ROW-1"] + class FakeClient: def query(self, sql: str, location: str | None = None): calls["sql"] = sql calls["location"] = location - return "JOB-1" + return FakeJob() fake = FakeClient() shim = BigQueryConnShim(cast(Client, fake), location="EU") res = shim.execute("SELECT 1") - assert res == "JOB-1" + # Shim now returns a cursor-like wrapper + assert isinstance(res, BigQueryConnShim._ResultWrapper) assert calls["sql"] == "SELECT 1" - assert calls["location"] == "EU" + # We don't pass location into client.query, so it should be None. + assert calls["location"] is None + # And fetchone() should give the first row. + assert res.fetchone() == "ROW-1" + assert res.fetchone() is None @pytest.mark.unit @@ -214,14 +228,13 @@ def query(self, sql: str, location: str | None = None): # should have executed all assert seen == ["SELECT 1", "SELECT 2", "SELECT 3"] - # and returned the last job - assert isinstance(res, FakeJob) + # and returned a cursor-like wrapper over the last result + assert isinstance(res, BigQueryConnShim._ResultWrapper) def test_bq_shim_raises_on_unsupported_type(): fake_client = SimpleNamespace(query=lambda *a, **k: None) - # fΓΌr den Typchecker so tun, als wΓ€re es ein echter Client shim = BigQueryConnShim(client=cast(Client, fake_client)) with pytest.raises(TypeError): diff --git a/tests/unit/executors/test_snowflake_snowpark_exec.py b/tests/unit/executors/test_snowflake_snowpark_exec.py index c19bc61..6cf2434 100644 --- a/tests/unit/executors/test_snowflake_snowpark_exec.py +++ b/tests/unit/executors/test_snowflake_snowpark_exec.py @@ -1,4 +1,4 @@ -# tests/unit/executors/test_snowflake_snowpark_exec_unit.py +# tests/unit/executors/test_snowflake_snowpark_unit.py from __future__ import annotations import sys @@ -8,10 +8,9 @@ from typing import Any import pytest -import 
snowflake.snowpark as sf -import fastflowtransform.executors.snowflake_snowpark_exec as sf_mod -from fastflowtransform.executors.snowflake_snowpark_exec import _SFResult +import fastflowtransform.executors.snowflake_snowpark as sf_mod +from fastflowtransform.executors.snowflake_snowpark import _SFResult # --------------------------------------------------------------------------- # 1) Install a fake snowflake.snowpark BEFORE importing the executor module @@ -131,13 +130,14 @@ def create(self) -> FakeSession: fake_sf_snowpark_mod.DataFrame = FakeSnowparkDataFrame # type: ignore[attr-defined] sys.modules["snowflake.snowpark"] = fake_sf_snowpark_mod +sf = fake_sf_snowpark_mod # --------------------------------------------------------------------------- # 2) Now we can safely import the module under test # --------------------------------------------------------------------------- -import fastflowtransform.executors.snowflake_snowpark_exec as sf_exec_mod # noqa: E402 +import fastflowtransform.executors.snowflake_snowpark as sf_exec_mod # noqa: E402 from fastflowtransform.core import Node # noqa: E402 -from fastflowtransform.executors.snowflake_snowpark_exec import ( # noqa: E402 +from fastflowtransform.executors.snowflake_snowpark import ( # noqa: E402 SnowflakeSnowparkExecutor, _SFCursorShim, ) diff --git a/tests/unit/render/test_this_proxy_unit.py b/tests/unit/render/test_this_proxy_unit.py index ea575c4..a4ec945 100644 --- a/tests/unit/render/test_this_proxy_unit.py +++ b/tests/unit/render/test_this_proxy_unit.py @@ -5,7 +5,7 @@ from jinja2 import Environment from fastflowtransform.core import Node -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor @pytest.mark.unit diff --git a/tests/unit/render/test_this_relation_unit.py b/tests/unit/render/test_this_relation_unit.py index 3536756..71d0efe 100644 --- a/tests/unit/render/test_this_relation_unit.py +++ 
b/tests/unit/render/test_this_relation_unit.py @@ -3,7 +3,7 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape from fastflowtransform.core import Node -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor def _env_for_tests() -> Environment: @@ -17,18 +17,16 @@ def _env_for_tests() -> Environment: @pytest.mark.unit def test_this_renders_physical_relation(tmp_path): - # Arrange: Minimal SQL-Model, das nur `{{ this }}` rendert sql_path = tmp_path / "m.ff.sql" sql_path.write_text("select '{{ this }}' as rel\n", encoding="utf-8") node = Node(name="m.ff", kind="sql", path=sql_path) env = _env_for_tests() - ex = DuckExecutor() # nur fΓΌr render_sql(), DB wird nicht genutzt + ex = DuckExecutor() # Act rendered = ex.render_sql(node, env).strip() # Assert - # Erwartet: physischer Name ist 'm' (relation_for("m.ff")) assert rendered.lower() == "select 'm' as rel" diff --git a/tests/unit/test_utest_unit.py b/tests/unit/test_utest_unit.py index 9dfb925..1319616 100644 --- a/tests/unit/test_utest_unit.py +++ b/tests/unit/test_utest_unit.py @@ -78,7 +78,6 @@ def test_extract_defaults_inputs_missing_returns_empty(): @pytest.mark.unit def test_fingerprint_case_inputs_merges_defaults_and_case(tmp_path, monkeypatch): - # wir brauchen einen existierenden CSV-Pfad fΓΌr die Hash-Pfade csv_file = tmp_path / "seed.csv" csv_file.write_text("id,name\n1,A\n", encoding="utf-8") @@ -89,9 +88,7 @@ def test_fingerprint_case_inputs_merges_defaults_and_case(tmp_path, monkeypatch) ) case = SimpleNamespace( inputs={ - # ΓΌberschreibt defaults.src "src": {"rows": [{"id": 2}]}, - # neue relation via CSV "dim": {"csv": "seed.csv"}, } ) @@ -323,7 +320,6 @@ def test_discover_unit_specs_basic(tmp_path, fake_registry): s = specs[0] assert s.model == "model_a" assert len(s.cases) == 1 - # merge muss greifen: expect.rows aus case ΓΌberschreibt defaults assert s.cases[0].expect["rows"] == [{"id": 2}] @@ -344,25 
+340,21 @@ def test_discover_unit_specs_only_model_filter(tmp_path, fake_registry): # --------------------------------------------------------------------------- -# _load_relation_from_rows (duckdb-pfad) +# _load_relation_from_rows (duckdb path) # --------------------------------------------------------------------------- @pytest.mark.unit @pytest.mark.duckdb -def test_load_relation_from_rows_duckdb(duckdb_executor): +def test_load_relation_from_rows_duckdb(duckdbutor): rows = [{"id": 1}, {"id": 2}] - # wir lassen unregister fehlschlagen, damit der Fallback getriggert wird - duckdb_executor.con.unregister.side_effect = Exception("no unregister in this version") + duckdbutor.con.unregister.side_effect = Exception("no unregister in this version") - utest._load_relation_from_rows(duckdb_executor, "tmp_tbl", rows) + utest._load_relation_from_rows(duckdbutor, "tmp_tbl", rows) - # register mit tmp-name - assert duckdb_executor.con.register.call_count == 1 - # er muss create or replace table ... 
ausfΓΌhren - executed_sqls = [c.args[0] for c in duckdb_executor.con.execute.call_args_list] + assert duckdbutor.con.register.call_count == 1 + executed_sqls = [c.args[0] for c in duckdbutor.con.execute.call_args_list] assert any("create or replace table" in sql.lower() for sql in executed_sqls) - # fallback drop view assert any("drop view if exists" in sql.lower() for sql in executed_sqls) @@ -373,7 +365,7 @@ def test_load_relation_from_rows_duckdb(duckdb_executor): @pytest.mark.unit @pytest.mark.duckdb -def test_load_relation_from_csv_calls_rows(monkeypatch, tmp_path, duckdb_executor): +def test_load_relation_from_csv_calls_rows(monkeypatch, tmp_path, duckdbutor): csv_path = tmp_path / "data.csv" csv_path.write_text("id,value\n1,a\n2,b\n", encoding="utf-8") @@ -385,7 +377,7 @@ def fake_rows(executor, rel, rows): monkeypatch.setattr(utest, "_load_relation_from_rows", fake_rows) - utest._load_relation_from_csv(duckdb_executor, "my_rel", csv_path) + utest._load_relation_from_csv(duckdbutor, "my_rel", csv_path) assert called["rel"] == "my_rel" expected_row_count = 2 @@ -400,15 +392,15 @@ def fake_rows(executor, rel, rows): @pytest.mark.unit @pytest.mark.duckdb -def test_read_result_duckdb(duckdb_executor): - df = utest._read_result(duckdb_executor, "some_table") +def test_read_result_duckdb(duckdbutor): + df = utest._read_result(duckdbutor, "some_table") assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["id"] @pytest.mark.unit @pytest.mark.postgres -def test_read_result_postgres(monkeypatch, postgres_executor): +def test_read_result_postgres(monkeypatch, postgresutor): # wir patchen pandas.read_sql_query, damit er keine DB braucht fake_df = pd.DataFrame([{"x": 1}]) @@ -417,7 +409,7 @@ def fake_read_sql(query, conn): monkeypatch.setattr(utest.pd, "read_sql_query", fake_read_sql) - df = utest._read_result(postgres_executor, "target_table") + df = utest._read_result(postgresutor, "target_table") assert isinstance(df, pd.DataFrame) assert list(df.columns) 
== ["x"] @@ -579,11 +571,11 @@ def test_fingerprint_case_and_maybe_skip(monkeypatch, tmp_path): @pytest.mark.unit @pytest.mark.duckdb -def test_execute_and_update_cache_success(fake_registry, duckdb_executor): +def test_execute_and_update_cache_success(fake_registry, duckdbutor): env_ctx = utest._make_env_ctx("duckdb") cache = MagicMock() ctx = utest.UtestCtx( - executor=duckdb_executor, + executor=duckdbutor, jenv=MagicMock(), engine_name="duckdb", env_ctx=env_ctx, @@ -598,12 +590,12 @@ def test_execute_and_update_cache_success(fake_registry, duckdb_executor): @pytest.mark.unit @pytest.mark.duckdb -def test_execute_and_update_cache_failure(fake_registry, duckdb_executor): +def test_execute_and_update_cache_failure(fake_registry, duckdbutor): # wir machen executor kaputt - duckdb_executor.run_sql = MagicMock(side_effect=RuntimeError("boom")) + duckdbutor.run_sql = MagicMock(side_effect=RuntimeError("boom")) env_ctx = utest._make_env_ctx("duckdb") ctx = utest.UtestCtx( - executor=duckdb_executor, + executor=duckdbutor, jenv=MagicMock(), engine_name="duckdb", env_ctx=env_ctx, @@ -623,10 +615,10 @@ def test_execute_and_update_cache_failure(fake_registry, duckdb_executor): @pytest.mark.unit @pytest.mark.duckdb -def test_read_and_assert_ok(fake_registry, duckdb_executor): +def test_read_and_assert_ok(fake_registry, duckdbutor): env_ctx = utest._make_env_ctx("duckdb") ctx = utest.UtestCtx( - executor=duckdb_executor, + executor=duckdbutor, jenv=MagicMock(), engine_name="duckdb", env_ctx=env_ctx, @@ -641,11 +633,11 @@ def test_read_and_assert_ok(fake_registry, duckdb_executor): @pytest.mark.unit @pytest.mark.duckdb -def test_read_and_assert_mismatch(fake_registry, duckdb_executor, monkeypatch): +def test_read_and_assert_mismatch(fake_registry, duckdbutor, monkeypatch): # actual ist id=1, expected ist id=2 -> mismatch env_ctx = utest._make_env_ctx("duckdb") ctx = utest.UtestCtx( - executor=duckdb_executor, + executor=duckdbutor, jenv=MagicMock(), engine_name="duckdb", 
env_ctx=env_ctx, @@ -665,7 +657,7 @@ def test_read_and_assert_mismatch(fake_registry, duckdb_executor, monkeypatch): @pytest.mark.unit @pytest.mark.duckdb -def test_run_unit_specs_happy(tmp_path, fake_registry, duckdb_executor, monkeypatch): +def test_run_unit_specs_happy(tmp_path, fake_registry, duckdbutor, monkeypatch): # wir bauen uns per Hand einen spec spec = utest.UnitSpec( model="model_a", @@ -682,5 +674,5 @@ def test_run_unit_specs_happy(tmp_path, fake_registry, duckdb_executor, monkeypa project_dir=tmp_path, ) # jenv ist hier egal - failures = utest.run_unit_specs([spec], duckdb_executor, jenv=MagicMock(), cache_mode="off") + failures = utest.run_unit_specs([spec], duckdbutor, jenv=MagicMock(), cache_mode="off") assert failures == 0 diff --git a/tests/unit/testing/test_accepted_values_unit.py b/tests/unit/testing/test_accepted_values_unit.py index 4e94e4e..aef12c5 100644 --- a/tests/unit/testing/test_accepted_values_unit.py +++ b/tests/unit/testing/test_accepted_values_unit.py @@ -1,6 +1,6 @@ import pytest -from fastflowtransform.executors.duckdb_exec import DuckExecutor +from fastflowtransform.executors.duckdb import DuckExecutor from fastflowtransform.testing.base import TestFailure, accepted_values diff --git a/uv.lock b/uv.lock index 9e1b983..b01fb07 100644 --- a/uv.lock +++ b/uv.lock @@ -736,27 +736,26 @@ name = "fastflowtransform" version = "0.5.15" source = { editable = "." 
} dependencies = [ - { name = "bigframes" }, - { name = "delta-spark" }, { name = "duckdb" }, - { name = "google-cloud-bigquery" }, { name = "httpx" }, { name = "jinja2" }, { name = "pandas" }, - { name = "psycopg", extra = ["binary"] }, - { name = "psycopg2-binary" }, { name = "pydantic" }, { name = "pydantic-settings" }, - { name = "pyspark" }, { name = "python-dotenv" }, { name = "pyyaml" }, - { name = "snowflake" }, - { name = "snowflake-snowpark-python" }, { name = "sqlalchemy" }, { name = "typer" }, ] [package.optional-dependencies] +bigquery = [ + { name = "google-cloud-bigquery" }, +] +bigquery-bf = [ + { name = "bigframes" }, + { name = "google-cloud-bigquery" }, +] dev = [ { name = "mypy" }, { name = "pandas-stubs" }, @@ -776,13 +775,39 @@ docs = [ { name = "mkdocstrings", extra = ["python"] }, { name = "pymdown-extensions" }, ] +full = [ + { name = "bigframes" }, + { name = "delta-spark" }, + { name = "google-cloud-bigquery" }, + { name = "psycopg", extra = ["binary"] }, + { name = "psycopg2-binary" }, + { name = "pyspark" }, + { name = "snowflake" }, + { name = "snowflake-snowpark-python" }, +] +postgres = [ + { name = "psycopg", extra = ["binary"] }, + { name = "psycopg2-binary" }, +] +snowflake = [ + { name = "snowflake" }, + { name = "snowflake-snowpark-python" }, +] +spark = [ + { name = "delta-spark" }, + { name = "pyspark" }, +] [package.metadata] requires-dist = [ - { name = "bigframes", specifier = ">=2.24.0" }, - { name = "delta-spark", specifier = ">=4.0.0" }, + { name = "bigframes", marker = "extra == 'bigquery-bf'", specifier = ">=2.24.0" }, + { name = "bigframes", marker = "extra == 'full'", specifier = ">=2.24.0" }, + { name = "delta-spark", marker = "extra == 'full'", specifier = ">=4.0.0" }, + { name = "delta-spark", marker = "extra == 'spark'", specifier = ">=4.0.0" }, { name = "duckdb", specifier = ">=1.0" }, - { name = "google-cloud-bigquery", specifier = ">=3.25" }, + { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", 
specifier = ">=3.25" }, + { name = "google-cloud-bigquery", marker = "extra == 'bigquery-bf'", specifier = ">=3.25" }, + { name = "google-cloud-bigquery", marker = "extra == 'full'", specifier = ">=3.25" }, { name = "httpx", specifier = ">=0.28.1" }, { name = "jinja2", specifier = ">=3.1" }, { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.6" }, @@ -796,24 +821,29 @@ requires-dist = [ { name = "pandas", specifier = ">=2.0" }, { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.1" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = "==3.*" }, - { name = "psycopg", extras = ["binary"], specifier = ">=3.1" }, - { name = "psycopg2-binary", specifier = ">=2.9" }, + { name = "psycopg", extras = ["binary"], marker = "extra == 'full'", specifier = ">=3.1" }, + { name = "psycopg", extras = ["binary"], marker = "extra == 'postgres'", specifier = ">=3.1" }, + { name = "psycopg2-binary", marker = "extra == 'full'", specifier = ">=2.9" }, + { name = "psycopg2-binary", marker = "extra == 'postgres'", specifier = ">=2.9" }, { name = "pydantic", specifier = ">=2.8" }, { name = "pydantic-settings", specifier = ">=2.4" }, { name = "pymdown-extensions", marker = "extra == 'docs'", specifier = ">=10.0" }, - { name = "pyspark", specifier = ">=4.0.1" }, + { name = "pyspark", marker = "extra == 'full'", specifier = ">=4.0.1" }, + { name = "pyspark", marker = "extra == 'spark'", specifier = ">=4.0.1" }, { name = "pytest", marker = "extra == 'dev'", specifier = "==8.4.*" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==7.0.*" }, { name = "python-dotenv", specifier = ">=1.0" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = "==0.14.*" }, - { name = "snowflake", specifier = ">=1.8.0" }, - { name = "snowflake-snowpark-python", specifier = ">=1.40.0" }, + { name = "snowflake", marker = "extra == 'full'", specifier = ">=1.8.0" }, + { name = "snowflake", marker = "extra == 
'snowflake'", specifier = ">=1.8.0" }, + { name = "snowflake-snowpark-python", marker = "extra == 'full'", specifier = ">=1.40.0" }, + { name = "snowflake-snowpark-python", marker = "extra == 'snowflake'", specifier = ">=1.40.0" }, { name = "sqlalchemy", specifier = ">=2.0" }, { name = "typer", specifier = ">=0.12" }, { name = "types-pyyaml", marker = "extra == 'dev'", specifier = ">=6.0.12" }, ] -provides-extras = ["dev", "docs"] +provides-extras = ["postgres", "bigquery", "bigquery-bf", "spark", "snowflake", "full", "dev", "docs"] [[package]] name = "filelock"