From dd22ca5a60e5824b61386c33f465d18d1f9e8dda Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Thu, 20 Nov 2025 20:23:52 +0100 Subject: [PATCH] Updated api.http caching + added snapshots + snapshot demo --- .gitignore | 1 + Makefile | 1 + Makefile.built | 35 ++ docs/Profiles.md | 8 +- docs/Snapshots.md | 378 ++++++++++++++++++ docs/Technical_Overview.md | 19 +- docs/examples/Snapshot_Demo.md | 255 ++++++++++++ docs/index.md | 130 +++--- .../.env.dev_bigquery_bigframes | 0 .../.env.dev_bigquery_pandas | 0 examples/snapshot_demo/.env.dev_databricks | 16 + .../snapshot_demo/.env.dev_databricks_delta | 16 + .../snapshot_demo/.env.dev_databricks_iceberg | 13 + examples/snapshot_demo/.env.dev_duckdb | 2 + examples/snapshot_demo/.env.dev_postgres | 3 + examples/snapshot_demo/.env.dev_snowflake | 18 + examples/snapshot_demo/Makefile | 138 +++++++ examples/snapshot_demo/README.md | 36 ++ examples/snapshot_demo/models/README.md | 20 + .../models/marts/mart_users_by_domain.ff.sql | 28 ++ .../mart_users_by_domain_snapshot.ff.sql | 24 ++ .../snapshots/users_clean_snapshot.ff.sql | 24 ++ .../models/staging/users_clean.ff.sql | 27 ++ examples/snapshot_demo/profiles.yml | 70 ++++ examples/snapshot_demo/project.yml | 119 ++++++ examples/snapshot_demo/seeds/README.md | 4 + examples/snapshot_demo/seeds/seed_users.csv | 4 + examples/snapshot_demo/sources.yml | 8 + examples/snapshot_demo/tests/dq/README.md | 4 + examples/snapshot_demo/tests/unit/README.md | 12 + examples_article/http_cache_demo/README.md | 7 + .../http_cache_demo/docs/README.md | 4 + .../http_cache_demo/models/README.md | 4 + .../http_cache_demo/models/todo_ingest.ff.py | 45 +++ examples_article/http_cache_demo/profiles.yml | 13 + examples_article/http_cache_demo/project.yml | 18 + .../http_cache_demo/seeds/README.md | 4 + examples_article/http_cache_demo/sources.yml | 9 + .../http_cache_demo/tests/dq/README.md | 4 + .../http_cache_demo/tests/unit/README.md | 4 + mkdocs.yml | 4 +- src/fastflowtransform/api/context.py | 
2 +- src/fastflowtransform/api/http.py | 248 +++++++----- src/fastflowtransform/cli/__init__.py | 3 + src/fastflowtransform/cli/init_cmd.py | 55 ++- src/fastflowtransform/cli/run.py | 37 +- src/fastflowtransform/cli/snapshot_cmd.py | 173 ++++++++ src/fastflowtransform/cli/test_cmd.py | 15 +- src/fastflowtransform/config/models.py | 104 ++++- src/fastflowtransform/docs.py | 1 + .../executors/_spark_imports.py | 57 +++ src/fastflowtransform/executors/base.py | 81 +++- .../executors/bigquery/base.py | 225 +++++++++++ .../executors/databricks_spark.py | 299 +++++++++++++- src/fastflowtransform/executors/duckdb.py | 220 ++++++++++ src/fastflowtransform/executors/postgres.py | 234 +++++++++++ .../executors/snowflake_snowpark.py | 222 +++++++++- src/fastflowtransform/snapshots.py | 113 ++++++ src/fastflowtransform/templates/index.html.j2 | 1 + tests/integration/examples/config.py | 11 + .../api/http/test_http_offline_cache_unit.py | 22 + .../api/http/test_http_pagination_df_unit.py | 33 ++ 62 files changed, 3495 insertions(+), 190 deletions(-) create mode 100644 Makefile.built create mode 100644 docs/Snapshots.md create mode 100644 docs/examples/Snapshot_Demo.md rename examples/{incremental_demo => snapshot_demo}/.env.dev_bigquery_bigframes (100%) rename examples/{incremental_demo => snapshot_demo}/.env.dev_bigquery_pandas (100%) create mode 100644 examples/snapshot_demo/.env.dev_databricks create mode 100644 examples/snapshot_demo/.env.dev_databricks_delta create mode 100644 examples/snapshot_demo/.env.dev_databricks_iceberg create mode 100644 examples/snapshot_demo/.env.dev_duckdb create mode 100644 examples/snapshot_demo/.env.dev_postgres create mode 100644 examples/snapshot_demo/.env.dev_snowflake create mode 100644 examples/snapshot_demo/Makefile create mode 100644 examples/snapshot_demo/README.md create mode 100644 examples/snapshot_demo/models/README.md create mode 100644 examples/snapshot_demo/models/marts/mart_users_by_domain.ff.sql create mode 100644 
examples/snapshot_demo/models/snapshots/mart_users_by_domain_snapshot.ff.sql create mode 100644 examples/snapshot_demo/models/snapshots/users_clean_snapshot.ff.sql create mode 100644 examples/snapshot_demo/models/staging/users_clean.ff.sql create mode 100644 examples/snapshot_demo/profiles.yml create mode 100644 examples/snapshot_demo/project.yml create mode 100644 examples/snapshot_demo/seeds/README.md create mode 100644 examples/snapshot_demo/seeds/seed_users.csv create mode 100644 examples/snapshot_demo/sources.yml create mode 100644 examples/snapshot_demo/tests/dq/README.md create mode 100644 examples/snapshot_demo/tests/unit/README.md create mode 100644 examples_article/http_cache_demo/README.md create mode 100644 examples_article/http_cache_demo/docs/README.md create mode 100644 examples_article/http_cache_demo/models/README.md create mode 100644 examples_article/http_cache_demo/models/todo_ingest.ff.py create mode 100644 examples_article/http_cache_demo/profiles.yml create mode 100644 examples_article/http_cache_demo/project.yml create mode 100644 examples_article/http_cache_demo/seeds/README.md create mode 100644 examples_article/http_cache_demo/sources.yml create mode 100644 examples_article/http_cache_demo/tests/dq/README.md create mode 100644 examples_article/http_cache_demo/tests/unit/README.md create mode 100644 src/fastflowtransform/cli/snapshot_cmd.py create mode 100644 src/fastflowtransform/executors/_spark_imports.py create mode 100644 src/fastflowtransform/snapshots.py diff --git a/.gitignore b/.gitignore index 7049ad9..4209e3f 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ examples/**/docs/ tickets/** site/dag/** cache/** +articles/** diff --git a/Makefile b/Makefile index 7e9b8cb..0585b64 100644 --- a/Makefile +++ b/Makefile @@ -20,3 +20,4 @@ MAKEFILE_DIR := $(dir $(abspath $(firstword $(MAKEFILE_LIST)))) include $(MAKEFILE_DIR)/Makefile.pipeline include $(MAKEFILE_DIR)/Makefile.dev +include $(MAKEFILE_DIR)/Makefile.built diff --git 
a/Makefile.built b/Makefile.built new file mode 100644 index 0000000..ae209a8 --- /dev/null +++ b/Makefile.built @@ -0,0 +1,35 @@ +# Build and release helpers for fastflowtransform (wheel/sdist, twine check, PyPI upload) + +# Change this if your venv has a different name/path +VENV = .venv + +.PHONY: help venv build check upload-test upload clean + +help: + @echo "Useful commands:" + @echo " make venv - create virtualenv and install build" + @echo " make build - build wheel + sdist into dist/" + @echo " make check - run twine check on dist/*" + @echo " make upload-test - upload dist/* to TestPyPI (using uvx + twine)" + @echo " make upload - upload dist/* to PyPI (using uvx + twine)" + @echo " make clean - remove build artifacts" + +venv: + python3 -m venv $(VENV) + $(VENV)/bin/python -m pip install --upgrade pip + $(VENV)/bin/python -m pip install build + +build: venv + $(VENV)/bin/python -m build + +check: + uvx twine check dist/* + +upload-test: build check + uvx twine upload --repository testpypi dist/* + +upload: build check + uvx twine upload dist/* + +clean: + rm -rf build dist *.egg-info diff --git a/docs/Profiles.md b/docs/Profiles.md index 4ff4454..d4a2175 100644 --- a/docs/Profiles.md +++ b/docs/Profiles.md @@ -50,12 +50,14 @@ Supported engines and their expected sections: |----------------------|--------------------|---------------------------------------------------| | `duckdb` | `duckdb` | `path` (file path or `:memory:`) | | `postgres` | `postgres` | `dsn`, `db_schema` | -| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location` | -| `databricks_spark` | `databricks_spark` | `master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `database`, `table_format`, `table_options` | -| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `db_schema`, optional `role` | +| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location`, `use_bigframes`, `allow_create_dataset` | +| `databricks_spark` | `databricks_spark` | 
`master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `catalog`, `database`, `table_format`, `table_options` | +| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `schema` (`db_schema` alias), optional `role`, `allow_create_schema` | Each profile can define its own `vars:` block (values exposed via `var('key')` inside templates). +> Snowflake note: the CLI scaffold shows `schema:` while the docs often mention `db_schema:`. The configuration accepts either key because `schema` is an alias for `db_schema` in the settings model. + ## Environment Variables `profiles.yml` supports Jinja expressions. The helper `env('FF_VAR', 'fallback')` reads process environment variables and substitutes the default if unset. Examples: diff --git a/docs/Snapshots.md b/docs/Snapshots.md new file mode 100644 index 0000000..b827634 --- /dev/null +++ b/docs/Snapshots.md @@ -0,0 +1,378 @@ +# Snapshots + +Snapshots are **history-aware tables** that track how a row changes over time. + +Unlike regular `table` / `view` / `incremental` models, which only ever expose the *current* state, a snapshot keeps **multiple versions** of each business key, with validity ranges and a “current” flag. + +FastFlowTransform implements snapshots as a dedicated materialization: + +```sql +{{ config( + materialized='snapshot', + snapshot={ + 'strategy': 'timestamp', -- or 'check' + }, + unique_key='id', + updated_at='updated_at', +) }} + +select + id, + ... +from {{ ref('some_model.ff') }}; +``` + +You run snapshot models via a **separate CLI entrypoint**: + +```bash +fft snapshot run . --env dev_duckdb +``` + +Regular `fft run` does *not* execute snapshot models. + +--- + +## When to use snapshots + +Use snapshots when you need to: + +* Answer **“what did we know back then?”** questions + e.g. 
“What was the user’s email on 2024-03-01?” +* Implement **type-2 slowly changing dimensions (SCD2)** for dimensions like users, customers, products, or feature flags. +* Preserve a **temporal audit trail** of important entities without hand-rolling history tables and merge logic. + +You typically place snapshot models near your **cleaned dimensions**, e.g.: + +* `staging/users_clean.ff.sql` +* `snapshots/users_clean_snapshot.ff.sql` ⟵ snapshot over the staging model +* `marts/dim_users.ff.sql` ⟵ reads from the snapshot’s “current” rows + +--- + +## Conceptual model + +A snapshot is defined by: + +1. **Business key**: + `unique_key` / `primary_key` + + > “Which column(s) identify a logical entity?” + +2. **Change detection strategy** (required for snapshots): + + * `strategy='timestamp'` + Use a **monotonic timestamp column** to detect new versions, e.g. `updated_at`, `signup_date`. + * `strategy='check'` + Compare a set of **“interesting” columns** (`check_cols`) between runs and open a new version when any of them changes. + +3. **Source query**: + A normal `SELECT` that produces the *current* state of your entities. + +On disk, each snapshot table contains: + +* All columns produced by your `SELECT` + (e.g. `user_id`, `email`, `email_domain`, `signup_date`) +* Plus a set of **snapshot metadata columns**, typically: + + * `_ff_valid_from` – when this version became active + * `_ff_valid_to` – when this version stopped being active (`NULL` for open/current) + * `_ff_is_current` – boolean flag marking the current row for each key + +Exact column names may vary per implementation, but the pattern is always: + +> “Multiple rows per business key, each with a validity range, and exactly one current row.” + +--- + +## Snapshot configuration + +Snapshot behavior is configured via `config(...)` at the top of a model. 
+ +### Minimal timestamp snapshot + +```sql +{{ config( + materialized='snapshot', + snapshot={ + 'strategy': 'timestamp', + }, + unique_key='user_id', + updated_at='signup_date', +) }} + +select + user_id, + email, + email_domain, + signup_date +from {{ ref('users_clean.ff') }}; +``` + +Key pieces: + +* `materialized='snapshot'` + Enables snapshot semantics for this model. + +* `snapshot.strategy='timestamp'` + Use a timestamp column to detect new versions. + +* `unique_key='user_id'` + Business key; you can also pass a list: `['user_id', 'country']`. + +* `updated_at='signup_date'` + Column used as the **freshness indicator**. When a new run sees a `signup_date` that is greater than the existing version’s, a new version is opened. + +> **Validation rules** +> +> * Snapshots require a `unique_key` (or `primary_key`). +> * `strategy` must be `'timestamp'` or `'check'`. +> * For `'timestamp'`, you must provide `updated_at` / `updated_at_column`. +> * For `'check'`, you must provide `check_cols`. + +### Check strategy with `check_cols` + +Use this when you **don’t have** a reliable `updated_at` column and instead want to compare a list of columns: + +```sql +{{ config( + materialized='snapshot', + snapshot={ + 'strategy': 'check', + 'check_cols': ['email', 'email_domain', 'status'], + }, + unique_key='user_id', +) }} + +select + user_id, + email, + email_domain, + status, + signup_date +from {{ ref('users_clean.ff') }}; +``` + +Here: + +* The engine joins **current source rows** with **current snapshot rows** on `unique_key`. +* It recomputes a hash over `check_cols`. When the hash changes, a new version is opened. + +This is convenient for: + +* Entities with **many changing attributes**. +* Sources where `updated_at` is unreliable or missing. + +### Shorthands and normalization + +FastFlowTransform’s config layer normalizes snapshot config so you can: + +* Pass a single string or list for `unique_key`, `check_cols`, `updated_at_columns`, etc. 
+* Use `updated_at` or `updated_at_column` interchangeably (they are validated to be consistent). +* Optionally keep snapshot settings nested under `snapshot={...}` while still accessing the top-level shortcuts (`unique_key`, `updated_at`, `check_cols`) in executors. + +--- + +## Runtime behavior + +### First snapshot run + +On the **first** `fft snapshot run`: + +* FFT executes the snapshot’s `SELECT`. +* For each row, it writes: + + * One row per `unique_key`. + * `valid_from = run_timestamp` + * `valid_to = NULL` + * `is_current = TRUE` + +No comparison with previous data (there is none yet). + +### Subsequent runs (timestamp strategy) + +On each subsequent run (`strategy='timestamp'`): + +1. **Load current version** per `unique_key` from the snapshot table. +2. **Load current source rows** from the snapshot model’s `SELECT`. +3. For each key: + + * If the key **did not exist** before → **insert** new open-ended version. + * If the key existed, and the source row’s `updated_at` is **greater** than the snapshot’s latest version: + + * Compare row values (implementation detail; often just “trust” updated_at). + * If considered changed: + + * **Close** the current version: set `valid_to = run_timestamp`, `is_current = FALSE`. + * **Open** a new version with `valid_from = run_timestamp`, `valid_to = NULL`, `is_current = TRUE`. + * If the key existed, and `updated_at` is **not greater** (or row unchanged) → no-op. + +> **Deletes** +> By design, snapshots focus on changes in the **source-of-truth rows**. When a row disappears from the source, it is treated as **no change** for snapshot purposes (the last known version remains current). If you need delete tracking, model a soft-delete flag and include it in `check_cols`. + +### Subsequent runs (check strategy) + +For `strategy='check'`: + +1. **Load current version** per `unique_key`. +2. **Load current source rows**. +3. Compute a **hash** (or equivalent) over the configured `check_cols` for both. +4. 
If the hash differs → treat it as a change: + + * Close old version (`valid_to = run_timestamp`, `is_current = FALSE`). + * Insert new version (`valid_from = run_timestamp`, `valid_to = NULL`, `is_current = TRUE`). + +This strategy is usually more robust when: + +* Your source doesn’t maintain an updated timestamp. +* You care about a specific subset of columns only. + +--- + +## Snapshot table schema + +A snapshot table contains: + +* **Business columns**: whatever your `SELECT` produces. +* **Snapshot columns** (typical pattern): + + ```text + _ff_valid_from TIMESTAMP -- when this version became active + _ff_valid_to TIMESTAMP -- when this version ended (NULL = still active) + _ff_is_current BOOLEAN -- TRUE exactly for the current version + ``` + +Common query patterns: + +### Current version per key + +```sql +select * +from users_clean_snapshot +where _ff_is_current = true; +``` + +### History of a single key + +```sql +select * +from users_clean_snapshot +where user_id = 42 +order by _ff_valid_from; +``` + +### Point-in-time view + +“What did we know on 2024-03-01?” + +```sql +select * +from users_clean_snapshot +where + _ff_valid_from <= timestamp '2024-03-01' + and ( _ff_valid_to is null or _ff_valid_to > timestamp '2024-03-01' ); +``` + +These patterns work uniformly across engines. + +--- + +## CLI: `fft snapshot run` + +Snapshots are run via a dedicated CLI subcommand: + +```bash +fft snapshot run <project_dir> [options] +``` + +Key properties: + +* Only models with `materialized='snapshot'` are eligible. +* If your selector (`--select/--exclude`) matches non-snapshot models, they are ignored or explicitly rejected with a clear error. +* You can combine all the usual selection patterns: `tag:...`, `path:...`, `name:...`, etc. + +Example: + +```bash +# Run only snapshot models that belong to the snapshot demo and the DuckDB engine +fft snapshot run . 
\ + --env dev_duckdb \ + --select tag:example:snapshot_demo \ + --select tag:engine:duckdb +``` + +### Retention & pruning + +To avoid unbounded growth, the snapshot CLI supports **retention** flags: + +* `--prune` + Enable pruning of old versions. +* `--keep-last N` + Keep only the last `N` versions per `unique_key`. +* `--dry-run` + Show what *would* be pruned without actually deleting anything. + +Examples: + +```bash +# Show which rows would be removed, but do not delete +fft snapshot run . \ + --env dev_duckdb \ + --select tag:example:snapshot_demo \ + --prune --keep-last 3 --dry-run + +# Apply pruning for real +fft snapshot run . \ + --env dev_duckdb \ + --select tag:example:snapshot_demo \ + --prune --keep-last 3 +``` + +Retention is applied **after** the snapshot update, so the most recent `N` versions are always preserved. + +--- + +## Interaction with regular runs + +Snapshots are intentionally **decoupled** from `fft run`: + +* `fft run` builds your **current-state pipeline** (seeds, staging, marts, incremental models, etc.). +* `fft snapshot run` builds and updates **history tables** based on the current state produced by `fft run`. + +Typical workflow: + +```bash +# 1) Rebuild main pipeline +fft run . --env dev_duckdb + +# 2) Update snapshots based on the new state +fft snapshot run . --env dev_duckdb +``` + +If you accidentally try to run a snapshot via `fft run`, FFT will raise an error such as: + +> Snapshot models cannot be executed via 'fft run'. Use 'fft snapshot run' instead. + +This separation keeps regular DAG runs **predictable and stateless**, while giving you a powerful, focused tool for history tracking. + +--- + +## Best practices + +* Put snapshots in a dedicated folder, e.g. `models/snapshots/`, and tag them: + + ```sql + tags=['scope:snapshot', 'engine:duckdb', 'example:snapshot_demo'] + ``` + +* Always configure a **stable, business-level `unique_key`**. Avoid transient IDs that might be re-used. 
+ +* Prefer `strategy='timestamp'` when you have a trustworthy monotonic timestamp. + +* Prefer `strategy='check'` when: + + * timestamps are unreliable, or + * you care about specific columns only. + +* Build point-in-time marts by **reading from the snapshot** rather than the raw staging table when you need historical correctness. + +Snapshots give you a clean, structured way to get SCD2-style history without hand-writing merge logic for every table. diff --git a/docs/Technical_Overview.md b/docs/Technical_Overview.md index 2da99e8..0eba393 100644 --- a/docs/Technical_Overview.md +++ b/docs/Technical_Overview.md @@ -53,7 +53,24 @@ ### Project Layout -For an up-to-date view, browse the repository tree or run `find . -maxdepth 2` from the root; all examples live under `examples/` with their own READMEs. +The repository is structured so you can jump straight to the area you need without spelunking: + +``` +fastflowtransform/ +├── src/fastflowtransform/ # core package (CLI, executors, docs, streaming, settings) +│ ├── api/, cli/, executors/, testing/, templates/ and friends +│ └── entry-points such as core.py, dag.py, seeding.py, validation.py +├── docs/ # user + developer docs (this file plus guides) +├── examples/ # canonical SQL/Python demo projects with seeds + READMEs +├── examples_article/ # long-form tutorial artifacts referenced by articles +├── articles/, tickets/, _exports/ # content pipelines + planning notes +├── tests/ # pytest coverage for internal modules +├── _scripts/, rel_dir/, dist/, site/, htmlcov/ # tooling + build outputs +├── pyproject.toml, uv.lock # build system + dependency lock +└── Makefile*, docker-compose.yml # dev workflow shortcuts and services +``` + +Treat `fastflowtransform/` as the project root when running commands from this doc. 
### Example Projects and Seeds diff --git a/docs/examples/Snapshot_Demo.md b/docs/examples/Snapshot_Demo.md new file mode 100644 index 0000000..912f684 --- /dev/null +++ b/docs/examples/Snapshot_Demo.md @@ -0,0 +1,255 @@ +# Snapshot Demo Project + +The `examples/snapshot_demo` project shows how to build **history-aware tables** with FastFlowTransform snapshots. It reuses the small users pipeline from the basic demo and adds a `users_clean_snapshot` model that captures **row-versioned history** over time. + +## Why it exists + +* **Show snapshot semantics** – demonstrate `materialized='snapshot'` with `strategy='timestamp'` on a simple dataset. +* **Separate runs** – illustrate why snapshots are executed via `fft snapshot run` instead of the regular `fft run`. +* **Engine parity** – keep the snapshot demo portable across DuckDB, Postgres, Databricks Spark (parquet / Delta Lake / Iceberg), and BigQuery (once engines are implemented). +* **Understand the shape of a snapshot table** – see how FFT adds validity columns on top of your source columns. + +Use it as a sandbox before adding snapshots to your own marts and dimensions. + +## Project layout + +The snapshot demo is intentionally tiny and mirrors the basic demo structure: + +| Path | Purpose | +| ---------------------------------------------- | --------------------------------------------------------------------------------------------------- | +| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as a physical `seed_users` table. | +| `models/staging/users_clean.ff.sql` | Same as in the basic demo: cleans emails, casts types, derives `email_domain`. | +| `models/marts/mart_users_by_domain.ff.sql` | Same as in the basic demo: aggregates users per email domain. | +| `models/snapshots/users_clean_snapshot.ff.sql` | **New:** snapshot model that captures slowly changing history of `users_clean.ff`. 
| +| `profiles.yml` | Reused from the basic demo: defines `dev_duckdb`, `dev_postgres`, `dev_databricks_delta`, `dev_databricks_iceberg`, `dev_bigquery`. | +| `.env.dev_*` | Engine-specific environment files (`.env.dev_duckdb`, `.env.dev_postgres`, `.env.dev_databricks_delta`, `.env.dev_databricks_iceberg`). | +| `Makefile` | Adds snapshot-aware targets on top of the usual `seed` / `run` / `test` / `dag`. | + +### The snapshot model + +The core of the demo is `models/snapshots/users_clean_snapshot.ff.sql`: + +```sql +{{ config( + materialized='snapshot', + snapshot={ + 'strategy': 'timestamp', -- or 'check' (not used in this demo) + }, + unique_key='user_id', + updated_at='signup_date', + tags=[ + 'example:snapshot_demo', + 'scope:snapshot', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery' + ], +) }} + +select + user_id, + email, + email_domain, + signup_date +from {{ ref('users_clean.ff') }}; +``` + +Key points: + +* `materialized='snapshot'` marks this as a **snapshot model**. +* `snapshot.strategy='timestamp'` means: + + * FFT uses `updated_at='signup_date'` to detect changed rows. + * When a row changes, the old version is **closed** and a new version is **opened**. +* `unique_key='user_id'` defines the **business key** used to match records between runs. +* The *body* is a normal `SELECT` from the cleaned staging model; FFT takes care of the history logic. + +On physical storage, FFT keeps: + +* All columns from the select (`user_id`, `email`, `email_domain`, `signup_date`) +* Plus engine-agnostic snapshot metadata columns (names depending on your implementation), typically: + + * a **valid-from** timestamp + * a **valid-to** timestamp (nullable/open ended) + * an **is_current** flag + +So a given `user_id` may appear multiple times with different validity ranges. 
+ +## Running the snapshot demo + +Assuming you’ve already wired `examples/snapshot_demo/Makefile` similarly to the basic demo (with `snapshot` / `snapshot_demo` targets): + +1. Change into the project: + + ```bash + cd examples/snapshot_demo + ``` + +2. Choose an engine and export the environment (example: DuckDB): + + ```bash + # DuckDB + set -a; source .env.dev_duckdb; set +a + + # Or Postgres + # set -a; source .env.dev_postgres; set +a + + # Or Databricks + # Delta/parquet: set -a; source .env.dev_databricks_delta; set +a + # Iceberg: set -a; source .env.dev_databricks_iceberg; set +a + # (optionally export FF_DBR_TABLE_FORMAT=delta|iceberg to override the table format) + + # Or BigQuery (requires GCP setup) + # set -a; source .env.dev_bigquery_pandas; set +a + # set -a; source .env.dev_bigquery_bigframes; set +a + ``` + +3. Run the full snapshot demo for the selected engine: + + ```bash + # One-shot: clean → seed → run (pipeline) → snapshot → dag → test + make snapshot_demo ENGINE=duckdb + # make snapshot_demo ENGINE=postgres + # make snapshot_demo ENGINE=databricks_spark DBR_TABLE_FORMAT=delta + # make snapshot_demo ENGINE=databricks_spark DBR_TABLE_FORMAT=iceberg + # make snapshot_demo ENGINE=bigquery BQ_FRAME=bigframes + ``` + + Under the hood this will typically do: + + * `fft seed` – materialize `seed_users` + * `fft run` – build staging/mart views/tables (excluding snapshot models) + * `fft snapshot run` – apply snapshot logic to `users_clean_snapshot` + * `fft dag` – generate the DAG/site + * `fft test` – run any configured DQ tests + +### Databricks table formats (parquet / Delta / Iceberg) + +Just like the incremental demo, the snapshot project lets you flip Spark table formats without +editing models. Pass `DBR_TABLE_FORMAT=parquet|delta|iceberg` to `make snapshot_demo` or export +`FF_DBR_TABLE_FORMAT` when invoking `fft` directly. 
The `dev_databricks_delta` profile uses the same +Hive-compatible metastore as before, while `dev_databricks_iceberg` wires in an Iceberg catalog +(`spark.jars.packages` / `spark.sql.catalog.iceberg.*`). When running locally you still need the +matching Python packages (for example `pip install delta-spark` for Delta Lake and the Iceberg +runtime JARs bundled via the profile). + +Manual CLI examples: + +```bash +# Delta Lake snapshots +FF_DBR_TABLE_FORMAT=delta \ + FFT_ACTIVE_ENV=dev_databricks_delta FF_ENGINE=databricks_spark \ + fft snapshot run . --select tag:example:snapshot_demo --select tag:engine:databricks_spark + +# Iceberg snapshots +FF_DBR_TABLE_FORMAT=iceberg \ + FFT_ACTIVE_ENV=dev_databricks_iceberg FF_ENGINE=databricks_spark \ + fft snapshot run . --select tag:example:snapshot_demo --select tag:engine:databricks_spark +``` + +4. Or run only the snapshot step (after a normal `fft run`): + + ```bash + # DuckDB example + make run ENGINE=duckdb # builds users_clean etc. + make snapshot ENGINE=duckdb # runs only snapshot models + ``` + + Or directly with `fft`: + + ```bash + # Only snapshot models (tagged example:snapshot_demo) + fft snapshot run . \ + --env dev_duckdb \ + --select tag:example:snapshot_demo --select tag:engine:duckdb + ``` + + If your selection includes non-snapshot models, FFT will ignore them for the snapshot run. + +## Inspecting the snapshot table + +After a couple of runs with changed data, use your engine to inspect `users_clean_snapshot`: + +* **DuckDB** (from the project root): + + ```sql + select * + from users_clean_snapshot + order by user_id, _ff_valid_from; -- adjust column names to what you implement + ``` + +* **Postgres / BigQuery / Databricks**: the table name is the same; the schema/database/dataset follows the profile. 
+ +Typical patterns to explore: + +* **Current records only** (one row per `user_id`): + + ```sql + select * + from users_clean_snapshot + where _ff_is_current = true; + ``` + +* **History of a single user**: + + ```sql + select * + from users_clean_snapshot + where user_id = 42 + order by _ff_valid_from; + ``` + +This makes it easy to answer questions like “what did we know about this user on date X?”. + +## Snapshot CLI & retention + +The snapshot demo uses the dedicated entry point: + +```bash +fft snapshot run . --env dev_duckdb --select tag:example:snapshot_demo +``` + +In addition, the CLI supports retention and pruning flags (once implemented in your code base): + +* `--prune` – enables pruning of old snapshot rows. +* `--keep-last N` – when used with `--prune`, keeps only the last `N` versions per key. +* `--dry-run` – shows which rows would be pruned without actually deleting anything. + +Example: + +```bash +# Keep only the last 3 versions per user_id; just show the plan +fft snapshot run . \ + --env dev_duckdb \ + --select tag:example:snapshot_demo \ + --prune --keep-last 3 --dry-run + +# Apply the pruning for real +fft snapshot run . \ + --env dev_duckdb \ + --select tag:example:snapshot_demo \ + --prune --keep-last 3 +``` + +This is especially useful when snapshot tables grow large and you only care about a bounded history window for most use cases. + +## Interaction with regular runs + +Two important rules: + +1. **Snapshot models are not part of `fft run`** + They’re intentionally excluded to keep regular pipeline runs stateless and predictable. If a snapshot model is accidentally selected in `fft run`, FFT surfaces a clear error: + + > Snapshot models cannot be executed via 'fft run'. Use 'fft snapshot run' instead. + +2. **Snapshots depend on upstream models** + In the demo, `users_clean_snapshot` depends on `users_clean.ff`. The typical flow is: + + ```bash + fft run . --env dev_duckdb --select tag:example:basic_demo + fft snapshot run . 
--env dev_duckdb --select tag:example:snapshot_demo + ``` + + * `fft run` ensures `users_clean` is fresh. + * `fft snapshot run` compares the new `users_clean` rows with the existing snapshot table and writes history changes. diff --git a/docs/index.md b/docs/index.md index 57ab785..20034b2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,79 +1,99 @@ # FastFlowTransform Documentation Hub -Welcome! This page is your starting point for FastFlowTransform docs. Pick the track that matches what you want to do and follow the links to the detailed guides. +FastFlowTransform (FFT) is a SQL + Python data modeling engine with a deterministic DAG, parallel executor, optional caching, incremental builds, auto-generated docs, snapshots, and built-in data-quality tests. The `fft` CLI orchestrates compilation, execution, docs, validation, and history tracking across DuckDB, Postgres, BigQuery (pandas + BigFrames), Databricks/Spark, and Snowflake Snowpark. ---- +Use this page as the front door into the docs: start with the orientation section, then jump to the guide that matches the task you have at hand. 
-## Docs Navigation -- **Getting Started** — you are here (`docs/index.md`) -- [User Guide](./Technical_Overview.md#part-i-operational-guide) -- [Modeling Reference](./Config_and_Macros.md) -- [Parallelism & Cache](./Cache_and_Parallelism.md) -- [CLI Guide](./CLI_Guide.md) -- [Logging & Verbosity](./Logging.md) -- [API calls in Python models](./Api_Models.md) -- [Incremental Models](./Incremental.md) -- [YAML Tests (Schema-bound)](./YAML_Tests.md) -- [Model Unit Tests](./Unit_Tests.md) -- [Data Quality Tests Reference](./Data_Quality_Tests.md) -- [Auto-Docs & Lineage](./Auto_Docs.md) -- [Troubleshooting & Error Codes](./Troubleshooting.md) -- [Profiles & Environments](./Profiles.md) -- [Sources Declaration](./Sources.md) -- [Project Configuration](./Project_Config.md) -- [State Selection (changed & results)](./State_Selection.md) -- [Basic Demo](./examples/Basic_Demo.md) -- [Materializations Demo](./examples/Materializations_Demo.md) -- [Data Quality Tests Demo](./examples/DQ_Demo.md) -- [Macros Demo](./examples/Macros_Demo.md) -- [Cache Demo](./examples/Cache_Demo.md) -- [Environment Matrix Demo](./examples/Environment_Matrix.md) -- [Incremental & Delta Demo](examples/Incremental_Demo.md) -- [Local Engine Setup](./examples/Local_Engine_Setup.md) -- [API Demo](./examples/API_Demo.md) -- [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) +--- ## Table of Contents -- [Docs Navigation](#docs-navigation) -- [Choose Your Path](#choose-your-path) -- [Reference Map](#reference-map) +- [Quick Orientation](#quick-orientation) +- [Build & Run Projects](#build--run-projects) +- [Modeling & Configuration](#modeling--configuration) +- [Execution & State Management](#execution--state-management) +- [Testing & Data Quality](#testing--data-quality) +- [Docs, Debugging & Operations](#docs-debugging--operations) +- [Examples & Tutorials](#examples--tutorials) +- [Reference & Contribution](#reference--contribution) - [Need Help?](#need-help) --- -## Choose Your 
Path +## Quick Orientation + +- **New to FFT?** Read the [Quickstart](Quickstart.md) for installation (venv + editable install), seeding, and the first `fft run`. +- **Want the bigger picture?** The [Technical Overview](Technical_Overview.md) explains the project layout, DAG, scheduler, registry, executors, and the roadmap snapshot. +- **Learning the CLI surface area?** Browse the [CLI Guide](CLI_Guide.md) for command groups such as `fft run`, `fft snapshot run`, `fft dag`, `fft docgen`, `fft test`, and `fft utest`. + +--- + +## Build & Run Projects + +- **Project layout & CLI workflow:** Pair the “Project Layout” chapter of the [Technical Overview](Technical_Overview.md#project-layout) with the [CLI Guide](CLI_Guide.md) to understand how `fft run`, `fft test`, and `fft dag` fit together. +- **Profiles & environments:** [Profiles & Environments](Profiles.md) covers executor profiles, environment overrides, credential handling, and engine-specific flags. +- **Runtimes & observability flags:** [Logging & Verbosity](Logging.md) explains log levels, JSON logs, progress indicators, and metrics toggles during `fft run`. +- **Local runtimes & engines:** [Local Engine Setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, Spark/Delta, BigQuery, and Snowflake Snowpark bootstrapping for the demos. + +--- + +## Modeling & Configuration + +- **SQL + Python authoring model:** [API & Models](Api_Models.md) documents the Python node decorators, HTTP helper (`fastflowtransform.api.http`), and how `ref()` / `source()` bindings work in both SQL and Python models. +- **Templates, macros, and config keys:** [Configuration & Macros](Config_and_Macros.md) lists the `config(...)` options, reusable macros, helper functions, and naming rules for `.ff.sql` / `.ff.py`. +- **Project-level metadata:** [Project Configuration](Project_Config.md) describes `project.yml`, default materializations, tags, exposures, docs strings, and the `models/` hierarchy. 
+- **Sources & seeds:** [Sources](Sources.md) shows how to register upstream tables/files, snapshots of raw data, and how state tracking interacts with sources. + +--- + +## Execution & State Management + +- **Parallelism, caching & rebuilds:** [Cache & Parallelism](Cache_and_Parallelism.md) dives into the level-wise scheduler, fingerprint cache, and `--rebuild` / `--no-cache` behaviors. +- **Incremental models:** [Incremental Processing](Incremental.md) explains merge vs append strategies, cleanup rules, and engine-specific hooks. +- **Snapshots / history tables:** [Snapshots](Snapshots.md) documents the `materialized='snapshot'` config, timestamp vs check strategies, and the dedicated `fft snapshot run . --env <env>` entrypoint. +- **Selective runs:** [State Selection](State_Selection.md) covers `--selector`, `--select`, `--exclude`, `--changed`, and `--results` filters across DAGs. + +--- + +## Testing & Data Quality -### 1. Build & Operate Projects (Data Practitioners) +- **Schema-bound YAML tests:** [YAML Tests](YAML_Tests.md) details how to define and run column-level constraints declared in `.yml`. +- **Reusable data-quality suites:** [Data Quality Tests](Data_Quality_Tests.md) catalogs reconciliation, freshness, and anomaly rules that can attach to models or sources. +- **Fast model unit tests:** [Unit Tests](Unit_Tests.md) shows how to author `.sql` / `.py` assertions, seed fixtures, and run them via `fft utest`. + +--- + +## Docs, Debugging & Operations + +- **Auto-generated docs & lineage:** [Auto Docs](Auto_Docs.md) explains `fft dag --html`, `fft docgen`, JSON exports, and optional `sync-db-comments` for Postgres/Snowflake. +- **Visibility & logging:** [Logging & Verbosity](Logging.md) lists CLI flags for structured logs, progress bars, and verbose executor info. +- **Troubleshooting:** [Troubleshooting & Error Codes](Troubleshooting.md) enumerates the most common failures, retry strategies, and diagnostic commands.
+ +--- -- **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. -- **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, Databricks Spark, BigQuery, and Snowflake Snowpark. -- **Understand the project layout & CLI workflow:** start with *Project Layout* in the [Technical Overview](Technical_Overview.md#project-layout) and pair it with the [CLI Guide](CLI_Guide.md) for command patterns. -- **Configure runtimes & profiles:** review executor profiles and environment overrides in the dedicated [Profiles guide](Profiles.md) plus [Logging & Verbosity](Logging.md) for observability flags. -- **Model data quality & troubleshoot runs:** combine the [Model Unit Tests guide](Unit_Tests.md) with [Troubleshooting & Error Codes](Troubleshooting.md) to keep runs deterministic and easy to debug. -- **Explore runnable demos:** start with the [Basic Demo Overview](examples/Basic_Demo.md) or browse the `examples/` directory; each subproject ships with its own README. +## Examples & Tutorials -### 2. Extend FastFlowTransform (Developers & Contributors) +- **Core walkthroughs:** [Basic Demo](examples/Basic_Demo.md) and [Materializations Demo](examples/Materializations_Demo.md) cover the standard table/view/incremental builds and DAG navigation. +- **Testing-focused:** [Data Quality Tests Demo](examples/DQ_Demo.md) and [Macros Demo](examples/Macros_Demo.md) showcase advanced assertions and templating. +- **Performance & state:** [Cache Demo](examples/Cache_Demo.md), [Environment Matrix Demo](examples/Environment_Matrix.md), and [Incremental Demo](examples/Incremental_Demo.md) highlight rebuilds and selective runs. +- **API & integrations:** [API Demo](examples/API_Demo.md) illustrates Python HTTP models; [Local Engine Setup](examples/Local_Engine_Setup.md) provides engine-specific Makefiles. 
+- **History tracking:** [Snapshot Demo](examples/Snapshot_Demo.md) demonstrates the snapshot materialization end-to-end with timestamp/check strategies. -- **Dive into architecture & core modules:** start with [Architecture Overview](Technical_Overview.md#architecture-overview) and [Core Modules](Technical_Overview.md#core-modules) for registry, DAG, executors, validation, and more. -- **Add tests & seeds:** reuse the curated demos under `docs/examples/` for seeds/Makefiles and follow the [Model Unit Tests guide](Unit_Tests.md) for deterministic fixtures. -- **Contribute code:** follow the workflow described in [`./Contributing.md`](./Contributing.md) and consult the module-level docs for internal APIs. -- **Plan ahead:** check the roadmap snapshot in the [Technical Overview](Technical_Overview.md#roadmap-snapshot) to understand upcoming work. +All demos live in the top-level `examples/` directory and ship with Makefiles plus runnable seeds. --- -## Reference Map +## Reference & Contribution -- **Modeling reference** — Jinja configuration, macros, helper functions: [`Config_and_Macros.md`](Config_and_Macros.md) -- **CLI entry point & commands** — `src/fastflowtransform/cli.py` -- **Registry & node loading** — `src/fastflowtransform/core.py` -- **Unit test runner** — `src/fastflowtransform/utest.py` -- **Rendered DAG templates** — `src/fastflowtransform/docs/templates/` +- **API reference:** Browse the generated [API Reference](reference/) (MkDocStrings) for public functions, classes, and executors under `src/fastflowtransform`. +- **Architecture internals:** The [Technical Overview](Technical_Overview.md#part-ii-architecture-internals) dives into registries, DAG building, validation, and engine abstractions. +- **Contributing:** Follow [Contributing.md](Contributing.md) for dev environment setup (`uv`, `pyproject.toml`), coding standards, tests, and PR expectations. +- **License:** Apache 2.0 — see [License.md](License.md). --- ## Need Help? 
-- Open an issue or PR — see [`./Contributing.md`](./Contributing.md) for guidelines. -- Join the discussion (planning doc / roadmap highlights) — see the roadmap section in the [Technical Overview](Technical_Overview.md#roadmap-snapshot). -- If you spot gaps in the docs, file an issue with the context and links to the relevant section. +- Open an issue or PR with context — start with [Contributing.md](Contributing.md) if you want to propose changes. +- Surface documentation gaps, bugs, or missing examples via GitHub issues in [MirrorsAndMisdirections/FastFlowTransform](https://github.com/MirrorsAndMisdirections/FastFlowTransform). +- For roadmap highlights or planning threads, check the final section of the [Technical Overview](Technical_Overview.md#roadmap-snapshot). diff --git a/examples/incremental_demo/.env.dev_bigquery_bigframes b/examples/snapshot_demo/.env.dev_bigquery_bigframes similarity index 100% rename from examples/incremental_demo/.env.dev_bigquery_bigframes rename to examples/snapshot_demo/.env.dev_bigquery_bigframes diff --git a/examples/incremental_demo/.env.dev_bigquery_pandas b/examples/snapshot_demo/.env.dev_bigquery_pandas similarity index 100% rename from examples/incremental_demo/.env.dev_bigquery_pandas rename to examples/snapshot_demo/.env.dev_bigquery_pandas diff --git a/examples/snapshot_demo/.env.dev_databricks b/examples/snapshot_demo/.env.dev_databricks new file mode 100644 index 0000000..a8d5dc9 --- /dev/null +++ b/examples/snapshot_demo/.env.dev_databricks @@ -0,0 +1,16 @@ +# Databricks (or local Spark) defaults for the snapshot demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=snapshot_demo + +# Optional overrides for Databricks SQL warehouses or Unity Catalog +FF_DBR_DATABASE=snapshot_demo +# FF_DBR_CATALOG=hive_metastore + +# Enable a local Hive-compatible metastore (required for snapshots when running Spark standalone) +FF_DBR_ENABLE_HIVE=1 + +# Configure Java for local Spark sessions when needed +# 
JAVA_HOME=/opt/homebrew/opt/openjdk@17 + +# To target Delta Lake tables explicitly, set: +# FF_DBR_TABLE_FORMAT=delta diff --git a/examples/snapshot_demo/.env.dev_databricks_delta b/examples/snapshot_demo/.env.dev_databricks_delta new file mode 100644 index 0000000..0cfccaf --- /dev/null +++ b/examples/snapshot_demo/.env.dev_databricks_delta @@ -0,0 +1,16 @@ +# Databricks Spark (Delta) profile defaults for the snapshot demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=snapshot_demo + +# Managed metastore/database when running Spark locally +FF_DBR_ENABLE_HIVE=1 +FF_DBR_DATABASE=snapshot_demo +# Optional: Unity Catalog +# FF_DBR_CATALOG=hive_metastore + +# Switch the managed table format (parquet|delta|iceberg) +# Defaults to parquet unless Makefile/CLI overrides FF_DBR_TABLE_FORMAT +# FF_DBR_TABLE_FORMAT=delta + +# Configure Java for local Spark sessions when needed +# JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/snapshot_demo/.env.dev_databricks_iceberg b/examples/snapshot_demo/.env.dev_databricks_iceberg new file mode 100644 index 0000000..d82e3c4 --- /dev/null +++ b/examples/snapshot_demo/.env.dev_databricks_iceberg @@ -0,0 +1,13 @@ +# Databricks Spark (Iceberg) profile defaults for the snapshot demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=snapshot_demo + +# Managed Iceberg catalog metadata lives under .local/iceberg_warehouse_snapshot +# The profile wires the Iceberg catalog; just ensure the directory is writable. 
+FF_DBR_DATABASE=snapshot_demo + +# Tell FFT/Spark to request Iceberg tables (Makefile also injects this) +FF_DBR_TABLE_FORMAT=iceberg + +# Configure Java for local Spark sessions when needed +# JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/snapshot_demo/.env.dev_duckdb b/examples/snapshot_demo/.env.dev_duckdb new file mode 100644 index 0000000..00a0889 --- /dev/null +++ b/examples/snapshot_demo/.env.dev_duckdb @@ -0,0 +1,2 @@ +# DuckDB profile for the snapshot demo +FF_DUCKDB_PATH=.local/snapshot_demo.duckdb diff --git a/examples/snapshot_demo/.env.dev_postgres b/examples/snapshot_demo/.env.dev_postgres new file mode 100644 index 0000000..3f80dd1 --- /dev/null +++ b/examples/snapshot_demo/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for the snapshot demo (replace with your own connection string) +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=snapshot_demo diff --git a/examples/snapshot_demo/.env.dev_snowflake b/examples/snapshot_demo/.env.dev_snowflake new file mode 100644 index 0000000..b6410cb --- /dev/null +++ b/examples/snapshot_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the snapshot demo + +# Your Snowflake account identifier, e.g.
xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password (or extend for keypair auth) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=SNAPSHOT_DEMO + +# Optional role (can be left blank) +FF_SF_ROLE=ANALYST diff --git a/examples/snapshot_demo/Makefile b/examples/snapshot_demo/Makefile new file mode 100644 index 0000000..7389e16 --- /dev/null +++ b/examples/snapshot_demo/Makefile @@ -0,0 +1,138 @@ +.PHONY: seed run snapshot test dag show artifacts clean demo help + +# --- Configuration ----------------------------------------------------------- + +DB ?= .local/snapshot_demo.duckdb +PROJECT ?= . +UV ?= uv + +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) +ENGINE ?= duckdb + +# BigQuery frame type selector (pandas | bigframes) +BQ_FRAME ?= bigframes + +# Databricks table format (parquet | delta | iceberg) +DBR_TABLE_FORMAT ?= parquet + +# Resolve profile and tags per engine +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + ENGINE_TAG = engine:databricks_spark + ifeq ($(DBR_TABLE_FORMAT),delta) + PROFILE_ENV = dev_databricks_delta + else ifeq ($(DBR_TABLE_FORMAT),iceberg) + PROFILE_ENV = dev_databricks_iceberg + else + PROFILE_ENV = dev_databricks_delta + endif +endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) + +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif +ifeq
($(ENGINE),databricks_spark) + BASE_ENV := $(BASE_ENV) FF_DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT) +endif + +RUN_ENV = $(BASE_ENV) + +# Select only models belonging to this example + the active engine +SELECT_FLAGS = --select tag:example:snapshot_demo --select tag:$(ENGINE_TAG) + +SHOW_MODEL ?= mart_users_by_domain + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py + +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" --duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery) +endif + +# --- Targets ---------------------------------------------------------------- + +help: + @echo "FastFlowTransform Snapshot Demo" + @echo "Targets:" + @echo " make seed ENGINE=$(ENGINE)" + @echo " make run ENGINE=$(ENGINE) # staging + marts" + @echo " make snapshot ENGINE=$(ENGINE) # snapshot models (materialized='snapshot')" + @echo " make dag ENGINE=$(ENGINE)" + @echo " make test ENGINE=$(ENGINE)" + @echo " make show ENGINE=$(ENGINE) SHOW_MODEL=$(SHOW_MODEL)" + @echo " make demo ENGINE=$(ENGINE)" + @echo " make clean ENGINE=$(ENGINE)" + @echo + @echo "Variables: DB=$(DB) PROJECT=$(PROJECT) UV=$(UV) ENGINE=$(ENGINE) BQ_FRAME=$(BQ_FRAME) DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT)" + 
+seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) + +run: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +snapshot: + env $(RUN_ENV) $(UV) run fft snapshot run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +test: + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html + +show: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + else \ + echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + +clean: + $(CLEAN_CMD) + +demo: clean + @echo "== 🚀 Snapshot Demo ($(ENGINE)) ==" + @echo "Profile=$(PROFILE_ENV) PROJECT=$(PROJECT) DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT)" + +$(MAKE) seed ENGINE=$(ENGINE) + +$(MAKE) run ENGINE=$(ENGINE) + +$(MAKE) snapshot ENGINE=$(ENGINE) + +$(MAKE) dag ENGINE=$(ENGINE) + +$(MAKE) test ENGINE=$(ENGINE) + +$(MAKE) artifacts + @echo "✅ Demo complete." diff --git a/examples/snapshot_demo/README.md b/examples/snapshot_demo/README.md new file mode 100644 index 0000000..61a7ab6 --- /dev/null +++ b/examples/snapshot_demo/README.md @@ -0,0 +1,36 @@ +# Snapshot demo + +`examples/snapshot_demo` reuses the basic users pipeline and adds snapshot models that keep +slowly-changing history tables. It now mirrors the incremental demo by supporting Spark parquet, +Delta Lake, and Iceberg targets through the Databricks/Spark executor. 
+ +## Environment files + +Copy one of the `.env.dev_*` files and export it before running `make`: + +| File | Purpose | +| --- | --- | +| `.env.dev_duckdb` | Local DuckDB file for the demo | +| `.env.dev_postgres` | Postgres DSN/schema | +| `.env.dev_databricks_delta` | Local Spark or Databricks defaults for parquet/Delta tables | +| `.env.dev_databricks_iceberg` | Spark 4+/Databricks configuration with the Iceberg catalog wired in | + +`FF_DBR_TABLE_FORMAT` can always override the physical Spark table format (`parquet`, `delta`, +`iceberg`) even if the profile defaults differ. + +## Running the demo + +```bash +# DuckDB / Postgres +make snapshot_demo ENGINE=duckdb +make snapshot_demo ENGINE=postgres + +# Databricks / Spark: switch table format via DBR_TABLE_FORMAT +make snapshot_demo ENGINE=databricks_spark DBR_TABLE_FORMAT=parquet +make snapshot_demo ENGINE=databricks_spark DBR_TABLE_FORMAT=delta +make snapshot_demo ENGINE=databricks_spark DBR_TABLE_FORMAT=iceberg +``` + +Under the hood `make snapshot_demo` executes `fft seed`, `fft run`, `fft snapshot run`, `fft dag`, +and `fft test` for the models tagged with `example:snapshot_demo`. Use `make snapshot ENGINE=...` +if you only want to update the snapshot materializations after a regular `fft run`. diff --git a/examples/snapshot_demo/models/README.md b/examples/snapshot_demo/models/README.md new file mode 100644 index 0000000..cb87b64 --- /dev/null +++ b/examples/snapshot_demo/models/README.md @@ -0,0 +1,20 @@ +# Models directory (snapshot_demo) + +This demo shows how to: + +- Clean seed data into a staging view. +- Build a small mart (`mart_users_by_domain`). +- Maintain two slowly-changing snapshot tables: + + - `users_clean_snapshot` – timestamp-based snapshot of the staging view. + - `mart_users_by_domain_snapshot` – check-based snapshot of the mart. + +The snapshot models use: + +```jinja +materialized='snapshot' +strategy='timestamp' | 'check' +unique_key='...' +updated_at='...' 
+check_cols=['...'] +``` \ No newline at end of file diff --git a/examples/snapshot_demo/models/marts/mart_users_by_domain.ff.sql b/examples/snapshot_demo/models/marts/mart_users_by_domain.ff.sql new file mode 100644 index 0000000..bbeab40 --- /dev/null +++ b/examples/snapshot_demo/models/marts/mart_users_by_domain.ff.sql @@ -0,0 +1,28 @@ +{{ config( + materialized='table', + tags=[ + 'example:snapshot_demo', + 'scope:mart', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark', + ], +) }} + +with base as ( + select + email_domain, + signup_date + from {{ ref('users_clean.ff') }} +) + +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup +from base +group by email_domain +order by email_domain; diff --git a/examples/snapshot_demo/models/snapshots/mart_users_by_domain_snapshot.ff.sql b/examples/snapshot_demo/models/snapshots/mart_users_by_domain_snapshot.ff.sql new file mode 100644 index 0000000..19c9fc3 --- /dev/null +++ b/examples/snapshot_demo/models/snapshots/mart_users_by_domain_snapshot.ff.sql @@ -0,0 +1,24 @@ +{{ config( + materialized='snapshot', + snapshot={ + 'strategy': 'check', + 'check_cols': ['user_count', 'first_signup', 'last_signup'], + }, + unique_key='email_domain', + tags=[ + 'example:snapshot_demo', + 'scope:snapshot', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark', + ], +) }} + +select + email_domain, + user_count, + first_signup, + last_signup +from {{ ref('mart_users_by_domain.ff') }}; diff --git a/examples/snapshot_demo/models/snapshots/users_clean_snapshot.ff.sql b/examples/snapshot_demo/models/snapshots/users_clean_snapshot.ff.sql new file mode 100644 index 0000000..c068eec --- /dev/null +++ b/examples/snapshot_demo/models/snapshots/users_clean_snapshot.ff.sql @@ -0,0 +1,24 @@ +{{ config( + materialized='snapshot', + snapshot={ + 
'strategy': 'timestamp', + 'updated_at': 'signup_date', + }, + unique_key='user_id', + tags=[ + 'example:snapshot_demo', + 'scope:snapshot', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark', + ], +) }} + +select + user_id, + email, + email_domain, + signup_date +from {{ ref('users_clean.ff') }}; diff --git a/examples/snapshot_demo/models/staging/users_clean.ff.sql b/examples/snapshot_demo/models/staging/users_clean.ff.sql new file mode 100644 index 0000000..7d8a674 --- /dev/null +++ b/examples/snapshot_demo/models/staging/users_clean.ff.sql @@ -0,0 +1,27 @@ +{{ config( + materialized='table', + tags=[ + 'example:snapshot_demo', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark', + ], +) }} + +with raw_users as ( + select + cast(id as integer) as user_id, + lower(email) as email, + cast(signup_date as date) as signup_date + from {{ source('crm', 'users') }} +) + +select + user_id, + email, + regexp_replace(email, '^.*@', '') as email_domain, + signup_date +from raw_users; diff --git a/examples/snapshot_demo/profiles.yml b/examples/snapshot_demo/profiles.yml new file mode 100644 index 0000000..c23c3bd --- /dev/null +++ b/examples/snapshot_demo/profiles.yml @@ -0,0 +1,70 @@ +# Connection profiles for the snapshot demo. 
+ +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/snapshot_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'snapshot_demo') }}" + +dev_databricks_delta: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'snapshot_demo') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + extra_conf: + spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" + spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" + spark.hadoop.datanucleus.schema.autoCreateAll: "true" + spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_databricks_iceberg: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'snapshot_demo') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + table_format: "iceberg" + extra_conf: + spark.jars.packages: "org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.0" + spark.sql.catalog.iceberg: "org.apache.iceberg.spark.SparkCatalog" + spark.sql.catalog.iceberg.type: "hadoop" + spark.sql.catalog.iceberg.warehouse: "file://{{ project_dir() }}/.local/iceberg_warehouse" + spark.sql.catalog.iceberg.write.metadata.version-hint.enabled: "false" + spark.sql.catalog.iceberg.read.metadata.version-hint.enabled: "false" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'snapshot_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + # allow_create_dataset: true # uncomment to auto-create dataset on 
first run + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'snapshot_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'SNAPSHOT_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/snapshot_demo/project.yml b/examples/snapshot_demo/project.yml new file mode 100644 index 0000000..6ce1031 --- /dev/null +++ b/examples/snapshot_demo/project.yml @@ -0,0 +1,119 @@ +name: snapshot_demo +version: "0.1" +models_dir: models + +docs: + dag_dir: site/dag + +vars: {} + +models: + storage: + mart_users_by_domain.ff: + path: ".local/spark/events_base" + # format: parquet + users_clean.ff: + path: ".local/spark/users_clean" + # format: parquet + users_clean_snapshot.ff: + path: ".local/spark/users_clean_snapshot" + # format: parquet + mart_users_by_domain_snapshot.ff: + path: ".local/spark/mart_users_by_domain_snapshot" + # format: parquet + +tests: + # -------------------------------------------------------------------------- + # Base tests (similar to basic_demo, slightly simplified) + # -------------------------------------------------------------------------- + - type: not_null + table: users_clean + column: email_domain + tags: [example_snapshot_demo] + + - type: unique + table: users_clean + column: user_id + tags: [example_snapshot_demo] + + - type: unique + table: mart_users_by_domain + column: email_domain + tags: [example_snapshot_demo] + + # 
-------------------------------------------------------------------------- + # Snapshot tables – basic sanity on business keys + # -------------------------------------------------------------------------- + - type: not_null + table: users_clean_snapshot + column: user_id + tags: [example_snapshot_demo] + + - type: not_null + table: mart_users_by_domain_snapshot + column: email_domain + tags: [example_snapshot_demo] + + # -------------------------------------------------------------------------- + # Snapshot tables – snapshot metadata columns + # + # Assumes snapshot columns: + # _ff_valid_from – version start + # _ff_valid_to – version end (nullable for current rows) + # _ff_is_current – boolean flag for open row + # _ff_updated_at – source updated_at used for snapshotting + # + # Adjust the column names if your BaseExecutor constants differ. + # -------------------------------------------------------------------------- + + # users_clean_snapshot: every row must have a valid_from timestamp + - type: not_null + table: users_clean_snapshot + column: _ff_valid_from + tags: [example_snapshot_demo] + + # users_clean_snapshot: is_current flag should always be set (true/false) + - type: not_null + table: users_clean_snapshot + column: _ff_is_current + tags: [example_snapshot_demo] + + # users_clean_snapshot: updated_at metadata should be populated + - type: not_null + table: users_clean_snapshot + column: _ff_updated_at + tags: [example_snapshot_demo] + + # mart_users_by_domain_snapshot: every row must have a valid_from timestamp + - type: not_null + table: mart_users_by_domain_snapshot + column: _ff_valid_from + tags: [example_snapshot_demo] + + # mart_users_by_domain_snapshot: is_current flag should always be set (true/false) + - type: not_null + table: mart_users_by_domain_snapshot + column: _ff_is_current + tags: [example_snapshot_demo] + + # mart_users_by_domain_snapshot: updated_at metadata should be populated + - type: not_null + table: 
mart_users_by_domain_snapshot + column: _ff_updated_at + tags: [example_snapshot_demo] + + # -------------------------------------------------------------------------- + # Optional: row-count sanity for snapshots + # (for the demo data – tweak or remove if you change the seed) + # -------------------------------------------------------------------------- + - type: row_count_between + table: users_clean_snapshot + min_rows: 3 + max_rows: 50 + tags: [example_snapshot_demo] + + - type: row_count_between + table: mart_users_by_domain_snapshot + min_rows: 1 + max_rows: 50 + tags: [example_snapshot_demo] diff --git a/examples/snapshot_demo/seeds/README.md b/examples/snapshot_demo/seeds/README.md new file mode 100644 index 0000000..2e553ed --- /dev/null +++ b/examples/snapshot_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory + +Add CSV or Parquet files for reproducible seeds. +Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. diff --git a/examples/snapshot_demo/seeds/seed_users.csv b/examples/snapshot_demo/seeds/seed_users.csv new file mode 100644 index 0000000..e890383 --- /dev/null +++ b/examples/snapshot_demo/seeds/seed_users.csv @@ -0,0 +1,4 @@ +id,email,signup_date +1,anna@example.com,2024-01-05 +2,bob@example.net,2024-02-11 +3,cara@example.org,2024-02-27 diff --git a/examples/snapshot_demo/sources.yml b/examples/snapshot_demo/sources.yml new file mode 100644 index 0000000..d48deca --- /dev/null +++ b/examples/snapshot_demo/sources.yml @@ -0,0 +1,8 @@ +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users + description: Three sample users that populate the seed table. 
diff --git a/examples/snapshot_demo/tests/dq/README.md b/examples/snapshot_demo/tests/dq/README.md new file mode 100644 index 0000000..1acd01d --- /dev/null +++ b/examples/snapshot_demo/tests/dq/README.md @@ -0,0 +1,4 @@ +# Data quality tests + +Store custom data-quality tests that run via `fft test` (docs/Data_Quality_Tests.md). +Use this directory for schema-bound tests separate from unit specs. diff --git a/examples/snapshot_demo/tests/unit/README.md b/examples/snapshot_demo/tests/unit/README.md new file mode 100644 index 0000000..74f4550 --- /dev/null +++ b/examples/snapshot_demo/tests/unit/README.md @@ -0,0 +1,12 @@ +# Unit tests (snapshot_demo) + +Add YAML unit specs for fine-grained expectations, for example: + +- that `users_clean_snapshot` has at least N rows +- that each `user_id` has at most one open (`_ff_valid_to IS NULL`) row + +Invoke via: + +```bash +fft utest . --env dev_duckdb +``` \ No newline at end of file diff --git a/examples_article/http_cache_demo/README.md b/examples_article/http_cache_demo/README.md new file mode 100644 index 0000000..5e977f7 --- /dev/null +++ b/examples_article/http_cache_demo/README.md @@ -0,0 +1,7 @@ +# FastFlowTransform project scaffold + +This project was created with `fft init`. +Next steps: +1. Update `profiles.yml` with real connection details (docs/Profiles.md). +2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). +3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). diff --git a/examples_article/http_cache_demo/docs/README.md b/examples_article/http_cache_demo/docs/README.md new file mode 100644 index 0000000..69e73e7 --- /dev/null +++ b/examples_article/http_cache_demo/docs/README.md @@ -0,0 +1,4 @@ +# Project documentation + +Write operator or contributor notes here and keep them in sync with generated docs. +See docs/Technical_Overview.md#auto-docs-and-lineage for `fft dag` / `fft docgen` guidance.
diff --git a/examples_article/http_cache_demo/models/README.md b/examples_article/http_cache_demo/models/README.md new file mode 100644 index 0000000..32818bb --- /dev/null +++ b/examples_article/http_cache_demo/models/README.md @@ -0,0 +1,4 @@ +# Models directory + +Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here. +See docs/Config_and_Macros.md for modeling guidance and config options. diff --git a/examples_article/http_cache_demo/models/todo_ingest.ff.py b/examples_article/http_cache_demo/models/todo_ingest.ff.py new file mode 100644 index 0000000..3c5dcc2 --- /dev/null +++ b/examples_article/http_cache_demo/models/todo_ingest.ff.py @@ -0,0 +1,45 @@ +import pandas as pd +from fastflowtransform import model +from fastflowtransform.api.http import get_df + + +# 1. Define the Paginator +# This function runs after every request to determine what to do next. +def offset_paginator(url, params, response_json): + # If the API returns an empty list, we are done. + if not response_json: + return None + + # Otherwise, increment the page number + current_page = params.get("_page", 1) + if current_page >= 2: + return None + next_params = dict(params or {}) + next_params["_page"] = current_page + 1 + return {"next_request": {"params": next_params}} + + +@model(name="todos_ingest") +def fetch_todos() -> pd.DataFrame: + # 2. get_df handles the HTTP calls, caching, and conversion + df = get_df( + url="https://jsonplaceholder.typicode.com/todos", + params={"_page": 1, "_limit": 10}, # Start at page 1 + paginator=offset_paginator, + # record_path is None because the root of the JSON is the list itself + record_path=None, + ) + + # 3. Apply transformation logic + # If we change THIS logic later, FFT won't re-fetch the API! 
+ + # Example: Mark high-priority items locally + df["priority"] = df["title"].apply(lambda x: "HIGH" if "delectus" in x else "NORMAL") + + # New Logic: Filter rows + df = df[df["completed"] == False] + + # New Logic: Uppercase titles + df["title"] = df["title"].str.upper() + + return df diff --git a/examples_article/http_cache_demo/profiles.yml b/examples_article/http_cache_demo/profiles.yml new file mode 100644 index 0000000..d2a5bf7 --- /dev/null +++ b/examples_article/http_cache_demo/profiles.yml @@ -0,0 +1,13 @@ +# Profiles generated by `fft init`. +# Update these placeholders as described in docs/Profiles.md. +dev: + engine: duckdb + # DuckDB profile example. See docs/Profiles.md#engines-and-sections for details. + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" # Path to your DuckDB database file. + +# Default in-memory profile for quick experiments. +default: + engine: duckdb + duckdb: + path: ":memory:" diff --git a/examples_article/http_cache_demo/project.yml b/examples_article/http_cache_demo/project.yml new file mode 100644 index 0000000..16a77d0 --- /dev/null +++ b/examples_article/http_cache_demo/project.yml @@ -0,0 +1,18 @@ +# Project configuration generated by `fft init`. +# Read docs/Project_Config.md for the complete reference. +name: http_cache_demo +version: "0.1" +models_dir: models + +docs: + # Adjust `dag_dir` to change where `fft dag --html` writes documentation (docs/Technical_Overview.md#auto-docs-and-lineage). + dag_dir: site/dag + +# Project-level variables accessible via {{ var('key') }} inside models. +# Example: +# vars: +# run_date: "2024-01-01" +vars: {} + +# Declare project-wide data quality checks under `tests`. See docs/Data_Quality_Tests.md. 
+tests: [] diff --git a/examples_article/http_cache_demo/seeds/README.md b/examples_article/http_cache_demo/seeds/README.md new file mode 100644 index 0000000..2e553ed --- /dev/null +++ b/examples_article/http_cache_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory + +Add CSV or Parquet files for reproducible seeds. +Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. diff --git a/examples_article/http_cache_demo/sources.yml b/examples_article/http_cache_demo/sources.yml new file mode 100644 index 0000000..83436dc --- /dev/null +++ b/examples_article/http_cache_demo/sources.yml @@ -0,0 +1,9 @@ +# Source declarations describe external tables. See docs/Sources.md for details. +version: 2 +# sources: + # Example: + # - name: raw + # schema: staging + # tables: + # - name: users + # identifier: seed_users diff --git a/examples_article/http_cache_demo/tests/dq/README.md b/examples_article/http_cache_demo/tests/dq/README.md new file mode 100644 index 0000000..1acd01d --- /dev/null +++ b/examples_article/http_cache_demo/tests/dq/README.md @@ -0,0 +1,4 @@ +# Data quality tests + +Store custom data-quality tests that run via `fft test` (docs/Data_Quality_Tests.md). +Use this directory for schema-bound tests separate from unit specs. diff --git a/examples_article/http_cache_demo/tests/unit/README.md b/examples_article/http_cache_demo/tests/unit/README.md new file mode 100644 index 0000000..b3c3c8d --- /dev/null +++ b/examples_article/http_cache_demo/tests/unit/README.md @@ -0,0 +1,4 @@ +# Unit tests + +Define YAML unit specs as described in docs/Config_and_Macros.md#73-model-unit-tests-fft-utest. +Invoke them with `fft utest --env <env>`. 
diff --git a/mkdocs.yml b/mkdocs.yml index 1d17581..cfc4781 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,11 +32,11 @@ nav: - State Selection: State_Selection.md - YAML Tests: YAML_Tests.md - Data Quality Tests: Data_Quality_Tests.md - - API Reference: reference/ - CLI Guide: CLI_Guide.md - Auto Docs: Auto_Docs.md - Logging: Logging.md - Unit Tests: Unit_Tests.md + - Snapshots: Snapshots.md - Troubleshooting: Troubleshooting.md - Examples: - Basic Demo: examples/Basic_Demo.md @@ -48,6 +48,8 @@ nav: - Incremental Demo: examples/Incremental_Demo.md - API Demo: examples/API_Demo.md - Local Engine Setup: examples/Local_Engine_Setup.md + - Snapshot Demo: examples/Snapshot_Demo.md + - API Reference: reference/ - Contributing: Contributing.md - License: License.md diff --git a/src/fastflowtransform/api/context.py b/src/fastflowtransform/api/context.py index 08937da..ff267e2 100644 --- a/src/fastflowtransform/api/context.py +++ b/src/fastflowtransform/api/context.py @@ -136,7 +136,7 @@ def record( if content_hash and (not stats.hashes or stats.hashes[-1] != content_hash): stats.hashes.append(content_hash) - stats.used_offline = bool(used_offline) + stats.used_offline = bool(stats.used_offline or used_offline) def snapshot() -> dict[str, Any]: diff --git a/src/fastflowtransform/api/http.py b/src/fastflowtransform/api/http.py index 75cab55..0abae81 100644 --- a/src/fastflowtransform/api/http.py +++ b/src/fastflowtransform/api/http.py @@ -133,6 +133,21 @@ def _write_cache(key: str, status: int, headers: dict, body: bytes, url: str) -> return meta +def _maybe_json_payload(body: bytes) -> Any: + with suppress(Exception): + return json.loads(body.decode("utf-8")) + with suppress(Exception): + return json.loads(body) + return body + + +def _json_payload(body: bytes) -> Any: + try: + return json.loads(body.decode("utf-8")) + except Exception: + return json.loads(body) + + def _http_request( method: str, url: str, @@ -153,6 +168,115 @@ def _backoff_sleep(i: int) -> None: 
time.sleep(base + random.random() * 0.3 * base) +def _request_with_cache( + method: str, + url: str, + params: dict | None, + headers: dict | None, + ttl: int | None, + timeout: float | None, +) -> tuple[bytes, dict[str, Any]]: + hdrs = dict(headers or {}) + key = _cache_key(method, url, params, hdrs) + meta, body, hit = _read_cache(key, ttl) + if hit: + meta_dict: dict[str, Any] = meta or {} + payload = body or b"" + _ctx.record( + key, + meta_dict.get("content_hash", ""), + True, + len(payload), + used_offline=_OFFLINE, + ) + return payload, meta_dict + if _OFFLINE: + raise RuntimeError(f"HTTP offline mode - cache miss for {url}") + + tries = max(_DEF_MAX_RETRIES, 1) + for i in range(tries): + try: + status, resp_headers, resp_body = _http_request( + method, url, params=params, headers=hdrs, timeout=timeout + ) + except _HTTP.TimeoutException as exc: + if i < tries - 1: + _backoff_sleep(i) + continue + raise RuntimeError(f"HTTP timeout after {timeout or _DEF_TIMEOUT}s for {url}") from exc + except _HTTP.RequestError as exc: + if i < tries - 1: + _backoff_sleep(i) + continue + raise RuntimeError(f"HTTP request error for {url}: {exc}") from exc + if status in (429, 500, 502, 503, 504) and i < tries - 1: + ra = resp_headers.get("Retry-After") + if ra: + try: + time.sleep(float(ra)) + except Exception: + _backoff_sleep(i) + else: + _backoff_sleep(i) + continue + http_status_200 = 200 + http_status_300 = 300 + http_status_304 = 304 + if http_status_200 <= status < http_status_300 or status == http_status_304: + meta_out = _write_cache(key, status, resp_headers, resp_body, url) + _ctx.record( + key, + meta_out.get("content_hash", ""), + False, + len(resp_body), + used_offline=False, + ) + return resp_body, meta_out + raise RuntimeError(f"HTTP {status} for {url}") + raise RuntimeError(f"HTTP error after retries for {url}") + + +def _collect_pages( + method: str, + url: str, + params: dict | None, + headers: dict[str, Any], + ttl: int | None, + timeout: float | None, + 
paginator: Callable[[str, dict | None, Any], dict | None] | None, + *, + keep_payload: bool, + payload_factory: Callable[[bytes], Any] | None, +) -> list[tuple[bytes, dict[str, Any], Any]]: + cur_url = url + cur_params = params + cur_headers = dict(headers or {}) + pages: list[tuple[bytes, dict[str, Any], Any]] = [] + while True: + body, meta = _request_with_cache(method, cur_url, cur_params, cur_headers, ttl, timeout) + need_payload = keep_payload or paginator is not None + payload = None + if payload_factory and need_payload: + payload = payload_factory(body) + stored_payload = payload if keep_payload else None + pages.append((body, meta, stored_payload)) + if paginator is None: + break + nxt = paginator(cur_url, cur_params, payload) + if not nxt: + break + req = nxt.get("next_request") if isinstance(nxt, dict) else None + if not req: + break + cur_url = req.get("url") or cur_url + if "params" in req: + cur_params = req.get("params") + if "headers" in req: + nxt_headers = req.get("headers") + cur_headers = dict(nxt_headers) if nxt_headers is not None else {} + return pages + + # ---- Public API --------------------------------------------------------- def get( url: str, @@ -160,13 +284,15 @@ def get( params: dict | None = None, headers: dict | None = None, ttl: int | None = None, - paginator: Callable[[str, dict | None, dict], dict | None] | None = None, + paginator: Callable[[str, dict | None, Any], dict | None] | None = None, timeout: float | None = None, -) -> bytes: +) -> bytes | list[bytes]: """ Raw GET with optional FS cache and simple pagination. If paginator is provided, it should return {"next_request": {"url": "...", "params": {...}}} or None. + When pagination is active the result is a list of response bodies; otherwise + a single bytes object is returned. 
""" if not _domain_ok(url): raise RuntimeError(f"HTTP domain not allowed by FF_HTTP_ALLOWED_DOMAINS: {url}") @@ -174,70 +300,22 @@ def get( ttl = _DEF_TTL if ttl is None else ttl headers = dict(headers or {}) - def _one(method: str, url_: str, params_: dict | None) -> tuple[bytes, dict]: - key = _cache_key(method, url_, params_, headers) - meta, body, hit = _read_cache(key, ttl) - if hit: - # meta can be None -> normalize to empty dict before accessing .get - meta_dict = meta or {} - _ctx.record( - key, meta_dict.get("content_hash", ""), True, len(body or b""), used_offline=True - ) - return body or b"", meta_dict - if _OFFLINE: - raise RuntimeError(f"HTTP offline mode - cache miss for {url_}") - - tries = max(_DEF_MAX_RETRIES, 1) - for i in range(tries): - try: - status, resp_headers, resp_body = _http_request( - method, url_, params=params_, headers=headers, timeout=timeout - ) - except _HTTP.TimeoutException as exc: - if i < tries - 1: - _backoff_sleep(i) - continue - raise RuntimeError( - f"HTTP timeout after {timeout or _DEF_TIMEOUT}s for {url_}" - ) from exc - except _HTTP.RequestError as exc: - if i < tries - 1: - _backoff_sleep(i) - continue - raise RuntimeError(f"HTTP request error for {url_}: {exc}") from exc - if status in (429, 500, 502, 503, 504) and i < tries - 1: - # honor Retry-After (seconds) if present - ra = resp_headers.get("Retry-After") - if ra: - try: - time.sleep(float(ra)) - except Exception: - _backoff_sleep(i) - else: - _backoff_sleep(i) - continue - # write cache for any success or 304 - http_status_200 = 200 - http_status_300 = 300 - http_status_304 = 304 - if http_status_200 <= status < http_status_300 or status == http_status_304: - meta = _write_cache(key, status, resp_headers, resp_body, url_) - _ctx.record( - key, meta.get("content_hash", ""), False, len(resp_body), used_offline=False - ) - return resp_body, meta - raise RuntimeError(f"HTTP {status} for {url_}") - # should not reach - raise RuntimeError(f"HTTP error after retries 
for {url_}") - - body, _ = _one("GET", url, params) - if not paginator: + if paginator is None: + body, _ = _request_with_cache("GET", url, params, headers, ttl, timeout) return body - # paginate: concatenated bytes are not helpful - # → collect JSON pages and join later in get_json/get_df - # Here we just return the first page; get_json/get_df implement paging across JSON. - return body + pages = _collect_pages( + "GET", + url, + params, + headers, + ttl, + timeout, + paginator, + keep_payload=False, + payload_factory=_maybe_json_payload, + ) + return [body for body, _, _ in pages] def get_json( @@ -246,37 +324,25 @@ def get_json( params: dict | None = None, headers: dict | None = None, ttl: int | None = None, - paginator: Callable[[str, dict | None, dict], dict | None] | None = None, + paginator: Callable[[str, dict | None, Any], dict | None] | None = None, timeout: float | None = None, ) -> Any: """GET returning parsed JSON. If paginator is provided, it follows pages via callback.""" ttl = _DEF_TTL if ttl is None else ttl headers = dict(headers or {}) - - def _load_one(u: str, p: dict | None) -> tuple[Any, dict]: - raw = get(u, params=p, headers=headers, ttl=ttl, paginator=None, timeout=timeout) - try: - js = json.loads(raw.decode("utf-8")) - except Exception: - js = json.loads(raw) # if already str - return js, {} - - pages: list[Any] = [] - u, p = url, params - while True: - js, _ = _load_one(u, p) - pages.append(js) - if paginator is None: - break - nxt = paginator(u, p, js) - if not nxt: - break - req = nxt.get("next_request") - if not req: - break - u = req.get("url") or u - p = req.get("params") - return pages[0] if paginator is None else pages + pages = _collect_pages( + "GET", + url, + params, + headers, + ttl, + timeout, + paginator, + keep_payload=True, + payload_factory=_json_payload, + ) + payloads = [payload for _, _, payload in pages] + return payloads[0] if paginator is None else payloads MetaEntry = str | list[str] @@ -293,7 +359,7 @@ def get_df( 
params: dict | None = None, headers: dict | None = None, ttl: int | None = None, - paginator: Callable[[str, dict | None, dict], dict | None] | None = None, + paginator: Callable[[str, dict | None, Any], dict | None] | None = None, json_path: list[str] | None = None, record_path: Sequence[str] | None = None, meta: MetaArgIn | None = None, diff --git a/src/fastflowtransform/cli/__init__.py b/src/fastflowtransform/cli/__init__.py index f3c8a2c..e67044c 100644 --- a/src/fastflowtransform/cli/__init__.py +++ b/src/fastflowtransform/cli/__init__.py @@ -62,6 +62,7 @@ _selected_subgraph_names, _selector, ) +from fastflowtransform.cli.snapshot_cmd import register as _register_snapshot, snapshot from fastflowtransform.cli.sync_db_comments_cmd import ( _pg_fq_table, _pg_quote_ident, @@ -128,6 +129,7 @@ def main( _register_docgen(app) _register_sync_db_comments(app) _register_init(app) +_register_snapshot(app) __all__ = [ @@ -191,6 +193,7 @@ def main( "run", "schedule", "seed", + "snapshot", "sync_db_comments", "test", "topo_sort", diff --git a/src/fastflowtransform/cli/init_cmd.py b/src/fastflowtransform/cli/init_cmd.py index 194ff4a..b6c42dd 100644 --- a/src/fastflowtransform/cli/init_cmd.py +++ b/src/fastflowtransform/cli/init_cmd.py @@ -11,7 +11,6 @@ "duckdb", "postgres", "bigquery", - "bigquery_bf", "databricks_spark", "snowflake_snowpark", } @@ -28,47 +27,51 @@ class _InitContext: def _build_profiles_yaml(ctx: _InitContext) -> str: engine_block = { "duckdb": [ - " # DuckDB profile example. See docs/Profiles.md#duckdb for details.", + " # DuckDB profile example. See docs/Profiles.md#engines-and-sections for details.", " duckdb:", " path: \"{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}\" # Path to your DuckDB database file.", # Noqa E501 ], "postgres": [ - " # Postgres profile example. See docs/Profiles.md#postgres for required keys.", + " # Postgres profile example. 
See docs/Profiles.md#engines-and-sections " + " for required keys.", " postgres:", " dsn: \"{{ env('FF_PG_DSN') }}\" # Full Postgres DSN, e.g. postgresql://user:pass@host/db", " db_schema: \"{{ env('FF_PG_SCHEMA', 'analytics') }}\"", ], "bigquery": [ - " # BigQuery profile example. See docs/Profiles.md#bigquery.", + " # BigQuery profile example. See docs/Profiles.md#engines-and-sections.", " bigquery:", - " project: \"{{ env('FF_BQ_PROJECT') }}\" # GCP project id.", + " project: \"{{ env('FF_BQ_PROJECT') }}\" # Optional if your ADC " + " default project is set.", " dataset: \"{{ env('FF_BQ_DATASET') }}\" # Target dataset for models.", - " location: US # Update to match your dataset location.", - ], - "bigquery_bf": [ - " # BigQuery BigFrames profile example. See docs/Profiles.md#bigquery.", - " bigquery_bf:", - " project: \"{{ env('FF_BQ_PROJECT') }}\"", - " dataset: \"{{ env('FF_BQ_DATASET') }}\"", - " location: US", + " location: \"{{ env('FF_BQ_LOCATION', 'US') }}\" # Must match dataset location.", + " use_bigframes: true # Run Python models through BigQuery DataFrames (BigFrames).", + " allow_create_dataset: false # Set true to auto-create the dataset on first run.", ], "databricks_spark": [ - " # Databricks Spark profile example. See docs/Profiles.md#databricks-spark.", + " # Databricks Spark profile example. See docs/Profiles.md#engines-and-sections.", " databricks_spark:", " master: \"{{ env('FF_SPARK_MASTER') }}\" # e.g. 
spark://host:7077 or a Databricks cluster URL.", # Noqa E501 " app_name: \"{{ env('FF_SPARK_APP_NAME', 'fft-project') }}\"", " warehouse_dir: \"{{ env('FF_SPARK_WAREHOUSE', '/tmp/fft-warehouse') }}\"", " use_hive_metastore: false", + " extra_conf: {} # Provide Spark conf overrides here.", + " catalog: \"{{ env('FF_SPARK_CATALOG', '') }}\" # Unity catalog (optional).", + " database: \"{{ env('FF_SPARK_DATABASE', 'default') }}\"", + " table_format: \"{{ env('FF_SPARK_TABLE_FORMAT', 'parquet') }}\"", + " table_options: {}", ], "snowflake_snowpark": [ - " # Snowflake Snowpark profile example. See docs/Profiles.md#snowflake-snowpark.", + " # Snowflake Snowpark profile example. See docs/Profiles.md#engines-and-sections.", " snowflake_snowpark:", " account: \"{{ env('FF_SF_ACCOUNT') }}\"", " user: \"{{ env('FF_SF_USER') }}\"", " password: \"{{ env('FF_SF_PASSWORD') }}\"", " warehouse: \"{{ env('FF_SF_WAREHOUSE') }}\"", " database: \"{{ env('FF_SF_DATABASE') }}\"", - " db_schema: \"{{ env('FF_SF_SCHEMA', 'PUBLIC') }}\"", + " schema: \"{{ env('FF_SF_SCHEMA', 'PUBLIC') }}\"", + " role: \"{{ env('FF_SF_ROLE') }}\"", + " allow_create_schema: true", ], }[ctx.engine] @@ -100,7 +103,7 @@ def _build_project_yaml(ctx: _InitContext) -> str: "", "docs:", " # Adjust `dag_dir` to change where `fft dag --html` writes documentation " - "(docs/Technical_Overview.md#documentation).", + "(docs/Technical_Overview.md#auto-docs-and-lineage).", " dag_dir: site/dag", "", "# Project-level variables accessible via {{ var('key') }} inside models.", @@ -122,7 +125,7 @@ def _build_sources_yaml() -> str: [ "# Source declarations describe external tables. 
See docs/Sources.md for details.", "version: 2", - "sources:", + "# sources:", " # Example:", " # - name: raw", " # schema: staging", @@ -170,13 +173,23 @@ def _create_directory_notes(target: Path) -> None: "", ] ), + "tests/dq/README.md": "\n".join( + [ + "# Data quality tests", + "", + "Store custom data-quality tests that run via `fft test` " + "(docs/Data_Quality_Tests.md).", + "Use this directory for schema-bound tests separate from unit specs.", + "", + ] + ), "docs/README.md": "\n".join( [ "# Project documentation", "", "Write operator or contributor notes here and keep " "them in sync with generated docs.", - "See docs/Technical_Overview.md#documentation " + "See docs/Technical_Overview.md#auto-docs-and-lineage " "for `fft dag` / `fft docgen` guidance.", "", ] @@ -220,7 +233,7 @@ def init( "--engine", help=( "Executor engine for the default profile. " - "Supported values: duckdb, postgres, bigquery, bigquery_bf, " + "Supported values: duckdb, postgres, bigquery, " "databricks_spark, snowflake_snowpark." 
), ), @@ -261,7 +274,7 @@ def init( engine=resolved_engine, ) - for sub in ("models", "seeds", "tests/unit", "docs"): + for sub in ("models", "seeds", "tests/unit", "tests/dq", "docs"): (project_dir / sub).mkdir(parents=True, exist_ok=True) _write_file(project_dir / "project.yml", _build_project_yaml(ctx)) diff --git a/src/fastflowtransform/cli/run.py b/src/fastflowtransform/cli/run.py index 0429d49..3930ffa 100644 --- a/src/fastflowtransform/cli/run.py +++ b/src/fastflowtransform/cli/run.py @@ -327,9 +327,21 @@ def _normalize_node_names_or_warn(names: list[str] | None) -> set[str]: out: set[str] = set() for tok in _parse_select(names or []): try: - out.add(REGISTRY.get_node(tok).name) + node = REGISTRY.get_node(tok) except KeyError: warn(f"Unknown model in --rebuild: {tok}") + continue + + if _is_snapshot_model(node): + warn( + f"Ignoring snapshot model in --rebuild: {tok} " + "(snapshot models are not executed via 'fft run'; " + "use 'fft snapshot run' instead)." + ) + continue + + out.add(node.name) + return out @@ -344,6 +356,15 @@ def _abbr(e: str) -> str: return mapping.get(e, e.upper()[:4]) +def _is_snapshot_model(node: Any) -> bool: + """ + Return True if this node is a snapshot model (materialized='snapshot'). 
+ """ + meta = getattr(node, "meta", {}) or {} + mat = str(meta.get("materialized") or "").lower() + return mat == "snapshot" + + # ----------------- helpers (run function) ----------------- @@ -357,7 +378,11 @@ def _build_engine_ctx(project, env_name, engine, vars, cache, no_cache): def _select_predicate_and_raw( - executor_engine: _RunEngine, ctx: CLIContext, select: SelectOpt + executor_engine: _RunEngine, + ctx: CLIContext, + select: SelectOpt, + *, + include_snapshots: bool = False, ) -> tuple[list[str], Callable[[Any], bool], list[str]]: select_tokens = _parse_select(select or []) base_tokens = [t for t in select_tokens if not t.startswith("state:modified")] @@ -369,7 +394,13 @@ def _select_predicate_and_raw( modified_set = executor_engine.cache.modified_set(ctx.jinja_env, executor) select_pred = augment_with_state_modified(select_tokens, base_pred, modified_set) - raw_selected = [k for k, v in REGISTRY.nodes.items() if select_pred(v)] + raw_selected = [] + for k, v in REGISTRY.nodes.items(): + if not select_pred(v): + continue + if not include_snapshots and _is_snapshot_model(v): + continue + raw_selected.append(k) return select_tokens, select_pred, raw_selected diff --git a/src/fastflowtransform/cli/snapshot_cmd.py b/src/fastflowtransform/cli/snapshot_cmd.py new file mode 100644 index 0000000..d09e499 --- /dev/null +++ b/src/fastflowtransform/cli/snapshot_cmd.py @@ -0,0 +1,173 @@ +from __future__ import annotations + +import typer + +from fastflowtransform.cli.bootstrap import CLIContext, _prepare_context +from fastflowtransform.cli.options import ( + EngineOpt, + EnvOpt, + ExcludeOpt, + JobsOpt, + KeepOpt, + ProjectArg, + SelectOpt, + VarsOpt, +) +from fastflowtransform.cli.run import ( + CacheMode, + _attempt_catalog, + _emit_logs_and_errors, + _levels_for_run, + _run_schedule, + _RunEngine, + _select_predicate_and_raw, + _wanted_names, + _write_artifacts, +) +from fastflowtransform.core import REGISTRY, relation_for +from fastflowtransform.executors.base 
import BaseExecutor +from fastflowtransform.logging import bind_context, clear_context, echo + +snapshot = typer.Typer(help="Snapshot materialization commands.") + + +class _SnapshotRunEngine(_RunEngine): + """ + Variant of _RunEngine that calls executor.run_snapshot_sql(...) for + SQL nodes instead of the normal run_sql path. + """ + + def run_node(self, name: str) -> None: + node = REGISTRY.nodes[name] + ex, _run_sql_fn, _run_py_fn = self._get_runner() + if node.kind != "sql": + raise TypeError( + f"Snapshot run only supports SQL models, but node '{name}' is kind={node.kind!r}." + ) + # No fingerprint / cache skipping: snapshots always execute. + ex.run_snapshot_sql(node, self.ctx.jinja_env) + + +def _prune_snapshots( + executor: BaseExecutor, snapshot_names: set[str], keep_last: int, dry_run: bool +) -> None: + """ + Apply per-model pruning using executor.snapshot_prune(...) where available. + """ + for name in sorted(snapshot_names): + node = REGISTRY.nodes[name] + meta = getattr(node, "meta", {}) or {} + + unique_key = meta.get("unique_key") or meta.get("primary_key") or [] + unique_key_list = [unique_key] if isinstance(unique_key, str) else list(unique_key or []) + + if not unique_key_list: + echo(f"Skipping prune for {name}: missing unique_key/primary_key.") + continue + + if not hasattr(executor, "snapshot_prune"): + eng = getattr(executor, "engine_name", "unknown") + echo(f"Skipping prune for {name}: snapshot_prune not implemented for engine '{eng}'.") + continue + + rel = relation_for(name) + prefix = "[DRY-RUN] " if dry_run else "" + echo(f"{prefix}Pruning snapshot {name} (relation={rel}, keep_last={keep_last})") + executor.snapshot_prune(rel, unique_key_list, keep_last=keep_last, dry_run=dry_run) + + +@snapshot.command("run") +def snapshot_run( + project: ProjectArg = ".", + env_name: EnvOpt = "dev", + engine: EngineOpt = None, + vars: VarsOpt = None, + select: SelectOpt = None, + exclude: ExcludeOpt = None, + jobs: JobsOpt = 1, + keep_going: KeepOpt = 
False, + prune: bool = typer.Option( + False, + "--prune", + help="Prune historical snapshot rows after a successful run.", + ), + keep_last: int = typer.Option( + 3, + "--keep-last", + min=1, + help="Number of latest versions per key to keep when pruning.", + ), + dry_run: bool = typer.Option( + False, + "--dry-run", + help="Show pruning actions without modifying any data.", + ), +) -> None: + """ + Execute only snapshot models (materialized='snapshot'). + + Selection works like `fft run` but the final set is restricted to snapshot + models. Use --prune/--keep-last/--dry-run for retention. + """ + ctx: CLIContext = _prepare_context(project, env_name, engine, vars) + bind_context(engine=ctx.profile.engine, env=env_name) + + engine_ = _SnapshotRunEngine( + ctx=ctx, + pred=None, + env_name=env_name, + cache_mode=CacheMode.OFF, # snapshots always run; no cache skipping + force_rebuild=set(), + ) + + # Selection identical to run(), but we filter to snapshots afterwards. + select_tokens, _, raw_selected = _select_predicate_and_raw( + engine_, ctx, select, include_snapshots=True + ) + wanted_all = _wanted_names( + select_tokens=select_tokens, exclude=exclude, raw_selected=raw_selected + ) + + # Restrict to snapshot models only + snapshot_names: set[str] = { + name + for name in wanted_all + if (getattr(REGISTRY.nodes[name], "meta", {}) or {}).get("materialized") == "snapshot" + } + + if not snapshot_names: + typer.secho( + "Nothing to run (no snapshot models in selection).", + fg="yellow", + ) + clear_context() + raise typer.Exit(0) + + # Build DAG levels for the full wanted set so dependency validation still runs. + lvls_all = _levels_for_run([], wanted_all) + # Only execute snapshot nodes while preserving their relative order. 
+ lvls = [lvl for lvl in ([n for n in level if n in snapshot_names] for level in lvls_all) if lvl] + + result, logq, started_at, finished_at = _run_schedule(engine_, lvls, jobs, keep_going, ctx) + + _write_artifacts(ctx, result, started_at, finished_at, engine_) + _attempt_catalog(ctx) + _emit_logs_and_errors(logq, result, engine_) + + if result.failed: + clear_context() + raise typer.Exit(1) + + # Optional retention + if prune: + executor = engine_.shared[0] + _prune_snapshots(executor, snapshot_names, keep_last, dry_run) + + engine_.persist_on_success(result) + engine_.print_timings(result) + echo("✓ Snapshot run done") + clear_context() + + +def register(app: typer.Typer) -> None: + app.add_typer(snapshot, name="snapshot") diff --git a/src/fastflowtransform/cli/test_cmd.py b/src/fastflowtransform/cli/test_cmd.py index 8701e9d..382d0ea 100644 --- a/src/fastflowtransform/cli/test_cmd.py +++ b/src/fastflowtransform/cli/test_cmd.py @@ -49,6 +49,15 @@ class DQResult: example_sql: str | None = None +def _is_snapshot_model(node: Any) -> bool: + """ + Return True if this node is a snapshot model (materialized='snapshot'). 
+ """ + meta = getattr(node, "meta", {}) or {} + mat = str(meta.get("materialized") or "").lower() + return mat == "snapshot" + + def _print_model_error_block(node_name: str, relation: str, message: str, sql: str | None) -> None: header = "┌" + "─" * 70 footer = "└" + "─" * 70 @@ -99,7 +108,11 @@ def _run_models( before: Callable[[str, Any], None] | None = None, on_error: Callable[[str, Any, Exception], None] | None = None, ) -> None: - order = [n for n in topo_sort(REGISTRY.nodes) if pred(REGISTRY.nodes[n])] + order = [ + n + for n in topo_sort(REGISTRY.nodes) + if pred(REGISTRY.nodes[n]) and not _is_snapshot_model(REGISTRY.nodes[n]) + ] _execute_models(order, run_sql, run_py, before=before, on_error=on_error) diff --git a/src/fastflowtransform/config/models.py b/src/fastflowtransform/config/models.py index 7c3989e..f71ab6c 100644 --- a/src/fastflowtransform/config/models.py +++ b/src/fastflowtransform/config/models.py @@ -1,3 +1,4 @@ +# fastflowtransform/config/model.py from __future__ import annotations from collections.abc import Mapping, Sequence @@ -108,6 +109,40 @@ def _normalize_str_or_seq(cls, v: Any) -> list[str] | None: raise TypeError("must be a string or a sequence of strings") +class SnapshotConfig(BaseModel): + """ + Snapshot configuration block, for example: + + {{ config( + materialized='snapshot', + snapshot={ + "strategy": "timestamp", # or "check" + "updated_at": "updated_at", + "check_cols": ["col1", "col2"], # required for strategy='check' + }, + unique_key=["id"], + ) }} + """ + + model_config = ConfigDict(extra="forbid") + + strategy: Literal["timestamp", "check"] + updated_at: str | None = None + updated_at_column: str | None = None + check_cols: list[str] | None = None + + @field_validator("check_cols", mode="before") + @classmethod + def _normalize_check_cols(cls, v: Any) -> list[str] | None: + if v is None: + return None + if isinstance(v, str): + return [v] + if isinstance(v, Sequence) and not isinstance(v, (str, bytes)): + return 
[str(x) for x in v] + raise TypeError("check_cols must be a string or a sequence of strings") + + # --------------------------------------------------------------------------- # ModelConfig - canonical form of config(...) / decorator meta # --------------------------------------------------------------------------- @@ -132,7 +167,7 @@ class ModelConfig(BaseModel): # --- Core materialization & classification ----------------------------- - materialized: Literal["table", "view", "incremental", "ephemeral"] | None = None + materialized: Literal["table", "view", "incremental", "ephemeral", "snapshot"] | None = None # Optional logical kind; useful for selectors (kind:python / kind:sql / etc.) kind: str | None = None @@ -157,6 +192,9 @@ class ModelConfig(BaseModel): # - { ... IncrementalConfig fields ... } incremental: IncrementalConfig | None = None + # --- Snapshot configuration (structured) --------------------------------- + snapshot: SnapshotConfig | None = None + # Top-level shortcuts (backwards-compatible) # These are used by existing executor logic. unique_key: list[str] | None = None @@ -273,6 +311,14 @@ def _merge_incremental_overlays(self) -> ModelConfig: if self.delta and not self.delta_sql: self.delta_sql = self.delta.sql + # Mirror snapshot hints onto top-level shortcuts for backwards compatibility. 
+ snap = self.snapshot + if snap: + if self.updated_at is None and snap.updated_at is not None: + self.updated_at = snap.updated_at + if self.updated_at_column is None and snap.updated_at_column is not None: + self.updated_at_column = snap.updated_at_column + return self # ---------------------------------------------------------------------- @@ -291,19 +337,26 @@ def is_incremental_enabled(self) -> bool: # Cross-field guardrails (fail fast with clear messages) # ---------------------------------------------------------------------- @model_validator(mode="after") - def _validate_incremental_requirements(self) -> ModelConfig: + def _validate_model_requirements(self) -> ModelConfig: """ - Enforce combinations that must hold for incremental materializations. + Enforce combinations that must hold for incremental and snapshot models. - Rules: + Incremental rules: 1) If materialized == 'incremental', incremental must be effectively enabled. 2) If incremental is enabled, at least one freshness/delta hint must exist: - updated_at / updated_at_column / updated_at_columns / timestamp_columns OR delta_sql OR delta_python. 3) If both updated_at and updated_at_column are provided, they must match. - 4) (Opinionated) Require unique_key when incremental is enabled - to avoid accidental cartesian merges. Relax if your executor permits. + 4) Require unique_key when incremental is enabled. + + Snapshot rules: + 1) If materialized == 'snapshot', a snapshot config must be provided. + 2) Snapshot models require unique_key (or primary_key). + 3) strategy must be 'timestamp' or 'check'. + 4) For 'timestamp', require updated_at / updated_at_column. + 5) For 'check', require check_cols. """ + # --- Incremental --------------------------------------------------- is_mat_inc = self.materialized == "incremental" is_inc_enabled = self.is_incremental_enabled() @@ -348,6 +401,45 @@ def _validate_incremental_requirements(self) -> ModelConfig: "for safe merges. 
Example: unique_key: ['id']" ) + # --- Snapshot-specific rules -------------------------------------- + if self.materialized == "snapshot": + snap = self.snapshot + if snap is None: + raise ValueError( + "materialized='snapshot' requires a snapshot config block. " + "Example:\n" + " snapshot: { strategy: 'timestamp' }" + ) + + # business key + if not (self.unique_key or self.primary_key): + raise ValueError( + "materialized='snapshot' requires a unique_key (or primary_key). " + "Example: unique_key: ['id']" + ) + + # strategy is validated by SnapshotConfig (Literal), but we keep a guardrail here + if snap.strategy not in ("timestamp", "check"): + raise ValueError( + "Snapshot models require strategy='timestamp' or 'check'. " + "Example: snapshot: { strategy: 'timestamp' }" + ) + + # timestamp strategy: needs updated_at + snap_updated = snap.updated_at or snap.updated_at_column + if snap.strategy == "timestamp" and not snap_updated: + raise ValueError( + "strategy='timestamp' snapshots require snapshot.updated_at or " + "snapshot.updated_at_column." + ) + + # check strategy: needs check_cols + if snap.strategy == "check" and not snap.check_cols: + raise ValueError( + "strategy='check' snapshots require snapshot.check_cols " + "(string or list of column names)." 
__all__ = ["get_spark_functions", "get_spark_window"]


def _spark_missing_error(exc: Exception) -> RuntimeError:
    """
    Build the RuntimeError raised when a pyspark import fails.

    The callers catch every ``Exception`` raised by the import, not only
    ``ImportError``, so the original failure is appended to the message.
    Otherwise a broken installation would be reported as a missing one.
    """
    return RuntimeError(
        "pyspark is required for Spark/Databricks executors. "
        "Install the extra: fastflowtransform[spark]. "
        f"(import failed with {type(exc).__name__}: {exc})"
    )


if TYPE_CHECKING:  # pragma: no cover - typing only
    # We import these only for static typing.
    from pyspark.sql import (
        Window,
    )

    # `Window` itself is a class with static constructors (partitionBy, orderBy, ...),
    # so using it directly as the return type is fine.
    def get_spark_window() -> type[Window]:  # Window is a class
        ...

    # `functions` is a module; for typing purposes we just expose it as ModuleType.
    def get_spark_functions() -> ModuleType: ...

else:
    # Runtime implementations - no need to annotate; type-checkers use the stubs above.
    def get_spark_window():
        """
        Lazy import for pyspark.sql.Window.

        Returns:
            The ``pyspark.sql.Window`` class.

        Raises:
            RuntimeError: if pyspark is not installed or import fails
                (the original exception is chained as ``__cause__``).
        """
        try:
            from pyspark.sql import Window  # noqa: PLC0415
        except Exception as exc:  # pragma: no cover
            raise _spark_missing_error(exc) from exc
        return Window

    def get_spark_functions():
        """
        Lazy import for pyspark.sql.functions as F.

        Returns:
            The ``pyspark.sql.functions`` module.

        Raises:
            RuntimeError: if pyspark is not installed or import fails
                (the original exception is chained as ``__cause__``).
        """
        try:
            from pyspark.sql import functions as F  # noqa: PLC0415
        except Exception as exc:  # pragma: no cover
            raise _spark_missing_error(exc) from exc
        return F
def run_snapshot_sql(self, node: Node, env: Environment) -> None:
    """
    Run a SQL model whose materialization is 'snapshot'.

    This base version never executes anything: it always raises, naming the
    engine, so an executor that does not override it fails with a clear
    message.

    Raises:
        NotImplementedError: always.
    """
    message = f"Snapshot materialization is not implemented for engine '{self.engine_name}'."
    raise NotImplementedError(message)
@@ -816,6 +861,36 @@ def _meta_is_incremental(meta: Mapping[str, Any] | None) -> bool: # Fallback: any non-empty incremental value is treated as "enabled". return bool(incremental_cfg) + # ── Snapshot API ────────────────────────────────────────────────── + def snapshot_prune( + self, + relation: str, + unique_key: list[str], + keep_last: int, + *, + dry_run: bool = False, + ) -> None: # pragma: no cover - abstract + """ + Prune old snapshot versions for the given relation. + + Engines may implement this in a best-effort manner. Default: not supported. + """ + raise NotImplementedError( + f"Snapshot pruning is not implemented for engine '{self.engine_name}'." + ) + + @staticmethod + def _meta_is_snapshot(meta: Mapping[str, Any] | None) -> bool: + """ + Return True if the given meta mapping describes a snapshot model. + + For now we define snapshots purely by materialized='snapshot'. + """ + if not meta: + return False + materialized = str(meta.get("materialized") or "").lower() + return materialized == "snapshot" + ENGINE_NAME: str = "generic" @property diff --git a/src/fastflowtransform/executors/bigquery/base.py b/src/fastflowtransform/executors/bigquery/base.py index bae74a9..394dab2 100644 --- a/src/fastflowtransform/executors/bigquery/base.py +++ b/src/fastflowtransform/executors/bigquery/base.py @@ -7,7 +7,9 @@ from fastflowtransform.executors._shims import BigQueryConnShim from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.executors.bigquery._bigquery_mixin import BigQueryIdentifierMixin +from fastflowtransform.logging import echo from fastflowtransform.meta import ensure_meta_table, upsert_meta +from fastflowtransform.snapshots import resolve_snapshot_config from fastflowtransform.typing import BadRequest, Client, NotFound, bigquery TFrame = TypeVar("TFrame") @@ -275,3 +277,226 @@ def alter_table_sync_schema( f"ALTER TABLE {target} ADD COLUMN {col} {typ}", location=self.location, ).result() + + # ── Snapshots API (shared for 
# ── Snapshots API (shared for pandas + BigFrames) ─────────────────────
def run_snapshot_sql(self, node: Node, env: Any) -> None:
    """
    Snapshot materialization for BigQuery SQL models.

    - First run (target relation does not exist): create the table from the
      model's SELECT plus the snapshot metadata columns.
    - Subsequent runs:
        * close changed current rows (set valid_to, is_current=false)
        * insert new current rows for new/changed keys.

    A row counts as changed when
      * strategy 'timestamp': the source ``updated_at`` value is greater
        than the stored ``_ff_updated_at``;
      * strategy 'check': the MD5 hash over ``check_cols`` differs from the
        stored ``_ff_snapshot_hash``.

    Raises:
        TypeError: if ``node`` is not a SQL model.
        ValueError: if the node is not a snapshot model, ``unique_key`` is
            empty, a 'timestamp' snapshot has no ``updated_at``, or a
            'check' snapshot has no ``check_cols``.
    """
    if node.kind != "sql":
        raise TypeError(
            f"Snapshot materialization is only supported for SQL models, "
            f"got kind={node.kind!r} for {node.name}."
        )

    meta = getattr(node, "meta", {}) or {}
    if not self._meta_is_snapshot(meta):
        raise ValueError(f"Node {node.name} is not configured with materialized='snapshot'.")

    cfg = resolve_snapshot_config(node, meta)
    strategy = cfg.strategy  # "timestamp" | "check"
    unique_key = cfg.unique_key  # list[str]
    updated_at = cfg.updated_at  # str | None
    check_cols = cfg.check_cols  # list[str]

    if not unique_key:
        raise ValueError(f"{node.path}: snapshot models require a non-empty unique_key list.")

    # Validate per-strategy inputs once, before any SQL is built, so the
    # first-run and incremental paths enforce the same rules. Without this,
    # an incremental 'check' run with no columns would render `MD5()`.
    if strategy == "timestamp":
        if not updated_at:
            raise ValueError(
                f"{node.path}: strategy='timestamp' snapshots require an updated_at column."
            )
    elif not check_cols:
        raise ValueError(
            f"{node.path}: strategy='check' snapshots require non-empty check_cols."
        )

    # ---- Render SQL and extract SELECT body ----
    sql_rendered = self.render_sql(
        node,
        env,
        ref_resolver=lambda name: self._resolve_ref(name, env),
        source_resolver=self._resolve_source,
    )
    sql_clean = self._strip_leading_config(sql_rendered).strip()
    body = self._selectable_body(sql_clean).rstrip(" ;\n\t")

    rel_name = relation_for(node.name)
    target = self._qualified_identifier(rel_name)

    vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL
    vt = BaseExecutor.SNAPSHOT_VALID_TO_COL
    is_cur = BaseExecutor.SNAPSHOT_IS_CURRENT_COL
    hash_col = BaseExecutor.SNAPSHOT_HASH_COL
    upd_meta = BaseExecutor.SNAPSHOT_UPDATED_AT_COL

    self._ensure_dataset()

    # Hash over check_cols (NULLs folded to ''), shared by both paths.
    hash_expr = ""
    if strategy != "timestamp":
        col_exprs = [f"COALESCE(CAST(s.{col} AS STRING), '')" for col in check_cols]
        concat_expr = " || '||' || ".join(col_exprs)
        hash_expr = f"TO_HEX(MD5({concat_expr}))"

    # ---- First run: create snapshot table ----
    if not self.exists_relation(rel_name):
        if strategy == "timestamp":
            create_sql = f"""
CREATE TABLE {target} AS
SELECT
    s.*,
    s.{updated_at} AS {upd_meta},
    s.{updated_at} AS {vf},
    CAST(NULL AS TIMESTAMP) AS {vt},
    TRUE AS {is_cur},
    CAST(NULL AS STRING) AS {hash_col}
FROM ({body}) AS s
"""
        else:  # strategy == "check"
            upd_expr = f"s.{updated_at}" if updated_at else "CURRENT_TIMESTAMP()"
            create_sql = f"""
CREATE TABLE {target} AS
SELECT
    s.*,
    {upd_expr} AS {upd_meta},
    CURRENT_TIMESTAMP() AS {vf},
    CAST(NULL AS TIMESTAMP) AS {vt},
    TRUE AS {is_cur},
    {hash_expr} AS {hash_col}
FROM ({body}) AS s
"""
        self.client.query(create_sql, location=self.location).result()
        return

    # ---- Incremental snapshot update ----
    keys_pred = " AND ".join([f"t.{k} = s.{k}" for k in unique_key])

    if strategy == "timestamp":
        change_condition = f"s.{updated_at} > t.{upd_meta}"
        new_upd_expr = f"s.{updated_at}"
        new_valid_from_expr = f"s.{updated_at}"
        new_hash_expr = "NULL"
    else:
        change_condition = f"COALESCE({hash_expr}, '') <> COALESCE(t.{hash_col}, '')"
        new_upd_expr = f"s.{updated_at}" if updated_at else "CURRENT_TIMESTAMP()"
        new_valid_from_expr = "CURRENT_TIMESTAMP()"
        new_hash_expr = hash_expr

    # 1) Close changed current rows
    close_sql = f"""
UPDATE {target} AS t
SET
    {vt} = CURRENT_TIMESTAMP(),
    {is_cur} = FALSE
FROM ({body}) AS s
WHERE
    {keys_pred}
    AND t.{is_cur} = TRUE
    AND {change_condition}
"""
    self.client.query(close_sql, location=self.location).result()

    # 2) Insert new current versions (new keys or changed rows).
    #    Rows closed in step 1 no longer have a current row, so the LEFT JOIN
    #    yields NULLs for them and they are picked up by the IS NULL test.
    first_key = unique_key[0]
    insert_sql = f"""
INSERT INTO {target}
SELECT
    s.*,
    {new_upd_expr} AS {upd_meta},
    {new_valid_from_expr} AS {vf},
    CAST(NULL AS TIMESTAMP) AS {vt},
    TRUE AS {is_cur},
    {new_hash_expr} AS {hash_col}
FROM ({body}) AS s
LEFT JOIN {target} AS t
    ON {keys_pred}
    AND t.{is_cur} = TRUE
WHERE
    t.{first_key} IS NULL
    OR {change_condition}
"""
    self.client.query(insert_sql, location=self.location).result()


def snapshot_prune(
    self,
    relation: str,
    unique_key: list[str],
    keep_last: int,
    *,
    dry_run: bool = False,
) -> None:
    """
    Delete older snapshot versions while keeping the most recent `keep_last`
    rows per business key (including the current row).

    Rows are ranked per key by ``_ff_valid_from`` descending; every row with
    rank greater than ``keep_last`` is deleted. Nothing happens when
    ``keep_last <= 0`` or when ``unique_key`` has no non-empty entries.

    Args:
        relation: Snapshot relation name (qualified with the executor's
            project and dataset).
        unique_key: Business key columns used to partition the versions.
        keep_last: Number of versions to retain per key.
        dry_run: When True, only count and report the rows that would be
            deleted; no DELETE is issued.
    """
    if keep_last <= 0:
        return

    keys = [k for k in unique_key if k]
    if not keys:
        return

    target = self._qualified_identifier(
        relation,
        project=self.project,
        dataset=self.dataset,
    )
    vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL
    key_select = ", ".join(keys)
    part_by = ", ".join(keys)

    ranked_sql = f"""
SELECT
    {key_select},
    {vf},
    ROW_NUMBER() OVER (
        PARTITION BY {part_by}
        ORDER BY {vf} DESC
    ) AS rn
FROM {target}
"""

    if dry_run:
        sql = f"""
WITH ranked AS (
    {ranked_sql}
)
SELECT COUNT(*) AS rows_to_delete
FROM ranked
WHERE rn > {int(keep_last)}
"""
        job = self.client.query(sql, location=self.location)
        rows = list(job.result())
        count = int(rows[0][0]) if rows else 0

        echo(
            f"[DRY-RUN] snapshot_prune({relation}): would delete {count} row(s) "
            f"(keep_last={keep_last})"
        )
        return

    # Rows to delete are identified by (business key, valid_from).
    join_pred = " AND ".join([f"t.{k} = r.{k}" for k in keys])
    delete_sql = f"""
DELETE FROM {target} AS t
WHERE EXISTS (
    WITH ranked AS (
        {ranked_sql}
    )
    SELECT 1
    FROM ranked AS r
    WHERE
        r.rn > {int(keep_last)}
        AND {join_pred}
        AND t.{vf} = r.{vf}
)
"""
    self.client.query(delete_sql, location=self.location).result()
# ── Snapshot API ─────────────────────────────────────────────────────

def run_snapshot_sql(self, node: Node, env: Environment) -> None:
    """
    Snapshot materialization for Spark/Databricks.

    Resolves the snapshot config, renders the model SQL and then either
    creates the snapshot table (target relation does not exist yet) or
    rebuilds it with closed and new versions (relation exists).

    Raises:
        RuntimeError: if pyspark cannot be imported.
        TypeError: if ``node`` is not a SQL model.
        ValueError: if the node is not a snapshot model, ``unique_key`` is
            empty, a 'timestamp' snapshot has no ``updated_at``, or a
            'check' snapshot has no ``check_cols``.
    """
    F = get_spark_functions()

    meta = self._validate_snapshot_node(node)
    cfg = resolve_snapshot_config(node, meta)

    strategy = cfg.strategy
    unique_key = cfg.unique_key
    updated_at = cfg.updated_at
    check_cols = cfg.check_cols

    # Fail fast with explicit errors. The helpers below only `assert` on
    # updated_at (stripped under `python -O`), and an empty check_cols list
    # would hash the empty string for every row, so no change would ever
    # be detected.
    if not unique_key:
        raise ValueError(f"{node.path}: snapshot models require a non-empty unique_key list.")
    if strategy == "timestamp":
        if not updated_at:
            raise ValueError(
                f"{node.path}: strategy='timestamp' snapshots require an updated_at column."
            )
    elif not check_cols:
        raise ValueError(
            f"{node.path}: strategy='check' snapshots require non-empty check_cols."
        )

    body, rel_name, physical = self._snapshot_sql_body(node, env)

    vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL
    vt = BaseExecutor.SNAPSHOT_VALID_TO_COL
    is_cur = BaseExecutor.SNAPSHOT_IS_CURRENT_COL
    hash_col = BaseExecutor.SNAPSHOT_HASH_COL
    upd_meta = BaseExecutor.SNAPSHOT_UPDATED_AT_COL

    if not self.exists_relation(rel_name):
        self._snapshot_first_run(
            node=node,
            rel_name=rel_name,
            body=body,
            strategy=strategy,
            updated_at=updated_at,
            check_cols=check_cols,
            F=F,
            vf=vf,
            vt=vt,
            is_cur=is_cur,
            hash_col=hash_col,
            upd_meta=upd_meta,
        )
        return

    self._snapshot_incremental_run(
        node=node,
        body=body,
        rel_name=rel_name,
        physical=physical,
        strategy=strategy,
        unique_key=unique_key,
        updated_at=updated_at,
        check_cols=check_cols,
        F=F,
        vf=vf,
        vt=vt,
        is_cur=is_cur,
        hash_col=hash_col,
        upd_meta=upd_meta,
    )


def _validate_snapshot_node(self, node: Node) -> dict[str, Any]:
    """
    Check that ``node`` is a SQL model with materialized='snapshot'.

    Returns:
        The node's meta mapping (an empty dict when the node has none).

    Raises:
        TypeError: if ``node.kind`` is not "sql".
        ValueError: if the meta does not describe a snapshot model.
    """
    if node.kind != "sql":
        raise TypeError(
            f"Snapshot materialization is only supported for SQL models, "
            f"got kind={node.kind!r} for {node.name}."
        )

    meta = getattr(node, "meta", {}) or {}
    if not self._meta_is_snapshot(meta):
        raise ValueError(f"Node {node.name} is not configured with materialized='snapshot'.")
    return meta


def _snapshot_sql_body(
    self,
    node: Node,
    env: Environment,
) -> tuple[str, str, str]:
    """
    Render the model and derive the identifiers used by the snapshot run.

    Returns:
        ``(body, rel_name, physical)``: the selectable SQL body with the
        leading config and trailing semicolons/whitespace removed, the
        relation name for the node, and its physical identifier.
    """
    sql_rendered = self.render_sql(
        node,
        env,
        ref_resolver=lambda name: self._resolve_ref(name, env),
        source_resolver=self._resolve_source,
    )
    sql_clean = self._strip_leading_config(sql_rendered).strip()
    body = self._selectable_body(sql_clean).rstrip(" ;\n\t")

    rel_name = relation_for(node.name)
    physical = self._physical_identifier(rel_name)
    return body, rel_name, physical


def _snapshot_first_run(
    self,
    *,
    node: Node,
    rel_name: str,
    body: str,
    strategy: str,
    updated_at: str | None,
    check_cols: list[str],
    F: Any,
    vf: str,
    vt: str,
    is_cur: str,
    hash_col: str,
    upd_meta: str,
) -> None:
    """
    Create the snapshot table from the current source rows.

    Every source row becomes a current version (valid_to NULL,
    is_current True). For 'timestamp' the updated-at and valid-from
    metadata come from the ``updated_at`` column and the hash is NULL; for
    'check' valid-from is the current timestamp and the hash is the MD5
    over ``check_cols``.
    """
    src_df = self.spark.sql(body)

    echo_debug(f"[snapshot] first run for {rel_name} (strategy={strategy})")

    if strategy == "timestamp":
        # Type narrowing only; run_snapshot_sql validates this with ValueError.
        assert updated_at is not None, (
            "timestamp snapshots require a non-null updated_at column"
        )
        df_snap = (
            src_df.withColumn(upd_meta, F.col(updated_at))
            .withColumn(vf, F.col(updated_at))
            .withColumn(vt, F.lit(None).cast("timestamp"))
            .withColumn(is_cur, F.lit(True))
            .withColumn(hash_col, F.lit(None).cast("string"))
        )
    else:
        # NULLs are folded to '' so they hash deterministically.
        cols_expr = [F.coalesce(F.col(c).cast("string"), F.lit("")) for c in check_cols]
        concat_expr = F.concat_ws("||", *cols_expr)
        hash_expr = F.md5(concat_expr).cast("string")
        upd_expr = F.col(updated_at) if updated_at else F.current_timestamp()

        df_snap = (
            src_df.withColumn(upd_meta, upd_expr)
            .withColumn(vf, F.current_timestamp())
            .withColumn(vt, F.lit(None).cast("timestamp"))
            .withColumn(is_cur, F.lit(True))
            .withColumn(hash_col, hash_expr)
        )

    storage_meta = self._storage_meta(node, rel_name)
    self._save_df_as_table(rel_name, df_snap, storage=storage_meta)


def _snapshot_incremental_run(
    self,
    *,
    node: Node,
    body: str,
    rel_name: str,
    physical: str,
    strategy: str,
    unique_key: list[str],
    updated_at: str | None,
    check_cols: list[str],
    F: Any,
    vf: str,
    vt: str,
    is_cur: str,
    hash_col: str,
    upd_meta: str,
) -> None:
    """
    Rebuild an existing snapshot table with the latest source rows.

    The new table is the union of: historical (non-current) rows, current
    rows whose key did not change, previously-current rows of changed keys
    (now closed), and fresh current rows for new or changed keys. The
    result is written back through ``_save_df_as_table`` rather than being
    updated in place.

    Raises:
        ValueError: if a ``unique_key`` column is missing on the source or
            on the snapshot table.
    """
    echo_debug(f"[snapshot] incremental run for {rel_name} (strategy={strategy})")

    existing = self.spark.table(physical)
    src_df = self.spark.sql(body)

    missing_keys_src = [k for k in unique_key if k not in src_df.columns]
    missing_keys_snap = [k for k in unique_key if k not in existing.columns]
    if missing_keys_src or missing_keys_snap:
        raise ValueError(
            f"{node.path}: snapshot unique_key columns must exist on both source and "
            f"snapshot table. Missing on source={missing_keys_src}, "
            f"on snapshot={missing_keys_snap}."
        )

    if strategy == "check":
        # Temporary column holding the hash of the incoming row.
        cols_expr = [F.coalesce(F.col(c).cast("string"), F.lit("")) for c in check_cols]
        concat_expr = F.concat_ws("||", *cols_expr)
        src_df = src_df.withColumn("__ff_new_hash", F.md5(concat_expr).cast("string"))

    current_df = existing.filter(F.col(is_cur) == True)  # noqa: E712

    s_alias = src_df.alias("s")
    t_alias = current_df.alias("t")
    joined = s_alias.join(t_alias, on=unique_key, how="left")

    if strategy == "timestamp":
        # Type narrowing only; run_snapshot_sql validates this with ValueError.
        assert updated_at is not None, (
            "timestamp snapshots require a non-null updated_at column"
        )
        s_upd = F.col(f"s.{updated_at}")
        t_upd = F.col(f"t.{upd_meta}")
        cond_new = t_upd.isNull()
        cond_changed = t_upd.isNotNull() & (s_upd > t_upd)
        changed_or_new = cond_new | cond_changed
    else:
        s_hash = F.col("s.__ff_new_hash")
        t_hash = F.col(f"t.{hash_col}")
        cond_new = t_hash.isNull()
        cond_changed = t_hash.isNotNull() & (s_hash != F.coalesce(t_hash, F.lit("")))
        changed_or_new = cond_new | cond_changed

    # Distinct business keys that need a new current version.
    changed_keys = (
        joined.filter(changed_or_new)
        .select(*[F.col(f"s.{k}").alias(k) for k in unique_key])
        .dropDuplicates()
    )

    prev_noncurrent = existing.filter(F.col(is_cur) == False)  # noqa: E712
    preserved_current = current_df.join(changed_keys, on=unique_key, how="left_anti")

    closed_prev = (
        current_df.join(changed_keys, on=unique_key, how="inner")
        .withColumn(vt, F.current_timestamp())
        .withColumn(is_cur, F.lit(False))
    )

    new_src = src_df.join(changed_keys, on=unique_key, how="inner")
    if strategy == "timestamp":
        assert updated_at is not None, (
            "timestamp snapshots require a non-null updated_at column"
        )
        new_versions = (
            new_src.withColumn(upd_meta, F.col(updated_at))
            .withColumn(vf, F.col(updated_at))
            .withColumn(vt, F.lit(None).cast("timestamp"))
            .withColumn(is_cur, F.lit(True))
            .withColumn(hash_col, F.lit(None).cast("string"))
        )
    else:
        upd_expr = F.col(updated_at) if updated_at else F.current_timestamp()
        new_versions = (
            new_src.withColumn(upd_meta, upd_expr)
            .withColumn(vf, F.current_timestamp())
            .withColumn(vt, F.lit(None).cast("timestamp"))
            .withColumn(is_cur, F.lit(True))
            .withColumn(hash_col, F.col("__ff_new_hash"))
        )

    parts = [prev_noncurrent, preserved_current, closed_prev, new_versions]
    snapshot_df = reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), parts)
    # The helper hash column must not be persisted.
    if "__ff_new_hash" in snapshot_df.columns:
        snapshot_df = snapshot_df.drop("__ff_new_hash")

    storage_meta = self._storage_meta(node, rel_name)
    self._save_df_as_table(rel_name, snapshot_df, storage=storage_meta)
def run_snapshot_sql(self, node: Node, env: Environment) -> None:
    """
    Snapshot materialization for DuckDB.

    Config (node.meta):
      - materialized='snapshot'
      - snapshot: { ... }  # strategy + per-strategy hints
      - unique_key: str | list[str]

    Behaviour:
      - First run (target relation does not exist): create the table from
        the model's SELECT plus the snapshot metadata columns.
      - Subsequent runs:
          * close changed current rows (set valid_to, is_current=false)
          * insert new current rows for new/changed keys.

    Raises:
        TypeError: if ``node`` is not a SQL model.
        ValueError: if the node is not a snapshot model, ``unique_key`` is
            empty, a 'timestamp' snapshot has no ``updated_at``, or a
            'check' snapshot has no ``check_cols``.
    """
    if node.kind != "sql":
        raise TypeError(
            f"Snapshot materialization is only supported for SQL models, "
            f"got kind={node.kind!r} for {node.name}."
        )

    meta = getattr(node, "meta", {}) or {}
    if not self._meta_is_snapshot(meta):
        raise ValueError(f"Node {node.name} is not configured with materialized='snapshot'.")

    # ---- Extract & normalise snapshot config (shared helper) ----
    cfg = resolve_snapshot_config(node, meta)
    strategy = cfg.strategy
    unique_key = cfg.unique_key
    updated_at = cfg.updated_at
    check_cols = cfg.check_cols

    # Fail fast instead of emitting broken SQL: an empty unique_key would
    # raise IndexError at unique_key[0], a missing updated_at would render
    # `s.None`, and empty check_cols would render `md5()`.
    if not unique_key:
        raise ValueError(f"{node.path}: snapshot models require a non-empty unique_key list.")
    if strategy == "timestamp":
        if not updated_at:
            raise ValueError(
                f"{node.path}: strategy='timestamp' snapshots require an updated_at column."
            )
    elif not check_cols:
        raise ValueError(
            f"{node.path}: strategy='check' snapshots require non-empty check_cols."
        )

    # ---- Render SQL and extract SELECT body ----
    sql_rendered = self.render_sql(
        node,
        env,
        ref_resolver=lambda name: self._resolve_ref(name, env),
        source_resolver=self._resolve_source,
    )
    sql = self._strip_leading_config(sql_rendered).strip()
    body = self._selectable_body(sql).rstrip(" ;\n\t")

    rel_name = relation_for(node.name)
    target = self._qualified(rel_name)

    vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL
    vt = BaseExecutor.SNAPSHOT_VALID_TO_COL
    is_cur = BaseExecutor.SNAPSHOT_IS_CURRENT_COL
    hash_col = BaseExecutor.SNAPSHOT_HASH_COL
    upd_meta = BaseExecutor.SNAPSHOT_UPDATED_AT_COL

    # Hash over check_cols (NULLs folded to ''), shared by both paths.
    hash_expr = ""
    if strategy != "timestamp":
        col_exprs = [f"coalesce(cast(s.{col} as varchar), '')" for col in check_cols]
        concat_expr = " || '||' || ".join(col_exprs)
        hash_expr = f"cast(md5({concat_expr}) as varchar)"

    # ---- First run: create snapshot table ----
    if not self.exists_relation(rel_name):
        if strategy == "timestamp":
            # valid_from + updated_at come from the source updated_at column
            create_sql = f"""
create table {target} as
select
    s.*,
    s.{updated_at} as {upd_meta},
    s.{updated_at} as {vf},
    cast(null as timestamp) as {vt},
    true as {is_cur},
    cast(null as varchar) as {hash_col}
from ({body}) as s
"""
        else:  # strategy == "check"
            upd_expr = f"s.{updated_at}" if updated_at else "current_timestamp"
            create_sql = f"""
create table {target} as
select
    s.*,
    {upd_expr} as {upd_meta},
    current_timestamp as {vf},
    cast(null as timestamp) as {vt},
    true as {is_cur},
    {hash_expr} as {hash_col}
from ({body}) as s
"""
        self.con.execute(create_sql)
        return

    # ---- Incremental snapshot update ----

    # Stage current source rows in a temp view for reuse
    src_view_name = f"__ff_snapshot_src_{rel_name}".replace(".", "_")
    src_quoted = _q(src_view_name)
    self.con.execute(f"create or replace temp view {src_quoted} as {body}")

    try:
        # Join predicate on unique keys
        keys_pred = " AND ".join([f"t.{k} = s.{k}" for k in unique_key])

        # Change condition & new-row expressions per strategy
        if strategy == "timestamp":
            change_condition = f"s.{updated_at} > t.{upd_meta}"
            new_upd_expr = f"s.{updated_at}"
            new_valid_from_expr = f"s.{updated_at}"
            new_hash_expr = "NULL"
        else:
            change_condition = f"coalesce({hash_expr}, '') <> coalesce(t.{hash_col}, '')"
            new_upd_expr = f"s.{updated_at}" if updated_at else "current_timestamp"
            new_valid_from_expr = "current_timestamp"
            new_hash_expr = hash_expr

        # 1) Close changed current rows
        close_sql = f"""
update {target} as t
set
    {vt} = current_timestamp,
    {is_cur} = false
from {src_quoted} as s
where
    {keys_pred}
    and t.{is_cur} = true
    and {change_condition};
"""
        self.con.execute(close_sql)

        # 2) Insert new current versions (new keys or changed rows).
        #    Rows closed in step 1 no longer have a current row, so the
        #    left join yields NULLs for them and the IS NULL test matches.
        first_key = unique_key[0]
        insert_sql = f"""
insert into {target}
select
    s.*,
    {new_upd_expr} as {upd_meta},
    {new_valid_from_expr} as {vf},
    cast(null as timestamp) as {vt},
    true as {is_cur},
    {new_hash_expr} as {hash_col}
from {src_quoted} as s
left join {target} as t
    on {keys_pred}
    and t.{is_cur} = true
where
    t.{first_key} is null
    or {change_condition};
"""
        self.con.execute(insert_sql)
    finally:
        # Best-effort cleanup of the staging view; never mask the real error.
        with suppress(Exception):
            self.con.execute(f"drop view if exists {src_quoted}")


def snapshot_prune(
    self,
    relation: str,
    unique_key: list[str],
    keep_last: int,
    *,
    dry_run: bool = False,
) -> None:
    """
    Delete older snapshot versions while keeping the most recent `keep_last`
    rows per business key (including the current row).

    Rows are ranked per key by ``_ff_valid_from`` descending; every row with
    rank greater than ``keep_last`` is deleted. Nothing happens when
    ``keep_last <= 0`` or when ``unique_key`` has no non-empty entries.

    Args:
        relation: Snapshot relation name.
        unique_key: Business key columns used to partition the versions.
        keep_last: Number of versions to retain per key.
        dry_run: When True, only count and report the rows that would be
            deleted; no DELETE is issued.
    """
    if keep_last <= 0:
        return

    keys = [k for k in unique_key if k]
    if not keys:
        return

    target = self._qualified(relation)
    vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL

    # The same column list serves as select list and partition clause.
    key_list = ", ".join(keys)

    ranked_sql = f"""
select
    {key_list},
    {vf},
    row_number() over (
        partition by {key_list}
        order by {vf} desc
    ) as rn
from {target}
"""

    if dry_run:
        sql = f"""
with ranked as (
    {ranked_sql}
)
select count(*) as rows_to_delete
from ranked
where rn > {int(keep_last)}
"""
        res = self.con.execute(sql).fetchone()
        rows = int(res[0]) if res else 0

        echo(
            f"[DRY-RUN] snapshot_prune({relation}): would delete {rows} row(s) "
            f"(keep_last={keep_last})"
        )
        return

    # Rows to delete are identified by (business key, valid_from).
    join_pred = " AND ".join([f"t.{k} = r.{k}" for k in keys])
    delete_sql = f"""
delete from {target} t
using (
    {ranked_sql}
) r
where
    r.rn > {int(keep_last)}
    and {join_pred}
    and t.{vf} = r.{vf};
"""
    self.con.execute(delete_sql)
create_engine, text from sqlalchemy.engine import Connection, Engine from sqlalchemy.exc import ProgrammingError, SQLAlchemyError @@ -11,7 +12,9 @@ from fastflowtransform.errors import ModelExecutionError, ProfileConfigError from fastflowtransform.executors._shims import SAConnShim from fastflowtransform.executors.base import BaseExecutor +from fastflowtransform.logging import echo from fastflowtransform.meta import ensure_meta_table, upsert_meta +from fastflowtransform.snapshots import resolve_snapshot_config class PostgresExecutor(BaseExecutor[pd.DataFrame]): @@ -257,3 +260,234 @@ def alter_table_sync_schema( add = [c for c in cols if c not in existing] for c in add: con.execute(text(f'alter table {qrel} add column "{c}" text')) + + # ── Snapshot API ────────────────────────────────────────────────────── + + def run_snapshot_sql(self, node: Node, env: Environment) -> None: + """ + Snapshot materialization for Postgres. + + Config: + - materialized='snapshot' + - snapshot={...} and/or top-level strategy/updated_at/check_cols + - unique_key / primary_key + + Behaviour: + - First run: create table with one current row per unique key. + - Subsequent runs: + * close changed current rows (set valid_to, is_current=false) + * insert new current rows for new/changed keys. + """ + if node.kind != "sql": + raise TypeError( + f"Snapshot materialization is only supported for SQL models, " + f"got kind={node.kind!r} for {node.name}." + ) + + meta = getattr(node, "meta", {}) or {} + if not self._meta_is_snapshot(meta): + raise ValueError(f"Node {node.name} is not configured with materialized='snapshot'.") + + # Shared normalisation: supports nested 'snapshot={...}' OR flattened config. 
+ cfg = resolve_snapshot_config(node, meta) + strategy = cfg.strategy + unique_key = cfg.unique_key + updated_at = cfg.updated_at + check_cols = cfg.check_cols + + # ---- Render SQL and extract SELECT body ---- + sql_rendered = self.render_sql( + node, + env, + ref_resolver=lambda name: self._resolve_ref(name, env), + source_resolver=self._resolve_source, + ) + sql = self._strip_leading_config(sql_rendered).strip() + body = self._selectable_body(sql).rstrip(" ;\n\t") + + rel_name = relation_for(node.name) + target = self._qualified(rel_name) + + vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL + vt = BaseExecutor.SNAPSHOT_VALID_TO_COL + is_cur = BaseExecutor.SNAPSHOT_IS_CURRENT_COL + hash_col = BaseExecutor.SNAPSHOT_HASH_COL + upd_meta = BaseExecutor.SNAPSHOT_UPDATED_AT_COL + + # ---- First run: create snapshot table ---- + if not self.exists_relation(rel_name): + if strategy == "timestamp": + # valid_from + updated_at come from the source updated_at column + create_sql = f""" +create table {target} as +select + s.*, + s.{updated_at} as {upd_meta}, + s.{updated_at} as {vf}, + cast(null as timestamp) as {vt}, + true as {is_cur}, + cast(null as text) as {hash_col} +from ({body}) as s +""" + else: # strategy == "check" + # Hash over check_cols to detect changes + col_exprs = [f"coalesce(cast(s.{col} as text), '')" for col in check_cols] + concat_expr = " || '||' || ".join(col_exprs) + hash_expr = f"md5({concat_expr})" + upd_expr = f"s.{updated_at}" if updated_at else "current_timestamp" + create_sql = f""" +create table {target} as +select + s.*, + {upd_expr} as {upd_meta}, + current_timestamp as {vf}, + cast(null as timestamp) as {vt}, + true as {is_cur}, + {hash_expr} as {hash_col} +from ({body}) as s +""" + with self.engine.begin() as conn: + self._set_search_path(conn) + conn.execute(text(create_sql)) + return + + # ---- Incremental snapshot update ---- + + # Stage current source rows in a temporary table for reuse + src_name = 
f"__ff_snapshot_src_{rel_name}".replace(".", "_") + src_q = self._q_ident(src_name) + + with self.engine.begin() as conn: + self._set_search_path(conn) + + # (Re-)create temp staging table + conn.execute(text(f"drop table if exists {src_q}")) + conn.execute(text(f"create temporary table {src_q} as {body}")) + + # Join predicate on unique keys + keys_pred = " AND ".join([f"t.{k} = s.{k}" for k in unique_key]) + + # Change condition & hash for staging rows + if strategy == "timestamp": + change_condition = f"s.{updated_at} > t.{upd_meta}" + hash_expr_s = "NULL" + new_upd_expr = f"s.{updated_at}" + new_valid_from_expr = f"s.{updated_at}" + new_hash_expr = "NULL" + else: + col_exprs_s = [f"coalesce(cast(s.{col} as text), '')" for col in check_cols] + concat_expr_s = " || '||' || ".join(col_exprs_s) + hash_expr_s = f"md5({concat_expr_s})" + change_condition = ( + f"coalesce({hash_expr_s}, '') <> coalesce(t.{hash_col}::text, '')" + ) + new_upd_expr = f"s.{updated_at}" if updated_at else "current_timestamp" + new_valid_from_expr = "current_timestamp" + new_hash_expr = hash_expr_s + + # 1) Close changed current rows + close_sql = f""" +update {target} as t +set + {vt} = current_timestamp, + {is_cur} = false +from {src_q} as s +where + {keys_pred} + and t.{is_cur} = true + and {change_condition}; +""" + conn.execute(text(close_sql)) + + # 2) Insert new current versions (new keys or changed rows) + first_key = unique_key[0] + insert_sql = f""" +insert into {target} +select + s.*, + {new_upd_expr} as {upd_meta}, + {new_valid_from_expr} as {vf}, + cast(null as timestamp) as {vt}, + true as {is_cur}, + {new_hash_expr} as {hash_col} +from {src_q} as s +left join {target} as t + on {keys_pred} + and t.{is_cur} = true +where + t.{first_key} is null + or {change_condition}; +""" + conn.execute(text(insert_sql)) + + # Temp table will be dropped automatically at end of session; dropping + # explicitly here is harmless and keeps the connection clean for tests. 
+ conn.execute(text(f"drop table if exists {src_q}")) + + def snapshot_prune( + self, + relation: str, + unique_key: list[str], + keep_last: int, + *, + dry_run: bool = False, + ) -> None: + """ + Delete older snapshot versions while keeping the most recent `keep_last` + rows per business key (including the current row). + """ + if keep_last <= 0: + return + + vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL + keys = [k for k in unique_key if k] + if not keys: + return + + target = self._qualified(relation) + part_by = ", ".join(keys) + key_select = ", ".join(keys) + + ranked_sql = f""" +select + {key_select}, + {vf}, + row_number() over ( + partition by {part_by} + order by {vf} desc + ) as rn +from {target} +""" + + if dry_run: + sql = f""" +with ranked as ( + {ranked_sql} +) +select count(*) as rows_to_delete +from ranked +where rn > {int(keep_last)} +""" + with self.engine.begin() as conn: + self._set_search_path(conn) + res = conn.execute(text(sql)).fetchone() + rows = int(res[0]) if res else 0 + echo( + f"[DRY-RUN] snapshot_prune({relation}): would delete {rows} row(s) " + f"(keep_last={keep_last})" + ) + return + + delete_sql = f""" +delete from {target} t +using ( + {ranked_sql} +) r +where + r.rn > {int(keep_last)} + and {" AND ".join([f"t.{k} = r.{k}" for k in keys])} + and t.{vf} = r.{vf}; +""" + with self.engine.begin() as conn: + self._set_search_path(conn) + conn.execute(text(delete_sql)) diff --git a/src/fastflowtransform/executors/snowflake_snowpark.py b/src/fastflowtransform/executors/snowflake_snowpark.py index f209b8a..3195a85 100644 --- a/src/fastflowtransform/executors/snowflake_snowpark.py +++ b/src/fastflowtransform/executors/snowflake_snowpark.py @@ -3,11 +3,15 @@ from collections.abc import Iterable from contextlib import suppress -from typing import Any +from typing import Any, cast + +from jinja2 import Environment from fastflowtransform.core import Node, relation_for from fastflowtransform.executors.base import BaseExecutor +from 
fastflowtransform.logging import echo from fastflowtransform.meta import ensure_meta_table, upsert_meta +from fastflowtransform.snapshots import resolve_snapshot_config from fastflowtransform.typing import SNDF, SnowparkSession as Session @@ -291,6 +295,222 @@ def alter_table_sync_schema( cols_sql = ", ".join(f"{self._q(c)} STRING" for c in to_add) self.session.sql(f"ALTER TABLE {qrel} ADD COLUMN {cols_sql}").collect() + # ── Snapshot API ───────────────────────────────────────────────────── + def run_snapshot_sql(self, node: Node, env: Environment) -> None: + """ + Snapshot materialization for Snowflake Snowpark. + + Uses the shared snapshot config resolver so all engines share the + same semantics and validation. + """ + if node.kind != "sql": + raise TypeError( + f"Snapshot materialization is only supported for SQL models, " + f"got kind={node.kind!r} for {node.name}." + ) + + meta = getattr(node, "meta", {}) or {} + if not self._meta_is_snapshot(meta): + raise ValueError(f"Node {node.name} is not configured with materialized='snapshot'.") + + cfg = resolve_snapshot_config(node, meta) + + # Render model SQL and extract the SELECT body + rendered = self.render_sql( + node, + env, + ref_resolver=lambda name: self._resolve_ref(name, env), + source_resolver=self._resolve_source, + ) + sql = self._strip_leading_config(rendered).strip() + body = self._selectable_body(sql).rstrip(";\n\t ") + + rel_name = relation_for(node.name) + target = self._qualified(rel_name) + + vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL + vt = BaseExecutor.SNAPSHOT_VALID_TO_COL + is_cur = BaseExecutor.SNAPSHOT_IS_CURRENT_COL + hash_col = BaseExecutor.SNAPSHOT_HASH_COL + upd_meta = BaseExecutor.SNAPSHOT_UPDATED_AT_COL + + # ---- First run: create snapshot table ---- + if not self.exists_relation(rel_name): + if cfg.strategy == "timestamp": + # cfg.updated_at is guaranteed non-None by resolve_snapshot_config + if cfg.updated_at is None: # defensive, for type-checkers + raise ValueError( + 
"strategy='timestamp' snapshot requires a non-null updated_at column." + ) + create_sql = f""" +CREATE OR REPLACE TABLE {target} AS +SELECT + s.*, + s.{cfg.updated_at} AS {upd_meta}, + s.{cfg.updated_at} AS {vf}, + CAST(NULL AS TIMESTAMP) AS {vt}, + TRUE AS {is_cur}, + CAST(NULL AS VARCHAR) AS {hash_col} +FROM ({body}) AS s +""" + else: # strategy == "check" + # hash over check_cols to detect changes + col_exprs = [f"COALESCE(CAST(s.{col} AS VARCHAR), '')" for col in cfg.check_cols] + concat_expr = " || '||' || ".join(col_exprs) or "''" + hash_expr = f"CAST(MD5({concat_expr}) AS VARCHAR)" + upd_expr = ( + f"s.{cfg.updated_at}" if cfg.updated_at is not None else "CURRENT_TIMESTAMP()" + ) + create_sql = f""" +CREATE OR REPLACE TABLE {target} AS +SELECT + s.*, + {upd_expr} AS {upd_meta}, + CURRENT_TIMESTAMP() AS {vf}, + CAST(NULL AS TIMESTAMP) AS {vt}, + TRUE AS {is_cur}, + {hash_expr} AS {hash_col} +FROM ({body}) AS s +""" + self.session.sql(create_sql).collect() + return + + # ---- Incremental snapshot update ---- + src_name = f"__ff_snapshot_src_{rel_name}".replace(".", "_") + + # Use a temporary view for the current source rows + self.session.sql(f"CREATE OR REPLACE TEMPORARY VIEW {src_name} AS {body}").collect() + + try: + keys_pred = " AND ".join([f"t.{k} = s.{k}" for k in cfg.unique_key]) or "FALSE" + + if cfg.strategy == "timestamp": + if cfg.updated_at is None: + raise ValueError( + "strategy='timestamp' snapshot requires a non-null updated_at column." 
+ ) + change_condition = f"s.{cfg.updated_at} > t.{upd_meta}" + hash_expr_s = "NULL" + new_upd_expr = f"s.{cfg.updated_at}" + new_valid_from_expr = f"s.{cfg.updated_at}" + new_hash_expr = "NULL" + else: + col_exprs_s = [f"COALESCE(CAST(s.{col} AS VARCHAR), '')" for col in cfg.check_cols] + concat_expr_s = " || '||' || ".join(col_exprs_s) or "''" + hash_expr_s = f"CAST(MD5({concat_expr_s}) AS VARCHAR)" + change_condition = f"COALESCE({hash_expr_s}, '') <> COALESCE(t.{hash_col}, '')" + new_upd_expr = ( + f"s.{cfg.updated_at}" if cfg.updated_at is not None else "CURRENT_TIMESTAMP()" + ) + new_valid_from_expr = "CURRENT_TIMESTAMP()" + new_hash_expr = hash_expr_s + + # 1) Close changed current rows + close_sql = f""" +UPDATE {target} AS t +SET + {vt} = CURRENT_TIMESTAMP(), + {is_cur} = FALSE +FROM {src_name} AS s +WHERE + {keys_pred} + AND t.{is_cur} = TRUE + AND {change_condition} +""" + self.session.sql(close_sql).collect() + + # 2) Insert new current versions (new keys or changed rows) + first_key = cfg.unique_key[0] + insert_sql = f""" +INSERT INTO {target} +SELECT + s.*, + {new_upd_expr} AS {upd_meta}, + {new_valid_from_expr} AS {vf}, + CAST(NULL AS TIMESTAMP) AS {vt}, + TRUE AS {is_cur}, + {new_hash_expr} AS {hash_col} +FROM {src_name} AS s +LEFT JOIN {target} AS t + ON {keys_pred} + AND t.{is_cur} = TRUE +WHERE + t.{first_key} IS NULL + OR {change_condition} +""" + self.session.sql(insert_sql).collect() + finally: + with suppress(Exception): + self.session.sql(f"DROP VIEW IF EXISTS {src_name}").collect() + + def snapshot_prune( + self, + relation: str, + unique_key: list[str], + keep_last: int, + *, + dry_run: bool = False, + ) -> None: + """ + Delete older snapshot versions while keeping the most recent `keep_last` + rows per business key (including the current row). 
+ """ + if keep_last <= 0: + return + + keys = [k for k in unique_key if k] + if not keys: + return + + target = self._qualified(relation) + vf = BaseExecutor.SNAPSHOT_VALID_FROM_COL + + part_by = ", ".join(keys) + key_select = ", ".join(keys) + + ranked_sql = f""" +SELECT + {key_select}, + {vf}, + ROW_NUMBER() OVER ( + PARTITION BY {part_by} + ORDER BY {vf} DESC + ) AS rn +FROM {target} +""" + + if dry_run: + sql = f""" +WITH ranked AS ( + {ranked_sql} +) +SELECT COUNT(*) AS rows_to_delete +FROM ranked +WHERE rn > {int(keep_last)} +""" + res_raw = self.session.sql(sql).collect() + # Snowflake returns a list of Row objects; treat them as tuples for typing. + res = cast("list[tuple[Any, ...]]", res_raw) + rows = int(res[0][0]) if res else 0 + + echo( + f"[DRY-RUN] snapshot_prune({relation}): would delete {rows} row(s) " + f"(keep_last={keep_last})" + ) + return + + delete_sql = f""" +DELETE FROM {target} t +USING ( + {ranked_sql} +) r +WHERE + r.rn > {int(keep_last)} + AND {" AND ".join([f"t.{k} = r.{k}" for k in keys])} + AND t.{vf} = r.{vf} +""" + self.session.sql(delete_sql).collect() + # ────────────────────────── local testing shim ─────────────────────────── class _SFCursorShim: diff --git a/src/fastflowtransform/snapshots.py b/src/fastflowtransform/snapshots.py new file mode 100644 index 0000000..456d837 --- /dev/null +++ b/src/fastflowtransform/snapshots.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass +from typing import Any, Literal + +from fastflowtransform.core import Node +from fastflowtransform.incremental import _normalize_unique_key + +SnapshotStrategy = Literal["timestamp", "check"] + + +@dataclass +class SnapshotConfigResolved: + """ + Normalised snapshot configuration usable by executors. + + Supports both: + - legacy nested config: snapshot={strategy=..., updated_at=..., check_cols=...} + - flattened config: strategy=..., updated_at=..., check_cols=... 
+ """ + + strategy: SnapshotStrategy + unique_key: list[str] + updated_at: str | None + check_cols: list[str] + + +def resolve_snapshot_config(node: Node, meta: Mapping[str, Any]) -> SnapshotConfigResolved: + """ + Resolve and validate snapshot configuration from a model's meta dict. + + Accepted shapes: + {{ config( + materialized='snapshot', + snapshot={ + 'strategy': 'timestamp', + 'updated_at': 'updated_at', + 'check_cols': ['col1', 'col2'], + }, + unique_key='id', + ) }} + + OR (flattened) + + {{ config( + materialized='snapshot', + strategy='timestamp', + updated_at='updated_at', + check_cols=['col1', 'col2'], + unique_key='id', + ) }} + """ + meta = dict(meta or {}) + + # Optional nested block + snapshot_block = meta.get("snapshot") + if snapshot_block is not None and not isinstance(snapshot_block, Mapping): + raise TypeError( + f"{node.path}: snapshot configuration must be a mapping (snapshot={{...}})." + ) + snapshot_block = dict(snapshot_block or {}) + + # ---- unique key ---------------------------------------------------- + unique_key = _normalize_unique_key(meta.get("unique_key") or meta.get("primary_key")) + if not unique_key: + raise ValueError( + f"{node.path}: snapshot models require 'unique_key' (string or list of strings)." + ) + + # ---- strategy ------------------------------------------------------ + raw_strategy = snapshot_block.get("strategy") or meta.get("strategy") or "timestamp" + strategy_str = str(raw_strategy).lower() + if strategy_str not in ("timestamp", "check"): + raise ValueError( + f"{node.path}: snapshot 'strategy' must be 'timestamp' or 'check', " + f"got {raw_strategy!r}." 
+ ) + + # Narrow to the Literal["timestamp", "check"] type for type-checkers + strategy: SnapshotStrategy = "timestamp" if strategy_str == "timestamp" else "check" + + # ---- updated_at ---------------------------------------------------- + updated_at = ( + snapshot_block.get("updated_at") + or snapshot_block.get("updated_at_column") + or meta.get("updated_at") + or meta.get("updated_at_column") + ) + + # ---- check_cols ---------------------------------------------------- + raw_check_cols = ( + snapshot_block.get("check_cols") + or snapshot_block.get("check_columns") + or meta.get("check_cols") + or meta.get("check_columns") + ) + check_cols = _normalize_unique_key(raw_check_cols) if raw_check_cols else [] + + # Per-strategy guards (extra safety besides ModelConfig) + if strategy == "timestamp" and not updated_at: + raise ValueError( + f"{node.path}: strategy='timestamp' snapshots require 'updated_at' column name." + ) + if strategy == "check" and not check_cols: + raise ValueError( + f"{node.path}: strategy='check' snapshots require non-empty " + "'check_cols' (string or list)." 
+ ) + + return SnapshotConfigResolved( + strategy=strategy, unique_key=unique_key, updated_at=updated_at, check_cols=check_cols + ) diff --git a/src/fastflowtransform/templates/index.html.j2 b/src/fastflowtransform/templates/index.html.j2 index 149809c..b879e52 100644 --- a/src/fastflowtransform/templates/index.html.j2 +++ b/src/fastflowtransform/templates/index.html.j2 @@ -73,6 +73,7 @@ .badge-table { background:#eef7ff; color:#0a3a77; border-color:#bcd8fb; } .badge-view { background:#eefcf4; color:#0b5d2a; border-color:#bdebcf; } .badge-ephemeral { background:#fff7e8; color:#7a4a00; border-color:#f6db9b; } + .badge-snapshot { background:#f3e8ff; color:#5b21b6; border-color:#d8b4fe; } .badge-sql { background: var(--chip-sql-bg); color: var(--chip-sql-fg); } .badge-py { background: var(--chip-py-bg); color: var(--chip-py-fg); } .subline { display:block; margin-top:2px; font-size:12px; color: var(--muted); line-height:1.35; } diff --git a/tests/integration/examples/config.py b/tests/integration/examples/config.py index 4734ffe..2ab89fb 100644 --- a/tests/integration/examples/config.py +++ b/tests/integration/examples/config.py @@ -88,4 +88,15 @@ class ExampleConfig: "databricks_spark": "dev_databricks", }, ), + ExampleConfig( + name="snapshot_demo", + path=ROOT / "examples" / "snapshot_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + }, + spark_table_formats=["parquet", "delta", "iceberg"], + ), ] diff --git a/tests/unit/api/http/test_http_offline_cache_unit.py b/tests/unit/api/http/test_http_offline_cache_unit.py index f183179..b908d2c 100644 --- a/tests/unit/api/http/test_http_offline_cache_unit.py +++ b/tests/unit/api/http/test_http_offline_cache_unit.py @@ -43,3 +43,25 @@ def test_get_json_offline_cache_hit_records_stats(monkeypatch, tmp_path): assert snap["used_offline"] is True assert snap["bytes"] > 0 assert isinstance(snap["keys"], list) and len(snap["keys"]) == 1 
+ + +@pytest.mark.unit +@pytest.mark.http +def test_get_json_cache_hit_online_not_reported_offline(monkeypatch, tmp_path): + monkeypatch.setenv("FF_HTTP_OFFLINE", "0") + monkeypatch.setenv("FF_HTTP_CACHE_DIR", str(tmp_path)) + importlib.reload(http) + + url = "https://api.example.com/users" + params = {"page": 1} + payload = {"data": [{"id": 1}]} + _seed_cache(http, Path(tmp_path), url, params, payload) + + ctx.reset_for_node("online_node") + + out = http.get_json(url, params=params) + assert out == payload + + snap = ctx.snapshot() + assert snap["cache_hits"] == 1 + assert snap["used_offline"] is False diff --git a/tests/unit/api/http/test_http_pagination_df_unit.py b/tests/unit/api/http/test_http_pagination_df_unit.py index 3ace3e2..8c5565c 100644 --- a/tests/unit/api/http/test_http_pagination_df_unit.py +++ b/tests/unit/api/http/test_http_pagination_df_unit.py @@ -1,6 +1,7 @@ import importlib import json from pathlib import Path +from typing import Any import pandas as pd import pytest @@ -55,3 +56,35 @@ def paginator(u: str, p: dict | None, js: dict): cache_hit_count = 2 assert snap["requests"] == request_count assert snap["cache_hits"] == cache_hit_count + + +@pytest.mark.unit +@pytest.mark.http +def test_raw_get_pagination_returns_pages(monkeypatch, tmp_path): + monkeypatch.setenv("FF_HTTP_OFFLINE", "0") + monkeypatch.setenv("FF_HTTP_CACHE_DIR", str(tmp_path)) + importlib.reload(http) + + calls: list[dict[str, Any]] = [] + + def fake_http_request(method, u, *, params=None, headers=None, timeout=None): + calls.append({"url": u, "params": params, "headers": headers}) + if "page=1" in u: + body = json.dumps({"next": "https://api.example.com/users?page=2"}).encode("utf-8") + else: + body = json.dumps({"next": None}).encode("utf-8") + return 200, {}, body + + monkeypatch.setattr(http, "_http_request", fake_http_request) + + def paginator(u: str, p: dict | None, payload: Any): + nxt = payload.get("next") if isinstance(payload, dict) else None + if not nxt: + return 
None + return {"next_request": {"url": nxt, "headers": {"X-Token": "abc"}}} + + pages = http.get("https://api.example.com/users?page=1", paginator=paginator) + assert isinstance(pages, list) + assert len(pages) == 2 + assert calls[0]["headers"] == {} + assert calls[1]["headers"] == {"X-Token": "abc"}