diff --git a/docs/examples/API_Demo.md b/docs/examples/API_Demo.md index 104c43b..df2bf32 100644 --- a/docs/examples/API_Demo.md +++ b/docs/examples/API_Demo.md @@ -3,7 +3,7 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local data, external APIs, and multiple execution engines. It highlights: - **Hybrid data model**: joins a local seed (`crm.users`) with live user data from JSONPlaceholder. -- **Multiple environments**: switch between DuckDB, Postgres, Databricks Spark, and BigQuery (pandas or BigFrames client) using `profiles.yml` + `.env.*`. +- **Multiple environments**: switch between DuckDB, Postgres, Databricks Spark, BigQuery (pandas or BigFrames client), and Snowflake (Snowpark) using `profiles.yml` + `.env.*`. - **HTTP integration**: compare the built-in FastFlowTransform HTTP client (`api_users_http`) with a plain `requests` implementation (`api_users_requests`). - **Offline caching & telemetry**: inspect HTTP snapshots via `run_results.json`. - **Engine-aware registration**: scope Python models via `engine_model` and SQL models via `config(engines=[...])` so only the active engine’s nodes load. @@ -21,7 +21,8 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ] ) }} select id, email @@ -32,11 +33,11 @@ The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local 2. **API enrichment** – engine-specific Python implementations under `models/engines//`: - `api_users_http.ff.py` uses the built-in HTTP wrapper (`fastflowtransform.api.http.get_df`) with cache/offline support. - `api_users_requests.ff.py` uses raw `requests` for maximum flexibility. 
 - - Engine-specific callables are scoped with `engine_model(only=...)` (DuckDB/Postgres/Spark) or `env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": ...}` (BigQuery pandas/BigFrames) to stay isolated per engine. + - Engine-specific callables are scoped with `engine_model(only=...)` (DuckDB/Postgres/Spark/Snowflake) or `env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": ...}` (BigQuery pandas/BigFrames) to stay isolated per engine. 3. **Mart join** – `models/common/mart_users_join.ff.sql` ```sql - {{ config(engines=['duckdb','postgres','databricks_spark','bigquery']) }} + {{ config(engines=['duckdb','postgres','databricks_spark','bigquery','snowflake_snowpark']) }} {% set api_users_model = var('api_users_model', 'api_users_http') %} {% set api_users_refs = { 'api_users_http': ref('api_users_http'), @@ -78,9 +79,21 @@ dev_bigquery_bigframes: dataset: "{{ env('FF_BQ_DATASET', 'api_demo') }}" location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: true + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'API_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true ``` -`.env.dev_*` files supply the actual values. `_load_dotenv_layered()` loads them in priority order: repo `.env` → project `.env` → `.env.` → shell overrides (highest priority). Secrets stay out of version control. +`.env.dev_*` files supply the actual values (including `.env.dev_snowflake` for Snowflake credentials). `_load_dotenv_layered()` loads them in priority order: repo `.env` → project `.env` → `.env.` → shell overrides (highest priority). Secrets stay out of version control. 
### BigQuery specifics @@ -91,7 +104,7 @@ dev_bigquery_bigframes: ## Makefile Workflow -`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`/`bigquery`) and wraps the main commands. For BigQuery, set `BQ_FRAME=pandas|bigframes`: +`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`/`bigquery`/`snowflake_snowpark`) and wraps the main commands. For BigQuery, set `BQ_FRAME=pandas|bigframes`: ```make ENGINE ?= duckdb @@ -108,6 +121,9 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake +endif seed: uv run fft seed "$(PROJECT)" --env $(PROFILE_ENV) @@ -122,6 +138,7 @@ Common targets: | `make ENGINE=duckdb seed`| Materialize seeds into DuckDB. | | `make ENGINE=postgres run`| Execute the full pipeline against Postgres. | | `make ENGINE=bigquery run BQ_FRAME=bigframes`| Run against BigQuery (default BigFrames client; set `BQ_FRAME=pandas` to switch). | +| `make ENGINE=snowflake_snowpark run`| Execute the API demo on Snowflake via Snowpark (install `fastflowtransform[snowflake]`). | | `make dag` | Render documentation (`site/dag/`). | | `make api-run` | Run only API models (uses HTTP cache). | | `make api-offline` | Force offline mode (`FF_HTTP_OFFLINE=1`). | @@ -131,7 +148,7 @@ HTTP tuning parameters (`FF_HTTP_ALLOWED_DOMAINS`, cache dir, timeouts) live in ## End-to-End Demo -1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres`, `ENGINE=databricks_spark`, or `ENGINE=bigquery BQ_FRAME=` to switch. +1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres`, `ENGINE=databricks_spark`, `ENGINE=bigquery BQ_FRAME=`, or `ENGINE=snowflake_snowpark` to switch. 2. **Seed data**: `make seed` 3. **Run pipeline**: `make run` 4. 
**Explore docs**: `make dag` → open `examples/api_demo/site/dag/index.html` diff --git a/docs/examples/Cache_Demo.md b/docs/examples/Cache_Demo.md index e6fc7e3..6cfecf5 100644 --- a/docs/examples/Cache_Demo.md +++ b/docs/examples/Cache_Demo.md @@ -60,6 +60,8 @@ make change_py # edit py_constants.ff.py -> rebuilds that model make run_parallel # runs entire DAG with 4 workers per level ``` +> Engines: set `ENGINE=` and copy the matching `.env.dev_*` file (`.env.dev_snowflake` for Snowflake; install `fastflowtransform[snowflake]`). + Seeds stay immutable: `change_seed` assembles a temporary combined copy in `.local/seeds` using `patches/seed_users_patch.csv`, so the repo stays clean while fingerprints still change. diff --git a/docs/examples/DQ_Demo.md b/docs/examples/DQ_Demo.md index 881798d..d3cecbf 100644 --- a/docs/examples/DQ_Demo.md +++ b/docs/examples/DQ_Demo.md @@ -56,6 +56,7 @@ examples/dq_demo/ .env.dev_databricks .env.dev_bigquery_pandas .env.dev_bigquery_bigframes + .env.dev_snowflake Makefile # optional, convenience wrapper around fft commands profiles.yml project.yml @@ -107,7 +108,9 @@ examples/dq_demo/ 'scope:staging', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} @@ -136,7 +139,9 @@ Aggregates orders per customer and prepares data for reconciliation + freshness: 'scope:mart', 'engine:duckdb', 'engine:postgres', - 'engine:databricks_spark' + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} @@ -513,6 +518,31 @@ To run the same demo on BigQuery: Both profiles accept `allow_create_dataset` in `profiles.yml` if you want the example to create the dataset automatically. +## Snowflake Snowpark variant + +To run on Snowflake: + +1. 
Copy `.env.dev_snowflake` to `.env` and populate: + ```bash + FF_SF_ACCOUNT= + FF_SF_USER= + FF_SF_PASSWORD= + FF_SF_WAREHOUSE=COMPUTE_WH + FF_SF_DATABASE=DQ_DEMO + FF_SF_SCHEMA=DQ_DEMO + FF_SF_ROLE= + ``` +2. Install the Snowflake extra if needed: + ```bash + pip install "fastflowtransform[snowflake]" + ``` +3. Run via the Makefile: + ```bash + make demo ENGINE=snowflake_snowpark + ``` + +The Snowflake profile enables `allow_create_schema`, so the schema is created automatically on first run when permitted. + ## Things to experiment with To understand the tests better, intentionally break the data and re-run `fft test`: diff --git a/docs/examples/Incremental_Demo.md b/docs/examples/Incremental_Demo.md index 3005b1a..2182460 100644 --- a/docs/examples/Incremental_Demo.md +++ b/docs/examples/Incremental_Demo.md @@ -1,6 +1,6 @@ # Incremental, Delta & Iceberg Demo -This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres, Databricks Spark (Parquet, Delta & Iceberg), and BigQuery (pandas or BigFrames). +This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres, Databricks Spark (Parquet, Delta & Iceberg), BigQuery (pandas or BigFrames), and Snowflake Snowpark. It is intentionally small and self-contained so you can copy/paste patterns into your own project. 
@@ -26,6 +26,7 @@ incremental_demo/ .env.dev_databricks_iceberg .env.dev_bigquery_pandas .env.dev_bigquery_bigframes + .env.dev_snowflake Makefile profiles.yml project.yml @@ -51,6 +52,8 @@ incremental_demo/ fct_events_py_incremental.ff.py bigframes/ fct_events_py_incremental.ff.py + snowflake_snowpark/ + fct_events_py_incremental.ff.py ``` *Your actual filenames may differ slightly; the concepts are the same.* @@ -79,6 +82,7 @@ The demo revolves around a tiny `events` dataset and three different ways to bui * DuckDB / Postgres: incremental insert/merge in SQL * Databricks Spark: `MERGE INTO` for Delta or Iceberg where available (Spark 4), with a fallback full-refresh strategy for other formats * BigQuery: pandas- or BigFrames-backed DataFrame models with incremental merge logic handled by the BigQuery executor + * Snowflake Snowpark: Snowpark DataFrame operations with merges handled by the Snowflake executor 4. **Iceberg profile for Spark 4** @@ -134,6 +138,8 @@ Conceptually: 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} @@ -273,6 +279,8 @@ Here the model body only defines the **canonical SELECT** and does *not* contain 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark', ], ) }} @@ -581,6 +589,19 @@ FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigfr Ensure the service account credentials pointed to by `GOOGLE_APPLICATION_CREDENTIALS` can create/drop tables in the target dataset. +### Snowflake Snowpark + +```bash +# Seed / run / test (Snowflake profile) +FFT_ACTIVE_ENV=dev_snowflake FF_ENGINE=snowflake_snowpark fft seed . +FFT_ACTIVE_ENV=dev_snowflake FF_ENGINE=snowflake_snowpark fft run . \ + --select tag:example:incremental_demo --select tag:engine:snowflake_snowpark --cache rw +FFT_ACTIVE_ENV=dev_snowflake FF_ENGINE=snowflake_snowpark fft test . 
\ + --select tag:example:incremental_demo +``` + +Make sure `.env.dev_snowflake` sets the required `FF_SF_*` variables and install `fastflowtransform[snowflake]` so the Snowpark executor and client libraries are available. + ### Databricks Spark ```bash diff --git a/docs/examples/Local_Engine_Setup.md b/docs/examples/Local_Engine_Setup.md index c75da91..6ef2fdb 100644 --- a/docs/examples/Local_Engine_Setup.md +++ b/docs/examples/Local_Engine_Setup.md @@ -207,3 +207,187 @@ The BigQuery client in `fastflowtransform` will pick this up automatically **as * Project: `fft-basic-demo` * Dataset: `basic_demo` * Verify it exists and is in the same project you set in `FF_BQ_PROJECT`. + + +### Snowflake Snowpark + +#### 1. One-time setup in Snowflake + +You need a Snowflake account with a warehouse and database you can write to. + +1. **Log in to Snowflake UI (Web Console)** + Use your regular Snowflake login. You should see the Worksheets / Data / Compute sections. + +2. **Create (or pick) a warehouse** + + If you don’t have one yet: + + ```sql + CREATE WAREHOUSE COMPUTE_WH + WAREHOUSE_SIZE = XSMALL + AUTO_SUSPEND = 60 + AUTO_RESUME = TRUE; + ``` + +You can of course use any existing warehouse; just make sure the role you configure below has `USAGE` and `OPERATE` on it. + +3. **Create a database and base schema** + + FFT will auto-create the schema (if `allow_create_schema=true`), but **not the database**. + So create the DB once: + + ```sql + CREATE DATABASE EXAMPLE_DEMO; + CREATE SCHEMA EXAMPLE_DEMO.BASIC_DEMO; -- optional, FFT can create this if allowed + ``` + + Adjust names if you prefer something else; just keep database+schema consistent with `.env` and `profiles.yml`. + +4. 
**User / role permissions** + + Make sure the role you’ll use for FFT has the required grants: + + ```sql + USE ROLE ACCOUNTADMIN; -- or a less powerful custom role with the needed grants + GRANT USAGE ON WAREHOUSE COMPUTE_WH TO ROLE ACCOUNTADMIN; + GRANT USAGE, CREATE SCHEMA ON DATABASE EXAMPLE_DEMO TO ROLE ACCOUNTADMIN; + GRANT USAGE, CREATE TABLE, CREATE VIEW ON ALL SCHEMAS IN DATABASE EXAMPLE_DEMO TO ROLE ACCOUNTADMIN; + ``` + + (In the examples we stick with `ACCOUNTADMIN` to keep the setup simple; in real environments you’d use a dedicated, restricted role.) + +--- + +#### 2. Local configuration (env + profiles) + +1. **Environment file (`examples/basic_demo/.env.dev_snowflake`)** + + ```env + # Snowflake connection + FF_SF_ACCOUNT=your_account_name # e.g. xy12345.eu-central-1 + FF_SF_USER=YOUR_USERNAME + FF_SF_PASSWORD=YOUR_PASSWORD + FF_SF_WAREHOUSE=COMPUTE_WH + FF_SF_DATABASE=EXAMPLE_DEMO + FF_SF_SCHEMA=BASIC_DEMO + FF_SF_ROLE=ACCOUNTADMIN # or another role with required grants + + # Active fft environment name (must match profiles.yml) + FFT_ACTIVE_ENV=dev_snowflake + ``` + + Notes: + + * `FF_SF_ACCOUNT` is the Snowflake **account identifier**, not the full URL + (e.g. `xy12345.eu-central-1`, not `https://xy12345.eu-central-1.snowflakecomputing.com`). + * `FF_SF_DATABASE` must already exist (see step 1). + * `FF_SF_SCHEMA` will be **auto-created** by FFT if `allow_create_schema=true` in the profile. + +2. 
**`profiles.yml`** + + Example profile matching the env above: + + ```yaml + dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + db_schema: "{{ env('FF_SF_SCHEMA', 'BASIC_DEMO') }}" + role: "{{ env('FF_SF_ROLE', 'ACCOUNTADMIN') }}" + allow_create_schema: true + ``` + + * `allow_create_schema: true` tells the executor to run: + + ```sql + CREATE SCHEMA IF NOT EXISTS "EXAMPLE_DEMO"."BASIC_DEMO"; + ``` + + on first connect (best-effort). If you prefer to manage schemas manually, set this to `false`. + +--- + +#### 3. Running seeds and models + +Once the env file and profile are in place: + +1. **Seed Snowflake from `seeds/`:** + + ```bash + make ENGINE=snowflake_snowpark seed + ``` + + This will: + + * Connect via Snowpark + * Create the schema (if allowed and it doesn’t exist) + * Upload CSV seeds via `write_pandas` into `EXAMPLE_DEMO.BASIC_DEMO.*` + +2. **Build models:** + + ```bash + make ENGINE=snowflake_snowpark run + ``` + + * SQL models are rendered to Snowflake SQL and executed as `CREATE OR REPLACE TABLE/VIEW`. + * Snowpark Python models (`only="snowflake_snowpark"`) receive Snowpark `DataFrame` inputs and write back using `save_as_table`. + +3. **Run tests (if you have them):** + + ```bash + make ENGINE=snowflake_snowpark test + ``` + + This executes the standard FFT test suite (e.g. `not_null`, `unique`, etc.) against tables in `EXAMPLE_DEMO.BASIC_DEMO`. + +--- + +#### 4. 
Cleanup / reset for re-runs + +The shared cleanup script (`examples/_scripts/cleanup_env.py`) supports Snowflake, so you can reset the demo schema from the repo root with: + +```bash +python examples/_scripts/cleanup_env.py --engine snowflake_snowpark --env dev_snowflake --project examples/basic_demo +``` + +The script's `cleanup_snowflake` step then: + +* Drops and recreates the **schema** (not the database), e.g. `EXAMPLE_DEMO.BASIC_DEMO`. +* Removes local FFT artifacts (manifest, run_results, etc.) unless `--skip-artifacts` is set. + +Then you can re-seed and re-run from a clean slate: + +```bash +make ENGINE=snowflake_snowpark seed run +``` + +--- + +#### 5. Common Snowflake gotchas + +* **Database vs schema creation** + + * FFT’s Snowflake executor only auto-creates the **schema** (when `allow_create_schema=true`). + * The **database must exist** (e.g. `EXAMPLE_DEMO`) or you’ll get `Schema 'EXAMPLE_DEMO.BASIC_DEMO' does not exist or not authorized`. + +* **Case sensitivity / quoting** + + * FFT creates tables *unquoted*, e.g. `CREATE TABLE EXAMPLE_DEMO.BASIC_DEMO.SEED_USERS`, so Snowflake stores them as uppercase. + * Your SQL models can safely use lowercase identifiers (`select id, email from {{ ref('seed_users') }}`); Snowflake normalizes them. + * The executor takes care of quoting database/schema/table names when building fully qualified identifiers. + +* **Permissions** + + * Errors like `Object 'EXAMPLE_DEMO.BASIC_DEMO.*' does not exist or not authorized` usually mean: + + * DB/schema/table really doesn’t exist **or** + * the role in `FF_SF_ROLE` doesn’t have `USAGE` + `CREATE TABLE/VIEW` on that DB/schema. 
+ * Double-check role grants with: + + ```sql + SHOW GRANTS TO ROLE ACCOUNTADMIN; + ``` diff --git a/docs/examples/Macros_Demo.md b/docs/examples/Macros_Demo.md index 8dee673..7fcb5a8 100644 --- a/docs/examples/Macros_Demo.md +++ b/docs/examples/Macros_Demo.md @@ -1,6 +1,6 @@ # Macros Demo -**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark, BigQuery). +**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark, BigQuery, Snowflake Snowpark). You’ll see reusable SQL helpers, engine-aware SQL generation, and Python functions exposed as Jinja globals/filters. --- @@ -13,6 +13,9 @@ examples/macros_demo/ .env.dev_databricks .env.dev_duckdb .env.dev_postgres + .env.dev_bigquery_pandas + .env.dev_bigquery_bigframes + .env.dev_snowflake Makefile profiles.yml project.yml @@ -99,12 +102,16 @@ From repo root: ```bash cd examples/macros_demo -# Choose engine: duckdb (default) | postgres | databricks_spark +# Choose engine: duckdb (default) | postgres | databricks_spark | bigquery | snowflake_snowpark make ENGINE=duckdb demo # or make ENGINE=postgres demo # or make ENGINE=databricks_spark demo +# or +make ENGINE=bigquery BQ_FRAME=pandas demo # or bigframes +# or +make ENGINE=snowflake_snowpark demo ``` The `demo` target runs: @@ -115,6 +122,8 @@ The `demo` target runs: 4. `fft test` — runs example tests 5. Prints artifact paths and tries to open the DAG +> For Snowflake, copy `.env.dev_snowflake` to `.env`, fill in the `FF_SF_*` values, and install `fastflowtransform[snowflake]` so the Snowpark executor is available. 
+ --- ## Key files (highlights) diff --git a/docs/examples/Materializations_Demo.md b/docs/examples/Materializations_Demo.md index 3d89a53..20de043 100644 --- a/docs/examples/Materializations_Demo.md +++ b/docs/examples/Materializations_Demo.md @@ -11,6 +11,8 @@ examples/materializations_demo/models/ Each model type demonstrates how FastFlowTransform builds, caches, or executes models differently depending on its `materialized:` configuration. +Supported engines: DuckDB, Postgres, Databricks/Spark, BigQuery (pandas & BigFrames), and Snowflake Snowpark. + --- ## 🧩 1. View Models @@ -54,7 +56,7 @@ from {{ ref('fct_orders_view') }} * Fully rebuilt every run * Good for final curated datasets or small tables * Overwrites previous contents (atomic replace) -* Compatible with all engines (DuckDB, Postgres, BigQuery, etc.) +* Compatible with all engines (DuckDB, Postgres, Databricks, BigQuery, Snowflake) --- diff --git a/docs/index.md b/docs/index.md index 82efcc7..57ab785 100644 --- a/docs/index.md +++ b/docs/index.md @@ -47,7 +47,7 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t ### 1. Build & Operate Projects (Data Practitioners) - **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. -- **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, and Databricks Spark. +- **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, Databricks Spark, BigQuery, and Snowflake Snowpark. - **Understand the project layout & CLI workflow:** start with *Project Layout* in the [Technical Overview](Technical_Overview.md#project-layout) and pair it with the [CLI Guide](CLI_Guide.md) for command patterns. 
- **Configure runtimes & profiles:** review executor profiles and environment overrides in the dedicated [Profiles guide](Profiles.md) plus [Logging & Verbosity](Logging.md) for observability flags. - **Model data quality & troubleshoot runs:** combine the [Model Unit Tests guide](Unit_Tests.md) with [Troubleshooting & Error Codes](Troubleshooting.md) to keep runs deterministic and easy to debug. diff --git a/examples/_scripts/cleanup_env.py b/examples/_scripts/cleanup_env.py index 90e7281..4be86bf 100644 --- a/examples/_scripts/cleanup_env.py +++ b/examples/_scripts/cleanup_env.py @@ -229,6 +229,77 @@ def cleanup_databricks( return warehouse_path +def cleanup_snowflake( + *, + account: str | None, + user: str | None, + password: str | None, + warehouse: str | None, + database: str | None, + schema: str | None, + role: str | None, + dry_run: bool, +) -> None: + """ + Reset a Snowflake demo schema by dropping and recreating it. + + Uses account/user/password/warehouse/database/schema/role from + args/env/profile; intended for isolated demo schemas, not shared prod. 
+ """ + missing = [] + if not account: + missing.append("FF_SF_ACCOUNT or profile.snowflake_snowpark.account") + if not user: + missing.append("FF_SF_USER or profile.snowflake_snowpark.user") + if not password: + missing.append("FF_SF_PASSWORD or profile.snowflake_snowpark.password") + if not warehouse: + missing.append("FF_SF_WAREHOUSE or profile.snowflake_snowpark.warehouse") + if not database: + missing.append("FF_SF_DATABASE or profile.snowflake_snowpark.database") + if not schema: + missing.append("FF_SF_SCHEMA or profile.snowflake_snowpark.db_schema") + + if missing: + raise ValueError("Snowflake cleanup requires: " + ", ".join(missing)) + + if dry_run: + _log( + f"[dry-run] Would drop and recreate Snowflake schema " + f'"{database}"."{schema}" on account "{account}" (warehouse={warehouse})' + ) + return + + # Local import so non-Snowflake users don't need the dependency + import snowflake.connector # type: ignore[import] + + _log( + f"Dropping schema '{database}.{schema}' on Snowflake " + f"(account={account}, warehouse={warehouse})" + ) + + conn = snowflake.connector.connect( + account=account, + user=user, + password=password, + warehouse=warehouse, + role=role or None, + database=database, + ) + try: + cur = conn.cursor() + try: + cur.execute(f'USE DATABASE "{database}"') + cur.execute(f'DROP SCHEMA IF EXISTS "{schema}" CASCADE') + cur.execute(f'CREATE SCHEMA "{schema}"') + finally: + with suppress(Exception): + cur.close() + finally: + with suppress(Exception): + conn.close() + + def cleanup_common_artifacts( *, project: Path, dry_run: bool, extra_paths: Iterable[Path] | None = None ) -> None: @@ -286,7 +357,9 @@ def _load_profile(project: Path, env_name: str, engine: str | None): def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Reset FastFlowTransform example environments.") parser.add_argument( - "--engine", required=True, choices=["duckdb", "postgres", "databricks_spark", "bigquery"] + "--engine", + 
required=True, + choices=["duckdb", "postgres", "databricks_spark", "bigquery", "snowflake_snowpark"], ) parser.add_argument("--project", default=".") parser.add_argument("--env", help="Profile environment name (e.g. dev_duckdb).") @@ -319,7 +392,8 @@ def main(argv: list[str] | None = None) -> int: or os.getenv("FFT_ACTIVE_ENV") or ( "dev_" + args.engine - if args.engine in {"duckdb", "postgres", "databricks_spark", "bigquery"} + if args.engine + in {"duckdb", "postgres", "databricks_spark", "bigquery", "snowflake_snowpark"} else "dev" ) ) @@ -389,6 +463,28 @@ def main(argv: list[str] | None = None) -> int: location=location, dry_run=args.dry_run, ) + elif args.engine == "snowflake_snowpark": + profile_sf = getattr(profile, "snowflake_snowpark", None) if profile else None + + # Prefer CLI/env over profile, just like other engines + account = os.getenv("FF_SF_ACCOUNT") or getattr(profile_sf, "account", None) + user = os.getenv("FF_SF_USER") or getattr(profile_sf, "user", None) + password = os.getenv("FF_SF_PASSWORD") or getattr(profile_sf, "password", None) + warehouse = os.getenv("FF_SF_WAREHOUSE") or getattr(profile_sf, "warehouse", None) + database = os.getenv("FF_SF_DATABASE") or getattr(profile_sf, "database", None) + schema = os.getenv("FF_SF_SCHEMA") or getattr(profile_sf, "db_schema", None) + role = os.getenv("FF_SF_ROLE") or getattr(profile_sf, "role", None) + + cleanup_snowflake( + account=account, + user=user, + password=password, + warehouse=warehouse, + database=database, + schema=schema, + role=role, + dry_run=args.dry_run, + ) except Exception as exc: _log(f"Cleanup failed: {exc}") return 1 diff --git a/examples/api_demo/.env.dev_snowflake b/examples/api_demo/.env.dev_snowflake new file mode 100644 index 0000000..5b307bf --- /dev/null +++ b/examples/api_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the API demo + +# Your Snowflake account identifier, e.g. 
xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password for Snowflake (or use keypair auth if you extend the executor) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=API_DEMO + +# Optional role (can be left blank) +FF_SF_ROLE=ANALYST diff --git a/examples/api_demo/Makefile b/examples/api_demo/Makefile index ada8869..75aab87 100644 --- a/examples/api_demo/Makefile +++ b/examples/api_demo/Makefile @@ -9,7 +9,7 @@ DB ?= .local/api_demo.duckdb PROJECT ?= . UV ?= uv -# Engine selector (duckdb|postgres|databricks_spark) +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) ENGINE ?= duckdb # BigQuery frame type selector (pandas | bigframes) @@ -52,11 +52,15 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif -BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) ifeq ($(ENGINE),bigquery) - BASE_ENV := $(BASE_ENV) FF_ENGINE=$(ENGINE) FF_ENGINE_VARIANT=$(BQ_FRAME) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) endif RUN_ENV = $(BASE_ENV) @@ -73,6 +77,8 @@ else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),bigquery) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/api_demo/models/common/mart_users_join.ff.sql 
b/examples/api_demo/models/common/mart_users_join.ff.sql index ad6feef..25eb2f9 100644 --- a/examples/api_demo/models/common/mart_users_join.ff.sql +++ b/examples/api_demo/models/common/mart_users_join.ff.sql @@ -6,7 +6,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git a/examples/api_demo/models/common/users.ff.sql b/examples/api_demo/models/common/users.ff.sql index b170c5a..ab925cf 100644 --- a/examples/api_demo/models/common/users.ff.sql +++ b/examples/api_demo/models/common/users.ff.sql @@ -7,7 +7,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} -- Simple staging table from seed diff --git a/examples/api_demo/models/engines/snowflake_snowpark/api_users_http.ff.py b/examples/api_demo/models/engines/snowflake_snowpark/api_users_http.ff.py new file mode 100644 index 0000000..cdf1a6f --- /dev/null +++ b/examples/api_demo/models/engines/snowflake_snowpark/api_users_http.ff.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pandas as pd + +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame + from snowflake.snowpark import Session as SnowparkSession +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + SnowparkSession = Any + + +def _ensure_session(df: Any) -> "SnowparkSession": + try: + from snowflake.snowpark import Session as _Session + except Exception as exc: # pragma: no cover - optional dependency guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." 
+ ) from exc + + session = getattr(df, "session", None) + if session is None: + raise RuntimeError( + "Snowpark session missing on upstream DataFrame. " + "Ensure Snowflake Snowpark is the active engine." + ) + return session + + +@engine_model( + only="snowflake_snowpark", + name="api_users_http", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:snowflake_snowpark"], +) +def fetch(users_df: SnowparkDataFrame) -> SnowparkDataFrame: + """ + Fetch demo users via the FFT HTTP helper and return a Snowpark DataFrame. + """ + session = _ensure_session(users_df) + pdf = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + ) + + projected = ( + pdf.loc[:, ["id", "email", "username", "name"]] + .rename(columns={"id": "api_user_id"}) + .astype({"api_user_id": pd.Int64Dtype()}, copy=False) + ) + return session.create_dataframe(projected) diff --git a/examples/api_demo/models/engines/snowflake_snowpark/api_users_requests.ff.py b/examples/api_demo/models/engines/snowflake_snowpark/api_users_requests.ff.py new file mode 100644 index 0000000..7b55f5d --- /dev/null +++ b/examples/api_demo/models/engines/snowflake_snowpark/api_users_requests.ff.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame + from snowflake.snowpark import Session as SnowparkSession +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + SnowparkSession = Any + + +def _ensure_session(df: Any) -> "SnowparkSession": + try: + from snowflake.snowpark import Session as _Session + except Exception as exc: # pragma: no cover - optional dependency guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." 
+ ) from exc + + session = getattr(df, "session", None) + if session is None: + raise RuntimeError( + "Snowpark session missing on upstream DataFrame. " + "Ensure Snowflake Snowpark is selected via profiles.yml." + ) + return session + + +try: + import httpx +except Exception as _e: # pragma: no cover - optional dep + raise RuntimeError("Please install 'httpx' to run this model") from _e + + +@engine_model( + only="snowflake_snowpark", + name="api_users_requests", + deps=["users.ff"], + tags=["example:api_demo", "scope:engine", "engine:snowflake_snowpark"], +) +def fetch(users_df: SnowparkDataFrame) -> SnowparkDataFrame: + """ + Fetch demo users via plain httpx and return a Snowpark DataFrame. + """ + session = _ensure_session(users_df) + resp = httpx.get("https://jsonplaceholder.typicode.com/users", timeout=30.0) + resp.raise_for_status() + rows = resp.json() + + projected = [ + ( + row.get("id"), + row.get("email"), + row.get("username"), + row.get("name"), + ) + for row in rows + ] + + schema = ["api_user_id", "email", "username", "name"] + return session.create_dataframe(projected, schema=schema) diff --git a/examples/api_demo/profiles.yml b/examples/api_demo/profiles.yml index 88d7cd5..16fc055 100644 --- a/examples/api_demo/profiles.yml +++ b/examples/api_demo/profiles.yml @@ -42,3 +42,15 @@ dev_bigquery_pandas: location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: false # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'API_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/api_demo/seeds/seed_users.csv b/examples/api_demo/seeds/seed_users.csv index 
2acf25f..94d8004 100644 --- a/examples/api_demo/seeds/seed_users.csv +++ b/examples/api_demo/seeds/seed_users.csv @@ -1,4 +1,4 @@ id,email -1,a@example.com -2,b@gmail.com -3,c@gmail.com +1,Rey.Padberg@karina.biz +2,Lucio_Hettinger@annie.ca +3,Sincere@april.biz diff --git a/examples/basic_demo/.env.dev_snowflake b/examples/basic_demo/.env.dev_snowflake new file mode 100644 index 0000000..48f931d --- /dev/null +++ b/examples/basic_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the basic demo + +# Your Snowflake account identifier, e.g. xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password for Snowflake (or use keypair auth if you extend the executor) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=BASIC_DEMO + +# Optional role (can be left blank) +FF_SF_ROLE=ANALYST diff --git a/examples/basic_demo/Makefile b/examples/basic_demo/Makefile index 7c4c675..22f0d63 100644 --- a/examples/basic_demo/Makefile +++ b/examples/basic_demo/Makefile @@ -6,7 +6,7 @@ DB ?= .local/basic_demo.duckdb PROJECT ?= . 
UV ?= uv -# Engine selector (duckdb|postgres|databricks_spark) +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) ENGINE ?= duckdb # BigQuery frame type selector (pandas | bigframes) @@ -34,6 +34,10 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) @@ -57,6 +61,8 @@ else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),bigquery) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark) endif diff --git a/examples/basic_demo/README.md b/examples/basic_demo/README.md index adf92ec..fa75cc0 100644 --- a/examples/basic_demo/README.md +++ b/examples/basic_demo/README.md @@ -1,8 +1,8 @@ # Basic demo -Minimal FFT pipeline that runs unchanged on DuckDB, Postgres, Databricks Spark, and BigQuery. +Minimal FFT pipeline that runs unchanged on DuckDB, Postgres, Databricks Spark, BigQuery, and Snowflake (Snowpark). ## How to use - See the full walkthrough (env setup, Makefile targets, engine notes, DQ tests) in `docs/examples/Basic_Demo.md`. -- From this directory: set the desired `.env.dev_*` (for BigQuery choose `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes`), then run `make demo ENGINE=` (set `BQ_FRAME` to switch BigQuery client) to seed → run → dag → test. 
+- From this directory: set the desired `.env.dev_*` (for BigQuery choose `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes`), then run `make demo ENGINE=<duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark>` (set `BQ_FRAME` to switch BigQuery client) to seed → run → dag → test. - To inspect results, open `site/dag/index.html` after a run or query the mart tables via your engine client. diff --git a/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py new file mode 100644 index 0000000..19e6538 --- /dev/null +++ b/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: + from snowflake.snowpark import DataFrame + from snowflake.snowpark import functions as F + from snowflake.snowpark.window import WindowSpec +else: + DataFrame = Any + F = Any + WindowSpec = Any + + +def _get_snowpark_utils() -> tuple[Any, Any]: + try: + from snowflake.snowpark import functions as _F + from snowflake.snowpark.window import Window as _Window + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." 
+ ) from exc + return _Window, _F + + +@engine_model( + only="snowflake_snowpark", + name="mart_latest_signup", + materialized="table", + tags=[ + "example:basic_demo", + "scope:mart", + "engine:snowflake_snowpark", + ], + deps=["users_clean.ff"], + require={"users_clean.ff": ["user_id", "email", "email_domain", "signup_date"]}, +) +def build(users_clean: DataFrame) -> DataFrame: + """Return the latest signup per email domain using Snowpark DataFrame operations.""" + Window, F = _get_snowpark_utils() + + window: WindowSpec = Window.partitionBy(F.col("email_domain")).orderBy( + F.col("signup_date").desc() + ) + + latest = ( + users_clean.withColumn("row_number", F.row_number().over(window)) + .filter(F.col("row_number") == 1) + .select( + F.col("email_domain"), + F.col("user_id").as_("latest_user_id"), + F.col("email").as_("latest_email"), + F.col("signup_date").as_("latest_signup_date"), + ) + ) + return latest diff --git a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql index 170632c..5a04f6f 100644 --- a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql +++ b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql @@ -6,7 +6,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark', ], ) }} diff --git a/examples/basic_demo/models/staging/users_clean.ff.sql b/examples/basic_demo/models/staging/users_clean.ff.sql index b91afbb..26f255a 100644 --- a/examples/basic_demo/models/staging/users_clean.ff.sql +++ b/examples/basic_demo/models/staging/users_clean.ff.sql @@ -6,7 +6,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark', ], ) }} diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml index f0d88bc..361d0c5 100644 --- a/examples/basic_demo/profiles.yml +++ 
b/examples/basic_demo/profiles.yml @@ -33,7 +33,7 @@ dev_bigquery_bigframes: dataset: "{{ env('FF_BQ_DATASET', 'basic_demo') }}" location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: true - # allow_create_dataset: true # uncomment to auto-create dataset on first run + allow_create_dataset: true dev_bigquery_pandas: engine: bigquery @@ -42,4 +42,16 @@ dev_bigquery_pandas: dataset: "{{ env('FF_BQ_DATASET', 'basic_demo') }}" location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: false - # allow_create_dataset: true # uncomment to auto-create dataset on first run + allow_create_dataset: true + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'BASIC_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/cache_demo/.env.dev_snowflake b/examples/cache_demo/.env.dev_snowflake new file mode 100644 index 0000000..94584e9 --- /dev/null +++ b/examples/cache_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the cache demo + +# Your Snowflake account identifier, e.g. 
xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password (or extend the profile for keypair auth) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=CACHE_DEMO + +# Optional role (can be left blank) +FF_SF_ROLE=ANALYST diff --git a/examples/cache_demo/Makefile b/examples/cache_demo/Makefile index b57f5a6..3511d0d 100644 --- a/examples/cache_demo/Makefile +++ b/examples/cache_demo/Makefile @@ -3,7 +3,7 @@ http_first http_offline http_cache_clear artifacts dag clean \ demo -ENGINE ?= duckdb # duckdb | postgres | databricks_spark | bigquery +ENGINE ?= duckdb # duckdb | postgres | databricks_spark | bigquery | snowflake_snowpark # BigQuery frame selector (pandas | bigframes) BQ_FRAME ?= bigframes PROJECT ?= . @@ -31,8 +31,12 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif ifndef PROFILE_ENV - $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery) + $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) endif BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) @@ -54,8 +58,10 @@ else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),bigquery) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else - $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery) + $(error 
Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) endif seed: diff --git a/examples/cache_demo/README.md b/examples/cache_demo/README.md index 419ea60..62ca284 100644 --- a/examples/cache_demo/README.md +++ b/examples/cache_demo/README.md @@ -10,11 +10,12 @@ This demo shows: ## Quickstart ```bash -# pick your engine (duckdb, postgres, databricks_spark, or bigquery); defaults to duckdb +# pick your engine (duckdb, postgres, databricks_spark, bigquery, or snowflake_snowpark); defaults to duckdb cp .env.dev_duckdb .env # or: cp .env.dev_postgres .env (then edit DSN/schema) # or: cp .env.dev_databricks .env # or: cp .env.dev_bigquery_pandas .env # or .env.dev_bigquery_bigframes +# or: cp .env.dev_snowflake .env cd examples/cache_demo make cache_first ENGINE=duckdb # builds and writes cache @@ -44,6 +45,7 @@ Code kopieren To run everything on Postgres, set `ENGINE=postgres` and copy/edit `.env.dev_postgres`, e.g. `make demo ENGINE=postgres`. To run on Databricks/Spark locally, set `ENGINE=databricks_spark` and copy/edit `.env.dev_databricks`, e.g. `make demo ENGINE=databricks_spark`. To run on BigQuery, set `ENGINE=bigquery` and copy/edit `.env.dev_bigquery_pandas` (or `.env.dev_bigquery_bigframes`), e.g. `make demo ENGINE=bigquery BQ_FRAME=bigframes` (default) or `BQ_FRAME=pandas`. +To run on Snowflake Snowpark, install `fastflowtransform[snowflake]`, set `ENGINE=snowflake_snowpark`, copy/edit `.env.dev_snowflake`, and run e.g. `make demo ENGINE=snowflake_snowpark`. 
## What this demo proves (in a minute) diff --git a/examples/cache_demo/models/engines/snowflake_snowpark/http_users.ff.py b/examples/cache_demo/models/engines/snowflake_snowpark/http_users.ff.py new file mode 100644 index 0000000..c428a4e --- /dev/null +++ b/examples/cache_demo/models/engines/snowflake_snowpark/http_users.ff.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model +from fastflowtransform.api.http import get_df + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame + from snowflake.snowpark import Session as SnowparkSession +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + SnowparkSession = Any + + +def _ensure_session(df: Any) -> "SnowparkSession": + try: + from snowflake.snowpark import Session as _Session + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." + ) from exc + + session = getattr(df, "session", None) + if session is None: + raise RuntimeError( + "Snowpark session missing on upstream DataFrame. " + "Ensure Snowflake Snowpark is the active engine." + ) + return session + + +@engine_model( + only="snowflake_snowpark", + name="http_users", + deps=["stg_users.ff"], + meta={ + "materialized": "table", + "tags": ["example:cache_demo", "engine:snowflake_snowpark"], + }, +) +def fetch(users_df: SnowparkDataFrame) -> SnowparkDataFrame: + """ + Fetch demo users via the FFT HTTP helper and return a Snowpark DataFrame. 
+ """ + session = _ensure_session(users_df) + pdf = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, + normalize=True, + ) + projected = pdf.loc[:, ["id", "email", "username"]].rename(columns={"id": "api_user_id"}) + return session.create_dataframe(projected.to_dict("records")) diff --git a/examples/cache_demo/models/engines/snowflake_snowpark/py_constants.ff.py b/examples/cache_demo/models/engines/snowflake_snowpark/py_constants.ff.py new file mode 100644 index 0000000..0d3c039 --- /dev/null +++ b/examples/cache_demo/models/engines/snowflake_snowpark/py_constants.ff.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame + from snowflake.snowpark import Session as SnowparkSession +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + SnowparkSession = Any + + +def _ensure_session(df: Any) -> "SnowparkSession": + try: + from snowflake.snowpark import Session as _Session + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." + ) from exc + + session = getattr(df, "session", None) + if session is None: + raise RuntimeError( + "Snowpark session missing on upstream DataFrame. 
" + "Ensure Snowflake Snowpark is selected via profiles.yml.", + ) + return session + + +@engine_model( + only="snowflake_snowpark", + name="py_constants", + deps=["stg_users.ff"], + materialized="table", + tags=[ + "example:cache_demo", + "engine:snowflake_snowpark", + ], +) +def build(stg_users: SnowparkDataFrame) -> SnowparkDataFrame: + """Snowpark variant that materializes the constant table.""" + session = _ensure_session(stg_users) + rows = [("answer", 42)] + return session.create_dataframe(rows, schema=["k", "v"]) diff --git a/examples/cache_demo/models/marts/mart_user_orders.ff.sql b/examples/cache_demo/models/marts/mart_user_orders.ff.sql index b5aa71b..f2bcce5 100644 --- a/examples/cache_demo/models/marts/mart_user_orders.ff.sql +++ b/examples/cache_demo/models/marts/mart_user_orders.ff.sql @@ -1,4 +1,4 @@ -{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery']) }} +{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery','engine:snowflake_snowpark']) }} with u as ( select user_id, email from {{ ref('stg_users.ff') }} ), diff --git a/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql b/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql index 156f624..ff87f94 100644 --- a/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql +++ b/examples/cache_demo/models/seeds_consumers/stg_orders.ff.sql @@ -1,4 +1,4 @@ -{{ config(materialized='view', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery']) }} +{{ config(materialized='view', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery','engine:snowflake_snowpark']) }} select cast(order_id as int) as order_id, cast(customer_id as int) as user_id, diff --git a/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql 
b/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql index 7862f7b..303fb92 100644 --- a/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql +++ b/examples/cache_demo/models/seeds_consumers/stg_users.ff.sql @@ -1,3 +1,3 @@ -{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery']) }} +{{ config(materialized='table', tags=['example:cache_demo','engine:duckdb','engine:postgres','engine:databricks_spark','engine:bigquery','engine:snowflake_snowpark']) }} select cast(id as int) as user_id, lower(email) as email from {{ source('crm', 'users') }}; diff --git a/examples/cache_demo/profiles.yml b/examples/cache_demo/profiles.yml index fccfda5..a1d96ac 100644 --- a/examples/cache_demo/profiles.yml +++ b/examples/cache_demo/profiles.yml @@ -37,3 +37,15 @@ dev_bigquery_pandas: location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: false # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'CACHE_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/dq_demo/.env.dev_snowflake b/examples/dq_demo/.env.dev_snowflake new file mode 100644 index 0000000..403dc79 --- /dev/null +++ b/examples/dq_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the DQ demo + +# Your Snowflake account identifier, e.g. 
xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password (or extend to keypair auth) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=DQ_DEMO + +# Optional role +FF_SF_ROLE=ANALYST diff --git a/examples/dq_demo/Makefile b/examples/dq_demo/Makefile index ce2fca4..599f2c4 100644 --- a/examples/dq_demo/Makefile +++ b/examples/dq_demo/Makefile @@ -5,7 +5,7 @@ PROJECT ?= . UV ?= uv -# Engine selector (duckdb|postgres|databricks_spark) +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) ENGINE ?= duckdb # BigQuery frame selector (pandas|bigframes) @@ -40,6 +40,10 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) @@ -61,6 +65,8 @@ else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),bigquery) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/dq_demo/README.md b/examples/dq_demo/README.md index 68319e8..b271774 100644 --- a/examples/dq_demo/README.md +++ b/examples/dq_demo/README.md @@ -1,6 +1,6 @@ # Data Quality Demo -Run the complete DQ demo (seeds → models → DAG → tests) on DuckDB, Postgres, Databricks Spark, or BigQuery (pandas or BigFrames). 
+Run the complete DQ demo (seeds → models → DAG → tests) on DuckDB, Postgres, Databricks Spark, BigQuery (pandas or BigFrames), or Snowflake Snowpark. ## Quickstart From this directory: @@ -11,6 +11,7 @@ From this directory: - Databricks Spark: `.env.dev_databricks` - BigQuery (pandas): `.env.dev_bigquery_pandas` - BigQuery (BigFrames): `.env.dev_bigquery_bigframes` + - Snowflake Snowpark: `.env.dev_snowflake` 2) Run the demo (set `BQ_FRAME` when using BigQuery): ```sh @@ -18,6 +19,7 @@ From this directory: make demo ENGINE=postgres make demo ENGINE=databricks_spark make demo ENGINE=bigquery BQ_FRAME=pandas # or bigframes + make demo ENGINE=snowflake_snowpark # install fastflowtransform[snowflake] ``` Artifacts: diff --git a/examples/dq_demo/models/marts/mart_orders_agg.ff.sql b/examples/dq_demo/models/marts/mart_orders_agg.ff.sql index 1a9d686..2e20869 100644 --- a/examples/dq_demo/models/marts/mart_orders_agg.ff.sql +++ b/examples/dq_demo/models/marts/mart_orders_agg.ff.sql @@ -6,7 +6,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git a/examples/dq_demo/models/staging/customers.ff.sql b/examples/dq_demo/models/staging/customers.ff.sql index d1e2583..74697ed 100644 --- a/examples/dq_demo/models/staging/customers.ff.sql +++ b/examples/dq_demo/models/staging/customers.ff.sql @@ -6,7 +6,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git a/examples/dq_demo/models/staging/orders.ff.sql b/examples/dq_demo/models/staging/orders.ff.sql index 3a2c511..83e729c 100644 --- a/examples/dq_demo/models/staging/orders.ff.sql +++ b/examples/dq_demo/models/staging/orders.ff.sql @@ -6,7 +6,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git 
a/examples/dq_demo/profiles.yml b/examples/dq_demo/profiles.yml index 8df61e7..be54f0e 100644 --- a/examples/dq_demo/profiles.yml +++ b/examples/dq_demo/profiles.yml @@ -42,3 +42,15 @@ dev_bigquery_pandas: location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: false # allow_create_dataset: true + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'DQ_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/incremental_demo/.env.dev_snowflake b/examples/incremental_demo/.env.dev_snowflake new file mode 100644 index 0000000..de77a97 --- /dev/null +++ b/examples/incremental_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the incremental demo + +# Your Snowflake account identifier, e.g. xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password (or extend for keypair auth) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=INCREMENTAL_DEMO + +# Optional role (can be left blank) +FF_SF_ROLE=ANALYST diff --git a/examples/incremental_demo/Makefile b/examples/incremental_demo/Makefile index 03b97d1..efe8e10 100644 --- a/examples/incremental_demo/Makefile +++ b/examples/incremental_demo/Makefile @@ -6,7 +6,7 @@ DB ?= .local/incremental_demo.duckdb PROJECT ?= . 
UV ?= uv -# Engine selector (duckdb|postgres|databricks_spark) +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) ENGINE ?= duckdb # BigQuery frame type selector (pandas | bigframes) @@ -52,13 +52,17 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif -BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) ifeq ($(ENGINE),databricks_spark) BASE_ENV := $(BASE_ENV) FF_DBR_TABLE_FORMAT=$(DBR_TABLE_FORMAT) endif ifeq ($(ENGINE),bigquery) - BASE_ENV := $(BASE_ENV) FF_ENGINE=$(ENGINE) FF_ENGINE_VARIANT=$(BQ_FRAME) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) endif RUN_ENV = $(BASE_ENV) @@ -75,6 +79,8 @@ else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),bigquery) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/incremental_demo/README.md b/examples/incremental_demo/README.md index c61aa84..400823f 100644 --- a/examples/incremental_demo/README.md +++ b/examples/incremental_demo/README.md @@ -1,10 +1,10 @@ # Incremental demo Small FFT example that showcases incremental models and Delta/Iceberg-style merges -across DuckDB, Postgres, Databricks Spark, and BigQuery (pandas or BigFrames). +across DuckDB, Postgres, Databricks Spark, BigQuery (pandas or BigFrames), and Snowflake Snowpark. 
## How to use -- Fill an `.env.dev_*` for your engine (DuckDB/Postgres/Databricks/BigQuery). For BigQuery use `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes` plus a service-account key in `secrets/`. -- From this directory run `make demo ENGINE=<engine>` (set `BQ_FRAME` to switch BigQuery client; set `DBR_TABLE_FORMAT` for Spark). +- Fill an `.env.dev_*` for your engine (DuckDB/Postgres/Databricks/BigQuery/Snowflake). For BigQuery use `.env.dev_bigquery_pandas` or `.env.dev_bigquery_bigframes`; for Snowflake use `.env.dev_snowflake`. +- From this directory run `make demo ENGINE=<engine>` (set `BQ_FRAME` for BigQuery, `DBR_TABLE_FORMAT` for Spark). - Artifacts: DAG HTML in `site/dag/index.html`, FFT metadata in `.fastflowtransform/target/`. - See `docs/examples/Incremental_Demo.md` for a full walkthrough of the models and incremental configs. diff --git a/examples/incremental_demo/models/common/events_base.ff.sql b/examples/incremental_demo/models/common/events_base.ff.sql index 68ea352..cb4276f 100644 --- a/examples/incremental_demo/models/common/events_base.ff.sql +++ b/examples/incremental_demo/models/common/events_base.ff.sql @@ -7,7 +7,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git a/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql b/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql index 5bee73f..5680e8a 100644 --- a/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql +++ b/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql @@ -12,7 +12,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git a/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql b/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql index 34ee345..1816dfb 100644 --- 
a/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql +++ b/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql @@ -8,7 +8,8 @@ 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', - 'engine:bigquery' + 'engine:bigquery', + 'engine:snowflake_snowpark' ], ) }} diff --git a/examples/incremental_demo/models/engines/snowflake_snowpark/fct_events_py_incremental.ff.py b/examples/incremental_demo/models/engines/snowflake_snowpark/fct_events_py_incremental.ff.py new file mode 100644 index 0000000..907dc35 --- /dev/null +++ b/examples/incremental_demo/models/engines/snowflake_snowpark/fct_events_py_incremental.ff.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + + +def _get_snowpark_functions(): + try: + from snowflake.snowpark import functions as _functions + except Exception as exc: # pragma: no cover - optional dependency guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." 
+ ) from exc + return _functions + + +@engine_model( + only="snowflake_snowpark", + name="fct_events_py_incremental", + deps=["events_base.ff"], + tags=[ + "example:incremental_demo", + "scope:engine", + "engine:snowflake_snowpark", + "kind:python", + "kind:incremental", + ], +) +def build(events_df: SnowparkDataFrame) -> SnowparkDataFrame: + """Snowpark variant of the incremental Python model.""" + F = _get_snowpark_functions() + df = events_df.withColumn("value_x10", F.col("value") * F.lit(10)) + return df.select("event_id", "updated_at", "value", "value_x10") diff --git a/examples/incremental_demo/profiles.yml b/examples/incremental_demo/profiles.yml index 8ab139b..692e7b5 100644 --- a/examples/incremental_demo/profiles.yml +++ b/examples/incremental_demo/profiles.yml @@ -83,3 +83,15 @@ dev_bigquery_pandas: location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: false # allow_create_dataset: true # uncomment to auto-create dataset on first run + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'INCREMENTAL_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/macros_demo/.env.dev_snowflake b/examples/macros_demo/.env.dev_snowflake new file mode 100644 index 0000000..fdffc89 --- /dev/null +++ b/examples/macros_demo/.env.dev_snowflake @@ -0,0 +1,16 @@ +# Snowflake Snowpark profile for the macros demo + +# Account identifier, e.g. 
xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Credentials (or extend for keypair auth) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Warehouse / database / schema +FF_SF_WAREHOUSE=COMPUTE_WH +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=MACROS_DEMO + +# Optional role +FF_SF_ROLE=ANALYST diff --git a/examples/macros_demo/Makefile b/examples/macros_demo/Makefile index fbc7b75..cf60c26 100644 --- a/examples/macros_demo/Makefile +++ b/examples/macros_demo/Makefile @@ -33,6 +33,10 @@ ifeq ($(ENGINE),bigquery) PROFILE_ENV = dev_bigquery_bigframes endif endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) @@ -52,6 +56,8 @@ else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),bigquery) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/macros_demo/README.md b/examples/macros_demo/README.md index 95ca70c..d73255e 100644 --- a/examples/macros_demo/README.md +++ b/examples/macros_demo/README.md @@ -1,13 +1,12 @@ -# FastFlowTransform project scaffold +# Macros demo -This project was created with `fft init`. -Next steps: -1. Update `profiles.yml` with real connection details (docs/Profiles.md). -2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). -3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). +FastFlowTransform example that highlights SQL & Python macros. 
See `docs/examples/Macros_Demo.md` +for a full walkthrough. ## Engines -- DuckDB/Postgres/Databricks Spark are pre-wired. Use `make demo ENGINE=duckdb|postgres|databricks_spark`. -- BigQuery (pandas or BigFrames) mirrors the basic demo setup. Set `ENGINE=bigquery` and optionally `BQ_FRAME=pandas|bigframes` (default bigframes), then run `make demo ENGINE=bigquery BQ_FRAME=bigframes`. -- Sample env files: `.env.dev_bigquery_bigframes` and `.env.dev_bigquery_pandas` contain the required `FF_BQ_*` variables and `GOOGLE_APPLICATION_CREDENTIALS` hint. +- DuckDB/Postgres/Databricks Spark are pre-wired. Run `make demo ENGINE=duckdb|postgres|databricks_spark`. +- BigQuery (pandas or BigFrames) mirrors the basic demo setup. Set `ENGINE=bigquery` and optionally + `BQ_FRAME=pandas|bigframes` (default `bigframes`), then run `make demo ENGINE=bigquery BQ_FRAME=`. +- Snowflake Snowpark is available via `ENGINE=snowflake_snowpark`. Copy `.env.dev_snowflake` and install + `fastflowtransform[snowflake]` before running `make demo ENGINE=snowflake_snowpark`. 
diff --git a/examples/macros_demo/models/common/dim_users.ff.sql b/examples/macros_demo/models/common/dim_users.ff.sql index 30bc807..172bdd0 100644 --- a/examples/macros_demo/models/common/dim_users.ff.sql +++ b/examples/macros_demo/models/common/dim_users.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='table', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} with u as ( diff --git a/examples/macros_demo/models/common/fct_user_sales.ff.sql b/examples/macros_demo/models/common/fct_user_sales.ff.sql index c3e8fa2..1e0a628 100644 --- a/examples/macros_demo/models/common/fct_user_sales.ff.sql +++ b/examples/macros_demo/models/common/fct_user_sales.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='table', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} with o as ( diff --git a/examples/macros_demo/models/common/stg_orders.ff.sql b/examples/macros_demo/models/common/stg_orders.ff.sql index f872e6c..a041e2f 100644 --- a/examples/macros_demo/models/common/stg_orders.ff.sql +++ b/examples/macros_demo/models/common/stg_orders.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='view', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} select diff --git a/examples/macros_demo/models/common/stg_users.ff.sql b/examples/macros_demo/models/common/stg_users.ff.sql index 
1da7408..f1c75bd 100644 --- a/examples/macros_demo/models/common/stg_users.ff.sql +++ b/examples/macros_demo/models/common/stg_users.ff.sql @@ -1,6 +1,6 @@ {{ config( materialized='view', - tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'] + tags=['example:macros_demo', 'scope:common', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} with src as ( diff --git a/examples/macros_demo/models/engines/snowflake_snowpark/py_exmaple.ff.py b/examples/macros_demo/models/engines/snowflake_snowpark/py_exmaple.ff.py new file mode 100644 index 0000000..c0d69b9 --- /dev/null +++ b/examples/macros_demo/models/engines/snowflake_snowpark/py_exmaple.ff.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + + +def _get_session(df: Any) -> SnowparkDataFrame: + session = getattr(df, "session", None) + if session is None: + raise RuntimeError( + "Snowpark session missing on upstream DataFrame. " + "Ensure Snowflake Snowpark is the active engine." 
+ ) + return session + + +@engine_model( + only="snowflake_snowpark", + name="py_example", + deps=["fct_user_sales.ff"], + tags=["example:macros_demo", "scope:engine", "engine:snowflake_snowpark"], +) +def produce(sales_df: SnowparkDataFrame) -> SnowparkDataFrame: + session = _get_session(sales_df) + data = [{"note": "Python model ran on Snowflake Snowpark"}] + return session.create_dataframe(data) diff --git a/examples/macros_demo/profiles.yml b/examples/macros_demo/profiles.yml index 3da5cee..4c35722 100644 --- a/examples/macros_demo/profiles.yml +++ b/examples/macros_demo/profiles.yml @@ -39,3 +39,15 @@ dev_bigquery_pandas: location: "{{ env('FF_BQ_LOCATION', 'EU') }}" use_bigframes: false allow_create_dataset: true + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'MACROS_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/materializations_demo/.env.dev_bigquery_bigframes b/examples/materializations_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..3423acd --- /dev/null +++ b/examples/materializations_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,8 @@ +# BigQuery (BigFrames) profile for the materializations demo + +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=materializations_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON (or rely on gcloud ADC) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/materializations_demo/.env.dev_bigquery_pandas b/examples/materializations_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..600d5a5 --- /dev/null +++ b/examples/materializations_demo/.env.dev_bigquery_pandas @@ -0,0 +1,8 @@ +# BigQuery (pandas) profile for the 
materializations demo + +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=materializations_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON (or rely on gcloud ADC) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/materializations_demo/.env.dev_snowflake b/examples/materializations_demo/.env.dev_snowflake new file mode 100644 index 0000000..ee3c7d7 --- /dev/null +++ b/examples/materializations_demo/.env.dev_snowflake @@ -0,0 +1,16 @@ +# Snowflake Snowpark profile for the materializations demo + +# Account identifier (e.g. xy12345.eu-central-1) +FF_SF_ACCOUNT=your_account_id + +# Credentials +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Warehouse / database / schema +FF_SF_WAREHOUSE=COMPUTE_WH +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=MATERIALIZATIONS_DEMO + +# Optional role +FF_SF_ROLE=ANALYST diff --git a/examples/materializations_demo/Makefile b/examples/materializations_demo/Makefile index 635292c..c117a85 100644 --- a/examples/materializations_demo/Makefile +++ b/examples/materializations_demo/Makefile @@ -8,8 +8,9 @@ DB ?= .local/materializations_demo.duckdb PROJECT ?= . 
UV ?= uv -# Engine selector (duckdb|postgres|databricks_spark) +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) ENGINE ?= duckdb +BQ_FRAME ?= bigframes # Detect OS opener (macOS: open, Linux: xdg-open) UNAME_S := $(shell uname -s) @@ -31,8 +32,24 @@ ifeq ($(ENGINE),databricks_spark) PROFILE_ENV = dev_databricks ENGINE_TAG = engine:databricks_spark endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif -BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) RUN_ENV = $(BASE_ENV) # Select only models for this demo + active engine @@ -46,6 +63,10 @@ else ifeq ($(ENGINE),postgres) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" else ifeq ($(ENGINE),databricks_spark) CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" else CLEAN_CMD = $(error Unsupported ENGINE=$(ENGINE) for cleanup) endif diff --git a/examples/materializations_demo/README.md b/examples/materializations_demo/README.md index 5e977f7..b20896c 100644 --- a/examples/materializations_demo/README.md +++ b/examples/materializations_demo/README.md @@ -1,7 +1,12 @@ -# FastFlowTransform project scaffold +# 
Materializations demo -This project was created with `fft init`. -Next steps: -1. Update `profiles.yml` with real connection details (docs/Profiles.md). -2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). -3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). +FastFlowTransform example highlighting materialized views/tables, incremental models, and Python emitters. +See `docs/examples/Materializations_Demo.md` for a full walkthrough. + +## Engines + +- DuckDB/Postgres/Databricks Spark are wired via the Makefile: `make demo ENGINE=duckdb|postgres|databricks_spark`. +- BigQuery supports both pandas and BigFrames clients. Copy `.env.dev_bigquery_pandas` (or `_bigframes`), + set `GOOGLE_APPLICATION_CREDENTIALS`, and run `make demo ENGINE=bigquery BQ_FRAME=pandas|bigframes`. +- Snowflake Snowpark mirrors the basic demo setup. Copy `.env.dev_snowflake`, install `fastflowtransform[snowflake]`, + then run `make demo ENGINE=snowflake_snowpark`. 
diff --git a/examples/materializations_demo/models/common/dim_customers.ff.sql b/examples/materializations_demo/models/common/dim_customers.ff.sql index a6bf77a..e3aa494 100644 --- a/examples/materializations_demo/models/common/dim_customers.ff.sql +++ b/examples/materializations_demo/models/common/dim_customers.ff.sql @@ -1,7 +1,7 @@ {{ config( materialized='table', tags=['example:materializations_demo', 'scope:dim', - 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} -- Dimension table; stable per customer diff --git a/examples/materializations_demo/models/common/fct_orders_inc.ff.sql b/examples/materializations_demo/models/common/fct_orders_inc.ff.sql index 308ed42..dfe5621 100644 --- a/examples/materializations_demo/models/common/fct_orders_inc.ff.sql +++ b/examples/materializations_demo/models/common/fct_orders_inc.ff.sql @@ -1,7 +1,7 @@ {{ config( materialized='incremental', tags=['example:materializations_demo', 'scope:fct', - 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'], + 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'], incremental={ 'updated_at_column': 'order_ts' }, diff --git a/examples/materializations_demo/models/common/mart_order_summary.ff.sql b/examples/materializations_demo/models/common/mart_order_summary.ff.sql index 89dcacc..30a35f0 100644 --- a/examples/materializations_demo/models/common/mart_order_summary.ff.sql +++ b/examples/materializations_demo/models/common/mart_order_summary.ff.sql @@ -1,7 +1,7 @@ {{ config( materialized='table', tags=['example:materializations_demo', 'scope:mart', - 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} with orders as ( diff --git 
a/examples/materializations_demo/models/common/order_flags_ephemeral.ff.sql b/examples/materializations_demo/models/common/order_flags_ephemeral.ff.sql index 3dd6d68..3737d7d 100644 --- a/examples/materializations_demo/models/common/order_flags_ephemeral.ff.sql +++ b/examples/materializations_demo/models/common/order_flags_ephemeral.ff.sql @@ -1,7 +1,7 @@ {{ config( materialized='ephemeral', tags=['example:materializations_demo', 'scope:helpers', - 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} -- Not persisted; will be inlined as a CTE where referenced diff --git a/examples/materializations_demo/models/common/stg_customers.ff.sql b/examples/materializations_demo/models/common/stg_customers.ff.sql index 0dc7a30..45508ae 100644 --- a/examples/materializations_demo/models/common/stg_customers.ff.sql +++ b/examples/materializations_demo/models/common/stg_customers.ff.sql @@ -1,7 +1,7 @@ {{ config( materialized='view', tags=['example:materializations_demo', 'scope:staging', - 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} -- Lightweight projection and type normalization from the seed diff --git a/examples/materializations_demo/models/common/stg_orders.ff.sql b/examples/materializations_demo/models/common/stg_orders.ff.sql index 91b5187..bb2b863 100644 --- a/examples/materializations_demo/models/common/stg_orders.ff.sql +++ b/examples/materializations_demo/models/common/stg_orders.ff.sql @@ -1,7 +1,7 @@ {{ config( materialized='view', tags=['example:materializations_demo', 'scope:staging', - 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark'] + 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery', 'engine:snowflake_snowpark'] ) }} -- Normalize order columns and force 
types portable across engines diff --git a/examples/materializations_demo/models/engines/bigquery/bigframes/demo_py_emit.ff.py b/examples/materializations_demo/models/engines/bigquery/bigframes/demo_py_emit.ff.py new file mode 100644 index 0000000..1a340ff --- /dev/null +++ b/examples/materializations_demo/models/engines/bigquery/bigframes/demo_py_emit.ff.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model +import pandas as pd + +if TYPE_CHECKING: + from bigframes.dataframe import DataFrame as BFDataFrame +else: + BFDataFrame = Any + + +def _ensure_bigframes(): + try: + import bigframes.pandas as bpd # type: ignore + except Exception as exc: # pragma: no cover + raise RuntimeError( + "bigframes is required for this model. Install fastflowtransform[bigquery_bf]." + ) from exc + return bpd + + +@engine_model( + env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": "bigframes"}, + name="demo_py_emit", + deps=["dim_customers.ff"], + tags=["example:materializations_demo", "scope:python", "engine:bigquery"], +) +def fetch(_: BFDataFrame) -> BFDataFrame: + bpd = _ensure_bigframes() + return bpd.DataFrame( + [{"note": "hello from python (BigQuery BigFrames)", "emitted_at": pd.Timestamp.utcnow()}] + ) diff --git a/examples/materializations_demo/models/engines/bigquery/pandas/demo_py_emit.ff.py b/examples/materializations_demo/models/engines/bigquery/pandas/demo_py_emit.ff.py new file mode 100644 index 0000000..fedce08 --- /dev/null +++ b/examples/materializations_demo/models/engines/bigquery/pandas/demo_py_emit.ff.py @@ -0,0 +1,14 @@ +from fastflowtransform import engine_model +import pandas as pd + + +@engine_model( + env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": "pandas"}, + name="demo_py_emit", + deps=["dim_customers.ff"], + tags=["example:materializations_demo", "scope:python", "engine:bigquery"], +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame( 
+ [{"note": "hello from python (BigQuery pandas)", "emitted_at": pd.Timestamp.utcnow()}] + ) diff --git a/examples/materializations_demo/models/engines/snowflake_snowpark/demo_py_emit.ff.py b/examples/materializations_demo/models/engines/snowflake_snowpark/demo_py_emit.ff.py new file mode 100644 index 0000000..898d8d6 --- /dev/null +++ b/examples/materializations_demo/models/engines/snowflake_snowpark/demo_py_emit.ff.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from fastflowtransform import engine_model + +if TYPE_CHECKING: # pragma: no cover - typing only + from snowflake.snowpark import DataFrame as SnowparkDataFrame +else: # pragma: no cover - runtime fallback + SnowparkDataFrame = Any + + +def _ensure_session(df: Any): + session = getattr(df, "session", None) + if session is None: + raise RuntimeError( + "Snowpark session missing on upstream DataFrame. " + "Ensure Snowflake Snowpark is the active engine." + ) + return session + + +def _snowflake_functions(): + try: + from snowflake.snowpark import functions as F # type: ignore + except Exception as exc: # pragma: no cover - optional dep guard + raise RuntimeError( + "snowflake-snowpark-python is required for this model. " + "Install fastflowtransform[snowflake]." 
+ ) from exc + return F + + +@engine_model( + only="snowflake_snowpark", + name="demo_py_emit", + deps=["dim_customers.ff"], + tags=["example:materializations_demo", "scope:python", "engine:snowflake_snowpark"], +) +def fetch(dim_customers: SnowparkDataFrame) -> SnowparkDataFrame: + session = _ensure_session(dim_customers) + F = _snowflake_functions() + df = session.create_dataframe([("hello from python (Snowflake)",)], schema=["note"]) + return df.withColumn("emitted_at", F.current_timestamp()) diff --git a/examples/materializations_demo/models/macros/types.sql.j2 b/examples/materializations_demo/models/macros/types.sql.j2 index ca29165..cec8efe 100644 --- a/examples/materializations_demo/models/macros/types.sql.j2 +++ b/examples/materializations_demo/models/macros/types.sql.j2 @@ -4,6 +4,8 @@ {% set eng = (var('engine') or env('FF_ENGINE') or '').lower() %} {% if eng in ('postgres', 'postgresql') %} double precision + {% elif eng == 'bigquery' %} + float64 {% else %} double {% endif %} diff --git a/examples/materializations_demo/profiles.yml b/examples/materializations_demo/profiles.yml index f554af7..5d4e79f 100644 --- a/examples/materializations_demo/profiles.yml +++ b/examples/materializations_demo/profiles.yml @@ -23,3 +23,33 @@ dev_databricks: spark.hadoop.datanucleus.schema.autoCreateAll: "true" spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'materializations_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + allow_create_dataset: true + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'materializations_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: 
false + allow_create_dataset: true + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'MATERIALIZATIONS_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/exports/Combined.md b/exports/Combined.md deleted file mode 100644 index f68b473..0000000 --- a/exports/Combined.md +++ /dev/null @@ -1,5435 +0,0 @@ -# Combined Documentation - - - - - -# FastFlowTransform Documentation Hub - -Welcome! This page is your starting point for FastFlowTransform docs. Pick the track that matches what you want to do and follow the links to the detailed guides. - ---- - -## Docs Navigation -- **Getting Started** — you are here (`docs/index.md`) -- [User Guide](./Technical_Overview.md#part-i-operational-guide) -- [Modeling Reference](./Config_and_Macros.md) -- [Parallelism & Cache](./Cache_and_Parallelism.md) -- [CLI Guide](./CLI_Guide.md) -- [Logging & Verbosity](./Logging.md) -- [API calls in Python models](./Api_Models.md) -- [Incremental Models](./Incremental.md) -- [YAML Tests (Schema-bound)](./YAML_Tests.md) -- [Model Unit Tests](./Unit_Tests.md) -- [Data Quality Tests Reference](./Data_Quality_Tests.md) -- [Auto-Docs & Lineage](./Auto_Docs.md) -- [Troubleshooting & Error Codes](./Troubleshooting.md) -- [Profiles & Environments](./Profiles.md) -- [Sources Declaration](./Sources.md) -- [Project Configuration](./Project_Config.md) -- [State Selection (changed & results)](./State_Selection.md) -- [Basic Demo](./examples/Basic_Demo.md) -- [Materializations Demo](./examples/Materializations_Demo.md) -- [Data Quality Tests Demo](./examples/DQ_Demo.md) -- [Macros Demo](./examples/Macros_Demo.md) -- [Cache Demo](./examples/Cache_Demo.md) -- [Environment Matrix 
Demo](./examples/Environment_Matrix.md) -- [Incremental & Delta Demo](examples/Incremental_Demo.md) -- [Local Engine Setup](./examples/Local_Engine_Setup.md) -- [API Demo](./examples/API_Demo.md) -- [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) - -## Table of Contents - -- [Docs Navigation](#docs-navigation) -- [Choose Your Path](#choose-your-path) -- [Reference Map](#reference-map) -- [Need Help?](#need-help) - ---- - -## Choose Your Path - -### 1. Build & Operate Projects (Data Practitioners) - -- **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. -- **Need local runtimes?** The [API demo local engine setup](examples/Local_Engine_Setup.md) walks through DuckDB, Postgres, and Databricks Spark. -- **Understand the project layout & CLI workflow:** start with *Project Layout* in the [Technical Overview](Technical_Overview.md#project-layout) and pair it with the [CLI Guide](CLI_Guide.md) for command patterns. -- **Configure runtimes & profiles:** review executor profiles and environment overrides in the dedicated [Profiles guide](Profiles.md) plus [Logging & Verbosity](Logging.md) for observability flags. -- **Model data quality & troubleshoot runs:** combine the [Model Unit Tests guide](Unit_Tests.md) with [Troubleshooting & Error Codes](Troubleshooting.md) to keep runs deterministic and easy to debug. -- **Explore runnable demos:** start with the [Basic Demo Overview](examples/Basic_Demo.md) or browse the `examples/` directory; each subproject ships with its own README. - -### 2. Extend FastFlowTransform (Developers & Contributors) - -- **Dive into architecture & core modules:** start with [Architecture Overview](Technical_Overview.md#architecture-overview) and [Core Modules](Technical_Overview.md#core-modules) for registry, DAG, executors, validation, and more. 
-- **Add tests & seeds:** reuse the curated demos under `docs/examples/` for seeds/Makefiles and follow the [Model Unit Tests guide](Unit_Tests.md) for deterministic fixtures. -- **Contribute code:** follow the workflow described in [`./Contributing.md`](./Contributing.md) and consult the module-level docs for internal APIs. -- **Plan ahead:** check the roadmap snapshot in the [Technical Overview](Technical_Overview.md#roadmap-snapshot) to understand upcoming work. - ---- - -## Reference Map - -- **Modeling reference** — Jinja configuration, macros, helper functions: [`Config_and_Macros.md`](Config_and_Macros.md) -- **CLI entry point & commands** — `src/fastflowtransform/cli.py` -- **Registry & node loading** — `src/fastflowtransform/core.py` -- **Unit test runner** — `src/fastflowtransform/utest.py` -- **Rendered DAG templates** — `src/fastflowtransform/docs/templates/` - ---- - -## Need Help? - -- Open an issue or PR — see [`./Contributing.md`](./Contributing.md) for guidelines. -- Join the discussion (planning doc / roadmap highlights) — see the roadmap section in the [Technical Overview](Technical_Overview.md#roadmap-snapshot). -- If you spot gaps in the docs, file an issue with the context and links to the relevant section. - - - - - -# Quickstart - -This guide walks you through creating a minimal FastFlowTransform project from scratch and running it end-to-end. - -## 0. Create a skeleton (optional) - -Start with a minimal project structure: - -```bash -fft init demo_project --engine duckdb -``` - -The command is non-interactive, refuses to overwrite existing directories, and leaves inline comments that point back to the relevant docs (`Project_Config.md`, `Profiles.md`, etc.). Populate the generated files before running the steps below. - -## 1. Install & bootstrap - -```bash -python3 -m venv .venv -. .venv/bin/activate # or source .venv/bin/activate -pip install --upgrade pip -pip install -e . 
# run from the repo root; use `uv pip install --editable .` if you prefer uv -fft --help -``` - -## 2. Create project layout - -```bash -mkdir -p demo/{models,seeds} -cat <<'YAML' > demo/sources.yml -version: 2 - -sources: - - name: raw - schema: staging - tables: - - name: users - identifier: seed_users -YAML - -cat <<'CSV' > demo/seeds/seed_users.csv -id,email -1,a@example.com -2,b@example.com -CSV - -cat <<'SQL' > demo/models/users.ff.sql -{{ config(materialized='table') }} -select id, email -from {{ source('raw', 'users') }} -SQL - -cat <<'YAML' > demo/profiles.yml -dev: - engine: duckdb - duckdb: - path: ".local/demo.duckdb" -YAML -``` - -## 3. Seed static inputs - -```bash -fft seed demo --env dev -``` - -This materializes the CSV into the configured engine (DuckDB by default) using `seed_users` as the physical table. - -## 4. Run the pipeline - -```bash -fft run demo --env dev --cache off -``` - -You should see log lines similar to `✓ L01 [DUCK] users.ff`. The resulting table lives in the target schema (`staging` in this example). - -## 5. Inspect artifacts - -- `.fastflowtransform/target/manifest.json` → model graph + sources -- `.fastflowtransform/target/run_results.json` → run outcomes and durations - -## 6. Add more models (optional) - -- Reference other models with `{{ ref('model_name') }}` -- Configure tags or materializations via `{{ config(...) }}` at the top of each SQL file - -## 7. Next steps - -- Add `project.yml` for reusable `vars:` and metadata -- Explore `fft docs` to generate HTML documentation -- Use engine profiles under `profiles.yml` to target Postgres, BigQuery, or Databricks (path-based sources supported via `format` + `location` overrides) -- Render the DAG site for this project: `fft dag demo --env dev --html` (find it under `demo/site/dag/index.html`) - -Refer to `docs/Config_and_Macros.md` for advanced configuration options. 
- - - - - -# 🧭 FastFlowTransform – Technical Developer Documentation - -> Status: latest updates from your context dump. This document consolidates project structure, architecture, core APIs, error handling, CLI, examples, and roadmap into a print/git-friendly Markdown. -> -> Looking for an overview? Start at the [`docs/index.md`](./index.md) hub, then dive back here when you need details. -> -> Project: **FastFlowTransform** — SQL & Python Data Modeling (Batch + Streaming), DAG, CLI, Auto-Docs, DQ Tests. - ---- - -## Docs Navigation -1. [Getting Started](./index.md) -2. **User Guide** — see [Part I – Operational Guide](#part-i-operational-guide) (this document) -3. [Modeling Reference](./Config_and_Macros.md) -4. **Developer Guide** — see [Part II – Architecture & Internals](#part-ii-architecture-internals) (this document) - ---- - -## Table of Contents - -- [Docs Navigation](#docs-navigation) -- [Part I – Operational Guide](#part-i-operational-guide) - - [Project Layout](#project-layout) - - [Example Projects and Seeds](#example-projects-and-seeds) - - [CLI Flows](#cli-flows) - - [Logging & Verbosity](#logging-verbosity) - - [Model Unit Tests (`fft utest`)](#model-unit-tests-fft-utest) - - [Troubleshooting](#troubleshooting) - - [Profiles & Environment Overrides](#profiles-environment-overrides) - - [Parallel Execution and Cache](#parallel-execution-and-cache) - - [Roadmap Snapshot](#roadmap-snapshot) - - [Cross-Table Reconciliations](#cross-table-reconciliations) - - [Auto-Docs and Lineage](#auto-docs-and-lineage) -- [Part II – Architecture & Internals](#part-ii-architecture-internals) - - [Architecture Overview](#architecture-overview) - - [Core Modules](#core-modules) - - [`core.py`](#corepy) - - [`dag.py`](#dagpy) - - [`errors.py`](#errorspy) - - [Executors](#executors) - - [`validation.py`](#validationpy) - - [`testing.py`](#testingpy) - - [`docs.py` & Templates](#docspy-templates) - - [`seeding.py`](#seedingpy) - - [CLI Implementation](#cli-implementation) 
- - [Settings Infrastructure](#settings-infrastructure) - - [Streaming Components](#streaming-components) - - [Mini End-to-End Example (Python API)](#mini-end-to-end-example-python-api) - ---- - -## Part I – Operational Guide - -### Project Layout - -For an up-to-date view, browse the repository tree or run `find . -maxdepth 2` from the root; all examples live under `examples/` with their own READMEs. - -### Example Projects and Seeds - -Need runnable references? Start with the curated demos under `docs/examples/`: - -- [Basic Demo](./examples/Basic_Demo.md) shows the minimum viable project (seeds, staging, marts) plus Makefile targets you can copy. -- [API Demo](./examples/API_Demo.md) focuses on HTTP-powered Python models. -- [Environment Matrix](./examples/Environment_Matrix.md) demonstrates multiple profiles talking to different engines. - -Each demo includes deterministic seeds (`seeds/*.csv`), schema YAML, and Makefile shortcuts, so the detailed CSV listings and commands here would be redundant. Follow the demo docs (or the [Quickstart](./Quickstart.md)) for the full walkthrough. - -### CLI Flows - -Looking for command recipes, selection filters, or sync workflows? See the dedicated [CLI Guide](./CLI_Guide.md) for a task-by-task breakdown (seed/run/dag/docgen/test/utest/sync-db-comments) plus links to API-model helpers. - -### Logging & Verbosity - -Need the exact behaviour of `-q/-v/-vv`, SQL debug output, or the parallel log queue? Head over to [Logging.md](./Logging.md) for the full matrix plus usage snippets. - -### Model Unit Tests (`fft utest`) - -The full how-to (cache modes, YAML DSL, CI snippets) moved to [Unit_Tests.md](./Unit_Tests.md). Keep this Section in mind whenever you need fast feedback on SQL/Python models without executing the entire DAG. - -### Troubleshooting - -Common fixes (engines, docs generation, tests) plus the exit-code matrix live in [Troubleshooting.md](./Troubleshooting.md). 
Skim that doc whenever you hit connectivity issues or need to decode return codes. - -### Profiles & Environment Overrides -Need to understand profile precedence, `.env` layering, or the Pydantic models that back settings? Jump to the [Profiles guide](./Profiles.md) which covers file layout, environment helpers, validation, and selection precedence in depth. - -### Parallel Execution and Cache - -Level-wise parallelism, cache modes, fingerprint formula, and the `_ff_meta` audit table are documented in [Cache_and_Parallelism.md](./Cache_and_Parallelism.md). Use that reference for CLI examples (`--jobs`, `--cache`, `--rebuild`), skip conditions, and troubleshooting tips related to concurrency. - -### Cross-Table Reconciliations - -Reconciliation tests (`reconcile_equal`, `reconcile_ratio_within`, `reconcile_diff_within`, `reconcile_coverage`) are fully documented in the [Data Quality Test Reference](./Data_Quality_Tests.md#cross-table-reconciliations). Use that guide for YAML schemas, tolerance parameters, and engine notes before wiring the checks into `fft test`. - -### Auto-Docs and Lineage - -Rendering the DAG site, feeding project descriptions/lineage, and exporting JSON manifests are covered in [Auto_Docs.md](./Auto_Docs.md). Head there for command flags, markdown/YAML resolution, and lineage overrides. 
- -## Part II – Architecture & Internals - -### Architecture Overview - -``` -CLI (Typer) -│ -├── Registry (core.py) -│ ├── Discover models (*.ff.sql / *.ff.py) -│ ├── Load Python models (decorator) -│ ├── Parse/validate dependencies -│ └── Jinja environment + sources.yml -│ -├── DAG (dag.py) -│ ├── topo_sort (Kahn, deterministic) -│ └── mermaid() (styled + stable IDs) -│ -├── Executors (executors/*) -│ ├── BaseExecutor (SQL rendering, dependency loading, materialization, requires guard) -│ ├── DuckExecutor (DuckDB) -│ ├── PostgresExecutor (SQLAlchemy, shims) -│ ├── BigQueryExecutor (pandas) -│ ├── BigQueryBFExecutor (BigQuery DataFrames / bigframes) -│ ├── DatabricksSparkExecutor (PySpark, without pandas) -│ └── SnowflakeSnowparkExecutor (Snowpark, without pandas) -│ -├── Testing (testing.py) -│ ├── generic _exec / _scalar -│ └── Checks: not_null, unique, row_count_between, greater_equal, non_negative_sum, freshness -│ -├── Seeding (seeding.py) -│ └── Load seeds (CSV/Parquet/SQL) → engine agnostic -│ -├── Docs (docs.py + templates/) -│ ├── Mermaid + overview table (index.html) -│ └── Model detail pages (model.html) -│ -├── Settings/Profiles (settings.py) -│ └── Pydantic v2 discriminated union + ENV overrides -│ -└── Streaming (streaming/*) - ├── FileTailSource - └── StreamSessionizer -``` - ---- - -### Core Modules - -#### `core.py` - -Key data structures and the project loading process. - -```python -@dataclass -class Node: - name: str # logical name (stem or @model(name=...)) - kind: str # "sql" | "python" - path: Path - deps: List[str] = field(default_factory=list) - -class Registry: - def load_project(self, project_dir: Path) -> None: ... - def _register_node(self, node: Node) -> None: ... - def _load_py_module(self, path: Path) -> types.ModuleType: ... - def _scan_sql_deps(self, path: Path) -> List[str]: ... -``` - -**Helpers & decorator:** - -```python -def relation_for(node_name: str) -> str: ... -def ref(name: str) -> str: ... 
-def source(source_name: str, table_name: str) -> str: ... - -def model(name=None, deps=None, requires=None) -> Callable[[Callable[..., Any]], Callable[..., Any]]: ... -``` - -**Python models (example):** - -```python -@model(name="users_enriched", deps=["users.ff"], requires={"users": {"id","email"}}) -def enrich(df: pd.DataFrame) -> pd.DataFrame: ... -``` - ---- - -#### `dag.py` - -Deterministic topological sort plus Mermaid export. - -```python -def topo_sort(nodes: Dict[str, Node]) -> List[str]: ... -def mermaid(nodes: Dict[str, Node]) -> str: ... -``` - ---- - -#### `errors.py` - -Primary error types with helpful messages. - -```python -class FastFlowTransformError(Exception): ... -class ModuleLoadError(FastFlowTransformError): ... -class DependencyNotFoundError(FastFlowTransformError): ... -class ModelCycleError(FastFlowTransformError): ... -class TestFailureError(FastFlowTransformError): ... -``` - ---- - -#### Executors - -Shared logic (`BaseExecutor`) plus engine implementations. - -```python -class BaseExecutor(ABC): - def render_sql(self, node: Node, env: Environment, ref_resolver=None, source_resolver=None) -> str: ... - def run_python(self, node: Node) -> None: ... - @abstractmethod - def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: ... - @abstractmethod - def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None: ... -``` - -**DuckDB (`duckdb.py`)** - -- `run_sql(node, env)` renders Jinja (`ref/source`) and executes the SQL. -- `_read_relation` loads a table as `DataFrame`; surfaces actionable errors when a dependency is missing. -- `_materialize_relation` writes the `DataFrame` as a table (`create or replace table ...`). - -**Postgres (`postgres.py`)** - -- `_SAConnShim` (compatible with `testing._exec`). -- `run_sql` renders SQL and rewrites `CREATE OR REPLACE TABLE` to `DROP + CREATE AS`. -- `_read_relation` uses pandas, handles schemas, and provides clear guidance. 
-- `_materialize_relation` writes via `to_sql(if_exists="replace")`. - -**BigQuery / BigQuery DataFrames / Spark / Snowpark** - -- Identical signatures; IO uses the respective native dataframes (no pandas for Spark/Snowpark). - ---- - -#### `validation.py` - -Required-column checks for Python models (single and multi dependency). - -```python -class RequiredColumnsError(ValueError): ... -def validate_required_columns(node_name: str, inputs: Any, requires: dict[str, set[str]]): ... -``` - ---- - -#### `testing.py` - -Minimal data quality framework (engine agnostic via `_exec`). - -**Checks:** `not_null`, `unique`, `greater_equal`, `non_negative_sum`, `row_count_between`, `freshness` - -```python -class TestFailure(Exception): ... -def _exec(con: Any, sql: Any): ... -def _scalar(con: Any, sql: Any): ... -``` - ---- - -#### `docs.py` & Templates - -- `render_site(out_dir, nodes)` produces `index.html` plus `model.html` per model. -- Templates (`docs/templates/`) include dark mode, filters, copy buttons, legend. -- Uses `dag.mermaid(nodes)` for the graph. - ---- - -#### `seeding.py` - -Engine-agnostic seed loading (CSV/Parquet/SQL). - -```python -def seed_project(project_dir: Path, executor, schema: Optional[str] = None) -> int: ... -``` - ---- - -### CLI Implementation - -Operational usage lives in [CLI Flows](#cli-flows) and the dedicated [CLI Guide](CLI_Guide.md). For implementation details, see the Typer commands in `src/fastflowtransform/cli/`. - ---- - -### Settings Infrastructure - -`settings.py` uses a **Pydantic v2 discriminated union** (`engine` as discriminator) plus ENV overrides. 
- -Profile types: -- `DuckDBProfile(engine="duckdb", duckdb: {path})` -- `PostgresProfile(engine="postgres", postgres: {dsn, db_schema})` -- `BigQueryProfile(engine="bigquery", bigquery: {project?, dataset, location?, use_bigframes?})` -- `DatabricksSparkProfile(engine="databricks_spark", ...)` -- `SnowflakeSnowparkProfile(engine="snowflake_snowpark", ...)` - -Resolver idea: - -```python -def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profile: ... -``` - ---- - -### Streaming Components - -**`streaming/sessionizer.py`** - -- Normalizes events (JSONL / batch DF) and writes `fct_sessions_streaming`. -- `process_batch(df)` aggregates sessions (start/end, pageviews, revenue). - -**Smoke test (DuckDB):** - -```python -def test_stream_sessionizer_produces_sessions(): ... -``` - ---- - -### Mini End-to-End Example (Python API) - -```python -from pathlib import Path -from jinja2 import Environment, FileSystemLoader -from fastflowtransform.core import REGISTRY -from fastflowtransform.dag import topo_sort -from fastflowtransform.executors.duckdb import DuckExecutor - -proj = Path("examples/simple_duckdb").resolve() -REGISTRY.load_project(proj) -env = REGISTRY.env # Jinja env from the registry load - -order = topo_sort(REGISTRY.nodes) -ex = DuckExecutor(db_path=str(proj / ".local" / "demo.duckdb")) - -for name in order: - node = REGISTRY.nodes[name] - if node.kind == "sql": - ex.run_sql(node, env) - else: - ex.run_python(node) - -print("✓ Done") -``` - ---- - -Need a different angle? Head back to the [Docs Hub](./index.md) or deep-dive into the [Modeling Reference](./Config_and_Macros.md). - - - - - -# API Calls in Python Models - -> **Status:** Experimental but stable for demos and smaller workflows. -> **Goal:** Query HTTP APIs from Python models, return responses as DataFrames, cache and instrument them cleanly, and support reproducible offline runs. 
- -* [Motivation](#motivation) -* [Quickstart](#quickstart) -* [Programming API](#programming-api) - * [`get_json`](#get_json) - * [`get_df`](#get_df) - * [Pagination](#pagination) - * [Context & Telemetry](#context-telemetry) -* [CLI Flags & Environment Variables](#cli-flags-environment-variables) -* [Example Model](#example-model) -* [Artifacts](#artifacts) -* [Tests & Offline Demos](#tests-offline-demos) -* [Best Practices](#best-practices) -* [Troubleshooting](#troubleshooting) -* [Security & Compliance](#security-compliance) -* [FAQ](#faq) - ---- - -## Motivation - -Many pipelines need small, reliable API fetchers: configuration tables, miniature dimensions, feature flags, SaaS exports. This feature provides: - -- Simple HTTP calls inside Python models -- File-backed cache (reproducible builds, works offline) -- Per-node telemetry (requests, hits, bytes, hashes) -- CLI switches `--offline` and `--http-cache` for reproducible runs - ---- - -## Quickstart - -1. **Optionally enable flags** (recommended): - - ```bash - # No network - cache hits only - fft run . --env dev --offline - # Cache mode - fft run . --env dev --http-cache rw # rw|ro|off - ``` - -2. **Write a Python model**: - - ```python - # models/users_from_api.ff.py - import pandas as pd - from fastflowtransform.core import model - from fastflowtransform.api.http import get_df - - @model(name="users_from_api", deps=["users.ff"]) - def fetch(_: pd.DataFrame) -> pd.DataFrame: - df = get_df( - url="https://api.example.com/users", - params={"page": 1}, - record_path=["data"], # JSON -> list -> DataFrame - ) - return df - ``` - -3. **Run it**: - - ```bash - fft run . 
--env dev --select users_from_api - ``` - ---- - -## Programming API - -> Module: `fastflowtransform.api.http` - -### `get_json` - -```python -from fastflowtransform.api.http import get_json - -data = get_json( - url="https://api.example.com/objects", - params={"page": 1}, # optional - headers={"Authorization": "Bearer ..."}, # optional - timeout=20, # optional (seconds) -) -# -> Python dict / list -``` - -**Behavior** - -- Reads from the local cache (when present and valid). -- Writes to the cache (`rw` mode), including the response body. -- Respects offline mode (no network traffic). - -### `get_df` - -```python -from fastflowtransform.api.http import get_df - -df = get_df( - url="https://api.example.com/users", - params={"page": 1}, - record_path=["data"], # path to the JSON list - normalize=True, # optional: flatten nested objects - paginator=None, # optional: pagination strategy (see below) - output="pandas", # pandas|spark (default=pandas) -) -# -> pandas.DataFrame -``` - -**Conversion** - -- Default: `record_path` points to the array payload (for example `["data"]`). -- `normalize=True` delegates to `json_normalize` for deeper structures. -- `output='spark'` (plus an optional `session=SparkSession`) converts the normalized result into a `pyspark.sql.DataFrame`. Additional backends will reuse the same parameter. - -### Pagination - -For paged APIs you can describe the next request declaratively: - -```python -def paginator(url: str, params: dict | None, json_obj: dict): - next_url = json_obj.get("next") # e.g. 
absolute URL - if next_url: - return {"next_request": {"url": next_url}} - return None - -df = get_df( - "https://api.example.com/users?page=1", - paginator=paginator, - record_path=["data"], -) -``` - -The paginator may return the following fields: - -- `{"next_request": {"url": "...", "params": {...}, "headers": {...}}}` - (any missing field keeps its previous value) - -### Context & Telemetry - -During a model run the executor collects telemetry per node and writes it into `run_results.json`: - -- `requests` (count) -- `cache_hits` -- `bytes` (sum of response bodies) -- `used_offline` (bool) -- `keys` (cache keys) -- `entries` (optional compact array with URL, status, content hash) - -You will find these metrics under the `http` block of each node (see [Artifacts](#artifacts)). - ---- - -## CLI Flags & Environment Variables - -**CLI** - -- `--offline` - Sets `FF_HTTP_OFFLINE=1`; network requests are blocked, **cache hits only**. -- `--http-cache {off|ro|rw}` - Sets `FF_HTTP_CACHE_MODE`: - - - `off`: neither read nor write. - - `ro`: read-only (hits), **no** writes. - - `rw`: read and write (default). 
- -**Environment (optional to set directly)** - -| Variable | Default | Effect | -| ------------------------ | ------------------------------- | ----------------------------------- | -| `FF_HTTP_OFFLINE` | `0` | `1/true/on` -> offline mode | -| `FF_HTTP_CACHE_MODE` | `rw` | `off` / `ro` / `rw` | -| `FF_HTTP_CACHE_DIR` | `.fastflowtransform/http_cache` | Cache directory | -| `FF_HTTP_TTL` | `0` | Seconds; 0 = never expires | -| `FF_HTTP_TIMEOUT` | `20` | Request timeout (seconds) | -| `FF_HTTP_MAX_RETRIES` | `3` | Basic retry count | -| `FF_HTTP_RATE_LIMIT_RPS` | `0` | Requests per second (0 = unlimited) | - ---- - -## Example Model - -```python -# models/dim_countries_from_api.ff.py -import pandas as pd -from fastflowtransform.core import model -from fastflowtransform.api.http import get_df - -@model(name="dim_countries_from_api", deps=["users.ff"]) -def countries(_: pd.DataFrame) -> pd.DataFrame: - def pager(u, p, js): - nxt = js.get("paging", {}).get("next") - return {"next_request": {"url": nxt}} if nxt else None - - df = get_df( - url="https://api.example.com/countries?page=1", - paginator=pager, - record_path=["data"], - normalize=True, - ) - # lightweight post-processing - if "code" in df.columns: - df["code"] = df["code"].str.upper() - return df -``` - -Run: - -```bash -fft run . 
--env dev --select dim_countries_from_api --http-cache ro -``` - ---- - -## Artifacts - -`/.fastflowtransform/target/run_results.json` (excerpt): - -```json -{ - "results": [ - { - "name": "dim_countries_from_api", - "status": "success", - "duration_ms": 153, - "http": { - "requests": 2, - "cache_hits": 2, - "bytes": 1842, - "used_offline": true, - "keys": ["GET:https://api.example.com/countries?page=1|{}|{}", "..."], - "entries": [ - {"url": "https://api.example.com/countries?page=1", "status": 200, "content_hash": "sha256:..."}, - {"url": "https://api.example.com/countries?page=2", "status": 200, "content_hash": "sha256:..."} - ] - } - } - ] -} -``` - -> Note: When a node is **skipped** (fingerprint cache hit), no new `http` block is emitted - the model did not run. - ---- - -## Tests & Offline Demos - -- Place unit tests under `tests/api/...` and seed the cache directly (no real HTTP calls). -- Suggested scenarios: - - - **Offline hit:** set `FF_HTTP_OFFLINE=1`, seed the cache, `get_json/get_df` must succeed. - - **Cache mode `off`:** even with cache entries, **no** reads; expect a failure in offline mode. - - **`ro`:** allow read hits; **no** cache writes after a real or mocked request. - - **Pagination:** stitch several pages from offline fixtures; telemetry should count requests/hits. - ---- - -## Best Practices - -- **Stable URLs and parameter order** produce identical cache keys and reproducible builds. -- **Keep `record_path` shallow**; use `normalize=True` only when necessary (performance). -- **Never cache secrets:** provide tokens via headers; the response body and metadata are cached. -- **Use `--offline` in CI** for deterministic tests with a pre-seeded cache. -- **Set TTL intentionally** when APIs change frequently. -- **Scope engine-specific variants** with `engine_model(only=...)` so each execution backend registers only the models it can run (pair with SQL `config(engines=[...])` when duplicating logical names). 
- ---- - -## Troubleshooting - -- **“offline + cache miss”** - Seed the cache (see tests) or disable offline mode. -- **“Schema mismatch”** - Harmonize columns after `get_df` (types, missing keys). -- **“Too many requests”** - Configure `FF_HTTP_RATE_LIMIT_RPS`; make pagination more efficient (larger `page_size`). -- **“No http block”** - Was the node **skipped** (fingerprint cache)? Or did the model avoid HTTP calls altogether? - ---- - -## Security & Compliance - -- **Do not commit secrets** - use environment variables or a secret manager. -- **PII/GDPR:** verify whether the API returns personal data; minimise retention. -- **Cache directory:** keep it in `.gitignore`; encrypt or isolate it if necessary. - ---- - -## FAQ - -**Q:** Can I call other libraries (for example `requests`, `httpx`) directly? -**A:** Yes, but you lose telemetry and caching. The recommended entrypoint is `fastflowtransform.api.http`. - -**Q:** How do I add custom headers (for example OAuth)? -**A:** Pass `headers={...}`. Store sensitive values in env vars and inject them into your models. - -**Q:** Does this work for POST requests? -**A:** Release R1 focuses on GET. Please open an issue for POST/PUT support; the design can be extended. - ---- - -**See also:** - -- Technical guide: *Developer Guide – Architecture & Internals* -- Unit tests: `tests/api/test_http_*.py` -- Runtime & cache: *Parallelism & Cache* - - - - - -# FastFlowTransform Modeling Reference - -> Authoritative reference for FastFlowTransform’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. -> Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. -> **Execution & Cache quick notes** -> - Parallelism is level-wise; use `fft run --jobs N`. -> - Use `--cache={off|ro|rw|wo}` to control skipping behavior. 
-> - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. -> - Change any of these → downstream nodes rebuild. -> - `--rebuild ` forces rebuilding selected models (ignores cache). - - -For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [Technical Overview](./Technical_Overview.md). This document focuses purely on how you author and test models. - ---- - -## Docs Navigation -1. [Getting Started](./index.md) -2. [User Guide](./Technical_Overview.md#part-i-operational-guide) -3. **Modeling Reference** — you are here (`Config_and_Macros.md`) -4. [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) - ---- - -## Table of Contents - -- [Docs Navigation](#docs-navigation) -- [1. Model Fundamentals](#1-model-fundamentals) - - [1.1 SQL models (`*.ff.sql`)](#11-sql-models-ffsql) - - [1.2 Python models (`*.ff.py`)](#12-python-models-ffpy) - - [1.3 Seeds, sources, and dependencies](#13-seeds-sources-and-dependencies) -- [2. `config()` options](#2-config-options) -- [3. Variables with `var()`](#3-variables-with-var) -- [4. Template context & helpers](#4-template-context-helpers) -- [5. Macros & reusable Jinja code](#5-macros-reusable-jinja-code) -- [6. Materialization semantics](#6-materialization-semantics) -- [7. Testing & quality gates](#7-testing-quality-gates) -- [8. Quick cheat sheet](#8-quick-cheat-sheet) - ---- - -## 1. Model Fundamentals - -FastFlowTransform discovers models under `/models/` with two primary flavours: - -### 1.1 SQL models (`*.ff.sql`) - -- File stem defines the logical DAG node (`users.ff.sql` → `users.ff`). -- Jinja template rendered with FastFlowTransform context (helpers like `ref`, `source`, `var`, `config`, `this`). -- Output relation defaults to the stem without `.ff` (configurable via `config(alias=...)` if supported in future releases). 
- -```sql --- models/users.ff.sql -{{ config(materialized='table', tags=['staging']) }} -create or replace table users as -select id, email -from {{ source('crm', 'users') }}; -``` - -### 1.2 Python models (`*.ff.py`) - -Use the `@model` decorator from `fastflowtransform.core` to register a callable. The decorator accepts: - -- `name` (optional) → overrides the logical name (defaults to stem). -- `deps` → list of dependency nodes (file stems or logical names). -- `requires` → column contract per dependency (validated via `validation.validate_required_columns`). -- `materialized` (optional) → `'table' | 'view' | 'ephemeral'`; mirrors `config(materialized=...)` for SQL. -- `tags` (optional) → convenience for attaching selection labels without writing `meta={"tags": ...}`. - -Dependencies determine the call signature: - -- Single dependency → function receives a single `pandas.DataFrame`. -- Multiple dependencies → function receives `dict[str, pandas.DataFrame]` keyed by physical relation name (e.g. `"users"`). - -```python -# models/users_enriched.ff.py -from fastflowtransform.core import model -import pandas as pd - -@model( - name="users_enriched", - deps=["users.ff"], - requires={"users": {"id", "email"}}, - materialized="view", -) -def enrich(df: pd.DataFrame) -> pd.DataFrame: - out = df.copy() - out["is_gmail"] = out["email"].str.endswith("@gmail.com") - return out -``` - -#### Engine-scoped registration - -When the same project supports multiple execution backends, use `engine_model` to register a Python model only for specific engines. The decorator wraps `@model` but bails out early if the active engine (from `FF_ENGINE` or the selected profile) is not allowed. - -```python -from fastflowtransform import engine_model -import pandas as pd - -@engine_model( - only=("duckdb", "postgres"), - name="api_users_requests", - deps=["users.ff"], - tags=["example:api_demo", "scope:engine"], -) -def fetch(_: pd.DataFrame) -> pd.DataFrame: - ... 
-``` - -Allowed values are case-insensitive strings or tuples. If the engine does not match, the function is left undecorated and no node is created, preventing duplicate registrations across engine-specific folders. - -### 1.3 Seeds, sources, and dependencies - -- Declare external tables in `sources.yml`; they become available via `source('group','table')`. -- Provide reproducible inputs with CSV/Parquet seeds in `/seeds/`. -- FastFlowTransform auto-detects dependencies: - - SQL models → parse `ref()` / `source()` calls. - - Python models → use the decorator’s `deps`. - - Additional runtime dependencies can be expressed via `relation_for()`. - -> **Warning:** SQL dependency detection is static. Only literal calls such as `ref('users.ff')` are registered. When you need to gate a dependency behind a variable, materialise the options in a mapping (`{'foo': ref('foo'), 'bar': ref('bar')}`) and pick from that map at runtime; a bare `ref(variable)` will not show up in the DAG. - -- Persistence (e.g. Spark/Databricks): configure default targets under `project.yml → models.storage` (and optionally `seeds.storage`). Example: - - ```yaml - models: - storage: - api_users_http: - path: ".local/spark/api_users_http" - format: delta - options: - mergeSchema: true - - seeds: - storage: - users: - path: ".local/spark/seeds/users" - ``` - - Entries end up in `node.meta["storage"]` (keys: `path`, `format`, `options`) and are respected by the matching executor. - -```yaml -# sources.yml -version: 2 - -sources: - - name: crm - tables: - - name: users - identifier: seed_users - - name: erp - tables: - - name: orders - identifier: seed_orders -``` - -Each source can declare defaults such as `schema`, `database`, or `catalog`. 
Tables may -override those defaults, add per-engine overrides, or point at files: - -```yaml - - name: raw - schema: staging - tables: - - name: seed_users - identifier: seed_users - overrides: - postgres: - schema: raw - databricks_spark: - format: delta - location: "/mnt/delta/raw/seed_users" -``` - ---- - -## 2. `config()` options - -Call `config()` at the top of SQL models. Python models get the same options via the `@model(..., materialized=..., tags=...)` decorator kwargs. - -```sql -{{ config( - materialized='view', - tags=['mart', 'daily'] -) }} -``` - -Supported keys: - -| Key | Type | Description | -|----------------|-----------------|------------------------------------------------------------------------------| -| `materialized` | `"table" \| "view" \| "ephemeral"` | Controls how FastFlowTransform persists the model. See [Materialization semantics](#6-materialization-semantics). | -| `tags` | `list[str]` | Arbitrary labels surfaced in docs / selection tooling. | -| `engines` | `list[str]` or `str` | Restrict registration to the listed engines (case-insensitive). Requires the active engine to be known (profile selection or `FF_ENGINE`). | -| (future) | – | Additional metadata is stored under `node.meta[...]` if added later. | - -**Tips** - -- Place `config()` before any SQL text. -- Use tags to power custom filters in docs or to drive test selection. -- Combine `engines=[...]` with per-engine subfolders to keep one physical file per backend without name clashes. When no engine is active, FastFlowTransform raises a clear error to avoid silent skips. -- Ephemeral models inline into downstream SQL; pick `view` for shareable logic without materializing a table. - ---- - -## 3. Variables with `var()` - -Project-level variables live under `project.yml → vars:` and can be overridden from the CLI: - -```yaml -# project.yml -vars: - snapshot_day: "2000-01-01" - limit: 100 -``` - -```bash -fft run . 
--vars snapshot_day='2025-10-01' limit=50 -``` - -Usage in templates: - -```sql -select * -from {{ source('crm','users') }} -where signup_date <= '{{ var("snapshot_day", "1970-01-01") }}' -limit {{ var("limit", 1000) }} -``` - -Resolution order: CLI overrides → project vars → default argument. - ---- - -## 4. Template context & helpers - -Every model (SQL & Python) gets a rich Jinja context. Key helpers: - -| Helper | Purpose | -|--------------------|------------------------------------------------------------------------------------------| -| `this` | Object exposing `name`, `relation`, `materialized`, `schema`, `database`. | -| `ref("model")` | Resolves another model’s physical relation (or inlines ephemeral SQL). | -| `source("group","table")` | Resolves entries defined in `sources.yml`. | -| `relation_for(node)` (Python utility) | Maps logical node names to physical relations (helpful inside UDFs/tests). | -| `var("key", default)` | Retrieves project/CLI variables (see above). | - -Example: - -```sql -{{ config(materialized='view') }} -select - u.id, - u.email, - {{ var("country_column", "'US'") }} as country_code -from {{ ref('users.ff') }} as u --- rendered relation for logging/debugging --- {{ this.relation }} -``` - ---- - -## 5. Macros & reusable Jinja code - -Organise shared SQL snippets in `models/macros/` (all `.sql` files are auto-loaded): - -``` -models/ - macros/ - string_utils.sql - marts/ - users.ff.sql -``` - -```jinja -{# models/macros/string_utils.sql #} -{% macro safe_lower(col) -%} -lower(trim({{ col }})) -{%- endmacro %} -``` - -Use the macro anywhere within the project: - -```sql -select {{ safe_lower("email") }} as email_lower -from {{ ref('users.ff') }}; -``` - -**Best practices** - -- Keep macros idempotent and side-effect free. -- Group related macros per file (e.g., string utilities, date helpers). -- Document macros with inline comments; FastFlowTransform’s generated docs list each macro with its path. - ---- - -## 6. 
Materialization semantics - -### SQL models - -| Materialization | Behaviour | -|-----------------|-----------| -| `table` | `CREATE OR REPLACE TABLE … AS <select>` | -| `view` | `CREATE OR REPLACE VIEW … AS <select>` | -| `ephemeral` | No object is created; downstream `ref()` expands to a subquery. | - -**Postgres-specific:** FastFlowTransform rewrites the “create or replace” pattern into `DROP TABLE IF EXISTS …; CREATE TABLE … AS …` for compatibility. - -### Python models - -- Default → materialized as `table`. -- `materialized='view'` produces an engine-specific temporary table first, then creates/overwrites a view that selects from it. -- Ephemeral Python models are not supported. - ---- - -## 7. Testing & quality gates - -### 7.1 Column contracts (`requires`) - -Use the decorator’s `requires` argument (Python models) to ensure upstream inputs carry expected columns. Under the hood FastFlowTransform calls `validation.validate_required_columns`, raising `RequiredColumnsError` with a descriptive diff. - -```python -@model( - deps=["orders.ff", "users_enriched"], - requires={ - "orders": {"order_id", "user_id", "amount"}, - "users_enriched": {"id", "email", "is_gmail"} - } -) -def join_orders(inputs: dict[str, pd.DataFrame]) -> pd.DataFrame: - ... -``` - -### 7.2 Data quality tests (`project.yml`) - -Declare checks under `project.yml → tests:`. Each entry maps directly to a function in `fastflowtransform.testing` (`not_null`, `unique`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`). Run them via `fft test …`. - -```yaml -tests: - - type: not_null - table: users - column: email - tags: [batch] -``` - -### 7.3 Model unit tests (`fft utest`) - -Keep transformation logic honest with small, engine-agnostic specs: - -- Place YAML files under `<project>/tests/unit/`. -- Express inputs via inline rows or CSV paths. -- Declare expected output rows plus comparison options (`order_by`, `any_order`, `ignore_columns`, `approx`).
- -```yaml -# tests/unit/users_enriched.yml -model: users_enriched -defaults: - inputs: - users: - rows: - - {id: 1, email: "a@example.com"} - - {id: 2, email: "b@gmail.com"} - expect: - relation: users_enriched - order_by: [id] - -cases: - - name: flags_gmail - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} -``` - -Run with: - -```bash -fft utest . --env dev -fft utest . --model users_enriched --case flags_gmail -``` - -See the [Model Unit Tests guide](./Unit_Tests.md) for an exhaustive walkthrough (engine overrides, CI examples, troubleshooting). - ---- - -## 8. Quick cheat sheet - -| Task | Snippet / Pointer | -|------|-------------------| -| Set materialization | `{{ config(materialized='view') }}` | -| Add tags | `{{ config(tags=['mart','daily']) }}` | -| Read project variable | `{{ var('run_date', '1970-01-01') }}` | -| Current relation name | `{{ this.relation }}` | -| Reference another model | `{{ ref('users.ff') }}` | -| Reference source | `{{ source('crm','users') }}` | -| Macro definition | `models/macros/*.sql` | -| Guarantee columns (Python) | `@model(..., requires={'users': {'id','email'}})` | -| Data-quality test | `project.yml → tests` + `fft test …` | -| Unit test | `tests/unit/*.yml` + `fft utest …` | - ---- - -Return to the [Docs Hub](./index.md) or switch to the [User/Developer Guide](./Technical_Overview.md). - - - - - -# Parallelism & Cache - -**TL;DR:** FastFlowTransform executes models in parallel DAG levels and uses deterministic -fingerprints to skip unchanged nodes — while a separate HTTP cache accelerates API models. - -FastFlowTransform introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. 
- ---- - -## Table of Contents -- [Parallel Scheduler](#parallel-scheduler) -- [Cache Policy](#cache-policy) -- [Fingerprint Formula](#fingerprint-formula) -- [Meta Table Schema](#meta-table-schema) -- [CLI Recipes](#cli-recipes) -- [Troubleshooting & FAQ](#troubleshooting--faq) -- [Example: simple_duckdb](#example-simple_duckdb) -- [Appendix: Environment Inputs](#appendix-environment-inputs) - ---- - -## Parallel Scheduler - -FastFlowTransform splits the DAG into **levels** (all nodes that can run together without violating dependencies). Within a level, up to `--jobs` nodes execute in **parallel**. - -- Dependencies are **never** violated. -- `--keep-going`: tasks already started in a level finish; **subsequent levels won’t start** if any task in the current level fails. -- Logs are serialized through an internal queue to keep lines readable and per-node timing visible. - -**Quick start** -```bash -# Run with 4 workers per level -fft run . --env dev --jobs 4 - -# Keep tasks in the same level running even if one fails -fft run . --env dev --jobs 4 --keep-going -``` - ---- - -## Cache Policy - -The cache decides whether a node can be **skipped** when nothing relevant changed. Modes: - -``` ---cache=off # always build ---cache=rw # default; skip on match; write cache after build ---cache=ro # skip on match; on miss build but don't write cache ---cache=wo # always build and write cache ---rebuild # ignore cache for matching nodes ---no-cache # alias for --cache=off -``` - -### Skip condition - -A node is skipped iff: - -1. The current **fingerprint** matches the on-disk cache value, **and** -2. The **physical relation exists** on the target engine. - -If the relation was dropped externally, FastFlowTransform will **rebuild** even if the fingerprint matches. - -### HTTP Response Cache - -In addition to the build cache, FastFlowTransform provides an **HTTP response cache** for API models using -`fastflowtransform.api.http.get_df(...)`. 
- -- **Purpose:** Avoid redundant API calls and support offline mode. -- **Location:** Controlled by `FF_HTTP_CACHE_DIR` (e.g. `.local/http-cache`). -- **Controls (environment):** - - `FF_HTTP_ALLOWED_DOMAINS`: comma-separated list of hosts allowed to cache. - - `FF_HTTP_MAX_RPS`, `FF_HTTP_MAX_RETRIES`, `FF_HTTP_TIMEOUT`: rate limiting & retry policy. - - `FF_HTTP_OFFLINE=1`: run in offline mode — serve only from cache, no network calls. -- **CLI visibility:** Each run writes HTTP stats (`requests`, `cache_hits`, `bytes`, `used_offline`) - to `.fastflowtransform/target/run_results.json`. -- **Makefile helpers:** see `make api-show-http` in the API demo to inspect HTTP cache usage. - -> This cache is independent from the build cache; it stores API responses, not SQL or fingerprints. - ---- - -## Fingerprint Formula - -Fingerprints are stable hashes that change on any relevant input: - -* **SQL models**: `fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` - - * Uses **rendered** SQL (after Jinja), not the raw template. -* **Python models**: `fingerprint_py(node, func_src, env_ctx, dep_fps)` - - * Uses `inspect.getsource(func)` with a **file-content fallback** if needed. - -`env_ctx` includes: - -* `engine` (e.g., `duckdb`, `postgres`, `bigquery`) -* `profile_name` (CLI `--env`) -* Selected environment entries: **all `FF_*` keys** (key + value) -* A **normalized** portion of `sources.yml` (sorted keys/dump) - -`dep_fps` are upstream fingerprints; **any upstream change** invalidates downstream fingerprints. - -**Properties** - -* Same inputs ⇒ same hash. -* Minimal change in SQL/function ⇒ different hash. -* Dependency changes propagate downstream. - -> **Note:** The active engine and profile name are part of the fingerprint. -> Switching from `duckdb` to `postgres` automatically invalidates the cache, so cross-engine runs -> never reuse outdated fingerprints. 
- ---- - -## Meta Table Schema - -After a successful build, FastFlowTransform writes a per-node audit row: - -``` -_ff_meta ( - node_name TEXT/STRING, -- logical name, e.g. "users.ff" - relation TEXT/STRING, -- physical table/view, e.g. "users" - fingerprint TEXT/STRING, - engine TEXT/STRING, - built_at TIMESTAMP -) -``` - -Backends: - -* **DuckDB:** table `_ff_meta` in `main`. -* **Postgres:** table `_ff_meta` in the active schema. -* **BigQuery:** table `<dataset>._ff_meta`. - -> Note: Skip logic uses the file-backed fingerprint cache and a direct relation existence check; the meta table is for auditing and tooling. - ---- - -## CLI Recipes - -```bash -# First run — builds everything, writes cache and meta -fft run . --env dev --cache=rw - -# No-op run — should skip all nodes (if nothing changed) -fft run . --env dev --cache=rw - -# Force rebuild of a single model (ignores cache for it) -fft run . --env dev --cache=rw --rebuild marts_daily.ff - -# Read-only cache (skip on match, build on miss, no writes) -fft run . --env dev --cache=ro - -# Always build and write cache -fft run . --env dev --cache=wo - -# Disable cache entirely -fft run . --env dev --no-cache -``` - -With parallelism: - -```bash -fft run . --env dev --jobs 4 -fft run . --env dev --jobs 4 --keep-going -``` - ---- - -## Troubleshooting & FAQ - -**“Why did it skip?”** -A skip requires a fingerprint match and an existing relation. Fingerprints include: - -* rendered SQL / Python function source, -* `sources.yml` (normalized), -* engine/profile, -* **all `FF_*` environment variables**, -* upstream fingerprints. - -Any change in the above triggers a rebuild downstream. - -**“Relation missing but cache says skip?”** -We also check relation existence. If the table/view was dropped externally, FastFlowTransform will **rebuild**. - -**“My logs interleave under parallelism.”** -Logs are serialized via a queue; use `-v` / `-vv` for richer but still stable output.
Each node prints start/end and duration; levels summarize. - -**“Utest cache?”** -`fft utest --cache {off|ro|rw}` defaults to `off` for deterministic runs. With `rw`, expensive unit cases can be accelerated. Unit tests do not rely on the meta table by default. - ---- - -## Example: simple_duckdb - -The demo contains two independent staging nodes (`users.ff.sql`, `orders.ff.sql`). They run in **parallel** within the same level. - -Makefile targets: - -```makefile -run_parallel: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --jobs 4 - -cache_rw_first: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw - -cache_rw_second: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" fft run "$(PROJECT)" --env dev --cache=rw - -cache_invalidate_env: - FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" FF_DEMO_TOGGLE=1 fft run "$(PROJECT)" --env dev --cache=rw -``` - ---- - -## Appendix: Environment Inputs - -Only environment variables with the `FF_` prefix affect fingerprints (keys and values). If you change one (e.g., `FF_RUN_DATE`, `FF_REGION`), fingerprints change and downstream nodes rebuild. - -```bash -# Will invalidate fingerprints and rebuild affected nodes -FF_RUN_DATE=2025-01-01 fft run . --env dev --cache=rw -``` - -```` - ---- - -### 🔗 `docs/index.md` – Link to the new chapter - -```diff ---- a/docs/index.md -+++ b/docs/index.md -@@ -10,6 +10,7 @@ - - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) - - [Modeling Reference](./Config_and_Macros.md) -+- [Parallelism & Cache](./Cache_and_Parallelism.md) - - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) -```` - - - - - -# Incremental models - -Incremental models let you **reuse existing data** and only process **new or changed rows** instead of rebuilding a table from scratch on every run. This is essential for larger datasets or frequently running pipelines.
- -This page explains the **concepts and configuration** of incremental models in FastFlowTransform (FFT) independently of any specific example project. - ---- - -## Why incremental models? - -By default, a model is built with a **full refresh**: - -* Read all sources -* Recompute all transformations -* Overwrite the target table - -For small tables this is fine. For anything medium-sized or larger, this quickly becomes: - -* slow, -* expensive (especially on cloud warehouses / Spark), -* and unnecessary if only a small portion of rows changed. - -Incremental models solve this by: - -1. Reusing existing target data. -2. Processing only **new / changed** rows. -3. Applying an **incremental strategy** (append or merge). - ---- - -## High-level architecture - -Incremental behaviour is coordinated between three layers: - -1. **Model configuration** - - You declare that a model is incremental and provide hints: - - * Does it append or upsert? - * What is the **unique key**? - * Which column(s) indicate freshness (e.g. `updated_at`)? - - This lives in the model’s `config(...)` (SQL) or `meta` (Python) and is validated against a strict schema. - -2. **Planner / Core** - - FFT looks at: - - * the model’s incremental config (`incremental={...}`), - * whether the physical table already exists, - * CLI flags like `--full-refresh`, - - and decides whether to: - - * run a **full rebuild**, or - * run an **incremental update** using engine hooks. - -3. 
**Engine executors** (DuckDB, Postgres, Databricks/Spark, …) - - Each engine implements a small incremental API: - - * `exists_relation(relation)` - * `create_table_as(relation, select_sql)` – initial full build - * `full_refresh_table(relation, select_sql)` – forced rebuild - * `incremental_insert(relation, select_sql)` – append-only - * `incremental_merge(relation, select_sql, unique_key)` – upsert / merge - * `alter_table_sync_schema(relation, select_sql, mode=...)` – optional schema evolution - - The planner calls these methods – you just configure the model. - ---- - -## Enabling incremental mode - -You enable incremental mode **per model** via the model config. - -### SQL models - -Inside the Jinja `config` block you use a structured `incremental` dictionary: - -```sql -{{ config( - materialized='incremental', - tags=['example:incremental', 'engine:duckdb'], - incremental={ - "enabled": true, - "strategy": "merge", - "unique_key": ["event_id"], - "updated_at_column": "updated_at" - } -) }} - -select - event_id, - updated_at, - value -from some_source -```` - -Key points: - -* `materialized='incremental'` tells FFT to use the incremental pipeline. -* `incremental.enabled: true` declares that this model supports incremental processing. -* `unique_key` declares one or more columns that uniquely identify a row in the target. -* `strategy` is a hint for how deltas should be applied: `"merge"` (as shown above), `"append"`, `"insert"`, or `"full_refresh"`. -* `updated_at_column` (or `delta_columns`/`updated_at_columns`) tells FFT which column is used for “new vs old” comparisons (usually a timestamp or monotonically increasing surrogate). - -There is **no extra `meta={...}` wrapper** anymore – the fields of `config(...)` are validated directly.
- -### Python engine models - -For `@engine_model` functions you pass the same information via the `meta` parameter – but again with **top-level incremental config**, not inside another `meta` key: - -```python -from fastflowtransform import engine_model - -@engine_model( - only="duckdb", - name="fct_events_py_incremental", - deps=["events_base.ff"], - tags=["incremental", "engine:duckdb"], - meta={ - "materialized": "incremental", - "incremental": { - "enabled": True, - "strategy": "merge", - "unique_key": ["event_id"], - "updated_at_column": "updated_at", - }, - }, -) -def build(df): - # Return a frame with event_id, updated_at, value, ... - return df -``` - -The **frame you return** (pandas, Spark, etc.) is treated as the *delta dataset* for incremental processing – FFT does not care how you compute it, only about the columns and the meta. - ---- - -## Incremental strategies - -The core supports at least two conceptual strategies: - -### 1. Append / insert-only (`strategy: "append"` / `"insert"`) - -Use this when: - -* data is immutable once written, and -* new rows have strictly increasing `updated_at` / timestamp or surrogate key. - -Behaviour: - -* For the **first run**, FFT calls `create_table_as(relation, SELECT ...)`. -* For **subsequent runs**: - - * Only rows considered “new” are included in the SELECT (using your configured watermark columns). - * The executor calls `incremental_insert(relation, SELECT ...)` which typically becomes: - - ```sql - INSERT INTO target_table - SELECT ... - ``` - -Good for: - -* log/event style tables -* audit trails -* many ingestion pipelines - -### 2. Merge / upsert (`strategy: "merge"`) - -Use this when: - -* rows may change later, -* you want the target table to always reflect the **latest version** per `unique_key`. - -Behaviour: - -* For the **first run**, same as full refresh: `create_table_as`. -* For **later runs**: - - * The SELECT (or delta query, see below) produces a *delta* frame with new/updated rows. 
- * Executor tries `incremental_merge(relation, select_sql, unique_key)`. - -Engine-specific behaviour: - -* **Databricks / Spark (Delta)** - The executor attempts a native Delta MERGE: - - ```sql - MERGE INTO target AS t - USING (SELECT ...) AS s - ON t.key1 = s.key1 AND ... - WHEN MATCHED THEN UPDATE SET * - WHEN NOT MATCHED THEN INSERT * - ``` - - If MERGE is not supported (non-Delta table), it falls back to a safe full rebuild. - -* **Other engines (DuckDB, Postgres, …)** - The executor can implement merge using: - - * `INSERT ... ON CONFLICT ... DO UPDATE` (Postgres), - * a **full-refresh emulation**: build a new version by combining old rows and delta rows and overwrite. - -In all cases, the `unique_key` list is used to match rows between existing table and delta frame. - ---- - -## Watermark / delta SQL and default behaviour - -To decide **which rows are “new enough”** for an incremental run, FFT uses the configuration you provide (for example `updated_at_column` or `delta_columns`) plus the existing table. - -A typical default pattern is: - -```sql -where updated_at > ( - select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') - from {{ this }} -) -``` - -The exact SQL will vary by engine, but the core idea is: - -* Read the current maximum of your watermark column in the target. -* Select only rows strictly newer than that. - -### Overriding the delta logic - -If the default “`updated_at > max(updated_at)`” is not enough, you have a few options: - -1. **Additional delta columns** - - Use `delta_columns` / `updated_at_columns` in `incremental={...}` to indicate multiple fields that drive change detection (especially for Python incremental). - -2. 
**Inline delta SQL (`delta_sql`)** - - Provide a custom **delta SELECT** that FFT should use on incremental runs: - - ```sql - {{ config( - materialized='incremental', - incremental={ - "enabled": true, - "strategy": "merge", - "unique_key": ["event_id"], - "updated_at_column": "updated_at", - "delta_sql": " - with base as ( - select event_id, updated_at, value - from {{ ref('events_base.ff') }} - ) - select * - from base - where updated_at > ( - select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') - from {{ this }} - ) - " - } - ) }} - ``` - -3. **External delta config (`delta_config`)** - - Keep the base query in the model, but put the delta SQL into a separate YAML file and reference it via `delta_config: "config/incremental/my_model.delta.yml"`. - -In all cases, FFT still delegates the **merge/insert mechanics** to the executor; you only control what qualifies as “delta”. - ---- - -## Full refresh vs incremental - -You can always force a full rebuild: - -```bash -fft run . --env dev --full-refresh -``` - -The logic is: - -* If `--full-refresh` is set → **ignore incremental** and call `full_refresh_table`. - -* Otherwise, if the model has `incremental.enabled` and the target exists: - - * attempt incremental path (`incremental_insert` / `incremental_merge`), - -* Otherwise: - - * do initial full build via `create_table_as`. - ---- - -## Schema evolution for incremental models - -Real tables evolve. To avoid incremental runs failing when the output schema changes, executors can implement: - -```python -alter_table_sync_schema(relation: str, select_sql: str, mode: str = "append_new_columns") -``` - -Typical behaviour (Spark example): - -1. Run the SELECT with `LIMIT 0` to infer the **output schema**. -2. Compare it to the existing table schema. -3. For any **new columns**: - - * issue `ALTER TABLE ... ADD COLUMNS (...)`, - * map complex types to reasonable SQL types (often defaulting to `STRING` in Spark for safety). 
- -Modes: - -* `"append_new_columns"` – only new columns are added; existing columns are left untouched. -* `"sync_all_columns"` – more aggressive sync, may also adjust types (implementation-specific). - -For DuckDB/Postgres, the simplest implementation may be a no-op initially; more advanced engines (or future versions) can support automatic `ALTER TABLE` statements. - ---- - -## Storage overrides and Delta Lake integration - -Incremental models work with both: - -1. **Managed / catalog tables**, and -2. **Storage overrides** via `project.yml` / model config, e.g.: - - ```yaml - models: - storage: - fct_events: - path: ".local/spark/fct_events" - format: delta - ``` - -The storage layer (`fastflowtransform.storage`) provides helpers like: - -* `get_model_storage(name)` – resolve per-model `path`/`format`/`options` -* `spark_write_to_path(spark, identifier, df, storage=..., default_format=...)` - -For Spark/Delta: - -* Incremental models can be backed by **Delta files** at a fixed path. - -* The executor writes the DataFrame to a temporary directory, then atomically renames it into place and wires up: - - ```sql - CREATE TABLE `db`.`tbl` - USING DELTA - LOCATION '/path/to/model' - ``` - -* Incremental MERGE (`incremental_merge`) then runs against this Delta table. - -This keeps: - -* a stable location on disk / in the lake, -* and a proper table in the metastore/catalog. - -When the Databricks/Spark executor's `table_format` (or `FF_DBR_TABLE_FORMAT`) resolves to `delta`, -FastFlowTransform automatically pulls in `delta-spark` and configures both -`spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension` and -`spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` (unless you -already provided custom values). Install `delta-spark >= 4.0` and you can seed/run Delta-backed -models without manually adding Spark CLI flags. 
- ---- - -## Interaction with metadata and DAG selection - -After each successful build, executors call: - -```python -on_node_built(node, relation, fingerprint) -``` - -which uses the meta helpers: - -* `ensure_meta_table(executor)` -* `upsert_meta(executor, node_name, relation, fingerprint, engine_name)` - -The `_ff_meta` table records, for each model and engine: - -* the relation name, -* the last fingerprint/hash, -* timestamps, etc. - -While this metadata is **not strictly required** for incremental mechanics, it is used for advanced features such as: - -* **state-based selection** (`--select state:modified`, etc.), -* change-aware DAG runs. - -Incremental models work together with these features: you can, for example, run only models whose source files changed and let the incremental planner update them efficiently. - ---- - -## Best practices & recommendations - -* **Always define a `unique_key`** for merge strategies. - Without a stable key, upserts can behave unpredictably. - -* **Use timestamps or monotonically increasing columns** for delta selection. - Avoid non-deterministic expressions (e.g. `now()` in your model SQL) in incremental filters. - -* **Start simple**: - - * Begin with `strategy: "append"` and a single `updated_at_column`. - * Move to `strategy: "merge"` only when you truly need updates. - -* **Test both fresh and incremental runs**: - - * First run with an empty database (initial full build). - * Then run again with new rows and verify the target grew as expected. - * Add automated tests that run the same model twice and assert row counts / contents. - -* **Use `--full-refresh` when semantics change**: - If you change the business logic of a model in a way that invalidates old rows, do a full rebuild at least once. - - - - - -# Profiles Configuration - -FastFlowTransform uses `profiles.yml` to describe how each environment connects to the execution engine (DuckDB, Postgres, BigQuery, Databricks Spark, Snowflake Snowpark, …). 
This document covers file layout, supported features, environment overrides, and loading precedence. - -## File Location - -`profiles.yml` lives at the project root (same level as `models/`, `project.yml`). The CLI loads it whenever you run `fft` commands (seed/run/test/dag/utest/docgen …). - -``` -project/ -├── models/ -├── project.yml -└── profiles.yml -``` - -## Basic Structure - -The file is parsed as YAML after optional Jinja rendering. Top-level keys represent profile “names” (e.g. `dev`, `prod`, `dev_postgres`). Each profile must include an `engine` plus engine-specific configuration. - -```yaml -dev: - engine: duckdb - duckdb: - path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" - -stg: - engine: postgres - postgres: - dsn: "{{ env('FF_PG_DSN') }}" - db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" - -prod: - engine: bigquery - bigquery: - project: "{{ env('FF_BQ_PROJECT') }}" - dataset: "{{ env('FF_BQ_DATASET') }}" - location: EU - -default: - engine: duckdb - duckdb: - path: ":memory:" -``` - -### Engines and Sections - -Supported engines and their expected sections: - -| Engine | Section | Key Fields | -|----------------------|--------------------|---------------------------------------------------| -| `duckdb` | `duckdb` | `path` (file path or `:memory:`) | -| `postgres` | `postgres` | `dsn`, `db_schema` | -| `bigquery` | `bigquery` | `project` (optional), `dataset`, `location` | -| `databricks_spark` | `databricks_spark` | `master`, `app_name`, optional `extra_conf`, `warehouse_dir`, `use_hive_metastore`, `database`, `table_format`, `table_options` | -| `snowflake_snowpark` | `snowflake_snowpark`| `account`, `user`, `password`, `warehouse`, `database`, `db_schema`, optional `role` | - -Each profile can define its own `vars:` block (values exposed via `var('key')` inside templates). - -## Environment Variables - -`profiles.yml` supports Jinja expressions. 
The helper `env('FF_VAR', 'fallback')` reads process environment variables and substitutes the default if unset. Examples: - -```yaml -dev_postgres: - engine: postgres - postgres: - dsn: "{{ env('FF_PG_DSN') }}" - db_schema: "{{ env('FF_PG_SCHEMA', 'analytics') }}" -``` - -These expressions are rendered *before* YAML parsing. If the environment variable is missing and no default is provided, the expression resolves to an empty string and validation will fail with a clear error message. - -## Loading Order & Precedence - -When running `fft` commands, `_load_dotenv_layered()` loads `.env` files in ascending precedence: - -1. `<repo_root>/.env` -2. `<project_dir>/.env` -3. `<project_dir>/.env.local` -4. `<project_dir>/.env.<env_name>` -5. `<project_dir>/.env.<env_name>.local` - -Earlier files provide defaults and later files override them, but *only for keys that are not already defined in the process environment*. **Values set in the shell (e.g. via `FF_ENGINE=duckdb fft run …`) have highest priority**—they remain untouched, even if `.env` files define the same key. - -After `.env` loading, `profiles.yml` is rendered with Jinja (using the current `os.environ`) and parsed by Pydantic. Validation ensures required fields are present for each engine and produces human-readable errors for missing DSNs, schemas, etc. - -## Selecting Profiles - -- **Via `--env` flag**: `fft run . --env dev_postgres` -- **Via `FFT_ACTIVE_ENV`**: set in shell or `.env` to choose the active profile name. -- **Legacy `FF_ENGINE`** (overrides `engine` field post-parse): useful for quick experiments but explicit `profiles.yml` entries are preferred. - -Example Makefile snippet that switches profiles without exposing secrets: - -```make -ENGINE ?= duckdb - -ifeq ($(ENGINE),duckdb) - PROFILE_ENV = dev_duckdb -endif -ifeq ($(ENGINE),postgres) - PROFILE_ENV = dev_postgres -endif - -seed: - FFT_ACTIVE_ENV=$(PROFILE_ENV) uv run fft seed .
--env $(PROFILE_ENV) -``` - -## Using `.env` for Secrets - -Keep sensitive credentials out of VCS by storing them in `.env` files referenced above: - -``` -examples/api_demo/ -├── .env.dev_duckdb # FF_DUCKDB_PATH=... -├── .env.dev_postgres # FF_PG_DSN=..., FF_PG_SCHEMA=... -├── .env.dev_databricks # FF_SPARK_MASTER=..., FF_SPARK_APP_NAME=... -└── profiles.yml -``` - -These files stay out of git (via `.gitignore`), while `profiles.yml` contains only non-sensitive wiring. - -## Summary of Features - -- Multiple profiles in a single YAML file. -- Jinja templating with `env()` helper for dynamic values. -- `.env` layered loading with shell overrides taking precedence. -- Validation for engine-specific parameters (clear error messages). -- Profile-specific `vars` exposed to Jinja `var()` function in models. -- Works seamlessly across CLI commands: seed, run, dag, test, docgen, utest. - -Keep `profiles.yml` declarative, `.env` files secret, and use CLI or Makefiles to select the active profile per run. This pattern scales from local DuckDB demos to production Postgres/BigQuery/Snowflake deployments. - - - - - -# Sources Configuration - -`sources.yml` declares external tables (seeds, raw inputs, lakehouse paths) that models can reference via `{{ source('group', 'table') }}`. This document covers the schema, engine overrides, file paths, and best practices. - -## File Location - -Place `sources.yml` at your project root (same level as `models/`). 
Example: - -``` -project/ -├── models/ -├── sources.yml -└── seeds/ -``` - -## YAML Schema (Version 2) - -FastFlowTransform expects a dbt-style structure: - -```yaml -version: 2 -sources: - - name: raw - schema: staging # default schema for this source group - overrides: - postgres: - schema: raw_main # engine-specific default override - - tables: - - name: seed_users - identifier: seed_users # optional physical name - overrides: - duckdb: - schema: main - databricks_spark: - format: delta - location: "/mnt/delta/raw/seed_users" -``` - -### Fields - -| Level | Field | Description | -|----------|-------------|-------------| -| source | `name` | Logical group identifier referenced by `source('name', ...)`. | -| | `schema` | Default target schema/database for the group. | -| | `database`/`catalog` | Optional qualifiers per engine (BigQuery, Snowflake). | -| | `overrides` | Map of engine → config snippet (schema overrides, formats, locations). | -| table | `name` | Logical table name (second argument in `source()`). | -| | `identifier`| Physical name; defaults to `name` if omitted. | -| | `location` | File/path location (used with `format`). | -| | `format` | Ingestion format for engines supporting path-based sources (`delta`, `parquet`, …). | -| | `options` | Dict of format options (Spark/Databricks). | -| | `overrides` | Additional engine-specific settings merged with source-level overrides. | - -Engine-specific overrides follow this merge order: - -1. Source defaults (`schema`, `database`, …) -2. Source-level `overrides[engine]` -3. Table-level `overrides[engine]` - -### Engine Behavior - -- **DuckDB / Postgres / BigQuery / Snowflake**: expect `identifier` (plus `schema`/`database` where relevant). Path-based sources raise errors. -- **Databricks Spark**: supports `format` + `location`. The executor registers a temp view with optional `options` (e.g. `compression`). 
- -### Path-Based Sources Example - -```yaml - - name: raw_events - tables: - - name: landing - overrides: - databricks_spark: - format: json - location: "abfss://landing@storage.dfs.core.windows.net/events/*.json" - options: - multiline: true -``` - -## Referencing Sources in Models - -```sql -select id, email -from {{ source('raw', 'seed_users') }} -``` - -After rendering, the executor resolves the fully-qualified relation or path depending on the active engine. - -## Seed Integration - -When combined with `seeds/schema.yml`, you can map CSV/Parquet seeds into schemas per engine: - -```yaml -targets: - raw/users: - schema: raw - schema_by_engine: - duckdb: main - postgres: staging -``` - -## Validation & Errors - -- Missing `identifier` *and* `location` produce `KeyError` during rendering. -- Unknown source/table names raise `KeyError` with suggestions. -- Unsupported path-based sources on an engine (`location` provided but no `format`) raise descriptive `NotImplementedError`. - -Keep `sources.yml` declarative, use engine overrides for schema differences, and lean on `.env` files where credentials or URIs vary per environment. - - - - - -# Project Configuration (`project.yml`) - -`project.yml` defines global metadata, documentation, variables, and data-quality tests for a FastFlowTransform project. This reference walks through the supported sections and common patterns. - -## File Location - -`project.yml` lives at the root of your project. 
- -``` -project/ -├── models/ -├── project.yml -└── profiles.yml -``` - -## Top-Level Keys - -```yaml -name: my_project -version: "0.1" -models_dir: models # optional, defaults to "models" - -docs: - dag_dir: site/dag # output for fft dag --html - models: - users: - description: "Raw users table" - columns: - id: "Primary key" - email: "Email address" - -vars: - snapshot_day: "2024-01-01" - default_limit: 100 - -tests: - - type: not_null - table: users - column: id - tags: [batch] -``` - -### Metadata - -| Key | Description | -|-------------|-------------| -| `name` | Project identifier (used in docs/metadata). | -| `version` | Arbitrary version string. | -| `models_dir`| Relative directory containing models (`*.ff.sql` / `*.ff.py`). | - -### Documentation (`docs`) - -- `dag_dir`: where `fft dag --html` writes the static site. -- `models`: per-model descriptions and column docs surfaced in the generated DAG/docs. - -### Variables (`vars`) - -Key/value pairs accessible via `{{ var('key', default) }}` in Jinja templates. CLI overrides (`--vars key=value`) take precedence. - -### Tests (`tests`) - -Project-wide data quality checks run by `fft test`. Each test is a dict with: - -- `type`: `not_null`, `unique`, `accepted_values`, `row_count_between`, `greater_equal`, `non_negative_sum`, `freshness`, or reconciliation checks (`reconcile_equal`, `reconcile_diff_within`, `reconcile_ratio_within`, `reconcile_coverage`). -- `table`: target table or relation. -- `column`: required for column-based tests. -- Optional: `tags`, `severity` (`error`/`warn`), additional parameters (e.g. `values`, `min`, `max`). 
- -Example: - -```yaml -tests: - - type: accepted_values - table: mart_users - column: status - values: [active, invited] - severity: warn - - type: reconcile_equal - name: revenue_vs_bookings - left: { table: fct_revenue, expr: "sum(amount)" } - right: { table: fct_bookings, expr: "sum(expected_amount)" } - abs_tolerance: 5.0 -``` - -## Interaction with `.env` and Profiles - -`project.yml` does not read environment variables directly. However: - -- `vars:` can reference `var('key')` defaults overridden by CLI or `.env`. -- Tests often depend on `profiles.yml` and `sources.yml` for the actual connection details. -- Makefiles may set `FFT_ACTIVE_ENV` or other `FF_*` variables influencing runs, but `project.yml` remains static. - -## Best Practices - -- Keep `project.yml` committed to version control (no secrets). -- Use `docs/` to provide richer Markdown descriptions; reference them via `columns` or `description` fields if desired. -- Organize tests by tag (`tags: [batch]`, `tags: [reconcile]`) to support selective execution: `fft test . --select tag:reconcile`. - -Refer to `docs/Data_Quality_Tests.md` for detailed test semantics and `docs/Profiles.md` for profile/env loading behavior. - - - - - -# State Selection — R1 - -Build only changed nodes or select by last run results. - -## Changed Nodes - -- `state:modified` — models that have changed since last cached fingerprint. -- `state:modified+` — the above plus all downstream dependents. 
- -```bash -# First run populates cache -fft run examples/r1_demo --env dev --cache rw -# Touch files / change SQL → next run: -fft run examples/r1_demo --env dev --cache rw --select state:modified -fft run examples/r1_demo --env dev --cache rw --select state:modified+ -```` - -## Result-based Selection - -Use the last `run_results.json`: - -* `result:ok` — successful models (no warnings) -* `result:warn` — successful but with warnings -* `result:fail` — alias of `result:error` -* `result:error`— failed models - -```bash -fft run examples/r1_demo --env dev --select result:error -``` - -### Artifacts - -``` -examples/r1_demo/.fastflowtransform/target/ -├── manifest.json -├── run_results.json -└── catalog.json -``` - - - - - -# YAML Tests (Schema-bound) - -Schema-bound tests live in `models/*.yml` or `models/**/schema.yml` and complement (or replace) `project.yml`-based tests. - -## Example - -```yaml -# examples/r1_demo/models/users_enriched.yml -version: 2 -models: - - name: users_enriched - description: "Adds gmail flag" - columns: - - name: id - tests: - - not_null: { severity: error } - - unique - - name: email - tests: - - not_null - - accepted_values: - values: ["a@example.com","b@example.com","c@gmail.com"] - severity: warn -```` - -### Severities - -* `error` → contributes to failures (exit code 2). -* `warn` → surfaced in summary as ❕, does not affect exit code. - -### Run - -```bash -fft test examples/r1_demo --env dev -# Select only tests tagged 'reconcile' (if present) -fft test examples/r1_demo --env dev --select tag:reconcile -``` - -### Output (excerpt) - -``` -Data Quality Summary -──────────────────── -✅ not_null users.id (3ms) -❌ unique users.id (2ms) - ↳ [unique] users.id: found 1 duplicate -❕ accepted_values users_enriched.email (1ms) - -Totals -────── -✓ passed: 2 -✗ failed: 1 -! 
warnings: 1 -``` - - - - - -# Data Quality Test Reference - -FastFlowTransform exposes a set of built-in data quality checks that you can configure in `project.yml → tests:` and execute with `fft test`. This document lists every supported test, required parameters, and example configurations. - -## Supported Test Types - -The following values are currently supported for `type`: - -- `not_null` -- `unique` -- `accepted_values` -- `greater_equal` -- `non_negative_sum` -- `row_count_between` -- `freshness` -- `reconcile_equal` -- `reconcile_ratio_within` -- `reconcile_diff_within` -- `reconcile_coverage` - -## Usage Overview - -```yaml -# project.yml -tests: - - type: not_null - table: users - column: id - severity: error # default (omit for error) - tags: [batch] - - - type: unique - table: users - column: email - tags: [batch] - - - type: accepted_values - table: users - column: status - values: [active, invited] - severity: warn # warn keeps run green on failure - - - type: greater_equal - table: orders - column: amount - threshold: 0 - - - type: non_negative_sum - table: orders - column: amount - - - type: row_count_between - table: users_enriched - min_rows: 1 - max_rows: 100000 - - - type: freshness - table: events - column: event_ts - max_delay_minutes: 30 - - - type: reconcile_equal - name: revenue_vs_bookings # optional label in summaries - tags: [reconcile] - left: { table: fct_revenue, expr: "sum(amount)" } - right: { table: fct_bookings, expr: "sum(expected_amount)" } - abs_tolerance: 5.0 -```` - -Every entry is a single dictionary describing one check. The common keys are: - -| Key | Description | -| ---------- | ------------------------------------------------------------------------ | -| `type` | Test kind (see list above). | -| `table` | Target table for table-level checks or display hint for reconciliations. | -| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | -| `severity` | `error` (default) or `warn`. 
| -| `tags` | Optional list of selectors for `fft test --select tag:...`. | -| `name` | Optional identifier surfaced in summaries (useful for reconciliations). | - -Run all configured checks: - -```bash -fft test . --env dev -``` - -Use `--select tag:` to restrict by tags (e.g. `fft test --select tag:batch`). Tests always execute regardless of cache settings. - -Each entry produces a summary line. Failures stop the command unless `severity: warn` is set. - -## Table-Level Checks - -These checks operate on a single table (optionally filtered with `where:`). Unless noted, they require a `column` argument. - -### `not_null` - -* **Purpose:** Assert that a column never contains NULLs. -* **Parameters:** - - * `column` *(str, required)* - * `where` *(str, optional)* — SQL predicate applied before the NULL check. -* **Failure:** Reports the number of NULL rows and shows the underlying SQL. - ---- - -### `unique` - -* **Purpose:** Detect duplicates within a column. -* **Parameters:** - - * `column` *(str, required)* - * `where` *(str, optional)* -* **Failure:** Indicates how many duplicate groups were found (HAVING `count(*) > 1`) and shows a sample query. - ---- - -### `accepted_values` - -* **Purpose:** Ensure every non-NULL value is inside an allowed set. -* **Parameters:** - - * `column` *(str, required)* - * `values` *(list, required)* — permitted literals (strings are quoted automatically). - * `where` *(str, optional)* — additional filter condition. -* **Behaviour note:** If `values` is omitted or an empty list, the check is treated as a no-op and always passes. The summary still shows the configured test. -* **Failure:** Shows the number of out-of-set values plus up to five sample values. - ---- - -### `greater_equal` - -* **Purpose:** Require all values to be greater than or equal to a threshold. -* **Parameters:** - - * `column` *(str, required)* - * `threshold` *(number, default `0`)* -* **Failure:** Lists how many rows fell below the threshold. 
- ---- - -### `non_negative_sum` - -* **Purpose:** Validate that the sum of a numeric column is not negative. -* **Parameters:** - - * `column` *(str, required)* -* **Failure:** Reports the signed sum when it is negative. - ---- - -### `row_count_between` - -* **Purpose:** Guard minimum (and optional maximum) row counts for a table. -* **Parameters:** - - * `min_rows` *(int, default `1`)* — minimum expected number of rows. - * `max_rows` *(int, optional)* — omit for open-ended upper bounds. -* **Failure:** Indicates the observed row count when it falls outside `[min_rows, max_rows]`. - ---- - -### `freshness` - -* **Purpose:** Warn when the latest timestamp is older than an allowed delay. -* **Parameters:** - - * `column` *(str, required)* — timestamp column. - * `max_delay_minutes` *(int, required)* — permitted staleness in whole minutes. -* **Failure:** Reports the computed lag in minutes. Uses: - - ```sql - select date_part('epoch', now() - max(column)) / 60.0 as delay_min - from <table> - ``` - - This is straightforward for DuckDB/Postgres; other engines may need adaptations. - -## Cross-Table Reconciliations - -Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. The top-level `table`/`column` fields are used only for display and grouping; the actual queries are defined via the nested dictionaries. - -### `reconcile_equal` - -* **Purpose:** Compare two scalar expressions with optional tolerances. -* **Parameters:** - - * `left`, `right` *(dict, required)* with keys: - - * `table` *(str, required)* - * `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). - * `where` *(str, optional)* - * `abs_tolerance` *(float, optional)* — maximum absolute difference. - * `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. -* **Failure:** Displays both values, absolute and relative differences. 
If no tolerance is provided, strict equality is enforced (diff must be exactly `0.0`). - ---- - -### `reconcile_ratio_within` - -* **Purpose:** Constrain the ratio `left/right` within bounds. -* **Parameters:** - - * `left`, `right` *(dict, required as above)* - * `min_ratio`, `max_ratio` *(float, required)* -* **Failure:** Shows the computed ratio and expected interval. - ---- - -### `reconcile_diff_within` - -* **Purpose:** Limit the absolute difference between two aggregates. -* **Parameters:** - - * `left`, `right` *(dict, required)* - * `max_abs_diff` *(float, required)* -* **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. - ---- - -### `reconcile_coverage` - -* **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero). -* **Parameters:** - - * `source` *(dict, required)* — must contain: - - * `table` *(str)* — source table. - * `key` *(str)* — key column in the source. - * `target` *(dict, required)* — must contain: - - * `table` *(str)* — target table. - * `key` *(str)* — key column in the target. - * `source_where` *(str, optional)* — filter applied to the source. - * `target_where` *(str, optional)* — filter applied to the target. -* **Failure:** Reports the number of missing keys. - -## Severity & Tags - -* `severity: error` (default) makes failures stop the test run with exit code 2. -* `severity: warn` records the result but keeps the run successful. -* `tags:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. - -## CLI Summary Output - -Each executed check produces a line in the summary: - -```text -✓ not_null users.email (3ms) -✖ accepted_values events.status values=['new', 'active'] (warn) -``` - -Failures include the generated SQL (where available) to simplify debugging. Use `fft test --verbose` for more detail, or `FFT_SQL_DEBUG=1` to log the underlying queries. 
- -## Further Reading - -* `docs/YAML_Tests.md` – schema for YAML-defined tests and advanced scenarios. -* `fft test --help` — command-line switches, selectors, and cache options. - - - - - -# CLI Guide - -FastFlowTransform’s CLI is the entry point for seeding data, running DAGs, generating docs, syncing metadata, and executing quality tests. This guide summarizes the day-to-day commands and how they fit together. See `src/fastflowtransform/cli.py` for Typer definitions. - -## Core Commands - -| Command | Purpose | -|---------|---------| -| `fft seed [--env dev]` | Materialize CSV/Parquet seeds into the configured engine. | -| `fft run [--env dev]` | Execute the DAG (obeys cache + parallel flags). | -| `fft dag --html` | Render the DAG graph/site for quick inspection. | -| `fft docgen [--out site/docs] [--emit-json path] [--open-source]` | Generate the full documentation bundle (graph + model pages + optional JSON). Default output is `/site/docs`. | -| `fft test [--env dev]` | Run schema/data-quality tests defined in `project.yml` or schema YAML files. | -| `fft utest ` | Execute unit tests defined under `tests/unit/*.yml`. | -| `fft sync-db-comments ` | Push model/column descriptions into Postgres or Snowflake comments. | - -Use `--select` to scope `run`, `dag`, or `test` commands (e.g. `state:modified`, `tag:finance`, `result:error`). Environment overrides rely on the selected profile in `profiles.yml` or the `FF_*` variables. - -## HTTP/API Helpers - -Python models can make HTTP calls via `fastflowtransform.api.http`. When you need examples, head over to `docs/Api_Models.md` for `get_json`, `get_df`, pagination helpers, caching, and offline modes. - -## DAG & Documentation - -- Narrow the graph with `fft dag ... --select ` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini-site under `/docs/index.html`. -- Control schema introspection via `--with-schema/--no-schema`. 
Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). -- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. - -## Sync Database Comments - -`fft sync-db-comments --env ` pushes model and column descriptions from project YAML or Markdown into database comments. The command currently supports Postgres and Snowflake Snowpark: - -- Start with `--dry-run` to review the generated `COMMENT` statements. -- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). -- Snowflake reuses the session or connection exposed by the executor. - -If no descriptions are found, the command exits without making changes. - - - - - -# Auto-Docs & Lineage - -FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) plus an optional JSON manifest for external tooling. - -## Commands - -```bash -# Classic -fft dag . --env dev --html - -# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) -fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json -``` - -Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. - -## Descriptions - -Descriptions can be provided in YAML (`project.yml`) and/or Markdown files. Markdown has higher priority. - -YAML in `project.yml`: - -```yaml -docs: - models: - users.ff: - description: "Raw users table imported from CRM." - columns: - id: "Primary key." - email: "User email address." - users_enriched: - description: "Adds gmail flag." - columns: - is_gmail: "True if email ends with @gmail.com" -``` - -Markdown overrides YAML when present: - -``` -/docs/models/.md -/docs/columns//.md -``` - -Optional front matter is ignored for now (title/tags may be used later). 
- -## Column Lineage - -- SQL models: expressions like `col`, `alias AS out`, `upper(u.email) AS email_upper` are parsed; `u` must come from a `FROM ... AS u` clause that resolves to a relation. Functions mark lineage as *transformed*. -- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. -- Override hints in YAML when the heuristic is insufficient: - -```yaml -docs: - models: - mart_orders_enriched: - lineage: - email_upper: - from: [{ table: users, column: email }] - transformed: true -``` - -## JSON Manifest - -The optional manifest (via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), and lineage per column—useful for custom doc portals or CI checks. - -## Notes - -- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. -- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. - - - - - -# Logging & Verbosity - -FastFlowTransform exposes uniform logging controls across all CLI commands plus a dedicated SQL debug channel for tracing rendered SQL, dependency loading, and auxiliary queries. - -## CLI Flags - -- `-q` / `--quiet` → only errors (`ERROR`) -- *(default)* → concise warnings (`WARNING`) -- `-v` / `--verbose` → progress/info (`INFO`) -- `-vv` → full debug (`DEBUG`) including SQL debug output - -`-vv` automatically flips on the SQL debug channel (same effect as `FFT_SQL_DEBUG=1`). - -## SQL Debug Channel - -Enable it to inspect Python-model inputs, dependency columns, and helper SQL emitted by data-quality checks: - -```bash -# full debug (recommended) -fft run . -vv - -# equivalent using the env var (legacy behaviour retained) -FFT_SQL_DEBUG=1 fft run . -``` - -## Usage Patterns - -```bash -fft run . -q # quiet (errors only) -fft run . # default (concise) -fft run . -v # verbose progress (model names, executor info) -fft run . 
-vv # full debug + SQL channel -``` - -## Parallel Logging UX - -- Each node emits start/end lines with duration, truncated name, and engine abbreviation (DUCK/PG/BQ/…). -- Output remains line-stable via a thread-safe log queue; per-level summaries trail each run. -- Failures still surface the familiar “error block” per node for quick diagnosis. - -**Notes** - -- SQL debug output routes through the `fastflowtransform.sql` logger; use `-vv` or `FFT_SQL_DEBUG=1` to reveal it. -- Existing projects do not need changes: the environment variable keeps working even without `-vv`. - - - - - -# Model Unit Tests (`fft utest`) - -`fft utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. - -## Cache Modes - -`fft utest --cache {off|ro|rw}` (default: `off`) - -- `off`: deterministic, never skips. -- `ro`: skip on cache hit; on miss, build but **do not write** cache. -- `rw`: skip on hit; on miss, build **and write** fingerprint. - -Notes: - -- UTests key the cache with `profile="utest"`. -- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. -- `--reuse-meta` is currently a reserved flag: exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. - -## Why Use UTests? - -- Fast feedback on transformation logic without full DAG runs. -- Small, reproducible fixtures (rows inline or external CSV). -- Engine-agnostic: swap DuckDB/Postgres to spot dialect differences. 
- -## Folder Layout - -Specs live under `/tests/unit/*.yml` relative to the project root (the directory passed to the CLI that contains `models/`): - -``` -your-project/ -├── models/ -│ ├── users.ff.sql -│ ├── users_enriched.ff.py -│ └── mart_users.ff.sql -└── tests/ - └── unit/ - ├── users_enriched.yml - └── mart_users.yml -``` - -## YAML DSL (with `defaults`) - -Each file targets one logical node (the DAG name). Defaults are deep-merged into every case so you can share inputs/expectations and override per scenario. - -```yaml -# tests/unit/users_enriched.yml -model: users_enriched - -defaults: - inputs: - users: - rows: - - {id: 1, email: "a@example.com"} - - {id: 2, email: "b@gmail.com"} - expect: - relation: users_enriched - order_by: [id] - -cases: - - name: basic_gmail_flag - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - - - name: override_inputs - inputs: - users: - rows: - - {id: 3, email: "c@hotmail.com"} - - {id: 4, email: "d@gmail.com"} - expect: - rows: - - {id: 3, email: "c@hotmail.com", is_gmail: false} - - {id: 4, email: "d@gmail.com", is_gmail: true} -``` - -SQL models use the file stem (including `.ff`) as `model`. 
Provide expected relation names that match the materialized table/view: - -```yaml -# tests/unit/mart_users.yml -model: mart_users.ff - -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} - expect: - relation: mart_users - order_by: [id] - -cases: - - name: passthrough_columns - expect: - rows: - - {id: 1, email: "a@example.com", is_gmail: false} - - {id: 2, email: "b@gmail.com", is_gmail: true} -``` - -For multi-dependency models, include every physical relation name (what `relation_for(dep)` returns): - -```yaml -model: mart_orders_enriched -defaults: - inputs: - users_enriched: - rows: - - {id: 1, email: "x@gmail.com", is_gmail: true} - orders: - rows: - - {order_id: 10, user_id: 1, amount: 19.9} - - {order_id: 11, user_id: 1, amount: -1.0} -cases: - - name: join_and_flag - expect: - any_order: true - rows: - - {order_id: 10, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: 19.9, valid_amt: true} - - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} -``` - -## Input Formats - -- `rows`: inline dictionaries per row. -- `csv`: reference a CSV file (relative paths allowed). - -Keys under `inputs` are physical relations; use `relation_for('users.ff')` if unsure. - -## Expected Output & Comparison - -- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`). -- Ordering: `order_by: [...]` or `any_order: true`. -- Columns: `ignore_columns: [...]`, `subset: true`. -- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` - (numbers can be plain `1e-9` or quoted; they are cast to float). - -## Running UTests - -```bash -fft utest . # discover all specs -fft utest . --env dev # use a specific profile -fft utest . --model users_enriched -fft utest . --model mart_orders_enriched --case join_and_flag -fft utest . 
--path tests/unit/users_enriched.yml -``` - -Override the executor for all specs (ensure credentials/DSNs are set): - -```bash -export FF_PG_DSN="postgresql+psycopg://postgres:postgres@localhost:5432/ffdb" -export FF_PG_SCHEMA="public" -fft utest . --engine postgres -``` - -Executor precedence (highest → lowest): CLI `--engine`, YAML `engine:` (optional), `profiles.yml`, environment overrides. - -## Design Notes - -- Only the target model runs; supply all upstream relations the model expects. -- `defaults` deep-merge: dicts merge, lists/scalars overwrite. -- Results compare as DataFrames with configurable order, subsets, ignored columns, and numeric tolerances. -- Exit codes: `0` for success, `2` when at least one case fails (compact CSV-style diff is printed). - -## CI Example - -```yaml -name: utests -on: [push, pull_request] -jobs: - duckdb: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: { python-version: "3.11" } - - run: pip install -e . - - run: fft utest . --env dev -``` - -For Postgres, add a service container and run `fft utest . --engine postgres` with `FF_PG_DSN` / `FF_PG_SCHEMA`. - - - - - -# Troubleshooting & Error Codes - -Use this checklist when FastFlowTransform commands misbehave. Each item points to the quickest fix plus the relevant CLI options. - -## Quick Fixes - -- **DuckDB seeds not visible** → ensure `FF_DUCKDB_PATH` (or the profile path) is identical for `seed`, `run`, `dag`, and `test`. If you configure `FF_DUCKDB_SCHEMA` / `FF_DUCKDB_CATALOG`, keep them consistent across commands so unqualified references resolve to the right namespace. -- **Postgres connection refused** → confirm `FF_PG_DSN`, container status (`docker ps`), and that port `5432` is open. -- **BigQuery permissions** → set `GOOGLE_APPLICATION_CREDENTIALS` and match dataset/location to your profile. -- **HTML docs missing** → run `fft dag --html` and open `/docs/index.html`. 
-- **Unexpected test failures** → inspect rendered SQL in CLI output, refine selection via `--select`, refresh seeds if needed. -- **Dependency table not found in utests** → provide all physical upstream relations in the YAML spec. - -## Error Codes - -| Type | Class/Source | Exit | Notes | -|---------------------------|---------------------------|------|---------------------------------------------------------| -| Missing dependency | `DependencyNotFoundError` | 1 | Per-node list; tips for `ref()` / names | -| Cycle in DAG | `ModelCycleError` | 1 | “Cycle detected among nodes: …” | -| Model execution (KeyError)| `cli.py` → formatted block| 1 | Inspect columns, use `relation_for(dep)` as keys | -| Data quality failures | `cli test` → summary | 2 | Totals section prints passed/failed counts | -| Unknown/unexpected | generic | 99 | Optional trace via `FFT_TRACE=1` | - -Error types map to the classes documented in `docs/Technical_Overview.md#core-modules` and the CLI source. - - - - - -# Basic Demo Project - -The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, Databricks Spark, and BigQuery. - -## Why it exists - -- **Start small** – demonstrate the minimum folder structure (`seeds/`, `models/`, `profiles.yml`) needed to run `fft`. -- **Engine parity** – prove that a single project can target multiple engines by swapping profiles. -- **Cloud & local** – show that the same project runs both on local engines (DuckDB/Postgres/Spark) and in a cloud warehouse (BigQuery). -- **Understand outputs** – show where documentation and manifests land after a run. - -Use it as a sandbox before adding your own sources, macros, or Python models. - -## Project layout - -| Path | Purpose | -|------|---------| -| `seeds/seed_users.csv` | Sample CRM-style user data. 
`fft seed` materializes it as a physical `seed_users` table in the active engine (schema/dataset depends on the profile). | -| `models/staging/users_clean.ff.sql` | Normalizes emails, casts types, and tags the model for all engines. | -| `models/marts/mart_users_by_domain.ff.sql` | Aggregates users per email domain and records the first/last signup dates. | -| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models selecting the most recent signup per domain from the staging view:
• pandas for DuckDB/Postgres
• PySpark for Databricks
• BigQuery DataFrames (BigFrames) for BigQuery. | -| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, `dev_databricks`, and `dev_bigquery` profiles driven by environment variables. | -| `.env.dev_*` | Template environment files you can `source` per engine (`.env.dev_duckdb`, `.env.dev_postgres`, `.env.dev_databricks`, `.env.dev_bigquery`). | -| `Makefile` | One command (`make demo ENGINE=…`) to seed, run, document, test, and preview results. | - -## Running the demo - -1. `cd examples/basic_demo` -2. Choose an engine and export its environment variables: - ```bash - # DuckDB - set -a; source .env.dev_duckdb; set +a - - # Postgres - # set -a; source .env.dev_postgres; set +a - - # Databricks Spark - # set -a; source .env.dev_databricks; set +a - - # BigQuery (choose one) - # set -a; source .env.dev_bigquery_pandas; set +a # pandas client - # set -a; source .env.dev_bigquery_bigframes; set +a # BigFrames - ``` - -3. Execute the full flow for the selected engine: - - ```bash - # DuckDB / Postgres / Databricks - make demo ENGINE=duckdb - # make demo ENGINE=postgres - # make demo ENGINE=databricks_spark - - # BigQuery (set BQ_FRAME to choose pandas vs bigframes) - # builds into ..* - # requires a GCP project, dataset, and credentials (see BigQuery setup docs) - # set profiles.yml → bigquery.allow_create_dataset: true if the dataset should be auto-created - # make demo ENGINE=bigquery BQ_FRAME=bigframes - # make demo ENGINE=bigquery BQ_FRAME=pandas - ``` - - The Makefile runs `fft seed`, `fft run`, `fft dag`, and `fft test`. - - To open the rendered DAG site after a run: - - ```bash - make show ENGINE=duckdb - make show ENGINE=bigquery - ``` -4. Inspect artifacts: - - * `.fastflowtransform/target/manifest.json` and `run_results.json` - * `site/dag/index.html` for the rendered model graph - * Use your engine’s client (or `fft run` logs) to inspect the mart outputs - -## Data quality tests - -The demo enables baseline data quality checks in `project.yml`. 
Running `fft test` (or `make test ENGINE=…`) verifies that: - -* Primary keys remain unique/not-null across: - - * `seed_users` - * `users_clean` - * `mart_users_by_domain` - * the Python mart `mart_latest_signup` -* Aggregate metrics such as `user_count` never drop below zero. -* Each email domain appears only once in `mart_latest_signup`. - -These tests run against whatever engine/profile is active — including BigQuery, where they execute as standard SQL queries on the configured dataset. - - - - - -# Materializations Demo - -> This example shows how different **materializations** (`view`, `table`, `incremental`, `ephemeral`) behave in FastFlowTransform. - -The demo models are located under: -``` - -examples/materializations_demo/models/ - -```` - -Each model type demonstrates how FastFlowTransform builds, caches, or executes models differently depending on its `materialized:` configuration. - ---- - -## 🧩 1. View Models - -A **view** model is always re-created from scratch each run. -It defines a virtual relation that doesn’t store data permanently — ideal for lightweight transformations. - -```sql -{{ config(materialized='view') }} - -select - order_id, - customer_id, - total_amount, - order_date -from {{ ref('stg_orders') }} -```` - -**Characteristics** - -* Rebuilt each run (no persisted data) -* Useful for staging, joins, and intermediate logic -* Fast and always up-to-date with upstreams -* Cannot store or cache incremental state - ---- - -## 🧱 2. Table Models - -A **table** model materializes into a physical table on the target engine. - -```sql -{{ config(materialized='table') }} - -select * -from {{ ref('fct_orders_view') }} -``` - -**Characteristics** - -* Fully rebuilt every run -* Good for final curated datasets or small tables -* Overwrites previous contents (atomic replace) -* Compatible with all engines (DuckDB, Postgres, BigQuery, etc.) - ---- - -## ⚡ 3. 
Incremental Models - -An **incremental** model stores state and only updates changed records on subsequent runs. - -```sql -{{ config( - materialized='incremental', - incremental={ - "enabled": true, - "unique_key": "order_id", - "updated_at_column": "updated_at", - "delta_sql": "select * from {{ ref('stg_orders') }} where updated_at > (select max(updated_at) from {{ this }})" - } -) }} -``` - -**Characteristics** - -* Persists data between runs -* Only merges new or changed rows -* Significantly faster for large tables -* Requires `unique_key` and (optionally) an `updated_at_column` -* Schema changes can be managed via: - - * `on_schema_change: "ignore"` - * `on_schema_change: "append_new_columns"` - * `on_schema_change: "sync_all_columns"` - -**Behavior example:** - -| Run | Operation | Rows affected | -| --- | ----------- | ------------- | -| 1 | full load | 10,000 | -| 2 | merge delta | 120 | -| 3 | merge delta | 45 | - ---- - -## 🧮 4. Ephemeral Models - -An **ephemeral** model exists only during query compilation. -It never creates a physical table or view — it’s inlined wherever it’s referenced. - -```sql -{{ config(materialized='ephemeral') }} - -select - order_id, - total_amount * 0.1 as tax_amount -from {{ ref('fct_orders_inc') }} -``` - -**Characteristics** - -* Inlined into parent queries -* Reduces I/O overhead (no temporary tables) -* Ideal for lightweight reusable SQL snippets -* Not visible in the warehouse after execution - ---- - -## 🔗 5. Combined Example DAG - -In the demo, these models are connected as follows: - -```text -stg_orders - ↓ -fct_orders_view (view) - ↓ -fct_orders_tbl (table) - ↓ -fct_orders_inc (incremental) - ↓ -fct_orders_ephemeral (ephemeral) -``` - -This DAG demonstrates: - -* How **data flows** between materializations -* Which ones persist or recompute data -* How incremental models can feed downstream table or ephemeral models - ---- - -## 🧭 When to Use Each Type - -| Materialization | Persists? 
| Performance | Recommended Use Case | -| --------------- | --------- | ------------------- | ----------------------------------------- | -| `view` | ❌ No | ⚡ Fast rebuild | Intermediate or temporary transformations | -| `table` | ✅ Yes | ⚖️ Moderate | Final outputs or smaller datasets | -| `incremental` | ✅ Yes | 🚀 High (on deltas) | Large, frequently updated fact tables | -| `ephemeral` | ❌ No | ⚡ Fast inline | Reusable SQL snippets or shared logic | - ---- - -## 🧠 Tips - -* You can set default materializations in `project.yml` under `models.materialized`. -* Override per model using `{{ config(materialized='...') }}`. -* For incremental models, ensure **unique keys** and **delta logic** are consistent across runs. -* Test behavior locally using the DuckDB engine before deploying to a warehouse. - - - - - -# Environment Matrix (DuckDB-only) — Example - -This tiny project demonstrates **per-environment configuration** (dev / stg / prod) while keeping everything on **DuckDB**. -Each environment uses its **own DuckDB file**, so you can switch environments without changing code. - -It also includes a **seed step** (CSV → table) and three minimal models: - -* `env_vars.ff` (Python) — echoes which env is active and which DuckDB file is used -* `hello.ff` (SQL view) — shows how `{{ this.* }}` resolves from the active profile -* `users.ff` (SQL table) — reads from the seeded CSV table to prove seeding works - ---- - -## What this shows - -* Layered environment files: `.env.dev`, `.env.stg`, `.env.prod` (+ optional `*.local` overrides) -* `profiles.yml` that reads from `env('…')` so connection details live in env files -* All environments use **DuckDB**, but **different DB files** (e.g. 
`.local/env_matrix.dev.duckdb`, `.local/env_matrix.stg.duckdb`, …) -* Seeding CSV → `seed_users` table, then a simple model consuming it - ---- - -## Project layout - -``` -examples/env_matrix/ -├─ models/ -│ ├─ env_vars.ff.py # Python model: shows env + DuckDB file info -│ └─ users.ff.sql # SQL table: reads from seeded 'seed_users' -├─ seeds/ -│ └─ users.csv # sample data for seeding (-> seed_users) -├─ profiles.yml # all envs = DuckDB, different paths -├─ .env # shared defaults (optional) -├─ .env.dev # dev environment vars -├─ .env.stg # stg environment vars -├─ .env.prod # prod environment vars -├─ .env.dev.local # private overrides (gitignored; optional) -├─ .env.stg.local # private overrides (gitignored; optional) -├─ .env.prod.local # private overrides (gitignored; optional) -└─ Makefile # convenience targets (run, seed, dag) -``` - ---- - -## Environment files - -Each env file sets a different DuckDB path: - -* `.env.dev` - - ``` - FFT_ACTIVE_ENV=dev - FF_ENGINE=duckdb - FF_DUCKDB_PATH=.local/env_matrix.dev.duckdb - ``` - -* `.env.stg` - - ``` - FFT_ACTIVE_ENV=stg - FF_ENGINE=duckdb - FF_DUCKDB_PATH=.local/env_matrix.stg.duckdb - ``` - -* `.env.prod` - - ``` - FFT_ACTIVE_ENV=prod - FF_ENGINE=duckdb - FF_DUCKDB_PATH=.local/env_matrix.prod.duckdb - ``` - -> You can place secrets or machine-local tweaks in `.env.<env>.local` (ignored by git). 
-> Optional toggles (if you want verbose SQL logs): -> `FFT_SQL_DEBUG=1`, `FFT_LOG_JSON=1` - ---- - -## `profiles.yml` (DuckDB for all envs) - -```yaml -default: - dev: - engine: "{{ env('FF_ENGINE', 'duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" - - stg: - engine: "{{ env('FF_ENGINE', 'duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" - - prod: - engine: "{{ env('FF_ENGINE', 'duckdb') }}" - duckdb: - path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" -``` - ---- - -## Models - -### `models/env_vars.ff.py` (Python) - -Returns one row with: - -* `active_env_hint` (from `.env.*`), -* `ff_engine` (should be `duckdb` here), -* `duckdb_path`, `duckdb_exists`, `duckdb_size_bytes`. - -### `models/hello.ff.sql` (SQL view) - -Uses `{{ this.materialized }}`, `{{ this.schema }}`, `{{ this.database }}` so you can see what the active profile provides. (The simple `SELECT` is compatible with DuckDB; if you added casts like `::text`, they’re fine in DuckDB too.) - -### `models/users.ff.sql` (SQL table) - -Reads from the seeded table `seed_users`: - -```sql -{{ config(materialized='table', tags=['demo', 'seed']) }} - -select - id, - email -from "seed_users"; -``` - -> If you see an error “table seed_users does not exist”, you **haven’t run `fft seed`** for that environment yet. - ---- - -## Seeds - -`seeds/users.csv` is loaded by `fft seed` into a table named `seed_users`. -(That’s the default naming convention: `users.csv` → `seed_users`.) 
- ---- - -## Running it - -From the repo root: - -### Using `uv` directly - -**Dev** - -```bash -uv run fft seed examples/env_matrix --env dev -uv run fft run examples/env_matrix --env dev -uv run fft dag examples/env_matrix --env dev --html -``` - -**Staging** - -```bash -uv run fft seed examples/env_matrix --env stg -uv run fft run examples/env_matrix --env stg -``` - -**Prod** - -```bash -uv run fft seed examples/env_matrix --env prod -uv run fft run examples/env_matrix --env prod -``` - -### Using the Makefile (inside `examples/env_matrix/`) - -```bash -make run-dev # runs the DAG on dev -make run-stg -make run-prod - -make seed-dev # seed only (dev) -make seed-stg -make seed-prod - -make dag-dev # generate HTML DAG for dev -make clean # remove .local/, docs/, site/, .fastflowtransform/ -``` - -> Tip: re-run `fft seed` whenever you switch environments or change `seeds/*.csv`. - ---- - -## Inspecting results - -* The **HTML DAG** (after `make dag-dev`) will be at: - - ``` - examples/env_matrix/site/dag/index.html - ``` -* The **artifacts** are under: - - ``` - examples/env_matrix/.fastflowtransform/target/{manifest.json, run_results.json, catalog.json} - ``` -* Query the DuckDB files directly with `duckdb` CLI or `python` + `duckdb` module if you want to peek inside. - ---- - -## Troubleshooting - -* **`seed_users` not found** - Run `fft seed` for the same environment: - `uv run fft seed examples/env_matrix --env dev` - -* **No logs showing** - Use `-v`/`-vv` and/or `--sql-debug` on the CLI, or set: - - ``` - FFT_SQL_DEBUG=1 - FFT_LOG_JSON=1 # optional JSON logs - ``` - -* **Wrong environment picked** - Double-check the `--env` flag in your CLI call and ensure the `.env.<env>` file exists. 
- ---- - -## Clean up - -```bash -make clean # from examples/env_matrix/ -# or manually: -rm -rf examples/env_matrix/.local examples/env_matrix/site examples/env_matrix/docs -rm -rf examples/env_matrix/.fastflowtransform -``` - - - - - -# Data Quality Demo Project - -The **Data Quality Demo** shows how to use **all built-in FFT data quality tests** on a small, understandable model: - -* Column checks: - - * `not_null` - * `unique` - * `accepted_values` - * `greater_equal` - * `non_negative_sum` - * `row_count_between` - * `freshness` -* Cross-table reconciliations: - - * `reconcile_equal` - * `reconcile_ratio_within` - * `reconcile_diff_within` - * `reconcile_coverage` - -It uses a simple **customers / orders / mart** setup so you can see exactly what each test does and how it fails when something goes wrong. - ---- - -## What this example demonstrates - -1. **Basic column checks** on staging tables - Ensure IDs are present and unique, amounts are non-negative, and status values are valid. - -2. **Freshness** on a timestamp column - Check that the most recent order in your mart is not “too old”, using `last_order_ts`. - -3. **Row count sanity checks** - Guard against empty tables and unexpectedly large row counts. - -4. **Cross-table reconciliations** between staging and mart - Verify that sums and counts match between `orders` and the aggregated `mart_orders_agg`, and that every customer has a corresponding mart row. - -5. **Tagged tests and selective execution** - All tests are tagged (e.g. `example:dq_demo`, `reconcile`) so you can run exactly the subset you care about. 
- ---- - -## Project layout (example) - -```text -examples/dq_demo/ - .env - .env.dev_duckdb - .env.dev_postgres - .env.dev_databricks - Makefile # optional, convenience wrapper around fft commands - profiles.yml - project.yml - sources.yml - - seeds/ - customers.csv - orders.csv - - models/ - staging/ - customers.ff.sql - orders.ff.sql - marts/ - mart_orders_agg.ff.sql -``` - -### Seeds - -* `seeds/customers.csv` - Simple customer dimension (e.g. `customer_id`, `name`, `status`). - -* `seeds/orders.csv` - Order fact data (e.g. `order_id`, `customer_id`, `amount`, `order_ts` as a string). - -### Models - -**1. Staging: `customers.ff.sql`** - -* Materialized as a table. -* Casts IDs and other fields into proper types. -* Used as the “clean” customer dimension for downstream checks. - -**2. Staging: `orders.ff.sql`** - -* Materialized as a table. -* Casts fields to proper types so DQ tests work reliably: - - ```sql - {{ config( - materialized='table', - tags=[ - 'example:dq_demo', - 'scope:staging', - 'engine:duckdb', - 'engine:postgres', - 'engine:databricks_spark' - ], - ) }} - - select - cast(order_id as int) as order_id, - cast(customer_id as int) as customer_id, - cast(amount as double) as amount, - cast(order_ts as timestamp) as order_ts - from {{ source('crm', 'orders') }}; - ``` - - This is important for: - - * numeric checks (`greater_equal`, `non_negative_sum`) - * timestamp-based `freshness` checks - -**3. 
Mart: `mart_orders_agg.ff.sql`** - -Aggregates orders per customer and prepares data for reconciliation + freshness: - -```sql -{{ config( - materialized='table', - tags=[ - 'example:dq_demo', - 'scope:mart', - 'engine:duckdb', - 'engine:postgres', - 'engine:databricks_spark' - ], -) }} - --- Aggregate orders per customer for DQ & reconciliation tests -with base as ( - select - o.order_id, - o.customer_id, - -- Ensure numeric and timestamp types for downstream DQ checks - cast(o.amount as double) as amount, - cast(o.order_ts as timestamp) as order_ts, - c.name as customer_name, - c.status as customer_status - from {{ ref('orders.ff') }} o - join {{ ref('customers.ff') }} c - on o.customer_id = c.customer_id -) -select - customer_id, - customer_name, - customer_status as status, - count(*) as order_count, - sum(amount) as total_amount, - min(order_ts) as first_order_ts, - max(order_ts) as last_order_ts -from base -group by customer_id, customer_name, customer_status; -``` - -The important columns for DQ tests are: - -* `status` → used for `accepted_values` -* `order_count` and `total_amount` → used for numeric and reconciliation tests -* `last_order_ts` → used for `freshness` - ---- - -## Data quality configuration (`project.yml`) - -All tests live under `project.yml → tests:`. -This example uses the tag `example:dq_demo` for easy selection. 
- -### Column-level checks - -```yaml -tests: - # 1) IDs must be present and unique - - type: not_null - table: customers - column: customer_id - tags: [example:dq_demo, batch] - - - type: unique - table: customers - column: customer_id - tags: [example:dq_demo, batch] - - # 2) Order amounts must be >= 0 - - type: greater_equal - table: orders - column: amount - threshold: 0 - tags: [example:dq_demo, batch] - - # 3) Total sum of amounts must not be negative - - type: non_negative_sum - table: orders - column: amount - tags: [example:dq_demo, batch] - - # 4) Customer status values must be within a known set - - type: accepted_values - table: mart_orders_agg - column: status - values: ["active", "churned", "prospect"] - severity: warn # show as warning, not hard failure - tags: [example:dq_demo, batch] - - # 5) Row count sanity check on mart - - type: row_count_between - table: mart_orders_agg - min_rows: 1 - max_rows: 100000 - tags: [example:dq_demo, batch] - - # 6) Freshness: last order in the mart must not be "too old" - - type: freshness - table: mart_orders_agg - column: last_order_ts - max_delay_minutes: 100000000 - tags: [example:dq_demo, batch] -``` - -### Cross-table reconciliations - -```yaml - # 7) Reconcile total revenue between orders and mart - - type: reconcile_equal - name: total_amount_orders_vs_mart - tags: [example:dq_demo, reconcile] - left: - table: orders - expr: "sum(amount)" - right: - table: mart_orders_agg - expr: "sum(total_amount)" - abs_tolerance: 0.01 - - # 8) Ratio of sums should be ~1 (within tight bounds) - - type: reconcile_ratio_within - name: total_amount_ratio - tags: [example:dq_demo, reconcile] - left: - table: orders - expr: "sum(amount)" - right: - table: mart_orders_agg - expr: "sum(total_amount)" - min_ratio: 0.999 - max_ratio: 1.001 - - # 9) Row count diff between orders and mart should be bounded - - type: reconcile_diff_within - name: order_count_diff - tags: [example:dq_demo, reconcile] - left: - table: orders - expr: 
"count(*)" - right: - table: mart_orders_agg - expr: "sum(order_count)" - max_abs_diff: 0 - - # 10) Coverage: every customer should appear in the mart - - type: reconcile_coverage - name: customers_covered_in_mart - tags: [example:dq_demo, reconcile] - source: - table: customers - key: "customer_id" - target: - table: mart_orders_agg - key: "customer_id" -``` - -This set of tests touches **all available test types** and ties directly back to the simple data model. - ---- - -## Running the demo - -Assuming you are in the repo root and using DuckDB as a starting point: - -### 1. Seed the data - -```bash -fft seed examples/dq_demo --env dev_duckdb -``` - -This reads `seeds/customers.csv` and `seeds/orders.csv` and materializes them as tables referenced by `sources.yml`. - -### 2. Run the models - -```bash -fft run examples/dq_demo --env dev_duckdb -``` - -This builds: - -* `customers` (staging) -* `orders` (staging) -* `mart_orders_agg` (mart) - -### 3. Run all DQ tests - -```bash -fft test examples/dq_demo --env dev_duckdb --select tag:example:dq_demo -``` - -You should see a summary like: - -```text -Data Quality Summary -──────────────────── -✅ not_null customers.customer_id -✅ unique customers.customer_id -✅ greater_equal orders.amount -✅ non_negative_sum orders.amount -❕ accepted_values mart_orders_agg.status -✅ row_count_between mart_orders_agg -✅ freshness mart_orders_agg.last_order_ts -✅ reconcile_equal total_amount_orders_vs_mart -✅ reconcile_ratio_within total_amount_ratio -✅ reconcile_diff_within order_count_diff -✅ reconcile_coverage customers_covered_in_mart - -Totals -────── -✓ passed: 10 -! warnings: 1 -``` - -(Exact output will differ, but you’ll see pass/failed/warned checks listed.) - -### 4. Run only reconciliation tests - -```bash -fft test examples/dq_demo --env dev_duckdb --select tag:reconcile -``` - -This executes just the cross-table checks, which is handy when you’re iterating on a mart. 
- ---- - -## Things to experiment with - -To understand the tests better, intentionally break the data and re-run `fft test`: - -* Set one `customers.customer_id` to `NULL` → watch `not_null` fail. -* Duplicate a `customer_id` → watch `unique` fail. -* Put a negative `amount` in `orders.csv` → `greater_equal` and `non_negative_sum` fail. -* Add a new `status` value (e.g. `"paused"`) → `accepted_values` warns. -* Drop a customer from `mart_orders_agg` manually (or filter it out in SQL) → `reconcile_coverage` fails. -* Change an amount in the mart only → reconciliation tests fail. - -This makes it very clear what each test guards against. - ---- - -## Summary - -The Data Quality Demo is designed to be: - -* **Small and readable** – customers, orders, and a single mart. -* **Complete** – exercises every built-in FFT DQ test type. -* **Practical** – real-world patterns like: - - * typing in staging models, - * testing freshness on a mart timestamp, - * reconciling sums and row counts across tables. - -Once you’re comfortable with this example, you can copy the patterns into your real project: start with staging-level checks, then layer in reconciliations and freshness on your most important marts. - - - - - -# Macros Demo - -**Goal:** Showcase **SQL Jinja macros** and **Python render-time macros** working together across engines (DuckDB, Postgres, Databricks Spark). -You’ll see reusable SQL helpers, engine-aware SQL generation, and Python functions exposed as Jinja globals/filters. 
- ---- - -## Directory structure - -```text -examples/macros_demo/ - .env - .env.dev_databricks - .env.dev_duckdb - .env.dev_postgres - Makefile - profiles.yml - project.yml - sources.yml - seeds/ - seed_users.csv - seed_orders.csv - models/ - macros/ - utils.sql - star.sql - macros_py/ - helpers.py - common/ - stg_users.ff.sql - stg_orders.ff.sql - dim_users.ff.sql - fct_user_sales.ff.sql - engines/ - duckdb/ - py_example.ff.py - postgres/ - py_example.ff.py - databricks_spark/ - py_example.ff.py -``` - ---- - -## What this demo shows - -* **SQL Jinja macros** (`models/macros/*.sql`) - - * `email_domain(expr)` – derive email domain - * `safe_cast_amount(expr)` – engine-aware numeric cast - * `coalesce_any(expr, default)` – small convenience - * `default_country()` – pull a default from `project.yml → vars` - * `star_except(relation, exclude_cols)` – select all except listed columns (falls back to `*` if columns unknown) -* **Python macros** (`models/macros_py/helpers.py`) - - * `slugify(str)` – URL-friendly slug - * `mask_email(email)` – redact local part - * `csv_values(rows, cols)` – inline small lookup tables via SQL `VALUES(...)` -* **Usage from models** - - * `stg_users` uses SQL + Python macros at render time - * `stg_orders` uses engine-aware casting - * `dim_users` builds a tiny inline lookup via `csv_values(...)` - * `fct_user_sales` aggregates across staged models - ---- - -## Prerequisites - -* A working FFT installation (CLI `fft` available) -* For Postgres/Databricks: valid local env and drivers -* The core must expose these Jinja globals (already done in the FFT core): - - * `var(name, default)`, `env(name, default)`, `engine(default)` - (Used by profiles/macros to read vars and detect engine.) - ---- - -## Seeds - -Two tiny CSVs materialized via `fft seed`: - -* `seed_users.csv` — `id,email,country` -* `seed_orders.csv` — `order_id,customer_id,amount,order_ts` - -`profiles.yml` and `project.yml` give minimal storage and connection configs. 
- ---- - -## How to run - -From repo root: - -```bash -cd examples/macros_demo - -# Choose engine: duckdb (default) | postgres | databricks_spark -make ENGINE=duckdb demo -# or -make ENGINE=postgres demo -# or -make ENGINE=databricks_spark demo -``` - -The `demo` target runs: - -1. `fft seed` — loads CSVs -2. `fft run` — builds models using macros -3. `fft dag --html` — writes DAG HTML to `site/dag/index.html` -4. `fft test` — runs example tests -5. Prints artifact paths and tries to open the DAG - ---- - -## Key files (highlights) - -### SQL macros – `models/macros/utils.sql` - -```jinja -{%- macro email_domain(expr) -%} - lower(split_part({{ expr }}, '@', 2)) -{%- endmacro -%} - -{%- macro safe_cast_amount(expr) -%} -{%- set e = engine('duckdb') -%} -{%- if e in ['duckdb', 'postgres', 'databricks_spark'] -%} - cast({{ expr }} as double) -{%- else -%} - cast({{ expr }} as double) -{%- endif -%} -{%- endmacro -%} - -{%- macro coalesce_any(expr, default) -%} - coalesce({{ expr }}, {{ default }}) -{%- endmacro -%} - -{%- macro default_country() -%} - '{{ var("default_country", "DE") }}' -{%- endmacro -%} -``` - -### SQL macros – `models/macros/star.sql` - -```jinja -{%- macro star_except(relation, exclude_cols) -%} -{%- set excl = exclude_cols | map('lower') | list -%} -{%- set cols = adapter_columns(relation) -%} -{%- if cols and cols|length > 0 -%} - {{- (cols | reject('in', excl) | map('string') | join(', ')) -}} -{%- else -%} - * -{%- endif -%} -{%- endmacro -%} -``` - -> Note: If the executor can’t describe columns for `relation`, this macro falls back to `*`. - -### Python macros – `models/macros_py/helpers.py` - -```python -def slugify(value: str) -> str: ... -def mask_email(email: str) -> str: ... -def csv_values(rows: list[dict], cols: list[str]) -> str: ... -``` - -Exposed as Jinja globals/filters at **render time** (not runtime SQL UDFs). 
- ---- - -## Models using macros - -### `stg_users.ff.sql` (Jinja + Python macro usage) - -* Coalesces missing country with `default_country()` -* Adds `email_domain(...)` -* Embeds a `slugify(var('site_name', ...))` literal into SQL - -```jinja -with src as ( - select - cast(id as int) as user_id, - lower(email) as email, - {{ coalesce_any("country", default_country()) }} as country - from {{ source('crm', 'users') }} -) -select - user_id, - email, - {{ email_domain("email") }} as email_domain, - country, - '{{ slugify(var("site_name", "My Site")) }}' as site_slug -from src; -``` - -### `stg_orders.ff.sql` (engine-aware types) - -```jinja -select - cast(order_id as int) as order_id, - cast(customer_id as int) as user_id, - {{ safe_cast_amount("amount") }} as amount, - cast(order_ts as timestamp) as order_ts -from {{ source('sales', 'orders') }}; -``` - -### `dim_users.ff.sql` (inline lookup via Python macro) - -```jinja -labels as ( - select * from (values {{ csv_values( - [ - {"domain":"example.com", "label":"internal"}, - {"domain":"gmail.com", "label":"consumer"}, - ], - ["domain","label"] - ) }}) as t(domain, label) -) -``` - -### `fct_user_sales.ff.sql` (final aggregation) - -Joins `stg_orders` with `dim_users` and aggregates. - ---- - -## Tests (examples) - -Declared in `project.yml`: - -* `not_null(dim_users.user_id)` -* `row_count_between(fct_user_sales, min_rows=1)` - -Run with: - -```bash -fft test examples/macros_demo --env dev_duckdb --select tag:example:macros_demo -``` - ---- - -## Troubleshooting - -* **`jinja2.exceptions.UndefinedError: 'var'/'env'/'engine' is undefined`** - Ensure your core’s Jinja environment registers these globals before loading templates: - - ```python - env.globals.update(var=..., env=..., engine=...) - ``` -* **Engine differences (types & functions):** - Always branch in macros (`engine(...)`) when types or functions differ. -* **`adapter_columns(...)` returns none:** - The `star_except` macro will fall back to `*`. 
For strict behavior, replace with static column lists per engine. - ---- - -## Extending this demo - -* Add more helpers to `helpers.py` (e.g., `render_json(obj)`, `join_csv(list)`). -* Create reusable macro libraries under `models/macros/` (date handling, SCD helpers, etc.). -* Use `var(...)` to parameterize behavior per environment or profile. - ---- - -Happy macro-ing! - - - - - -# 🧠 Cache & Parallelism Demo - -This example demonstrates FastFlowTransform’s **build cache**, **fingerprint logic**, **parallel scheduler**, and **HTTP response caching**. -It’s a compact playground to visualize **when nodes are skipped**, **what triggers rebuilds**, and **how caching accelerates iterative runs**. - ---- - -## 🗂 Directory Structure - -```text -cache_demo/ - .env.dev_duckdb - Makefile - profiles.yml - project.yml - sources.yml - models/ - seeds_consumers/ - stg_users.ff.sql - stg_orders.ff.sql - marts/ - mart_user_orders.ff.sql - python/ - py_constants.ff.py - http/ - http_users.ff.py - seeds/ - seed_users.csv - seed_orders.csv - README.md -``` - ---- - -## ⚙️ Overview - -This demo showcases several FastFlowTransform features: - -| Feature | Demonstrated by | -| -------------------------- | ----------------------------------------------- | -| Level-wise parallelism | Multiple models running concurrently (`--jobs`) | -| Deterministic fingerprints | Build cache skipping unchanged nodes | -| Upstream invalidation | Seed → staging → mart rebuilds | -| Environment invalidation | Any `FF_*` change triggers rebuild | -| Python model caching | Fingerprints derived from function source | -| HTTP response caching | Persistent API result cache with offline mode | - ---- - -## ⚡ Quickstart - -```bash -cd examples/cache_demo -make cache_first # builds all nodes, writes cache -make cache_second # no-op run (everything skipped) -make change_sql # touch a model -> rebuilds dependent mart -make change_seed # change seed -> rebuilds staging + mart -make change_env # set FF_* env -> 
invalidates cache globally -make change_py # edit py_constants.ff.py -> rebuilds that model -make run_parallel # runs entire DAG with 4 workers per level -``` - -Inspect results: - -* `.fastflowtransform/target/run_results.json` – fingerprints, results, timings, HTTP stats -* `site/dag/index.html` – DAG visualization -* `.local/http-cache/` – persisted API responses - ---- - -## 🧩 Model Summary - -| Model | Kind | Purpose | Notes | -| ------------------------- | ------ | --------------------------- | ------------------------------------ | -| `stg_users.ff.sql` | SQL | Load & normalize users seed | Rebuilds if seed changes | -| `stg_orders.ff.sql` | SQL | Load orders seed | Builds as a view | -| `mart_user_orders.ff.sql` | SQL | Join staging tables | Rebuilds if any staging changes | -| `py_constants.ff.py` | Python | Simple constant DataFrame | Fingerprint based on function source | -| `http_users.ff.py` | Python | HTTP fetch with cache | Uses `get_df()` and offline cache | - ---- - -## 🌐 HTTP Response Cache - -The `http_users.ff.py` model demonstrates the built-in HTTP cache: - -* **First run:** downloads `https://jsonplaceholder.typicode.com/users` -* **Subsequent runs:** reuse cached responses from `.local/http-cache` -* **Offline mode:** works with `FF_HTTP_OFFLINE=1` - -```bash -make http_first # warms HTTP cache -make http_offline # reuses cached response, no network access -make http_cache_clear # deletes cache directory -``` - -You can inspect HTTP usage in the `run_results.json` file: - -```bash -jq -r '.results[] | select(.http!=null) - | "\(.name): requests=\(.http.requests) cache_hits=\(.http.cache_hits) offline=\(.http.used_offline)"' \ - .fastflowtransform/target/run_results.json -``` - ---- - -## ⚙️ Cache Logic Recap - -FastFlowTransform caches model fingerprints and skips nodes when: - -1. **Fingerprints match** (SQL text, Python source, vars, engine, env, deps). -2. The **physical relation exists** in the database. 
- -Changing *any* of the following invalidates the cache: - -* SQL/Jinja content -* Python model code -* `sources.yml` -* `FF_*` environment variables -* Seed file contents -* Engine or profile name - -You can control cache behavior via CLI: - -```bash ---cache=off # always build ---cache=rw # default; skip on match; write cache ---cache=ro # read-only; skip on hit, build on miss ---cache=wo # always build, always write -``` - ---- - -## 🧮 Parallel Scheduler - -FastFlowTransform executes models **level-wise**: - -* Each level contains nodes whose dependencies are fully satisfied. -* Up to `--jobs` nodes per level run concurrently. -* Logs are serialized for clean output. - -Example: - -```bash -fft run . --env dev_duckdb --jobs 4 -``` - ---- - -## 🧪 Example Experiments - -| Scenario | Command | Expected behavior | -| ------------------------- | -------------------------------------- | ------------------------------- | -| First full run | `make cache_first` | All models build, cache written | -| No-op run | `make cache_second` | All skipped (no rebuilds) | -| Modify SQL | `make change_sql` | Downstream mart rebuilds | -| Add seed row | `make change_seed` | Staging + mart rebuild | -| Change env | `make change_env` | All nodes rebuild | -| Edit Python constant | `make change_py` | Only that Python model rebuilds | -| Warm & offline HTTP cache | `make http_first && make http_offline` | HTTP cache reused, no network | - ---- - -## 🧩 DAG Example - -After the first run, generate the DAG visualization: - -```bash -make dag -open site/dag/index.html -``` - -You’ll see: - -``` -seed_users → stg_users.ff -seed_orders → stg_orders.ff -(stg_users + stg_orders) → mart_user_orders.ff -py_constants -http_users -``` - -* `py_constants` runs independently (parallel) -* `mart_user_orders.ff` depends on both staging nodes - ---- - -## 🧰 Tips - -* **Inspect fingerprints:** stored in `.fastflowtransform/target/manifest.json` -* **Audit table:** `_ff_meta` table in the engine stores 
build metadata -* **Clear cache:** delete `.fastflowtransform/` or use `make clean` -* **Parallel debugging:** use `--keep-going` to continue unaffected levels - ---- - -## ✅ Takeaways - -* FFT’s build cache uses stable fingerprints to skip unchanged nodes. -* Fingerprints propagate downstream, ensuring correctness. -* The HTTP cache supports deterministic, offline API pipelines. -* Parallel execution accelerates runs without breaking dependencies. - -Together, these features make iterative development **fast, reliable, and reproducible**. - - - - - -# Incremental, Delta & Iceberg Demo - -This example project shows how to use **incremental models** and **Delta-/Iceberg-style merges** in FastFlowTransform across DuckDB, Postgres, Databricks Spark (Parquet, Delta & Iceberg), and BigQuery (pandas or BigFrames). - - -It is intentionally small and self-contained so you can copy/paste patterns into your own project. - ---- - -## Location & Layout - -The example lives under: - -```text -examples/incremental_demo/ -```` - -Directory structure: - -```text -incremental_demo/ - .env - .env.dev_duckdb - .env.dev_postgres - .env.dev_databricks_delta - .env.dev_databricks_iceberg - .env.dev_bigquery_pandas - .env.dev_bigquery_bigframes - Makefile - profiles.yml - project.yml - sources.yml - - seeds/ - seed_events.csv - - models/ - common/ - events_base.ff.sql - fct_events_sql_inline.ff.sql - fct_events_sql_yaml.ff.sql - engines/ - duckdb/ - fct_events_py_incremental.ff.py - postgres/ - fct_events_py_incremental.ff.py - databricks_spark/ - fct_events_py_incremental.ff.py - bigquery/ - pandas/ - fct_events_py_incremental.ff.py - bigframes/ - fct_events_py_incremental.ff.py -``` - -*Your actual filenames may differ slightly; the concepts are the same.* - ---- - -## What the demo shows - -The demo revolves around a tiny `events` dataset and three different ways to build an incremental fact table: - -1. 
**SQL incremental model with inline delta SQL** - - * `models/common/fct_events_sql_inline.ff.sql` - * All incremental logic (how to find “new/changed” rows) is defined directly in the model’s `config(meta=...)` block. - -2. **SQL incremental model with YAML config in `project.yml`** - - * `models/common/fct_events_sql_yaml.ff.sql` - * The base SELECT lives in the model, but all incremental hints (`incremental.enabled`, `unique_key`, `updated_at_column`, …) are configured in `project.yml → models.incremental`. - -3. **Python incremental model** - - * `models/engines/*/fct_events_py_incremental.ff.py` - * A Python model that returns a DataFrame; the executor applies incremental behaviour based on model `meta` (unique key + updated-at timestamp) and the target engine: - - * DuckDB / Postgres: incremental insert/merge in SQL - * Databricks Spark: `MERGE INTO` for Delta or Iceberg where available (Spark 4), with a fallback full-refresh strategy for other formats - * BigQuery: pandas- or BigFrames-backed DataFrame models with incremental merge logic handled by the BigQuery executor - -4. **Iceberg profile for Spark 4** - - * Optional Databricks/Spark profile that uses the built-in **Iceberg catalog**. - * Seeds and models are materialized as Iceberg tables in a local warehouse directory. - * `ref()` and `source()` automatically point to the Iceberg catalog when the `databricks_spark.table_format` is set to `iceberg`. - ---- - -## Seed data - -The demo uses a simple seed file: - -```text -examples/incremental_demo/seeds/seed_events.csv -``` - -Example contents (conceptually): - -```csv -event_id,updated_at,value -1,2024-01-01T10:00:00,10 -2,2024-01-01T10:05:00,20 -3,2024-01-01T10:10:00,30 -``` - -Running: - -```bash -fft seed examples/incremental_demo --env dev_duckdb -``` - -(or with your engine/env of choice) will materialize this seed into the warehouse (e.g. a DuckDB table or Postgres table). 
- ---- - -## Base model: `events_base` - -The base staging model simply exposes the events from the seed: - -```text -models/common/events_base.ff.sql -``` - -Conceptually: - -```sql -{{ config( - materialized='table', - tags=[ - 'example:incremental_demo', - 'scope:common', - 'engine:duckdb', - 'engine:postgres', - 'engine:databricks_spark', - ], -) }} - -select - event_id, - updated_at, - value -from {{ source('raw', 'events') }}; -``` - -All incremental models build on top of this base table. - ---- - -## Incremental configuration (high-level) - -All three incremental models share the same core idea: - -* Mark the model as **incremental** -* Provide a **unique key** (e.g. `event_id`) -* Provide an **updated-at / timestamp column** (e.g. `updated_at`) -* Optionally specify a **delta strategy**: - - * **Inline SQL** (in the model) - * **External YAML** (referenced from the model) - * **Python** (engine-specific model that returns the delta dataset) - -There are two ways to express this in the demo: - -1. **Inline on the model** (used by `fct_events_sql_inline.ff.sql`), via `config(...)`: - -```jinja -{{ config( - materialized='incremental', - unique_key='event_id', - incremental={'updated_at_column': 'updated_at'}, - tags=['example:incremental_demo'], -) }} -``` - -2. 
**As an overlay in `project.yml`** (used by `fct_events_sql_yaml.ff.sql` and the Python model): - -```yaml -models: - incremental: - fct_events_sql_yaml.ff: - unique_key: "event_id" - incremental: - enabled: true - updated_at_column: "updated_at" - - fct_events_py_incremental.ff: - unique_key: "event_id" - incremental: - enabled: true - updated_at_column: "updated_at" -``` - -The incremental engine then uses these `meta` fields to decide whether to: - -* create the table (`create_table_as`) for the **first run** -* perform an **incremental insert** or **merge** for subsequent runs - ---- - -## 1) SQL incremental with inline delta SQL - -File: - -```text -models/common/fct_events_sql_inline.ff.sql -``` - -In this variant, both *incremental configuration* and the *delta filter* live directly in the model: - -```jinja -{{ config( - materialized='incremental', - unique_key='event_id', - incremental={'updated_at_column': 'updated_at'}, - tags=[ - 'example:incremental_demo', - 'scope:common', - 'kind:incremental', - 'inc:type:inline-sql', - 'engine:duckdb', - 'engine:postgres', - 'engine:databricks_spark', - ], -) }} - -with base as ( - select * - from {{ ref('events_base.ff') }} -) -select - event_id, - updated_at, - value -from base -{% if is_incremental() %} -where updated_at > ( - select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') - from {{ this }} -) -{% endif %}; -``` - -On the **first run**, the engine sees no existing relation, so it materializes the full `select ... from events_base`. 
- -On subsequent runs, the `is_incremental()` branch is rendered, so only rows newer than the target’s current `max(updated_at)` are selected, and: - -* **DuckDB / Postgres**: inserts or merges the resulting rows into the target table -* **Databricks Spark**: tries a `MERGE INTO` (Delta) and falls back to a full-refresh if necessary -* **BigQuery**: applies incremental insert/merge logic in SQL via the BigQuery executor - ---- - -## 2) SQL incremental with YAML delta config - -File: - -```text -models/common/fct_events_sql_yaml.ff.sql -``` - -Here the model body only defines the **canonical SELECT** and does *not* contain any incremental hints: - -```jinja -{{ config( - materialized='incremental', - tags=[ - 'example:incremental_demo', - 'scope:common', - 'kind:incremental', - 'inc:type:yaml-config', - 'engine:duckdb', - 'engine:postgres', - 'engine:databricks_spark', - ], -) }} - -with base as ( - select * - from {{ ref('events_base.ff') }} -) -select - event_id, - updated_at, - value -from base; -``` - -All incremental behaviour for this model is driven by `project.yml`: - -```yaml -models: - incremental: - fct_events_sql_yaml.ff: - unique_key: "event_id" - incremental: - enabled: true - updated_at_column: "updated_at" -``` - -The registry merges this overlay into the model at load time, so the incremental runtime -sees effectively the same config as for the inline model (`unique_key` + `updated_at_column`) – -only the **source of truth** is different. - ---- - -### Inline vs YAML config at a glance - -| Model | Where is incremental configured? | What lives in the SQL file? | -|----------------------------|-----------------------------------------|-----------------------------------------------| -| `fct_events_sql_inline.ff` | Inline in `config(...)` on the model | Full SELECT **+** `is_incremental()` filter | -| `fct_events_sql_yaml.ff` | `project.yml → models.incremental` | Full SELECT only (no incremental hints) | - -Both end up with the same runtime meta, only the **location of config** differs. 
- -## 3) Python incremental model - -Files: - -```text -models/engines/duckdb/fct_events_py_incremental.ff.py -models/engines/postgres/fct_events_py_incremental.ff.py -models/engines/databricks_spark/fct_events_py_incremental.ff.py -models/engines/bigquery/pandas/fct_events_py_incremental.ff.py -models/engines/bigquery/bigframes/fct_events_py_incremental.ff.py -``` - -Each engine variant uses the same logical signature: - -```python -from fastflowtransform import engine_model -import pandas as pd # or pyspark.sql.DataFrame for Databricks Spark - - -@engine_model( - only="duckdb", # or "postgres" / "databricks_spark" - name="fct_events_py_incremental", - deps=["events_base.ff"], - tags=[ - "example:incremental_demo", - "scope:engine", - "engine:duckdb", # or engine-specific - ], - meta={ - "incremental": True, - "unique_key": ["event_id"], - "updated_at": "updated_at", - }, -) -def build(df_events): - # 'df_events' is either a pandas.DataFrame or Spark DataFrame - # depending on the engine. - # The function returns either: - # - a full canonical result, or - # - only the delta rows, depending on your design. - # - # In the simplest version, you just return the full dataset and let the - # executor handle incremental logic based on meta. 
- return df_events[["event_id", "updated_at", "value"]] -``` - -The executor uses the `meta.incremental` / `meta.unique_key` / `meta.updated_at` hints to run: - -* A **full-refresh** on the first run -* A **delta merge** on subsequent runs: - - * For DuckDB / Postgres: insert/merge SQL - * For Databricks Spark: - - * `MERGE INTO` for Delta tables, or - * a full-refresh fallback strategy that rewrites the table based on the union of existing + delta rows - ---- - -## Delta & Iceberg variants (Databricks / Spark) - -In addition to the “regular” incremental models, the demo also includes **Delta Lake** and **Iceberg** variants -that shows how to: - -- route a model to **Delta tables** via `project.yml` -- reuse the same incremental pattern, but with a **Delta-backed** table on Databricks/Spark -- keep Parquet and Delta models side-by-side in the same project - -This is optional and only relevant for the `databricks_spark` engine. - ---- - -### Storage configuration for the Delta / Iceberg models - -In `project.yml`, the Delta variant gets its own storage entry, separate from the Parquet fact table: - -```yaml -models: - storage: - # Existing Parquet fact table - fct_events_sql_inline: - path: ".local/spark/fct_events_sql_inline" - format: parquet - - # 🔹 Delta-based fact table (Spark/Databricks only) - fct_events_sql_inline_delta: - path: ".local/spark_delta/fct_events_sql_inline" - format: delta - - # ❄️ Iceberg-based fact table (Spark 4 / Databricks only) - fct_events_sql_inline_iceberg: - # Points into the Iceberg warehouse; must match your Iceberg catalog config - path: ".local/iceberg_warehouse/incremental_demo/fct_events_sql_inline" - format: iceberg -```` - -Notes: - -* The key `fct_events_sql_inline_delta` must match the **model name**. -* `format: delta` tells the Databricks/Spark executor to create `USING DELTA LOCATION ...`. -* The path is different from the Parquet path so artifacts don’t clash. 
- ---- - -### Delta fact model - -The Delta fact model is a close sibling of `fct_events_sql_inline.ff.sql`, but: - -* is tagged only for the Databricks/Spark engine -* is configured for incremental **merge** with a `unique_key` + `updated_at` column - -Example (conceptual) model: - -```sql --- models/common/fct_events_sql_inline_delta.ff.sql - -{{ config( - materialized='table', - tags=[ - 'example:incremental_demo', - 'kind:incremental', - 'engine:databricks_spark', - ], - meta={ - 'incremental': True, - 'unique_key': ['event_id'], - 'updated_at': 'updated_at', - 'delta': { - 'sql': " - with base as ( - select event_id, updated_at, value - from {{ ref('events_base.ff') }} - ) - select - event_id, - updated_at, - value - from base - where updated_at > ( - select coalesce(max(updated_at), timestamp '1970-01-01 00:00:00') - from {{ this }} - ) - " - }, - }, -) }} - --- canonical full-select (used for docs / full-refresh) -select - event_id, - updated_at, - value -from {{ ref('events_base.ff') }}; -``` - -What happens: - -* On the **first run**, the engine sees no existing table and does a full materialization - (a Delta table at `.local/spark_delta/fct_events_sql_inline`). -* On **subsequent runs**, the executor uses the `delta.sql` query as the **incremental delta** and: - - * attempts a `MERGE INTO` for Delta tables, or - * falls back to a full-refresh strategy if MERGE is not supported. - ---- - -### Running the Delta variant - -Once your Databricks/Spark profile is configured (e.g. `dev_databricks` in `profiles.yml` and `.env.dev_databricks`), -you can run the Delta model like any other: - -```bash -# From the repo root -cd examples/incremental_demo - -# Seed -FFT_ACTIVE_ENV=dev_databricks fft seed . - -# Run only the Delta variant -FFT_ACTIVE_ENV=dev_databricks fft run . \ - --select fct_events_sql_inline_delta.ff \ - --select tag:engine:databricks_spark - -# Or include it in the general incremental demo selection -FFT_ACTIVE_ENV=dev_databricks fft run . 
\ - --select tag:example:incremental_demo \ - --select tag:engine:databricks_spark -``` - -Optionally, you can add a small `not_null` test to `project.yml` to verify the Delta model: - -```yaml -tests: - - type: not_null - table: fct_events_sql_inline_delta - column: event_id - tags: [batch, delta] -``` - -Then run: - -```bash -FFT_ACTIVE_ENV=dev_databricks fft test . --select tag:delta -``` - -to validate the Delta-backed incremental table specifically. - ---- - -## Running the demo - -From the project root: - -```bash -cd examples/incremental_demo -``` - -### DuckDB - -```bash -# Seed -FFT_ACTIVE_ENV=dev_duckdb fft seed . - -# Initial full run -FFT_ACTIVE_ENV=dev_duckdb fft run . \ - --select tag:example:incremental_demo --select tag:engine:duckdb - -# Incremental run (after modifying seed_events.csv to add later events) -FFT_ACTIVE_ENV=dev_duckdb fft run . \ - --select tag:example:incremental_demo --select tag:engine:duckdb \ - --cache rw - -# Data-quality tests (if configured in project.yml / schema YAML) -FFT_ACTIVE_ENV=dev_duckdb fft test . \ - --select tag:example:incremental_demo -``` - -### Postgres - -```bash -FFT_ACTIVE_ENV=dev_postgres fft seed . -FFT_ACTIVE_ENV=dev_postgres fft run . \ - --select tag:example:incremental_demo --select tag:engine:postgres -FFT_ACTIVE_ENV=dev_postgres fft test . \ - --select tag:example:incremental_demo -``` - -### BigQuery - -```bash -# pandas -FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft seed . -FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft run . \ - --select tag:example:incremental_demo --select tag:engine:bigquery --cache rw -FF_ENGINE=bigquery FF_ENGINE_VARIANT=pandas FFT_ACTIVE_ENV=dev_bigquery_pandas fft test . \ - --select tag:example:incremental_demo - -# BigFrames -FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft seed . 
-FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft run . \ - --select tag:example:incremental_demo --select tag:engine:bigquery --cache rw -FF_ENGINE=bigquery FF_ENGINE_VARIANT=bigframes FFT_ACTIVE_ENV=dev_bigquery_bigframes fft test . \ - --select tag:example:incremental_demo -``` - -Ensure the service account credentials pointed to by `GOOGLE_APPLICATION_CREDENTIALS` can create/drop tables in the target dataset. - -### Databricks Spark - -```bash -FFT_ACTIVE_ENV=dev_databricks fft seed . -FFT_ACTIVE_ENV=dev_databricks fft run . \ - --select tag:example:incremental_demo --select tag:engine:databricks_spark -FFT_ACTIVE_ENV=dev_databricks fft test . \ - --select tag:example:incremental_demo -```` - -### Databricks Spark (parquet vs Delta) - -You can run the incremental demo on Databricks/Spark against either **parquet** or **Delta** tables. - -FFT reads the desired table format from the `FF_DBR_TABLE_FORMAT` environment variable, which overrides -`databricks_spark.table_format` from `profiles.yml`. - -When `FF_DBR_TABLE_FORMAT=delta`, the Databricks/Spark executor automatically wires Delta Lake into the -SparkSession (downloads the Maven artifact via `delta-spark`, adds -`spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension`, and sets -`spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` unless you already -overrode those settings). No extra `spark-submit --conf` flags are needed—just ensure the -`delta-spark >= 4.0` Python package is installed. - -From the repo root: - -```bash -cd examples/incremental_demo -```` - -Run with **parquet** tables (default): - -```bash -FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft seed . -FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft run . \ - --select tag:example:incremental_demo --select tag:engine:databricks_spark -FF_DBR_TABLE_FORMAT=parquet FFT_ACTIVE_ENV=dev_databricks fft test . 
\ - --select tag:example:incremental_demo -``` - -Run with **Delta** tables: - -```bash -FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft seed . -FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft run . \ - --select tag:example:incremental_demo --select tag:engine:databricks_spark -FF_DBR_TABLE_FORMAT=delta FFT_ACTIVE_ENV=dev_databricks fft test . \ - --select tag:example:incremental_demo -``` - -This way you can switch between parquet and Delta just by changing the `FF_DBR_TABLE_FORMAT` -environment variable, without touching the models or project.yml. - -Adjust environment names to match your `profiles.yml`. - -### Databricks Spark (Iceberg / Spark 4+) - -If you are on Spark 4 / Databricks with Iceberg support, you can also run the incremental demo -purely against Iceberg tables using a dedicated profile (for example `dev_databricks_iceberg`). - -That profile typically: - -* uses `engine: databricks_spark` -* sets `databricks_spark.table_format: iceberg` -* configures an Iceberg catalog via `extra_conf`, for example: - - models: - storage: - # Example warehouse location, adjust as needed - fct_events_sql_inline_iceberg: - path: ".local/iceberg_warehouse/incremental_demo/fct_events_sql_inline" - format: iceberg - -and in the profile (profiles.yml) something like: - - dev_databricks_iceberg: - engine: databricks_spark - databricks_spark: - master: "local[*]" - app_name: "incremental_demo" - warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" - extra_conf: - spark.sql.catalog.iceberg: org.apache.iceberg.spark.SparkCatalog - spark.sql.catalog.iceberg.type: hadoop - spark.sql.catalog.iceberg.warehouse: "file:///{{ project_dir() }}/.local/iceberg_warehouse" - -From the repo root: - - cd examples/incremental_demo - -Run seeds and models against Iceberg: - - FFT_ACTIVE_ENV=dev_databricks_iceberg fft seed . - - FFT_ACTIVE_ENV=dev_databricks_iceberg fft run . 
\ - --select tag:example:incremental_demo --select tag:engine:databricks_spark - - FFT_ACTIVE_ENV=dev_databricks_iceberg fft test . \ - --select tag:example:incremental_demo - -Under this profile, all `ref()` / `source()` calls in Spark SQL and Python models are resolved -against the Iceberg catalog, so seeds and incremental models operate purely on Iceberg tables. - - - - - -# API Demo Project - -The `examples/api_demo` scenario demonstrates how FastFlowTransform blends local data, external APIs, and multiple execution engines. It highlights: - -- **Hybrid data model**: joins a local seed (`crm.users`) with live user data from JSONPlaceholder. -- **Multiple environments**: switch between DuckDB, Postgres, Databricks Spark, and BigQuery (pandas or BigFrames client) using `profiles.yml` + `.env.*`. -- **HTTP integration**: compare the built-in FastFlowTransform HTTP client (`api_users_http`) with a plain `requests` implementation (`api_users_requests`). -- **Offline caching & telemetry**: inspect HTTP snapshots via `run_results.json`. -- **Engine-aware registration**: scope Python models via `engine_model` and SQL models via `config(engines=[...])` so only the active engine’s nodes load. - -## Data Model - -1. **Seed staging** – `models/common/users.ff.sql` - ```sql - {{ config( - materialized='table', - tags=[ - 'example:api_demo', - 'scope:common', - 'kind:seed-consumer', - 'engine:duckdb', - 'engine:postgres', - 'engine:databricks_spark', - 'engine:bigquery' - ] - ) }} - select id, email - from {{ source('crm', 'users') }}; - ``` - Consumes `sources.yml → crm.users` (seeded from `seeds/seed_users.csv`). - -2. **API enrichment** – engine-specific Python implementations under `models/engines//`: - - `api_users_http.ff.py` uses the built-in HTTP wrapper (`fastflowtransform.api.http.get_df`) with cache/offline support. - - `api_users_requests.ff.py` uses raw `requests` for maximum flexibility. 
- - Engine-specific callables are scoped with `engine_model(only=...)` (DuckDB/Postgres/Spark) or `env_match={"FF_ENGINE": "bigquery", "FF_ENGINE_VARIANT": ...}` (BigQuery pandas/BigFrames) to stay isolated per engine. - -3. **Mart join** – `models/common/mart_users_join.ff.sql` - ```sql - {{ config(engines=['duckdb','postgres','databricks_spark','bigquery']) }} - {% set api_users_model = var('api_users_model', 'api_users_http') %} - {% set api_users_refs = { - 'api_users_http': ref('api_users_http'), - 'api_users_requests': ref('api_users_requests') - } %} - {% set api_users_relation = api_users_refs.get(api_users_model, api_users_refs['api_users_http']) %} - with a as ( - select u.id as user_id, u.email from {{ ref('users.ff') }} u - ), - b as ( - select * from {{ api_users_relation }} - ) - select ... - ``` - Ties everything together and exposes the `var('api_users_model')` hook to choose the HTTP implementation while still keeping literal `ref('…')` calls in the template (required for DAG detection). `config(engines=[...])` keeps the SQL node registered only for the engines you list, preventing duplicate names across engine-specific folders. - - > **Warning:** The DAG builder only detects dependencies from literal `ref('model_name')` strings. A pure `ref(api_users_model)` (without the mapping shown above) compiles, but the graph would miss the edge to `api_users_http`/`api_users_requests`. 
- -## Profiles & Secrets - -`profiles.yml` defines per-engine profiles that reference environment variables: - -```yaml -dev_duckdb: - engine: duckdb - duckdb: - path: "{{ env('FF_DUCKDB_PATH', '.local/api_demo.duckdb') }}" - -dev_postgres: - engine: postgres - postgres: - dsn: "{{ env('FF_PG_DSN') }}" - db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" - -dev_bigquery_bigframes: - engine: bigquery - bigquery: - project: "{{ env('FF_BQ_PROJECT') }}" - dataset: "{{ env('FF_BQ_DATASET', 'api_demo') }}" - location: "{{ env('FF_BQ_LOCATION', 'EU') }}" - use_bigframes: true -``` - -`.env.dev_*` files supply the actual values. `_load_dotenv_layered()` loads them in priority order: repo `.env` → project `.env` → `.env.` → shell overrides (highest priority). Secrets stay out of version control. - -### BigQuery specifics - -- Set `ENGINE=bigquery` in the Makefile targets and choose a client via `BQ_FRAME=pandas` or `BQ_FRAME=bigframes` (default). -- Required env vars: `FF_BQ_PROJECT`, `FF_BQ_DATASET` (defaults to `api_demo`), and optionally `FF_BQ_LOCATION`. Uncomment `allow_create_dataset` in `profiles.yml` for first-run convenience. -- BigFrames variants ingest the HTTP payload into a pandas DataFrame, then wrap it as a BigFrames DataFrame (FFT’s `get_df(..., output="bigframes")` is not implemented yet). - - -## Makefile Workflow - -`Makefile` chooses the profile via `ENGINE` (`duckdb`/`postgres`/`databricks_spark`/`bigquery`) and wraps the main commands. For BigQuery, set `BQ_FRAME=pandas|bigframes`: - -```make -ENGINE ?= duckdb - -ifeq ($(ENGINE),duckdb) - PROFILE_ENV = dev_duckdb -endif -... -ifeq ($(ENGINE),bigquery) - ENGINE_TAG = engine:bigquery - ifeq ($(BQ_FRAME),pandas) - PROFILE_ENV = dev_bigquery_pandas - else - PROFILE_ENV = dev_bigquery_bigframes - endif -endif - -seed: - uv run fft seed "$(PROJECT)" --env $(PROFILE_ENV) -run: - env FFT_ACTIVE_ENV=$(PROFILE_ENV) ... uv run fft run ... 
-``` - -Common targets: - -| Target | Description | -|--------------------------|-------------| -| `make ENGINE=duckdb seed`| Materialize seeds into DuckDB. | -| `make ENGINE=postgres run`| Execute the full pipeline against Postgres. | -| `make ENGINE=bigquery run BQ_FRAME=bigframes`| Run against BigQuery (default BigFrames client; set `BQ_FRAME=pandas` to switch). | -| `make dag` | Render documentation (`site/dag/`). | -| `make api-run` | Run only API models (uses HTTP cache). | -| `make api-offline` | Force offline mode (`FF_HTTP_OFFLINE=1`). | -| `make api-show-http` | Display HTTP snapshot metrics via `jq`. | - -HTTP tuning parameters (`FF_HTTP_ALLOWED_DOMAINS`, cache dir, timeouts) live in `.env` and are appended via `HTTP_ENV` when running commands. - -## End-to-End Demo - -1. **Select engine**: `make ENGINE=duckdb` (default). Set `ENGINE=postgres`, `ENGINE=databricks_spark`, or `ENGINE=bigquery BQ_FRAME=` to switch. -2. **Seed data**: `make seed` -3. **Run pipeline**: `make run` -4. **Explore docs**: `make dag` → open `examples/api_demo/site/dag/index.html` -5. **Inspect HTTP usage**: `make api-show-http` - -This example demonstrates multi-engine configuration, environment-driven secrets, and API enrichment within FastFlowTransform. - - - - - -## Local Engine Setup - -### DuckDB - -- Copy `.env.dev_duckdb` and adjust `FF_DUCKDB_PATH` if you want a different location (default: `.local/api_demo.duckdb`). - Optionally set `FF_DUCKDB_SCHEMA` (default schema for models/seeds) and `FF_DUCKDB_CATALOG` (catalog alias) if you need to isolate namespaces. -- Create the target directory once: `mkdir -p examples/api_demo/.local`. -- Run `make ENGINE=duckdb seed run` to build the seeds and models inside the DuckDB file. - -### Postgres - -- Start a local database, e.g. via Docker: - `docker run --name fft-postgres -e POSTGRES_PASSWORD=postgres -p 5432:5432 -d postgres:15`. 
-- Set `FF_PG_DSN` in `.env.dev_postgres` (for example `postgresql+psycopg://postgres:postgres@localhost:5432/fft`) and optionally override `FF_PG_SCHEMA` (defaults to `api_demo`). - The executor ensures the schema exists via `CREATE SCHEMA IF NOT EXISTS` on first connect. -- Execute `make ENGINE=postgres seed run` to materialize seeds and models in Postgres. - -### Databricks Spark (local) - -- Install Java (JDK ≥ 17) and declare `JAVA_HOME`, for example: - `brew install openjdk@17` - `echo 'JAVA_HOME=/opt/homebrew/opt/openjdk@17' >> examples/api_demo/.env.dev_databricks`. -- Optionally tweak `FF_SPARK_MASTER` / `FF_SPARK_APP_NAME` in `.env.dev_databricks` (default: `local[*]`). -- To persist tables across separate `seed`/`run` sessions, enable the bundled Hive metastore defaults: - `FF_DBR_ENABLE_HIVE=1`, `FF_DBR_WAREHOUSE_DIR=examples/api_demo/spark-warehouse`, `FF_DBR_DATABASE=api_demo`. -- Switch the physical format by setting `FF_DBR_TABLE_FORMAT` (e.g. `delta`, requires the Delta Lake runtime); extra writer options can be supplied via `profiles.yml → databricks_spark.table_options`. -- Ensure your shell loads `.env.dev_databricks` (via `make`, `direnv`, or manual export) and run `make ENGINE=databricks_spark seed run`. - - -Yep, let’s bolt on a “how to set it up in GCP” section that fits with what you already have. - -Here’s an extended BigQuery section you can drop into your docs (you can keep or trim the parts you already added): - -### BigQuery - -#### 1. One-time setup in Google Cloud - -You only need to do this once per project / environment. - -1. **Create (or pick) a GCP project** - - - Go to the *Google Cloud Console* → **IAM & Admin → Create project**. - - Give it a name, e.g. `FFT Basic Demo`, and note the **Project ID**, e.g. `fft-basic-demo`. - - All further steps refer to this project id. - -2. **Enable the BigQuery API** - - - In the console, go to **APIs & Services → Library**. - - Search for **“BigQuery API”** and click **Enable**. 
- - (Optional but recommended) Also enable **BigQuery Storage API** for faster reads. - -3. **Create a BigQuery dataset** - - - Go to **BigQuery** in the console (left sidebar). - - Make sure your project `fft-basic-demo` is selected. - - Click **“+ Create dataset”**: - - **Dataset ID**: e.g. `basic_demo` - - **Location type**: choose a **multi-region**, e.g.: - - `EU` or `US` - - Click **Create dataset**. - - ⚠️ **Important:** The dataset **location must match** the location you use in your env (`FF_BQ_LOCATION`). - - If your dataset is in `EU` (multi-region), then `FF_BQ_LOCATION=EU`. - - If the dataset is in a single region like `europe-west3`, use that exact region name. - -4. **Create a service account (for CI / non-interactive use)** - - For local dev you can use your own user credentials (see below), but for CI/CD or shared environments - a service account is better. - - - Go to **IAM & Admin → Service Accounts → Create service account**. - - Name it e.g. `fft-runner`. - - On the **Roles** step, add roles with BigQuery write access, for example: - - `BigQuery Job User` - - `BigQuery Data Editor` - - (Optionally) Restrict to dataset level later if you want stricter permissions. - - Then create a key: - - - Click your service account → **Keys → Add key → Create new key**. - - Select **JSON**, download the file, and store it somewhere safe (e.g. `~/.config/gcloud/fft-sa.json`). - -5. **Authentication options** - - You have two ways to authenticate locally: - - **A) Application Default Credentials via gcloud (easy for dev)** - - ```bash - gcloud auth application-default login - ``` - -This opens a browser, you log in, and Google stores your ADC in -`~/.config/gcloud/application_default_credentials.json`. - -The BigQuery client in `fastflowtransform` will pick this up automatically **as long as** -`FF_BQ_PROJECT` points to a project you have access to. - -**B) Service account key (good for CI)** - -* Put the downloaded JSON key (from step 4) somewhere on disk. 
- -* Set the environment variable before running `fft`: - - ```bash - export GOOGLE_APPLICATION_CREDENTIALS=/path/to/fft-sa.json - ``` - -* Make sure the service account has at least: - - * `BigQuery Job User` - * `BigQuery Data Editor` - -* Optionally grant `BigQuery Data Viewer` if you’re only reading some tables. - ---- - -#### 2. Local configuration (env + profiles) - -1. **Environment file (`.env.dev_bigquery`)** - - ```env - # BigQuery connection - FF_BQ_PROJECT=fft-basic-demo # your GCP project id - FF_BQ_DATASET=basic_demo # dataset from step 3 - FF_BQ_LOCATION=EU # or europe-west3, US, etc. MUST match dataset location - - # Active fft environment name (must match profiles.yml) - FFT_ACTIVE_ENV=dev_bigquery - ``` - - Load this via `direnv`, `make`, or manual `export`. - -2. **profiles.yml** - - ```yaml - dev_bigquery: - engine: bigquery - bigquery: - project: ${FF_BQ_PROJECT} - dataset: ${FF_BQ_DATASET} - location: ${FF_BQ_LOCATION} - use_bigframes: true # Python models use BigQuery DataFrames (BigFrames) - ``` - ---- - -#### 3. Running seeds, models, and tests - -* **Seed BigQuery from `seeds/`:** - - ```bash - make ENGINE=bigquery seed - ``` - - This writes all `seeds/*.csv|parquet` to tables under - `${FF_BQ_PROJECT}.${FF_BQ_DATASET}.*`. - -* **Build models:** - - ```bash - make ENGINE=bigquery run - ``` - - * SQL models are executed as BigQuery queries. - * Python models with `only="bigquery"` run via `BigQueryBFExecutor` (BigQuery DataFrames) - and are written back into the same dataset. - -* **Run data-quality tests:** - - ```bash - make ENGINE=bigquery test - ``` - - `fft test` uses the BigQuery shim to run checks like `not_null`, `unique`, - `row_count_between`, `greater_equal`, etc. against - `${FF_BQ_PROJECT}.${FF_BQ_DATASET}.
`. - ---- - -#### 4. Common BigQuery gotchas - -* **Location mismatch** - - * Error like `Location basic_demo does not support this operation` or `Not found: Dataset ...`: - - * Check the **dataset location** in the BigQuery UI. - * Make sure `FF_BQ_LOCATION` is exactly that value (`EU`, `US`, `europe-west3`, …). - * Ensure the executor is initialized with the same location (via `profiles.yml` → `location`). - -* **Permission issues** - - * If you see `accessDenied` or `Permission denied`: - - * Confirm you authenticated (ADC or service account). - * Ensure your user / service account has at least: - - * `BigQuery Job User` - * `BigQuery Data Editor` on the project or dataset. - -* **Dataset not found** - - * Error `Not found: Dataset fft-basic-demo:basic_demo`: - - * Check that the dataset id matches exactly: - - * Project: `fft-basic-demo` - * Dataset: `basic_demo` - * Verify it exists and is in the same project you set in `FF_BQ_PROJECT`. - - - - - ---8<-- "Contributing.md" - - - - - -# License - ---8<-- "License.md" diff --git a/pyproject.toml b/pyproject.toml index b7ace1a..a8eb2aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,11 +4,11 @@ build-backend = "hatchling.build" [project] name = "fastflowtransform" -version = "0.6.1" +version = "0.6.2" description = "Python framework for SQL & Python data transformation, ETL pipelines, and dbt-style data modeling" readme = "README.md" license = { text = "Apache-2.0" } -authors = [ { name = "Marko Lekic", email = "you@example.com" } ] +authors = [ { name = "Marko Lekic" } ] requires-python = ">=3.12" keywords = [ "etl", @@ -66,7 +66,7 @@ spark = [ ] snowflake = [ - "snowflake>=1.8.0", + "snowflake-connector-python>=4.0.0", "snowflake-snowpark-python>=1.40.0", ] @@ -77,7 +77,7 @@ full = [ "bigframes>=2.24.0", "pyspark>=4.0.1", "delta-spark>=4.0.0", - "snowflake>=1.8.0", + "snowflake-connector-python>=4.0.0", "snowflake-snowpark-python>=1.40.0", ] @@ -154,7 +154,7 @@ combine-as-imports = true 
[tool.ruff.lint.pylint] max-args = 15 -max-returns = 8 +max-returns = 15 max-branches = 20 max-statements = 60 diff --git a/src/fastflowtransform/cli/bootstrap.py b/src/fastflowtransform/cli/bootstrap.py index 5a641b5..e303a12 100644 --- a/src/fastflowtransform/cli/bootstrap.py +++ b/src/fastflowtransform/cli/bootstrap.py @@ -368,6 +368,7 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal "warehouse": prof.snowflake_snowpark.warehouse, "database": prof.snowflake_snowpark.database, "schema": prof.snowflake_snowpark.db_schema, + "allow_create_schema": prof.snowflake_snowpark.allow_create_schema, } if prof.snowflake_snowpark.role: cfg["role"] = prof.snowflake_snowpark.role diff --git a/src/fastflowtransform/errors.py b/src/fastflowtransform/errors.py index bbc848e..e5deccb 100644 --- a/src/fastflowtransform/errors.py +++ b/src/fastflowtransform/errors.py @@ -119,9 +119,33 @@ class ModelExecutionError(Exception): Carries friendly context for CLI formatting. """ - def __init__(self, node_name: str, relation: str, message: str, sql_snippet: str | None = None): + def __init__( + self, + node_name: str, + relation: str, + message: str, + sql_snippet: str | None = None, + ): self.node_name = node_name self.relation = relation self.sql_snippet = sql_snippet self.message = message super().__init__(message) + + def __str__(self) -> str: + """ + Control how this error is rendered in the CLI. + + The run CLI uses traceback.format_exception_only(type(e), e), + which calls str(e), so this is the single place we need to adjust. 
+ """ + base = self.message + + # prepend relation if we have it + if self.relation: + base = f"{self.relation}: {base}" + + if self.sql_snippet: + return f"{base}\n\n[SQL]\n{self.sql_snippet}" + + return base diff --git a/src/fastflowtransform/executors/bigquery/base.py b/src/fastflowtransform/executors/bigquery/base.py index cfb630e..bae74a9 100644 --- a/src/fastflowtransform/executors/bigquery/base.py +++ b/src/fastflowtransform/executors/bigquery/base.py @@ -138,12 +138,8 @@ def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: Write/update dataset._ff_meta after a successful build. Both pandas + BigFrames executors use the logical engine key 'bigquery'. """ - try: - ensure_meta_table(self) - upsert_meta(self, node.name, relation, fingerprint, "bigquery") - except Exception: - # Best-effort: meta must not break the run - pass + ensure_meta_table(self) + upsert_meta(self, node.name, relation, fingerprint, "bigquery") # ── Incremental API (shared across BigQuery executors) ─────────────── def exists_relation(self, relation: str) -> bool: diff --git a/src/fastflowtransform/executors/databricks_spark.py b/src/fastflowtransform/executors/databricks_spark.py index 1c9aaae..73a188d 100644 --- a/src/fastflowtransform/executors/databricks_spark.py +++ b/src/fastflowtransform/executors/databricks_spark.py @@ -581,11 +581,8 @@ def _create_or_replace_view_from_table( # ---- Meta hook ---- def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: """After successful materialization, upsert _ff_meta (best-effort).""" - try: - ensure_meta_table(self) - upsert_meta(self, node.name, relation, fingerprint, "databricks_spark") - except Exception: - pass + ensure_meta_table(self) + upsert_meta(self, node.name, relation, fingerprint, "databricks_spark") # ── Incremental API (parity) ───────────────────────────────────────── def exists_relation(self, relation: str) -> bool: diff --git a/src/fastflowtransform/executors/duckdb.py 
b/src/fastflowtransform/executors/duckdb.py index f1c13ec..536ccff 100644 --- a/src/fastflowtransform/executors/duckdb.py +++ b/src/fastflowtransform/executors/duckdb.py @@ -200,12 +200,8 @@ def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: """ After successful materialization, ensure the meta table exists and upsert the row. """ - # Best-effort: do not let meta errors break the run - try: - ensure_meta_table(self) - upsert_meta(self, node.name, relation, fingerprint, "duckdb") - except Exception: - pass + ensure_meta_table(self) + upsert_meta(self, node.name, relation, fingerprint, "duckdb") # ── Incremental API ──────────────────────────────────────────────────── def exists_relation(self, relation: str) -> bool: diff --git a/src/fastflowtransform/executors/postgres.py b/src/fastflowtransform/executors/postgres.py index 9eac580..1ff0243 100644 --- a/src/fastflowtransform/executors/postgres.py +++ b/src/fastflowtransform/executors/postgres.py @@ -165,11 +165,8 @@ def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: """ Write/update _ff_meta in the current schema after a successful build. 
""" - try: - ensure_meta_table(self) - upsert_meta(self, node.name, relation, fingerprint, "postgres") - except Exception: - pass + ensure_meta_table(self) + upsert_meta(self, node.name, relation, fingerprint, "postgres") # ── Incremental API ──────────────────────────────────────────────────── def exists_relation(self, relation: str) -> bool: diff --git a/src/fastflowtransform/executors/snowflake_snowpark.py b/src/fastflowtransform/executors/snowflake_snowpark.py index 8e7c3a9..f209b8a 100644 --- a/src/fastflowtransform/executors/snowflake_snowpark.py +++ b/src/fastflowtransform/executors/snowflake_snowpark.py @@ -2,6 +2,7 @@ from __future__ import annotations from collections.abc import Iterable +from contextlib import suppress from typing import Any from fastflowtransform.core import Node, relation_for @@ -19,6 +20,10 @@ def __init__(self, cfg: dict): self.session = Session.builder.configs(cfg).create() self.database = cfg["database"] self.schema = cfg["schema"] + + self.allow_create_schema: bool = bool(cfg["allow_create_schema"]) + self._ensure_schema() + # Provide a tiny testing shim so tests can call executor.con.execute("SQL") self.con = _SFCursorShim(self.session) @@ -27,16 +32,49 @@ def _q(self, s: str) -> str: return '"' + s.replace('"', '""') + '"' def _qualified(self, rel: str) -> str: - # "DB"."SCHEMA"."TABLE" - return f"{self._q(self.database)}.{self._q(self.schema)}.{self._q(rel)}" + # DATABASE.SCHEMA.TABLE (no quotes) + return f"{self.database}.{self.schema}.{rel}" + + def _ensure_schema(self) -> None: + """ + Best-effort schema creation when allow_create_schema=True. + + Mirrors BigQuery's `_ensure_dataset` behaviour: + - If the flag is false → do nothing. + - If true → `CREATE SCHEMA IF NOT EXISTS "DB"."SCHEMA"`. + """ + if not getattr(self, "allow_create_schema", False): + return + if not self.database or not self.schema: + # Misconfigured; let downstream errors surface naturally. 
+ return + + db = self._q(self.database) + sch = self._q(self.schema) + with suppress(Exception): + # Fully qualified CREATE SCHEMA is allowed in Snowflake. + self.session.sql(f"CREATE SCHEMA IF NOT EXISTS {db}.{sch}").collect() + # Best-effort; permission issues or race conditions shouldn't crash the executor. + # If the schema truly doesn't exist and we can't create it, later queries will fail + # with a clearer engine error. # ---------- Frame-Hooks ---------- def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SNDF: - return self.session.table(self._qualified(relation)) + df = self.session.table(self._qualified(relation)) + # Present a *logical* lowercase schema to Python models: + lowered = [c.lower() for c in df.schema.names] + return df.toDF(*lowered) def _materialize_relation(self, relation: str, df: SNDF, node: Node) -> None: if not self._is_frame(df): raise TypeError("Snowpark model must return a Snowpark DataFrame") + + # Normalize to uppercase for storage in Snowflake + cols = list(df.schema.names) + upper_cols = [c.upper() for c in cols] + if cols != upper_cols: + df = df.toDF(*upper_cols) + df.write.save_as_table(self._qualified(relation), mode="overwrite") def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None: @@ -51,19 +89,21 @@ def _validate_required( return def cols(df: SNDF) -> set[str]: - # Snowpark: schema names - return set(df.schema.names) + # Compare in lowercase to be case-insensitive for Snowflake + return {c.lower() for c in df.schema.names} + + # Normalize the required sets too + normalized_requires = {rel: {c.lower() for c in needed} for rel, needed in requires.items()} errors: list[str] = [] - # Single dependency + if isinstance(inputs, SNDF): - need = next(iter(requires.values()), set()) + need = next(iter(normalized_requires.values()), set()) missing = need - cols(inputs) if missing: errors.append(f"- missing columns: {sorted(missing)} | have={sorted(cols(inputs))}") else: - 
# Multiple dependencies - for rel, need in requires.items(): + for rel, need in normalized_requires.items(): if rel not in inputs: errors.append(f"- missing dependency key '{rel}'") continue @@ -98,6 +138,13 @@ def _frame_name(self) -> str: def _format_relation_for_ref(self, name: str) -> str: return self._qualified(relation_for(name)) + def _this_identifier(self, node: Node) -> str: + """ + Identifier for {{ this }} in SQL models. + Use fully-qualified DB.SCHEMA.TABLE so all build/read/test paths agree. + """ + return self._qualified(relation_for(node.name)) + def _format_source_reference( self, cfg: dict[str, Any], source_name: str, table_name: str ) -> str: @@ -114,7 +161,7 @@ def _format_source_reference( raise KeyError( f"Source {source_name}.{table_name} missing database/schema for Snowflake" ) - return f"{self._q(db)}.{self._q(sch)}.{self._q(ident)}" + return f"{db}.{sch}.{ident}" def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None: self.session.sql(f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}").collect() @@ -129,25 +176,37 @@ def _create_or_replace_view_from_table( back_id = self._qualified(backing_table) self.session.sql(f"CREATE OR REPLACE VIEW {view_id} AS SELECT * FROM {back_id}").collect() + def _format_test_table(self, table: str | None) -> str | None: + formatted = super()._format_test_table(table) + if formatted is None: + return None + + # If it's already qualified (DB.SCHEMA.TABLE) or quoted, leave it alone. + if "." in formatted or '"' in formatted: + return formatted + + # Otherwise, treat it as a logical relation name and fully-qualify it + # with the executor's configured database/schema. 
+ return self._qualified(formatted) + # ---- Meta hook ---- def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None: """After successful materialization, upsert _ff_meta (best-effort).""" - try: - ensure_meta_table(self) - upsert_meta(self, node.name, relation, fingerprint, "snowflake_snowpark") - except Exception: - pass + ensure_meta_table(self) + upsert_meta(self, node.name, relation, fingerprint, "snowflake_snowpark") # ── Incremental API (parity with DuckDB/PG) ────────────────────────── def exists_relation(self, relation: str) -> bool: """Check existence via information_schema.tables.""" db = self._q(self.database) + schema_lit = f"'{self.schema.upper()}'" + rel_lit = f"'{relation.upper()}'" q = f""" - select 1 - from {db}.information_schema.tables - where table_schema = {self._q(self.schema)} - and lower(table_name) = lower({self._q(relation)}) - limit 1 + select 1 + from {db}.information_schema.tables + where upper(table_schema) = {schema_lit} + and upper(table_name) = {rel_lit} + limit 1 """ try: return bool(self.session.sql(q).collect()) @@ -155,63 +214,80 @@ def exists_relation(self, relation: str) -> bool: return False def create_table_as(self, relation: str, select_sql: str) -> None: - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") + self.session.sql(f"CREATE OR REPLACE TABLE {self._qualified(relation)} AS {body}").collect() + + def full_refresh_table(self, relation: str, select_sql: str) -> None: + """ + Engine-specific full refresh for incremental fallbacks. 
+ """ + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") self.session.sql(f"CREATE OR REPLACE TABLE {self._qualified(relation)} AS {body}").collect() def incremental_insert(self, relation: str, select_sql: str) -> None: - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") self.session.sql(f"INSERT INTO {self._qualified(relation)} {body}").collect() def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None: - """ - Portable fallback without explicit column list: - - WITH src AS () - - DELETE ... USING src ... - - INSERT ... SELECT * FROM src - This avoids Snowflake MERGE column listing complexity. - """ - body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") + body = self._selectable_body(select_sql).strip().rstrip(";\n\t ") pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE" qrel = self._qualified(relation) - sql = f""" - WITH src AS ({body}) - DELETE FROM {qrel} AS t USING src AS s WHERE {pred}; - INSERT INTO {qrel} SELECT * FROM src; + + # 1) Delete matching keys + delete_sql = f""" + DELETE FROM {qrel} AS t + USING ({body}) AS s + WHERE {pred} """ - self.session.sql(sql).collect() + self.session.sql(delete_sql).collect() + + # 2) Insert all rows from the delta + insert_sql = f"INSERT INTO {qrel} SELECT * FROM ({body})" + self.session.sql(insert_sql).collect() def alter_table_sync_schema( self, relation: str, select_sql: str, *, mode: str = "append_new_columns" ) -> None: """ Best-effort additive schema sync: - - infer SELECT schema via LIMIT 0 - - add missing columns as STRING + - infer SELECT schema via LIMIT 0 + - add missing columns as STRING """ if mode not in {"append_new_columns", "sync_all_columns"}: return + qrel = self._qualified(relation) + + # Use identifiers in FROM, but *string literals* in WHERE + db_ident = self._q(self.database) + schema_lit = self.schema.replace("'", "''") + rel_lit 
= relation.replace("'", "''") + try: existing = { r[0] for r in self.session.sql( f""" - select column_name - from {self._q(self.database)}.information_schema.columns - where table_schema={self._q(self.schema)} - and lower(table_name)=lower({self._q(relation)}) - """ + select column_name + from {db_ident}.information_schema.columns + where upper(table_schema) = upper('{schema_lit}') + and upper(table_name) = upper('{rel_lit}') + """ ).collect() } except Exception: existing = set() + # Probe SELECT columns body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") probe = self.session.sql(f"SELECT * FROM ({body}) q WHERE 1=0") probe_cols = list(probe.schema.names) + to_add = [c for c in probe_cols if c not in existing] if not to_add: return + + # Column names are identifiers → _q is correct here cols_sql = ", ".join(f"{self._q(c)} STRING" for c in to_add) self.session.sql(f"ALTER TABLE {qrel} ADD COLUMN {cols_sql}").collect() @@ -228,7 +304,13 @@ def execute(self, sql: str, params: Any | None = None) -> _SFResult: # Parametrized SQL not needed in our internal calls raise NotImplementedError("Snowflake shim does not support parametrized SQL") rows = self._session.sql(sql).collect() - as_tuples = [tuple(getattr(r, k) for k in r.asDict()) for r in rows] if rows else [] + + if rows: + cols = list(rows[0].asDict().keys()) + as_tuples = [tuple(row.asDict()[c] for c in cols) for row in rows] + else: + as_tuples = [] + return _SFResult(as_tuples) diff --git a/src/fastflowtransform/meta.py b/src/fastflowtransform/meta.py index c50af9d..ee54bdc 100644 --- a/src/fastflowtransform/meta.py +++ b/src/fastflowtransform/meta.py @@ -19,6 +19,8 @@ - DuckDB (executor.con) - Postgres (executor.engine, optional .schema) - BigQuery (executor.client, .dataset, optional .project) + - Snowflake Snowpark (executor.session, .database, .schema) + - Databricks Spark (executor.spark, optional .database/.schema) """ from __future__ import annotations @@ -31,7 +33,18 @@ # 
--------------------------- Engine detection --------------------------- +def _is_snowflake_snowpark(ex: Any) -> bool: + return hasattr(ex, "session") and hasattr(ex.session, "sql") + + +def _is_spark(ex: Any) -> bool: + return hasattr(ex, "spark") and hasattr(ex.spark, "sql") + + def _is_duckdb(ex: Any) -> bool: + engine_name = getattr(ex, "ENGINE_NAME", None) + if isinstance(engine_name, str): + return engine_name.lower() == "duckdb" return hasattr(ex, "con") and hasattr(ex.con, "execute") @@ -68,6 +81,44 @@ def _bq_qual_meta(ex: Any) -> str: return f"`{dataset}._ff_meta`" +def _sf_ident(name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + +def _sf_qual_meta(ex: Any) -> str: + db = getattr(ex, "database", None) + schema = getattr(ex, "schema", None) + tbl = _sf_ident("_ff_meta") + if db and schema: + return f"{_sf_ident(db)}.{_sf_ident(schema)}.{tbl}" + if schema: + return f"{_sf_ident(schema)}.{tbl}" + return tbl + + +def _spark_ident(name: str) -> str: + return "`" + name.replace("`", "``") + "`" + + +def _spark_db(ex: Any) -> str | None: + db = getattr(ex, "database", None) or getattr(ex, "schema", None) + if isinstance(db, str) and db.strip(): + return db.strip() + return None + + +def _spark_qual_meta(ex: Any) -> str: + db = _spark_db(ex) + ident = _spark_ident("_ff_meta") + if db: + return f"{_spark_ident(db)}.{ident}" + return ident + + +def _sql_literal(value: str) -> str: + return "'" + value.replace("'", "''") + "'" + + # --------------------------- Public API --------------------------- @@ -75,6 +126,36 @@ def ensure_meta_table(executor: Any) -> None: """ Create the _ff_meta table if it does not exist for the active engine. 
""" + if _is_snowflake_snowpark(executor): + qual = _sf_qual_meta(executor) + ddl = ( + f"create table if not exists {qual} (" + " node_name string," + " relation string," + " fp string," + " engine string," + " built_at timestamp_ltz default current_timestamp()" + ")" + ) + executor.session.sql(ddl).collect() + return + + if _is_spark(executor): + qual = _spark_qual_meta(executor) + fmt = getattr(executor, "spark_table_format", None) + fmt_clause = f" USING {fmt}" if isinstance(fmt, str) and fmt.strip() else "" + ddl = ( + f"CREATE TABLE IF NOT EXISTS {qual} (" + " node_name STRING," + " relation STRING," + " fp STRING," + " engine STRING," + " built_at TIMESTAMP" + f"){fmt_clause}" + ) + executor.spark.sql(ddl).collect() + return + if _is_duckdb(executor): sql = ( 'create table if not exists "_ff_meta" (' @@ -127,6 +208,45 @@ def upsert_meta(executor: Any, node_name: str, relation: str, fp: str, engine: s """ ensure_meta_table(executor) + if _is_snowflake_snowpark(executor): + qual = _sf_qual_meta(executor) + node_lit = _sql_literal(node_name) + rel_lit = _sql_literal(relation) + fp_lit = _sql_literal(fp) + eng_lit = _sql_literal(engine) + executor.session.sql(f"delete from {qual} where node_name = {node_lit}").collect() + executor.session.sql( + f"insert into {qual}(node_name, relation, fp, engine, built_at) " + f"values ({node_lit}, {rel_lit}, {fp_lit}, {eng_lit}, current_timestamp())" + ).collect() + return + + if _is_spark(executor): + qual = _spark_qual_meta(executor) + + def _lit(val: str) -> str: + return _sql_literal(val) + + merge_sql = f""" + MERGE INTO {qual} AS target + USING ( + SELECT {_lit(node_name)} AS node_name, + {_lit(relation)} AS relation, + {_lit(fp)} AS fp, + {_lit(engine)} AS engine + ) AS source + ON target.node_name = source.node_name + WHEN MATCHED THEN UPDATE SET + relation = source.relation, + fp = source.fp, + engine = source.engine, + built_at = current_timestamp() + WHEN NOT MATCHED THEN INSERT (node_name, relation, fp, engine, 
built_at) + VALUES (source.node_name, source.relation, source.fp, source.engine, current_timestamp()) + """ + executor.spark.sql(merge_sql).collect() + return + if _is_duckdb(executor): # DuckDB: emulate upsert via delete + insert inside the same connection. executor.con.execute('delete from "_ff_meta" where node_name = ?', [node_name]) @@ -222,6 +342,37 @@ def get_meta(executor: Any, node_name: str) -> tuple[str, str, Any, str] | None: except Exception: return (r[0], r[1], r[2], r[3]) + if _is_snowflake_snowpark(executor): + qual = _sf_qual_meta(executor) + node = _sql_literal(node_name) + sql = f"select fp, relation, built_at, engine from {qual} where node_name = {node} limit 1" + rows = executor.session.sql(sql).collect() + if not rows: + return None + row = rows[0] + data = getattr(row, "asDict", lambda: None)() + if data: + return (data.get("FP"), data.get("RELATION"), data.get("BUILT_AT"), data.get("ENGINE")) + try: + return (row[0], row[1], row[2], row[3]) + except Exception: + return None + + if _is_spark(executor): + qual = _spark_qual_meta(executor) + sql = ( + f"SELECT fp, relation, built_at, engine FROM {qual} " + f"WHERE node_name = {_sql_literal(node_name)} LIMIT 1" + ) + rows = executor.spark.sql(sql).collect() + if not rows: + return None + row = rows[0] + try: + return (row["fp"], row["relation"], row["built_at"], row["engine"]) + except Exception: + return (row[0], row[1], row[2], row[3]) + return None @@ -270,6 +421,37 @@ def relation_exists(executor: Any, relation: str) -> bool: except Exception: return True + if _is_snowflake_snowpark(executor): + try: + db = getattr(executor, "database", None) + schema = getattr(executor, "schema", None) + if not db or not schema: + return False + q = f""" + select 1 + from {_sf_ident(db)}.information_schema.tables + where upper(table_schema) = {_sql_literal(schema.upper())} + and upper(table_name) = {_sql_literal(relation.upper())} + limit 1 + """ + rows = executor.session.sql(q).collect() + return bool(rows) 
+ except Exception: + return False + + if _is_spark(executor): + try: + spark = executor.spark + if "." in relation: + db_name, tbl = relation.rsplit(".", 1) + return spark.catalog.tableExists(db_name, tbl) + db = _spark_db(executor) + if db: + return spark.catalog.tableExists(db, relation) + return spark.catalog.tableExists(relation) + except Exception: + return False + return True @@ -308,3 +490,19 @@ def delete_meta_for_node(executor: Any, node_name: str) -> None: f'DELETE FROM `{dataset}._ff_meta` WHERE node_name = "{node_name}"' ) return + + if _is_snowflake_snowpark(executor): + with suppress(Exception): + qual = _sf_qual_meta(executor) + executor.session.sql( + f"delete from {qual} where node_name = {_sql_literal(node_name)}" + ).collect() + return + + if _is_spark(executor): + with suppress(Exception): + qual = _spark_qual_meta(executor) + executor.spark.sql( + f"DELETE FROM {qual} WHERE node_name = {_sql_literal(node_name)}" + ).collect() + return diff --git a/src/fastflowtransform/seeding.py b/src/fastflowtransform/seeding.py index 071c7be..95a1b0e 100644 --- a/src/fastflowtransform/seeding.py +++ b/src/fastflowtransform/seeding.py @@ -16,6 +16,7 @@ from fastflowtransform import storage from fastflowtransform.config.seeds import SeedsSchemaConfig, load_seeds_schema +from fastflowtransform.executors.snowflake_snowpark import SnowflakeSnowparkExecutor from fastflowtransform.logging import echo from fastflowtransform.settings import EngineType from fastflowtransform.typing import SDF, SparkAnalysisException, SparkSession @@ -745,13 +746,75 @@ def _handle_spark( return True +def _handle_snowflake_snowpark( + table: str, + df: pd.DataFrame, + executor: Any, + schema: str | None, +) -> bool: + """ + Seed loader for SnowflakeSnowparkExecutor. + + Uses Session.write_pandas to create/overwrite the table in the configured + database + schema. 
+ """ + if not isinstance(executor, SnowflakeSnowparkExecutor): + return False + + session = executor.session + target_db = getattr(executor, "database", None) + target_schema = schema or getattr(executor, "schema", None) + + if not target_db or not target_schema: + # Not enough info to build a fully-qualified target + return False + + # Optionally auto-create schema when allowed + created_schema = False + if getattr(executor, "allow_create_schema", False): + session.sql(f'CREATE SCHEMA IF NOT EXISTS "{target_db}"."{target_schema}"').collect() + created_schema = True + + full_name = _qualify(table, target_schema, target_db) + + t0 = perf_counter() + # Use Snowpark's write_pandas: CREATE+OVERWRITE semantics + session.write_pandas( + df, + table_name=table, + database=target_db, + schema=target_schema, + auto_create_table=True, + quote_identifiers=False, + overwrite=True, + ) + dt_ms = int((perf_counter() - t0) * 1000) + + _echo_seed_line( + full_name=full_name, + rows=len(df), + cols=df.shape[1], + engine="snowflake", + ms=dt_ms, + created_schema=created_schema, + action="replaced", + ) + return True + + # ------------------------------------------------------------ # Dispatcher # ------------------------------------------------------------ Handler = Callable[[str, pd.DataFrame, Any, str | None], bool] -_HANDLERS: Iterable[Handler] = (_handle_duckdb, _handle_sqlalchemy, _handle_spark, _handle_bigquery) +_HANDLERS: Iterable[Handler] = ( + _handle_duckdb, + _handle_sqlalchemy, + _handle_spark, + _handle_bigquery, + _handle_snowflake_snowpark, +) def materialize_seed( diff --git a/src/fastflowtransform/settings.py b/src/fastflowtransform/settings.py index 0436154..d7f7029 100644 --- a/src/fastflowtransform/settings.py +++ b/src/fastflowtransform/settings.py @@ -58,8 +58,9 @@ class SnowflakeSnowparkConfig(BaseConfig): password: str warehouse: str database: str - db_schema: str | None = None + db_schema: str = Field(alias="schema") role: str | None = None + 
allow_create_schema: bool = False class DuckDBProfile(BaseConfig): @@ -138,6 +139,7 @@ class EnvSettings(BaseSettings): SF_WAREHOUSE: str | None = None SF_DATABASE: str | None = None SF_SCHEMA: str | None = None + SF_ALLOW_CREATE_SCHEMA: int | None = None # --- HTTP / API (optional) --- HTTP_CACHE_DIR: str | None = None # maps to FF_HTTP_CACHE_DIR @@ -286,6 +288,13 @@ def _ov_snowflake_snowpark(raw: dict[str, Any], env: EnvSettings) -> None: _set_if(sf, "schema", getattr(env, "SF_SCHEMA", None)) _set_if(sf, "role", getattr(env, "SF_ROLE", None)) + acs = getattr(env, "SF_ALLOW_CREATE_SCHEMA", None) + if acs is not None: + if isinstance(acs, str): + sf["allow_create_schema"] = acs.strip().lower() in {"1", "true", "yes", "on"} + else: + sf["allow_create_schema"] = bool(acs) + # ---------- Sanity Checks ---------- CheckFn = Callable[[Profile], None] diff --git a/src/fastflowtransform/testing/base.py b/src/fastflowtransform/testing/base.py index 95cb34b..7cba8a3 100644 --- a/src/fastflowtransform/testing/base.py +++ b/src/fastflowtransform/testing/base.py @@ -290,6 +290,13 @@ def freshness(con: Any, table: str, ts_col: str, max_delay_minutes: int) -> None or "bigquery" in name_l or str(getattr(con, "marker", "")).upper() == "BQ_SHIM" ) + is_snowflake = ( + "snowflake" in mod_l + or "snowpark" in mod_l + or "snowflake" in name_l + or "snowpark" in name_l + or hasattr(con, "_session") + ) # Primary SQL (Postgres / DuckDB style) sql_primary = ( @@ -308,6 +315,11 @@ def freshness(con: Any, table: str, ts_col: str, max_delay_minutes: int) -> None f"select cast(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), max({ts_col}), MINUTE) as float64) " f"as delay_min from {table}" ) + # Snowflake: DATEDIFF on minutes; cast to float to align with other engines + sql_snowflake = ( + f"select DATEDIFF('minute', max({ts_col}), CURRENT_TIMESTAMP())::float as delay_min " + f"from {table}" + ) delay = None sql_used: str @@ -327,6 +339,12 @@ def freshness(con: Any, table: str, ts_col: str, 
max_delay_minutes: int) -> None except Exception as e: # BigQuery error messages don't mention EXTRACT/EPOCH; surface directly. raise _wrap_db_error("freshness", table, ts_col, sql_bigquery, e) from e + elif is_snowflake: + sql_used = sql_snowflake + try: + delay = _scalar(con, sql_snowflake) + except Exception as e: + raise _wrap_db_error("freshness", table, ts_col, sql_snowflake, e) from e else: # Non-Spark engines: try the Postgres/DuckDB expression first. sql_used = sql_primary diff --git a/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py b/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py index f2e5a39..6fb3991 100644 --- a/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py +++ b/tests/integration/executors/duckdb/test_executor_meta_hook_smoke_integration.py @@ -11,7 +11,7 @@ @pytest.mark.integration @pytest.mark.duckdb def test_duckdb_on_node_built_no_crash(tmp_path: Path): - # Smoke-test: calling the hook must not raise errors (best-effort semantics) + # Smoke-test: calling the hook must succeed and write metadata ex = DuckExecutor(db_path=":memory:") node = Node(name="x.ff", kind="sql", path=tmp_path / "x.ff.sql", deps=[], meta={}) ex.on_node_built(node, relation="x", fingerprint="abc123") # should not raise diff --git a/tests/integration/settings/test_profiles_validation.py b/tests/integration/settings/test_profiles_validation.py index a52d6b0..5370945 100644 --- a/tests/integration/settings/test_profiles_validation.py +++ b/tests/integration/settings/test_profiles_validation.py @@ -111,7 +111,8 @@ def _write_profiles(tmp_path: Path, yaml_text: str) -> None: password: "" warehouse: "" database: "" - db_schema: "" + schema: "" + allow_create_schema: true """, {}, True, diff --git a/tests/unit/executors/test_bigquery_bf_exec_unit.py b/tests/unit/executors/test_bigquery_bf_exec_unit.py index 74b6189..65c189f 100644 --- a/tests/unit/executors/test_bigquery_bf_exec_unit.py 
+++ b/tests/unit/executors/test_bigquery_bf_exec_unit.py @@ -390,7 +390,7 @@ def fake_query(sql: str, location: str | None = None, job_config: Any | None = N @pytest.mark.unit @pytest.mark.bigquery -def test_on_node_built_best_effort(monkeypatch, bq_exec): +def test_on_node_built_calls_meta(monkeypatch, bq_exec): called = {"ensure": 0, "upsert": 0} def fake_ensure(ex): @@ -408,6 +408,18 @@ def fake_upsert(ex, name, rel, fp, eng): assert called["upsert"] == 1 +@pytest.mark.unit +@pytest.mark.bigquery +def test_on_node_built_raises_on_meta_failure(monkeypatch, bq_exec): + def bad_upsert(ex, name, rel, fp, eng): + raise RuntimeError("nope") + + monkeypatch.setattr(bq_base_mod, "upsert_meta", bad_upsert) + + with pytest.raises(RuntimeError): + bq_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), "p1.ds1.m", "fp123") + + @pytest.mark.unit @pytest.mark.bigquery def test_bf_apply_sql_materialization_calls_super(monkeypatch, bq_exec): diff --git a/tests/unit/executors/test_bigquery_exec_unit.py b/tests/unit/executors/test_bigquery_exec_unit.py index a4dc33d..4848311 100644 --- a/tests/unit/executors/test_bigquery_exec_unit.py +++ b/tests/unit/executors/test_bigquery_exec_unit.py @@ -298,7 +298,7 @@ def test_create_or_replace_view_from_table_calls_client_query(bq_exec): @pytest.mark.unit @pytest.mark.bigquery -def test_on_node_built_best_effort(monkeypatch, bq_exec): +def test_on_node_built_calls_meta(monkeypatch, bq_exec): called = {"ensure": 0, "upsert": 0} def fake_ensure(ex): @@ -316,6 +316,18 @@ def fake_upsert(ex, name, rel, fp, eng): assert called["upsert"] == 1 +@pytest.mark.unit +@pytest.mark.bigquery +def test_on_node_built_raises_on_meta_failure(monkeypatch, bq_exec): + def bad_ensure(ex): + raise RuntimeError("boom") + + monkeypatch.setattr(bq_base_mod, "ensure_meta_table", bad_ensure) + + with pytest.raises(RuntimeError): + bq_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), "p1.ds1.m", "fp123") + + @pytest.mark.unit 
@pytest.mark.bigquery def test_exists_relation_true(bq_exec, monkeypatch): diff --git a/tests/unit/executors/test_databricks_spark_exec_unit.py b/tests/unit/executors/test_databricks_spark_exec_unit.py index 54dea8b..ed3711d 100644 --- a/tests/unit/executors/test_databricks_spark_exec_unit.py +++ b/tests/unit/executors/test_databricks_spark_exec_unit.py @@ -295,7 +295,6 @@ def test__create_view_over_table_executes_expected_sql(exec_minimal): @pytest.mark.unit @pytest.mark.databricks_spark def test_on_node_built_calls_meta_helpers(exec_minimal, monkeypatch): - """on_node_built should best-effort call ensure_meta_table and upsert_meta.""" ensure_called = {} upsert_called = {} @@ -315,6 +314,20 @@ def fake_upsert(executor, node_name, relation, fingerprint, engine): assert upsert_called["args"] == ("demo_node", "demo_tbl", "abc123", "databricks_spark") +@pytest.mark.unit +@pytest.mark.databricks_spark +def test_on_node_built_raises_on_meta_failure(exec_minimal, monkeypatch): + def bad_upsert(executor, node_name, relation, fingerprint, engine): + raise RuntimeError("meta fail") + + monkeypatch.setattr(mod, "upsert_meta", bad_upsert) + + node = Node(name="demo_node", kind="sql", path=Path("x")) + + with pytest.raises(RuntimeError): + exec_minimal.on_node_built(node, "demo_tbl", "abc123") + + @pytest.mark.unit @pytest.mark.databricks_spark def test_spark_conn_shim_execute_runs_select(monkeypatch): diff --git a/tests/unit/executors/test_duckdb_exec_unit.py b/tests/unit/executors/test_duckdb_exec_unit.py index cff4e89..be7aa6e 100644 --- a/tests/unit/executors/test_duckdb_exec_unit.py +++ b/tests/unit/executors/test_duckdb_exec_unit.py @@ -169,13 +169,15 @@ def test_format_source_reference_injects_catalog_when_matches_schema(): # --------------------------------------------------------------------------- -# on_node_built - best effort +# on_node_built # --------------------------------------------------------------------------- @pytest.mark.unit @pytest.mark.duckdb -def 
test_on_node_built_best_effort(duck_exec: DuckExecutor, monkeypatch: pytest.MonkeyPatch): +def test_on_node_built_invokes_meta_helpers( + duck_exec: DuckExecutor, monkeypatch: pytest.MonkeyPatch +): called = {"ensure": 0, "upsert": 0} def fake_ensure(ex: Any) -> None: @@ -184,8 +186,7 @@ def fake_ensure(ex: Any) -> None: def fake_upsert(ex: Any, name: str, rel: str, fp: str, eng: str) -> None: called["upsert"] += 1 - # patch the functions used in on_node_built - import fastflowtransform.executors.duckdb as duck_mod # noqa PLC0415 + import fastflowtransform.executors.duckdb as duck_mod # noqa: PLC0415 monkeypatch.setattr(duck_mod, "ensure_meta_table", fake_ensure, raising=True) monkeypatch.setattr(duck_mod, "upsert_meta", fake_upsert, raising=True) @@ -196,6 +197,22 @@ def fake_upsert(ex: Any, name: str, rel: str, fp: str, eng: str) -> None: assert called["upsert"] == 1 +@pytest.mark.unit +@pytest.mark.duckdb +def test_on_node_built_propagates_meta_errors( + duck_exec: DuckExecutor, monkeypatch: pytest.MonkeyPatch +): + import fastflowtransform.executors.duckdb as duck_mod # noqa: PLC0415 + + def bad_ensure(ex: Any) -> None: + raise RuntimeError("boom") + + monkeypatch.setattr(duck_mod, "ensure_meta_table", bad_ensure, raising=True) + + with pytest.raises(RuntimeError): + duck_exec.on_node_built(_node("m1"), "out_tbl", "fp123") + + # --------------------------------------------------------------------------- # exists_relation # --------------------------------------------------------------------------- diff --git a/tests/unit/executors/test_postgres_exec_unit.py b/tests/unit/executors/test_postgres_exec_unit.py index 37e09a8..909f6d9 100644 --- a/tests/unit/executors/test_postgres_exec_unit.py +++ b/tests/unit/executors/test_postgres_exec_unit.py @@ -404,15 +404,16 @@ def fake_upsert(executor, name, relation, fp, engine): @pytest.mark.unit @pytest.mark.postgres -def test_on_node_built_swallows_exceptions(monkeypatch, fake_engine_and_conn, node_tmp): +def 
test_on_node_built_raises_when_meta_fails(monkeypatch, fake_engine_and_conn, node_tmp): ex = PostgresExecutor("postgresql+psycopg://x", schema="public") def bad_ensure(executor): raise RuntimeError("meta fail") monkeypatch.setattr(pgmod, "ensure_meta_table", bad_ensure) - # should not raise - ex.on_node_built(node_tmp, "t", "fp") + + with pytest.raises(RuntimeError): + ex.on_node_built(node_tmp, "t", "fp") # --------------------------------------------------------------------------- @@ -558,8 +559,14 @@ def boom(stmt, params=None): ex._create_or_replace_view_from_table("v_broken", "src_tbl", node) err = exc.value - # message is just the original error text - assert str(err) == "db down" + # Error is wrapped with relation + original message for better context + assert isinstance(err, ModelExecutionError) + # Relation should be the qualified one the executor used + assert err.relation == '"public"."v_broken"' + assert err.node_name == "m_bad" + # String form includes both relation and original error text + assert str(err).endswith("db down") + assert str(err).startswith('"public"."v_broken":') # but the extra context must be present assert err.node_name == "m_bad" assert err.relation == '"public"."v_broken"' diff --git a/tests/unit/executors/test_snowflake_snowpark_exec.py b/tests/unit/executors/test_snowflake_snowpark_exec.py index 6cf2434..34584d2 100644 --- a/tests/unit/executors/test_snowflake_snowpark_exec.py +++ b/tests/unit/executors/test_snowflake_snowpark_exec.py @@ -33,6 +33,7 @@ class FakeSnowparkDataFrame: It needs: - .schema.names - .collect() + - .toDF() - .write.save_as_table(...) 
""" @@ -63,6 +64,9 @@ def save_as_table(self, name: str, mode: str = "overwrite") -> None: def write(self) -> FakeSnowparkDataFrame._Writer: # type: ignore[name-defined] return FakeSnowparkDataFrame._Writer(self) + def toDF(self, *cols: str) -> FakeSnowparkDataFrame: + return FakeSnowparkDataFrame(self._session, self._sql, list(cols)) + class FakeSession: """ @@ -158,6 +162,7 @@ def sf_exec(monkeypatch): "warehouse": "wh", "database": "DB1", "schema": "SC1", + "allow_create_schema": True, } ex = SnowflakeSnowparkExecutor(cfg) # sanity: we actually got our fake session @@ -183,7 +188,7 @@ def test_init_sets_db_schema_and_con(sf_exec): @pytest.mark.snowflake def test_q_and_qualified(sf_exec): assert sf_exec._q("x") == '"x"' - assert sf_exec._qualified("TBL") == '"DB1"."SC1"."TBL"' + assert sf_exec._qualified("TBL") == "DB1.SC1.TBL" @pytest.mark.unit @@ -193,7 +198,7 @@ def test_read_relation_calls_session_table(sf_exec): df = sf_exec._read_relation("MY_TBL", node, deps=[]) assert isinstance(df, FakeSnowparkDataFrame) # session should have been asked for the fully qualified name - assert sf_exec.session.table_calls == ['"DB1"."SC1"."MY_TBL"'] + assert sf_exec.session.table_calls == ["DB1.SC1.MY_TBL"] @pytest.mark.unit @@ -209,6 +214,16 @@ def test_materialize_relation_happy(sf_exec, monkeypatch): ) ) ) + fake_df.schema = SimpleNamespace(names=["col1"]) + + def _fake_to_df(*cols: str): + return SimpleNamespace( + write=fake_df.write, + schema=SimpleNamespace(names=list(cols)), + toDF=_fake_to_df, + ) + + fake_df.toDF = _fake_to_df monkeypatch.setattr( sf_exec, @@ -216,6 +231,7 @@ def test_materialize_relation_happy(sf_exec, monkeypatch): lambda obj: obj is fake_df, raising=True, ) + fake_df.schema = SimpleNamespace(names=["col1"]) node = Node(name="m", kind="python", path=Path(".")) @@ -223,7 +239,7 @@ def test_materialize_relation_happy(sf_exec, monkeypatch): sf_exec._materialize_relation("OUT_TBL", fake_df, node) # ASSERT - assert called["table"] == 
'"DB1"."SC1"."OUT_TBL"' + assert called["table"] == "DB1.SC1.OUT_TBL" assert called["mode"] == "overwrite" @@ -242,8 +258,8 @@ def test_create_view_over_table_issues_sql(sf_exec): sf_exec._create_view_over_table("V_USERS", "USERS", node) assert any("CREATE OR REPLACE VIEW" in s for s in sf_exec.session.sql_calls) sql = sf_exec.session.sql_calls[-1] - assert '"DB1"."SC1"."V_USERS"' in sql - assert '"DB1"."SC1"."USERS"' in sql + assert "DB1.SC1.V_USERS" in sql + assert "DB1.SC1.USERS" in sql @pytest.mark.unit @@ -276,8 +292,8 @@ def test_validate_required_single_df_missing(sf_exec): msg = str(exc.value) assert "missing columns" in msg - assert "NAME" in msg - assert "ID" in msg + assert "name" in msg.lower() + assert "id" in msg.lower() @pytest.mark.unit @@ -305,6 +321,22 @@ def test_validate_required_multi_input_missing_key(sf_exec): assert "missing dependency key 'DB1.SC1.ORDERS'" in str(exc.value) +@pytest.mark.unit +@pytest.mark.snowflake +def test_validate_required_is_case_insensitive(sf_exec): + SNDF = sf_mod.SNDF + df = SNDF(sf_exec.session) # type: ignore[call-arg] + # Snowflake-style upper-case physical columns + df.schema = SimpleNamespace(names=["USER_ID", "EMAIL"]) # type: ignore[attr-defined] + + # Logical requirements in lower-case should still pass + sf_exec._validate_required( + "model_ci", + df, + {"DB1.SC1.USERS": {"user_id", "email"}}, + ) + + @pytest.mark.unit @pytest.mark.snowflake def test_columns_of(sf_exec): @@ -332,7 +364,7 @@ def test_frame_name(sf_exec): @pytest.mark.snowflake def test_format_relation_for_ref(sf_exec): r = sf_exec._format_relation_for_ref("my_model") - assert '"DB1"' in r and '"SC1"' in r and "my_model" in r + assert r == "DB1.SC1.my_model" @pytest.mark.unit @@ -340,7 +372,7 @@ def test_format_relation_for_ref(sf_exec): def test_format_source_reference_happy(sf_exec): cfg = {"identifier": "SRC_TBL", "database": "DBX", "schema": "RAW"} ref = sf_exec._format_source_reference(cfg, "src", "tbl") - assert ref == 
'"DBX"."RAW"."SRC_TBL"' + assert ref == "DBX.RAW.SRC_TBL" @pytest.mark.unit @@ -381,13 +413,13 @@ def test_create_or_replace_view_from_table_calls_session_sql(sf_exec): sf_exec._create_or_replace_view_from_table("V1", "T1", node) sql = sf_exec.session.sql_calls[-1] assert "CREATE OR REPLACE VIEW" in sql - assert '"DB1"."SC1"."V1"' in sql - assert '"DB1"."SC1"."T1"' in sql + assert "DB1.SC1.V1" in sql + assert "DB1.SC1.T1" in sql @pytest.mark.unit @pytest.mark.snowflake -def test_on_node_built_best_effort(monkeypatch, sf_exec): +def test_on_node_built_calls_meta(monkeypatch, sf_exec): called = {"ensure": 0, "upsert": 0} def fake_ensure(ex): @@ -405,6 +437,20 @@ def fake_upsert(ex, name, rel, fp, eng): assert called["upsert"] == 1 +@pytest.mark.unit +@pytest.mark.snowflake +def test_on_node_built_raises_on_meta_error(monkeypatch, sf_exec): + def bad_upsert(ex, name, rel, fp, eng): + raise RuntimeError("meta boom") + + monkeypatch.setattr(sf_exec_mod, "upsert_meta", bad_upsert) + + with pytest.raises(RuntimeError): + sf_exec.on_node_built( + Node(name="m", kind="sql", path=Path(".")), '"DB1"."SC1"."M"', "fp123" + ) + + @pytest.mark.unit @pytest.mark.snowflake def test_exists_relation_true(sf_exec, monkeypatch): @@ -449,12 +495,20 @@ def test_incremental_insert_strips_semicolon(sf_exec): @pytest.mark.snowflake def test_incremental_merge_builds_two_statements(sf_exec): sf_exec.session.sql_calls.clear() + sf_exec.incremental_merge("DST", "SELECT 1 AS id", ["id"]) - sql = sf_exec.session.sql_calls[-1] - # both DELETE and INSERT statements should be in there - assert "DELETE FROM" in sql - assert "INSERT INTO" in sql - assert "WITH src AS" in sql + + # We now emit two separate statements: DELETE ... and INSERT ... + assert len(sf_exec.session.sql_calls) == 2 + delete_sql, insert_sql = sf_exec.session.sql_calls + + # First: DELETE FROM ... USING () AS s WHERE ... + assert "DELETE FROM" in delete_sql + assert "USING (" in delete_sql + + # Second: INSERT INTO ... 
SELECT * FROM () + assert "INSERT INTO" in insert_sql + assert "SELECT * FROM (" in insert_sql @pytest.mark.unit diff --git a/tests/unit/meta/test_meta_extended_unit.py b/tests/unit/meta/test_meta_extended_unit.py new file mode 100644 index 0000000..345432f --- /dev/null +++ b/tests/unit/meta/test_meta_extended_unit.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import MagicMock + +from fastflowtransform.meta import ( + get_meta, + relation_exists, + upsert_meta, +) + + +class _FakeSFRow: + def __init__(self, data: dict[str, object]): + self._data = data + + def asDict(self): + return self._data + + def __getitem__(self, idx: int) -> object: + return list(self._data.values())[idx] + + +def _sf_executor(): + session = MagicMock() + session.sql.return_value.collect.return_value = [] + ex = SimpleNamespace(session=session, database="DBX", schema="SCX") + return ex, session + + +def _spark_executor(): + spark = MagicMock() + spark.sql.return_value.collect.return_value = [] + spark.catalog.tableExists.return_value = True + ex = SimpleNamespace(spark=spark, database="analytics", spark_table_format=None) + return ex, spark + + +def test_snowflake_meta_calls_session_sql(): + ex, session = _sf_executor() + upsert_meta(ex, "node", '"DBX"."SCX"."TBL"', "fp1", "snowflake_snowpark") + sql_calls = [call.args[0] for call in session.sql.call_args_list] + assert any("create table if not exists" in stmt.lower() for stmt in sql_calls) + assert any("delete from" in stmt.lower() for stmt in sql_calls) + assert any("insert into" in stmt.lower() for stmt in sql_calls) + + +def test_snowflake_get_meta_returns_tuple(): + ex, session = _sf_executor() + row = _FakeSFRow({"FP": "abc", "RELATION": "r", "BUILT_AT": "ts", "ENGINE": "sf"}) + session.sql.return_value.collect.return_value = [row] + out = get_meta(ex, "node") + assert out == ("abc", "r", "ts", "sf") + + +def test_spark_meta_uses_merge_and_get_meta_reads_rows(): + ex, 
spark = _spark_executor() + upsert_meta(ex, "node", "analytics.table", "fp2", "databricks_spark") + merge_stmt = spark.sql.call_args_list[-1].args[0] + assert "MERGE INTO" in merge_stmt + spark.sql.return_value.collect.return_value = [ + {"fp": "fp2", "relation": "analytics.table", "built_at": "ts", "engine": "databricks_spark"} + ] + assert get_meta(ex, "node") == ( + "fp2", + "analytics.table", + "ts", + "databricks_spark", + ) + + +def test_spark_relation_exists_uses_catalog(monkeypatch): + ex, spark = _spark_executor() + assert relation_exists(ex, "tbl_name") is True + spark.catalog.tableExists.assert_called_with("analytics", "tbl_name") diff --git a/uv.lock b/uv.lock index 26a07e1..63bdea9 100644 --- a/uv.lock +++ b/uv.lock @@ -290,7 +290,7 @@ name = "cffi" version = "1.17.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pycparser" }, + { name = "pycparser", marker = "python_full_version < '3.13' or platform_python_implementation != 'PyPy'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" } wheels = [ @@ -733,7 +733,7 @@ wheels = [ [[package]] name = "fastflowtransform" -version = "0.6.1" +version = "0.6.2" source = { editable = "." 
} dependencies = [ { name = "duckdb" }, @@ -783,7 +783,7 @@ full = [ { name = "psycopg", extra = ["binary"] }, { name = "psycopg2-binary" }, { name = "pyspark" }, - { name = "snowflake" }, + { name = "snowflake-connector-python" }, { name = "snowflake-snowpark-python" }, ] postgres = [ @@ -791,7 +791,7 @@ postgres = [ { name = "psycopg2-binary" }, ] snowflake = [ - { name = "snowflake" }, + { name = "snowflake-connector-python" }, { name = "snowflake-snowpark-python" }, ] spark = [ @@ -837,8 +837,8 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = "==0.14.*" }, - { name = "snowflake", marker = "extra == 'full'", specifier = ">=1.8.0" }, - { name = "snowflake", marker = "extra == 'snowflake'", specifier = ">=1.8.0" }, + { name = "snowflake-connector-python", marker = "extra == 'full'", specifier = ">=4.0.0" }, + { name = "snowflake-connector-python", marker = "extra == 'snowflake'", specifier = ">=4.0.0" }, { name = "snowflake-snowpark-python", marker = "extra == 'full'", specifier = ">=1.40.0" }, { name = "snowflake-snowpark-python", marker = "extra == 'snowflake'", specifier = ">=1.40.0" }, { name = "sqlalchemy", specifier = ">=2.0" }, @@ -3257,29 +3257,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "snowflake" -version = "1.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "snowflake-core" }, - { name = "snowflake-legacy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/78/7d/53c6a73e0767a0b26b1d1939f462d86002d2b21817cd6fcdcb192db13219/snowflake-1.8.0.tar.gz", hash = 
"sha256:cd87e8633cf9cc88815dbfab495c8fd7c069fa428af26e027330b0898dabcee7", size = 6038, upload-time = "2025-09-22T11:33:04.908Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/4d/d8fb50fe92e78756d0f4fb88a14fe5224a99db8f3787027408a84f9af8a2/snowflake-1.8.0-py3-none-any.whl", hash = "sha256:36fa6b4b730755ff549203d52311c004eed90ab442dd36d2393fe29ac6ad200d", size = 5634, upload-time = "2025-09-22T11:33:04.007Z" }, -] - [[package]] name = "snowflake-connector-python" -version = "3.18.0" +version = "4.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "asn1crypto" }, { name = "boto3" }, { name = "botocore" }, { name = "certifi" }, - { name = "cffi" }, { name = "charset-normalizer" }, { name = "cryptography", version = "45.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and platform_python_implementation != 'PyPy'" }, { name = "cryptography", version = "46.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or platform_python_implementation == 'PyPy'" }, @@ -3295,50 +3281,23 @@ dependencies = [ { name = "tomlkit" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/25/df/41fe26b68801e3d59653a5dc7ce87a92e9d967dcad7b59b035b8c9804815/snowflake_connector_python-3.18.0.tar.gz", hash = "sha256:41a46eb9824574c5f8068e3ed5c02a2dc0a733ed08ee81fa1fb3dd0ebe921728", size = 798019, upload-time = "2025-10-06T12:15:34.301Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/da/67/0df7829f295988c121f385c562d60c7a4989bc8f72885d04669ce5cd6516/snowflake_connector_python-3.18.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fee7035f865088f948510b094101c8a0e5b22501891f2115f7fb1cb555de76a", size = 1013717, upload-time = "2025-10-06T12:15:41.906Z" }, - { url = 
"https://files.pythonhosted.org/packages/4d/90/35353d5311735ebe85f0224f3a6e4f136c29e1b3e4ce6c7466c9b7e7931b/snowflake_connector_python-3.18.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:283366b35df88cd0c71caf0215ba80370ddef4dd37d2adf43b24208c747231ee", size = 1025471, upload-time = "2025-10-06T12:15:43.073Z" }, - { url = "https://files.pythonhosted.org/packages/ec/16/d490c00546ca8842d314de689ac718c73c9fe0f9b042e06703449282de7c/snowflake_connector_python-3.18.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e4c285cc6a7f6431cff98c8f235a0fe9da2262462dd3dfc2b97120574a95cf9", size = 2684000, upload-time = "2025-10-06T12:15:23.411Z" }, - { url = "https://files.pythonhosted.org/packages/d3/cb/4bc697af4138e17cccde506f28233492a6e1919ced7a65aa31b6f1e8bb6c/snowflake_connector_python-3.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94e041e347b5151b66d19d6cfc3b3172dac1f51e44bbf7cf58f3989427dd464a", size = 2715472, upload-time = "2025-10-06T12:15:25.062Z" }, - { url = "https://files.pythonhosted.org/packages/d9/72/815a4b9795ddce224a1392849dd34a408f2dac240bcdcb0539d42cfd31b1/snowflake_connector_python-3.18.0-cp312-cp312-win_amd64.whl", hash = "sha256:7116cfa410d517328fd25fabffb54845b88667586718578c4333ce034fead1ba", size = 1160435, upload-time = "2025-10-06T12:15:55.046Z" }, - { url = "https://files.pythonhosted.org/packages/a1/e6/b75caca8bcfeae1bc999bf70c9cb54a73607f361a3f1ef0b679e2bd850a6/snowflake_connector_python-3.18.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4ed2d593f1983939d5d8d88b212d86fd4f14f0ceefc1df9882b4a18534adbde9", size = 1014849, upload-time = "2025-10-06T12:15:44.228Z" }, - { url = "https://files.pythonhosted.org/packages/4b/03/0420ebed3b9326e738ab06f8d3f80d9d430054e181ddfe3bf908d87ea5f9/snowflake_connector_python-3.18.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:b99f261c82be92224ac20c8c12bdf26ce3ed5dfd8a3df8a97f15a1e11c46ad27", size = 1026296, upload-time = 
"2025-10-06T12:15:46.82Z" }, - { url = "https://files.pythonhosted.org/packages/d5/04/a467a3bc6d59fd77b7628086a32102711cfb337b0920c3dac340a29f27e8/snowflake_connector_python-3.18.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51eb789a09dc6c62119cfabd044fba1a6b8378206f05a1e83ddb2e9cb49acc0b", size = 2685839, upload-time = "2025-10-06T12:15:26.475Z" }, - { url = "https://files.pythonhosted.org/packages/29/70/0ae9d661d405720b7e3bcea425f1915475b457e4a17fec4eb28b8bd91d35/snowflake_connector_python-3.18.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd1de3038b6d7059ca59f93e105aba2a673151c693cc4292f72f38bfaf147df2", size = 2718059, upload-time = "2025-10-06T12:15:27.765Z" }, - { url = "https://files.pythonhosted.org/packages/9d/38/ea46bbe910bd44ce52aaeea2fefe072392c7c6f3c04bfd0aea3f8fdd5e3a/snowflake_connector_python-3.18.0-cp313-cp313-win_amd64.whl", hash = "sha256:aeeb181a156333480f60b5f8ddbb3d087e288b4509adbef7993236defe4d7570", size = 1160453, upload-time = "2025-10-06T12:15:58.405Z" }, -] - -[[package]] -name = "snowflake-core" -version = "1.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "pydantic" }, - { name = "python-dateutil" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "snowflake-connector-python" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9a/39/34c84b238b16cbc2b81fa3f2fd4b590f1b1b572978401d2fcdad02001ede/snowflake_core-1.8.0.tar.gz", hash = "sha256:96d6f2c61a44bc72efa229c3a8cef2f4a5d4e2c756b00e01820c8147533a8bec", size = 837203, upload-time = "2025-09-22T11:33:08.933Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/d3/6662cfc50021b2a346edee0228a4d2e16615153aaac3556e317f8e904317/snowflake_core-1.8.0-py3-none-any.whl", hash = "sha256:55ab1e9102f13f12af5b998010d128edd5c87ff80ef0bd3b8a2b973359f86319", size = 1415435, upload-time = "2025-09-22T11:33:06.692Z" }, 
-] - -[[package]] -name = "snowflake-legacy" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5b/e3/d9f9bea349e1cedb3d1215f66e13d32ca6530e6dcd1f1adb202ce776bc96/snowflake_legacy-1.0.2.tar.gz", hash = "sha256:94deaaef203958e11400dc607125b6d3c3124963b6f7ebf8f16bf23370ccf0bd", size = 4077, upload-time = "2025-09-23T12:30:50.146Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/f1/4aff125021a9c5e0183f2f55dd7d04b7256a0e1e10db50d537a7415d9c55/snowflake_connector_python-4.0.0.tar.gz", hash = "sha256:4b10a865c4a5e1fa60c365c7fe41e0433605e6e5edc824e8730a9038f330b3a6", size = 813937, upload-time = "2025-10-09T10:11:34.631Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7d/67/93e409e6db141e22e476d8170618681b860fc1fd06f72aecb9416f9762fd/snowflake_legacy-1.0.2-py3-none-any.whl", hash = "sha256:4fdf218594d6132100736aa38c6ea149b445369b509a1e5c59bcc1f5fd5cfd51", size = 3058, upload-time = "2025-09-23T12:30:49.011Z" }, + { url = "https://files.pythonhosted.org/packages/ea/b0/462c0deee35d6d03d3d729b3f923615bae665beb7f9a94673a23a52080fe/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bfd3b8523d7adc830f99c5c4c635689ceca61700a05368d5bbb34c6811f2ec54", size = 1029568, upload-time = "2025-10-09T10:11:42.125Z" }, + { url = "https://files.pythonhosted.org/packages/ff/4b/bb3ae3f07e7927c8f16c4c0f1283d3c721978d16e8bf4193fc8e41025c1e/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:835161dd46ef8f5fc9d2f135ca654c2f3fbdf57b035d3e1980506aa8eac671dc", size = 1041337, upload-time = "2025-10-09T10:11:43.692Z" }, + { url = "https://files.pythonhosted.org/packages/9c/75/4bfac89f10c6dbb75e97adf1e217737fc599ebf964031c9298b6cbd807d0/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65e4e36dd1b0c7235d84cddef8a3c97c5ea0dc8fea85e31e45fc485000b77a83", size = 2699730, 
upload-time = "2025-10-09T10:11:25.295Z" }, + { url = "https://files.pythonhosted.org/packages/cd/78/0e916416c50909dbae511fe38b1e671a9efa62decdce51b174a0396804e4/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6132986d6965e4005b0167270612fbc7fa4bc4ef42726a40b85a8f57475a78d", size = 2731336, upload-time = "2025-10-09T10:11:27.028Z" }, + { url = "https://files.pythonhosted.org/packages/83/f0/3db8a2f3f5ee724d309c661af739a70d0643070b9b4597728151ef900f9b/snowflake_connector_python-4.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a790f06808e4481c23cfed1396d2c9a786060ddd62408b1fda1a63e1e6bc4b07", size = 1176292, upload-time = "2025-10-09T10:11:54.956Z" }, + { url = "https://files.pythonhosted.org/packages/64/c0/10dfcce18514d711bf17d7766d24aedfc20d7a5aa0e8311c0d3068baf266/snowflake_connector_python-4.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4e8c3d2ea4055dd4aecc93514030341e300f557f2e86ca21eb47568c461a6f56", size = 1030702, upload-time = "2025-10-09T10:11:45.013Z" }, + { url = "https://files.pythonhosted.org/packages/16/c1/9d068375ccb341975eb95a87a99176b4b25bb7725e61c8ed62681f2d5123/snowflake_connector_python-4.0.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:1fea301e3d1e8022b9f2ff87dc3be139d5ed7be5e85fab8a6c59d400a02e6d58", size = 1042153, upload-time = "2025-10-09T10:11:46.309Z" }, + { url = "https://files.pythonhosted.org/packages/5d/ae/f4da6b62e546f48885b63bc1884c935bc293e6da9605ddcd217e21307a63/snowflake_connector_python-4.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54e648bbd506a0f2f8076f9eafe231b2d4284b1a884528c3a0690391ab2bb54e", size = 2701637, upload-time = "2025-10-09T10:11:28.58Z" }, + { url = "https://files.pythonhosted.org/packages/88/bf/6cf92dbd1c6d95311894404e2c46db9a06ff6d37bea9a19e667d0bf26362/snowflake_connector_python-4.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f67d844241a6fed764a8f04d32c0273aedf9159d5162b764748526277c7f8831", size = 2733899, upload-time = "2025-10-09T10:11:30.186Z" }, + { url = "https://files.pythonhosted.org/packages/5b/c8/7d9a41e1b10c0a2bae86241773a6b55c06e897c74b3cab14ec8315e16b34/snowflake_connector_python-4.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:cd23bff2abc74e34c6123a181c004ead9e6cc8ef2661250892afd64bad24533c", size = 1176311, upload-time = "2025-10-09T10:11:56.176Z" }, ] [[package]] name = "snowflake-snowpark-python" -version = "1.40.0" +version = "1.42.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cloudpickle" }, @@ -3351,9 +3310,9 @@ dependencies = [ { name = "tzlocal" }, { name = "wheel" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/69/23/e6a42719e4ec8223d761e279961f2acf9fbb97a4d8a328dd540ef8b8bd71/snowflake_snowpark_python-1.40.0.tar.gz", hash = "sha256:f09f919433317f53240cf7f81116bfc788bedd54afcb02f5ff7bcabe7bcab9a8", size = 1753918, upload-time = "2025-10-06T16:00:45.09Z" } +sdist = { url = "https://files.pythonhosted.org/packages/78/e3/b70799997481185cdad44b0786c7597764935d78d71632b4735fb05d63a1/snowflake_snowpark_python-1.42.0.tar.gz", hash = "sha256:e994b3860c816d1b5fdf0c6272f8d9e41505e470140b063ff9418d234fd8cc00", size = 1781749, upload-time = "2025-10-28T18:10:52.418Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/54/75/888c8056a4cf14f25eb1ec508d81b8666f4eaba1a90fe416d2adf1ec9615/snowflake_snowpark_python-1.40.0-py3-none-any.whl", hash = "sha256:257af300579c462c612729b7b938bf65d5909f959d039e2a07e760eabc1001f0", size = 1800245, upload-time = "2025-10-06T16:00:43.137Z" }, + { url = "https://files.pythonhosted.org/packages/a3/ea/a3f1ff82aa144fd072f4be440ed636f4c298a7ee7a278e68709cf2753da5/snowflake_snowpark_python-1.42.0-py3-none-any.whl", hash = "sha256:fd92a3633b79573bb481b6e85a1434842758637dc6a30b32b9c5ce2824f4296d", size = 1825602, upload-time = "2025-10-28T18:10:48.778Z" }, ] [[package]]