From 737d42cdacd6cefcc74fcbd883f97a6ec89cfa20 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Fri, 31 Oct 2025 17:50:15 +0100 Subject: [PATCH] New unittests + new basic example + fix for materialized keyword in python model annotation --- _scripts/concat_docs.py | 32 +- docs/Config_and_Macros.md | 7 +- docs/examples/Basic_Demo.md | 49 ++ docs/index.md | 3 +- examples/_scripts/cleanup_env.py | 48 +- examples/basic_demo/Makefile | 97 +++ examples/basic_demo/README.md | 57 +- examples/basic_demo/models/README.md | 8 +- .../databricks_spark/mart_latest_signup.ff.py | 34 + .../engines/duckdb/mart_latest_signup.ff.py | 33 + .../engines/postgres/mart_latest_signup.ff.py | 33 + .../models/marts/mart_users_by_domain.ff.sql | 26 + .../models/staging/users_clean.ff.sql | 25 + examples/basic_demo/profiles.yml | 41 +- examples/basic_demo/project.yml | 47 +- examples/basic_demo/seeds/README.md | 3 +- examples/basic_demo/seeds/seed_users.csv | 4 + examples/basic_demo/site/dag/index.html | 248 +++++++ .../site/dag/mart_latest_signup.html | 246 +++++++ .../site/dag/mart_users_by_domain.ff.html | 252 +++++++ .../basic_demo/site/dag/users_clean.ff.html | 271 ++++++++ examples/basic_demo/sources.yml | 14 +- examples/env_matrix/models/env_vars.ff.py | 2 +- .../.fastflowtransform/target/catalog.json | 2 +- .../.fastflowtransform/target/manifest.json | 2 +- .../target/run_results.json | 26 +- .../.fastflowtransform/target/catalog.json | 2 +- .../.fastflowtransform/target/manifest.json | 2 +- .../target/run_results.json | 40 +- mkdocs.yml | 3 +- pytest.ini | 2 + src/fastflowtransform/cli/docgen_cmd.py | 1 + src/fastflowtransform/cli/seed_cmd.py | 16 - src/fastflowtransform/decorators.py | 10 +- .../executors/duckdb_exec.py | 40 +- src/fastflowtransform/logging.py | 67 +- tests/common/mock/bigquery.py | 199 ++++++ tests/common/mock/profiles.py | 150 ++++ .../test_python_model_materialized_view.py | 9 +- tests/unit/cli/test_bootstrap_unit.py | 260 +++++++ 
tests/unit/cli/test_docgen_cmd_unit.py | 143 ++++ tests/unit/cli/test_seed_cmd_unit.py | 163 +++++ .../test_docs_materialization_badges.py | 0 tests/unit/{ => docs}/test_docs_merge.py | 0 tests/unit/docs/test_docs_unit.py | 648 ++++++++++++++++++ .../executors/test_bigquery_bf_exec_unit.py | 410 +++++++++++ .../unit/executors/test_bigquery_exec_unit.py | 339 +++++++++ tests/unit/executors/test_duckdb_exec_unit.py | 244 +++++++ tests/unit/executors/test_shims_unit.py | 249 +++++++ .../executors/test_snowflake_snowpark_exec.py | 521 ++++++++++++++ tests/unit/test_core_python_tags.py | 2 +- 51 files changed, 4984 insertions(+), 146 deletions(-) create mode 100644 docs/examples/Basic_Demo.md create mode 100644 examples/basic_demo/Makefile create mode 100644 examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py create mode 100644 examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py create mode 100644 examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py create mode 100644 examples/basic_demo/models/marts/mart_users_by_domain.ff.sql create mode 100644 examples/basic_demo/models/staging/users_clean.ff.sql create mode 100644 examples/basic_demo/seeds/seed_users.csv create mode 100644 examples/basic_demo/site/dag/index.html create mode 100644 examples/basic_demo/site/dag/mart_latest_signup.html create mode 100644 examples/basic_demo/site/dag/mart_users_by_domain.ff.html create mode 100644 examples/basic_demo/site/dag/users_clean.ff.html create mode 100644 tests/common/mock/bigquery.py create mode 100644 tests/common/mock/profiles.py create mode 100644 tests/unit/cli/test_bootstrap_unit.py create mode 100644 tests/unit/cli/test_docgen_cmd_unit.py create mode 100644 tests/unit/cli/test_seed_cmd_unit.py rename tests/unit/{ => docs}/test_docs_materialization_badges.py (100%) rename tests/unit/{ => docs}/test_docs_merge.py (100%) create mode 100644 tests/unit/docs/test_docs_unit.py create mode 100644 
tests/unit/executors/test_bigquery_bf_exec_unit.py create mode 100644 tests/unit/executors/test_bigquery_exec_unit.py create mode 100644 tests/unit/executors/test_duckdb_exec_unit.py create mode 100644 tests/unit/executors/test_shims_unit.py create mode 100644 tests/unit/executors/test_snowflake_snowpark_exec.py diff --git a/_scripts/concat_docs.py b/_scripts/concat_docs.py index 1c00d6e..c0ca6ff 100644 --- a/_scripts/concat_docs.py +++ b/_scripts/concat_docs.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 # concat_docs.py """ -Fügt alle Markdown-Dateien aus dem docs-Verzeichnis zu einer einzelnen Datei zusammen. -- Respektiert die Reihenfolge in mkdocs.yml (nav). -- Ignoriert doppelte Einträge / Anker (#...). -- Hängt übrige .md-Dateien (nicht in nav) am Ende an. -- Optional: Headings demoten (um mehrfaches H1 zu vermeiden). +Concatenates all Markdown files from the docs directory into a single file. +- Respects the order in mkdocs.yml (nav). +- Ignores duplicate entries / anchors (#...). +- Appends remaining .md files (not in nav) at the end. +- Optional: Demote headings (to avoid multiple H1s). 
-Beispiel: +Example: python concat_docs.py -o Combined.md python concat_docs.py -o Combined.md --demote --exclude "reference/**" --exclude "site/**" """ @@ -30,18 +30,18 @@ def load_nav_order(project_root: Path) -> list[Path]: - """Liest mkdocs.yml und extrahiert eine geordnete Liste der Markdown-Pfade (ohne Anker).""" + """Read mkdocs.yml and extract an ordered list of Markdown paths (without anchors).""" yml_path = project_root / MKDOCS_YML ordered: list[Path] = [] if yaml is None or not yml_path.exists(): - return ordered # keine Order-Info -> leere Liste + return ordered # no ordering info -> empty list data = yaml.safe_load(yml_path.read_text(encoding="utf-8")) nav = data.get("nav") if isinstance(data, dict) else None if not isinstance(nav, list): return ordered def normalize_nav_item(item) -> list[str]: - # item kann dict ({"Title": "path.md" | ["subitems"]}) oder string sein + # Item can be dict ({"Title": "path.md" | ["subitems"]}) or string out: list[str] = [] if isinstance(item, str): out.append(item) @@ -60,12 +60,12 @@ def normalize_nav_item(item) -> list[str]: seen = set() for p in paths: - # Nur Dateien unter docs berücksichtigen; Anker entfernen + # Only consider files under docs; strip anchors p_no_anchor = p.split("#", 1)[0] if not p_no_anchor.lower().endswith(".md"): continue - # mkdocs erlaubt relative Pfade; wir interpretieren sie relativ zu docs/ - # Falls der Pfad bereits "docs/..." enthält, normalisieren wir trotzdem + # mkdocs allows relative paths; interpret them relative to docs/ + # If the path already contains "docs/...", normalize it anyway if p_no_anchor.startswith(DOCS_DIR_DEFAULT + "/"): rel = Path(p_no_anchor).relative_to(DOCS_DIR_DEFAULT) else: @@ -94,8 +94,8 @@ def apply_excludes(paths: list[Path], patterns: list[str]) -> list[Path]: def demote_headings(text: str, levels: int = 1) -> str: """ - Erhöht die Anzahl der '#' um 'levels' für alle ATX-Headings (Markdown #). - Lässt Codeblöcke unberührt. 
+ Increase the number of '#' by 'levels' for all ATX headings (Markdown #). + Leave code fences untouched. """ if levels <= 0: return text @@ -160,12 +160,12 @@ def main(): print(f"Fehler: docs-Verzeichnis nicht gefunden: {docs_dir}", file=sys.stderr) sys.exit(1) - # 1) Reihenfolge aus mkdocs.yml (falls nicht deaktiviert / vorhanden) + # 1) Order from mkdocs.yml (if not disabled / available) nav_order = load_nav_order(project_root) if not args.no_nav else [] all_md = collect_md_files(docs_dir) all_md = apply_excludes(all_md, args.exclude) - # 2) Liste zusammenstellen: zuerst nav, dann Rest (ohne Duplikate) + # 2) Build list: nav entries first, then the rest (without duplicates) ordered: list[Path] = [] seen = set() for rel in nav_order: diff --git a/docs/Config_and_Macros.md b/docs/Config_and_Macros.md index 6c1cd5d..6ec708d 100644 --- a/docs/Config_and_Macros.md +++ b/docs/Config_and_Macros.md @@ -64,6 +64,8 @@ Use the `@model` decorator from `fastflowtransform.core` to register a callable. - `name` (optional) → overrides the logical name (defaults to stem). - `deps` → list of dependency nodes (file stems or logical names). - `requires` → column contract per dependency (validated via `validation.validate_required_columns`). +- `materialized` (optional) → `'table' | 'view' | 'ephemeral'`; mirrors `config(materialized=...)` for SQL. +- `tags` (optional) → convenience for attaching selection labels without writing `meta={"tags": ...}`. Dependencies determine the call signature: @@ -78,7 +80,8 @@ import pandas as pd @model( name="users_enriched", deps=["users.ff"], - requires={"users": {"id", "email"}} + requires={"users": {"id", "email"}}, + materialized="view", ) def enrich(df: pd.DataFrame) -> pd.DataFrame: out = df.copy() @@ -172,7 +175,7 @@ override those defaults, add per-engine overrides, or point at files: ## 2. `config()` options -Call `config()` at the top of SQL models (and optionally within Python models via decorator kwargs in future versions). 
+Call `config()` at the top of SQL models. Python models get the same options via the `@model(..., materialized=..., tags=...)` decorator kwargs. ```sql {{ config( diff --git a/docs/examples/Basic_Demo.md b/docs/examples/Basic_Demo.md new file mode 100644 index 0000000..baea5de --- /dev/null +++ b/docs/examples/Basic_Demo.md @@ -0,0 +1,49 @@ +# Basic Demo Project + +The `examples/basic_demo` project shows the smallest end-to-end FastFlowTransform pipeline. It combines one seed, a staging model, and a final mart while staying portable across DuckDB, Postgres, and Databricks Spark. + +## Why it exists +- **Start small** – demonstrate the minimum folder structure (`seeds/`, `models/`, `profiles.yml`) needed to run `fft`. +- **Engine parity** – prove that a single project can target multiple engines by swapping profiles. +- **Understand outputs** – show where documentation and manifests land after a run. + +Use it as a sandbox before adding your own sources, macros, or Python models. + +## Project layout + +| Path | Purpose | +|------|---------| +| `seeds/seed_users.csv` | Sample CRM-style user data. `fft seed` materializes it as `crm.users`. | +| `models/staging/users_clean.ff.sql` | Normalizes emails, casts types, and tags the model for all engines. | +| `models/marts/mart_users_by_domain.ff.sql` | Aggregates users per email domain and records the first/last signup dates. | +| `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models (pandas for DuckDB/Postgres, PySpark for Databricks) selecting the most recent signup per domain from the staging view. | +| `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, and `dev_databricks` profiles driven by environment variables. | +| `.env.dev_*` | Template environment files you can `source` per engine. | +| `Makefile` | One command (`make demo ENGINE=…`) to seed, run, document, test, and preview results. | + +## Running the demo + +1. `cd examples/basic_demo` +2. 
Choose an engine and export its environment variables: + ```bash + set -a; source .env.dev_duckdb; set +a + # swap to .env.dev_postgres or .env.dev_databricks for other engines + ``` +3. Execute the full flow: + ```bash + make demo ENGINE=duckdb + ``` + The Makefile runs `fft seed`, `fft run`, `fft dag`, `fft test`, and `fft show basic_demo.mart_users_by_domain`. To preview the Python mart, run `make show ENGINE=duckdb SHOW_MODEL=mart_latest_signup` (or swap `ENGINE` as needed). +4. Inspect artifacts: + - `.fastflowtransform/target/manifest.json` and `run_results.json` + - `site/dag/index.html` for the rendered model graph + - CLI output from `fft show` displaying the aggregated mart + +The demo also enables baseline data quality checks in `project.yml`. Running `fft test` (or `make test`) verifies that primary keys remain unique/not-null across `seed_users`, `users_clean`, `mart_users_by_domain`, and the Python mart, while ensuring aggregate metrics such as `user_count` never drop below zero and each domain appears only once in `mart_latest_signup`. + +## Next steps + +- Add more CSVs under `seeds/` and declare them in `sources.yml`. +- Create additional staging models so marts can reuse normalized data. +- Introduce Python models or macros mirroring how the API demo scales up. +- Update `.env.dev_*` with real credentials once you connect to shared databases. diff --git a/docs/index.md b/docs/index.md index 36754ad..eccc66f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,6 +17,7 @@ Welcome! This page is your starting point for FastFlowTransform docs. 
Pick the t - [Sources Declaration](./Sources.md) - [Project Configuration](./Project_Config.md) - [State Selection (changed & results)](./State_Selection.md) +- [Basic Demo Overview](./examples/Basic_Demo.md) - [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) - [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs-lineage) - [Developer Guide](./Technical_Overview.md#part-ii-architecture-internals) @@ -39,7 +40,7 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t - **Understand the project layout & CLI workflow:** see *Project Layout*, *Makefile Targets*, and *CLI Flows* in the [Technical Overview](Technical_Overview.md#project-layout). - **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles-environment-overrides). - **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fft-utest) covers unit tests, troubleshooting tips, and exit codes. -- **Explore runnable demos:** browse the `examples/` directory in the repo; each subproject comes with its own README. +- **Explore runnable demos:** start with the [Basic Demo Overview](examples/Basic_Demo.md) or browse the `examples/` directory; each subproject ships with its own README. ### 2. 
Extend FastFlowTransform (Developers & Contributors) diff --git a/examples/_scripts/cleanup_env.py b/examples/_scripts/cleanup_env.py index abc98dd..fe7ae7c 100644 --- a/examples/_scripts/cleanup_env.py +++ b/examples/_scripts/cleanup_env.py @@ -16,11 +16,15 @@ if SRC_DIR.exists() and str(SRC_DIR) not in sys.path: sys.path.insert(0, str(SRC_DIR)) +from fastflowtransform.logging import LOG_PREFIX from fastflowtransform.settings import EnvSettings, resolve_profile def _log(msg: str) -> None: - print(msg) + if LOG_PREFIX: + print(f"{LOG_PREFIX} {msg}") + else: + print(msg) def _coerce_path(value: str | None, project: Path) -> Path | None: @@ -261,32 +265,44 @@ def main(argv: list[str] | None = None) -> int: or ("dev_" + args.engine if args.engine in {"duckdb", "postgres"} else "dev") ) + os.environ["FFT_ACTIVE_ENV"] = env_name + _load_dotenv_layered(project, env_name) + + profile = None try: - os.environ["FFT_ACTIVE_ENV"] = env_name - _load_dotenv_layered(project, env_name) profile = _load_profile(project, env_name, args.engine) + except Exception as exc: # pragma: no cover - best-effort logging + _log( + f"Warning: failed to resolve profile '{env_name}' for engine '{args.engine}': {exc}. " + "Continuing with environment variables only." 
+ ) - warehouse_path: Path | None = None + warehouse_path: Path | None = None + try: if args.engine == "duckdb": - profile_duckdb = getattr(getattr(profile, "duckdb", None), "path", None) + profile_duckdb = ( + getattr(getattr(profile, "duckdb", None), "path", None) if profile else None + ) db_path = args.duckdb_path or os.getenv("FF_DUCKDB_PATH") or profile_duckdb cleanup_duckdb(project=project, db_path=db_path, dry_run=args.dry_run) elif args.engine == "postgres": - profile_pg = getattr(profile, "postgres", None) - profile_dsn = getattr(profile_pg, "dsn", None) - profile_schema = getattr(profile_pg, "db_schema", None) + profile_pg = getattr(profile, "postgres", None) if profile else None + profile_dsn = getattr(profile_pg, "dsn", None) if profile_pg else None + profile_schema = getattr(profile_pg, "db_schema", None) if profile_pg else None dsn = args.postgres_dsn or os.getenv("FF_PG_DSN") or profile_dsn schema = args.postgres_schema or os.getenv("FF_PG_SCHEMA") or profile_schema cleanup_postgres(dsn=dsn, schema=schema, dry_run=args.dry_run) elif args.engine == "databricks_spark": - profile_db = getattr(profile, "databricks_spark", None) - profile_master = getattr(profile_db, "master", None) - profile_app = getattr(profile_db, "app_name", None) - profile_warehouse = getattr(profile_db, "warehouse_dir", None) - profile_database = getattr(profile_db, "database", None) - profile_catalog = getattr(profile_db, "catalog", None) - profile_use_hive = getattr(profile_db, "use_hive_metastore", False) - profile_extra_conf = getattr(profile_db, "extra_conf", None) + profile_db = getattr(profile, "databricks_spark", None) if profile else None + profile_master = getattr(profile_db, "master", None) if profile_db else None + profile_app = getattr(profile_db, "app_name", None) if profile_db else None + profile_warehouse = getattr(profile_db, "warehouse_dir", None) if profile_db else None + profile_database = getattr(profile_db, "database", None) if profile_db else None + 
profile_catalog = getattr(profile_db, "catalog", None) if profile_db else None + profile_use_hive = ( + getattr(profile_db, "use_hive_metastore", False) if profile_db else False + ) + profile_extra_conf = getattr(profile_db, "extra_conf", None) if profile_db else None warehouse_path = cleanup_databricks( project=project, master=args.spark_master or profile_master, diff --git a/examples/basic_demo/Makefile b/examples/basic_demo/Makefile new file mode 100644 index 0000000..7f7f5a9 --- /dev/null +++ b/examples/basic_demo/Makefile @@ -0,0 +1,97 @@ +.PHONY: seed run test dag show artifacts clean demo help + +# --- Configuration ----------------------------------------------------------- + +DB ?= .local/basic_demo.duckdb +PROJECT ?= . +UV ?= uv + +# Engine selector (duckdb|postgres|databricks_spark) +ENGINE ?= duckdb + +# Resolve profile and tags per engine +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + PROFILE_ENV = dev_databricks + ENGINE_TAG = engine:databricks_spark +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) +RUN_ENV = $(BASE_ENV) + +SELECT_FLAGS = --select tag:example:basic_demo --select tag:$(ENGINE_TAG) + +SHOW_MODEL ?= mart_users_by_domain + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py + +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" --duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark) +endif + +# --- 
Targets ---------------------------------------------------------------- + +help: + @echo "FastFlowTransform Basic Demo" + @echo "Targets:" + @echo " make seed ENGINE=$(ENGINE)" + @echo " make run ENGINE=$(ENGINE)" + @echo " make dag ENGINE=$(ENGINE)" + @echo " make test ENGINE=$(ENGINE)" + @echo " make show ENGINE=$(ENGINE) SHOW_MODEL=$(SHOW_MODEL)" + @echo " make demo ENGINE=$(ENGINE)" + @echo " make clean ENGINE=$(ENGINE)" + @echo + @echo "Variables: DB=$(DB) PROJECT=$(PROJECT) UV=$(UV)" + +seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) + +run: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +test: + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html + +show: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + else \ + echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + +clean: + $(CLEAN_CMD) + +demo: clean + @echo "== 🚀 Basic Demo ($(ENGINE)) ==" + @echo "Profile=$(PROFILE_ENV) PROJECT=$(PROJECT)" + +$(MAKE) seed ENGINE=$(ENGINE) + +$(MAKE) run ENGINE=$(ENGINE) + +$(MAKE) dag ENGINE=$(ENGINE) + +$(MAKE) test ENGINE=$(ENGINE) + +$(MAKE) show ENGINE=$(ENGINE) + +$(MAKE) artifacts + @echo "✅ Demo complete." diff --git a/examples/basic_demo/README.md b/examples/basic_demo/README.md index 5e977f7..106a814 100644 --- a/examples/basic_demo/README.md +++ b/examples/basic_demo/README.md @@ -1,7 +1,52 @@ -# FastFlowTransform project scaffold +# Basic demo -This project was created with `fft init`. -Next steps: -1. 
Update `profiles.yml` with real connection details (docs/Profiles.md). -2. Add sources in `sources.yml` and author models under `models/` (docs/Config_and_Macros.md). -3. Seed sample data with `fft seed` and execute models with `fft run` (docs/Quickstart.md). +This project is a minimal FastFlowTransform pipeline that works unchanged on DuckDB, Postgres, and Databricks Spark. It ships with: +- `seeds/seed_users.csv` – three sample users that bootstrap the project. +- `models/staging/users_clean.ff.sql` – normalizes emails and signup timestamps. +- `models/marts/mart_users_by_domain.ff.sql` – aggregates users by email domain. +- `models/engines/*/mart_latest_signup.ff.py` – engine-scoped Python models (pandas for DuckDB/Postgres, PySpark for Databricks) that grab the latest signup per domain from the staging view. + +## Quickstart + +1. Install the package and CLI (see repository root instructions). +2. `cd examples/basic_demo` (this folder) so relative paths line up. +3. Load one of the provided engine environments, then seed and run the project. + +> ⚠️ `make clean` (or direct calls to `cleanup_env.py`) rely on the same environment variables as the run commands. Always export the `.env.dev_*` file for the engine you are cleaning so paths, schemas, and credentials are available. 
+ +### DuckDB + +```bash +cp .env.dev_duckdb .env.local # optional convenience copy +set -a; source .env.dev_duckdb; set +a # export FF_DUCKDB_PATH +fft seed basic_demo --env dev_duckdb +fft run basic_demo --env dev_duckdb +fft show basic_demo.mart_users_by_domain --env dev_duckdb +fft show basic_demo.mart_latest_signup --env dev_duckdb +``` + +### Postgres + +```bash +cp .env.dev_postgres .env.local # fill in FF_PG_DSN with your credentials +set -a; source .env.dev_postgres; set +a +fft seed basic_demo --env dev_postgres +fft run basic_demo --env dev_postgres +fft show basic_demo.mart_users_by_domain --env dev_postgres +fft show basic_demo.mart_latest_signup --env dev_postgres +``` + +### Databricks Spark (local or hosted) + +```bash +cp .env.dev_databricks .env.local # adjust Spark master / credentials as needed +set -a; source .env.dev_databricks; set +a +fft seed basic_demo --env dev_databricks +fft run basic_demo --env dev_databricks +fft show basic_demo.mart_users_by_domain --env dev_databricks +fft show basic_demo.mart_latest_signup --env dev_databricks +``` + +The resulting tables report user counts per email domain and spotlight the most recent signup per domain. Extend any of the CSV, SQL, or Python assets to explore more complex scenarios. + +Further background is documented in [`docs/examples/Basic_Demo.md`](../../docs/examples/Basic_Demo.md). diff --git a/examples/basic_demo/models/README.md b/examples/basic_demo/models/README.md index 32818bb..7ec1560 100644 --- a/examples/basic_demo/models/README.md +++ b/examples/basic_demo/models/README.md @@ -1,4 +1,8 @@ # Models directory -Place SQL (`*.ff.sql`) and Python (`*.ff.py`) models here. -See docs/Config_and_Macros.md for modeling guidance and config options. +This demo ships with: +- `staging/users_clean.ff.sql` – normalizes the seeded users table. +- `marts/mart_users_by_domain.ff.sql` – aggregates signups per email domain. 
+- `engines/*/mart_latest_signup.ff.py` – engine-scoped Python models (pandas for DuckDB/Postgres, PySpark for Databricks) that select the most recent signup per domain using the staging view as input. + +Add further SQL (`*.ff.sql`) or Python (`*.ff.py`) models alongside them to grow the pipeline. diff --git a/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py new file mode 100644 index 0000000..0b1a77a --- /dev/null +++ b/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py @@ -0,0 +1,34 @@ +from pyspark.sql import DataFrame +from pyspark.sql import Window +from pyspark.sql import functions as F + +from fastflowtransform import engine_model + + +@engine_model( + only="databricks_spark", + name="mart_latest_signup", + materialized="table", + tags=[ + "example:basic_demo", + "scope:mart", + "engine:databricks_spark", + ], + deps=["users_clean.ff"], + require={"users_clean.ff": ["user_id", "email", "email_domain", "signup_date"]}, +) +def build(users_clean: DataFrame) -> DataFrame: + """Return the latest signup per email domain using PySpark DataFrame operations.""" + window = Window.partitionBy("email_domain").orderBy(F.col("signup_date").desc()) + + latest = ( + users_clean.withColumn("row_number", F.row_number().over(window)) + .filter(F.col("row_number") == 1) + .select( + F.col("email_domain"), + F.col("user_id").alias("latest_user_id"), + F.col("email").alias("latest_email"), + F.col("signup_date").alias("latest_signup_date"), + ) + ) + return latest diff --git a/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py new file mode 100644 index 0000000..d33277e --- /dev/null +++ b/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py @@ -0,0 +1,33 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + 
only="duckdb", + name="mart_latest_signup", + materialized="table", + tags=[ + "example:basic_demo", + "scope:mart", + "engine:duckdb", + ], + deps=["users_clean.ff"], + require={"users_clean.ff": ["user_id", "email", "email_domain", "signup_date"]}, +) +def build(users_clean: pd.DataFrame) -> pd.DataFrame: + """Return the latest signup per email domain using pandas (DuckDB).""" + latest = ( + users_clean.sort_values("signup_date", ascending=False) + .drop_duplicates("email_domain") + .loc[:, ["email_domain", "user_id", "email", "signup_date"]] + .rename( + columns={ + "user_id": "latest_user_id", + "email": "latest_email", + "signup_date": "latest_signup_date", + } + ) + .reset_index(drop=True) + ) + return latest diff --git a/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py new file mode 100644 index 0000000..f465fd0 --- /dev/null +++ b/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py @@ -0,0 +1,33 @@ +import pandas as pd + +from fastflowtransform import engine_model + + +@engine_model( + only="postgres", + name="mart_latest_signup", + materialized="table", + tags=[ + "example:basic_demo", + "scope:mart", + "engine:postgres", + ], + deps=["users_clean.ff"], + require={"users_clean.ff": ["user_id", "email", "email_domain", "signup_date"]}, +) +def build(users_clean: pd.DataFrame) -> pd.DataFrame: + """Return the latest signup per email domain using pandas (Postgres).""" + latest = ( + users_clean.sort_values("signup_date", ascending=False) + .drop_duplicates("email_domain") + .loc[:, ["email_domain", "user_id", "email", "signup_date"]] + .rename( + columns={ + "user_id": "latest_user_id", + "email": "latest_email", + "signup_date": "latest_signup_date", + } + ) + .reset_index(drop=True) + ) + return latest diff --git a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql new file mode 
100644 index 0000000..d74c06d --- /dev/null +++ b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql @@ -0,0 +1,26 @@ +{{ config( + materialized='table', + tags=[ + 'example:basic_demo', + 'scope:mart', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} + +with base as ( + select + email_domain, + signup_date + from {{ ref('users_clean.ff') }} +) + +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup +from base +group by email_domain +order by email_domain; diff --git a/examples/basic_demo/models/staging/users_clean.ff.sql b/examples/basic_demo/models/staging/users_clean.ff.sql new file mode 100644 index 0000000..3795e73 --- /dev/null +++ b/examples/basic_demo/models/staging/users_clean.ff.sql @@ -0,0 +1,25 @@ +{{ config( + materialized='view', + tags=[ + 'example:basic_demo', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark' + ], +) }} + +with raw_users as ( + select + cast(id as integer) as user_id, + lower(email) as email, + cast(signup_date as date) as signup_date + from {{ source('crm', 'users') }} +) + +select + user_id, + email, + regexp_replace(email, '^.*@', '') as email_domain, + signup_date +from raw_users; diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml index ca66403..130a7e7 100644 --- a/examples/basic_demo/profiles.yml +++ b/examples/basic_demo/profiles.yml @@ -1,13 +1,40 @@ -# Profiles generated by `fft init`. -# Update these placeholders as described in docs/Profiles.md. -dev: +# Connection profiles for the basic demo. +# Populate environment variables as shown in the accompanying .env.dev_* files. + +dev_duckdb: engine: duckdb - # DuckDB profile example. See docs/Profiles.md#duckdb for details. + vars: + demo_target_schema: main duckdb: - path: "{{ env('FF_DUCKDB_PATH', '.local/dev.duckdb') }}" # Path to your DuckDB database file. 
+ path: "{{ env('FF_DUCKDB_PATH', '.local/basic_demo.duckdb') }}" + +dev_postgres: + engine: postgres + vars: + demo_target_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_databricks: + engine: databricks_spark + vars: + demo_target_schema: "{{ env('FF_DBR_DATABASE', 'basic_demo') }}" + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'basic_demo') }}" + warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" + schema: "{{ env('FF_DBR_DATABASE', 'basic_demo') }}" + extra_conf: + spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" + spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" + spark.hadoop.datanucleus.schema.autoCreateAll: "true" + spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" -# Default in-memory profile for quick experiments. default: engine: duckdb + vars: + demo_target_schema: main duckdb: - path: ":memory:" + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" diff --git a/examples/basic_demo/project.yml b/examples/basic_demo/project.yml index 4d8ded1..3986ec5 100644 --- a/examples/basic_demo/project.yml +++ b/examples/basic_demo/project.yml @@ -15,4 +15,49 @@ docs: vars: {} # Declare project-wide data quality checks under `tests`. See docs/Data_Quality_Tests.md. 
-tests: [] +tests: + - type: not_null + table: seed_users + column: id + tags: [example_basic_demo] + + - type: unique + table: seed_users + column: id + tags: [example_basic_demo] + + - type: unique + table: users_clean + column: user_id + tags: [example_basic_demo] + + - type: not_null + table: users_clean + column: email_domain + tags: [example_basic_demo] + + - type: not_null + table: mart_users_by_domain + column: email_domain + tags: [example_basic_demo] + + - type: unique + table: mart_latest_signup + column: email_domain + tags: [example_basic_demo] + + - type: not_null + table: mart_latest_signup + column: latest_user_id + tags: [example_basic_demo] + + - type: not_null + table: mart_latest_signup + column: latest_signup_date + tags: [example_basic_demo] + + - type: greater_equal + table: mart_users_by_domain + column: user_count + threshold: 0 + tags: [example_basic_demo] diff --git a/examples/basic_demo/seeds/README.md b/examples/basic_demo/seeds/README.md index 2e553ed..2d39b58 100644 --- a/examples/basic_demo/seeds/README.md +++ b/examples/basic_demo/seeds/README.md @@ -1,4 +1,3 @@ # Seeds directory -Add CSV or Parquet files for reproducible seeds. -Usage examples are covered in docs/Quickstart.md and docs/Config_and_Macros.md#13-seeds-sources-and-dependencies. +`seed_users.csv` ships with the demo and feeds the staging model. Extend or replace it with your own CSV or Parquet files when experimenting. 
diff --git a/examples/basic_demo/seeds/seed_users.csv b/examples/basic_demo/seeds/seed_users.csv new file mode 100644 index 0000000..e890383 --- /dev/null +++ b/examples/basic_demo/seeds/seed_users.csv @@ -0,0 +1,4 @@ +id,email,signup_date +1,anna@example.com,2024-01-05 +2,bob@example.net,2024-02-11 +3,cara@example.org,2024-02-27 diff --git a/examples/basic_demo/site/dag/index.html b/examples/basic_demo/site/dag/index.html new file mode 100644 index 0000000..19e7a44 --- /dev/null +++ b/examples/basic_demo/site/dag/index.html @@ -0,0 +1,248 @@ + + + + + + FastFlowTransform - DAG & Mini Docs + + + + + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + mart_latest_signup("mart_latest_signup
(mart_latest_signup)") + class mart_latest_signup py; + mart_users_by_domain_ff["mart_users_by_domain.ff
(mart_users_by_domain)"] + class mart_users_by_domain_ff sql; + users_clean_ff["users_clean.ff
(users_clean)"] + class users_clean_ff sql; + users_clean_ff --> mart_users_by_domain_ff + users_clean_ff --> mart_latest_signup +
+
+ + + +
+

Macros

+ +

No macros found.

+ +
+
+ + + + \ No newline at end of file diff --git a/examples/basic_demo/site/dag/mart_latest_signup.html b/examples/basic_demo/site/dag/mart_latest_signup.html new file mode 100644 index 0000000..f6561dc --- /dev/null +++ b/examples/basic_demo/site/dag/mart_latest_signup.html @@ -0,0 +1,246 @@ + + + + + + mart_latest_signup – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ mart_latest_signup + table +

+
Model Detail • FastFlowTransform
+
+ python +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
mart_latest_signup
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
email_domainstring + + yes + + + + — + + + + unknown + +
latest_user_idint + + yes + + + + — + + + + unknown + +
latest_emailstring + + yes + + + + — + + + + unknown + +
latest_signup_datedate + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/basic_demo/site/dag/mart_users_by_domain.ff.html b/examples/basic_demo/site/dag/mart_users_by_domain.ff.html new file mode 100644 index 0000000..4cccafe --- /dev/null +++ b/examples/basic_demo/site/dag/mart_users_by_domain.ff.html @@ -0,0 +1,252 @@ + + + + + + mart_users_by_domain.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ mart_users_by_domain.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
mart_users_by_domain
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
email_domainstring + + yes + + + + — + + + + + ?.email_domain + + direct + + + + +
user_countbigint + + yes + + + + — + + + + unknown + +
first_signupdate + + yes + + + + — + + + + unknown + +
last_signupdate + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/basic_demo/site/dag/users_clean.ff.html b/examples/basic_demo/site/dag/users_clean.ff.html new file mode 100644 index 0000000..ba353fd --- /dev/null +++ b/examples/basic_demo/site/dag/users_clean.ff.html @@ -0,0 +1,271 @@ + + + + + + users_clean.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ users_clean.ff + view +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
view
+ +
Relation
+
users_clean
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/basic_demo/models/staging/users_clean.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
user_idint + + yes + + + + — + + + + + ?.? + + transformed + + + + +
emailstring + + yes + + + + — + + + + + ?.? + + transformed + + + + +
email_domainstring + + yes + + + + — + + + + unknown + +
signup_datedate + + yes + + + + — + + + + + ?.? + + transformed + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/basic_demo/sources.yml b/examples/basic_demo/sources.yml index cf52a95..25c9c73 100644 --- a/examples/basic_demo/sources.yml +++ b/examples/basic_demo/sources.yml @@ -1,9 +1,9 @@ -# Source declarations describe external tables. See docs/Sources.md for details. version: 2 + sources: - # Example: - # - name: raw - # schema: staging - # tables: - # - name: users - # identifier: seed_users + - name: crm + description: Seeded CRM-style data for the demo. + tables: + - name: users + identifier: seed_users + description: Three sample users that populate the seed table. diff --git a/examples/env_matrix/models/env_vars.ff.py b/examples/env_matrix/models/env_vars.ff.py index 646b969..3e40c1c 100644 --- a/examples/env_matrix/models/env_vars.ff.py +++ b/examples/env_matrix/models/env_vars.ff.py @@ -15,7 +15,7 @@ name="env_vars.ff", tags=["demo", "env"], kind="python", - meta={"materialized": "table"}, + materialized="table", ) def build(_: pd.DataFrame | None) -> pd.DataFrame: """ diff --git a/examples/postgres/.fastflowtransform/target/catalog.json b/examples/postgres/.fastflowtransform/target/catalog.json index 833aa53..9bf6fca 100644 --- a/examples/postgres/.fastflowtransform/target/catalog.json +++ b/examples/postgres/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-30T18:29:08+00:00", + "generated_at": "2025-10-31T16:46:28+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/examples/postgres/.fastflowtransform/target/manifest.json b/examples/postgres/.fastflowtransform/target/manifest.json index 1a5eed6..e41b5dc 100644 --- a/examples/postgres/.fastflowtransform/target/manifest.json +++ b/examples/postgres/.fastflowtransform/target/manifest.json @@ -1,7 +1,7 @@ { "macros": {}, "metadata": { - "generated_at": "2025-10-30T18:29:08+00:00", + "generated_at": "2025-10-31T16:46:28+00:00", "tool": "fastflowtransform" }, "nodes": { diff --git 
a/examples/postgres/.fastflowtransform/target/run_results.json b/examples/postgres/.fastflowtransform/target/run_results.json index 36067fb..07e2a05 100644 --- a/examples/postgres/.fastflowtransform/target/run_results.json +++ b/examples/postgres/.fastflowtransform/target/run_results.json @@ -1,55 +1,55 @@ { "metadata": { - "generated_at": "2025-10-30T18:29:08+00:00", + "generated_at": "2025-10-31T16:46:28+00:00", "tool": "fastflowtransform" }, "results": [ { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:08+00:00", + "finished_at": "2025-10-31T16:46:28+00:00", "http": null, "message": null, "name": "mart_orders_enriched", - "started_at": "2025-10-30T18:29:08+00:00", + "started_at": "2025-10-31T16:46:28+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:08+00:00", + "finished_at": "2025-10-31T16:46:28+00:00", "http": null, "message": null, "name": "mart_users.ff", - "started_at": "2025-10-30T18:29:08+00:00", + "started_at": "2025-10-31T16:46:28+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:08+00:00", + "finished_at": "2025-10-31T16:46:28+00:00", "http": null, "message": null, "name": "orders.ff", - "started_at": "2025-10-30T18:29:08+00:00", + "started_at": "2025-10-31T16:46:28+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:08+00:00", + "finished_at": "2025-10-31T16:46:28+00:00", "http": null, "message": null, "name": "users.ff", - "started_at": "2025-10-30T18:29:08+00:00", + "started_at": "2025-10-31T16:46:28+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:08+00:00", + "finished_at": "2025-10-31T16:46:28+00:00", "http": null, "message": null, "name": "users_enriched", - "started_at": "2025-10-30T18:29:08+00:00", + "started_at": "2025-10-31T16:46:28+00:00", "status": "success" } ], - "run_finished_at": "2025-10-30T18:29:08+00:00", - "run_started_at": "2025-10-30T18:29:08+00:00" + "run_finished_at": 
"2025-10-31T16:46:28+00:00", + "run_started_at": "2025-10-31T16:46:28+00:00" } diff --git a/examples/simple_duckdb/.fastflowtransform/target/catalog.json b/examples/simple_duckdb/.fastflowtransform/target/catalog.json index 4690161..88b0fd6 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/catalog.json +++ b/examples/simple_duckdb/.fastflowtransform/target/catalog.json @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2025-10-30T18:29:03+00:00", + "generated_at": "2025-10-31T16:46:22+00:00", "tool": "fastflowtransform" }, "relations": { diff --git a/examples/simple_duckdb/.fastflowtransform/target/manifest.json b/examples/simple_duckdb/.fastflowtransform/target/manifest.json index 0aea182..c96a216 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/manifest.json +++ b/examples/simple_duckdb/.fastflowtransform/target/manifest.json @@ -6,7 +6,7 @@ "upper_col": "models/macros/util.sql" }, "metadata": { - "generated_at": "2025-10-30T18:29:03+00:00", + "generated_at": "2025-10-31T16:46:22+00:00", "tool": "fastflowtransform" }, "nodes": { diff --git a/examples/simple_duckdb/.fastflowtransform/target/run_results.json b/examples/simple_duckdb/.fastflowtransform/target/run_results.json index f257d89..7dad55a 100644 --- a/examples/simple_duckdb/.fastflowtransform/target/run_results.json +++ b/examples/simple_duckdb/.fastflowtransform/target/run_results.json @@ -1,82 +1,82 @@ { "metadata": { - "generated_at": "2025-10-30T18:29:03+00:00", + "generated_at": "2025-10-31T16:46:22+00:00", "tool": "fastflowtransform" }, "results": [ { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "ephemeral_ids.ff", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": 
"mart_orders_enriched", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { "duration_ms": 1, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "mart_users.ff", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { "duration_ms": 4, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "orders.ff", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { "duration_ms": 1, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "users.ff", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { - "duration_ms": 0, - "finished_at": "2025-10-30T18:29:03+00:00", + "duration_ms": 1, + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "users_enriched", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { "duration_ms": 1, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "v_users.ff", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" }, { "duration_ms": 0, - "finished_at": "2025-10-30T18:29:03+00:00", + "finished_at": "2025-10-31T16:46:22+00:00", "http": null, "message": null, "name": "v_users_enriched.ff", - "started_at": "2025-10-30T18:29:03+00:00", + "started_at": "2025-10-31T16:46:22+00:00", "status": "success" } ], - "run_finished_at": "2025-10-30T18:29:03+00:00", - "run_started_at": "2025-10-30T18:29:03+00:00" + "run_finished_at": "2025-10-31T16:46:22+00:00", + 
"run_started_at": "2025-10-31T16:46:22+00:00" } diff --git a/mkdocs.yml b/mkdocs.yml index 0bc9890..df787ba 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -36,7 +36,8 @@ nav: - Examples: - Environment Matrix: examples/Environment_Matrix.md - API Demo Overview: examples/API_Demo.md - - API Demo Local Setup: examples/Local_Engine_Setup.md + - Basic Demo Overview: examples/Basic_Demo.md + - Local Engine Setup: examples/Local_Engine_Setup.md - Contributing: Contributing.md - License: License.md diff --git a/pytest.ini b/pytest.ini index 9a86424..c488100 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,6 +4,8 @@ markers = duckdb: marks tests that require DuckDB postgres: marks tests that require Postgres spark: marks tests that require Spark + bigquery: marks tests that require BigQuery + snowflake: marks tests that require Snowflake artifacts: marks tests covering artifacts generation render: marks tests for render-time helpers/templates schema: marks schema loader/validation tests diff --git a/src/fastflowtransform/cli/docgen_cmd.py b/src/fastflowtransform/cli/docgen_cmd.py index fa6588e..5e24319 100644 --- a/src/fastflowtransform/cli/docgen_cmd.py +++ b/src/fastflowtransform/cli/docgen_cmd.py @@ -1,3 +1,4 @@ +# fastflowtransform/cli/docgen_cmd.py from __future__ import annotations import json diff --git a/src/fastflowtransform/cli/seed_cmd.py b/src/fastflowtransform/cli/seed_cmd.py index 444f02e..5270e1f 100644 --- a/src/fastflowtransform/cli/seed_cmd.py +++ b/src/fastflowtransform/cli/seed_cmd.py @@ -7,22 +7,6 @@ from fastflowtransform.logging import echo from fastflowtransform.seeding import _human_int, seed_project -# def seed( -# project: ProjectArg = ".", -# env_name: EnvOpt = "dev", -# engine: EngineOpt = None, -# vars: VarsOpt = None, -# ) -> None: -# ctx = _prepare_context(project, env_name, engine, vars) -# execu, _, _ = ctx.make_executor() - -# schema: str | None = None -# if ctx.profile.engine == "postgres": -# schema = ctx.profile.postgres.db_schema - -# n = 
seed_project(ctx.project, execu, schema) -# echo(f"✓ Seeded {n} table(s)") - def seed( project: ProjectArg = ".", diff --git a/src/fastflowtransform/decorators.py b/src/fastflowtransform/decorators.py index 86345dc..1f35f54 100644 --- a/src/fastflowtransform/decorators.py +++ b/src/fastflowtransform/decorators.py @@ -58,6 +58,7 @@ def model( *, tags: Sequence[str] | None = None, kind: str = "python", + materialized: str | None = None, meta: Mapping[str, Any] | None = None, ) -> Callable[[Callable[P, R_co]], HasFFMeta[P, R_co]]: """ @@ -72,7 +73,8 @@ def model( (dep_name = logical name or physical relation). tags: Optional tags for selection (e.g. ['demo','env']). kind: Logical kind; defaults to 'python' (useful for selectors kind:python). - meta: Arbitrary metadata, e.g. {'materialized': 'table'|'view'|'incremental'}. + materialized: Shorthand for meta['materialized']; mirrors config(materialized='...'). + meta: Arbitrary metadata for executors/docs (merged with materialized if provided). """ def deco(func: Callable[P, R_co]) -> HasFFMeta[P, R_co]: @@ -92,7 +94,11 @@ def deco(func: Callable[P, R_co]) -> HasFFMeta[P, R_co]: f_any.__ff_tags__ = list(tags) if tags else [] f_any.__ff_kind__ = kind or "python" - f_any.__ff_meta__ = dict(meta) if meta else {} + + metadata = dict(meta) if meta else {} + if materialized is not None: + metadata["materialized"] = materialized + f_any.__ff_meta__ = metadata # Determine the source path (better error message if it fails) src: str | None = inspect.getsourcefile(func) diff --git a/src/fastflowtransform/executors/duckdb_exec.py b/src/fastflowtransform/executors/duckdb_exec.py index 3a97f58..d937dd7 100644 --- a/src/fastflowtransform/executors/duckdb_exec.py +++ b/src/fastflowtransform/executors/duckdb_exec.py @@ -35,6 +35,17 @@ def clone(self) -> DuckExecutor: """ return DuckExecutor(self.db_path) + def _exec_many(self, sql: str) -> None: + """ + Execute multiple SQL statements separated by ';' on the same connection. 
+ DuckDB normally accepts one statement per execute(), so we split here. + """ + # very simple splitter - good enough for what we emit in the executor + for stmt in (part.strip() for part in sql.split(";")): + if not stmt: + continue + self.con.execute(stmt) + # ---- Frame hooks ---- def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame: try: @@ -143,20 +154,25 @@ def incremental_insert(self, relation: str, select_sql: str) -> None: def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None: """ - Fallback strategy: - - Staging-CTE: data from SELECT - - Delete-Merge: delete collisions in target - - Insert all staging rows + Fallback strategy for DuckDB: + - DELETE collisions via DELETE ... USING () + We intentionally do NOT use a CTE here, because we execute two separate + statements and DuckDB won't see the CTE from the previous statement. """ - keys_pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) - # Clean inner SELECT for CTE: remove trailing semicolon and keep only SELECT body + # 1) clean inner SELECT body = self._first_select_body(select_sql).strip().rstrip(";\n\t ") - sql = f""" - with src as ({body}) - delete from {relation} t using src s where {keys_pred}; - insert into {relation} select * from src; - """ - self.con.execute(sql) + + # 2) predicate for DELETE + keys_pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE" + + # 3) first: delete collisions + delete_sql = f"delete from {relation} t using ({body}) s where {keys_pred}" + self.con.execute(delete_sql) + + # 4) then: insert fresh rows + insert_sql = f"insert into {relation} select * from ({body}) src" + self.con.execute(insert_sql) def alter_table_sync_schema( self, relation: str, select_sql: str, *, mode: str = "append_new_columns" diff --git a/src/fastflowtransform/logging.py b/src/fastflowtransform/logging.py index 16d2a42..fbe3fe7 100644 --- a/src/fastflowtransform/logging.py +++ 
b/src/fastflowtransform/logging.py @@ -15,6 +15,51 @@ import typer +# ----------------------------------------------------------------------------- +# Prefix configuration +# ----------------------------------------------------------------------------- +LOG_PREFIX = os.getenv("FFT_LOG_PREFIX", "[FFT]").strip() + + +def _prefix_enabled() -> bool: + return bool(LOG_PREFIX) + + +def _prefix_text_line(line: str) -> str: + if not _prefix_enabled(): + return line + return f"{LOG_PREFIX} {line}" + + +def _prefix_text_block(text: str) -> str: + if not _prefix_enabled() or not text: + return text + lines = text.splitlines(keepends=True) + if not lines: + return text + prefixed: list[str] = [] + for line in lines: + if line.strip(): + prefixed.append(_prefix_text_line(line)) + else: + prefixed.append(line) + return "".join(prefixed) + + +def _prefix_format(fmt: str) -> str: + if not _prefix_enabled(): + return fmt + return f"{LOG_PREFIX} {fmt}" + + +def _apply_prefix(message: Any) -> Any: + if not _prefix_enabled() or message is None: + return message + if isinstance(message, str): + return _prefix_text_block(message) + return _prefix_text_line(str(message)) + + # ----------------------------------------------------------------------------- # Context (enriched into log records) and runtime flags # ----------------------------------------------------------------------------- @@ -116,7 +161,7 @@ class _ConsoleFormatter(_logging.Formatter): _date = "%Y-%m-%d %H:%M:%S" def __init__(self) -> None: - super().__init__(self._fmt, self._date) + super().__init__(_prefix_format(self._fmt), self._date) class _JsonFormatter(_logging.Formatter): @@ -231,30 +276,32 @@ def get_logger(name: str | None = None) -> _logging.Logger: return _logging.getLogger(base if not name else f"{base}.{name}") -def echo(*args: Any, **kwargs: Any) -> None: +def echo(message: Any = "", *, prefix: bool = True, **kwargs: Any) -> None: """ - Thin passthrough to typer.echo(...). 
+ Thin wrapper around typer.echo(...) that prepends the global log prefix. Usage: echo("hello") echo("to stderr", err=True) echo("no newline", nl=False) echo("colored", color=True) + echo("raw message", prefix=False) # skip prefix if needed """ - typer.echo(*args, **kwargs) + msg = _apply_prefix(message) if prefix else message + typer.echo(msg, **kwargs) -def echo_debug(*args: Any, **kwargs: Any) -> None: +def echo_debug(message: Any = "", *, prefix: bool = True, **kwargs: Any) -> None: """ - Like typer.echo(...), but only emits when `fastflowtransform` logger is in DEBUG. + Like echo(...), but only emits when `fastflowtransform` logger is in DEBUG. Usage: echo_debug("SQL preview:", sql_text) echo_debug("to stderr only in debug", err=True) """ - logger = _logging.getLogger("fastflowtransform") + logger = get_logger() if logger.isEnabledFor(_logging.DEBUG): - typer.echo(*args, **kwargs) + echo(message, prefix=prefix, **kwargs) def info(msg: str, *args: Any, **kwargs: Any) -> None: @@ -281,7 +328,9 @@ def dprint(*parts: Any) -> None: Only prints when general debug is enabled. 
""" if is_debug_enabled(): - print("[DBG]", *parts, file=sys.stdout) + body = " ".join(str(p) for p in parts) if parts else "" + msg = "[DBG]" if not body else f"[DBG] {body}" + print(_prefix_text_line(msg), file=sys.stdout) def sql_debug(msg: str, *args: Any, **kwargs: Any) -> None: diff --git a/tests/common/mock/bigquery.py b/tests/common/mock/bigquery.py new file mode 100644 index 0000000..b27bdb3 --- /dev/null +++ b/tests/common/mock/bigquery.py @@ -0,0 +1,199 @@ +# tests/helpers/fake_bigquery.py +from __future__ import annotations + +import sys +import types +from types import SimpleNamespace +from typing import Any + +import pandas as pd + +# --------------------------------------------------------------------------- +# Fake types +# --------------------------------------------------------------------------- + + +class FakeField: + def __init__(self, name: str, field_type: str = "STRING"): + self.name = name + self.field_type = field_type + + +class FakeDFResult: + """ + Wird vom pandas-Executor gebraucht: job.result().to_dataframe(...) + Wir bauen ein minimales pandas-DataFrame daraus. 
+ """ + + def __init__(self, rows: list[tuple] | None = None, schema: list[FakeField] | None = None): + self._rows = rows or [] + self._schema = schema or [] + + def to_dataframe(self, create_bqstorage_client: bool = True): + if not self._rows: + return pd.DataFrame([]) + cols = [f.name for f in self._schema] if self._schema else [] + data = [dict(zip(cols, r, strict=False)) for r in self._rows] + return pd.DataFrame(data) + + +class FakeJob: + def __init__(self, rows: list[Any] | None = None, schema: list[FakeField] | None = None): + self._rows = rows or [] + self._schema = schema or [] + + def result(self): + return FakeDFResult(self._rows, self._schema) + + @property + def schema(self): + return self._schema + + +class FakeQueryJobConfig: + def __init__(self, **kwargs: Any): + self.kwargs = kwargs + + +class FakeScalarQueryParameter: + def __init__(self, name: str, typ: str, val: Any): + self.name = name + self.type_ = typ + self.value = val + + +class FakeDataset: + def __init__(self, dataset_id: str): + self.dataset_id = dataset_id + self.location: str | None = None + + +class FakeBadRequest(Exception): + pass + + +class FakeNotFound(Exception): + pass + + +class FakeClient: + """ + Gemeinsamer Client für beide Executor-Tests. + Kann: + - query(...) + - list_tables(...) + - get_table(...) + - get_dataset(...) + - create_dataset(...) + - load_table_from_dataframe(...) 
(für pandas-Executor) + und hat: + - _datasets: set[str] + - _tables: dict[str, list[Any]] + """ + + def __init__(self, project: str, location: str | None = None): + self.project = project + self.location = location + self.queries: list[tuple[str, str | None, Any | None]] = [] + self._datasets: set[str] = set() + self._tables: dict[str, list[Any]] = {} + + # ---- Test helper ---- + def add_dataset(self, ds_id: str) -> None: + self._datasets.add(ds_id) + + def add_table(self, dataset_id: str, table_id: str) -> None: + self._tables.setdefault(dataset_id, []).append(SimpleNamespace(table_id=table_id)) + + # ---- Emulator methods ---- + def query(self, sql: str, location: str | None = None, job_config: Any | None = None): + self.queries.append((sql, location, job_config)) + + # INFORMATION_SCHEMA → 1 Row back + if "INFORMATION_SCHEMA.TABLES" in sql or "INFORMATION_SCHEMA.VIEWS" in sql: + return FakeJob(rows=[(1,)]) + + # Probe-Query (SELECT ... WHERE 1=0) → Schema back + if "WHERE 1=0" in sql: + return FakeJob(schema=[FakeField("id"), FakeField("new_col", "INT64")]) + + # ALTER TABLE ... ADD COLUMN ... 
+ if sql.lstrip().upper().startswith("ALTER TABLE"): + return FakeJob() + + # everything else → empty return + return FakeJob() + + def list_tables(self, dataset_id: str): + return self._tables.get(dataset_id, []) + + def get_table(self, table_ref: str): + if table_ref.endswith(".existing"): + return SimpleNamespace(schema=[FakeField("id")]) + + ds = ".".join(table_ref.split(".")[:2]) + name = table_ref.split(".")[-1] + for t in self._tables.get(ds, []): + if t.table_id == name: + return SimpleNamespace(schema=[FakeField("id")]) + + raise FakeNotFound(f"table {table_ref} not found") + + def get_dataset(self, ds_id: str): + if ds_id not in self._datasets: + raise FakeNotFound(f"dataset {ds_id} not found") + return FakeDataset(ds_id) + + def create_dataset(self, ds_obj: Any): + ds_id = getattr(ds_obj, "dataset_id", ds_obj) + self._datasets.add(ds_id) + ds = FakeDataset(ds_id) + ds.location = getattr(ds_obj, "location", None) + return ds + + def load_table_from_dataframe(self, df, table_id: str, job_config: Any, location: str | None): + return FakeJob() + + +class FakeWriteDisposition: + WRITE_TRUNCATE = "WRITE_TRUNCATE" + + +# --------------------------------------------------------------------------- +# Factory +# --------------------------------------------------------------------------- + + +def make_fake_bigquery_module() -> types.ModuleType: + mod = types.ModuleType("google.cloud.bigquery") + mod.Client = FakeClient # type: ignore[attr-defined] + mod.QueryJobConfig = FakeQueryJobConfig # type: ignore[attr-defined] + mod.ScalarQueryParameter = FakeScalarQueryParameter # type: ignore[attr-defined] + mod.Dataset = FakeDataset # type: ignore[attr-defined] + mod.BadRequest = FakeBadRequest # type: ignore[attr-defined] + mod.NotFound = FakeNotFound # type: ignore[attr-defined] + mod.WriteDisposition = FakeWriteDisposition # type: ignore[attr-defined] + return mod + + +# --------------------------------------------------------------------------- +# Helper fixture +# 
--------------------------------------------------------------------------- + + +def install_fake_bigquery(monkeypatch, target_modules: list[types.ModuleType]) -> types.ModuleType: + """ + Installiert unser Fake-bigquery sowohl in sys.modules als auch in den angegebenen + Zielmodulen (per monkeypatch.setattr(mod, "bigquery", ...)). + Gibt das Fake-Modul zurück. + """ + fake_bq = make_fake_bigquery_module() + + gc_mod = sys.modules.setdefault("google.cloud", types.ModuleType("google.cloud")) + gc_mod.bigquery = fake_bq # type: ignore[attr-defined] + sys.modules["google.cloud.bigquery"] = fake_bq + + for m in target_modules: + monkeypatch.setattr(m, "bigquery", fake_bq, raising=True) + + return fake_bq diff --git a/tests/common/mock/profiles.py b/tests/common/mock/profiles.py new file mode 100644 index 0000000..06de95c --- /dev/null +++ b/tests/common/mock/profiles.py @@ -0,0 +1,150 @@ +# tests/common/mock/profiles.py +from __future__ import annotations + +from types import SimpleNamespace +from typing import cast + +# wir brauchen nur den Typ für das cast +from fastflowtransform.settings import ( + Profile, +) + + +def fake_bigquery_profile( + *, + project: str = "p1", + dataset: str = "ds1", + location: str | None = "EU", + use_bigframes: bool = False, +) -> Profile: + """ + Return a shape-compatible fake of a BigQuery profile. + + Only attributes that fastflowtransform.cli.bootstrap._validate_profile_params() + and _make_executor() actually read are provided. 
+ """ + ns = SimpleNamespace( + # top-level field that bootstrap branches on + engine="bigquery", + # nested bigquery section + bigquery=SimpleNamespace( + project=project, + dataset=dataset, + location=location, + use_bigframes=use_bigframes, + ), + ) + # tell the type checker: "this is good enough to be treated as Profile" + return cast(Profile, ns) + + +def fake_duckdb_profile( + *, + path: str = ":memory:", +) -> Profile: + """ + Fake DuckDB profile - just enough for _validate_profile_params and _make_executor. + """ + ns = SimpleNamespace( + engine="duckdb", + duckdb=SimpleNamespace( + path=path, + ), + ) + return cast(Profile, ns) + + +def fake_postgres_profile( + *, + dsn: str = "postgres://user:pass@localhost:5432/db", + schema: str = "public", +) -> Profile: + """ + Fake Postgres profile - fields match what bootstrap checks: + - postgres.dsn + - postgres.db_schema + """ + ns = SimpleNamespace( + engine="postgres", + postgres=SimpleNamespace( + dsn=dsn, + db_schema=schema, + ), + ) + return cast(Profile, ns) + + +def fake_databricks_spark_profile( + *, + master: str = "local[*]", + app_name: str = "ff-test", + extra_conf: dict[str, str] | None = None, + warehouse_dir: str | None = None, + use_hive_metastore: bool = False, + catalog: str | None = None, + database: str | None = None, + table_format: str | None = None, + table_options: dict[str, str] | None = None, +) -> Profile: + """ + Fake Databricks/Spark profile - mirrors the attribute names the real code expects. 
+ """ + ns = SimpleNamespace( + engine="databricks_spark", + databricks_spark=SimpleNamespace( + master=master, + app_name=app_name, + extra_conf=extra_conf or {}, + warehouse_dir=warehouse_dir, + use_hive_metastore=use_hive_metastore, + catalog=catalog, + database=database, + table_format=table_format, + table_options=table_options or {}, + ), + ) + return cast(Profile, ns) + + +def fake_snowflake_snowpark_profile( + *, + account: str = "acc", + user: str = "user", + password: str = "pass", + warehouse: str = "wh", + database: str = "db", + db_schema: str = "PUBLIC", + role: str | None = None, +) -> Profile: + """ + Fake Snowflake Snowpark profile - includes all required fields the bootstrap validates. + """ + ns = SimpleNamespace( + engine="snowflake_snowpark", + snowflake_snowpark=SimpleNamespace( + account=account, + user=user, + password=password, + warehouse=warehouse, + database=database, + db_schema=db_schema, + role=role, + ), + ) + return cast(Profile, ns) + + +def make_fake_profile(engine: str = "bigquery") -> Profile: + """ + Convenience factory for tests that don't care about the exact engine. 
+ """ + if engine == "duckdb": + return fake_duckdb_profile() + if engine == "postgres": + return fake_postgres_profile() + if engine == "databricks_spark": + return fake_databricks_spark_profile() + if engine == "snowflake_snowpark": + return fake_snowflake_snowpark_profile() + # default → bigquery + return fake_bigquery_profile() diff --git a/tests/executors/duckdb/test_python_model_materialized_view.py b/tests/executors/duckdb/test_python_model_materialized_view.py index c4f53a7..35b6e24 100644 --- a/tests/executors/duckdb/test_python_model_materialized_view.py +++ b/tests/executors/duckdb/test_python_model_materialized_view.py @@ -23,7 +23,7 @@ def test_python_model_materialized_as_view(tmp_path: Path, monkeypatch): from fastflowtransform.decorators import model import pandas as pd -@model(name="py_users.ff", deps=["base.ff"]) +@model(name="py_users.ff", deps=["base.ff"], materialized="view") def build(df: pd.DataFrame) -> pd.DataFrame: out = df.copy() out["is_gmail"] = out["email"].str.endswith("@gmail.com") @@ -38,13 +38,8 @@ def build(df: pd.DataFrame) -> pd.DataFrame: encoding="utf-8", ) - # Also place a config block on the Python view in a tiny shim SQL file: - # We attach materialized='view' via a separate top directive model that won't be built; - # alternatively, set REGISTRY.nodes['py_users.ff'].meta manually after load. - # For simplicity here, we modify meta after load. 
REGISTRY.load_project(tmp_path) - # mark the python node as view - REGISTRY.nodes["py_users.ff"].meta["materialized"] = "view" + assert REGISTRY.nodes["py_users.ff"].meta["materialized"] == "view" env = REGISTRY.env assert env is not None diff --git a/tests/unit/cli/test_bootstrap_unit.py b/tests/unit/cli/test_bootstrap_unit.py new file mode 100644 index 0000000..73b7181 --- /dev/null +++ b/tests/unit/cli/test_bootstrap_unit.py @@ -0,0 +1,260 @@ +# tests/unit/cli/test_bootstrap_unit.py +from __future__ import annotations + +import datetime +from pathlib import Path +from types import SimpleNamespace +from typing import cast + +import pytest +from jinja2 import Environment +from tests.common.mock.profiles import ( + fake_bigquery_profile, + fake_databricks_spark_profile, + fake_duckdb_profile, + fake_postgres_profile, + fake_snowflake_snowpark_profile, +) + +from fastflowtransform.cli import bootstrap +from fastflowtransform.settings import ( + SnowflakeSnowparkProfile, +) + +# --------------------------------------------------------------------------- +# helpers +# --------------------------------------------------------------------------- + + +class _FakeRegistry: + """Tiny fake of fastflowtransform.core.REGISTRY for tests.""" + + def __init__(self) -> None: + self._active_engine: str | None = None + self._cli_vars: dict[str, object] = {} + self.env: Environment = Environment() + + def set_active_engine(self, name: str | None) -> None: + self._active_engine = name + + def set_cli_vars(self, d: dict[str, object]) -> None: + self._cli_vars = d + + def load_project(self, p: Path) -> None: + # in unit tests we do nothing - real REGISTRY would scan models + return + + +# --------------------------------------------------------------------------- +# _validate_profile_params +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_validate_profile_params_bigquery_ok(): + prof = 
fake_bigquery_profile(dataset="analytics") + # should not raise + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_bigquery_missing_dataset(): + prof = fake_bigquery_profile(dataset="") + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_duckdb_ok(): + prof = fake_duckdb_profile(path=":memory:") + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_duckdb_missing_path(): + prof = fake_duckdb_profile(path="") + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_postgres_ok(): + prof = fake_postgres_profile(dsn="postgres://...", schema="public") + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_postgres_missing_dsn(): + prof = fake_postgres_profile(dsn="", schema="public") + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_postgres_missing_schema(): + prof = fake_postgres_profile(dsn="postgres://...", schema="") + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_databricks_ok(): + prof = fake_databricks_spark_profile(master="local[*]", app_name="fft-test") + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_databricks_missing_master(): + prof = fake_databricks_spark_profile(master="", app_name="fft-test") + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_snowflake_ok(): + prof = fake_snowflake_snowpark_profile() + 
bootstrap._validate_profile_params("dev", prof) + + +@pytest.mark.unit +def test_validate_profile_params_snowflake_missing_required(): + prof = fake_snowflake_snowpark_profile() + sf_prof = cast(SnowflakeSnowparkProfile, prof) + # break one required field + sf_prof.snowflake_snowpark.account = "" + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._validate_profile_params("dev", sf_prof) + + +# --------------------------------------------------------------------------- +# _parse_cli_vars +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_parse_cli_vars_parses_yaml_values(): + # note: the quotes around [a,b] make YAML return a *string*, not a list + out = bootstrap._parse_cli_vars(["day=2025-10-01", "limit=5", "enabled=true", "tags='[a,b]'"]) + + # day: yaml parses ISO dates as datetime.date + day_val = out["day"] + if isinstance(day_val, datetime.date): + assert day_val == datetime.date(2025, 10, 1) + else: + # fallback, in case the YAML loader changes + assert day_val == "2025-10-01" + + # numeric + assert out["limit"] == 5 + + # bool + assert out["enabled"] is True + + # because of the extra quotes, YAML keeps this as a plain string + assert out["tags"] == "[a,b]" + + +@pytest.mark.unit +def test_parse_cli_vars_raises_on_missing_equal(): + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._parse_cli_vars(["justkey"]) + + +# --------------------------------------------------------------------------- +# _make_executor +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_make_executor_bigquery_uses_correct_executor(monkeypatch): + # we don't want to hit the real bigquery imports + # so we just monkeypatch the executors the bootstrap imports + class _FakeBQExec: + def __init__(self, *a, **k): + self.args = a + self.kw = k + + def run_python(self, *a, **k): + pass + + # patch BOTH BF and normal - code branches on use_bigframes 
+ monkeypatch.setattr(bootstrap, "BigQueryExecutor", _FakeBQExec, raising=True) + monkeypatch.setattr(bootstrap, "BigQueryBFExecutor", _FakeBQExec, raising=True) + + prof = fake_bigquery_profile(use_bigframes=False) + jenv = Environment() + + ex, run_fn, py_fn = bootstrap._make_executor(prof, jenv) + + assert isinstance(ex, _FakeBQExec) + assert callable(run_fn) + assert callable(py_fn) + + +@pytest.mark.unit +def test_make_executor_duckdb(monkeypatch, tmp_path: Path): + class _FakeDuckExec: + def __init__(self, db_path: str): + self.db_path = db_path + + def run_python(self, *a, **k): + pass + + monkeypatch.setattr(bootstrap, "DuckExecutor", _FakeDuckExec, raising=True) + + prof = fake_duckdb_profile(path=str(tmp_path / "test.duckdb")) + jenv = Environment() + + ex, run_fn, py_fn = bootstrap._make_executor(prof, jenv) + assert isinstance(ex, _FakeDuckExec) + assert ex.db_path.endswith("test.duckdb") + assert callable(run_fn) + assert callable(py_fn) + + +# --------------------------------------------------------------------------- +# _resolve_project_path - minimal check with temp dir +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_resolve_project_path_happy(tmp_path: Path): + # create models/ folder so bootstrap accepts the dir + (tmp_path / "models").mkdir() + p = bootstrap._resolve_project_path(str(tmp_path)) + assert p == tmp_path.resolve() + + +@pytest.mark.unit +def test_resolve_project_path_missing_models(tmp_path: Path): + # no models/ → should raise + with pytest.raises(bootstrap.typer.BadParameter): + bootstrap._resolve_project_path(str(tmp_path)) + + +# --------------------------------------------------------------------------- +# _get_test_con - just smoke test +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_get_test_con_prefers_executor_con(): + class ExecWithCon: + def __init__(self): + self.con = 
SimpleNamespace(execute=lambda *_: "ok") + + ex = ExecWithCon() + con = bootstrap._get_test_con(ex) + assert con.execute("SELECT 1") == "ok" + + +@pytest.mark.unit +def test_get_test_con_falls_back_to_executor(): + class ExecSimple: + def run(self): + return "ran" + + ex = ExecSimple() + con = bootstrap._get_test_con(ex) + # we just get the executor back + assert con is ex diff --git a/tests/unit/cli/test_docgen_cmd_unit.py b/tests/unit/cli/test_docgen_cmd_unit.py new file mode 100644 index 0000000..cf03556 --- /dev/null +++ b/tests/unit/cli/test_docgen_cmd_unit.py @@ -0,0 +1,143 @@ +# tests/unit/cli/test_docgen_cmd_unit.py +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest +import typer + +import fastflowtransform.cli.docgen_cmd as docgen_mod + + +@pytest.fixture(autouse=True) +def _patch_registry(monkeypatch): + """Keep REGISTRY small and predictable for all tests in this module.""" + fake_registry = SimpleNamespace(nodes={"model_a": object()}) + monkeypatch.setattr(docgen_mod, "REGISTRY", fake_registry, raising=True) + + +@pytest.fixture +def fake_ctx(tmp_path: Path): + """Return a fake context that looks like _prepare_context output.""" + fake_executor = SimpleNamespace(name="fake-exec") + + class FakeCtx: + def __init__(self, project: Path): + self.project = project + self.profile = SimpleNamespace(engine="duckdb") + + def make_executor(self): + return (fake_executor,) + + return FakeCtx(tmp_path / "proj") + + +def test_docgen_basic_writes_html_dir(monkeypatch, tmp_path, fake_ctx): + monkeypatch.setattr(docgen_mod, "_prepare_context", lambda *a, **k: fake_ctx, raising=True) + + dag_dir = tmp_path / "site" / "dag" + monkeypatch.setattr( + docgen_mod, + "_resolve_dag_out_dir", + lambda project, override: dag_dir, + raising=True, + ) + + called: dict[str, Any] = {} + + def fake_render(out_dir, nodes, executor=None): + called["render_out_dir"] = out_dir + called["nodes"] = 
nodes + called["executor"] = executor + + monkeypatch.setattr(docgen_mod, "render_site", fake_render, raising=True) + monkeypatch.setattr(docgen_mod, "echo", lambda *_: None, raising=True) + monkeypatch.setattr(docgen_mod, "echo_debug", lambda *_: None, raising=True) + + docgen_mod.docgen(project=".", env_name="dev") + + assert dag_dir.exists() + assert called["render_out_dir"] == dag_dir + assert isinstance(called["nodes"], dict) + assert called["executor"].name == "fake-exec" + + +def test_docgen_emits_json_when_option_set(monkeypatch, tmp_path, fake_ctx): + monkeypatch.setattr(docgen_mod, "_prepare_context", lambda *a, **k: fake_ctx, raising=True) + + out_dir = tmp_path / "site" / "dag" + monkeypatch.setattr( + docgen_mod, + "_resolve_dag_out_dir", + lambda project, override: out_dir, + raising=True, + ) + + monkeypatch.setattr(docgen_mod, "render_site", lambda *a, **k: None, raising=True) + monkeypatch.setattr(docgen_mod, "echo", lambda *_: None, raising=True) + monkeypatch.setattr(docgen_mod, "echo_debug", lambda *_: None, raising=True) + + monkeypatch.setattr( + docgen_mod, + "_build_docs_manifest", + lambda project_dir, nodes, executor, env_name: { + "project": "fake", + "models": [], + "generated_at": "2025-01-01T00:00:00Z", + }, + raising=True, + ) + + json_path = tmp_path / "docs.json" + + docgen_mod.docgen(project=".", env_name="dev", emit_json=json_path) + + assert json_path.exists() + txt = json_path.read_text(encoding="utf-8") + assert '"project": "fake"' in txt + + +def test_docgen_open_source_opens_browser(monkeypatch, tmp_path, fake_ctx): + monkeypatch.setattr(docgen_mod, "_prepare_context", lambda *a, **k: fake_ctx, raising=True) + dag_dir = tmp_path / "site" / "dag" + monkeypatch.setattr( + docgen_mod, + "_resolve_dag_out_dir", + lambda project, override: dag_dir, + raising=True, + ) + monkeypatch.setattr(docgen_mod, "render_site", lambda *a, **k: None, raising=True) + monkeypatch.setattr(docgen_mod, "echo", lambda *_: None, raising=True) + 
monkeypatch.setattr(docgen_mod, "echo_debug", lambda *_: None, raising=True) + + opened: dict[str, Any] = {} + + def fake_open(url, new=0): + opened["url"] = url + opened["new"] = new + return True + + monkeypatch.setattr(docgen_mod.webbrowser, "open", fake_open, raising=True) + + docgen_mod.docgen(project=".", env_name="dev", open_source=True) + + assert "index.html" in opened["url"] + assert opened["new"] == 2 + + +def test_register_adds_command(): + """ + Typer sometimes sets Command.name to None for decorator-style registration. + So we assert on the callback instead of the name. + """ + app = typer.Typer() + docgen_mod.register(app) + + cmds = app.registered_commands + assert len(cmds) == 1 + + cmd = cmds[0] + # Typer stores the real function on cmd.callback + assert cmd.callback is docgen_mod.docgen diff --git a/tests/unit/cli/test_seed_cmd_unit.py b/tests/unit/cli/test_seed_cmd_unit.py new file mode 100644 index 0000000..b94ecba --- /dev/null +++ b/tests/unit/cli/test_seed_cmd_unit.py @@ -0,0 +1,163 @@ +# tests/unit/cli/test_seed_cmd_unit.py +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest +import typer + +import fastflowtransform.cli.seed_cmd as seed_mod + + +@pytest.mark.unit +def test_seed_happy_path(monkeypatch, tmp_path: Path): + """ + seed(...) should: + - prepare context + - call ctx.make_executor() + - call seed_project(...) 
with project, executor, and default_schema (None for non-postgres) + - echo a humanized message + """ + called = { + "prepare_ctx": None, + "make_exec": 0, + "seed_project": None, + "echo": None, + } + + # fake executor + fake_executor = object() + + # fake context returned by _prepare_context + fake_ctx = SimpleNamespace( + project=tmp_path, + profile=SimpleNamespace(engine="duckdb"), # not postgres → default_schema=None + make_executor=lambda: ( + called.__setitem__("make_exec", called["make_exec"] + 1) or (fake_executor, None, None) + ), + ) + + # patch _prepare_context + monkeypatch.setattr( + seed_mod, + "_prepare_context", + lambda project, env_name, engine, vars: called.__setitem__( + "prepare_ctx", (project, env_name, engine, vars) + ) + or fake_ctx, + raising=True, + ) + + # patch seed_project + def fake_seed_project(project, executor, default_schema): + called["seed_project"] = (project, executor, default_schema) + return 3 # pretend we seeded 3 tables + + monkeypatch.setattr(seed_mod, "seed_project", fake_seed_project, raising=True) + + # patch echo + monkeypatch.setattr( + seed_mod, + "echo", + lambda msg: called.__setitem__("echo", msg), + raising=True, + ) + + # ACT + seed_mod.seed(project=".", env_name="dev", engine=None, vars=None) + + # ASSERT + # 1) context got correct params + assert called["prepare_ctx"] == (".", "dev", None, None) + # 2) executor was created + assert called["make_exec"] == 1 + # 3) seed_project was called with project + executor + None (because duckdb) + proj_arg, exec_arg, schema_arg = called["seed_project"] + assert proj_arg == tmp_path + assert exec_arg is fake_executor + assert schema_arg is None + # 4) message was printed, humanized + assert "3 table(s)" in called["echo"] + + +@pytest.mark.unit +def test_seed_uses_postgres_schema_if_profile_is_postgres(monkeypatch, tmp_path: Path): + """ + If profile.engine == 'postgres', seed(...) should pass profile.postgres.db_schema + as default_schema into seed_project(...). 
+ """ + called = {} + + fake_executor = object() + + fake_ctx = SimpleNamespace( + project=tmp_path, + profile=SimpleNamespace( + engine="postgres", + postgres=SimpleNamespace(db_schema="public"), + ), + make_executor=lambda: (fake_executor, None, None), + ) + + monkeypatch.setattr(seed_mod, "_prepare_context", lambda *a, **k: fake_ctx, raising=True) + + def fake_seed_project(project, executor, default_schema): + called["args"] = (project, executor, default_schema) + return 1 + + monkeypatch.setattr(seed_mod, "seed_project", fake_seed_project, raising=True) + monkeypatch.setattr(seed_mod, "echo", lambda *_: None, raising=True) + + # ACT + seed_mod.seed(project=".", env_name="dev") + + # ASSERT + proj_arg, exec_arg, schema_arg = called["args"] + assert proj_arg == tmp_path + assert exec_arg is fake_executor + assert schema_arg == "public" # <-- this is the important part + + +@pytest.mark.unit +def test_seed_passes_through_cli_vars(monkeypatch, tmp_path: Path): + """ + We just want to be sure that whatever --vars / vars=... arrives + is forwarded to _prepare_context. 
+ """ + captured = {} + + fake_ctx = SimpleNamespace( + project=tmp_path, + profile=SimpleNamespace(engine="duckdb"), + make_executor=lambda: (object(), None, None), + ) + + def fake_prepare(project, env_name, engine, vars): + captured["prepare"] = (project, env_name, engine, vars) + return fake_ctx + + monkeypatch.setattr(seed_mod, "_prepare_context", fake_prepare, raising=True) + monkeypatch.setattr(seed_mod, "seed_project", lambda *_: 0, raising=True) + monkeypatch.setattr(seed_mod, "echo", lambda *_: None, raising=True) + + # ACT + seed_mod.seed(project=".", env_name="dev", vars=["day=2025-10-01", "limit=5"]) + + # ASSERT + assert captured["prepare"] == (".", "dev", None, ["day=2025-10-01", "limit=5"]) + + +@pytest.mark.unit +def test_register_adds_command(): + app = typer.Typer() + seed_mod.register(app) + + names: set[str] = set() + for cmd in app.registered_commands: + if cmd.name: + names.add(cmd.name) + elif cmd.callback is not None: + names.add(cmd.callback.__name__) + + assert "seed" in names diff --git a/tests/unit/test_docs_materialization_badges.py b/tests/unit/docs/test_docs_materialization_badges.py similarity index 100% rename from tests/unit/test_docs_materialization_badges.py rename to tests/unit/docs/test_docs_materialization_badges.py diff --git a/tests/unit/test_docs_merge.py b/tests/unit/docs/test_docs_merge.py similarity index 100% rename from tests/unit/test_docs_merge.py rename to tests/unit/docs/test_docs_merge.py diff --git a/tests/unit/docs/test_docs_unit.py b/tests/unit/docs/test_docs_unit.py new file mode 100644 index 0000000..1ad9cff --- /dev/null +++ b/tests/unit/docs/test_docs_unit.py @@ -0,0 +1,648 @@ +# tests/unit/docs/test_docs_unit.py +from __future__ import annotations + +import textwrap +from pathlib import Path +from types import SimpleNamespace +from typing import Any, cast + +import pytest + +import fastflowtransform.docs as docs_mod +from fastflowtransform.core import Node + +# 
--------------------------------------------------------------------------- +# Helper fakes +# --------------------------------------------------------------------------- + + +class _FakeTemplate: + def __init__(self, content: str): + self._content = content + + def render(self, **ctx: Any) -> str: + # handle exactly the two patterns we need in the tests + if "{m.name}" in self._content and "m" in ctx: + m = ctx["m"] + return f"MODEL {getattr(m, 'name', 'UNKNOWN')}" + if "{mermaid_src}" in self._content and "mermaid_src" in ctx: + return f"INDEX {ctx['mermaid_src']}" + # fallback: just dump keys + return self._content + + +class _FakeEnv: + def __init__(self): + # name -> template + self._tmpls: dict[str, _FakeTemplate] = { + "index.html.j2": _FakeTemplate("INDEX {mermaid_src}"), + "model.html.j2": _FakeTemplate("MODEL {m.name}"), + } + + def get_template(self, name: str) -> _FakeTemplate: + return self._tmpls[name] + + +# --------------------------------------------------------------------------- +# Unit tests for small helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_safe_filename_keeps_dots_and_slashes_as_underscore(): + assert docs_mod._safe_filename("abc.sql") == "abc.sql" + assert docs_mod._safe_filename("a b") == "a_b" + # exotic chars + assert docs_mod._safe_filename("äöü") == "___" + # empty → fallback + assert docs_mod._safe_filename("") == "_model" + + +@pytest.mark.unit +def test_render_minimarkdown_inline_code_and_links(): + md = textwrap.dedent( + """ + Some text with `code` and a [link](https://example.com). + + Second paragraph. + """ + ).strip() + html = docs_mod._render_minimarkdown(md) + assert "code" in html + assert 'Some text" in html + assert "

Second paragraph." in html + + +@pytest.mark.unit +def test_strip_html_removes_tags_and_collapses_ws(): + txt = "

Hello World

\n X " + assert docs_mod._strip_html(txt) == "Hello World X" + + +@pytest.mark.unit +def test_read_markdown_file_with_front_matter(tmp_path: Path): + p = tmp_path / "doc.md" + p.write_text( + "---\ntitle: Hello\n---\nBody text\n", + encoding="utf-8", + ) + fm, body = docs_mod._read_markdown_file(p) + assert fm == {"title": "Hello"} + assert body.strip() == "Body text" + + +@pytest.mark.unit +def test_read_markdown_file_no_front_matter(tmp_path: Path): + p = tmp_path / "doc.md" + p.write_text("Plain body", encoding="utf-8") + fm, body = docs_mod._read_markdown_file(p) + assert fm == {} + assert body == "Plain body" + + +@pytest.mark.unit +def test_reverse_deps_builds_inverse_graph(): + nodes = { + "a": SimpleNamespace(name="a", deps=["b", "c"]), + "b": SimpleNamespace(name="b", deps=["c"]), + "c": SimpleNamespace(name="c", deps=[]), + } + rev = docs_mod._reverse_deps(nodes) # type: ignore[arg-type] + # c is used by a and b + assert rev["c"] == ["a", "b"] + # b is used by a + assert rev["b"] == ["a"] + # a is not used by anyone + assert rev["a"] == [] + + +@pytest.mark.unit +def test_materialization_legend_has_incremental(): + legend = docs_mod._materialization_legend() + assert "incremental" in legend + assert legend["incremental"]["label"] == "incremental" + + +# --------------------------------------------------------------------------- +# read_docs_metadata +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_read_docs_metadata_merges_yaml_and_markdown(tmp_path: Path): + # project.yml with docs + (tmp_path / "project.yml").write_text( + textwrap.dedent( + """ + docs: + models: + my_model: + description: "YAML desc" + columns: + col1: "YAML col1" + lineage: + col1: + from: + - table: src.t + column: x + transformed: true + """ + ), + encoding="utf-8", + ) + + # Markdown model override (should win over YAML description) + md_dir = tmp_path / "docs" / "models" + md_dir.mkdir(parents=True) + (md_dir / 
"my_model.md").write_text("Markdown desc", encoding="utf-8") + + # Markdown column override + col_dir = tmp_path / "docs" / "columns" / "my_model" + col_dir.mkdir(parents=True) + (col_dir / "col1.md").write_text("MD col1", encoding="utf-8") + + meta = docs_mod.read_docs_metadata(tmp_path) + + # model present + assert "my_model" in meta["models"] + # description_html from MD wins + assert "Markdown desc" in meta["models"]["my_model"]["description_html"] + # YAML lineage preserved + assert "lineage" in meta["models"]["my_model"] + # column override present + assert meta["columns"]["my_model"]["col1"].startswith("

MD col1") + + +# --------------------------------------------------------------------------- +# _apply_descriptions_to_models +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_apply_descriptions_to_models_applies_short_and_column_desc(): + models = [ + docs_mod.ModelDoc( + name="m1", + kind="sql", + path="models/m1.sql", + relation="db.sc.m1", + deps=[], + materialized="table", + ) + ] + docs_meta = { + "models": { + "m1": { + "description_html": "

Hello world

", + "columns": {"col1": "

Col 1

"}, + } + }, + "columns": { + "db.sc.m1": { + "col2": "

Col 2

", + } + }, + } + cols_by_table = { + "db.sc.m1": [ + docs_mod.ColumnInfo("col1", "STRING", True), + docs_mod.ColumnInfo("col2", "INT", False), + ] + } + + docs_mod._apply_descriptions_to_models(models, docs_meta, cols_by_table, with_schema=True) + + assert models[0].description_html == "

Hello world

" + assert models[0].description_short == "Hello world" + # column 1 desc from model-level + assert cols_by_table["db.sc.m1"][0].description_html == "

Col 1

" + # column 2 desc from relation-level + assert cols_by_table["db.sc.m1"][1].description_html == "

Col 2

" + + +# --------------------------------------------------------------------------- +# render_site (with patched jinja + registry) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_render_site_writes_index_and_model_pages(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + fake_nodes_raw = { + "model_a": SimpleNamespace( + name="model_a", + kind="sql", + path=tmp_path / "models" / "model_a.sql", + deps=["model_b"], + meta={"materialized": "view"}, + ), + "model_b": SimpleNamespace( + name="model_b", + kind="python", + path=tmp_path / "models" / "model_b.py", + deps=[], + meta={}, + ), + } + + monkeypatch.setattr( + docs_mod, + "REGISTRY", + SimpleNamespace( + nodes=fake_nodes_raw, + macros={}, + get_project_dir=lambda: tmp_path, + ), + raising=True, + ) + + monkeypatch.setattr(docs_mod, "_init_jinja", lambda: _FakeEnv(), raising=True) + fake_nodes = cast(dict[str, Node], fake_nodes_raw) + + docs_mod.render_site(tmp_path, fake_nodes, executor=None, with_schema=False) + + index_file = tmp_path / "index.html" + assert index_file.exists() + assert "INDEX" in index_file.read_text(encoding="utf-8") + + model_a_file = tmp_path / "model_a.html" + model_b_file = tmp_path / "model_b.html" + assert model_a_file.exists() + assert model_b_file.exists() + + assert "MODEL model_a" in model_a_file.read_text(encoding="utf-8") + assert "MODEL model_b" in model_b_file.read_text(encoding="utf-8") + + +# --------------------------------------------------------------------------- +# _collect_columns engine stubs +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_collect_columns_prefers_spark(): + class FakeCol: + def __init__(self, name: str): + self.name = name + self.dataType = "INT" + self.nullable = True + + class FakeTable: + def __init__(self, name: str): + self.name = name + self.database = None + self.catalog = None + + class FakeSparkCatalog: + def 
listTables(self): + return [FakeTable("T1")] + + def listColumns(self, ident, database=None): + return [FakeCol("C1"), FakeCol("C2")] + + class FakeSpark: + catalog = FakeSparkCatalog() + + cols = docs_mod._collect_columns(SimpleNamespace(spark=FakeSpark())) + assert "T1" in cols + assert [c.name for c in cols["T1"]] == ["C1", "C2"] + + +@pytest.mark.unit +def test_collect_columns_with_unknown_executor_returns_empty(): + cols = docs_mod._collect_columns(object()) + assert cols == {} + + +# ---------------------- _columns_duckdb ---------------------- + + +@pytest.mark.unit +def test_columns_duckdb_collects_tables_and_cols(): + class FakeCursor: + def __init__(self, rows): + self._rows = rows + + def fetchall(self): + return self._rows + + class FakeConn: + def __init__(self, rows): + self._rows = rows + + def execute(self, _sql: str): + return FakeCursor(self._rows) + + rows = [ + # table_name, column_name, data_type, is_nullable + ("my_table", "id", "INTEGER", "NO"), + ("my_table", "name", "TEXT", "YES"), + ("other", "x", "BOOLEAN", "YES"), + ] + fake_con = FakeConn(rows) + + cols = docs_mod._columns_duckdb(fake_con) + + assert set(cols.keys()) == {"my_table", "other"} + mt = cols["my_table"] + assert [c.name for c in mt] == ["id", "name"] + assert mt[0].dtype == "INTEGER" + assert mt[0].nullable is False + assert mt[1].nullable is True + + +# ---------------------- _columns_postgres ---------------------- + + +@pytest.mark.unit +def test_columns_postgres_collects_from_engine(): + class FakeResult: + def __init__(self, rows): + self._rows = rows + + def fetchall(self): + return self._rows + + class FakeConn: + def __init__(self, rows): + self._rows = rows + + def execute(self, _stmt): + return FakeResult(self._rows) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + class FakeEngine: + def __init__(self, rows): + self._rows = rows + + def begin(self): + return FakeConn(self._rows) + + rows = [ + # table_name, 
column_name, data_type, is_nullable + ("public_tbl", "id", "integer", "YES"), + ("public_tbl", "email", "text", "NO"), + ] + fake_engine = FakeEngine(rows) + + cols = docs_mod._columns_postgres(fake_engine) + + assert "public_tbl" in cols + tcols = cols["public_tbl"] + assert [c.name for c in tcols] == ["id", "email"] + assert tcols[0].dtype == "integer" + # the implementation maps is_nullable == "YES" to nullable=True + assert tcols[0].nullable is True + assert tcols[1].nullable is False + + +# ---------------------- _columns_snowflake ---------------------- + + +@pytest.mark.unit +def test_columns_snowflake_collects_from_session(): + # snowflake .collect() returns list[Row], but plain dicts are enough for this test + class FakeDF: + def __init__(self, rows): + self._rows = rows + + def collect(self): + return self._rows + + class FakeSession: + def __init__(self, rows): + self._rows = rows + + def sql(self, _sql: str): + return FakeDF(self._rows) + + rows = [ + { + "TABLE_NAME": "T1", + "COLUMN_NAME": "ID", + "DATA_TYPE": "NUMBER", + "IS_NULLABLE": "NO", + }, + { + "TABLE_NAME": "T1", + "COLUMN_NAME": "NAME", + "DATA_TYPE": "TEXT", + "IS_NULLABLE": "YES", + }, + { + "TABLE_NAME": "T2", + "COLUMN_NAME": "TS", + "DATA_TYPE": "TIMESTAMP_NTZ", + "IS_NULLABLE": "YES", + }, + ] + fake_session = FakeSession(rows) + + cols = docs_mod._columns_snowflake(fake_session) + + assert set(cols.keys()) == {"T1", "T2"} + t1 = cols["T1"] + assert [c.name for c in t1] == ["ID", "NAME"] + assert t1[0].dtype == "NUMBER" + assert t1[0].nullable is False + assert t1[1].nullable is True + + t2 = cols["T2"] + assert t2[0].name == "TS" + assert t2[0].dtype == "TIMESTAMP_NTZ" + assert t2[0].nullable is True + + +@pytest.mark.unit +def test_build_macro_list_collects_and_sorts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): + # Arrange + project_dir = tmp_path + macros_dir = project_dir / "macros" + macros_dir.mkdir(parents=True, exist_ok=True) + + m1 = macros_dir / "util_dates.sql" + m1.write_text("-- sql macro 1", 
encoding="utf-8") + + m2 = macros_dir / "cleanup.sql" + m2.write_text("-- sql macro 2", encoding="utf-8") + + m3 = macros_dir / "py_macro.py" + m3.write_text("def run(): ...", encoding="utf-8") + + fake_registry = SimpleNamespace( + macros={ + "util_dates": m1, + "cleanup": m2, + "py_macro": m3, + } + ) + monkeypatch.setattr(docs_mod, "REGISTRY", fake_registry, raising=True) + + # Act + res = docs_mod._build_macro_list(project_dir) + + # Assert + names = [x["name"] for x in res] + assert names == ["py_macro", "cleanup", "util_dates"] + + paths = {x["name"]: x["path"] for x in res} + assert paths["cleanup"] == "macros/cleanup.sql" + assert paths["util_dates"] == "macros/util_dates.sql" + assert paths["py_macro"] == "macros/py_macro.py" + + kinds = {x["name"]: x["kind"] for x in res} + assert kinds["py_macro"] == "python" + assert kinds["cleanup"] == "sql" + assert kinds["util_dates"] == "sql" + + +@pytest.mark.unit +def test_infer_and_attach_lineage_sql_branch_is_used(monkeypatch: pytest.MonkeyPatch): + m = docs_mod.ModelDoc( + name="model_sql", + kind="sql", + path="models/model_sql.sql", + relation="project.dataset.model_sql", + deps=["src_table"], + materialized="table", + ) + models = [m] + + cols_by_table = { + "project.dataset.model_sql": [ + docs_mod.ColumnInfo("id", "INT", True), + docs_mod.ColumnInfo("name", "STRING", True), + ] + } + + class FakeExecutor: + def render_sql(self, node, jenv, ref_resolver=None, source_resolver=None): + return "select id, name from src_table" + + def _resolve_ref(self, name, env): + return f"resolved_ref_{name}" + + def _resolve_source(self, source, table): + return f"resolved_source_{source}_{table}" + + fake_executor = FakeExecutor() + + fake_node = SimpleNamespace( + name="model_sql", + kind="sql", + path=Path("models/model_sql.sql"), + deps=["src_table"], + meta={}, + ) + fake_registry = SimpleNamespace( + nodes={"model_sql": fake_node}, + env=SimpleNamespace(), + ) + monkeypatch.setattr(docs_mod, "REGISTRY", 
fake_registry, raising=True) + + monkeypatch.setattr( + docs_mod, + "infer_sql_lineage", + lambda rendered: {"id": [{"from_relation": "src_table", "from_column": "id"}]}, + raising=True, + ) + monkeypatch.setattr( + docs_mod, + "parse_sql_lineage_overrides", + lambda rendered: {}, + raising=True, + ) + monkeypatch.setattr( + docs_mod, + "merge_lineage", + lambda base, overrides: base, + raising=True, + ) + docs_meta: dict[str, object] = {} + + # ACT + docs_mod._infer_and_attach_lineage( + models, + fake_executor, + docs_meta, + cols_by_table, + with_schema=True, + ) + + # ASSERT + col_id = cols_by_table["project.dataset.model_sql"][0] + assert col_id.name == "id" + assert col_id.lineage == [{"from_relation": "src_table", "from_column": "id"}] + + +@pytest.mark.unit +def test_infer_and_attach_lineage_yaml_override_branch_is_used(monkeypatch: pytest.MonkeyPatch): + m = docs_mod.ModelDoc( + name="model_yml", + kind="sql", + path="models/model_yml.sql", + relation="project.dataset.model_yml", + deps=[], + materialized="table", + ) + models = [m] + + cols_by_table = { + "project.dataset.model_yml": [ + docs_mod.ColumnInfo("total", "NUMBER", True), + docs_mod.ColumnInfo("cnt", "NUMBER", True), + ] + } + + docs_meta = { + "models": { + "model_yml": { + "description_html": None, + "columns": {}, + "lineage": { + "total": { + "from": [ + {"table": "project.dataset.orders", "column": "amount"}, + ], + "transformed": True, + } + }, + } + }, + "columns": {}, + } + + monkeypatch.setattr(docs_mod, "infer_sql_lineage", lambda *_a, **_k: {}, raising=True) + monkeypatch.setattr(docs_mod, "parse_sql_lineage_overrides", lambda *_a, **_k: {}, raising=True) + monkeypatch.setattr( + docs_mod, + "merge_lineage", + lambda base, overrides: (overrides or base), + raising=True, + ) + + # ACT + docs_mod._infer_and_attach_lineage( + models, + executor=None, + docs_meta=docs_meta, + cols_by_table=cols_by_table, + with_schema=True, + ) + + # ASSERT + col_total = 
cols_by_table["project.dataset.model_yml"][0] + assert col_total.name == "total" + assert col_total.lineage == [ + { + "from_relation": "project.dataset.orders", + "from_column": "amount", + "transformed": True, + } + ] diff --git a/tests/unit/executors/test_bigquery_bf_exec_unit.py b/tests/unit/executors/test_bigquery_bf_exec_unit.py new file mode 100644 index 0000000..9275053 --- /dev/null +++ b/tests/unit/executors/test_bigquery_bf_exec_unit.py @@ -0,0 +1,410 @@ +# tests/unit/executors/test_bigquery_bf_exec_unit.py +from __future__ import annotations + +import sys +import types +from pathlib import Path +from types import SimpleNamespace +from typing import Any, ClassVar + +import pytest +from tests.common.mock.bigquery import ( + FakeBadRequest, + FakeClient, + FakeField, + FakeJob, + FakeNotFound, + install_fake_bigquery, +) + +import fastflowtransform.executors._bigquery_mixin as bq_mix_mod +import fastflowtransform.executors.bigquery_bf_exec as bq_exec_mod +from fastflowtransform.core import Node + +# ---------------------- BigFrames-Fakes ------------------------------------ + + +class _FakeBigQueryOptions: + def __init__(self, *a, **kw): + self.kw = kw + + +class _FakeBFSession: + def __init__(self, *a, **kw): + self.created_with = (a, kw) + self.read_calls: list[str] = [] + + def read_gbq(self, table_id: str) -> Any: + self.read_calls.append(table_id) + return SimpleNamespace( + columns=["id", "name"], + to_gbq=None, + materialize=None, + ) + + +@pytest.fixture +def bq_exec(monkeypatch): + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod]) + + fake_bigframes = types.ModuleType("bigframes") + fake_conf = types.ModuleType("bigframes._config") + fake_conf_bq = types.ModuleType("bigframes._config.bigquery_options") + + fake_conf_bq.BigQueryOptions = _FakeBigQueryOptions # type: ignore[attr-defined] + fake_bigframes.Session = _FakeBFSession # type: ignore[attr-defined] + + sys.modules.setdefault("bigframes", fake_bigframes) + 
sys.modules.setdefault("bigframes._config", fake_conf) + sys.modules["bigframes._config.bigquery_options"] = fake_conf_bq + + monkeypatch.setattr(bq_exec_mod, "bigframes", fake_bigframes, raising=True) + + ex = bq_exec_mod.BigQueryBFExecutor(project="p1", dataset="ds1", location="EU") + + assert isinstance(ex.client, FakeClient) + assert isinstance(ex.session, _FakeBFSession) + + ex.client.add_dataset("p1.ds1") + + monkeypatch.setattr(bq_exec_mod, "NotFound", FakeNotFound, raising=True) + monkeypatch.setattr(bq_exec_mod, "BadRequest", FakeBadRequest, raising=True) + + return ex + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_read_relation_happy(bq_exec): + df = bq_exec._read_relation("tbl", Node(name="n", kind="sql", path=Path(".")), deps=["x"]) + assert hasattr(df, "columns") + assert df.columns == ["id", "name"] + assert bq_exec.session.read_calls == ["p1.ds1.tbl"] + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_read_relation_not_found_raises_nice_message(bq_exec): + def boom(_table_id: str): + raise FakeNotFound("nope") + + bq_exec.session.read_gbq = boom # type: ignore[assignment] + bq_exec.client.add_table("p1.ds1", "existing1") + + with pytest.raises(RuntimeError) as exc: + bq_exec._read_relation( + "missing", + Node(name="n", kind="sql", path=Path(".")), + deps=["dep1"], + ) + msg = str(exc.value) + assert "Dependency table not found" in msg + assert "dep1" in msg + assert "existing1" in msg + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_materialize_relation_prefers_to_gbq(bq_exec): + called: dict[str, Any] = {} + + class DF: + def to_gbq(self, table_id, if_exists="replace"): + called["table_id"] = table_id + called["if_exists"] = if_exists + + bq_exec._materialize_relation("out_tbl", DF(), Node(name="m", kind="python", path=Path("."))) + assert called["table_id"] 
== "p1.ds1.out_tbl" + assert called["if_exists"] == "replace" + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_materialize_relation_fallback_to_materialize(bq_exec): + called: dict[str, Any] = {} + + class DF: + def materialize(self, table, mode="overwrite"): + called["table"] = table + called["mode"] = mode + + bq_exec._materialize_relation("out_tbl", DF(), Node(name="m", kind="python", path=Path("."))) + assert called["table"] == "p1.ds1.out_tbl" + assert called["mode"] == "overwrite" + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_materialize_relation_raises_if_no_supported_method(bq_exec): + class DF: + columns: ClassVar[list[str]] = ["x"] + + with pytest.raises(RuntimeError): + bq_exec._materialize_relation( + "out_tbl", DF(), Node(name="m", kind="python", path=Path(".")) + ) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_validate_required_single_frame_ok(bq_exec): + frame = SimpleNamespace(columns=["a", "b", "c"]) + bq_exec._validate_required("m1", frame, {"ds1.tbl": {"a", "b"}}) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_validate_required_single_frame_missing(bq_exec): + frame = SimpleNamespace(columns=["a"]) + with pytest.raises(ValueError) as exc: + bq_exec._validate_required("m1", frame, {"ds1.tbl": {"a", "b"}}) + assert "missing" in str(exc.value) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_validate_required_multi_input_ok(bq_exec): + f1 = SimpleNamespace(columns=["id", "name"]) + f2 = SimpleNamespace(columns=["user_id", "order_id"]) + bq_exec._validate_required( + "m1", + {"ds1.users": f1, "ds1.orders": f2}, + {"ds1.users": {"id"}, "ds1.orders": {"order_id"}}, + ) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_is_frame_detection(bq_exec): + class WithToGbq: + def to_gbq(self): ... + + class WithMaterialize: + def materialize(self): ... 
+ + class WithColumns: + columns: ClassVar[list[str]] = ["x"] + + assert bq_exec._is_frame(WithToGbq()) is True + assert bq_exec._is_frame(WithMaterialize()) is True + assert bq_exec._is_frame(WithColumns()) is True + assert bq_exec._is_frame(object()) is False + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_columns_of_prefers_columns_attr(bq_exec): + frame = SimpleNamespace(columns=["a", "b"]) + assert bq_exec._columns_of(frame) == ["a", "b"] + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_columns_of_falls_back_to_schema_names(bq_exec): + class F: + schema = SimpleNamespace(names=["x", "y"]) + + assert bq_exec._columns_of(F()) == ["x", "y"] + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_format_relation_for_ref(bq_exec): + rel = bq_exec._format_relation_for_ref("m1") + assert "p1" in rel + assert "ds1" in rel + assert "m1" in rel + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_format_source_reference(bq_exec): + cfg = {"identifier": "src_tbl", "project": "other_proj", "dataset": "raw"} + ref = bq_exec._format_source_reference(cfg, "src", "tbl") + assert "other_proj" in ref + assert "raw" in ref + assert "src_tbl" in ref + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_or_replace_view_calls_client_query(bq_exec): + bq_exec.client.queries.clear() + bq_exec._create_or_replace_view( + "p1.ds1.v_view", "SELECT 1", Node(name="x", kind="sql", path=Path(".")) + ) + assert any("CREATE OR REPLACE VIEW" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_or_replace_table_calls_client_query(bq_exec): + bq_exec.client.queries.clear() + bq_exec._create_or_replace_table( + "p1.ds1.t_out", "SELECT 1", Node(name="x", kind="sql", path=Path(".")) + ) + assert any("CREATE OR REPLACE TABLE" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_or_replace_view_from_table_calls_client_query(bq_exec): + bq_exec.client.queries.clear() + 
bq_exec._create_or_replace_view_from_table( + "v_users", "ds1.users", Node(name="x", kind="python", path=Path(".")) + ) + assert any("CREATE OR REPLACE VIEW" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_exists_relation_true(bq_exec, monkeypatch): + class _Job: + def __init__(self): + self._rows = [(1,)] + + def result(self): + return self._rows + + monkeypatch.setattr( + bq_exec.client, + "query", + lambda sql, location=None, job_config=None: _Job(), + raising=True, + ) + + ok = bq_exec.exists_relation("some_table") + assert ok is True + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_table_as_cleans_select(bq_exec): + bq_exec.client.queries.clear() + bq_exec.create_table_as("dst_tbl", "SELECT 1;") + sql = bq_exec.client.queries[-1][0] + assert "CREATE TABLE" in sql + assert "SELECT 1" in sql + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_incremental_insert_cleans_select(bq_exec): + bq_exec.client.queries.clear() + bq_exec.incremental_insert("dst_tbl", "SELECT 1;") + sql = bq_exec.client.queries[-1][0] + assert "INSERT INTO" in sql + assert "SELECT 1" in sql + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_incremental_merge_executes_two_statements(bq_exec): + bq_exec.client.queries.clear() + bq_exec.incremental_merge("dst_tbl", "SELECT 1 AS id", ["id"]) + assert len(bq_exec.client.queries) == 2 + assert "DELETE FROM" in bq_exec.client.queries[0][0] + assert "INSERT INTO" in bq_exec.client.queries[1][0] + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_alter_table_sync_schema_adds_missing_columns(bq_exec): + def fake_query(sql: str, location: str | None = None, job_config: Any | None = None): + if "WHERE 1=0" in sql: + return FakeJob(schema=[FakeField("id"), FakeField("new_col", "INT64")]) + if sql.startswith("ALTER TABLE"): + bq_exec.client.queries.append((sql, location, job_config)) + return FakeJob() + return FakeJob() + + bq_exec.client.query = fake_query # type: 
ignore[assignment] + bq_exec.client.get_table = lambda ref: SimpleNamespace(schema=[FakeField("id")]) # type: ignore[assignment] + + bq_exec.alter_table_sync_schema("existing", "SELECT 1 AS id, 2 AS new_col") + assert any("ADD COLUMN new_col" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_on_node_built_best_effort(monkeypatch, bq_exec): + called = {"ensure": 0, "upsert": 0} + + def fake_ensure(ex): + called["ensure"] += 1 + + def fake_upsert(ex, name, rel, fp, eng): + called["upsert"] += 1 + + monkeypatch.setattr(bq_exec_mod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(bq_exec_mod, "upsert_meta", fake_upsert) + + bq_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), "p1.ds1.m", "fp123") + + assert called["ensure"] == 1 + assert called["upsert"] == 1 + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_bf_apply_sql_materialization_calls_super(monkeypatch, bq_exec): + monkeypatch.setattr(bq_exec, "_ensure_dataset", lambda: None, raising=True) + + import fastflowtransform.executors.bigquery_bf_exec as bq_bf_mod # noqa PLC0415 + + called: dict[str, str] = {} + + monkeypatch.setattr( + bq_bf_mod.BaseExecutor, + "_apply_sql_materialization", + lambda self, node, target_sql, select_body, materialization: called.update( + { + "node": node.name, + "target_sql": target_sql, + "select_body": select_body, + "materialization": materialization, + } + ), + raising=True, + ) + + node = Node(name="m_apply", kind="sql", path=Path(".")) + + bq_exec._apply_sql_materialization( + node, + target_sql="`p1`.`ds1`.`t_out`", + select_body="SELECT 1", + materialization="table", + ) + + assert called["node"] == "m_apply" + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_apply_sql_materialization_wraps_badrequest(monkeypatch, bq_exec): + monkeypatch.setattr( + bq_exec_mod.BaseExecutor, + "_apply_sql_materialization", + lambda *a, **k: (_ for _ in ()).throw(FakeBadRequest("bad SQL")), + raising=True, + ) + + node 
= Node(name="m_err", kind="sql", path=Path(".")) + + with pytest.raises(RuntimeError) as exc: + bq_exec._apply_sql_materialization( + node, + target_sql="`p1`.`ds1`.`broken`", + select_body="SELECT bad", + materialization="table", + ) + msg = str(exc.value) + assert "BigQuery SQL failed for" in msg + assert "SELECT bad" in msg + assert "broken" in msg diff --git a/tests/unit/executors/test_bigquery_exec_unit.py b/tests/unit/executors/test_bigquery_exec_unit.py new file mode 100644 index 0000000..f0d5455 --- /dev/null +++ b/tests/unit/executors/test_bigquery_exec_unit.py @@ -0,0 +1,339 @@ +# tests/unit/executors/test_bigquery_exec_unit.py +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any, cast + +import pandas as pd +import pytest +from tests.common.mock.bigquery import ( + FakeBadRequest, + FakeClient, + FakeField, + FakeJob, + FakeNotFound, + FakeWriteDisposition, + install_fake_bigquery, +) + +import fastflowtransform.executors._bigquery_mixin as bq_mix_mod +import fastflowtransform.executors.bigquery_exec as bq_exec_mod +from fastflowtransform.core import Node + + +@pytest.fixture +def bq_exec(monkeypatch): + _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod]) + + fake_client = FakeClient(project="p1", location="EU") + + ex = bq_exec_mod.BigQueryExecutor( + project="p1", + dataset="ds1", + location="EU", + client=cast(Any, fake_client), + ) + + fake_client.add_dataset("p1.ds1") + + monkeypatch.setattr(bq_exec_mod, "NotFound", FakeNotFound, raising=True) + monkeypatch.setattr(bq_exec_mod, "BadRequest", FakeBadRequest, raising=True) + + return ex + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_read_relation_happy(bq_exec, monkeypatch): + def fake_query(sql: str, location: str | None = None, job_config: 
Any | None = None): + return FakeJob( + rows=[(1, "A")], + schema=[FakeField("id"), FakeField("name")], + ) + + bq_exec.client.query = fake_query # type: ignore[assignment] + + df = bq_exec._read_relation("tbl", Node(name="n", kind="sql", path=Path(".")), deps=["x"]) + assert list(df.columns) == ["id", "name"] + assert df.to_dict(orient="records") == [{"id": 1, "name": "A"}] + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_read_relation_not_found_raises_nice_message(bq_exec, monkeypatch): + def boom(sql: str, location: str | None = None, job_config: Any | None = None): + raise FakeNotFound("nope") + + bq_exec.client.query = boom # type: ignore[assignment] + + bq_exec.client.add_table("p1.ds1", "existing1") + + with pytest.raises(RuntimeError) as exc: + bq_exec._read_relation( + "missing", + Node(name="n", kind="sql", path=Path(".")), + deps=["dep1"], + ) + + msg = str(exc.value) + assert "Dependency table not found" in msg + assert "dep1" in msg + assert "existing1" in msg + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_materialize_relation_happy(bq_exec, monkeypatch): + df = pd.DataFrame({"id": [1], "name": ["A"]}) + + called: dict[str, Any] = {} + + def fake_load(df_in, table_id, job_config=None, location=None): + called["table_id"] = table_id + called["location"] = location + called["job_config"] = job_config + return FakeJob() + + bq_exec.client.load_table_from_dataframe = fake_load # type: ignore[assignment] + + bq_exec._materialize_relation("out_tbl", df, Node(name="m", kind="python", path=Path("."))) + + assert called["table_id"] == "p1.ds1.out_tbl" + assert called["location"] == "EU" + assert ( + getattr(called["job_config"], "write_disposition", None) + == FakeWriteDisposition.WRITE_TRUNCATE + ) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_materialize_relation_wraps_badrequest(bq_exec, monkeypatch): + df = pd.DataFrame({"id": [1]}) + + def bad_load(*_a, **_k): + raise FakeBadRequest("boom") + + 
bq_exec.client.load_table_from_dataframe = bad_load # type: ignore[assignment] + + with pytest.raises(RuntimeError) as exc: + bq_exec._materialize_relation("out_tbl", df, Node(name="m", kind="python", path=Path("."))) + msg = str(exc.value) + assert "BigQuery write failed" in msg + assert "out_tbl" in msg + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_view_over_table_calls_bq(bq_exec): + bq_exec.client.queries.clear() + node = Node(name="m", kind="python", path=Path(".")) + bq_exec._create_view_over_table("v_users", "ds1.users", node) + assert any("CREATE OR REPLACE VIEW" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_frame_name(bq_exec): + assert bq_exec._frame_name() == "pandas" + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_format_relation_for_ref(bq_exec): + rel = bq_exec._format_relation_for_ref("m1") + assert "p1" in rel + assert "ds1" in rel + assert "m1" in rel + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_format_source_reference(bq_exec): + cfg = { + "identifier": "src_tbl", + "project": "other_proj", + "dataset": "raw", + } + ref = bq_exec._format_source_reference(cfg, "src", "tbl") + assert "other_proj" in ref + assert "raw" in ref + assert "src_tbl" in ref + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_apply_sql_materialization_calls_super_and_ensures_dataset(monkeypatch, bq_exec): + monkeypatch.setattr(bq_exec, "_ensure_dataset", lambda: None, raising=True) + + import fastflowtransform.executors.bigquery_exec as bq_exec_mod # noqa PLC0415 + + called: dict[str, str] = {} + + monkeypatch.setattr( + bq_exec_mod.BaseExecutor, + "_apply_sql_materialization", + lambda self, node, target_sql, select_body, materialization: called.update( + { + "node": node.name, + "target_sql": target_sql, + "select_body": select_body, + "materialization": materialization, + } + ), + raising=True, + ) + + node = Node(name="m_apply", kind="sql", path=Path(".")) + 
bq_exec._apply_sql_materialization( + node, + target_sql="`p1`.`ds1`.`t_out`", + select_body="SELECT 1", + materialization="table", + ) + + assert called["node"] == "m_apply" + assert called["target_sql"] == "`p1`.`ds1`.`t_out`" + assert called["select_body"] == "SELECT 1" + assert called["materialization"] == "table" + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_apply_sql_materialization_wraps_badrequest(monkeypatch, bq_exec): + monkeypatch.setattr( + bq_exec_mod.BaseExecutor, + "_apply_sql_materialization", + lambda *a, **k: (_ for _ in ()).throw(FakeBadRequest("bq exploded")), + raising=True, + ) + + node = Node(name="m_bad", kind="sql", path=Path(".")) + + with pytest.raises(RuntimeError) as exc: + bq_exec._apply_sql_materialization( + node, + target_sql="`p1`.`ds1`.`t_out`", + select_body="SELECT 1", + materialization="table", + ) + msg = str(exc.value) + assert "BigQuery SQL failed for" in msg + assert "SELECT 1" in msg + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_or_replace_view_calls_client_query(bq_exec): + bq_exec.client.queries.clear() + bq_exec._create_or_replace_view( + "p1.ds1.v_view", "SELECT 1", Node(name="x", kind="sql", path=Path(".")) + ) + assert any("CREATE OR REPLACE VIEW" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_or_replace_table_calls_client_query(bq_exec): + bq_exec.client.queries.clear() + bq_exec._create_or_replace_table( + "p1.ds1.t_out", "SELECT 1", Node(name="x", kind="sql", path=Path(".")) + ) + assert any("CREATE OR REPLACE TABLE" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_or_replace_view_from_table_calls_client_query(bq_exec): + bq_exec.client.queries.clear() + bq_exec._create_or_replace_view_from_table( + "v_users", "ds1.users", Node(name="x", kind="python", path=Path(".")) + ) + assert any("CREATE OR REPLACE VIEW" in q[0] for q in bq_exec.client.queries) + + +@pytest.mark.unit 
+@pytest.mark.bigquery +def test_on_node_built_best_effort(monkeypatch, bq_exec): + called = {"ensure": 0, "upsert": 0} + + def fake_ensure(ex): + called["ensure"] += 1 + + def fake_upsert(ex, name, rel, fp, eng): + called["upsert"] += 1 + + monkeypatch.setattr(bq_exec_mod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(bq_exec_mod, "upsert_meta", fake_upsert) + + bq_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), "p1.ds1.m", "fp123") + + assert called["ensure"] == 1 + assert called["upsert"] == 1 + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_exists_relation_true(bq_exec, monkeypatch): + class _Job: + def __init__(self): + self._rows = [(1,)] + + def result(self): + return self._rows + + monkeypatch.setattr( + bq_exec.client, + "query", + lambda sql, location=None, job_config=None: _Job(), + raising=True, + ) + + assert bq_exec.exists_relation("some_table") is True + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_create_table_as_cleans_select(bq_exec): + bq_exec.client.queries.clear() + bq_exec.create_table_as("dst_tbl", "SELECT 1;") + sql = bq_exec.client.queries[-1][0] + assert "CREATE TABLE" in sql + assert "SELECT 1" in sql + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_incremental_insert_cleans_select(bq_exec): + bq_exec.client.queries.clear() + bq_exec.incremental_insert("dst_tbl", "SELECT 1;") + sql = bq_exec.client.queries[-1][0] + assert "INSERT INTO" in sql + assert "SELECT 1" in sql + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_incremental_merge_executes_two_statements(bq_exec): + bq_exec.client.queries.clear() + bq_exec.incremental_merge("dst_tbl", "SELECT 1 AS id", ["id"]) + assert len(bq_exec.client.queries) == 2 + assert "DELETE FROM" in bq_exec.client.queries[0][0] + assert "INSERT INTO" in bq_exec.client.queries[1][0] + + +@pytest.mark.unit +@pytest.mark.bigquery +def test_alter_table_sync_schema_adds_missing_columns(bq_exec, monkeypatch): + bq_exec.client.get_table = lambda ref: 
SimpleNamespace(schema=[FakeField("id")]) # type: ignore[assignment] + + bq_exec.client.queries.clear() + bq_exec.alter_table_sync_schema("existing", "SELECT 1 AS id, 2 AS new_col") + + assert any("ADD COLUMN new_col" in q[0] for q in bq_exec.client.queries) diff --git a/tests/unit/executors/test_duckdb_exec_unit.py b/tests/unit/executors/test_duckdb_exec_unit.py new file mode 100644 index 0000000..6aa8475 --- /dev/null +++ b/tests/unit/executors/test_duckdb_exec_unit.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pandas as pd +import pytest + +from fastflowtransform.core import Node +from fastflowtransform.executors.duckdb_exec import DuckExecutor, _q + + +@pytest.fixture +def duck_exec() -> DuckExecutor: + # real in-memory db is fine for unit tests + return DuckExecutor(":memory:") + + +def _node(name: str = "m", kind: str = "python") -> Node: + return Node(name=name, kind=kind, path=Path(".")) + + +# --------------------------------------------------------------------------- +# _read_relation +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_read_relation_happy(duck_exec: DuckExecutor): + duck_exec.con.execute("create table my_tbl (id int, name varchar)") + duck_exec.con.execute("insert into my_tbl values (1, 'A'), (2, 'B')") + + df = duck_exec._read_relation("my_tbl", _node(), deps=["up1"]) + assert list(df.columns) == ["id", "name"] + assert df.to_dict(orient="records") == [ + {"id": 1, "name": "A"}, + {"id": 2, "name": "B"}, + ] + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_read_relation_missing_raises_nice_error(duck_exec: DuckExecutor): + # no table created + with pytest.raises(RuntimeError) as exc: + duck_exec._read_relation("does_not_exist", _node(), deps=["m1", "m2"]) + msg = str(exc.value) + assert "Dependency table not found" in msg + assert "m1" in msg + assert "Existing tables" in msg # 
from the executor + + +# --------------------------------------------------------------------------- +# _materialize_relation +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_materialize_relation_registers_and_creates_table(duck_exec: DuckExecutor): + df = pd.DataFrame({"id": [1, 2], "name": ["A", "B"]}) + + duck_exec._materialize_relation("out_tbl", df, _node()) + + out = duck_exec.con.execute("select * from out_tbl order by id").fetchall() + assert out == [(1, "A"), (2, "B")] + + # temp table should be gone / unregistered - we just assert that running again doesn't error + duck_exec._materialize_relation("out_tbl", df, _node()) + + +# --------------------------------------------------------------------------- +# view over table +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_create_or_replace_view_from_table(duck_exec: DuckExecutor): + duck_exec.con.execute("create table src_tbl (id int)") + duck_exec.con.execute("insert into src_tbl values (10)") + duck_exec._create_or_replace_view_from_table("v_src", "src_tbl", _node()) + + rows = duck_exec.con.execute("select * from v_src").fetchall() + assert rows == [(10,)] + + +# --------------------------------------------------------------------------- +# formatting helpers +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_relation_for_ref(duck_exec: DuckExecutor): + rel = duck_exec._format_relation_for_ref("my_model") + # relation_for("my_model") → "my_model" + assert rel == _q("my_model") + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_source_reference_ok(duck_exec: DuckExecutor): + cfg = { + "catalog": "c1", + "schema": "s1", + "identifier": "src_tbl", + } + ref = duck_exec._format_source_reference(cfg, "src", "tbl") + # should be quoted 
catalog.schema.identifier + assert ref == '"c1"."s1"."src_tbl"' + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_source_reference_missing_identifier_raises(duck_exec: DuckExecutor): + cfg = { + "catalog": "c1", + "schema": "s1", + # no identifier! + } + with pytest.raises(KeyError): + duck_exec._format_source_reference(cfg, "src", "tbl") + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_format_source_reference_path_not_supported(duck_exec: DuckExecutor): + cfg = {"location": "/some/path.csv"} + with pytest.raises(NotImplementedError): + duck_exec._format_source_reference(cfg, "src", "tbl") + + +# --------------------------------------------------------------------------- +# on_node_built - best effort +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_on_node_built_best_effort(duck_exec: DuckExecutor, monkeypatch: pytest.MonkeyPatch): + called = {"ensure": 0, "upsert": 0} + + def fake_ensure(ex: Any) -> None: + called["ensure"] += 1 + + def fake_upsert(ex: Any, name: str, rel: str, fp: str, eng: str) -> None: + called["upsert"] += 1 + + # patch the functions used in on_node_built + import fastflowtransform.executors.duckdb_exec as duck_mod # noqa: PLC0415 + + monkeypatch.setattr(duck_mod, "ensure_meta_table", fake_ensure, raising=True) + monkeypatch.setattr(duck_mod, "upsert_meta", fake_upsert, raising=True) + + duck_exec.on_node_built(_node("m1"), "out_tbl", "fp123") + + assert called["ensure"] == 1 + assert called["upsert"] == 1 + + +# --------------------------------------------------------------------------- +# exists_relation +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_exists_relation_true(duck_exec: DuckExecutor): + duck_exec.con.execute("create table t1 (id int)") + assert duck_exec.exists_relation("t1") is True + + +@pytest.mark.unit +@pytest.mark.duckdb +def 
test_exists_relation_false(duck_exec: DuckExecutor): + assert duck_exec.exists_relation("nope") is False + + +# --------------------------------------------------------------------------- +# create_table_as / incremental_insert / incremental_merge +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_create_table_as_cleans_select(duck_exec: DuckExecutor): + duck_exec.create_table_as("t_created", "select 1 as id;") + rows = duck_exec.con.execute("select * from t_created").fetchall() + assert rows == [(1,)] + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_incremental_insert_appends_rows(duck_exec: DuckExecutor): + duck_exec.con.execute("create table tgt (id int)") + duck_exec.incremental_insert("tgt", "select 1 as id;") + duck_exec.incremental_insert("tgt", "select 2 as id;") + rows = duck_exec.con.execute("select * from tgt order by id").fetchall() + assert rows == [(1,), (2,)] + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_incremental_merge_deletes_then_inserts(duck_exec: DuckExecutor): + # target with PK-like col + duck_exec.con.execute("create table tgt (id int, val varchar)") + duck_exec.con.execute("insert into tgt values (1, 'old'), (2, 'keep')") + + # src produces (1, 'new'), (3, 'new3') + duck_exec.incremental_merge( + "tgt", + "select 1 as id, 'new' as val union all select 3 as id, 'new3' as val", + unique_key=["id"], + ) + + rows = duck_exec.con.execute("select id, val from tgt order by id").fetchall() + # id=1 should be updated to 'new' (via delete+insert) + # id=2 stays + # id=3 inserted + assert rows == [(1, "new"), (2, "keep"), (3, "new3")] + + +# --------------------------------------------------------------------------- +# alter_table_sync_schema +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.duckdb +def test_alter_table_sync_schema_adds_missing_columns(duck_exec: DuckExecutor): + # create 
target with only "id" + duck_exec.con.execute("create table my_tbl (id int)") + + # select has id + new_col + duck_exec.alter_table_sync_schema("my_tbl", "select 1 as id, 2 as new_col") + + # inspect schema + info = duck_exec.con.execute("pragma table_info('my_tbl')").fetchall() + # format: (cid, name, type, notnull, dflt_value, pk) + col_names = [r[1] for r in info] + assert "id" in col_names + assert "new_col" in col_names diff --git a/tests/unit/executors/test_shims_unit.py b/tests/unit/executors/test_shims_unit.py new file mode 100644 index 0000000..79b880a --- /dev/null +++ b/tests/unit/executors/test_shims_unit.py @@ -0,0 +1,249 @@ +# tests/unit/executors/test_shims_unit.py +from __future__ import annotations + +from collections.abc import Sequence +from types import SimpleNamespace +from typing import Any, cast + +import pytest +from google.cloud.bigquery import Client +from sqlalchemy import text as sa_text +from sqlalchemy.engine import Engine + +from fastflowtransform.executors._shims import ( + BigQueryConnShim, + SAConnShim, + _rewrite_pg_create_or_replace_table, +) + +# --------------------------------------------------------------------------- +# Fakes / helpers +# --------------------------------------------------------------------------- + + +class _FakeConn: + """Collects executed statements for assertions.""" + + def __init__(self) -> None: + self.executed: list[tuple[str, dict[str, Any] | None]] = [] + + # SQLAlchemy-style execute + def execute(self, stmt: Any, params: dict[str, Any] | None = None) -> Any: + # store string form for easier asserts + sql_str = stmt.text if hasattr(stmt, "text") else str(stmt) + self.executed.append((sql_str, params)) + # return something fetchable + return SimpleNamespace(fetchone=lambda: None, fetchall=lambda: []) + + # context manager API + def __enter__(self) -> _FakeConn: + return self + + def __exit__(self, exc_type, exc, tb) -> None: + return None + + # so SAConnShim can call conn.begin() in the fallback (we 
never do here) + def begin(self) -> _FakeConn: + return self + + +class _FakeEngine: + """Engine that always returns the same connection.""" + + def __init__(self) -> None: + self.conn = _FakeConn() + + def begin(self) -> _FakeConn: + return self.conn + + +# --------------------------------------------------------------------------- +# _rewrite_pg_create_or_replace_table +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_rewrite_pg_create_or_replace_table_simple(): + sql = "CREATE OR REPLACE TABLE public.t AS SELECT 1" + out = _rewrite_pg_create_or_replace_table(sql) + assert "DROP TABLE IF EXISTS public.t CASCADE;" in out + assert "CREATE TABLE public.t AS SELECT 1" in out + + +@pytest.mark.unit +def test_rewrite_pg_create_or_replace_table_with_schema_and_quotes(): + sql = ' create or replace table "raw"."users" as select * from src ' + out = _rewrite_pg_create_or_replace_table(sql) + # two statements + assert 'DROP TABLE IF EXISTS "raw"."users" CASCADE;' in out + assert 'CREATE TABLE "raw"."users" AS select * from src' in out + + +@pytest.mark.unit +def test_rewrite_pg_create_or_replace_table_untouched_for_other_sql(): + sql = "SELECT 1" + out = _rewrite_pg_create_or_replace_table(sql) + assert out == sql + + +# --------------------------------------------------------------------------- +# SAConnShim +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_sa_shim_executes_plain_sql_without_schema(): + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng), schema=None) + + shim.execute("SELECT 1") + + executed = eng.conn.executed + assert len(executed) == 1 + sql, params = executed[0] + assert sql.strip().upper().startswith("SELECT 1") + assert params is None + + +@pytest.mark.unit +def test_sa_shim_sets_search_path_when_schema_given(): + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng), schema="public") + + shim.execute("SELECT 42") + 
+ executed = eng.conn.executed + # 1) SET LOCAL ... 2) SELECT 42 + assert len(executed) == 2 + assert 'SET LOCAL search_path = "public"' in executed[0][0] + assert "SELECT 42" in executed[1][0] + + +@pytest.mark.unit +def test_sa_shim_rewrites_cor_table_into_two_statements(): + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng), schema=None) + + shim.execute("CREATE OR REPLACE TABLE my_tbl AS SELECT 1") + + executed = eng.conn.executed + # should have been split into DROP + CREATE + assert len(executed) == 2 + assert "DROP TABLE IF EXISTS my_tbl CASCADE" in executed[0][0] + assert "CREATE TABLE my_tbl AS SELECT 1" in executed[1][0] + + +@pytest.mark.unit +def test_sa_shim_executes_iterable_sequentially(): + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng), schema=None) + + shim.execute(["SELECT 1", "SELECT 2"]) + + executed = [sql for (sql, _) in eng.conn.executed] + assert executed == ["SELECT 1", "SELECT 2"] + + +@pytest.mark.unit +def test_sa_shim_executes_tuple_with_params_on_last_statement(): + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng), schema=None) + + shim.execute(("SELECT :x", {"x": 10})) + + executed = eng.conn.executed + assert len(executed) == 1 + sql, params = executed[0] + assert "SELECT :x" in sql + assert params == {"x": 10} + + +@pytest.mark.unit +def test_sa_shim_executes_sqlalchemy_clauseelement(): + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng), schema=None) + + stmt = sa_text("SELECT 1") + shim.execute(stmt) + + executed = eng.conn.executed + assert len(executed) == 1 + assert "SELECT 1" in executed[0][0] + + +# --------------------------------------------------------------------------- +# BigQueryConnShim +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_bq_shim_executes_single_sql(): + calls: dict[str, Any] = {} + + class FakeClient: + def query(self, sql: str, location: str | None = None): + calls["sql"] = sql + calls["location"] = 
location + return "JOB-1" + + fake = FakeClient() + shim = BigQueryConnShim(cast(Client, fake), location="EU") + res = shim.execute("SELECT 1") + + assert res == "JOB-1" + assert calls["sql"] == "SELECT 1" + assert calls["location"] == "EU" + + +@pytest.mark.unit +def test_bq_shim_executes_sequence_and_returns_last_job(): + seen: list[str] = [] + + class FakeJob: + def result(self) -> None: + return None + + class FakeClient: + def query(self, sql: str, location: str | None = None): + seen.append(sql) + return FakeJob() + + fake = FakeClient() + shim = BigQueryConnShim(cast(Client, fake), location="EU") + res = shim.execute(["SELECT 1", "SELECT 2", "SELECT 3"]) + + # should have executed all + assert seen == ["SELECT 1", "SELECT 2", "SELECT 3"] + # and returned the last job + assert isinstance(res, FakeJob) + + +def test_bq_shim_raises_on_unsupported_type(): + fake_client = SimpleNamespace(query=lambda *a, **k: None) + + # pretend to the type checker that this is a real Client + shim = BigQueryConnShim(client=cast(Client, fake_client)) + + with pytest.raises(TypeError): + shim.execute(123) + + +# --------------------------------------------------------------------------- +# Mixed / defensive +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_sa_shim_iterable_of_mixed_types(): + """Ensure iterable with strings and ClauseElements is executed in order.""" + eng = _FakeEngine() + shim = SAConnShim(cast(Engine, eng)) + + stmts: Sequence[Any] = ["SELECT 1", sa_text("SELECT 2"), "SELECT 3"] + shim.execute(stmts) + + executed = [sql for (sql, _) in eng.conn.executed] + # sql text may contain trailing semicolons/spaces from TextClause + assert "SELECT 1" in executed[0] + assert "SELECT 2" in executed[1] + assert "SELECT 3" in executed[2] diff --git a/tests/unit/executors/test_snowflake_snowpark_exec.py b/tests/unit/executors/test_snowflake_snowpark_exec.py new file mode 100644 index 0000000..c19bc61 --- 
/dev/null +++ b/tests/unit/executors/test_snowflake_snowpark_exec.py @@ -0,0 +1,521 @@ +# tests/unit/executors/test_snowflake_snowpark_exec.py +from __future__ import annotations + +import sys +import types +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest +import snowflake.snowpark as sf + +import fastflowtransform.executors.snowflake_snowpark_exec as sf_mod +from fastflowtransform.executors.snowflake_snowpark_exec import _SFResult + +# --------------------------------------------------------------------------- +# 1) Install a fake snowflake.snowpark (NOTE: the executor module is already imported above) +# --------------------------------------------------------------------------- + +fake_sf_pkg = sys.modules.setdefault("snowflake", types.ModuleType("snowflake")) +fake_sf_snowpark_mod = types.ModuleType("snowflake.snowpark") + + +class _FakeSchema: + """Minimal schema object exposing .names like real Snowpark dataframes do.""" + + def __init__(self, names: list[str]): + self.names = names + + +class FakeSnowparkDataFrame: + """ + Very small stand-in for snowflake.snowpark.DataFrame. + It needs: + - .schema.names + - .collect() + - .write.save_as_table(...) 
+ """ + + def __init__(self, session: FakeSession, sql: str | None = None, cols: list[str] | None = None): + self._session = session + self._sql = sql + self.schema = _FakeSchema(cols or []) + self._rows: list[dict[str, Any]] = [] + + # allow tests to inject rows + def _with_rows(self, rows: list[dict[str, Any]]) -> FakeSnowparkDataFrame: + self._rows = rows + return self + + def collect(self) -> list[dict[str, Any]]: + # return rows that were set, otherwise empty + return self._rows + + class _Writer: + def __init__(self, outer: FakeSnowparkDataFrame): + self._outer = outer + + def save_as_table(self, name: str, mode: str = "overwrite") -> None: + # just record the call on the session + self._outer._session.write_calls.append(("save_as_table", name, mode)) + + @property + def write(self) -> FakeSnowparkDataFrame._Writer: # type: ignore[name-defined] + return FakeSnowparkDataFrame._Writer(self) + + +class FakeSession: + """ + Minimal session mock: + - .sql(sql) -> FakeSnowparkDataFrame + - .table(name) -> FakeSnowparkDataFrame + - remembers SQL calls + """ + + def __init__(self, cfg: dict[str, Any]): + self.cfg = cfg + self.sql_calls: list[str] = [] + self.table_calls: list[str] = [] + self.write_calls: list[tuple[str, str, str]] = [] + + # --- behaviours that depend on the SQL string --- + def _df_for_sql(self, sql: str) -> FakeSnowparkDataFrame: + """ + Create a DF whose columns/rows depend on the SQL text. + We only simulate the bits the executor actually uses. + """ + # 1) exists_relation → information_schema.tables → return 1 row + if "information_schema.tables" in sql.lower(): + return FakeSnowparkDataFrame(self, sql, [])._with_rows([{"1": 1}]) + + # 2) alter_table_sync_schema → information_schema.columns + if "information_schema.columns" in sql.lower(): + # Pretend there is already an "id" column + return FakeSnowparkDataFrame(self, sql, [])._with_rows([{"COLUMN_NAME": "ID"}]) + + # 3) probe SELECT ... 
WHERE 1=0 → provide schema names + if "where 1=0" in sql.lower(): + # executor expects to read probe.schema.names + return FakeSnowparkDataFrame(self, sql, ["ID", "NEW_COL"]) + + # 4) everything else → empty df, no rows + return FakeSnowparkDataFrame(self, sql, []) + + def sql(self, sql: str) -> FakeSnowparkDataFrame: + self.sql_calls.append(sql) + return self._df_for_sql(sql) + + def table(self, name: str) -> FakeSnowparkDataFrame: + self.table_calls.append(name) + # pretend table has 2 columns + return FakeSnowparkDataFrame(self, name, ["ID", "NAME"]) + + +class _FakeSessionBuilder: + def __init__(self): + self._cfg: dict[str, Any] = {} + + def configs(self, cfg: dict[str, Any]) -> _FakeSessionBuilder: + self._cfg = cfg + return self + + def create(self) -> FakeSession: + return FakeSession(self._cfg) + + +# expose as snowflake.snowpark.Session +fake_sf_snowpark_mod.Session = SimpleNamespace(builder=_FakeSessionBuilder()) # type: ignore[attr-defined] +# expose DataFrame type so isinstance(..., SNDF) works +fake_sf_snowpark_mod.DataFrame = FakeSnowparkDataFrame # type: ignore[attr-defined] + +fake_sf_snowpark_mod.DataFrame = FakeSnowparkDataFrame # type: ignore[attr-defined] +sys.modules["snowflake.snowpark"] = fake_sf_snowpark_mod + +# --------------------------------------------------------------------------- +# 2) Now we can safely import the module under test +# --------------------------------------------------------------------------- +import fastflowtransform.executors.snowflake_snowpark_exec as sf_exec_mod # noqa: E402 +from fastflowtransform.core import Node # noqa: E402 +from fastflowtransform.executors.snowflake_snowpark_exec import ( # noqa: E402 + SnowflakeSnowparkExecutor, + _SFCursorShim, +) + + +@pytest.fixture +def sf_exec(monkeypatch): + """ + Build an executor with a fake Snowpark session. 
+ """ + # make sure the module uses our just-registered fake + monkeypatch.setattr(sf_exec_mod, "Session", fake_sf_snowpark_mod.Session, raising=True) + + cfg = { + "account": "acc", + "user": "usr", + "password": "pwd", + "warehouse": "wh", + "database": "DB1", + "schema": "SC1", + } + ex = SnowflakeSnowparkExecutor(cfg) + # sanity: we actually got our fake session + assert isinstance(ex.session, FakeSession) + return ex + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_init_sets_db_schema_and_con(sf_exec): + assert sf_exec.database == "DB1" + assert sf_exec.schema == "SC1" + # con must be present + assert isinstance(sf_exec.con, _SFCursorShim) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_q_and_qualified(sf_exec): + assert sf_exec._q("x") == '"x"' + assert sf_exec._qualified("TBL") == '"DB1"."SC1"."TBL"' + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_read_relation_calls_session_table(sf_exec): + node = Node(name="n", kind="sql", path=Path(".")) + df = sf_exec._read_relation("MY_TBL", node, deps=[]) + assert isinstance(df, FakeSnowparkDataFrame) + # session should have been asked for the fully qualified name + assert sf_exec.session.table_calls == ['"DB1"."SC1"."MY_TBL"'] + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_materialize_relation_happy(sf_exec, monkeypatch): + called: dict[str, str] = {} + + fake_df = SimpleNamespace( + write=SimpleNamespace( + save_as_table=lambda tbl, mode="overwrite": ( + called.setdefault("table", tbl), + called.setdefault("mode", mode), + ) + ) + ) + + monkeypatch.setattr( + sf_exec, + "_is_frame", + lambda obj: obj is fake_df, + raising=True, + ) + + node = Node(name="m", kind="python", path=Path(".")) + + # ACT + sf_exec._materialize_relation("OUT_TBL", fake_df, node) + + # ASSERT + assert called["table"] == 
'"DB1"."SC1"."OUT_TBL"' + assert called["mode"] == "overwrite" + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_materialize_relation_raises_on_non_frame(sf_exec): + node = Node(name="m", kind="python", path=Path(".")) + with pytest.raises(TypeError): + sf_exec._materialize_relation("OUT_TBL", object(), node) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_create_view_over_table_issues_sql(sf_exec): + node = Node(name="x", kind="sql", path=Path(".")) + sf_exec._create_view_over_table("V_USERS", "USERS", node) + assert any("CREATE OR REPLACE VIEW" in s for s in sf_exec.session.sql_calls) + sql = sf_exec.session.sql_calls[-1] + assert '"DB1"."SC1"."V_USERS"' in sql + assert '"DB1"."SC1"."USERS"' in sql + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_validate_required_single_df_ok(sf_exec): + SNDF = sf_mod.SNDF + df = SNDF(sf_exec.session) # type: ignore[call-arg] + df.schema = SimpleNamespace(names=["ID", "NAME", "AGE"]) # type: ignore[attr-defined] + + sf_exec._validate_required( + "model1", + df, + {"DB1.SC1.USERS": {"ID", "NAME"}}, + ) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_validate_required_single_df_missing(sf_exec): + SNDF = sf_mod.SNDF + df = SNDF(sf_exec.session) # type: ignore[call-arg] + df.schema = SimpleNamespace(names=["ID"]) # type: ignore[attr-defined] + + with pytest.raises(ValueError) as exc: + sf_exec._validate_required( + "model1", + df, + {"DB1.SC1.USERS": {"ID", "NAME"}}, + ) + + msg = str(exc.value) + assert "missing columns" in msg + assert "NAME" in msg + assert "ID" in msg + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_validate_required_multi_input_ok(sf_exec): + df1 = FakeSnowparkDataFrame(sf_exec.session, cols=["ID", "NAME"]) + df2 = FakeSnowparkDataFrame(sf_exec.session, cols=["USER_ID", "ORDER_ID"]) + sf_exec._validate_required( + "model2", + {"DB1.SC1.USERS": df1, "DB1.SC1.ORDERS": df2}, + {"DB1.SC1.USERS": {"ID"}, "DB1.SC1.ORDERS": {"ORDER_ID"}}, + ) # no raise + + +@pytest.mark.unit 
+@pytest.mark.snowflake +def test_validate_required_multi_input_missing_key(sf_exec): + df1 = FakeSnowparkDataFrame(sf_exec.session, cols=["ID", "NAME"]) + with pytest.raises(ValueError) as exc: + sf_exec._validate_required( + "model2", + {"DB1.SC1.USERS": df1}, + {"DB1.SC1.USERS": {"ID"}, "DB1.SC1.ORDERS": {"ORDER_ID"}}, + ) + assert "missing dependency key 'DB1.SC1.ORDERS'" in str(exc.value) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_columns_of(sf_exec): + df = FakeSnowparkDataFrame(sf_exec.session, cols=["A", "B"]) + assert sf_exec._columns_of(df) == ["A", "B"] + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_is_frame(sf_exec): + df = sf.DataFrame(sf_exec.session) + df.schema = SimpleNamespace(names=["ID"]) # type: ignore[attr-defined] + + assert sf_exec._is_frame(df) is True + assert sf_exec._is_frame(object()) is False + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_frame_name(sf_exec): + assert sf_exec._frame_name() == "Snowpark" + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_format_relation_for_ref(sf_exec): + r = sf_exec._format_relation_for_ref("my_model") + assert '"DB1"' in r and '"SC1"' in r and "my_model" in r + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_format_source_reference_happy(sf_exec): + cfg = {"identifier": "SRC_TBL", "database": "DBX", "schema": "RAW"} + ref = sf_exec._format_source_reference(cfg, "src", "tbl") + assert ref == '"DBX"."RAW"."SRC_TBL"' + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_format_source_reference_raises_on_location(sf_exec): + cfg = {"identifier": "X", "location": "s3://foo"} + with pytest.raises(NotImplementedError): + sf_exec._format_source_reference(cfg, "src", "tbl") + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_format_source_reference_raises_on_missing_identifier(sf_exec): + with pytest.raises(KeyError): + sf_exec._format_source_reference({}, "src", "tbl") + + +@pytest.mark.unit +@pytest.mark.snowflake +def 
test_create_or_replace_view_calls_session_sql(sf_exec): + node = Node(name="x", kind="sql", path=Path(".")) + sf_exec._create_or_replace_view('"DB1"."SC1"."V1"', "SELECT 1", node) + assert any("CREATE OR REPLACE VIEW" in s for s in sf_exec.session.sql_calls) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_create_or_replace_table_calls_session_sql(sf_exec): + node = Node(name="x", kind="sql", path=Path(".")) + sf_exec._create_or_replace_table('"DB1"."SC1"."T1"', "SELECT 1", node) + assert any("CREATE OR REPLACE TABLE" in s for s in sf_exec.session.sql_calls) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_create_or_replace_view_from_table_calls_session_sql(sf_exec): + node = Node(name="x", kind="sql", path=Path(".")) + sf_exec._create_or_replace_view_from_table("V1", "T1", node) + sql = sf_exec.session.sql_calls[-1] + assert "CREATE OR REPLACE VIEW" in sql + assert '"DB1"."SC1"."V1"' in sql + assert '"DB1"."SC1"."T1"' in sql + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_on_node_built_best_effort(monkeypatch, sf_exec): + called = {"ensure": 0, "upsert": 0} + + def fake_ensure(ex): + called["ensure"] += 1 + + def fake_upsert(ex, name, rel, fp, eng): + called["upsert"] += 1 + + monkeypatch.setattr(sf_exec_mod, "ensure_meta_table", fake_ensure) + monkeypatch.setattr(sf_exec_mod, "upsert_meta", fake_upsert) + + sf_exec.on_node_built(Node(name="m", kind="sql", path=Path(".")), '"DB1"."SC1"."M"', "fp123") + + assert called["ensure"] == 1 + assert called["upsert"] == 1 + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_exists_relation_true(sf_exec, monkeypatch): + # our fake session already returns one row for information_schema.tables + ok = sf_exec.exists_relation("SOME_TBL") + assert ok is True + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_exists_relation_false_on_error(sf_exec, monkeypatch): + def boom(sql: str): + raise RuntimeError("bad") + + monkeypatch.setattr(sf_exec.session, "sql", boom, raising=True) + ok = 
sf_exec.exists_relation("SOME_TBL") + assert ok is False + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_create_table_as_strips_semicolon(sf_exec): + sf_exec.session.sql_calls.clear() + sf_exec.create_table_as("DST", "SELECT 1;") + sql = sf_exec.session.sql_calls[-1] + assert "CREATE OR REPLACE TABLE" in sql + assert "SELECT 1" in sql + assert not sql.strip().endswith(";") + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_incremental_insert_strips_semicolon(sf_exec): + sf_exec.session.sql_calls.clear() + sf_exec.incremental_insert("DST", "SELECT 1;") + sql = sf_exec.session.sql_calls[-1] + assert "INSERT INTO" in sql + assert "SELECT 1" in sql + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_incremental_merge_builds_two_statements(sf_exec): + sf_exec.session.sql_calls.clear() + sf_exec.incremental_merge("DST", "SELECT 1 AS id", ["id"]) + sql = sf_exec.session.sql_calls[-1] + # both DELETE and INSERT statements should be in there + assert "DELETE FROM" in sql + assert "INSERT INTO" in sql + assert "WITH src AS" in sql + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_alter_table_sync_schema_adds_missing(sf_exec): + sf_exec.session.sql_calls.clear() + sf_exec.alter_table_sync_schema("EXISTING", "SELECT 1 AS ID, 2 AS NEW_COL") + # last sql should be ALTER TABLE ... ADD COLUMN ... 
+ assert any("ALTER TABLE" in s and "ADD COLUMN" in s for s in sf_exec.session.sql_calls) + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_alter_table_sync_schema_noop_on_unknown_mode(sf_exec): + sf_exec.session.sql_calls.clear() + sf_exec.alter_table_sync_schema("EXISTING", "SELECT 1", mode="replace_all") # unknown + # no new calls + assert sf_exec.session.sql_calls == [] + + +@pytest.mark.unit +@pytest.mark.snowflake +def test_sfcursorshim_execute_returns_rows(sf_exec): + class FakeRow: + """Mimic Snowpark Row: has attributes *and* asDict().""" + + def __init__(self, data: dict[str, Any]): + self._data = data + # make attributes accessible: r.A, r.B, ... + for k, v in data.items(): + setattr(self, k, v) + + def asDict(self) -> dict[str, Any]: + return self._data + + class DFWithRows(FakeSnowparkDataFrame): + def __init__(self, session, sql, cols, rows): + super().__init__(session, sql, cols) + self._rows = rows + + def collect(self) -> list[Any]: # type: ignore[override] + return self._rows + + def fake_sql(sql: str): + return DFWithRows( + sf_exec.session, + sql, + ["A", "B"], + [ + FakeRow({"A": 1, "B": "x"}), + FakeRow({"A": 2, "B": "y"}), + ], + ) + + # redirect the executor's session.sql to our fake + sf_exec.session.sql = fake_sql # type: ignore[assignment] + + # ACT + res = sf_exec.con.execute("SELECT 1") + + # ASSERT + assert isinstance(res, _SFResult) + assert res.fetchall() == [(1, "x"), (2, "y")] + assert res.fetchone() == (1, "x") diff --git a/tests/unit/test_core_python_tags.py b/tests/unit/test_core_python_tags.py index 0f84375..95a0f94 100644 --- a/tests/unit/test_core_python_tags.py +++ b/tests/unit/test_core_python_tags.py @@ -14,7 +14,7 @@ def test_python_model_tags_propagate_to_node(tmp_path: Path, monkeypatch) -> Non model_file.write_text( ( "from fastflowtransform import model\n\n" - "@model(name='py_tagged', tags=['example', 'demo'], meta={'materialized': 'view'})\n" + "@model(name='py_tagged', tags=['example', 'demo'], materialized='view')\n" 
"def build(df=None):\n" " return df\n" ),