diff --git a/.fastflowtransform/target/catalog.json b/.fastflowtransform/target/catalog.json new file mode 100644 index 0000000..a9f7728 --- /dev/null +++ b/.fastflowtransform/target/catalog.json @@ -0,0 +1,11 @@ +{ + "metadata": { + "generated_at": "2025-10-28T19:05:15+00:00", + "tool": "fastflowtransform" + }, + "relations": { + "failing": { + "columns": [] + } + } +} diff --git a/.fastflowtransform/target/manifest.json b/.fastflowtransform/target/manifest.json new file mode 100644 index 0000000..d3fd433 --- /dev/null +++ b/.fastflowtransform/target/manifest.json @@ -0,0 +1,20 @@ +{ + "macros": { + "snake": "text.py" + }, + "metadata": { + "generated_at": "2025-10-28T19:05:15+00:00", + "tool": "fastflowtransform" + }, + "nodes": { + "failing": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "failing", + "path": "", + "relation": "failing" + } + }, + "sources": {} +} diff --git a/.fastflowtransform/target/run_results.json b/.fastflowtransform/target/run_results.json new file mode 100644 index 0000000..fd4e084 --- /dev/null +++ b/.fastflowtransform/target/run_results.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "generated_at": "2025-10-28T19:05:15+00:00", + "tool": "fastflowtransform" + }, + "results": [ + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:15+00:00", + "http": null, + "message": "'boom'", + "name": "failing", + "started_at": "2025-10-28T19:05:15+00:00", + "status": "error" + } + ], + "run_finished_at": "2025-10-28T19:05:15+00:00", + "run_started_at": "2025-10-28T19:05:15+00:00" +} diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 0000000..31d629a --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,50 @@ +name: Build & Deploy MkDocs to GitHub Pages + +on: + push: + branches: [ main ] + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest 
+ steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Build site + run: mkdocs build --strict + + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@v3 + with: + path: ./site + + deploy: + needs: build + runs-on: ubuntu-latest + steps: + - name: Configure Pages + uses: actions/configure-pages@v5 + - name: Deploy to Pages + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index 97ebb6e..a9b5a99 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,5 @@ dist/ # Docs Output examples/**/docs/ tickets/** +site/dag/** +cache/** diff --git a/Makefile.pipeline b/Makefile.pipeline index fd0972d..9390625 100644 --- a/Makefile.pipeline +++ b/Makefile.pipeline @@ -25,9 +25,9 @@ dag: # Opens the generated DAG HTML on macOS/Linux; Windows users open it manually. demo-open: @if [ -f "$(FF_PROJECT)/site/dag/index.html" ]; then \ - $(OPENER) "$(FF_PROJECT)/site/dag/index.html" 2>/dev/null || echo "Öffne manuell: $(FF_PROJECT)/site/dag/index.html"; \ + $(OPENER) "$(FF_PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually: $(FF_PROJECT)/site/dag/index.html"; \ else \ - echo "Keine HTML-Datei gefunden: $(FF_PROJECT)/site/dag/index.html"; \ + echo "No HTML found: $(FF_PROJECT)/site/dag/index.html"; \ fi test: @@ -35,7 +35,7 @@ test: # End-to-end showcase: Seed → Run → DAG → Open → Tests demo: seed run dag demo-open test - @echo "\n✓ Demo fertig: Tabellen gebaut, DAG generiert, Tests gelaufen." + @echo "\n✓ Demo done."
clean: rm -rf .local "$(FF_PROJECT)/docs" dist build *.egg-info diff --git a/README.md b/README.md index 933e80f..9f27b75 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ make install # upgrades pip + installs FastFlowTransform in editable mode ## Quickstart > 📚 **Mehr lesen … CLI-Details** -> Für Flag-Referenzen, Automatisierung und Hintergründe siehe [`docs/Technical_Overview.md`](docs/Technical_Overview.md#cli-flows). +> For flag references, automation, and background, see [`docs/Technical_Overview.md`](docs/Technical_Overview.md#cli-flows). Run the end-to-end DuckDB demo (seed → run → docs → tests) in under a minute: @@ -188,8 +188,11 @@ Examples: - **Documentation hub:** choose your path (operators vs contributors) — see [`docs/index.md`](docs/index.md). - **User & operator guide:** project layout, CLI usage, troubleshooting tips — see [`docs/Technical_Overview.md`](docs/Technical_Overview.md). +- **Docgen shortcut:** append `--open-source` to `fft docgen ...` to launch the freshly rendered `index.html` immediately; use `--no-schema` when column introspection should be skipped. - **Modeling reference:** configuration, Jinja helpers, macros — see [`docs/Config_and_Macros.md`](docs/Config_and_Macros.md). -- **Examples:** runnable demo projects live under `examples/`; each README covers engine-specific setup. +- **API calls in Python models:** [`docs/Api_Models.md`](docs/Api_Models.md) +- **Database comments sync:** preview database comment updates with `fft sync-db-comments . --env dev --dry-run` before applying them to Postgres or Snowflake. +- **Examples:** runnable demo projects live under `examples/`. --- diff --git a/docs/Api_Models.md b/docs/Api_Models.md new file mode 100644 index 0000000..18708b3 --- /dev/null +++ b/docs/Api_Models.md @@ -0,0 +1,306 @@ +# API Calls in Python Models + +> **Status:** Experimental but stable for demos and smaller workflows.
+> **Goal:** Query HTTP APIs from Python models, return responses as DataFrames, cache and instrument them cleanly, and support reproducible offline runs. + +* [Motivation](#motivation) +* [Quickstart](#quickstart) +* [Programming API](#programming-api) + * [`get_json`](#get_json) + * [`get_df`](#get_df) + * [Pagination](#pagination) + * [Context & Telemetry](#context--telemetry) +* [CLI Flags & Environment Variables](#cli-flags--environment-variables) +* [Example Model](#example-model) +* [Artifacts](#artifacts) +* [Tests & Offline Demos](#tests--offline-demos) +* [Best Practices](#best-practices) +* [Troubleshooting](#troubleshooting) +* [Security & Compliance](#security--compliance) +* [FAQ](#faq) + +--- + +## Motivation + +Many pipelines need small, reliable API fetchers: configuration tables, miniature dimensions, feature flags, SaaS exports. This feature provides: + +- Simple HTTP calls inside Python models +- File-backed cache (reproducible builds, works offline) +- Per-node telemetry (requests, hits, bytes, hashes) +- CLI switches `--offline` and `--http-cache` for reproducible runs + +--- + +## Quickstart + +1. **Optionally enable flags** (recommended): + + ```bash + # No network - cache hits only + fft run . --env dev --offline + # Cache mode + fft run . --env dev --http-cache rw # rw|ro|off + ``` + +2. **Write a Python model**: + + ```python + # models/users_from_api.ff.py + import pandas as pd + from fastflowtransform.core import model + from fastflowtransform.api.http import get_df + + @model(name="users_from_api", deps=["users.ff"]) + def fetch(_: pd.DataFrame) -> pd.DataFrame: + df = get_df( + url="https://api.example.com/users", + params={"page": 1}, + record_path=["data"], # JSON -> list -> DataFrame + ) + return df + ``` + +3. **Run it**: + + ```bash + fft run . 
--env dev --select users_from_api + ``` + +--- + +## Programming API + +> Module: `fastflowtransform.api.http` + +### `get_json` + +```python +from fastflowtransform.api.http import get_json + +data = get_json( + url="https://api.example.com/objects", + params={"page": 1}, # optional + headers={"Authorization": "Bearer ..."}, # optional + timeout=20, # optional (seconds) +) +# -> Python dict / list +``` + +**Behavior** + +- Reads from the local cache (when present and valid). +- Writes to the cache (`rw` mode), including the response body. +- Respects offline mode (no network traffic). + +### `get_df` + +```python +from fastflowtransform.api.http import get_df + +df = get_df( + url="https://api.example.com/users", + params={"page": 1}, + record_path=["data"], # path to the JSON list + normalize=True, # optional: flatten nested objects + paginator=None, # optional: pagination strategy (see below) +) +# -> pandas.DataFrame +``` + +**Conversion** + +- Default: `record_path` points to the array payload (for example `["data"]`). +- `normalize=True` delegates to `json_normalize` for deeper structures. + +### Pagination + +For paged APIs you can describe the next request declaratively: + +```python +def paginator(url: str, params: dict | None, json_obj: dict): + next_url = json_obj.get("next") # e.g. 
absolute URL + if next_url: + return {"next_request": {"url": next_url}} + return None + +df = get_df( + "https://api.example.com/users?page=1", + paginator=paginator, + record_path=["data"], +) +``` + +The paginator may return the following fields: + +- `{"next_request": {"url": "...", "params": {...}, "headers": {...}}}` + (any missing field keeps its previous value) + +### Context & Telemetry + +During a model run the executor collects telemetry per node and writes it into `run_results.json`: + +- `requests` (count) +- `cache_hits` +- `bytes` (sum of response bodies) +- `used_offline` (bool) +- `keys` (cache keys) +- `entries` (optional compact array with URL, status, content hash) + +You will find these metrics under the `http` block of each node (see [Artifacts](#artifacts)). + +--- + +## CLI Flags & Environment Variables + +**CLI** + +- `--offline` + Sets `FF_HTTP_OFFLINE=1`; network requests are blocked, **cache hits only**. +- `--http-cache {off|ro|rw}` + Sets `FF_HTTP_CACHE_MODE`: + + - `off`: neither read nor write. + - `ro`: read-only (hits), **no** writes. + - `rw`: read and write (default). 
+ +**Environment (optional to set directly)** + +| Variable | Default | Effect | +| ------------------------ | ------------------------------- | ----------------------------------- | +| `FF_HTTP_OFFLINE` | `0` | `1/true/on` -> offline mode | +| `FF_HTTP_CACHE_MODE` | `rw` | `off` / `ro` / `rw` | +| `FF_HTTP_CACHE_DIR` | `.fastflowtransform/http_cache` | Cache directory | +| `FF_HTTP_TTL` | `0` | Seconds; 0 = never expires | +| `FF_HTTP_TIMEOUT` | `20` | Request timeout (seconds) | +| `FF_HTTP_MAX_RETRIES` | `3` | Basic retry count | +| `FF_HTTP_RATE_LIMIT_RPS` | `0` | Requests per second (0 = unlimited) | + +--- + +## Example Model + +```python +# models/dim_countries_from_api.ff.py +import pandas as pd +from fastflowtransform.core import model +from fastflowtransform.api.http import get_df + +@model(name="dim_countries_from_api", deps=["users.ff"]) +def countries(_: pd.DataFrame) -> pd.DataFrame: + def pager(u, p, js): + nxt = js.get("paging", {}).get("next") + return {"next_request": {"url": nxt}} if nxt else None + + df = get_df( + url="https://api.example.com/countries?page=1", + paginator=pager, + record_path=["data"], + normalize=True, + ) + # lightweight post-processing + if "code" in df.columns: + df["code"] = df["code"].str.upper() + return df +``` + +Run: + +```bash +fft run . 
--env dev --select dim_countries_from_api --http-cache ro +``` + +--- + +## Artifacts + +`/.fastflowtransform/target/run_results.json` (excerpt): + +```json +{ + "results": [ + { + "name": "dim_countries_from_api", + "status": "success", + "duration_ms": 153, + "http": { + "requests": 2, + "cache_hits": 2, + "bytes": 1842, + "used_offline": true, + "keys": ["GET:https://api.example.com/countries?page=1|{}|{}", "..."], + "entries": [ + {"url": "https://api.example.com/countries?page=1", "status": 200, "content_hash": "sha256:..."}, + {"url": "https://api.example.com/countries?page=2", "status": 200, "content_hash": "sha256:..."} + ] + } + } + ] +} +``` + +> Note: When a node is **skipped** (fingerprint cache hit), no new `http` block is emitted - the model did not run. + +--- + +## Tests & Offline Demos + +- Place unit tests under `tests/api/...` and seed the cache directly (no real HTTP calls). +- Suggested scenarios: + + - **Offline hit:** set `FF_HTTP_OFFLINE=1`, seed the cache, `get_json/get_df` must succeed. + - **Cache mode `off`:** even with cache entries, **no** reads; expect a failure in offline mode. + - **`ro`:** allow read hits; **no** cache writes after a real or mocked request. + - **Pagination:** stitch several pages from offline fixtures; telemetry should count requests/hits. + +--- + +## Best Practices + +- **Stable URLs and parameter order** produce identical cache keys and reproducible builds. +- **Keep `record_path` shallow**; use `normalize=True` only when necessary (performance). +- **Never cache secrets:** provide tokens via headers; the response body and metadata are cached. +- **Use `--offline` in CI** for deterministic tests with a pre-seeded cache. +- **Set TTL intentionally** when APIs change frequently. + +--- + +## Troubleshooting + +- **“offline + cache miss”** + Seed the cache (see tests) or disable offline mode. +- **“Schema mismatch”** + Harmonize columns after `get_df` (types, missing keys). 
+- **“Too many requests”** + Configure `FF_HTTP_RATE_LIMIT_RPS`; make pagination more efficient (larger `page_size`). +- **“No http block”** + Was the node **skipped** (fingerprint cache)? Or did the model avoid HTTP calls altogether? + +--- + +## Security & Compliance + +- **Do not commit secrets** - use environment variables or a secret manager. +- **PII/GDPR:** verify whether the API returns personal data; minimise retention. +- **Cache directory:** keep it in `.gitignore`; encrypt or isolate it if necessary. + +--- + +## FAQ + +**Q:** Can I call other libraries (for example `requests`, `httpx`) directly? +**A:** Yes, but you lose telemetry and caching. The recommended entrypoint is `fastflowtransform.api.http`. + +**Q:** How do I add custom headers (for example OAuth)? +**A:** Pass `headers={...}`. Store sensitive values in env vars and inject them into your models. + +**Q:** Does this work for POST requests? +**A:** Release R1 focuses on GET. Please open an issue for POST/PUT support; the design can be extended. + +--- + +**See also:** + +- Technical guide: *Developer Guide – Architecture & Internals* +- Unit tests: `tests/api/test_http_*.py` +- Runtime & cache: *Parallelism & Cache (v0.3)* diff --git a/docs/Config_and_Macros.md b/docs/Config_and_Macros.md index c470ab6..fcef06d 100644 --- a/docs/Config_and_Macros.md +++ b/docs/Config_and_Macros.md @@ -97,12 +97,34 @@ def enrich(df: pd.DataFrame) -> pd.DataFrame: ```yaml # sources.yml -crm: - users: - identifier: seed_users -erp: - orders: - identifier: seed_orders +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users + - name: erp + tables: + - name: orders + identifier: seed_orders +``` + +Each source can declare defaults such as `schema`, `database`, or `catalog`. 
Tables may +override those defaults, add per-engine overrides, or point at files: + +```yaml + - name: raw + schema: staging + tables: + - name: seed_users + identifier: seed_users + overrides: + postgres: + schema: raw + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" ``` --- diff --git a/docs/Data_Quality_Tests.md b/docs/Data_Quality_Tests.md new file mode 100644 index 0000000..65082d7 --- /dev/null +++ b/docs/Data_Quality_Tests.md @@ -0,0 +1,173 @@ +# Data Quality Test Reference + +FastFlowTransform exposes a set of built-in data quality checks that you can configure in `project.yml → tests:` and execute with `fft test`. This document lists every supported test, required parameters, and example configurations. + +## Usage Overview + +```yaml +# project.yml +tests: + - type: not_null + table: users + column: id + severity: error # default (omit for error) + tags: [batch] + + - type: unique + table: users + column: email + tags: [batch] + + - type: accepted_values + table: users + column: status + values: [active, invited] + severity: warn # warn keeps run green on failure + + - type: row_count_between + table: users_enriched + min: 1 + max: 100000 + + - type: reconcile_equal + name: revenue_vs_bookings # optional label in summaries + tags: [reconcile] + left: { table: fct_revenue, expr: "sum(amount)" } + right: { table: fct_bookings, expr: "sum(expected_amount)" } + abs_tolerance: 5.0 +``` + +Every entry is a single dictionary describing one check. The common keys are: + +| Key | Description | +|------------|-------------| +| `type` | Test kind (see tables below). | +| `table` | Target table for table-level checks or display hint for reconciliations. | +| `column` | Required for column-scoped checks (`not_null`, `unique`, …). | +| `severity` | `error` (default) or `warn`. | +| `tags` | Optional list of selectors for `fft test --select tag:...`. | +| `name` | Optional identifier surfaced in summaries (useful for reconciliations). 
| + +Run all configured checks: + +```bash +fft test . --env dev +``` + +Use `--select tag:` to restrict by tags (legacy `--select batch` reads the same tags list). Tests always execute regardless of cache settings. + +Each entry produces a summary line. Failures stop the command unless `severity: warn` is set. + +## Table-Level Checks + +These checks operate on a single table (optionally filtered with `where:`). Unless noted, they require a `column` argument. + +### `not_null` +- **Purpose:** Assert that a column never contains NULLs. +- **Parameters:** + - `column` *(str, required)* + - `where` *(str, optional)* — SQL predicate applied before the NULL check. +- **Failure:** Reports the number of NULL rows and shows the underlying SQL. + +### `unique` +- **Purpose:** Detect duplicates within a column. +- **Parameters:** + - `column` *(str, required)* + - `where` *(str, optional)* +- **Failure:** Indicates how many duplicate groups were found (HAVING count > 1) and shows a sample query. + +### `accepted_values` +- **Purpose:** Ensure every non-NULL value is inside an allowed set. +- **Parameters:** + - `column` *(str, required)* + - `values` *(list, required)* — permitted literals (strings are quoted automatically). + - `where` *(str, optional)* +- **Failure:** Shows the number of out-of-set values plus up to five sample values. + +### `greater_equal` +- **Purpose:** Require all values to be greater than or equal to a threshold. +- **Parameters:** + - `column` *(str, required)* + - `threshold` *(number, default `0`)* +- **Failure:** Lists how many rows fell below the threshold. + +### `non_negative_sum` +- **Purpose:** Validate that the sum of a numeric column is not negative. +- **Parameters:** + - `column` *(str, required)* +- **Failure:** Reports the signed sum when it is negative. + +### `row_count_between` +- **Purpose:** Guard minimum (and optional maximum) row counts for a table. 
+- **Parameters:** + - `min` *(int, default `1`)* + - `max` *(int, optional)* — omit for open-ended upper bounds. +- **Failure:** Indicates the observed row count when it falls outside `[min, max]`. + +### `freshness` +- **Purpose:** Warn when the latest timestamp is older than an allowed delay. +- **Parameters:** + - `column` *(str, required)* — timestamp column. + - `max_delay_minutes` *(int, required)* — permitted staleness. +- **Failure:** Reports the computed lag in minutes. Uses Postgres-style `DATE_PART` (works on DuckDB/Postgres; extend for other engines as needed). + +## Cross-Table Reconciliations + +Reconciliation checks compare aggregates or keys across two relations. Their configuration accepts dictionaries describing the left/right side expressions or keys. + +### `reconcile_equal` +- **Purpose:** Compare two scalar expressions with optional tolerances. +- **Parameters:** + - `left`, `right` *(dict, required)* with keys: + - `table` *(str, required)* + - `expr` *(str, required)* — SQL select expression (e.g. `sum(amount)`). + - `where` *(str, optional)* + - `abs_tolerance` *(float, optional)* — maximum absolute difference. + - `rel_tolerance_pct` *(float, optional)* — maximum relative difference in percent. +- **Failure:** Displays both values, absolute and relative differences. + +### `reconcile_ratio_within` +- **Purpose:** Constrain the ratio `left/right` within bounds. +- **Parameters:** + - `left`, `right` *(dict, required as above)* + - `min_ratio`, `max_ratio` *(float, required)* +- **Failure:** Shows the computed ratio and expected interval. + +### `reconcile_diff_within` +- **Purpose:** Limit the absolute difference between two aggregates. +- **Parameters:** + - `left`, `right` *(dict, required)* + - `max_abs_diff` *(float, required)* +- **Failure:** Reports the absolute difference when it exceeds `max_abs_diff`. + +### `reconcile_coverage` +- **Purpose:** Ensure every key present in a source table appears in a target table (anti-join zero).
+- **Parameters:** + - `source` *(dict, required)* — `table` and `key` column. + - `target` *(dict, required)* — `table` and `key` column. + - `source_where` *(str, optional)* — filter applied to the source. + - `target_where` *(str, optional)* — filter applied to the target. +- **Failure:** Reports the number of missing keys. + +## Severity & Selectors + +- `severity: error` (default) makes failures stop the test run with exit code 1. +- `severity: warn` records the result but keeps the run successful. +- `tags:` lets you group checks under named tokens (e.g. `batch`, `streaming`). Use `fft test --select tag:batch` to execute a subset. + +## CLI Summary Output + +Each executed check produces a line in the summary: + +``` +✓ not_null users.email (3ms) +✖ accepted_values events.status values=['new', 'active'] (warn) +``` + +Failures include the generated SQL (where available) to simplify debugging. Use `fft test --verbose` for more detail, or `FFT_SQL_DEBUG=1` to log the underlying queries. + +## Further Reading + +- [`docs/YAML_Tests.md`](YAML_Tests.md) – schema for YAML-defined tests and advanced scenarios. +- [`fastflowtransform/testing.py`](../src/fastflowtransform/testing.py) – implementation details and helper functions. +- `fft test --help` — command-line switches, selectors, and cache options. diff --git a/docs/Incremental.md b/docs/Incremental.md new file mode 100644 index 0000000..527f25b --- /dev/null +++ b/docs/Incremental.md @@ -0,0 +1,66 @@ +# Incremental Models (R1) + +This guide explains how to configure incremental models and use `is_incremental()` in SQL, and covers engine compatibility and schema change policies.
+ +## Quick Start + +A minimal incremental model: + +```sql +-- examples/r1_demo/models/fct_events_inc.ff.sql (on_schema_change also accepts 'sync_all_columns') +{{ config( + materialized='incremental', + unique_key=['event_id'], + on_schema_change='append_new_columns' +) }} +with src as ( + select * from {{ source('app', 'events') }} + {% if is_incremental() %} + where ingested_at > (select coalesce(max(ingested_at), timestamp '1970-01-01') from {{ this.name }}) + {% endif %} +) +select + event_id, + user_id, + event_type, + ingested_at, + -- evolving column: will appear later + meta_json +from src; +``` + +### `is_incremental()` + +* Available in SQL templates during rendering. +* Returns `true` when the model exists and the current `materialized='incremental'` run chooses an incremental path (insert/merge) instead of full rebuild. +* Typical usage: filter the source to “new” rows only. + +### Engine Matrix (MVP) + +| Engine | Incremental Insert | Merge/Upsert | Schema Change Policy | +| ------------------ | ------------------ | ------------ | -------------------- | +| DuckDB | ✅ insert | 🚧 fallback* | ✅ append new cols | +| Postgres | ✅ insert | 🚧 fallback* | ✅ append new cols | +| BigQuery (classic) | ✅ insert | 🚧 fallback* | 🚧 best-effort | +| BigQuery BigFrames | ✅ insert | 🚧 fallback* | 🚧 best-effort | +| Databricks Spark | ✅ insert | 🚧 fallback* | 🚧 best-effort | +| Snowflake Snowpark | ✅ insert | 🚧 fallback* | 🚧 best-effort | + +\* Fallback strategy merges by delete-on-keys + insert (best effort) if native merge isn’t wired. + +### Schema Change Policies + +* `append_new_columns` (default): new columns appear in target if they show up in the select. +* `sync_all_columns` (planned): attempt to keep type/nullable alignment. Currently not enforced; prefer append in R1.
+ +### End-to-End + +```bash +# Seeds → initial incremental build → run again with filter +fft seed examples/r1_demo --env dev +fft run examples/r1_demo --env dev --select fct_events_inc.ff +# simulate new data (re-seed or append), then: +fft run examples/r1_demo --env dev --select fct_events_inc.ff +``` + +**Artifacts:** see `.fastflowtransform/target/{manifest.json, run_results.json, catalog.json}`. diff --git a/docs/Quickstart.md b/docs/Quickstart.md new file mode 100644 index 0000000..1c20f12 --- /dev/null +++ b/docs/Quickstart.md @@ -0,0 +1,74 @@ +# Quickstart + +This guide walks you through creating a minimal FastFlowTransform project from scratch and running it end-to-end. + +## 1. Install & bootstrap + +```bash +python -m venv .venv +. .venv/bin/activate +pip install -e ./fastflowtransform +fft --help +``` + +## 2. Create project layout + +```bash +mkdir -p demo/{models,seeds} +cat <<'YAML' > demo/sources.yml +version: 2 + +sources: + - name: raw + schema: staging + tables: + - name: users + identifier: seed_users +YAML + +cat <<'CSV' > demo/seeds/seed_users.csv +id,email +1,a@example.com +2,b@example.com +CSV + +cat <<'SQL' > demo/models/users.ff.sql +{{ config(materialized='table') }} +select id, email +from {{ source('raw', 'users') }} +SQL +``` + +## 3. Seed static inputs + +```bash +fft seed demo --profile dev +``` + +This materializes the CSV into the configured engine (DuckDB by default) using `seed_users` as the physical table. + +## 4. Run the pipeline + +```bash +fft run demo --cache off +``` + +You should see log lines similar to `✓ L01 [DUCK] users.ff`. The resulting table lives in the target schema (`staging` in this example). + +## 5. Inspect artifacts + +- `.fastflowtransform/target/manifest.json` → model graph + sources +- `.fastflowtransform/target/run_results.json` → run outcomes and durations + +## 6. 
Add more models (optional) + +- Reference other models with `{{ ref('model_name') }}` +- Configure tags or materializations via `{{ config(...) }}` at the top of each SQL file + +## 7. Next steps + +- Add `project.yml` for reusable `vars:` and metadata +- Explore `fft docs` to generate HTML documentation +- Use engine profiles under `profiles.yml` to target Postgres, BigQuery, or Databricks (path-based sources supported via `format` + `location` overrides) + +Refer to `docs/Config_and_Macros.md` for advanced configuration options. diff --git a/docs/State_Selection.md b/docs/State_Selection.md new file mode 100644 index 0000000..e9590dd --- /dev/null +++ b/docs/State_Selection.md @@ -0,0 +1,39 @@ + +# State Selection — R1 + +Build only changed nodes or select by last run results. + +## Changed Nodes + +- `state:modified` — models that have changed since the last cached fingerprint. +- `state:modified+` — the above plus all downstream dependents. + +```bash +# First run populates cache +fft run examples/r1_demo --env dev --cache rw +# Touch files / change SQL → next run: +fft run examples/r1_demo --env dev --cache rw --select state:modified +fft run examples/r1_demo --env dev --cache rw --select state:modified+ +``` + +## Result-based Selection + +Use the last `run_results.json`: + +* `result:ok` — successful models (no warnings) +* `result:warn` — successful but with warnings +* `result:fail` — alias of `result:error` +* `result:error` — failed models + +```bash +fft run examples/r1_demo --env dev --select result:error +``` + +### Artifacts + +``` +examples/r1_demo/.fastflowtransform/target/ +├── manifest.json +├── run_results.json +└── catalog.json +``` \ No newline at end of file diff --git a/docs/Technical_Overview.md b/docs/Technical_Overview.md index b4a472e..4206508 100644 --- a/docs/Technical_Overview.md +++ b/docs/Technical_Overview.md @@ -1,4 +1,4 @@ -# 🧭 FastFlowTransform – Technical Developer Documentation (v0.1) +# 🧭 FastFlowTransform – Technical Developer
Documentation (v0.4) > Status: latest updates from your context dump. This document consolidates project structure, architecture, core APIs, error handling, CLI, examples, and roadmap into a print/git-friendly Markdown. > @@ -172,11 +172,31 @@ Targets wrap the CLI commands showcased below. Feel free to copy the pattern int ### CLI Flows -> 📚 **Mehr lesen … Quickstart & Rezepte** -> Die Schritt-für-Schritt-Befehle findest du im [`README.md`](../README.md#quickstart). Dort bleibt der vollständige Ablauf gepflegt; dieser Abschnitt fokussiert auf weiterführende Hinweise. +> 📚 **Need recipes?** The step-by-step walkthrough lives in [`README.md`](../README.md#quickstart); this section highlights additional guidance. -- CLI-Flags und interne Abläufe sind im Abschnitt [CLI Implementation](#cli-implementation) dokumentiert. -- Beispiele für Automatisierung findest du in den [Makefile Targets](#makefile-targets). +- CLI flags and internals are documented under [CLI Implementation](#cli-implementation). +- Automation examples appear in the [Makefile Targets](#makefile-targets). + + +#### HTTP/API in Python models +See [API calls in Python models](./Api_Models.md) for `get_json`/`get_df`, pagination, and cache/offline flags. + + +#### DAG & Documentation + +- Narrow the graph with `fft dag ... --select <pattern>` (for example `state:modified` or `tag:finance`). Combined with `--html` this produces a focused mini site. +- Control schema introspection via `--with-schema/--no-schema`. Use `--no-schema` when the executor should avoid fetching column metadata (for example, BigQuery without sufficient permissions). +- `fft docgen` renders the DAG, model pages, and an optional JSON manifest in one command. Append `--open-source` to open `index.html` in your default browser after rendering. + +#### Sync Database Comments + +`fft sync-db-comments <project> --env <env>` pushes model and column descriptions from project YAML or Markdown into database comments.
The command currently supports Postgres and Snowflake Snowpark: + +- Start with `--dry-run` to review the generated `COMMENT` statements. +- Postgres honors `profiles.yml -> postgres.db_schema` (and any `FF_PG_SCHEMA` override). +- Snowflake reuses the session or connection exposed by the executor. + +If no descriptions are found, the command exits without making changes. ### Logging & Verbosity @@ -238,6 +258,7 @@ fft run . -vv # full debug + SQL channel Notes: - UTests key the cache with `profile="utest"`. - Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. +- `--reuse-meta` is currently a reserved flag: it is exposed in the CLI, acts as a no-op today, and will enable future meta-table optimizations. #### Why? @@ -660,6 +681,8 @@ fft dag . --env dev --html fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json ``` +Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. + **Descriptions** can be provided in YAML (project.yml) and/or Markdown files. Markdown has higher priority. YAML in `project.yml`: @@ -908,9 +931,12 @@ Operational usage lives in [CLI Flows](#cli-flows). This section drills into the **Commands:** - `fft run [--env dev] [--engine ...]` -- `fft dag [--env dev] [--html]` -- `fft test [--env dev] [--select batch|streaming]` +- `fft dag [--env dev] [--html] [--select ...] 
[--with-schema/--no-schema]` +- `fft docgen [--env dev] [--out dir] [--emit-json path] [--open-source]` +- `fft test [--env dev] [--select batch|streaming|tag:...]` - `fft seed [--env dev]` +- `fft sync-db-comments [--env dev] [--dry-run]` +- `fft utest [--env dev] [--cache off|ro|rw] [--reuse-meta]` - `fft --version` **Key components:** diff --git a/docs/YAML_Tests.md b/docs/YAML_Tests.md new file mode 100644 index 0000000..a97f3a4 --- /dev/null +++ b/docs/YAML_Tests.md @@ -0,0 +1,54 @@ +# YAML Tests (Schema-bound) + +Schema-bound tests live in `models/*.yml` or `models/**/schema.yml` and complement (or replace) `project.yml`-based tests. + +## Example + +```yaml +# examples/r1_demo/models/users_enriched.yml +version: 2 +models: + - name: users_enriched + description: "Adds gmail flag" + columns: + - name: id + tests: + - not_null: { severity: error } + - unique + - name: email + tests: + - not_null + - accepted_values: + values: ["a@example.com","b@example.com","c@gmail.com"] + severity: warn +``` + +### Severities + +* `error` → contributes to failures (exit code 2). +* `warn` → surfaced in summary as ❕, does not affect exit code. + +### Run + +```bash +fft test examples/r1_demo --env dev +# Select only tests tagged 'reconcile' (if present) +fft test examples/r1_demo --env dev --select tag:reconcile +``` + +### Output (excerpt) + +``` +Data Quality Summary +──────────────────── +✅ not_null users.id (3ms) +❌ unique users.id (2ms) + ↳ [unique] users.id: found 1 duplicate +❕ accepted_values users_enriched.email (1ms) + +Totals +────── +✓ passed: 2 +✗ failed: 1 +!
warnings: 1 +``` \ No newline at end of file diff --git a/docs/examples/Environment_Matrix.md b/docs/examples/Environment_Matrix.md new file mode 100644 index 0000000..57b5132 --- /dev/null +++ b/docs/examples/Environment_Matrix.md @@ -0,0 +1,228 @@ +# Environment Matrix (DuckDB-only) — Example + +This tiny project demonstrates **per-environment configuration** (dev / stg / prod) while keeping everything on **DuckDB**. +Each environment uses its **own DuckDB file**, so you can switch environments without changing code. + +It also includes a **seed step** (CSV → table) and two minimal models: + +* `env_vars.ff` (Python) — echoes which env is active and which DuckDB file is used +* `hello.ff` (SQL view) — shows how `{{ this.* }}` resolves from the active profile +* `users.ff` (SQL table) — reads from the seeded CSV table to prove seeding works + +--- + +## What this shows + +* Layered environment files: `.env.dev`, `.env.stg`, `.env.prod` (+ optional `*.local` overrides) +* `profiles.yml` that reads from `env('…')` so connection details live in env files +* All environments use **DuckDB**, but **different DB files** (e.g. 
`.local/env_matrix.dev.duckdb`, `.local/env_matrix.stg.duckdb`, …) +* Seeding CSV → `seed_users` table, then a simple model consuming it + +--- + +## Project layout + +``` +examples/env_matrix/ +├─ models/ +│ ├─ env_vars.ff.py # Python model: shows env + DuckDB file info +│ └─ users.ff.sql # SQL table: reads from seeded 'seed_users' +├─ seeds/ +│ └─ users.csv # sample data for seeding (-> seed_users) +├─ profiles.yml # all envs = DuckDB, different paths +├─ .env # shared defaults (optional) +├─ .env.dev # dev environment vars +├─ .env.stg # stg environment vars +├─ .env.prod # prod environment vars +├─ .env.dev.local # private overrides (gitignored; optional) +├─ .env.stg.local # private overrides (gitignored; optional) +├─ .env.prod.local # private overrides (gitignored; optional) +└─ Makefile # convenience targets (run, seed, dag) +``` + +--- + +## Environment files + +Each env file sets a different DuckDB path: + +* `.env.dev` + + ``` + FFT_ACTIVE_ENV=dev + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.dev.duckdb + ``` + +* `.env.stg` + + ``` + FFT_ACTIVE_ENV=stg + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.stg.duckdb + ``` + +* `.env.prod` + + ``` + FFT_ACTIVE_ENV=prod + FF_ENGINE=duckdb + FF_DUCKDB_PATH=.local/env_matrix.prod.duckdb + ``` + +> You can place secrets or machine-local tweaks in the matching local override file (`.env.dev.local`, `.env.stg.local`, or `.env.prod.local`); these files are ignored by git. 
+> Optional toggles (if you want verbose SQL logs): +> `FFT_SQL_DEBUG=1`, `FFT_LOG_JSON=1` + +--- + +## `profiles.yml` (DuckDB for all envs) + +```yaml +default: + dev: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" + + stg: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" + + prod: + engine: "{{ env('FF_ENGINE', 'duckdb') }}" + duckdb: + path: "{{ env('FF_DUCKDB_PATH', ':memory:') }}" +``` + +--- + +## Models + +### `models/env_vars.ff.py` (Python) + +Returns one row with: + +* `active_env_hint` (from `.env.*`), +* `ff_engine` (should be `duckdb` here), +* `duckdb_path`, `duckdb_exists`, `duckdb_size_bytes`. + +### `models/hello.ff.sql` (SQL view) + +Uses `{{ this.materialized }}`, `{{ this.schema }}`, `{{ this.database }}` so you can see what the active profile provides. (The simple `SELECT` is compatible with DuckDB; if you added casts like `::text`, they’re fine in DuckDB too.) + +### `models/users.ff.sql` (SQL table) + +Reads from the seeded table `seed_users`: + +```sql +{{ config(materialized='table', tags=['demo', 'seed']) }} + +select + id, + email +from "seed_users"; +``` + +> If you see an error “table seed_users does not exist”, you **haven’t run `fft seed`** for that environment yet. + +--- + +## Seeds + +`seeds/users.csv` is loaded by `fft seed` into a table named `seed_users`. +(That’s the default naming convention: `users.csv` → `seed_users`.) 
+ +--- + +## Running it + +From the repo root: + +### Using `uv` directly + +**Dev** + +```bash +uv run fft seed examples/env_matrix --env dev +uv run fft run examples/env_matrix --env dev +uv run fft dag examples/env_matrix --env dev --html +``` + +**Staging** + +```bash +uv run fft seed examples/env_matrix --env stg +uv run fft run examples/env_matrix --env stg +``` + +**Prod** + +```bash +uv run fft seed examples/env_matrix --env prod +uv run fft run examples/env_matrix --env prod +``` + +### Using the Makefile (inside `examples/env_matrix/`) + +```bash +make run-dev # runs the DAG on dev +make run-stg +make run-prod + +make seed-dev # seed only (dev) +make seed-stg +make seed-prod + +make dag-dev # generate HTML DAG for dev +make clean # remove .local/, docs/, site/, .fastflowtransform/ +``` + +> Tip: re-run `fft seed` whenever you switch environments or change `seeds/*.csv`. + +--- + +## Inspecting results + +* The **HTML DAG** (after `make dag-dev`) will be at: + + ``` + examples/env_matrix/site/dag/index.html + ``` +* The **artifacts** are under: + + ``` + examples/env_matrix/.fastflowtransform/target/{manifest.json, run_results.json, catalog.json} + ``` +* Query the DuckDB files directly with the `duckdb` CLI or Python + the `duckdb` module if you want to peek inside. + +--- + +## Troubleshooting + +* **`seed_users` not found** + Run `fft seed` for the same environment: + `uv run fft seed examples/env_matrix --env dev` + +* **No logs showing** + Use `-v`/`-vv` and/or `--sql-debug` on the CLI, or set: + + ``` + FFT_SQL_DEBUG=1 + FFT_LOG_JSON=1 # optional JSON logs + ``` + +* **Wrong environment picked** + Double-check the `--env` flag in your CLI call and ensure the matching env file (`.env.dev`, `.env.stg`, or `.env.prod`) exists. 
+ +--- + +## Clean up + +```bash +make clean # from examples/env_matrix/ +# or manually: +rm -rf examples/env_matrix/.local examples/env_matrix/site examples/env_matrix/docs +rm -rf examples/env_matrix/.fastflowtransform +``` diff --git a/docs/index.md b/docs/index.md index e1a678a..735f88c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,13 +5,18 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t --- ## Docs Navigation -1. **Getting Started** — you are here (`docs/index.md`) -2. [User Guide](./Technical_Overview.md#part-i--operational-guide) -3. [Modeling Reference](./Config_and_Macros.md) -4. [Parallelism & Cache](./Cache_and_Parallelism.md) -5. [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) -6. [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs--lineage) -7. [Developer Guide](./Technical_Overview.md#part-ii--architecture--internals) +- **Getting Started** — you are here (`docs/index.md`) +- [User Guide](./Technical_Overview.md#part-i--operational-guide) +- [Modeling Reference](./Config_and_Macros.md) +- [Parallelism & Cache](./Cache_and_Parallelism.md) +- [API calls in Python models](./API_Models.md) +- [Incremental Models](./Incremental.md) +- [YAML Tests (Schema-bound)](./YAML_Tests.md) +- [Data Quality Tests Reference](./Data_Quality_Tests.md) +- [State Selection (changed & results)](./State_Selection.md) +- [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) +- [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs--lineage) +- [Developer Guide](./Technical_Overview.md#part-ii--architecture--internals) ## Table of Contents @@ -26,7 +31,7 @@ Welcome! This page is your starting point for FastFlowTransform docs. Pick the t ### 1. Build & Operate Projects (Data Practitioners) -- **Get set up quickly:** run the DuckDB demo or install locally via the [Quickstart](../README.md#quickstart). 
+- **Get set up quickly:** follow the dedicated [Quickstart](Quickstart.md) guide for installation, seeding, and a first run. - **Understand the project layout & CLI workflow:** see *Project Layout*, *Makefile Targets*, and *CLI Flows* in the [Technical Overview](Technical_Overview.md#project-layout). - **Configure runtimes & profiles:** review executor profiles, environment overrides, and logging options in the [Technical Overview](Technical_Overview.md#profiles--environment-overrides). - **Model data quality & troubleshoot runs:** the [Technical Overview](Technical_Overview.md#model-unit-tests-fastflowtransform-utest) covers unit tests, troubleshooting tips, and exit codes. diff --git a/examples/api_demo/Makefile b/examples/api_demo/Makefile new file mode 100644 index 0000000..f4e0d63 --- /dev/null +++ b/examples/api_demo/Makefile @@ -0,0 +1,164 @@ +.PHONY: demo seed run dag test artifacts incr state-mod state-mod-plus res-error res-warn \ + clean demo-open \ + api-run api-offline api-warm api-cache-clear api-show-http api-demo + +# --- Config ------------------------------------------------------------------- + +# DuckDB database file and project path (for the API demo typically examples/api_demo) +DB ?= .local/demo.duckdb +PROJECT ?= . 
+UV ?= uv +# Engine selector (duckdb | postgres | databricks_spark). Defaults to DuckDB so that +# ENGINE_ENV / ENGINE_TAG are never empty when seed/run/test/dag are invoked without ENGINE=... +ENGINE ?= duckdb + +# HTTP wrapper defaults (override per call if needed) +# Allowed domains are comma-separated (no https://) +FF_HTTP_ALLOWED_DOMAINS ?= jsonplaceholder.typicode.com,api.github.com +FF_HTTP_CACHE_DIR ?= .local/http-cache +FF_HTTP_MAX_RPS ?= 5 +FF_HTTP_MAX_RETRIES ?= 3 +FF_HTTP_TIMEOUT ?= 20 + +# Detect OS opener (macOS: open, Linux: xdg-open) +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + OPENER := open +else + OPENER := xdg-open +endif + +# --- Shared env for all runs (DuckDB + HTTP) --------------------------------- +RUN_ENV = FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" \ + FF_HTTP_ALLOWED_DOMAINS="$(FF_HTTP_ALLOWED_DOMAINS)" \ + FF_HTTP_CACHE_DIR="$(FF_HTTP_CACHE_DIR)" \ + FF_HTTP_MAX_RPS="$(FF_HTTP_MAX_RPS)" \ + FF_HTTP_MAX_RETRIES="$(FF_HTTP_MAX_RETRIES)" \ + FF_HTTP_TIMEOUT="$(FF_HTTP_TIMEOUT)" + +# Engine env +ifeq ($(ENGINE),duckdb) + ENGINE_ENV = FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + ENGINE_ENV = FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + ENGINE_ENV = FF_ENGINE=databricks_spark FF_SPARK_MASTER="$(FF_SPARK_MASTER)" + ENGINE_TAG = engine:databricks_spark +endif + +# Select only common + this engine; keeps DAG clean and avoids executing foreign variants +SELECT = tag:example:api_demo,tag:scope:common,tag:$(ENGINE_TAG) + +# --- Standard R1 targets ------------------------------------------------------ + +seed: + $(ENGINE_ENV) $(UV) run fft seed "$(PROJECT)" --env dev + +# RUN_ENV goes first: the shell applies the assignments left to right, so the +# engine-specific ENGINE_ENV values override RUN_ENV's DuckDB defaults. +run: + $(RUN_ENV) $(ENGINE_ENV) $(UV) run fft run "$(PROJECT)" --env dev + +test: + $(ENGINE_ENV) $(UV) run fft test "$(PROJECT)" --env dev + +dag: + $(ENGINE_ENV) $(UV) run fft dag "$(PROJECT)" --env dev --html + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + 
+incr: + $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select fct_events_inc.ff --cache rw || true + +state-mod: + @if [ -f "$(PROJECT)/models/users.ff.sql" ]; then touch "$(PROJECT)/models/users.ff.sql"; fi + $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified + +state-mod-plus: + $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified+ + +res-error: + $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select result:error || true + +res-warn: + $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select result:warn || true + +pg-seed: + FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" $(UV) run fft seed "$(PROJECT)" --env stg + +pg-run: + FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" \ + FF_HTTP_ALLOWED_DOMAINS="$(FF_HTTP_ALLOWED_DOMAINS)" \ + FF_HTTP_CACHE_DIR="$(FF_HTTP_CACHE_DIR)" \ + FF_HTTP_MAX_RPS="$(FF_HTTP_MAX_RPS)" \ + FF_HTTP_MAX_RETRIES="$(FF_HTTP_MAX_RETRIES)" \ + FF_HTTP_TIMEOUT="$(FF_HTTP_TIMEOUT)" \ + $(UV) run fft run "$(PROJECT)" --env stg + +clean: + rm -rf .local "$(PROJECT)/docs" dist build *.egg-info .fastflowtransform + +demo-open: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + else \ + echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +demo: clean + @echo "== 🚀 R1 Demo (DuckDB) ==" + @echo "DB=$(DB) PROJECT=$(PROJECT)" + +$(MAKE) seed + +$(MAKE) run + +$(MAKE) dag + +$(MAKE) test + +$(MAKE) artifacts + @echo + @echo "== 🔁 Incremental Model ==" + +$(MAKE) incr + @echo + @echo "== 🧠 State Selection (changed only) ==" + +$(MAKE) state-mod + +$(MAKE) state-mod-plus + @echo + @echo "== 🧪 Result Selection (from last run_results.json) ==" + +$(MAKE) res-error + +$(MAKE) res-warn + @echo + @echo "✅ Demo done. 
Open DAG here: $(PROJECT)/site/dag/index.html" + +$(MAKE) demo-open + +# --- API-specific convenience targets ---------------------------------------- + +api-run: + $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env dev --select "kind:python" --cache rw + +api-warm: + +$(MAKE) api-run + +api-offline: + $(RUN_ENV) FF_HTTP_OFFLINE=1 $(UV) run fft run "$(PROJECT)" --env dev --select "kind:python" --cache rw + +api-cache-clear: + rm -rf "$(FF_HTTP_CACHE_DIR)" + +api-show-http: + @if command -v jq >/dev/null 2>&1; then \ + echo "== HTTP snapshots from run_results.json =="; \ + jq -r '.results[] | select(.http!=null) | "\(.name): requests=\(.http.requests) cache_hits=\(.http.cache_hits) bytes=\(.http.bytes) offline=\(.http.used_offline)"' \ + "$(PROJECT)/.fastflowtransform/target/run_results.json" || true; \ + else \ + echo "Install 'jq' to pretty-print HTTP snapshots from run_results.json."; \ + fi + +api-demo: clean + @echo "== 🌐 API Demo (DuckDB) ==" + @echo "DB=$(DB) PROJECT=$(PROJECT)" + +$(MAKE) run + +$(MAKE) dag + +$(MAKE) api-show-http || true + @echo "✅ API Demo done. Open DAG here: $(PROJECT)/site/dag/index.html" + +$(MAKE) demo-open diff --git a/examples/api_demo/models/common/mart_users_join.ff.sql b/examples/api_demo/models/common/mart_users_join.ff.sql new file mode 100644 index 0000000..9ee550c --- /dev/null +++ b/examples/api_demo/models/common/mart_users_join.ff.sql @@ -0,0 +1,23 @@ +{{ config(materialized='table', tags=['example:api_demo','scope:common']) }} + +{# Choose the producing model by variable. Default is the pandas HTTP version. 
#} +{% set api_users_model = var('api_users_model', 'api_users_http') %} + +-- Join local seed users with API users by email (demo-only; real keys will differ) +with a as ( + select u.id as user_id, u.email + from {{ ref('users.ff') }} u +), +b as ( + -- Producer is selected by the `api_users_model` variable (see profiles.yml -> vars); + -- valid values in this demo: api_users_http, api_users_requests. + {# NOTE(review): assumes ref() accepts a variable for dependency discovery - confirm. #} + select * from {{ ref(api_users_model) }} +) +select + a.user_id, + a.email, + b.api_user_id, + b.username, + b.name +from a +left join b on lower(a.email) = lower(b.email); diff --git a/examples/api_demo/models/common/users.ff.sql b/examples/api_demo/models/common/users.ff.sql new file mode 100644 index 0000000..639ce2d --- /dev/null +++ b/examples/api_demo/models/common/users.ff.sql @@ -0,0 +1,4 @@ +{{ config(materialized='table', tags=['example:api_demo','scope:common','kind:seed-consumer']) }} +-- Simple staging table from seed +select id, email +from {{ source('crm', 'users') }}; diff --git a/examples/api_demo/models/engines/duckdb/api_users_http.ff.py b/examples/api_demo/models/engines/duckdb/api_users_http.ff.py new file mode 100644 index 0000000..e75d7bc --- /dev/null +++ b/examples/api_demo/models/engines/duckdb/api_users_http.ff.py @@ -0,0 +1,25 @@ +from fastflowtransform import model +from fastflowtransform.api.http import get_df +import pandas as pd + + +@model( + name="api_users_http", + deps=["users.ff"], # at least one dependency is required by the executor contract +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + """ + Fetch users from a public demo API using the built-in HTTP wrapper. + Pros: caching, offline mode, telemetry in run_results.json. + """ + # Example endpoint (JSON Placeholder); replace with your real API. + # For paginated APIs you can add a `paginator` function. 
+ df = get_df( + url="https://jsonplaceholder.typicode.com/users", + record_path=None, # the outer JSON is already a list + normalize=True, # flatten objects to columns (address.*, company.*) + ) + + # Keep only a few columns to make joins simpler + cols = [c for c in df.columns if c in ("id", "email", "username", "name")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py b/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py new file mode 100644 index 0000000..d1b731f --- /dev/null +++ b/examples/api_demo/models/engines/duckdb/api_users_requests.ff.py @@ -0,0 +1,33 @@ +# NOTE: Plain Python variant (requests/httpx). No built-in FFT telemetry or HTTP cache here. +from fastflowtransform import model +import pandas as pd + +try: + import requests # you can swap this with httpx if you prefer +except Exception as _e: # pragma: no cover + raise RuntimeError("Please install 'requests' to run this model") from _e + + +@model( + name="api_users_requests", + deps=["users.ff"], # keep a dependency for executor contract +) +def fetch(_: pd.DataFrame) -> pd.DataFrame: + """ + Fetch users from the same demo API using plain Python code. + Pros: ultimate flexibility (custom auth, retry, shaping). + Cons: no built-in FFT telemetry or cache (unless you add it manually). 
+ """ + url = "https://jsonplaceholder.typicode.com/users" + headers = { + # Add your auth headers here if needed: + # "Authorization": f"Bearer {os.getenv('MY_TOKEN')}", + } + resp = requests.get(url, headers=headers, timeout=30) + resp.raise_for_status() + data = resp.json() # list[dict] + + # Example shaping + df = pd.DataFrame(data) + cols = [c for c in df.columns if c in ("id", "email", "username", "name")] + return df[cols].rename(columns={"id": "api_user_id"}) diff --git a/examples/api_demo/profiles.yml b/examples/api_demo/profiles.yml new file mode 100644 index 0000000..1698a40 --- /dev/null +++ b/examples/api_demo/profiles.yml @@ -0,0 +1,21 @@ +default: + dev: + engine: "{{ env('FF_ENGINE') }}" + + # Router variable binds "api_users_model" to the *canonical* model name + # We keep the same value for all engines here (api_users_http), but it could differ + # (e.g. "api_users_requests" or a parquet loader) per engine in real projects. + vars: + api_users_model: "api_users_http" + + duckdb: + path: "{{ env('FF_DUCKDB_PATH') }}" + + postgres: + # DSN and schema are read from FF_PG_DSN / FF_PG_SCHEMA (no fallback is configured here) + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA') }}" + + databricks_spark: + master: "{{ env('FF_SPARK_MASTER') }}" + app_name: "api_demo" diff --git a/examples/api_demo/project.yml b/examples/api_demo/project.yml new file mode 100644 index 0000000..b84f5a8 --- /dev/null +++ b/examples/api_demo/project.yml @@ -0,0 +1,11 @@ +name: duckdb_api_demo +version: "0.1" + +vars: {} + +tests: + # Batch tables + - type: not_null + table: mart_users_join + column: user_id + tags: [batch] diff --git a/examples/api_demo/seeds/seed_users.csv b/examples/api_demo/seeds/seed_users.csv new file mode 100644 index 0000000..2acf25f --- /dev/null +++ b/examples/api_demo/seeds/seed_users.csv @@ -0,0 +1,4 @@ +id,email +1,a@example.com +2,b@gmail.com +3,c@gmail.com diff --git a/examples/api_demo/site/dag/api_users_http.html 
b/examples/api_demo/site/dag/api_users_http.html new file mode 100644 index 0000000..d4a491a --- /dev/null +++ b/examples/api_demo/site/dag/api_users_http.html @@ -0,0 +1,261 @@ + + + + + + api_users_http – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ api_users_http + table +

+
Model Detail • FastFlowTransform
+
+ python +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
api_users_http
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/api_users_http.ff.py + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
api_user_idBIGINT + + yes + + + + — + + + + + ?.id + + transformed + + + + +
nameVARCHAR + + yes + + + + — + + + + unknown + +
usernameVARCHAR + + yes + + + + — + + + + unknown + +
emailVARCHAR + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/api_demo/site/dag/api_users_requests.html b/examples/api_demo/site/dag/api_users_requests.html new file mode 100644 index 0000000..e52fa3d --- /dev/null +++ b/examples/api_demo/site/dag/api_users_requests.html @@ -0,0 +1,261 @@ + + + + + + api_users_requests – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ api_users_requests + table +

+
Model Detail • FastFlowTransform
+
+ python +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
api_users_requests
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/api_users_requests.ff.py + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
api_user_idBIGINT + + yes + + + + — + + + + + ?.id + + transformed + + + + +
nameVARCHAR + + yes + + + + — + + + + unknown + +
usernameVARCHAR + + yes + + + + — + + + + unknown + +
emailVARCHAR + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/api_demo/site/dag/index.html b/examples/api_demo/site/dag/index.html new file mode 100644 index 0000000..7a50925 --- /dev/null +++ b/examples/api_demo/site/dag/index.html @@ -0,0 +1,272 @@ + + + + + + FastFlowTransform - DAG & Mini Docs + + + + + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + api_users_http("api_users_http
(api_users_http)") + class api_users_http py; + api_users_requests("api_users_requests
(api_users_requests)") + class api_users_requests py; + mart_users_join_ff["mart_users_join.ff
(mart_users_join)"] + class mart_users_join_ff sql; + users_ff["users.ff
(users)"] + class users_ff sql; + users_ff --> mart_users_join_ff + api_users_http --> mart_users_join_ff + api_users_requests --> mart_users_join_ff + users_ff --> api_users_requests + users_ff --> api_users_http +
+
+ + + +
+

Macros

+ +

No macros found.

+ +
+
+ + + + \ No newline at end of file diff --git a/examples/api_demo/site/dag/mart_users_join.ff.html b/examples/api_demo/site/dag/mart_users_join.ff.html new file mode 100644 index 0000000..7913056 --- /dev/null +++ b/examples/api_demo/site/dag/mart_users_join.ff.html @@ -0,0 +1,282 @@ + + + + + + mart_users_join.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ mart_users_join.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
mart_users_join
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/mart_users_join.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
user_idBIGINT + + yes + + + + — + + + + + ?.id + + direct + + + + +
emailVARCHAR + + yes + + + + — + + + + + ?.email + + direct + + + + +
api_user_idBIGINT + + yes + + + + — + + + + unknown + +
usernameVARCHAR + + yes + + + + — + + + + unknown + +
nameVARCHAR + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/api_demo/site/dag/users.ff.html b/examples/api_demo/site/dag/users.ff.html new file mode 100644 index 0000000..ae430a3 --- /dev/null +++ b/examples/api_demo/site/dag/users.ff.html @@ -0,0 +1,227 @@ + + + + + + users.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ users.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
users
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/duckdb_api_demo/models/users.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
idBIGINT + + yes + + + + — + + + + + ?.id + + direct + + + + +
emailVARCHAR + + yes + + + + — + + + + + ?.email + + direct + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/api_demo/sources.yml b/examples/api_demo/sources.yml new file mode 100644 index 0000000..84e04d5 --- /dev/null +++ b/examples/api_demo/sources.yml @@ -0,0 +1,7 @@ +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users # will be materialized via `fft seed` diff --git a/examples/env_matrix/.fastflowtransform/cache/prod-duckdb.json b/examples/env_matrix/.fastflowtransform/cache/prod-duckdb.json new file mode 100644 index 0000000..9eacacd --- /dev/null +++ b/examples/env_matrix/.fastflowtransform/cache/prod-duckdb.json @@ -0,0 +1,9 @@ +{ + "engine": "duckdb", + "entries": { + "env_vars.ff": "f7ef95030b8fe43c52bd748deb4fb8061c41d9f384b5b34a49ff69203cde4d82", + "users.ff": "11705124b10bc738b123f525e164f81d909972b29651b4b86153c8edba18bde1" + }, + "profile": "prod", + "version": 1 +} \ No newline at end of file diff --git a/examples/env_matrix/.fastflowtransform/target/catalog.json b/examples/env_matrix/.fastflowtransform/target/catalog.json new file mode 100644 index 0000000..743d3df --- /dev/null +++ b/examples/env_matrix/.fastflowtransform/target/catalog.json @@ -0,0 +1,51 @@ +{ + "metadata": { + "generated_at": "2025-10-28T18:36:20+00:00", + "tool": "fastflowtransform" + }, + "relations": { + "env_vars": { + "columns": [ + { + "dtype": "VARCHAR", + "name": "active_env_hint", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "ff_engine", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "duckdb_path", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "duckdb_exists", + "nullable": true + }, + { + "dtype": "BIGINT", + "name": "duckdb_size_bytes", + "nullable": true + } + ] + }, + "users": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + } + ] + } + } +} diff --git a/examples/env_matrix/.fastflowtransform/target/manifest.json 
b/examples/env_matrix/.fastflowtransform/target/manifest.json new file mode 100644 index 0000000..473bc66 --- /dev/null +++ b/examples/env_matrix/.fastflowtransform/target/manifest.json @@ -0,0 +1,53 @@ +{ + "macros": {}, + "metadata": { + "generated_at": "2025-10-28T18:36:20+00:00", + "tool": "fastflowtransform" + }, + "nodes": { + "env_vars.ff": { + "deps": [], + "kind": "python", + "materialized": "table", + "name": "env_vars.ff", + "path": "models/env_vars.ff.py", + "relation": "env_vars" + }, + "users.ff": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "users.ff", + "path": "models/users.ff.sql", + "relation": "users" + } + }, + "sources": { + "raw": { + "seed_users": { + "base": { + "catalog": null, + "database": null, + "dataset": null, + "format": null, + "identifier": "seed_users", + "location": null, + "options": {}, + "project": null, + "schema": "raw" + }, + "overrides": { + "databricks_spark": { + "format": "delta", + "location": "/mnt/delta/raw/seed_users", + "options": {} + }, + "postgres": { + "options": {}, + "schema": "main" + } + } + } + } + } +} diff --git a/examples/env_matrix/.fastflowtransform/target/run_results.json b/examples/env_matrix/.fastflowtransform/target/run_results.json new file mode 100644 index 0000000..a9adfc8 --- /dev/null +++ b/examples/env_matrix/.fastflowtransform/target/run_results.json @@ -0,0 +1,28 @@ +{ + "metadata": { + "generated_at": "2025-10-28T18:36:20+00:00", + "tool": "fastflowtransform" + }, + "results": [ + { + "duration_ms": 5, + "finished_at": "2025-10-28T18:36:20+00:00", + "http": null, + "message": null, + "name": "env_vars.ff", + "started_at": "2025-10-28T18:36:20+00:00", + "status": "success" + }, + { + "duration_ms": 2, + "finished_at": "2025-10-28T18:36:20+00:00", + "http": null, + "message": null, + "name": "users.ff", + "started_at": "2025-10-28T18:36:20+00:00", + "status": "success" + } + ], + "run_finished_at": "2025-10-28T18:36:20+00:00", + "run_started_at": 
"2025-10-28T18:36:20+00:00" +} diff --git a/examples/env_matrix/Makefile b/examples/env_matrix/Makefile new file mode 100644 index 0000000..ab4d5ad --- /dev/null +++ b/examples/env_matrix/Makefile @@ -0,0 +1,78 @@ +.PHONY: seed-dev seed-stg seed-prod run-dev run-stg run-prod dag-dev clean \ + demo-dev demo-stg demo-prod demo artifacts + +PROJECT ?= . +UV ?= uv + +seed-dev: + $(UV) run fft seed "$(PROJECT)" --env dev + +seed-stg: + $(UV) run fft seed "$(PROJECT)" --env stg + +seed-prod: + $(UV) run fft seed "$(PROJECT)" --env prod + +run-dev: + $(UV) run fft run "$(PROJECT)" --env dev + +run-stg: + $(UV) run fft run "$(PROJECT)" --env stg + +run-prod: + $(UV) run fft run "$(PROJECT)" --env prod + +dag-dev: + $(UV) run fft dag "$(PROJECT)" --env dev --html + +dag-stg: + $(UV) run fft dag "$(PROJECT)" --env stg --html + +dag-prod: + $(UV) run fft dag "$(PROJECT)" --env prod --html + +test-dev: + $(UV) run fft test $(PROJECT) --env dev + +test-stg: + $(UV) run fft test $(PROJECT) --env stg + +test-prod: + $(UV) run fft test $(PROJECT) --env prod + +clean: + rm -rf .local "$(PROJECT)/docs" "$(PROJECT)/site" .fastflowtransform + +# --- Convenience: show common artifacts path quickly +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML (if generated): $(PROJECT)/site/dag/index.html" + +# --- One-shot demos per environment ----------------------------------------- + +demo-dev: clean seed-dev run-dev test-dev dag-dev artifacts + @echo + @echo "✅ Demo (dev) complete. Open DAG here (if generated): $(PROJECT)/site/dag/index.html" + +demo-stg: clean seed-stg run-stg test-stg dag-stg artifacts + @echo + @echo "✅ Demo (stg) complete." + +demo-prod: clean seed-prod run-prod test-prod dag-prod artifacts + @echo + @echo "✅ Demo (prod) complete." 
+
+# --- Generic demo with ENV selector (dev|stg|prod) ---------------------------
+
+ENV ?= dev
+
+demo:
+	@echo "== 🚀 Demo ENV=$(ENV) PROJECT=$(PROJECT) =="
+	@case "$(ENV)" in \
+	  dev) $(MAKE) demo-dev ;; \
+	  stg) $(MAKE) demo-stg ;; \
+	  prod) $(MAKE) demo-prod ;; \
+	  *) echo "Unknown ENV=$(ENV). Use dev|stg|prod."; exit 1 ;; \
+	esac
diff --git a/examples/env_matrix/models/env_vars.ff.py b/examples/env_matrix/models/env_vars.ff.py
new file mode 100644
index 0000000..646b969
--- /dev/null
+++ b/examples/env_matrix/models/env_vars.ff.py
@@ -0,0 +1,48 @@
+# models/env_vars.ff.py
+# Purpose: Minimal, engine-agnostic Python model that echoes which environment
+#   variables are active for the current run. Optimized for the
+#   "all-DuckDB" setup (dev/stg/prod each point to a different file).
+
+from __future__ import annotations
+import os
+import pathlib
+import pandas as pd
+
+from fastflowtransform import model  # your existing decorator
+
+
+@model(
+    name="env_vars.ff",
+    tags=["demo", "env"],
+    kind="python",
+    meta={"materialized": "table"},
+)
+def build(_: pd.DataFrame | None) -> pd.DataFrame:
+    """
+    Return a single-row DataFrame showing the key environment variables
+    for the DuckDB-only setup.
+
+    Columns:
+    - active_env_hint: optional marker from your .env.<env> file (if you set FFT_ACTIVE_ENV).
+    - ff_engine: value of FF_ENGINE; falls back to "duckdb" when the variable is unset.
+    - duckdb_path: value of FF_DUCKDB_PATH ("" when the variable is unset).
+    - duckdb_exists: whether that path is an existing file (False when no path is set).
+    - duckdb_size_bytes: file size if it exists (0 otherwise).
+    """
+    duck_path = os.getenv("FF_DUCKDB_PATH", "")
+    # pathlib.Path("") is ".", which is truthy and exists; only probe the
+    # filesystem when a path is actually configured, and require a regular file.
+    p = pathlib.Path(duck_path) if duck_path else None
+    exists = p is not None and p.is_file()
+    size = int(p.stat().st_size) if exists else 0
+
+    row = {
+        "active_env_hint": os.getenv(
+            "FFT_ACTIVE_ENV"
+        ),  # set in .env.dev/.env.stg/.env.prod (optional)
+        "ff_engine": os.getenv("FF_ENGINE", "duckdb"),
+        "duckdb_path": duck_path,
+        "duckdb_exists": exists,
+        "duckdb_size_bytes": size,
+    }
+    return pd.DataFrame([row])
diff --git a/examples/env_matrix/models/users.ff.sql b/examples/env_matrix/models/users.ff.sql
new file mode 100644
index 0000000..47c2f46
--- /dev/null
+++ b/examples/env_matrix/models/users.ff.sql
@@ -0,0 +1,5 @@
+{{ config(materialized='table', tags=['demo', 'env', 'seed']) }}
+select
+  id,
+  email
+from {{ source('raw', 'seed_users') }}
diff --git a/examples/env_matrix/profiles.yml b/examples/env_matrix/profiles.yml
new file mode 100644
index 0000000..f9a04a6
--- /dev/null
+++ b/examples/env_matrix/profiles.yml
@@ -0,0 +1,15 @@
+default:
+  dev:
+    engine: "{{ env('FF_ENGINE') }}"
+    duckdb:
+      path: "{{ env('FF_DUCKDB_PATH') }}"
+
+  stg:
+    engine: "{{ env('FF_ENGINE') }}"
+    duckdb:
+      path: "{{ env('FF_DUCKDB_PATH') }}"
+
+  prod:
+    engine: "{{ env('FF_ENGINE') }}"
+    duckdb:
+      path: "{{ env('FF_DUCKDB_PATH') }}"
diff --git a/examples/env_matrix/project.yml b/examples/env_matrix/project.yml
new file mode 100644
index 0000000..502a0b7
--- /dev/null
+++ b/examples/env_matrix/project.yml
@@ -0,0 +1,17 @@
+name: duckdb_api_demo
+version: "0.1"
+
+vars: {}
+
+tests:
+  # Batch tables
+  - type: accepted_values
+    table: env_vars
+    column: ff_engine
+    values: [duckdb]
+    tags: [batch]
+
+  - type: row_count_between
+    table: users
+    min: 1
+    tags: [batch]
diff --git a/examples/env_matrix/seeds/raw/seed_users.csv b/examples/env_matrix/seeds/raw/seed_users.csv
new file mode 100644
index 0000000..6622ba4
--- /dev/null
+++ b/examples/env_matrix/seeds/raw/seed_users.csv
@@ -0,0 +1,4 @@
+id,email,username,name
+1,alice@example.com,alice,Alice
Wonderland +2,bob@example.com,bob,Bob Builder +3,carol@example.com,carol,Carol Singer diff --git a/examples/env_matrix/seeds/schema.yml b/examples/env_matrix/seeds/schema.yml new file mode 100644 index 0000000..3ff84e8 --- /dev/null +++ b/examples/env_matrix/seeds/schema.yml @@ -0,0 +1,7 @@ +targets: + raw/seed_users: + schema: raw + table: seed_users # optional Rename + schema_by_engine: # optional + postgres: main + duckdb: raw diff --git a/examples/env_matrix/site/dag/env_vars.ff.html b/examples/env_matrix/site/dag/env_vars.ff.html new file mode 100644 index 0000000..5605883 --- /dev/null +++ b/examples/env_matrix/site/dag/env_vars.ff.html @@ -0,0 +1,262 @@ + + + + + + env_vars.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ env_vars.ff + table +

+
Model Detail • FastFlowTransform
+
+ python +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
env_vars
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/env_matrix/models/env_vars.ff.py + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
active_env_hintVARCHAR + + yes + + + + — + + + + unknown + +
ff_engineVARCHAR + + yes + + + + — + + + + unknown + +
duckdb_pathVARCHAR + + yes + + + + — + + + + unknown + +
duckdb_existsBOOLEAN + + yes + + + + — + + + + unknown + +
duckdb_size_bytesBIGINT + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/env_matrix/site/dag/index.html b/examples/env_matrix/site/dag/index.html new file mode 100644 index 0000000..ef240f3 --- /dev/null +++ b/examples/env_matrix/site/dag/index.html @@ -0,0 +1,225 @@ + + + + + + FastFlowTransform - DAG & Mini Docs + + + + + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + env_vars_ff("env_vars.ff
(env_vars)") + class env_vars_ff py; + users_ff["users.ff
(users)"] + class users_ff sql; +
+
+ + + +
+

Macros

+ +

No macros found.

+ +
+
+ + + + \ No newline at end of file diff --git a/examples/env_matrix/site/dag/users.ff.html b/examples/env_matrix/site/dag/users.ff.html new file mode 100644 index 0000000..7a9026f --- /dev/null +++ b/examples/env_matrix/site/dag/users.ff.html @@ -0,0 +1,214 @@ + + + + + + users.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ users.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
users
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/env_matrix/models/users.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
idBIGINT + + yes + + + + — + + + + + ?.id + + direct + + + + +
emailVARCHAR + + yes + + + + — + + + + + ?.email + + direct + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/env_matrix/sources.yml b/examples/env_matrix/sources.yml new file mode 100644 index 0000000..cac9e94 --- /dev/null +++ b/examples/env_matrix/sources.yml @@ -0,0 +1,14 @@ +version: 2 + +sources: + - name: raw + schema: raw + tables: + - name: seed_users + identifier: seed_users + overrides: + postgres: + schema: main + databricks_spark: + format: delta + location: "/mnt/delta/raw/seed_users" diff --git a/examples/events_users_duckdb/.fastflowtransform/cache/dev-duckdb.json b/examples/events_users_duckdb/.fastflowtransform/cache/dev-duckdb.json new file mode 100644 index 0000000..8319263 --- /dev/null +++ b/examples/events_users_duckdb/.fastflowtransform/cache/dev-duckdb.json @@ -0,0 +1,10 @@ +{ + "engine": "duckdb", + "entries": { + "fct_events_inc.ff": "9b48a28e3100c719dedfe953a49ae7ba9b88a7f7d983a4a2dc7065cf0b7124d9", + "users.ff": "dbd191f9ad5ada230f5099832f5f8c91b4c905a7a800330d282ab854777c4622", + "users_enriched": "670800d172c85bb405ed1bab65ffc8e723fd98f442ebe64bd9c92e17fc19cb8f" + }, + "profile": "dev", + "version": 1 +} \ No newline at end of file diff --git a/examples/events_users_duckdb/.fastflowtransform/target/catalog.json b/examples/events_users_duckdb/.fastflowtransform/target/catalog.json new file mode 100644 index 0000000..10194f5 --- /dev/null +++ b/examples/events_users_duckdb/.fastflowtransform/target/catalog.json @@ -0,0 +1,70 @@ +{ + "metadata": { + "generated_at": "2025-10-27T16:43:12+00:00", + "tool": "fastflowtransform" + }, + "relations": { + "fct_events_inc": { + "columns": [ + { + "dtype": "BIGINT", + "name": "event_id", + "nullable": true + }, + { + "dtype": "BIGINT", + "name": "user_id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "event_type", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "ingested_at", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "meta_json", + "nullable": true + } + ] + }, + "users": { + "columns": [ + { + "dtype": 
"BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + } + ] + }, + "users_enriched": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "is_gmail", + "nullable": true + } + ] + } + } +} diff --git a/examples/events_users_duckdb/.fastflowtransform/target/manifest.json b/examples/events_users_duckdb/.fastflowtransform/target/manifest.json new file mode 100644 index 0000000..0f84982 --- /dev/null +++ b/examples/events_users_duckdb/.fastflowtransform/target/manifest.json @@ -0,0 +1,45 @@ +{ + "macros": {}, + "metadata": { + "generated_at": "2025-10-27T16:43:12+00:00", + "tool": "fastflowtransform" + }, + "nodes": { + "fct_events_inc.ff": { + "deps": [], + "kind": "sql", + "materialized": "incremental", + "name": "fct_events_inc.ff", + "path": "models/fct_events_inc.ff.sql", + "relation": "fct_events_inc" + }, + "users.ff": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "users.ff", + "path": "models/users.ff.sql", + "relation": "users" + }, + "users_enriched": { + "deps": [ + "users.ff" + ], + "kind": "python", + "materialized": "table", + "name": "users_enriched", + "path": "models/users_enriched.ff.py", + "relation": "users_enriched" + } + }, + "sources": { + "app": { + "events": { + "identifier": "seed_events_initial" + }, + "users": { + "identifier": "seed_users" + } + } + } +} diff --git a/examples/events_users_duckdb/.fastflowtransform/target/run_results.json b/examples/events_users_duckdb/.fastflowtransform/target/run_results.json new file mode 100644 index 0000000..9a8cdd6 --- /dev/null +++ b/examples/events_users_duckdb/.fastflowtransform/target/run_results.json @@ -0,0 +1,34 @@ +{ + "metadata": { + "generated_at": "2025-10-27T16:43:12+00:00", + "tool": "fastflowtransform" + }, + "results": [ + { + "duration_ms": 4, + "finished_at": 
"2025-10-27T16:43:12+00:00",
+      "message": null,
+      "name": "fct_events_inc.ff",
+      "started_at": "2025-10-27T16:43:12+00:00",
+      "status": "success"
+    },
+    {
+      "duration_ms": 1,
+      "finished_at": "2025-10-27T16:43:12+00:00",
+      "message": null,
+      "name": "users.ff",
+      "started_at": "2025-10-27T16:43:12+00:00",
+      "status": "success"
+    },
+    {
+      "duration_ms": 0,
+      "finished_at": "2025-10-27T16:43:12+00:00",
+      "message": null,
+      "name": "users_enriched",
+      "started_at": "2025-10-27T16:43:12+00:00",
+      "status": "success"
+    }
+  ],
+  "run_finished_at": "2025-10-27T16:43:12+00:00",
+  "run_started_at": "2025-10-27T16:43:12+00:00"
+}
diff --git a/examples/events_users_duckdb/Makefile b/examples/events_users_duckdb/Makefile
new file mode 100644
index 0000000..dec2227
--- /dev/null
+++ b/examples/events_users_duckdb/Makefile
@@ -0,0 +1,87 @@
+.PHONY: demo seed run dag test artifacts incr state-mod state-mod-plus res-error res-warn pg-seed pg-run clean demo-open
+
+DB ?= .local/demo.duckdb
+PROJECT ?= .
+UV ?= uv
+
+# Detect OS opener (macOS: open, Linux: xdg-open)
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Darwin)
+  OPENER := open
+else
+  OPENER := xdg-open
+endif
+
+seed:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft seed "$(PROJECT)" --env dev
+
+run:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev
+
+test:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft test "$(PROJECT)" --env dev
+
+dag:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft dag "$(PROJECT)" --env dev --html
+
+artifacts:
+	@echo
+	@echo "== 📦 Artifacts =="
+	@echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}"
+	@echo " DAG HTML: $(PROJECT)/site/dag/index.html"
+
+# The selection targets below pass the same FF_ENGINE/FF_DUCKDB_PATH as seed/run so a DB=... override is passed to every step.
+incr:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev --select fct_events_inc.ff --cache rw || true
+
+state-mod:
+	@if [ -f "$(PROJECT)/models/users.ff.sql" ]; then touch "$(PROJECT)/models/users.ff.sql"; fi
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified
+
+state-mod-plus:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev --cache rw --select state:modified+
+
+res-error:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev --select result:error || true
+
+res-warn:
+	FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" $(UV) run fft run "$(PROJECT)" --env dev --select result:warn || true
+
+pg-seed:
+	FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" $(UV) run fft seed "$(PROJECT)" --env stg
+
+pg-run:
+	FF_ENGINE=postgres FF_PG_DSN="$(FF_PG_DSN)" FF_PG_SCHEMA="$(FF_PG_SCHEMA)" $(UV) run fft run "$(PROJECT)" --env stg
+
+clean:
+	rm -rf .local "$(PROJECT)/docs" dist build *.egg-info .fastflowtransform
+
+demo-open:
+	@if [ -f "$(PROJECT)/site/dag/index.html" ]; then \
+	  $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \
+	else \
+	  echo "No HTML found: $(PROJECT)/site/dag/index.html"; \
+	fi
+
+demo: clean
+	@echo "== 🚀 R1 Demo (DuckDB) =="
+	@echo "DB=$(DB) PROJECT=$(PROJECT)"
+	+$(MAKE) seed
+	+$(MAKE) run
+	+$(MAKE) dag
+	+$(MAKE) test
+	+$(MAKE) artifacts
+	@echo
+	@echo "== 🔁 Incremental Model =="
+	+$(MAKE) incr
+	@echo
+	@echo "== 🧠 State Selection (changed only) =="
+	+$(MAKE) state-mod
+	+$(MAKE) state-mod-plus
+	@echo
+	@echo "== 🧪 Result Selection (from last run_results.json) =="
+	+$(MAKE) res-error
+	+$(MAKE) res-warn
+	@echo
+	@echo "✅ Demo done.
Open DAG here: $(PROJECT)/site/dag/index.html"
+	+$(MAKE) demo-open
diff --git a/examples/events_users_duckdb/README.md b/examples/events_users_duckdb/README.md
new file mode 100644
index 0000000..6e079dd
--- /dev/null
+++ b/examples/events_users_duckdb/README.md
@@ -0,0 +1,51 @@
+# R1 Demo
+
+Minimal project showing:
+- Incremental model (`fct_events_inc.ff.sql`)
+- YAML tests (`models/users_enriched.yml`)
+- State selection (`state:modified`, `result:*`)
+
+## DuckDB (local)
+
+```bash
+make -C examples/events_users_duckdb seed
+make -C examples/events_users_duckdb run
+make -C examples/events_users_duckdb dag
+```
+
+Incremental-only:
+
+```bash
+make -C examples/events_users_duckdb incr
+```
+
+## Postgres (optional)
+
+Set `FF_PG_DSN` and `FF_PG_SCHEMA`, then:
+
+```bash
+make -C examples/events_users_duckdb pg-seed
+make -C examples/events_users_duckdb pg-run
+```
+
+## Expected Artifacts
+
+```
+examples/events_users_duckdb/.fastflowtransform/target/
+├── manifest.json
+├── run_results.json
+└── catalog.json
+```
+
+## Sample Output (excerpt)
+
+```
+✔ L00 [DUCK] users.ff (120ms)
+✔ L01 [DUCK] users_enriched (35ms)
+✔ L01 [DUCK] fct_events_inc.ff (41ms)
+
+Data Quality Summary
+────────────────────
+✅ not_null users.email (2ms)
+❕ accepted_values users_enriched.email (1ms)
+```
\ No newline at end of file
diff --git a/examples/events_users_duckdb/models/fct_events_inc.ff.sql b/examples/events_users_duckdb/models/fct_events_inc.ff.sql
new file mode 100644
index 0000000..02916fb
--- /dev/null
+++ b/examples/events_users_duckdb/models/fct_events_inc.ff.sql
@@ -0,0 +1,24 @@
+{{ config(
+  materialized='incremental',
+  unique_key=['event_id'],
+  on_schema_change='append_new_columns',
+  tags=['fact','incremental']
+) }}
+with base as (
+  select *
+  from {{ source('app','events') }}
+  {% if is_incremental() %}
+  where cast(ingested_at as timestamp)
+    > coalesce(
+      (select max(cast(ingested_at as timestamp)) from {{ this }}),
+      timestamp '1970-01-01'
+    )
+  {% endif %}
+)
+select
+  event_id,
+  user_id,
+  event_type,
+  ingested_at,
+  meta_json
+from base;
\ No newline at end of file
diff --git a/examples/events_users_duckdb/models/users.ff.sql b/examples/events_users_duckdb/models/users.ff.sql
new file mode 100644
index 0000000..5283037
--- /dev/null
+++ b/examples/events_users_duckdb/models/users.ff.sql
@@ -0,0 +1,3 @@
+{{ config(materialized='table', tags=['staging']) }}
+select id, email
+from {{ source('app','users') }};
\ No newline at end of file
diff --git a/examples/events_users_duckdb/models/users_enriched.ff.py b/examples/events_users_duckdb/models/users_enriched.ff.py
new file mode 100644
index 0000000..383e3fc
--- /dev/null
+++ b/examples/events_users_duckdb/models/users_enriched.ff.py
@@ -0,0 +1,15 @@
+from fastflowtransform import model
+import pandas as pd
+
+
+@model(
+    name="users_enriched",
+    deps=["users.ff"],
+    require={"users": {"id", "email"}},
+)
+def enrich(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of ``df`` with a boolean ``is_gmail`` column added."""
+    out = df.copy()
+    # na=False maps missing emails to False so the column stays strictly boolean.
+    out["is_gmail"] = out["email"].str.endswith("@gmail.com", na=False)
+    return out
diff --git a/examples/events_users_duckdb/models/users_enriched.yml b/examples/events_users_duckdb/models/users_enriched.yml
new file mode 100644
index 0000000..a113bbe
--- /dev/null
+++ b/examples/events_users_duckdb/models/users_enriched.yml
@@ -0,0 +1,15 @@
+version: 2
+models:
+  - name: users_enriched
+    description: "Adds gmail flag"
+    columns:
+      - name: id
+        tests:
+          - not_null: { severity: error }
+          - unique
+      - name: email
+        tests:
+          - not_null
+          - accepted_values:
+              values: ["a@example.com","b@gmail.com","c@gmail.com"]
+              severity: warn
\ No newline at end of file
diff --git a/examples/events_users_duckdb/profiles.yml b/examples/events_users_duckdb/profiles.yml
new file mode 100644
index 0000000..50de7b1
--- /dev/null
+++ b/examples/events_users_duckdb/profiles.yml
@@ -0,0 +1,10 @@
+dev:
+  engine: duckdb
+  duckdb:
+    path: .local/demo.duckdb
+
+stg:
+  engine: postgres
+  postgres:
+    dsn: postgresql+psycopg://postgres:postgres@localhost:5432/ffdb
+    db_schema: public
\ No newline at end of file
diff --git
a/examples/events_users_duckdb/project.yml b/examples/events_users_duckdb/project.yml new file mode 100644 index 0000000..99a4f07 --- /dev/null +++ b/examples/events_users_duckdb/project.yml @@ -0,0 +1,8 @@ +vars: + run_date: "2025-01-01" + +tests: + - type: not_null + table: users + column: email + tags: [batch] \ No newline at end of file diff --git a/examples/events_users_duckdb/seeds/seed_events_initial.csv b/examples/events_users_duckdb/seeds/seed_events_initial.csv new file mode 100644 index 0000000..87c320f --- /dev/null +++ b/examples/events_users_duckdb/seeds/seed_events_initial.csv @@ -0,0 +1,3 @@ +event_id,user_id,event_type,ingested_at,meta_json +100,1,signup,2025-01-01T00:00:00Z,"{}" +101,2,purchase,2025-01-01T00:05:00Z,"{}" \ No newline at end of file diff --git a/examples/events_users_duckdb/seeds/seed_users.csv b/examples/events_users_duckdb/seeds/seed_users.csv new file mode 100644 index 0000000..f423042 --- /dev/null +++ b/examples/events_users_duckdb/seeds/seed_users.csv @@ -0,0 +1,4 @@ +id,email +1,a@example.com +2,b@gmail.com +3,c@gmail.com \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/fct_events_inc.ff.html b/examples/events_users_duckdb/site/dag/fct_events_inc.ff.html new file mode 100644 index 0000000..bb2b563 --- /dev/null +++ b/examples/events_users_duckdb/site/dag/fct_events_inc.ff.html @@ -0,0 +1,262 @@ + + + + + + fct_events_inc.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ fct_events_inc.ff + incremental +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
incremental
+ +
Relation
+
fct_events_inc
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/events_users_duckdb/models/fct_events_inc.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
event_idBIGINT + + yes + + + + — + + + + unknown + +
user_idBIGINT + + yes + + + + — + + + + unknown + +
event_typeVARCHAR + + yes + + + + — + + + + unknown + +
ingested_atVARCHAR + + yes + + + + — + + + + unknown + +
meta_jsonVARCHAR + + yes + + + + — + + + + unknown + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/index.html b/examples/events_users_duckdb/site/dag/index.html new file mode 100644 index 0000000..e438d5a --- /dev/null +++ b/examples/events_users_duckdb/site/dag/index.html @@ -0,0 +1,247 @@ + + + + + + FastFlowTransform - DAG & Mini Docs + + + + + + + +
+
+

FastFlowTransform - DAG & Mini Docs

+
Mermaid renders automatically (light/dark)
+
+
+ + +
+
+ +
+
+

DAG

+
+ SQL + Python + + Materialization: + + table + + view + + ephemeral + + incremental + +
+
flowchart TD + classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; + classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; + fct_events_inc_ff["fct_events_inc.ff
(fct_events_inc)"] + class fct_events_inc_ff sql; + users_ff["users.ff
(users)"] + class users_ff sql; + users_enriched("users_enriched
(users_enriched)") + class users_enriched py; + users_ff --> users_enriched +
+
+ + + +
+

Macros

+ +

No macros found.

+ +
+
+ + + + \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/users.ff.html b/examples/events_users_duckdb/site/dag/users.ff.html new file mode 100644 index 0000000..83e6158 --- /dev/null +++ b/examples/events_users_duckdb/site/dag/users.ff.html @@ -0,0 +1,223 @@ + + + + + + users.ff – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ users.ff + table +

+
Model Detail • FastFlowTransform
+
+ sql +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
users
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/events_users_duckdb/models/users.ff.sql + +
+ +
Dependencies
+
+ + + +
+ + +
Referenced by
+
+ +
+ +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
idBIGINT + + yes + + + + — + + + + + ?.id + + direct + + + + +
emailVARCHAR + + yes + + + + — + + + + + ?.email + + direct + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/events_users_duckdb/site/dag/users_enriched.html b/examples/events_users_duckdb/site/dag/users_enriched.html new file mode 100644 index 0000000..8fca7e0 --- /dev/null +++ b/examples/events_users_duckdb/site/dag/users_enriched.html @@ -0,0 +1,232 @@ + + + + + + users_enriched – FastFlowTransform + + + +

← Back to overview

+ +
+
+

+ users_enriched + table +

+
Model Detail • FastFlowTransform
+
+ python +
+ +
+ +
+

Metadata

+
+
Materialized
+
table
+ +
Relation
+
users_enriched
+ +
Path
+
+ /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/events_users_duckdb/models/users_enriched.ff.py + +
+ +
Dependencies
+
+ + + +
+ + +
+
+ + + + +
+

Columns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeNullableDescriptionLineage
idBIGINT + + yes + + + + — + + + + unknown + +
emailVARCHAR + + yes + + + + — + + + + unknown + +
is_gmailBOOLEAN + + yes + + + + — + + + + + ?.email + + direct + + + + +
+
+ + + +
+ + + + \ No newline at end of file diff --git a/examples/events_users_duckdb/sources.yml b/examples/events_users_duckdb/sources.yml new file mode 100644 index 0000000..296aa75 --- /dev/null +++ b/examples/events_users_duckdb/sources.yml @@ -0,0 +1,9 @@ +version: 2 + +sources: + - name: app + tables: + - name: users + identifier: seed_users + - name: events + identifier: seed_events_initial diff --git a/examples/postgres/.fastflowtransform/cache/stg-postgres.json b/examples/postgres/.fastflowtransform/cache/stg-postgres.json index b1a6262..20a7672 100644 --- a/examples/postgres/.fastflowtransform/cache/stg-postgres.json +++ b/examples/postgres/.fastflowtransform/cache/stg-postgres.json @@ -1,11 +1,11 @@ { "engine": "postgres", "entries": { - "mart_orders_enriched": "fc41294d6967cfcf3c9b7d2c5405210d9383e5538747f7f13bc16c96cc8754c5", - "mart_users.ff": "6a61e68266d9151e9c473340ee93ccb70146b0079371bae889e4c8313b40a8b8", - "orders.ff": "b45347dd5ad3adbf1637e637fb32e27b766995549c7b1ae4d9412a8ff1b0d375", - "users.ff": "68dbd147dcca21a36d04f031499eb8977a6fae8659873189b2a3169e560cb81e", - "users_enriched": "cf5157127bd1c72c6942a54049acd61ee8817782920534c274b4261783ceda4b" + "mart_orders_enriched": "6599c52b248c143c13a9cf4daaab3b646685f10ba50cd9477e62791b3ae3071b", + "mart_users.ff": "68aa5a370f7fc55b669f87134e7ceaf959d3ee4d5a563e75ae83392709b085ce", + "orders.ff": "351176593a9a5e231aa860af35e1dab4e3d7070cc25cfd13a49e415214051358", + "users.ff": "78ff9f7baa9b7b5617dbb3081fdb8a61d19a637ad6a8874862a91051b31ef646", + "users_enriched": "bbbee5bef3591988a2d6cb27fd561ac6d6f719f74dec668903e28e74e85fbd63" }, "profile": "stg", "version": 1 diff --git a/examples/postgres/.fastflowtransform/target/catalog.json b/examples/postgres/.fastflowtransform/target/catalog.json new file mode 100644 index 0000000..28dcb5f --- /dev/null +++ b/examples/postgres/.fastflowtransform/target/catalog.json @@ -0,0 +1,23 @@ +{ + "metadata": { + "generated_at": "2025-10-28T19:05:08+00:00", + "tool": 
"fastflowtransform" + }, + "relations": { + "mart_orders_enriched": { + "columns": [] + }, + "mart_users": { + "columns": [] + }, + "orders": { + "columns": [] + }, + "users": { + "columns": [] + }, + "users_enriched": { + "columns": [] + } + } +} diff --git a/examples/postgres/.fastflowtransform/target/manifest.json b/examples/postgres/.fastflowtransform/target/manifest.json new file mode 100644 index 0000000..503cc94 --- /dev/null +++ b/examples/postgres/.fastflowtransform/target/manifest.json @@ -0,0 +1,88 @@ +{ + "macros": {}, + "metadata": { + "generated_at": "2025-10-28T19:05:08+00:00", + "tool": "fastflowtransform" + }, + "nodes": { + "mart_orders_enriched": { + "deps": [ + "orders.ff", + "users_enriched" + ], + "kind": "python", + "materialized": "table", + "name": "mart_orders_enriched", + "path": "models/mart_orders_enriched.ff.py", + "relation": "mart_orders_enriched" + }, + "mart_users.ff": { + "deps": [ + "users_enriched" + ], + "kind": "sql", + "materialized": "table", + "name": "mart_users.ff", + "path": "models/mart_users.ff.sql", + "relation": "mart_users" + }, + "orders.ff": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "orders.ff", + "path": "models/orders.ff.sql", + "relation": "orders" + }, + "users.ff": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "users.ff", + "path": "models/users.ff.sql", + "relation": "users" + }, + "users_enriched": { + "deps": [ + "users.ff" + ], + "kind": "python", + "materialized": "table", + "name": "users_enriched", + "path": "models/users_enrich.ff.py", + "relation": "users_enriched" + } + }, + "sources": { + "crm": { + "orders": { + "base": { + "catalog": null, + "database": null, + "dataset": null, + "format": null, + "identifier": "seed_orders", + "location": null, + "options": {}, + "project": null, + "schema": null + }, + "overrides": {} + }, + "users": { + "base": { + "catalog": null, + "database": null, + "dataset": null, + "format": null, + "identifier": 
"seed_users", + "location": null, + "options": {}, + "project": null, + "schema": null + }, + "overrides": {} + } + } + } +} diff --git a/examples/postgres/.fastflowtransform/target/run_results.json b/examples/postgres/.fastflowtransform/target/run_results.json new file mode 100644 index 0000000..e1f3625 --- /dev/null +++ b/examples/postgres/.fastflowtransform/target/run_results.json @@ -0,0 +1,55 @@ +{ + "metadata": { + "generated_at": "2025-10-28T19:05:08+00:00", + "tool": "fastflowtransform" + }, + "results": [ + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:08+00:00", + "http": null, + "message": null, + "name": "mart_orders_enriched", + "started_at": "2025-10-28T19:05:08+00:00", + "status": "success" + }, + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:08+00:00", + "http": null, + "message": null, + "name": "mart_users.ff", + "started_at": "2025-10-28T19:05:08+00:00", + "status": "success" + }, + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:08+00:00", + "http": null, + "message": null, + "name": "orders.ff", + "started_at": "2025-10-28T19:05:08+00:00", + "status": "success" + }, + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:08+00:00", + "http": null, + "message": null, + "name": "users.ff", + "started_at": "2025-10-28T19:05:08+00:00", + "status": "success" + }, + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:08+00:00", + "http": null, + "message": null, + "name": "users_enriched", + "started_at": "2025-10-28T19:05:08+00:00", + "status": "success" + } + ], + "run_finished_at": "2025-10-28T19:05:08+00:00", + "run_started_at": "2025-10-28T19:05:08+00:00" +} diff --git a/examples/postgres/site/dag/index.html b/examples/postgres/site/dag/index.html index 2aa6c49..25ed4ac 100644 --- a/examples/postgres/site/dag/index.html +++ b/examples/postgres/site/dag/index.html @@ -118,6 +118,8 @@

DAG

ephemeral + incremental +
flowchart TD classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; diff --git a/examples/postgres/sources.yml b/examples/postgres/sources.yml index 398a587..5366a46 100644 --- a/examples/postgres/sources.yml +++ b/examples/postgres/sources.yml @@ -1,5 +1,9 @@ -crm: - users: - identifier: seed_users - orders: - identifier: seed_orders +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users + - name: orders + identifier: seed_orders diff --git a/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json b/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json index c745d0b..6826995 100644 --- a/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json +++ b/examples/simple_duckdb/.fastflowtransform/cache/dev-duckdb.json @@ -1,14 +1,14 @@ { "engine": "duckdb", "entries": { - "ephemeral_ids.ff": "915b339016411eae1c33bf4a0d377ae9d95f55576dce9b137abb357907c4e40a", - "mart_orders_enriched": "6459daee02b149168dec749e8e78f61dccaa9b0c510dd1e6d1825502a778e03e", - "mart_users.ff": "2a06558561f91b8a6e9fe39edfb4c4abb5155d40ba62a0150485315b351fdde3", - "orders.ff": "dbd1c47c35fbf9f43f492262a365aa70cbc989df3dd35dcdfcba70c691da4133", - "users.ff": "2c9afcf9cc8abcbe4bf14edbe4f56b2e2efe634ef01d100e2e2aa4145110cbf5", - "users_enriched": "edb574dc070b5b698531fa8c002d199c01b3b8342571b2bef1d958c33066914f", - "v_users.ff": "9b736079070f114cd1dfd1305e4c3a0a59af45c16d8f746590f860104707ccc5", - "v_users_enriched.ff": "f50fed4d50fbf407a4c56a4d3a89cba9e040e0686c4fd09f37dfd7203f81aca5" + "ephemeral_ids.ff": "f36221b3fb6961430bffa420c97ed12a5ae2e9b92ec39acd18e74086814e4868", + "mart_orders_enriched": "5b99b9c7cafe7ce175c64eccecd245ef72cd40093a63c892b2e9fcedb64d6e6f", + "mart_users.ff": "14a922bedfa7d2eaa3b7f6a8a2e9fbc624d8a29a8d14689197b917540964a74d", + "orders.ff": "ffbbec879b95932afaead84df0b5bad425ced5068168d955faa29a3b49c24306", + "users.ff": "e16d945f4def7ce8bcb110ef25cca35c30626e4de80faf7d02aa39adaf4fe759", + "users_enriched": 
"8e9d55b46133f51e2eae0d68a2c3ae8a4c787b5e562279245309b64773f3b44b", + "v_users.ff": "4ecdbcaee200abfc46568a666aad308ca206120f1f9f55b15619ddc3683fd4d5", + "v_users_enriched.ff": "e92842ccc13ae2c0c2f1d84616bc6de8b382896eafbde178f9a986ab531a8dbc" }, "profile": "dev", "version": 1 diff --git a/examples/simple_duckdb/.fastflowtransform/target/catalog.json b/examples/simple_duckdb/.fastflowtransform/target/catalog.json new file mode 100644 index 0000000..75cbf5b --- /dev/null +++ b/examples/simple_duckdb/.fastflowtransform/target/catalog.json @@ -0,0 +1,179 @@ +{ + "metadata": { + "generated_at": "2025-10-28T19:05:02+00:00", + "tool": "fastflowtransform" + }, + "relations": { + "ephemeral_ids": { + "columns": [] + }, + "mart_orders_enriched": { + "columns": [ + { + "dtype": "BIGINT", + "name": "order_id", + "nullable": true + }, + { + "dtype": "BIGINT", + "name": "user_id", + "nullable": true + }, + { + "dtype": "DOUBLE", + "name": "amount", + "nullable": true + }, + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "TIMESTAMP", + "name": "signup_ts", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "is_gmail", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "valid_amt", + "nullable": true + } + ] + }, + "mart_users": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "is_gmail", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email_domain", + "nullable": true + } + ] + }, + "orders": { + "columns": [ + { + "dtype": "BIGINT", + "name": "order_id", + "nullable": true + }, + { + "dtype": "BIGINT", + "name": "user_id", + "nullable": true + }, + { + "dtype": "DOUBLE", + "name": "amount", + "nullable": true + } + ] + }, + "users": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, 
+ { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "DATE", + "name": "signup_ts", + "nullable": true + } + ] + }, + "users_enriched": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "TIMESTAMP", + "name": "signup_ts", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "is_gmail", + "nullable": true + } + ] + }, + "v_users": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email_upper", + "nullable": true + } + ] + }, + "v_users_enriched": { + "columns": [ + { + "dtype": "BIGINT", + "name": "id", + "nullable": true + }, + { + "dtype": "VARCHAR", + "name": "email", + "nullable": true + }, + { + "dtype": "BOOLEAN", + "name": "is_gmail", + "nullable": true + } + ] + } + } +} diff --git a/examples/simple_duckdb/.fastflowtransform/target/manifest.json b/examples/simple_duckdb/.fastflowtransform/target/manifest.json new file mode 100644 index 0000000..cfd2773 --- /dev/null +++ b/examples/simple_duckdb/.fastflowtransform/target/manifest.json @@ -0,0 +1,126 @@ +{ + "macros": { + "nz": "models/macros/util.sql", + "on_or_before": "models/macros/util.sql", + "sql_email_domain": "models/macros_py/sql_helpers.py", + "upper_col": "models/macros/util.sql" + }, + "metadata": { + "generated_at": "2025-10-28T19:05:02+00:00", + "tool": "fastflowtransform" + }, + "nodes": { + "ephemeral_ids.ff": { + "deps": [ + "users.ff" + ], + "kind": "sql", + "materialized": "ephemeral", + "name": "ephemeral_ids.ff", + "path": "models/ephemeral_ids.ff.sql", + "relation": "ephemeral_ids" + }, + "mart_orders_enriched": { + "deps": [ + "orders.ff", + "users_enriched" + ], + "kind": "python", + "materialized": "table", + "name": "mart_orders_enriched", + "path": "models/mart_orders_enriched.ff.py", + 
"relation": "mart_orders_enriched" + }, + "mart_users.ff": { + "deps": [ + "users_enriched" + ], + "kind": "sql", + "materialized": "table", + "name": "mart_users.ff", + "path": "models/mart_users.ff.sql", + "relation": "mart_users" + }, + "orders.ff": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "orders.ff", + "path": "models/orders.ff.sql", + "relation": "orders" + }, + "users.ff": { + "deps": [], + "kind": "sql", + "materialized": "table", + "name": "users.ff", + "path": "models/users.ff.sql", + "relation": "users" + }, + "users_enriched": { + "deps": [ + "users.ff" + ], + "kind": "python", + "materialized": "table", + "name": "users_enriched", + "path": "models/users_enriched.ff.py", + "relation": "users_enriched" + }, + "v_users.ff": { + "deps": [ + "ephemeral_ids.ff", + "users.ff" + ], + "kind": "sql", + "materialized": "view", + "name": "v_users.ff", + "path": "models/v_users.ff.sql", + "relation": "v_users" + }, + "v_users_enriched.ff": { + "deps": [ + "users_enriched" + ], + "kind": "sql", + "materialized": "view", + "name": "v_users_enriched.ff", + "path": "models/v_users_enriched.ff.sql", + "relation": "v_users_enriched" + } + }, + "sources": { + "crm": { + "users": { + "base": { + "catalog": null, + "database": null, + "dataset": null, + "format": null, + "identifier": "seed_users", + "location": null, + "options": {}, + "project": null, + "schema": null + }, + "overrides": {} + } + }, + "erp": { + "orders": { + "base": { + "catalog": null, + "database": null, + "dataset": null, + "format": null, + "identifier": "seed_orders", + "location": null, + "options": {}, + "project": null, + "schema": null + }, + "overrides": {} + } + } + } +} diff --git a/examples/simple_duckdb/.fastflowtransform/target/run_results.json b/examples/simple_duckdb/.fastflowtransform/target/run_results.json new file mode 100644 index 0000000..0f5c774 --- /dev/null +++ b/examples/simple_duckdb/.fastflowtransform/target/run_results.json @@ -0,0 +1,82 @@ +{ + 
"metadata": { + "generated_at": "2025-10-28T19:05:02+00:00", + "tool": "fastflowtransform" + }, + "results": [ + { + "duration_ms": 0, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "ephemeral_ids.ff", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 1, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "mart_orders_enriched", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 1, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "mart_users.ff", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 4, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "orders.ff", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 1, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "users.ff", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 1, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "users_enriched", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 2, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "v_users.ff", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + }, + { + "duration_ms": 2, + "finished_at": "2025-10-28T19:05:02+00:00", + "http": null, + "message": null, + "name": "v_users_enriched.ff", + "started_at": "2025-10-28T19:05:02+00:00", + "status": "success" + } + ], + "run_finished_at": "2025-10-28T19:05:02+00:00", + "run_started_at": "2025-10-28T19:05:02+00:00" +} diff --git a/examples/simple_duckdb/site/dag/index.html b/examples/simple_duckdb/site/dag/index.html index feb098d..0402a3a 100644 --- 
a/examples/simple_duckdb/site/dag/index.html +++ b/examples/simple_duckdb/site/dag/index.html @@ -118,6 +118,8 @@

DAG

ephemeral + incremental +
flowchart TD classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; diff --git a/examples/simple_duckdb/site/dag/mart_orders_enriched.html b/examples/simple_duckdb/site/dag/mart_orders_enriched.html index 1cf0946..135623b 100644 --- a/examples/simple_duckdb/site/dag/mart_orders_enriched.html +++ b/examples/simple_duckdb/site/dag/mart_orders_enriched.html @@ -114,197 +114,6 @@

Metadata

-
-

Columns

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeNullableDescriptionLineage
order_idBIGINT - - yes - - - - — - - - - unknown - -
user_idBIGINT - - yes - - - - — - - - - unknown - -
amountDOUBLE - - yes - - - - — - - - - unknown - -
idBIGINT - - yes - - - - — - - - - unknown - -
emailVARCHAR - - yes - - - - — - - - - unknown - -
signup_tsTIMESTAMP - - yes - - - - — - - - - unknown - -
is_gmailBOOLEAN - - yes - - - - — - - - - unknown - -
valid_amtBOOLEAN - - yes - - - - — - - - - - ?.amount - - transformed - - - - -
-
-