diff --git a/.flowforge/cache/dev-duckdb.json b/.flowforge/cache/dev-duckdb.json new file mode 100644 index 0000000..e28ef33 --- /dev/null +++ b/.flowforge/cache/dev-duckdb.json @@ -0,0 +1,6 @@ +{ + "engine": "duckdb", + "entries": {}, + "profile": "dev", + "version": 1 +} \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 86f9cbd..7e3bac2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,8 @@ name: CI +env: + ACT: "" + on: push: branches: [ main ] @@ -63,7 +66,7 @@ jobs: - name: Unit tests (fast) env: PYTHONWARNINGS: default - run: uv run pytest -q -m "not slow and not postgres" --maxfail=1 + run: uv run pytest -q tests -m "not slow and not postgres" --maxfail=1 # ---------- smoke: examples/simple_duckdb with view + ephemeral ---------- smoke-duckdb: diff --git a/.gitignore b/.gitignore index 823c543..97ebb6e 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,4 @@ dist/ # Docs Output examples/**/docs/ -FlowForge.md +tickets/** diff --git a/Makefile.dev b/Makefile.dev index 0476201..bba49be 100644 --- a/Makefile.dev +++ b/Makefile.dev @@ -24,6 +24,9 @@ test-pg-batch: unittest: FLOWFORGE_SQL_DEBUG=1 $(UV) run pytest -q tests +cover: + uv run pytest --cov=src/flowforge --cov-report=term-missing --cov-report=xml --cov-report=html + utest: flowforge utest "$(FF_PROJECT)" --env "$(FF_ENV)" diff --git a/Makefile.pipeline b/Makefile.pipeline index 984b913..b24eb73 100644 --- a/Makefile.pipeline +++ b/Makefile.pipeline @@ -10,7 +10,14 @@ seed: # Run/DAG/Test reuse the same duckdb path (FF_ENV can switch engine) run: - $(FLOWFORGE) run "$(FF_PROJECT)" --env "$(FF_ENV)" + $(FLOWFORGE) run "$(FF_PROJECT)" --env "$(FF_ENV)" --jobs=1 + +run_parallel: + # Two independent staging nodes ("users", "orders") run in parallel + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --jobs 4 + +run-parallel: + $(FLOWFORGE) run "$(FF_PROJECT)" --env "$(FF_ENV)" --jobs=4 --keep-going dag: $(FLOWFORGE) dag "$(FF_PROJECT)" --env "$(FF_ENV)" --html @@ -32,3 +39,22 @@ demo: seed run dag demo-open test clean: rm -rf .local "$(FF_PROJECT)/docs" dist build *.egg-info + +# --- Cache demos (v0.3) --- + +cache_rw_first: + # first run writes cache and meta + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --cache=rw + +cache_rw_second: + # second run: should be a no-op (skips) if nothing changed + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --cache=rw + +cache_invalidate_env: + # changing an FF_* env var invalidates fingerprints + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" FF_DEMO_TOGGLE=1 flowforge run "$(PROJECT)" --env dev --cache=rw + +rebuild_users: + # force rebuild of a single model regardless of cache + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --cache=rw --rebuild users.ff + \ No newline at end of file diff --git a/README.md b/README.md index e8dd866..a39f699 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FlowForge (PoC 0.1) +# FlowForge (PoC 0.3) [![CI](https://github.com///actions/workflows/ci.yml/badge.svg)](https://github.com///actions/workflows/ci.yml) [![PyPI version](https://img.shields.io/pypi/v/flowforge.svg)](https://pypi.org/project/flowforge/) @@ -94,6 +94,96 @@ flowforge test examples/simple_duckdb --env dev --select batch --- +> For a deep dive into the v0.3 features, see **[Parallelism & Cache](docs/Cache_and_Parallelism.md)**. + +## Parallelism & Cache (v0.3) + +FlowForge 0.3 adds a level-wise parallel scheduler and an opt-in build cache. + +### Parallel execution +- DAG is split into **levels** (all nodes with the same maximum distance from sources). +- Within a level, up to `--jobs` nodes run **in parallel**. Dependencies are never violated. +- `--keep-going`: tasks already started in a level run to completion, but **subsequent levels won’t start** if any task in the current level fails. + +**Examples** +```bash +# run with 4 workers per level +flowforge run examples/simple_duckdb --env dev --jobs 4 + +# keep tasks in the current level running even if one fails +flowforge run examples/simple_duckdb --env dev --jobs 4 --keep-going +``` + +### Cache modes +The cache decides whether a node can be **skipped** when nothing relevant changed. + +``` +--cache=off # always build +--cache=rw # default: skip on match; write cache after build +--cache=ro # skip on match; build on miss, but don't write cache +--cache=wo # always build and write cache +--rebuild # ignore cache for selected nodes +--no-cache # alias for --cache=off +``` + +**When is a node skipped?** +FlowForge computes a **fingerprint** from: +- SQL/Python source (rendered SQL or function source) +- environment context (engine, profile name, selected `FF_*` env vars, normalized `sources.yml`) +- **dependency fingerprints** (change upstream ⇒ downstream fingerprint changes) +The node is skipped if the fingerprint matches the on-disk cache **and** the physical relation exists. + +**Examples** +```bash +# first run (build + cache write) +flowforge run . --env dev --cache=rw + +# second run (no-op if nothing changed) +flowforge run . --env dev --cache=rw + +# force rebuild of a specific model +flowforge run . --env dev --cache=rw --rebuild marts_daily.ff + +# diagnose a surprising skip: change an FF_* env var to invalidate fingerprints +FF_DEMO_TOGGLE=1 flowforge run . --env dev --cache=rw +``` + +**Troubleshooting** +- *“Why did it skip?”* → Compare your last changes: SQL/Python code, `sources.yml`, `FF_*` env vars, profile/engine. Any change alters the fingerprint. +- *“Relation missing but cache says skip”* → FlowForge also checks relation existence; if it was dropped externally, it will **rebuild**. +- *“Parallel tasks interleave logs”* → Logs are serialized via an internal queue to keep lines readable; use `-v`/`-vv` for more detail. + +--- + +## Selective runs + +Use patterns to run only a subgraph. + +- `--select `: builds only targets that match **and their dependencies**. +- `--exclude `: excludes matching targets from the build (deps remain if still required). + +Examples: + flowforge run . --select marts_daily.ff + flowforge run . --exclude 'mart_*' + +--- + +## Rebuild controls + +- `--rebuild` → rebuild **all selected** nodes (ignore cache). +- `--rebuild-only NAME …` → rebuild only the specified nodes (ignore cache). + +These flags compose with `--select/--exclude`. + +Examples: + # Rebuild everything that matches --select + flowforge run . --select marts_daily.ff --rebuild + + # Rebuild only a specific node + flowforge run . --rebuild-only marts_daily.ff + +--- + ## Documentation - **Documentation hub:** choose your path (operators vs contributors) — see [`docs/index.md`](docs/index.md). diff --git a/docs/Cache_and_Parallelism.md b/docs/Cache_and_Parallelism.md new file mode 100644 index 0000000..7e580bb --- /dev/null +++ b/docs/Cache_and_Parallelism.md @@ -0,0 +1,217 @@ +### 🆕 `docs/Cache_and_Parallelism.md` + +````markdown +# Parallelism & Cache (FlowForge v0.3) + +FlowForge 0.3 introduces a level-wise parallel scheduler and a build cache driven by stable fingerprints. This document explains **how parallel execution works**, **when nodes are skipped**, the exact **fingerprint formula**, and the **meta table** written after successful builds. + +--- + +## Table of Contents +- [Parallel Scheduler](#parallel-scheduler) +- [Cache Policy](#cache-policy) +- [Fingerprint Formula](#fingerprint-formula) +- [Meta Table Schema](#meta-table-schema) +- [CLI Recipes](#cli-recipes) +- [Troubleshooting & FAQ](#troubleshooting--faq) +- [Example: simple_duckdb](#example-simple_duckdb) +- [Appendix: Environment Inputs](#appendix-environment-inputs) + +--- + +## Parallel Scheduler + +FlowForge splits the DAG into **levels** (all nodes that can run together without violating dependencies). Within a level, up to `--jobs` nodes execute in **parallel**. + +- Dependencies are **never** violated. +- `--keep-going`: tasks already started in a level finish; **subsequent levels won’t start** if any task in the current level fails. +- Logs are serialized through an internal queue to keep lines readable and per-node timing visible. + +**Quick start** +```bash +# Run with 4 workers per level +flowforge run . --env dev --jobs 4 + +# Keep tasks in the same level running even if one fails +flowforge run . --env dev --jobs 4 --keep-going +```` + +--- + +## Cache Policy + +The cache decides whether a node can be **skipped** when nothing relevant changed. Modes: + +``` +--cache=off # always build +--cache=rw # default; skip on match; write cache after build +--cache=ro # skip on match; on miss build but don't write cache +--cache=wo # always build and write cache +--rebuild # ignore cache for matching nodes +--no-cache # alias for --cache=off +``` + +### Skip condition + +A node is skipped iff: + +1. The current **fingerprint** matches the on-disk cache value, **and** +2. The **physical relation exists** on the target engine. + +If the relation was dropped externally, FlowForge will **rebuild** even if the fingerprint matches. + +--- + +## Fingerprint Formula + +Fingerprints are stable hashes that change on any relevant input: + +* **SQL models**: `fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` + + * Uses **rendered** SQL (after Jinja), not the raw template. +* **Python models**: `fingerprint_py(node, func_src, env_ctx, dep_fps)` + + * Uses `inspect.getsource(func)` with a **file-content fallback** if needed. + +`env_ctx` includes: + +* `engine` (e.g., `duckdb`, `postgres`, `bigquery`) +* `profile_name` (CLI `--env`) +* Selected environment entries: **all `FF_*` keys** (key + value) +* A **normalized** portion of `sources.yml` (sorted keys/dump) + +`dep_fps` are upstream fingerprints; **any upstream change** invalidates downstream fingerprints. + +**Properties** + +* Same inputs ⇒ same hash. +* Minimal change in SQL/function ⇒ different hash. +* Dependency changes propagate downstream. + +--- + +## Meta Table Schema + +After a successful build, FlowForge writes a per-node audit row: + +``` +_ff_meta ( + node_name TEXT/STRING, -- logical name, e.g. "users.ff" + relation TEXT/STRING, -- physical table/view, e.g. "users" + fingerprint TEXT/STRING, + engine TEXT/STRING, + built_at TIMESTAMP +) +``` + +Backends: + +* **DuckDB:** table `_ff_meta` in `main`. +* **Postgres:** table `_ff_meta` in the active schema. +* **BigQuery:** table `._ff_meta`. + +> Note: Skip logic uses the file-backed fingerprint cache and a direct relation existence check; the meta table is for auditing and tooling. + +--- + +## CLI Recipes + +```bash +# First run — builds everything, writes cache and meta +flowforge run . --env dev --cache=rw + +# No-op run — should skip all nodes (if nothing changed) +flowforge run . --env dev --cache=rw + +# Force rebuild of a single model (ignores cache for it) +flowforge run . --env dev --cache=rw --rebuild marts_daily.ff + +# Read-only cache (skip on match, build on miss, no writes) +flowforge run . --env dev --cache=ro + +# Always build and write cache +flowforge run . --env dev --cache=wo + +# Disable cache entirely +flowforge run . --env dev --no-cache +``` + +With parallelism: + +```bash +flowforge run . --env dev --jobs 4 +flowforge run . --env dev --jobs 4 --keep-going +``` + +--- + +## Troubleshooting & FAQ + +**“Why did it skip?”** +A skip requires a fingerprint match and an existing relation. Fingerprints include: + +* rendered SQL / Python function source, +* `sources.yml` (normalized), +* engine/profile, +* **all `FF_*` environment variables**, +* upstream fingerprints. + +Any change in the above triggers a rebuild downstream. + +**“Relation missing but cache says skip?”** +We also check relation existence. If the table/view was dropped externally, FlowForge will **rebuild**. + +**“My logs interleave under parallelism.”** +Logs are serialized via a queue; use `-v` / `-vv` for richer but still stable output. Each node prints start/end and duration; levels summarize. + +**“Utest cache?”** +`flowforge utest --cache {off|ro|rw}` defaults to `off` for deterministic runs. With `rw`, expensive unit cases can be accelerated. Unit tests do not rely on the meta table by default. + +--- + +## Example: simple_duckdb + +The demo contains two independent staging nodes (`users.ff.sql`, `orders.ff.sql`). They run in **parallel** within the same level. + +Makefile targets: + +```makefile +run_parallel: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --jobs 4 + +cache_rw_first: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --cache=rw + +cache_rw_second: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" flowforge run "$(PROJECT)" --env dev --cache=rw + +cache_invalidate_env: + FF_ENGINE=duckdb FF_DUCKDB_PATH="$(DB)" FF_DEMO_TOGGLE=1 flowforge run "$(PROJECT)" --env dev --cache=rw +``` + +--- + +## Appendix: Environment Inputs + +Only environment variables with the `FF_` prefix affect fingerprints (keys and values). If you change one (e.g., `FF_RUN_DATE`, `FF_REGION`), fingerprints change and downstream nodes rebuild. + +```bash +# Will invalidate fingerprints and rebuild affected nodes +FF_RUN_DATE=2025-01-01 flowforge run . --env dev --cache=rw +``` + +```` + +--- + +### 🔗 `docs/index.md` – Link zum neuen Kapitel + +```diff +--- a/docs/index.md ++++ b/docs/index.md +@@ -10,6 +10,7 @@ + - [User Guide – Operational](./Technical_Overview.md#part-i--operational-guide) + - [Modeling Reference](./Config_and_Macros.md) + - [Parallelism & Cache (v0.3)](./Cache_and_Parallelism.md) + - [Developer Guide – Architecture & Internals](./Technical_Overview.md#part-ii--architecture--internals) +```` diff --git a/docs/Config_and_Macros.md b/docs/Config_and_Macros.md index 1b5da56..de1001d 100644 --- a/docs/Config_and_Macros.md +++ b/docs/Config_and_Macros.md @@ -2,6 +2,13 @@ > Authoritative reference for FlowForge’s modeling layer: SQL/Python models, configuration macros, templating helpers, and testing hooks. > Works with FlowForge v0.1 (T1–T11). Supported engines: DuckDB, Postgres, BigQuery (pandas & BigFrames), Databricks/Spark, Snowflake/Snowpark. +> **Execution & Cache (v0.3) quick notes** +> - Parallelism is level-wise; use `flowforge run --jobs N`. +> - Use `--cache={off|ro|rw|wo}` to control skipping behavior. +> - Fingerprints include rendered SQL / Python function source, selected `FF_*` env vars, `sources.yml` and upstream fingerprints. +> - Change any of these → downstream nodes rebuild. +> - `--rebuild ` forces rebuilding selected models (ignores cache). + For an operational walkthrough (CLI usage, troubleshooting, pipelines) see the [Technical Overview](./Technical_Overview.md). This document focuses purely on how you author and test models. diff --git a/docs/Technical_Overview.md b/docs/Technical_Overview.md index a1ea72c..8f2e2b3 100644 --- a/docs/Technical_Overview.md +++ b/docs/Technical_Overview.md @@ -30,8 +30,14 @@ - [Troubleshooting](#troubleshooting) - [Error Codes](#error-codes) - [Profiles & Environment Overrides](#profiles--environment-overrides) + - [Parallel Scheduler (v0.3)](#parallel-scheduler-v03) + - [Cache Policy (v0.3)](#cache-policy-v03) + - [Fingerprint Formula (v0.3)](#fingerprint-formula-v03) + - [Meta Table Schema (v0.3)](#meta-table-schema-v03) - [Jinja DSL Quick Reference](#jinja-dsl-quick-reference) - [Roadmap Snapshot](#roadmap-snapshot) + - [Cross-Table Reconciliations](#cross-table-reconciliations) + - [Auto-Docs & Lineage](#auto-docs--lineage) - [Part II – Architecture & Internals](#part-ii--architecture--internals) - [Architecture Overview](#architecture-overview) - [Core Modules](#core-modules) @@ -206,6 +212,12 @@ flowforge run . -v # verbose progress (model names, executor info) flowforge run . -vv # full debug + SQL channel ``` +#### Parallel logging UX + +- Per node: start/end lines with duration, truncated name, and engine abbrev (DUCK/PG/BQ/…). +- Output is line-stable via a thread-safe log queue; per-level summaries at the end. +- On errors, the familiar “error block” is shown per node. + **Notes** - SQL debug output routes through the `flowforge.sql` logger; use `-vv` or the env var to see it. @@ -215,6 +227,19 @@ flowforge run . -vv # full debug + SQL channel `flowforge utest` executes a single model in isolation, loading only the inputs you provide and comparing the result to an expected dataset. It works for SQL and Python models and runs against DuckDB or Postgres by default. +#### Unit tests & cache + +`flowforge utest --cache {off|ro|rw}` (default: `off`) + +- `off`: deterministic, never skips. +- `ro`: skip on cache hit; on miss, build but **do not write** cache. +- `rw`: skip on hit; on miss, build **and write** fingerprint. + +Notes: +- UTests key the cache with `profile="utest"`. +- Fingerprints include case inputs (CSV content hash / inline rows), so changing inputs invalidates the cache. + + #### Why? - Fast feedback on transformation logic without full DAG runs @@ -433,6 +458,99 @@ bq: For the Pydantic models and resolution flow, see [Settings Infrastructure](#settings-infrastructure). +### Parallel Scheduler (v0.3) + +FlowForge executes the DAG in **levels**. Each level contains nodes without mutual dependencies. + +- `--jobs N` limits the **maximum concurrency per level**. +- `--keep-going` keeps tasks within the current level running even if one fails; subsequent levels are not started. + +**CLI** +```bash +flowforge run . --env dev --jobs 4 # parallel (level-wise) +flowforge run . --env dev --jobs 4 --keep-going + +flowforge run . --select model_b --jobs 4 # Run only model_b and whatever it depends on +flowforge run . --rebuild-only model_b # Rebuild only model_b, even if cache hits +``` + +**Internals** +- `dag.levels(nodes)` builds level lists using indegrees. +- `run_executor.schedule(levels, jobs, fail_policy)` spawns a thread pool per level and aggregates timings. + +### Cache Policy (v0.3) + +**Modes** +``` +off – always build +rw – default; skip if fingerprint matches and relation exists; write cache after build +ro – skip on match; on miss build but do not write cache +wo – always build and write cache +``` +`--rebuild ` ignores cache for matching nodes. + +**Skip condition** +1) Fingerprint matches the stored value (file-backed cache) +2) Physical relation exists on the target engine + +**Examples** +```bash +flowforge run . --env dev --cache=rw +flowforge run . --env dev --cache=ro +flowforge run . --env dev --cache=rw --rebuild marts_daily.ff +``` + +### Fingerprint Formula (v0.3) + +**SQL nodes**: +`fingerprint_sql(node, rendered_sql, env_ctx, dep_fps)` + +**Python nodes**: +`fingerprint_py(node, func_src, env_ctx, dep_fps)` + +**`env_ctx` content** +- `engine` (e.g. duckdb, postgres, bigquery) +- `profile_name` (CLI `--env`) +- selected environment keys/values: all `FF_*` +- normalized excerpt of `sources.yml` (sorted dump) + +**Properties** +- Same inputs ⇒ same hash. +- Minimal change in SQL/function ⇒ different hash. +- Any dependency fingerprint change bubbles downstream via `dep_fps`. + +### Meta Table Schema (v0.3) + +FlowForge writes a per-node audit row after successful builds: + +``` +_ff_meta ( + node_name TEXT / STRING -- logical name, e.g. "users.ff" + relation TEXT / STRING -- physical name, e.g. "users" + fingerprint TEXT / STRING + engine TEXT / STRING + built_at TIMESTAMP +) +``` + +**Backends** +- DuckDB: table `_ff_meta` in `main`. +- Postgres: table `_ff_meta` in the active schema. +- BigQuery: table `._ff_meta`. + +**Notes** +- Meta is currently used for auditing and tooling; skip logic relies on fingerprint cache + relation existence checks. + +#### Executor meta hook + +After a successful materialization the executor calls: + on_node_built(node, relation, fingerprint) + +This performs an upsert into `_ff_meta` with `(node_name, relation, fingerprint, built_at, engine)`. + +Skipped nodes do **not** touch the meta table. + + ### Jinja DSL Quick Reference `ref()`, `source()`, `var()`, `config()`, `this` – see details in the [Modeling Reference](./Config_and_Macros.md). @@ -451,6 +569,150 @@ For the Pydantic models and resolution flow, see [Settings Infrastructure](#sett --- +### Cross-Table Reconciliations + +FlowForge can compare aggregates and key coverage **across two tables** and surface drift with clear, numeric messages. These checks run via the standard `flowforge test` entrypoint and integrate into the DQ summary output. + +**CLI** +```bash +# only run reconciliation checks +flowforge test . --env dev --select reconcile +``` + +**YAML DSL** + +All checks live under `project.yml → tests:` and should carry the tag `reconcile` for easy selection. + +1) **Equality / Approx Equality** +```yaml +- type: reconcile_equal + name: orders_total_equals_mart + tags: [reconcile] + left: { table: orders, expr: "sum(amount)" } + right: { table: mart_orders_enriched, expr: "sum(amount)", where: "valid_amt" } + # optional tolerances: + abs_tolerance: 0.01 # |L - R| <= 0.01 + rel_tolerance_pct: 0.1 # |L - R| / max(|R|, eps) <= 0.1% (0.1) +``` + +2) **Ratio within bounds** +```yaml +- type: reconcile_ratio_within + name: orders_vs_mart_ratio + tags: [reconcile] + left: { table: orders, expr: "sum(amount)" } + right: { table: mart_orders_enriched, expr: "sum(amount)" } + min_ratio: 0.999 + max_ratio: 1.001 +``` + +3) **Absolute difference within limit** +```yaml +- type: reconcile_diff_within + name: count_stability + tags: [reconcile] + left: { table: events_raw, expr: "count(*)", where: "event_type='purchase'" } + right: { table: fct_sales, expr: "sum(txn_count)" } + max_abs_diff: 10 +``` + +4) **Coverage (anti-join = 0)** +```yaml +- type: reconcile_coverage + name: all_orders_covered + tags: [reconcile] + source: { table: orders, key: "order_id" } + target: { table: mart_orders_enriched, key: "order_id" } + # optional filters + source_where: "order_date >= current_date - interval '7 days'" + target_where: "valid_amt" +``` + +**Parameter semantics** +- `expr`: SQL snippet placed into `SELECT {expr} FROM {table}` (keep it engine-neutral: `sum(...)`, `count(*)`, simple filters). +- `where`: optional SQL appended as `WHERE {where}`. +- `abs_tolerance`: absolute tolerance on the difference. +- `rel_tolerance_pct`: relative tolerance in **percent**; denominator is `max(|right|, 1e-12)`. +- `min_ratio` / `max_ratio`: inclusive bounds for `left/right`. +- Coverage uses an anti-join (`source` minus `target` on the given key). The check passes if missing = 0. + +**Summary output** +Each reconciliation contributes a line in the summary with a compact scope, e.g.: +``` +✅ reconcile_equal orders ⇔ mart_orders_enriched (4ms) +✅ reconcile_coverage orders ⇒ mart_orders_enriched (3ms) +``` + +**Engine notes** +- DuckDB and Postgres are supported out-of-the-box. BigQuery works with simple aggregates/filters (expressions should avoid dialect-specific functions). +- For relative tolerances, the implementation guards against zero denominators with a small epsilon (`1e-12`). + + +### Auto-Docs & Lineage + +FlowForge can generate a lightweight documentation site (DAG + model detail pages) from your project: + +```bash +# Classic +flowforge dag . --env dev --html + +# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) +flowforge docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json +``` + +**Descriptions** can be provided in YAML (project.yml) and/or Markdown files. Markdown has higher priority. + +YAML in `project.yml`: + +```yaml +docs: + models: + users.ff: + description: "Raw users table imported from CRM." + columns: + id: "Primary key." + email: "User email address." + users_enriched: + description: "Adds gmail flag." + columns: + is_gmail: "True if email ends with @gmail.com" +``` + +Markdown (overrides YAML if present): + +``` +/docs/models/.md +/docs/columns//.md +``` + +Optional front matter is ignored for now (title/tags may be used later). + +**Column lineage (heuristic, best effort).** + +- SQL models: expressions like `col` / `alias AS out` / `upper(u.email) AS email_upper)` are parsed; + `u` must come from a `FROM ... AS u` that resolves to a relation. Functions mark lineage as *transformed*. +- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. +- You can override hints in YAML: + +```yaml +docs: + models: + mart_orders_enriched: + lineage: + email_upper: + from: [{ table: users, column: email }] + transformed: true +``` + +**JSON manifest** (optional via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), +and lineage per column. This is useful for custom doc portals or CI checks. + +Notes: +- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. +- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. + + + ## Part II – Architecture & Internals ### Architecture Overview diff --git a/docs/index.md b/docs/index.md index 010ea75..08c657c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,7 +8,10 @@ Welcome! This page is your starting point for FlowForge docs. Pick the track tha 1. **Getting Started** — you are here (`docs/index.md`) 2. [User Guide](./Technical_Overview.md#part-i--operational-guide) 3. [Modeling Reference](./Config_and_Macros.md) -4. [Developer Guide](./Technical_Overview.md#part-ii--architecture--internals) +4. [Parallelism & Cache](./Cache_and_Parallelism.md) +5. [Cross-Table Reconciliations](./Technical_Overview.md#cross-table-reconciliations) +6. [Auto-Docs & Lineage](./Technical_Overview.md#auto-docs--lineage) +7. [Developer Guide](./Technical_Overview.md#part-ii--architecture--internals) ## Table of Contents diff --git a/examples/postgres/.flowforge/cache/stg-postgres.json b/examples/postgres/.flowforge/cache/stg-postgres.json new file mode 100644 index 0000000..b1a6262 --- /dev/null +++ b/examples/postgres/.flowforge/cache/stg-postgres.json @@ -0,0 +1,12 @@ +{ + "engine": "postgres", + "entries": { + "mart_orders_enriched": "fc41294d6967cfcf3c9b7d2c5405210d9383e5538747f7f13bc16c96cc8754c5", + "mart_users.ff": "6a61e68266d9151e9c473340ee93ccb70146b0079371bae889e4c8313b40a8b8", + "orders.ff": "b45347dd5ad3adbf1637e637fb32e27b766995549c7b1ae4d9412a8ff1b0d375", + "users.ff": "68dbd147dcca21a36d04f031499eb8977a6fae8659873189b2a3169e560cb81e", + "users_enriched": "cf5157127bd1c72c6942a54049acd61ee8817782920534c274b4261783ceda4b" + }, + "profile": "stg", + "version": 1 +} \ No newline at end of file diff --git a/examples/postgres/project.yml b/examples/postgres/project.yml index 5cacf43..3b1f8eb 100644 --- a/examples/postgres/project.yml +++ b/examples/postgres/project.yml @@ -14,3 +14,16 @@ tests: table: users column: id tags: [batch] + + # Cross-table reconciliations (FF-310) + - type: reconcile_equal + name: orders_count_equals_mart + tags: [reconcile] + left: { table: orders, expr: "count(*)" } + right: { table: mart_orders_enriched, expr: "count(*)" } + + - type: reconcile_coverage + name: orders_fully_covered_in_mart + tags: [reconcile] + source: { table: orders, key: "order_id" } + target: { table: mart_orders_enriched, key: "order_id" } diff --git a/examples/postgres/site/dag/index.html b/examples/postgres/site/dag/index.html index 25589f9..082325c 100644 --- a/examples/postgres/site/dag/index.html +++ b/examples/postgres/site/dag/index.html @@ -3,7 +3,7 @@ - FlowForge – DAG & Mini Docs + FlowForge - DAG & Mini Docs