Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
105 changes: 48 additions & 57 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,75 +66,66 @@ jobs:
- name: Unit tests (fast)
env:
PYTHONWARNINGS: default
run: uv run pytest -q tests -m "not slow and not postgres" --maxfail=1
run: uv run pytest -q tests -m unit --maxfail=1

# ---------- smoke: examples/simple_duckdb with view + ephemeral ----------
smoke-duckdb:
# ---------- Examples: Integration Tests ----------
examples-matrix:
runs-on: ubuntu-latest
needs: checks
strategy:
fail-fast: false
matrix:
engine: [duckdb, postgres, databricks_spark]

services:
postgres:
image: postgres:16
env:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: fastflowtransform
ports:
- 5432:5432
options: >-
--health-cmd "pg_isready -U postgres"
--health-interval 10s
--health-timeout 5s
--health-retries 5

steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4

- name: Setup uv (and Python)
uses: astral-sh/setup-uv@v5
with:
python-version: "3.12"
enable-cache: true

- name: Sync deps
run: uv sync

- name: Prepare ephemeral + view models in example
shell: bash
run: |
set -euo pipefail
PROJECT="examples/simple_duckdb"
mkdir -p "${PROJECT}/models"

cat > "${PROJECT}/models/ephemeral_ids.ff.sql" <<'SQL'
{{ config(materialized='ephemeral') }}
select id from {{ source('crm','users') }}
SQL

cat > "${PROJECT}/models/v_users.ff.sql" <<'SQL'
{{ config(materialized='view') }}
select u.id
from {{ ref('users.ff') }} u
join {{ ref('ephemeral_ids.ff') }} e using(id)
SQL

- name: Seed example (DuckDB file db)
env:
FF_ENGINE: duckdb
FF_DUCKDB_PATH: examples/simple_duckdb/.local/demo.duckdb
run: uv run fft seed examples/simple_duckdb --env dev
- name: Sync deps (dev)
run: uv sync --extra dev --frozen

- name: Run models (ephemeral inline + view materialization)
env:
FF_ENGINE: duckdb
FF_DUCKDB_PATH: examples/simple_duckdb/.local/demo.duckdb
run: uv run fft run examples/simple_duckdb --env dev
- name: Setup Java for Spark
if: matrix.engine == 'databricks_spark'
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: '17'

- name: Smoke assertions (query DuckDB)
run: |
uv run python - <<'PY'
import duckdb, pathlib
db = "examples/simple_duckdb/.local/demo.duckdb"
assert pathlib.Path(db).exists(), "DuckDB file not found"
con = duckdb.connect(db)
n = con.execute("select count(*) from v_users").fetchone()[0]
assert n >= 1, f"v_users empty (count={n})"
existing = {r[0] for r in con.execute(
"select table_name from information_schema.tables where table_schema in ('main','temp')"
).fetchall()}
assert "ephemeral_ids" not in existing, "ephemeral_ids should not be materialized"
print("✓ smoke ok: v_users present, ephemeral inlined")
PY

- name: Build DAG (optional sanity)
- name: Run example/integration tests for engine
env:
FF_ENGINE: duckdb
FF_DUCKDB_PATH: examples/simple_duckdb/.local/demo.duckdb
FF_PG_DSN: postgresql+psycopg://postgres:postgres@localhost:5432/fastflowtransform
FF_PG_SCHEMA: ci_examples
run: |
uv run fft dag examples/simple_duckdb --env dev --html
test -f examples/simple_duckdb/site/dag/index.html
echo "Running integration tests for engine=${{ matrix.engine }}"
# The pytest marker for each engine is named exactly like the matrix entry,
# so select it directly. A per-engine `case` without a default branch would
# run nothing (and still pass) for an engine added to the matrix later.
uv run pytest -m "integration and ${{ matrix.engine }}" --maxfail=1 -q tests
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Envs & Secrets
.env
.env.*
.env.local
.env.*.local

# Local DBs / Artifacts
*.duckdb
Expand Down Expand Up @@ -36,7 +36,7 @@ spark-warehouse
metastore_db
derby.log
.fastflowtransform
Combined.md
_exports/**

# Editors / IDEs
.vscode/
Expand Down
3 changes: 3 additions & 0 deletions Makefile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,6 @@ act-commit:

# Concatenate the Markdown docs into a single Combined.md using _scripts/concat_docs.py.
concat-docs:
	$(UV) run python _scripts/concat_docs.py -o Combined.md

# Export the non-ignored files of examples/incremental_demo into one Markdown
# file under _exports/, skipping .html and .css files.
# Uses `$(UV) run python`, matching the other script targets in this file;
# without `run` the script path would be passed to the `python` subcommand of
# $(UV) instead of being executed.
export-demo:
	$(UV) run python _scripts/export_subdir_md.py examples/incremental_demo -o _exports/incremental_demo_export.md --exclude-ext html css
3 changes: 3 additions & 0 deletions Makefile.pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@

FFT := FF_ENGINE=duckdb FF_DUCKDB_PATH="$(FF_DB)" fft

# Run `fft init` for the examples/materializations_demo project.
# NOTE(review): the neighbouring targets invoke the tool through $(FFT) and
# "$(FF_PROJECT)", while this one uses $(UV) and a hard-coded path. Confirm
# that $(UV) is defined for this Makefile and that it expands to a runner
# prefix (e.g. `uv run`); Makefile.dev writes `$(UV) run ...` explicitly.
init:
	$(UV) fft init examples/materializations_demo

seed:
$(FFT) seed "$(FF_PROJECT)" --env dev

Expand Down
4 changes: 1 addition & 3 deletions _scripts/concat_docs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# concat_docs.py
# _scripts/concat_docs.py
"""
Concatenates all Markdown files from the docs directory into a single file.
- Respects the order in mkdocs.yml (nav).
Expand All @@ -15,7 +14,6 @@
from __future__ import annotations
import argparse
import fnmatch
import os
from pathlib import Path
import re
import sys
Expand Down
220 changes: 220 additions & 0 deletions _scripts/export_subdir_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env python3
import argparse
import subprocess
from pathlib import Path


def get_git_root() -> Path:
    """Return the root directory of the current Git repository.

    Runs ``git rev-parse --show-toplevel`` in the current working directory
    and wraps the printed path in a ``Path``.

    Raises:
        SystemExit: If git reports an error (for example when the working
            directory is not inside a repository) or if the ``git``
            executable cannot be started at all.
    """
    try:
        out = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True)
    except subprocess.CalledProcessError as exc:
        raise SystemExit("Error: This script must be run inside a Git repository.") from exc
    except OSError as exc:
        # check_output raises OSError (usually FileNotFoundError) when the git
        # binary is missing; report it like every other fatal error here
        # instead of letting a traceback escape.
        raise SystemExit(f"Error: Could not execute 'git': {exc}") from exc
    return Path(out.strip())


def get_git_files(git_root: Path) -> list[Path]:
    """Return all files in the repository that Git does not ignore.

    Lists tracked files (``--cached``) plus untracked files (``--others``)
    while honouring the standard ignore rules (``--exclude-standard``).

    Args:
        git_root: Repository root; ``git ls-files`` is run with this as its
            working directory and every returned path is prefixed with it.

    Returns:
        Paths of the form ``git_root / <repo-relative name>``, in the order
        Git reported them, without duplicates.

    Raises:
        SystemExit: If ``git ls-files`` fails or ``git`` cannot be executed.
    """
    try:
        out = subprocess.check_output(
            # -z makes git emit NUL-terminated, unquoted names. Without it,
            # names containing non-ASCII or special characters are C-quoted
            # (e.g. "caf\303\251.txt") and would no longer match real files.
            ["git", "ls-files", "-z", "--cached", "--others", "--exclude-standard"],
            text=True,
            cwd=git_root,
        )
    except subprocess.CalledProcessError as e:
        raise SystemExit(f"Error while running 'git ls-files': {e}")
    except OSError as e:
        # Raised when the git executable itself cannot be started.
        raise SystemExit(f"Error while running 'git ls-files': {e}")
    # Split on NUL rather than stripping lines so that leading/trailing
    # whitespace in file names survives; dict.fromkeys drops repeated names
    # while keeping the original order.
    names = dict.fromkeys(name for name in out.split("\0") if name)
    return [git_root / name for name in names]


def is_under_dir(path: Path, directory: Path) -> bool:
    """Tell whether 'path' equals 'directory' or lies somewhere below it.

    The check is purely lexical (no filesystem access), so callers should
    pass paths that are already normalised the same way.
    """
    return path.is_relative_to(directory)


def is_binary_file(path: Path, chunk_size: int = 2048) -> bool:
    """Heuristically decide whether a file is binary.

    Reads the first 'chunk_size' bytes and treats the file as binary when
    they contain a NUL byte or are not valid UTF-8.

    Args:
        path: File to inspect.
        chunk_size: Number of leading bytes to examine.

    Returns:
        True if the file looks binary or cannot be read, False otherwise.
    """
    # Local import keeps this function self-contained; codecs is stdlib.
    import codecs

    try:
        with path.open("rb") as f:
            chunk = f.read(chunk_size)
    except OSError:
        # If file cannot be read for some reason, treat it as binary
        return True

    # NUL byte => treat as binary
    if b"\0" in chunk:
        return True

    # A full-size chunk may end in the middle of a multi-byte UTF-8 character.
    # Decoding incrementally with final=False tolerates such a dangling prefix
    # instead of misreporting a valid text file as binary. Only when the whole
    # file fit into the chunk is an incomplete trailing sequence a real error.
    reached_eof = len(chunk) < chunk_size
    try:
        codecs.getincrementaldecoder("utf-8")().decode(chunk, final=reached_eof)
    except UnicodeDecodeError:
        return True
    return False


def build_tree_structure(files: list[Path], base_dir: Path) -> str:
    """
    Render the given files as an indented text tree rooted at 'base_dir'.

    Every entry of 'files' must lie under 'base_dir'. The first output line is
    the name of 'base_dir' followed by "/". Within each directory the files
    come first (sorted), then the sub-directories (sorted, with a trailing
    "/"), each level indented one step further than its parent.
    """
    # Nested dicts: directory names (with trailing "/") map to child nodes,
    # and the reserved "__files__" key holds the file names of that directory.
    root: dict = {}
    for file_path in files:
        parts = file_path.relative_to(base_dir).parts
        node = root
        for dirname in parts[:-1]:
            node = node.setdefault(dirname + "/", {})
        node.setdefault("__files__", []).append(parts[-1])

    def render(node: dict, prefix: str) -> list[str]:
        """Return the output lines for 'node', each starting with 'prefix'."""
        rendered = [f"{prefix}{name}" for name in sorted(node.get("__files__", []))]
        for dirname in sorted(node.keys() - {"__files__"}):
            rendered.append(f"{prefix}{dirname}")
            rendered.extend(render(node[dirname], prefix + " "))
        return rendered

    return "\n".join([base_dir.name + "/", *render(root, " ")])


def normalize_ext_list(exts: list[str]) -> set[str]:
    """
    Turn user-supplied extensions into a canonical set.

    Surrounding whitespace is removed, blank entries are dropped, a leading
    dot (.) is added where missing, and everything is lowercased.
    """
    trimmed = (raw.strip() for raw in exts)
    return {
        (ext if ext.startswith(".") else "." + ext).lower()
        for ext in trimmed
        if ext
    }


def _fence_for(text: str) -> str:
    """Return a backtick fence that is longer than any backtick run in 'text'."""
    longest = run = 0
    for ch in text:
        run = run + 1 if ch == "`" else 0
        longest = max(longest, run)
    # Markdown needs at least three backticks; one more than the longest run
    # inside the content guarantees the content cannot close the block early.
    return "`" * max(3, longest + 1)


def main():
    """Command-line entry point: export a subdirectory into one Markdown file.

    Collects every file under the given subdirectory that Git does not ignore
    (minus excluded extensions and the output file itself), then writes a
    Markdown document containing a directory tree, the content of each text
    file in a fenced block, and a list of files skipped as binary/unreadable.

    Raises:
        SystemExit: If the subdirectory is outside the repository, is not a
            directory, or no files remain after filtering.
    """
    parser = argparse.ArgumentParser(
        description="Concatenate the contents of all non-ignored files in a subdirectory into a Markdown file."
    )
    parser.add_argument(
        "subdir", help="Subdirectory inside the Git repository (relative or absolute)."
    )
    parser.add_argument(
        "-o",
        "--output",
        default="combined.md",
        help="Path to the output Markdown file (default: combined.md)",
    )
    parser.add_argument(
        "--exclude-ext",
        nargs="*",
        default=[],
        help="File extensions to exclude, e.g. --exclude-ext .html .css js",
    )
    args = parser.parse_args()

    # Resolve the root as well so it compares consistently with the resolved
    # subdirectory and output paths below.
    git_root = get_git_root().resolve()
    subdir_path = Path(args.subdir).resolve()
    output_path = Path(args.output).resolve()

    # Ensure that the given subdirectory is inside the Git repository
    if not is_under_dir(subdir_path, git_root):
        raise SystemExit(
            f"Error: The given subdirectory is not inside the Git repository: {subdir_path}"
        )

    if not subdir_path.is_dir():
        raise SystemExit(f"Error: {subdir_path} is not a directory.")

    all_git_files = get_git_files(git_root)

    # Filter to files under the given subdirectory. The output file is left
    # out: it is truncated when opened for writing, so exporting it into
    # itself would be meaningless.
    files_in_subdir = [
        f
        for f in all_git_files
        if is_under_dir(f, subdir_path) and f.is_file() and f != output_path
    ]

    # Normalize and apply excluded extensions
    excluded_exts = normalize_ext_list(args.exclude_ext)
    if excluded_exts:
        files_in_subdir = [f for f in files_in_subdir if f.suffix.lower() not in excluded_exts]

    files_in_subdir = sorted(files_in_subdir)

    if not files_in_subdir:
        raise SystemExit(
            "No matching files found in the subdirectory (or all are excluded/ignored)."
        )

    # Build directory tree for Markdown
    tree_md = build_tree_structure(files_in_subdir, subdir_path)

    # The target directory may not exist yet (e.g. an ignored export folder
    # on a fresh checkout), so create it before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    skipped_binary = []

    with output_path.open("w", encoding="utf-8") as out:
        # Title
        out.write(f"# Export from `{subdir_path.relative_to(git_root)}`\n\n")

        # Directory structure
        tree_fence = _fence_for(tree_md)
        out.write("## Directory structure\n\n")
        out.write(f"{tree_fence}text\n")
        out.write(tree_md)
        out.write(f"\n{tree_fence}\n\n")

        # Files
        out.write("## Files\n\n")

        for file_path in files_in_subdir:
            rel = file_path.relative_to(git_root)
            if is_binary_file(file_path):
                skipped_binary.append(rel)
                continue

            try:
                content = file_path.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                # The sniffed prefix looked like text but the rest is not
                # valid UTF-8: keep a placeholder section and list the file
                # among the skipped ones.
                skipped_binary.append(rel)
                out.write(f"### `{rel}`\n\n")
                out.write("```text\n")
                out.write("[File could not be read as UTF-8]\n")
                out.write("```\n\n")
                continue

            # Exported files (Markdown in particular) may contain ``` lines
            # themselves; size the fence so they cannot terminate the block.
            fence = _fence_for(content)
            out.write(f"### `{rel}`\n\n")
            out.write(f"{fence}text\n")
            out.write(content)
            if not content.endswith("\n"):
                out.write("\n")
            out.write(f"{fence}\n\n")

        if skipped_binary:
            out.write("## Skipped files (binary or not readable)\n\n")
            for rel in skipped_binary:
                out.write(f"- `{rel}`\n")

    print(f"Done! Output written to: {output_path}")


if __name__ == "__main__":
main()
Loading