Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ dmypy.json
# Rust build artifacts
rust/**/target/
packages/**/target/
packages/*/Cargo.lock

# Benchmark artifacts
benchmarks/results/

# Local deploy keys
proboscis_doeff_deploy
Expand Down
23 changes: 22 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

.PHONY: help install sync lint lint-ruff lint-pyright lint-semgrep lint-semgrep-docs lint-doeff lint-packages \
test test-unit test-e2e test-packages test-all test-spec-audit-sa002 format check pre-commit-install clean \
install-opencode-spec-gap-tdd
bench-python bench-vm install-opencode-spec-gap-tdd

# Default target
help:
Expand Down Expand Up @@ -33,6 +33,10 @@ help:
@echo " make test-all Run ALL tests (core + packages)"
@echo " make test-spec-audit-sa002 Run SA-002 pytest + semgrep checks"
@echo ""
@echo "Benchmarks:"
@echo " make bench-python Run public Python benchmark runner"
@echo " make bench-vm Run criterion benchmark for doeff-vm"
@echo ""
@echo "Formatting:"
@echo " make format Format code with ruff"
@echo " make check Run format check without modifying files"
Expand Down Expand Up @@ -161,6 +165,23 @@ test-spec-audit-sa002:
exit 1; \
fi

# Interpreter paths resolved from the uv-managed environment so the Rust
# benchmark builds and links against the same CPython that `uv run` uses.
BENCH_VM_BASE_PYTHON := $(shell uv run python -c "import sys; print(sys._base_executable)")
BENCH_VM_PYTHON_HOME := $(shell uv run python -c "import sys; print(sys.base_prefix)")
BENCH_VM_SITE_PACKAGES := $(shell uv run python -c "import site; print(site.getsitepackages()[0])")
# Extra CLI arguments, overridable per invocation, e.g.:
#   make bench-python BENCH_PYTHON_ARGS="--runs 500 --iterations 50"
BENCH_PYTHON_ARGS ?=
BENCH_VM_ARGS ?=

# Run the public Python benchmark runner.
bench-python:
	uv run python benchmarks/benchmark_runner.py $(BENCH_PYTHON_ARGS)

# Run the criterion baseline bench for the doeff-vm crate. The env vars
# point the build (PYO3_PYTHON) and the runtime loader (PYTHONHOME,
# LD_LIBRARY_PATH) at the CPython resolved above; DOEFF_BENCH_SITE_PACKAGES
# is presumably read by the bench itself — confirm in the crate's bench code.
bench-vm:
	cd packages/doeff-vm && env \
		PYO3_PYTHON=$(BENCH_VM_BASE_PYTHON) \
		PYTHONHOME=$(BENCH_VM_PYTHON_HOME) \
		DOEFF_BENCH_SITE_PACKAGES=$(BENCH_VM_SITE_PACKAGES) \
		LD_LIBRARY_PATH=$(BENCH_VM_PYTHON_HOME)/lib \
		cargo bench --bench pyvm_baseline -- $(BENCH_VM_ARGS)

# =============================================================================
# Formatting
# =============================================================================
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Benchmarks

Python benchmark artifacts:

- Run `make bench-python` to execute the public `doeff.run(...)` benchmark suite.
- Results are written to `benchmarks/results/doeff_vm_benchmark_results.json` and
`benchmarks/results/doeff_vm_benchmark_results.csv`.

Rust criterion artifacts:

- Run `make bench-vm` to execute the `criterion` baseline suite for `packages/doeff-vm`.
- Criterion reports are written under `packages/doeff-vm/target/criterion/`.

Useful overrides:

- `make bench-python BENCH_PYTHON_ARGS="--runs 500 --iterations 50"`
- `make bench-vm BENCH_VM_ARGS="--sample-size 20 --measurement-time 1 --warm-up-time 1 --noplot"`
180 changes: 133 additions & 47 deletions benchmarks/benchmark_runner.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,170 @@
"""Micro-benchmarks for the doeff interpreter.
"""Benchmark runner for the public doeff Python API backed by doeff-vm.

Usage
-----
uv run python benchmarks/benchmark_runner.py --runs 500
uv run python benchmarks/benchmark_runner.py --runs 500 --iterations 25
"""

from __future__ import annotations

import argparse
import csv
import json
import statistics
import time
from collections.abc import Iterable
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path

from benchmarks.pyvm_workloads import build_public_benchmark_cases


@dataclass(frozen=True)
class BenchmarkMeasurement:
    """Aggregated timing statistics for one benchmark case.

    All timings are wall-clock milliseconds collected with
    ``time.perf_counter`` over ``runs`` executions of the case.
    """

    # Identity of the measured case.
    name: str
    runner: str
    workload: str
    # Number of timed executions, and inner workload iterations per execution.
    runs: int
    workload_iterations: int
    # Value every invocation must return; checked before and after timing.
    expected_value: int
    # Timing aggregates over all runs, in milliseconds.
    min_ms: float
    max_ms: float
    mean_ms: float
    median_ms: float


@dataclass(frozen=True)
class BenchmarkReport:
    """Top-level report: one measurement per case plus the run parameters."""

    # UTC ISO-8601 timestamp of report creation.
    generated_at: str
    # Timed executions per case, and inner iterations per execution.
    runs: int
    workload_iterations: int
    results: list[BenchmarkMeasurement]


def _measure_case(case, *, runs: int, workload_iterations: int) -> BenchmarkMeasurement:
observed = case.invoke()
if observed != case.expected_value:
raise AssertionError(
f"{case.name} returned {observed!r}, expected {case.expected_value!r}"
)

from doeff import default_handlers, do, run
from doeff.effects import Ask, Put, Tell
timings: list[float] = []
last_value = observed
for _ in range(runs):
start = time.perf_counter()
last_value = case.invoke()
elapsed = (time.perf_counter() - start) * 1000.0
timings.append(elapsed)

if last_value != case.expected_value:
raise AssertionError(
f"{case.name} returned {last_value!r} after timing, expected {case.expected_value!r}"
)

@do
def _stateful_workload(iterations: int) -> int:
value = yield Ask("seed")
total = value
for index in range(iterations):
yield Tell(f"iteration:{index}")
yield Put("counter", index)
total += index
return total
return BenchmarkMeasurement(
name=case.name,
runner=case.runner,
workload=case.workload,
runs=runs,
workload_iterations=workload_iterations,
expected_value=case.expected_value,
min_ms=min(timings),
max_ms=max(timings),
mean_ms=statistics.mean(timings),
median_ms=statistics.median(timings),
)


def _run_once(workload_iterations: int) -> None:
run(
_stateful_workload(workload_iterations),
handlers=default_handlers(),
env={"seed": 1},
def run_benchmarks(*, runs: int, workload_iterations: int) -> BenchmarkReport:
cases = build_public_benchmark_cases(workload_iterations)
results = [
_measure_case(case, runs=runs, workload_iterations=workload_iterations) for case in cases
]
return BenchmarkReport(
generated_at=datetime.now(timezone.utc).isoformat(),
runs=runs,
workload_iterations=workload_iterations,
results=results,
)


def benchmark(runs: int, *, workload_iterations: int) -> dict[str, float]:
timings: list[float] = []
def write_report(report: BenchmarkReport, output_dir: Path) -> dict[str, Path]:
output_dir.mkdir(parents=True, exist_ok=True)

for _ in range(runs):
start = time.perf_counter()
_run_once(workload_iterations)
elapsed = (time.perf_counter() - start) * 1000.0
timings.append(elapsed)
json_path = output_dir / "doeff_vm_benchmark_results.json"
csv_path = output_dir / "doeff_vm_benchmark_results.csv"

return {
"runs": runs,
"workload_iterations": workload_iterations,
"min_ms": min(timings),
"max_ms": max(timings),
"mean_ms": statistics.mean(timings),
"median_ms": statistics.median(timings),
json_payload = {
"metadata": {
"generated_at": report.generated_at,
"runs": report.runs,
"workload_iterations": report.workload_iterations,
},
"results": [asdict(result) for result in report.results],
}
json_path.write_text(json.dumps(json_payload, indent=2, sort_keys=True) + "\n")

with csv_path.open("w", newline="") as handle:
writer = csv.DictWriter(
handle,
fieldnames=[
"name",
"runner",
"workload",
"runs",
"workload_iterations",
"expected_value",
"min_ms",
"max_ms",
"mean_ms",
"median_ms",
],
)
writer.writeheader()
for result in report.results:
writer.writerow(asdict(result))

return {"json": json_path, "csv": csv_path}


def format_report(results: Iterable[tuple[str, dict[str, float]]]) -> str:
lines = ["doeff benchmark results:"]
for label, stats in results:
lines.append(f" {label}:")
def format_report(report: BenchmarkReport, *, output_paths: dict[str, Path] | None = None) -> str:
lines = ["doeff-vm benchmark results:"]
for result in report.results:
lines.append(
" runs={runs} iterations={workload_iterations} | min={min_ms:.2f}ms "
"median={median_ms:.2f}ms mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**stats)
" {name}: runs={runs} iterations={workload_iterations} | "
"min={min_ms:.2f}ms median={median_ms:.2f}ms "
"mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**asdict(result))
)
if output_paths is not None:
lines.append(f" json={output_paths['json']}")
lines.append(f" csv={output_paths['csv']}")
return "\n".join(lines)


def main() -> None:
parser = argparse.ArgumentParser(description="Benchmark doeff interpreter execution")
parser.add_argument("--runs", type=int, default=100, help="Number of interpreter executions")
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Benchmark doeff-vm through the public Python API")
parser.add_argument("--runs", type=int, default=100, help="Number of executions per workload")
parser.add_argument(
"--iterations",
type=int,
default=25,
help="Inner loop iterations in the workload",
help="Inner loop iterations for stateful workloads",
)
parser.add_argument(
"--output-dir",
type=Path,
default=Path("benchmarks/results"),
help="Directory for JSON and CSV benchmark artifacts",
)
args = parser.parse_args()
return parser.parse_args()

stats = benchmark(args.runs, workload_iterations=args.iterations)
print(format_report([("stateful_workload", stats)]))

def main() -> None:
args = parse_args()
report = run_benchmarks(runs=args.runs, workload_iterations=args.iterations)
output_paths = write_report(report, args.output_dir)
print(format_report(report, output_paths=output_paths))


if __name__ == "main": # pragma: no cover - CLI script
if __name__ == "__main__": # pragma: no cover - CLI script
main()
Loading
Loading