Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ dmypy.json
# Rust build artifacts
rust/**/target/
packages/**/target/
packages/*/Cargo.lock

# Benchmark artifacts
benchmarks/results/

# Local deploy keys
proboscis_doeff_deploy
Expand Down
23 changes: 22 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

.PHONY: help install sync lint lint-ruff lint-pyright lint-semgrep lint-semgrep-docs lint-doeff lint-packages \
test test-unit test-e2e test-packages test-all test-spec-audit-sa002 format check pre-commit-install clean \
install-opencode-spec-gap-tdd
bench-python bench-vm install-opencode-spec-gap-tdd

# Default target
help:
Expand Down Expand Up @@ -33,6 +33,10 @@ help:
@echo " make test-all Run ALL tests (core + packages)"
@echo " make test-spec-audit-sa002 Run SA-002 pytest + semgrep checks"
@echo ""
@echo "Benchmarks:"
@echo " make bench-python Run public Python benchmark runner"
@echo " make bench-vm Run criterion benchmark for doeff-vm"
@echo ""
@echo "Formatting:"
@echo " make format Format code with ruff"
@echo " make check Run format check without modifying files"
Expand Down Expand Up @@ -161,6 +165,23 @@ test-spec-audit-sa002:
exit 1; \
fi

# Interpreter paths resolved from the uv-managed environment so the Rust
# benchmark builds and links against the same CPython that `uv run` uses.
BENCH_VM_BASE_PYTHON := $(shell uv run python -c "import sys; print(sys._base_executable)")
BENCH_VM_PYTHON_HOME := $(shell uv run python -c "import sys; print(sys.base_prefix)")
BENCH_VM_SITE_PACKAGES := $(shell uv run python -c "import site; print(site.getsitepackages()[0])")
# Extra CLI arguments, overridable per invocation, e.g.:
#   make bench-python BENCH_PYTHON_ARGS="--runs 500 --iterations 50"
BENCH_PYTHON_ARGS ?=
BENCH_VM_ARGS ?=

# Run the public Python benchmark runner.
bench-python:
	uv run python benchmarks/benchmark_runner.py $(BENCH_PYTHON_ARGS)

# Run the criterion baseline bench for the doeff-vm crate. The env vars
# point the build (PYO3_PYTHON) and the runtime loader (PYTHONHOME,
# LD_LIBRARY_PATH) at the CPython resolved above; DOEFF_BENCH_SITE_PACKAGES
# is presumably read by the bench itself — confirm in the crate's bench code.
bench-vm:
	cd packages/doeff-vm && env \
		PYO3_PYTHON=$(BENCH_VM_BASE_PYTHON) \
		PYTHONHOME=$(BENCH_VM_PYTHON_HOME) \
		DOEFF_BENCH_SITE_PACKAGES=$(BENCH_VM_SITE_PACKAGES) \
		LD_LIBRARY_PATH=$(BENCH_VM_PYTHON_HOME)/lib \
		cargo bench --bench pyvm_baseline -- $(BENCH_VM_ARGS)

# =============================================================================
# Formatting
# =============================================================================
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Benchmarks

Python benchmark artifacts:

- Run `make bench-python` to execute the public `doeff.run(...)` benchmark suite.
- Results are written to `benchmarks/results/doeff_vm_benchmark_results.json` and
`benchmarks/results/doeff_vm_benchmark_results.csv`.

Rust criterion artifacts:

- Run `make bench-vm` to execute the `criterion` baseline suite for `packages/doeff-vm`.
- Criterion reports are written under `packages/doeff-vm/target/criterion/`.

Useful overrides:

- `make bench-python BENCH_PYTHON_ARGS="--runs 500 --iterations 50"`
- `make bench-vm BENCH_VM_ARGS="--sample-size 20 --measurement-time 1 --warm-up-time 1 --noplot"`
180 changes: 133 additions & 47 deletions benchmarks/benchmark_runner.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,170 @@
"""Micro-benchmarks for the doeff interpreter.
"""Benchmark runner for the public doeff Python API backed by doeff-vm.

Usage
-----
uv run python benchmarks/benchmark_runner.py --runs 500
uv run python benchmarks/benchmark_runner.py --runs 500 --iterations 25
"""

from __future__ import annotations

import argparse
import csv
import json
import statistics
import time
from collections.abc import Iterable
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path

from benchmarks.pyvm_workloads import build_public_benchmark_cases


@dataclass(frozen=True)
class BenchmarkMeasurement:
    """Aggregated timing statistics for one benchmark case.

    All timings are wall-clock milliseconds collected with
    ``time.perf_counter`` over ``runs`` executions of the case.
    """

    # Identity of the measured case.
    name: str
    runner: str
    workload: str
    # Number of timed executions, and inner workload iterations per execution.
    runs: int
    workload_iterations: int
    # Value every invocation must return; checked before and after timing.
    expected_value: int
    # Timing aggregates over all runs, in milliseconds.
    min_ms: float
    max_ms: float
    mean_ms: float
    median_ms: float


@dataclass(frozen=True)
class BenchmarkReport:
    """Top-level report: one measurement per case plus the run parameters."""

    # UTC ISO-8601 timestamp of report creation.
    generated_at: str
    # Timed executions per case, and inner iterations per execution.
    runs: int
    workload_iterations: int
    results: list[BenchmarkMeasurement]


def _measure_case(case, *, runs: int, workload_iterations: int) -> BenchmarkMeasurement:
observed = case.invoke()
if observed != case.expected_value:
raise AssertionError(
f"{case.name} returned {observed!r}, expected {case.expected_value!r}"
)

from doeff import default_handlers, do, run
from doeff.effects import Ask, Put, Tell
timings: list[float] = []
last_value = observed
for _ in range(runs):
start = time.perf_counter()
last_value = case.invoke()
elapsed = (time.perf_counter() - start) * 1000.0
timings.append(elapsed)

if last_value != case.expected_value:
raise AssertionError(
f"{case.name} returned {last_value!r} after timing, expected {case.expected_value!r}"
)

@do
def _stateful_workload(iterations: int) -> int:
value = yield Ask("seed")
total = value
for index in range(iterations):
yield Tell(f"iteration:{index}")
yield Put("counter", index)
total += index
return total
return BenchmarkMeasurement(
name=case.name,
runner=case.runner,
workload=case.workload,
runs=runs,
workload_iterations=workload_iterations,
expected_value=case.expected_value,
min_ms=min(timings),
max_ms=max(timings),
mean_ms=statistics.mean(timings),
median_ms=statistics.median(timings),
)


def _run_once(workload_iterations: int) -> None:
run(
_stateful_workload(workload_iterations),
handlers=default_handlers(),
env={"seed": 1},
def run_benchmarks(*, runs: int, workload_iterations: int) -> BenchmarkReport:
cases = build_public_benchmark_cases(workload_iterations)
results = [
_measure_case(case, runs=runs, workload_iterations=workload_iterations) for case in cases
]
return BenchmarkReport(
generated_at=datetime.now(timezone.utc).isoformat(),
runs=runs,
workload_iterations=workload_iterations,
results=results,
)


def benchmark(runs: int, *, workload_iterations: int) -> dict[str, float]:
timings: list[float] = []
def write_report(report: BenchmarkReport, output_dir: Path) -> dict[str, Path]:
output_dir.mkdir(parents=True, exist_ok=True)

for _ in range(runs):
start = time.perf_counter()
_run_once(workload_iterations)
elapsed = (time.perf_counter() - start) * 1000.0
timings.append(elapsed)
json_path = output_dir / "doeff_vm_benchmark_results.json"
csv_path = output_dir / "doeff_vm_benchmark_results.csv"

return {
"runs": runs,
"workload_iterations": workload_iterations,
"min_ms": min(timings),
"max_ms": max(timings),
"mean_ms": statistics.mean(timings),
"median_ms": statistics.median(timings),
json_payload = {
"metadata": {
"generated_at": report.generated_at,
"runs": report.runs,
"workload_iterations": report.workload_iterations,
},
"results": [asdict(result) for result in report.results],
}
json_path.write_text(json.dumps(json_payload, indent=2, sort_keys=True) + "\n")

with csv_path.open("w", newline="") as handle:
writer = csv.DictWriter(
handle,
fieldnames=[
"name",
"runner",
"workload",
"runs",
"workload_iterations",
"expected_value",
"min_ms",
"max_ms",
"mean_ms",
"median_ms",
],
)
writer.writeheader()
for result in report.results:
writer.writerow(asdict(result))

return {"json": json_path, "csv": csv_path}


def format_report(results: Iterable[tuple[str, dict[str, float]]]) -> str:
lines = ["doeff benchmark results:"]
for label, stats in results:
lines.append(f" {label}:")
def format_report(report: BenchmarkReport, *, output_paths: dict[str, Path] | None = None) -> str:
lines = ["doeff-vm benchmark results:"]
for result in report.results:
lines.append(
" runs={runs} iterations={workload_iterations} | min={min_ms:.2f}ms "
"median={median_ms:.2f}ms mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**stats)
" {name}: runs={runs} iterations={workload_iterations} | "
"min={min_ms:.2f}ms median={median_ms:.2f}ms "
"mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**asdict(result))
)
if output_paths is not None:
lines.append(f" json={output_paths['json']}")
lines.append(f" csv={output_paths['csv']}")
return "\n".join(lines)


def main() -> None:
parser = argparse.ArgumentParser(description="Benchmark doeff interpreter execution")
parser.add_argument("--runs", type=int, default=100, help="Number of interpreter executions")
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Benchmark doeff-vm through the public Python API")
parser.add_argument("--runs", type=int, default=100, help="Number of executions per workload")
parser.add_argument(
"--iterations",
type=int,
default=25,
help="Inner loop iterations in the workload",
help="Inner loop iterations for stateful workloads",
)
parser.add_argument(
"--output-dir",
type=Path,
default=Path("benchmarks/results"),
help="Directory for JSON and CSV benchmark artifacts",
)
args = parser.parse_args()
return parser.parse_args()

stats = benchmark(args.runs, workload_iterations=args.iterations)
print(format_report([("stateful_workload", stats)]))

def main() -> None:
args = parse_args()
report = run_benchmarks(runs=args.runs, workload_iterations=args.iterations)
output_paths = write_report(report, args.output_dir)
print(format_report(report, output_paths=output_paths))


if __name__ == "main": # pragma: no cover - CLI script
if __name__ == "__main__": # pragma: no cover - CLI script
main()
Loading
Loading