
Commit 4e4d91d
chore: entity exports, pyproject config
1 parent 82f736b

6 files changed: 382 additions & 16 deletions

docs/evals.md

Lines changed: 25 additions & 15 deletions
@@ -1,6 +1,6 @@
 # Evals
 
-Evaluate LLM outputs with scored metrics, thresholds, and historical tracking.
+Evaluate LLM outputs with scored metrics and historical tracking.
 
 ## What is an Eval?
 
@@ -41,7 +41,7 @@ protest eval evals.session:session
 1. Your function receives case data via `ForEach`/`From` (same as parameterized tests)
 2. It returns the output (string, object, anything)
 3. ProTest passes the output to evaluators → scores
-4. Scores determine pass/fail via thresholds
+4. Bool verdicts determine pass/fail
 5. Aggregated stats appear in the terminal
 
 The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests.
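The five steps in the documentation above can be sketched end to end. This is a toy illustration, not protest's API: `run_eval`, the dict-shaped evaluator results, and the case/function signatures are all invented here.

```python
# Toy sketch of the documented eval flow: cases feed a function, evaluators
# score the output, bool verdicts decide pass/fail, scores are aggregated.
from statistics import mean


def run_eval(cases, func, evaluators):
    scores, failed = [], []
    for case in cases:
        output = func(case)                                # steps 1-2: case in, output out
        results = [ev(output, case) for ev in evaluators]  # step 3: score the output
        if not all(r["ok"] for r in results):              # step 4: verdicts -> pass/fail
            failed.append(case)
        scores.extend(r["score"] for r in results)
    return {"mean_score": mean(scores), "failed": failed}  # step 5: aggregate stats


# Example: one case, one evaluator that checks the output is non-empty.
not_empty = lambda out, case: {"ok": bool(out), "score": float(bool(out))}
summary = run_eval([{"prompt": "hi"}], lambda c: c["prompt"].upper(), [not_empty])
```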
@@ -87,15 +87,23 @@ An evaluator is a function decorated with `@evaluator` that receives an `EvalCon
 
 ### Return Types
 
-Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). The framework reads fields by type:
+Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). In dataclasses, annotate fields to tell the framework what each one is:
 
-| Field Type | Role |
+```python
+from typing import Annotated
+from protest.evals import Metric, Verdict, Reason
+```
+
+| Annotation | Role |
 |------------|------|
-| `bool` | Verdict — pass/fail (`all(bool_fields)`) |
-| `float` | Metric — aggregated in stats (mean/p50/p95) |
-| `str` | Reason — displayed on failure, stored in history |
+| `Annotated[bool, Verdict]` | Verdict — pass/fail (`all(verdicts)`) |
+| `Annotated[float, Metric]` | Metric — aggregated in stats (mean/p50/p95) |
+| `Annotated[int, Metric]` | Metric — converted to float |
+| `Annotated[str, Reason]` | Reason — displayed on failure, stored in history |
+
+Unannotated fields are ignored by the runner — free metadata.
 
-Returning `float`, `dict`, or any other type raises `TypeError`.
+Returning `float`, `dict`, or any other non-dataclass/non-bool type raises `TypeError`.
 
 ### Simple Evaluator
 
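The annotation table in the docs hunk above can be implemented with stdlib typing introspection. A hedged sketch: `Metric`, `Verdict`, and `Reason` are hypothetical stand-in marker classes defined locally, and `classify` only approximates what protest's runner might do.

```python
from dataclasses import dataclass, fields
from typing import Annotated, get_args, get_origin, get_type_hints


# Hypothetical stand-ins for protest.evals.Metric / Verdict / Reason.
class Metric: ...
class Verdict: ...
class Reason: ...


@dataclass
class Scores:
    recall: Annotated[float, Metric]
    ok: Annotated[bool, Verdict]
    note: Annotated[str, Reason] = ""
    judge: str = "demo"  # unannotated -> ignored, free metadata


def classify(result):
    """Split a dataclass result into (verdict, metrics, reasons) by annotation."""
    hints = get_type_hints(type(result), include_extras=True)
    verdicts, metrics, reasons = [], {}, []
    for f in fields(result):
        hint = hints[f.name]
        if get_origin(hint) is not Annotated:
            continue  # unannotated fields are skipped
        _, *marks = get_args(hint)  # (base type, *annotation metadata)
        value = getattr(result, f.name)
        if Verdict in marks:
            verdicts.append(bool(value))
        elif Metric in marks:
            metrics[f.name] = float(value)  # int metrics become float
        elif Reason in marks:
            reasons.append(value)
    return all(verdicts), metrics, reasons
```

The pass/fail rule from the table (`all(verdicts)`) falls out of the last line: every `Verdict`-annotated field must be truthy.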
@@ -109,12 +117,14 @@ def not_empty(ctx: EvalContext) -> bool:
 
 ```python
 from dataclasses import dataclass
+from typing import Annotated
+from protest.evals import Metric, Verdict, Reason
 
 @dataclass
 class KeywordScores:
-    keyword_recall: float       # metric → stats
-    all_present: bool           # verdict → pass/fail
-    detail: str = ""            # reason → shown on failure
+    keyword_recall: Annotated[float, Metric]
+    all_present: Annotated[bool, Verdict]
+    detail: Annotated[str, Reason] = ""
 
 @evaluator
 def keyword_check(ctx: EvalContext, keywords: list[str], min_recall: float = 0.5) -> KeywordScores:
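The hunk above cuts off at the `keyword_check` signature. A standalone body consistent with it might compute recall as below. This is a hypothetical sketch: it takes the output text directly and returns a plain dict, since `EvalContext`, `@evaluator`, and the `KeywordScores` return type belong to protest.

```python
# Hypothetical keyword-recall evaluator body: fraction of keywords found in
# the output, with the pass threshold as an ordinary function parameter.
def keyword_check(output: str, keywords: list[str], min_recall: float = 0.5):
    text = output.lower()
    hits = [kw for kw in keywords if kw.lower() in text]
    missing = [kw for kw in keywords if kw.lower() not in text]
    recall = len(hits) / len(keywords) if keywords else 1.0
    return {
        "keyword_recall": recall,                       # metric -> stats
        "all_present": recall >= min_recall,            # verdict -> pass/fail
        "detail": f"missing: {missing}" if missing else "",  # reason
    }
```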
@@ -134,9 +144,9 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co
 ```python
 @dataclass
 class JudgeResult:
-    accuracy: float
-    accurate_enough: bool
-    reason: str = ""
+    accuracy: Annotated[float, Metric]
+    accurate_enough: Annotated[bool, Verdict]
+    reason: Annotated[str, Reason] = ""
 
 @evaluator
 async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult:
@@ -334,7 +344,7 @@ protest history --evals --compare
 Each case in history carries two hashes:
 
 - **`case_hash`** — hash of inputs + expected output. Changes when the test data changes.
-- **`eval_hash`** — hash of evaluators + thresholds. Changes when the scoring criteria change.
+- **`eval_hash`** — hash of evaluators. Changes when the scoring criteria change.
 
 `protest history --compare` uses these hashes to detect modified cases vs regressions. If a case's `eval_hash` changed between runs, it's reported as "scoring modified" rather than a real regression.
 
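The two-hash scheme described above can be sketched with `hashlib`. The function names and hashing details here are assumptions for illustration, not protest's implementation.

```python
# Sketch: case_hash covers the test data, eval_hash covers the scoring code.
# Comparing hashes across runs tells "data changed" apart from "scoring changed".
import hashlib
import inspect
import json


def case_hash(inputs: dict, expected) -> str:
    """Stable digest of inputs + expected output (changes with the test data)."""
    payload = json.dumps({"inputs": inputs, "expected": expected}, sort_keys=True)
    return hashlib.sha256(payload.encode()).hexdigest()[:12]


def eval_hash(evaluators) -> str:
    """Digest of evaluator source code (changes with the scoring criteria)."""
    source = "".join(inspect.getsource(e) for e in evaluators)
    return hashlib.sha256(source.encode()).hexdigest()[:12]
```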

protest/entities/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -10,6 +10,8 @@
     format_fixture_scope,
 )
 from protest.entities.events import (
+    EvalPayload,
+    EvalScoreEntry,
     FixtureInfo,
     HandlerInfo,
     RunResult,
@@ -31,6 +33,8 @@
 from protest.entities.xfail import Xfail, normalize_xfail
 
 __all__ = [
+    "EvalPayload",
+    "EvalScoreEntry",
     "Fixture",
     "FixtureCallable",
     "FixtureInfo",

protest/entities/core.py

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,7 @@ class TestRegistration:
     xfail: Xfail | None = None
     timeout: float | None = None
     retry: Retry | None = None
+    is_eval: bool = False
 
 
 @dataclass(frozen=True, slots=True)
@@ -111,6 +112,7 @@ class TestItem:
     xfail: Xfail | None = None
     timeout: float | None = None
     retry: Retry | None = None
+    is_eval: bool = False
 
     @property
     def test_name(self) -> str:

protest/entities/suite_path.py

Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,11 @@ def lower(self) -> str:
         """Return lowercase string representation for case-insensitive comparison."""
         return self._path.lower()
 
+    @property
+    def root_name(self) -> str:
+        """Return the top-level suite name: 'A::B::C' -> 'A'."""
+        return self.parts[0] if self.parts else ""
+
     def __str__(self) -> str:
         return self._path
 
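A minimal standalone model of the `root_name` addition above, assuming (as the docstring's `'A::B::C' -> 'A'` example suggests) that `parts` splits the path on `::`. This is a sketch, not protest's `SuitePath` class.

```python
# Toy SuitePath with just enough behavior to show the new root_name property.
class SuitePath:
    def __init__(self, path: str) -> None:
        self._path = path

    @property
    def parts(self) -> tuple[str, ...]:
        # Assumed: suite paths are '::'-separated; empty path has no parts.
        return tuple(self._path.split("::")) if self._path else ()

    @property
    def root_name(self) -> str:
        """Return the top-level suite name: 'A::B::C' -> 'A'."""
        return self.parts[0] if self.parts else ""
```

The `if self.parts else ""` guard makes the empty path return an empty string instead of raising `IndexError`.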

pyproject.toml

Lines changed: 20 additions & 0 deletions
@@ -49,6 +49,9 @@ rich = [
 web = [
     "websockets>=12.0",
 ]
+evals = [
+    "pydantic-evals>=0.1",
+]
 
 
 [tool.ruff]
@@ -100,6 +103,23 @@ ignore = [
     "PLC0415", # lazy import for optional rich dependency
     "PLR0913", # many args is deliberate API design
 ]
+"protest/core/execution/test_executor.py" = [
+    "PLR0915", # _run_test is inherently complex (retry loop + eval capture)
+]
+"protest/history/**" = [
+    "PLC0415", # lazy imports
+    "S603", # subprocess git calls are safe
+    "PLR0913", # load_history has many filter params by design
+]
+"protest/cli/history.py" = [
+    "T201", # print for CLI output
+    "PLC0415", # lazy imports
+]
+"protest/evals/**" = [
+    "T201", # print for eval reporting
+    "PLC0415", # lazy imports for optional pydantic-evals dependency
+    "PLR0913", # adapter functions have many params by design
+]
 "protest/reporting/ascii.py" = [
     "T201", # print is the purpose of this module
 ]
