{{ body }}
+ diff --git a/docs/two-pass-evaluation.md b/docs/two-pass-evaluation.md new file mode 100644 index 0000000..12b5596 --- /dev/null +++ b/docs/two-pass-evaluation.md @@ -0,0 +1,61 @@ +# Two-Pass LLM Evaluation + +## Overview + +Evonic's evaluation runner can score model answers in two passes: + +1. **Pass 1** — The model answers the benchmark prompt (often with reasoning in Indonesian or English). +2. **Pass 2** — A second LLM call extracts only the final answer in a strict format (number, `ya`/`tidak`, SQL, and so on). + +Pass 2 makes scoring reliable when Pass 1 is verbose or formatted inconsistently. + +## When it runs + +Two-pass extraction is used by the built-in **Two-Pass Evaluator** (`two_pass`), which is the default for domains such as **math**, **reasoning**, and **health** (see `evaluator/domain_evaluators.py` and `test_definitions/evaluators/two_pass.json`). + +Custom evaluators can set `"uses_pass2": true` in their JSON definition to opt in. + +## Flow + +``` +Prompt → LLM (Pass 1) → raw response + ↓ + extraction prompt (Pass 2) + ↓ + clean answer → domain scorer +``` + +If Pass 2 output does not match the expected format, the extractor tries regex fallbacks on the Pass 1 text before marking extraction as failed. + +## Configuration + +| Setting | Env variable | Default | +|--------|--------------|---------| +| Enable Pass 2 globally | `TWO_PASS_ENABLED` | `1` (on) | +| Extraction temperature | `TWO_PASS_TEMPERATURE` | `0.0` | +| UI override (persisted) | System → Evaluators page | Falls back to env | + +The UI toggle writes `two_pass_enabled` to the app settings store and takes effect on the next evaluation without restarting the server. + +## Result details + +Evaluation results include a `pass2` object when extraction ran: + +- `success`, `format`, `extracted_answer` +- `prompt`, `raw_output`, optional `thinking` +- `error` when format validation failed + +These fields appear in the evaluation runner, history detail view, and API JSON. 
+ +## Disabling two-pass + +- Turn off **Two-pass extraction** on `/evaluate/evaluators`, or +- Set `TWO_PASS_ENABLED=0` in `.env` and restart if no DB override exists. + +When disabled, scoring uses the raw Pass 1 response (same as `extraction_method: disabled` in logs). + +## Related code + +- `evaluator/answer_extractor.py` — Pass 2 prompts and validation +- `evaluator/strategies/two_pass.py` — Evaluator strategy +- `tests/test_answer_extractor.py` — Unit tests for format validation diff --git a/evaluator/answer_extractor.py b/evaluator/answer_extractor.py index f9c6757..34eec6d 100644 --- a/evaluator/answer_extractor.py +++ b/evaluator/answer_extractor.py @@ -187,8 +187,18 @@ class AnswerExtractor: def __init__(self): self.client = llm_client - self.enabled = getattr(config, 'TWO_PASS_ENABLED', True) self.temperature = getattr(config, 'TWO_PASS_TEMPERATURE', 0.0) + + def is_enabled(self) -> bool: + """Whether Pass 2 extraction runs (DB setting overrides env default).""" + try: + from models.db import db + stored = db.get_setting('two_pass_enabled', None) + if stored is not None: + return stored == '1' + except Exception: + pass + return getattr(config, 'TWO_PASS_ENABLED', True) def extract(self, domain: str, level: int, response: str, question: str = "") -> Dict[str, Any]: """ @@ -217,7 +227,7 @@ def extract(self, domain: str, level: int, response: str, question: str = "") -> } """ # Check if two-pass is enabled - if not self.enabled: + if not self.is_enabled(): return { "success": True, "extracted": response, diff --git a/routes/evaluation.py b/routes/evaluation.py index f36aac8..81d57b0 100644 --- a/routes/evaluation.py +++ b/routes/evaluation.py @@ -1,6 +1,7 @@ import json +import os import queue -from flask import Blueprint, render_template, jsonify, request +from flask import Blueprint, render_template, jsonify, request, abort from evaluator.engine import evaluation_engine from models.db import db @@ -8,6 +9,8 @@ evaluation_bp = Blueprint('evaluation', 
__name__) +_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + @evaluation_bp.route('/evaluate/domains') def evaluate_domains(): @@ -21,6 +24,17 @@ def evaluate_evaluators(): return render_template('evaluate_evaluators.html') +@evaluation_bp.route('/evaluate/docs/two-pass') +def evaluate_two_pass_docs(): + """Serve two-pass evaluation documentation (markdown source).""" + doc_path = os.path.join(_ROOT, 'docs', 'two-pass-evaluation.md') + if not os.path.isfile(doc_path): + abort(404) + with open(doc_path, encoding='utf-8') as f: + body = f.read() + return render_template('evaluate_doc.html', title='Two-Pass Evaluation', body=body) + + @evaluation_bp.route('/evaluate') def evaluate(): """LLM Evaluation runner page""" diff --git a/routes/settings.py b/routes/settings.py index 763b2b0..d3eb46b 100644 --- a/routes/settings.py +++ b/routes/settings.py @@ -476,6 +476,22 @@ def api_sync_tests(): # ---- App settings toggles ---- +@settings_bp.route('/api/settings/two-pass-enabled', methods=['GET', 'PUT']) +def api_two_pass_enabled(): + """Get or set global two-pass (Pass 2) answer extraction for evaluation.""" + from models.db import db + import config as app_config + + default = '1' if getattr(app_config, 'TWO_PASS_ENABLED', True) else '0' + if request.method == 'PUT': + data = request.get_json() or {} + enabled = '1' if data.get('enabled', False) else '0' + db.set_setting('two_pass_enabled', enabled) + return jsonify({'success': True, 'enabled': enabled == '1'}) + val = db.get_setting('two_pass_enabled', default) + return jsonify({'enabled': val == '1'}) + + @settings_bp.route('/api/settings/public-history', methods=['GET', 'PUT']) def api_public_history(): """Get or set the public history page toggle.""" diff --git a/templates/evaluate_doc.html b/templates/evaluate_doc.html new file mode 100644 index 0000000..aa328dc --- /dev/null +++ b/templates/evaluate_doc.html @@ -0,0 +1,14 @@ +{% extends "base.html" %} + +{% block content %} +
{{ body }}
+
+ After the model answers a benchmark prompt, a second LLM call extracts a clean final answer
+ (number, ya/tidak, SQL, etc.) before scoring. It is used by the built-in two_pass evaluator,
+ which is the default for domains such as math and reasoning.
+
+ Read the guide
+ · Env fallback: TWO_PASS_ENABLED, TWO_PASS_TEMPERATURE
+
${evaluator.description || 'No description'}