diff --git a/docs/two-pass-evaluation.md b/docs/two-pass-evaluation.md new file mode 100644 index 0000000..12b5596 --- /dev/null +++ b/docs/two-pass-evaluation.md @@ -0,0 +1,61 @@ +# Two-Pass LLM Evaluation + +## Overview + +Evonic's evaluation runner can score model answers in two passes: + +1. **Pass 1** — The model answers the benchmark prompt (often with reasoning in Indonesian or English). +2. **Pass 2** — A second LLM call extracts only the final answer in a strict format (number, `ya`/`tidak`, SQL, and so on). + +Pass 2 makes scoring reliable when Pass 1 is verbose or formatted inconsistently. + +## When it runs + +Two-pass extraction is used by the built-in **Two-Pass Evaluator** (`two_pass`), which is the default for domains such as **math**, **reasoning**, and **health** (see `evaluator/domain_evaluators.py` and `test_definitions/evaluators/two_pass.json`). + +Custom evaluators can set `"uses_pass2": true` in their JSON definition to opt in. + +## Flow + +``` +Prompt → LLM (Pass 1) → raw response + ↓ + extraction prompt (Pass 2) + ↓ + clean answer → domain scorer +``` + +If Pass 2 output does not match the expected format, the extractor tries regex fallbacks on the Pass 1 text before marking extraction as failed. + +## Configuration + +| Setting | Env variable | Default | +|--------|--------------|---------| +| Enable Pass 2 globally | `TWO_PASS_ENABLED` | `1` (on) | +| Extraction temperature | `TWO_PASS_TEMPERATURE` | `0.0` | +| UI override (persisted) | System → Evaluators page | Falls back to env | + +The UI toggle writes `two_pass_enabled` to the app settings store and takes effect on the next evaluation without restarting the server. + +## Result details + +Evaluation results include a `pass2` object when extraction ran: + +- `success`, `format`, `extracted_answer` +- `prompt`, `raw_output`, optional `thinking` +- `error` when format validation failed + +These fields appear in the evaluation runner, history detail view, and API JSON. 
+
+## Disabling two-pass
+
+- Turn off **Two-pass extraction** on `/evaluate/evaluators`, or
+- Set `TWO_PASS_ENABLED=0` in `.env` and restart if no DB override exists.
+
+When disabled, scoring uses the raw Pass 1 response (same as `extraction_method: disabled` in logs).
+
+## Related code
+
+- `evaluator/answer_extractor.py` — Pass 2 prompts and validation
+- `evaluator/strategies/two_pass.py` — Evaluator strategy
+- `tests/test_answer_extractor.py` — Unit tests for format validation
diff --git a/evaluator/answer_extractor.py b/evaluator/answer_extractor.py
index f9c6757..34eec6d 100644
--- a/evaluator/answer_extractor.py
+++ b/evaluator/answer_extractor.py
@@ -187,8 +187,31 @@ class AnswerExtractor:
 
     def __init__(self):
         self.client = llm_client
-        self.enabled = getattr(config, 'TWO_PASS_ENABLED', True)
         self.temperature = getattr(config, 'TWO_PASS_TEMPERATURE', 0.0)
+
+    def is_enabled(self) -> bool:
+        """Return whether Pass 2 extraction should run.
+
+        A stored ``two_pass_enabled`` app setting takes precedence and
+        enables extraction only when its value is ``'1'``. When nothing is
+        stored, or the settings store cannot be read, the result falls back
+        to ``config.TWO_PASS_ENABLED`` (``True`` if that is undefined).
+        """
+        try:
+            from models.db import db
+            stored = db.get_setting('two_pass_enabled', None)
+        except Exception:
+            # Best effort: a missing or broken settings store must not stop
+            # an evaluation, but leave a trace of why the override was skipped.
+            import logging
+            logging.getLogger(__name__).debug(
+                "two_pass_enabled setting unavailable; using config default",
+                exc_info=True,
+            )
+            stored = None
+        if stored is not None:
+            return str(stored).strip() == '1'
+        return bool(getattr(config, 'TWO_PASS_ENABLED', True))
 
     def extract(self, domain: str, level: int, response: str, question: str = "") -> Dict[str, Any]:
         """
@@ -217,7 +240,7 @@ def extract(self, domain: str, level: int, response: str, question: str = "") ->
         }
         """
         # Check if two-pass is enabled
-        if not self.enabled:
+        if not self.is_enabled():
             return {
                 "success": True,
                 "extracted": response,
diff --git a/routes/evaluation.py b/routes/evaluation.py
index f36aac8..81d57b0 100644
--- a/routes/evaluation.py
+++ b/routes/evaluation.py
@@ -1,6 +1,7 @@
 import json
+import os
 import queue
-from flask import Blueprint, render_template, jsonify, request
+from flask import Blueprint, render_template, jsonify, request, abort
 from evaluator.engine import evaluation_engine
 from models.db import db
 
@@ -8,6 +9,8 @@ evaluation_bp = Blueprint('evaluation', 
__name__)
 
+_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
 
 @evaluation_bp.route('/evaluate/domains')
 def evaluate_domains():
@@ -21,6 +24,17 @@ def evaluate_evaluators():
     return render_template('evaluate_evaluators.html')
 
 
+@evaluation_bp.route('/evaluate/docs/two-pass')
+def evaluate_two_pass_docs():
+    """Serve two-pass evaluation documentation (markdown source)."""
+    doc_path = os.path.join(_ROOT, 'docs', 'two-pass-evaluation.md')
+    if not os.path.isfile(doc_path):
+        abort(404)
+    with open(doc_path, encoding='utf-8') as f:
+        body = f.read()
+    return render_template('evaluate_doc.html', title='Two-Pass Evaluation', body=body)
+
+
 @evaluation_bp.route('/evaluate')
 def evaluate():
     """LLM Evaluation runner page"""
diff --git a/routes/settings.py b/routes/settings.py
index 763b2b0..d3eb46b 100644
--- a/routes/settings.py
+++ b/routes/settings.py
@@ -476,6 +476,36 @@ def api_sync_tests():
 
 # ---- App settings toggles ----
 
+@settings_bp.route('/api/settings/two-pass-enabled', methods=['GET', 'PUT'])
+def api_two_pass_enabled():
+    """Get or set global two-pass (Pass 2) answer extraction for evaluation.
+
+    GET responds with ``{"enabled": bool}`` from the stored
+    ``two_pass_enabled`` setting, defaulting to ``config.TWO_PASS_ENABLED``.
+    PUT expects a JSON object ``{"enabled": <bool>}`` and stores it as
+    ``'1'``/``'0'``; any other body gets HTTP 400, so a malformed request
+    cannot silently switch extraction off.
+    """
+    from models.db import db
+    import config as app_config
+
+    if request.method == 'PUT':
+        data = request.get_json(silent=True)
+        flag = data.get('enabled') if isinstance(data, dict) else None
+        if not isinstance(flag, bool):
+            # Truthiness is unsafe here: the string "false" is truthy.
+            return jsonify({
+                'success': False,
+                'error': "'enabled' must be a JSON boolean",
+            }), 400
+        db.set_setting('two_pass_enabled', '1' if flag else '0')
+        return jsonify({'success': True, 'enabled': flag})
+
+    default = '1' if getattr(app_config, 'TWO_PASS_ENABLED', True) else '0'
+    val = db.get_setting('two_pass_enabled', default)
+    return jsonify({'enabled': val == '1'})
+
+
 @settings_bp.route('/api/settings/public-history', methods=['GET', 'PUT'])
 def api_public_history():
     """Get or set the public history page toggle."""
diff --git a/templates/evaluate_doc.html b/templates/evaluate_doc.html
new file mode 100644
index 0000000..aa328dc
--- /dev/null
+++ b/templates/evaluate_doc.html
@@ -0,0 +1,14 @@
+{% extends "base.html" %}
+
+{% block content %}
+
+ {% include "partials/evaluate_subnav.html" %} +
+

{{ title }}

+ ← Back to evaluators +
+
+
{{ body }}
+
+
+{% endblock %} diff --git a/templates/evaluate_evaluators.html b/templates/evaluate_evaluators.html index 0c7cab3..8e7104b 100644 --- a/templates/evaluate_evaluators.html +++ b/templates/evaluate_evaluators.html @@ -1,4 +1,5 @@ {% extends "base.html" %} +{% from 'partials/toggle.html' import toggle %} {% block content %}
@@ -8,6 +9,27 @@

⚖️ Evaluators

+
+
+
+

Two-pass extraction (Pass 2)

+

+ After the model answers a benchmark prompt, a second LLM call extracts a clean final answer + (number, ya/tidak, SQL, etc.) before scoring. Used by the built-in two_pass evaluator + and domains such as math and reasoning. +

+
+ +
+

+ Read the guide + · Env fallback: TWO_PASS_ENABLED, TWO_PASS_TEMPERATURE +

+
+
@@ -127,6 +149,27 @@

Add color: #7c3aed; } +.evaluator-type.predefined { + background: #d1fae5; + color: #047857; +} + +.evaluator-badge-pass2 { + display: inline-block; + padding: 2px 8px; + border-radius: 4px; + font-size: 11px; + background: #fef3c7; + color: #b45309; + margin-left: 6px; + font-weight: 600; +} + +html.dark .evaluator-badge-pass2 { + background: #422006; + color: #fcd34d; +} + /* Modal Styles */ .modal { display: none; @@ -287,7 +330,10 @@

Add container.innerHTML = evaluators.map(evaluator => `
-

${evaluator.name} ${evaluator.type}

+

${evaluator.name} + ${evaluator.type} + ${evaluator.uses_pass2 || evaluator.id === 'two_pass' ? 'Pass 2' : ''} +

${evaluator.description || 'No description'}

@@ -400,8 +446,36 @@ ${evaluator.name} ${evaluator
     }
 }
 
+async function loadTwoPassSetting() {
+    const toggleEl = document.getElementById('two-pass-enabled-toggle');
+    if (!toggleEl) return;
+    try {
+        const data = await apiGet('/api/settings/two-pass-enabled');
+        toggleEl.checked = !!data.enabled;
+    } catch (error) {
+        console.error('Error loading two-pass setting:', error);
+    }
+}
+
+async function saveTwoPassSetting(enabled) {
+    try {
+        await apiPut('/api/settings/two-pass-enabled', { enabled });
+    } catch (error) {
+        console.error('Error saving two-pass setting:', error);
+        // The save did not complete, so put the switch back where it was.
+        const toggleEl = document.getElementById('two-pass-enabled-toggle');
+        if (toggleEl) toggleEl.checked = !enabled;
+        alert('Could not save two-pass setting.');
+    }
+}
+
 document.addEventListener('DOMContentLoaded', () => {
     loadEvaluators();
+    loadTwoPassSetting();
+    const twoPassToggle = document.getElementById('two-pass-enabled-toggle');
+    if (twoPassToggle) {
+        twoPassToggle.addEventListener('change', () => saveTwoPassSetting(twoPassToggle.checked));
+    }
 });
 {% endblock %}
diff --git a/tests/test_answer_extractor.py b/tests/test_answer_extractor.py
index 9a6821c..7d42108 100644
--- a/tests/test_answer_extractor.py
+++ b/tests/test_answer_extractor.py
@@ -6,6 +6,36 @@ from evaluator.answer_extractor import answer_extractor
 
 
+class TestTwoPassEnabled:
+    """Runtime toggle for Pass 2 extraction."""
+
+    @staticmethod
+    def _stub_settings(monkeypatch, stored):
+        """Stub ``models.db`` so no real settings store is read."""
+        import sys
+        import types
+
+        fake_db = types.SimpleNamespace(get_setting=lambda key, default=None: stored)
+        monkeypatch.setitem(sys.modules, 'models.db', types.SimpleNamespace(db=fake_db))
+
+    def test_is_enabled_respects_config_default(self, monkeypatch):
+        self._stub_settings(monkeypatch, None)
+        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', True)
+        assert answer_extractor.is_enabled() is True
+
+    def test_is_enabled_when_disabled_in_config(self, monkeypatch):
+        self._stub_settings(monkeypatch, None)
+        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', False)
+        assert answer_extractor.is_enabled() is False
+
+    def test_stored_setting_overrides_config(self, monkeypatch):
+        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', True)
+        self._stub_settings(monkeypatch, '0')
+        assert answer_extractor.is_enabled() is False
+        self._stub_settings(monkeypatch, '1')
+        assert answer_extractor.is_enabled() is True
+
+
 class TestFormatValidation:
     """Test format validation for PASS 2 output"""
 