anvie · DeryFerd · May 15, 2026
diff --git a/docs/two-pass-evaluation.md b/docs/two-pass-evaluation.md
@@ -0,0 +1,61 @@
+# Two-Pass LLM Evaluation
+
+## Overview
+
+Evonic's evaluation runner can score model answers in two passes:
+
+1. **Pass 1** — The model answers the benchmark prompt (often with reasoning in Indonesian or English).
+2. **Pass 2** — A second LLM call extracts only the final answer in a strict format (number, `ya`/`tidak`, SQL, and so on).
+
+Pass 2 makes scoring reliable when Pass 1 is verbose or formatted inconsistently.
+
+## When it runs
+
+Two-pass extraction is used by the built-in **Two-Pass Evaluator** (`two_pass`), which is the default for domains such as **math**, **reasoning**, and **health** (see `evaluator/domain_evaluators.py` and `test_definitions/evaluators/two_pass.json`).
+
+Custom evaluators can set `"uses_pass2": true` in their JSON definition to opt in.
+
+## Flow
+
+```
+Prompt → LLM (Pass 1) → raw response
+                              ↓
+                    extraction prompt (Pass 2)
+                              ↓
+                    clean answer → domain scorer
+```
+
+If Pass 2 output does not match the expected format, the extractor tries regex fallbacks on the Pass 1 text before marking extraction as failed.
+
+## Configuration
+
+| Setting | Env variable | Default |
+|--------|--------------|---------|
+| Enable Pass 2 globally | `TWO_PASS_ENABLED` | `1` (on) |
+| Extraction temperature | `TWO_PASS_TEMPERATURE` | `0.0` |
+| UI override (persisted) | System → Evaluators page | Falls back to env |
+
+The UI toggle writes `two_pass_enabled` to the app settings store and takes effect on the next evaluation without restarting the server.
+
+## Result details
+
+Evaluation results include a `pass2` object when extraction ran:
+
+- `success`, `format`, `extracted_answer`
+- `prompt`, `raw_output`, optional `thinking`
+- `error` when format validation failed
+
+These fields appear in the evaluation runner, history detail view, and API JSON.
+
+## Disabling two-pass
+
+- Turn off **Two-pass extraction** on `/evaluate/evaluators`, or
+- Set `TWO_PASS_ENABLED=0` in `.env` and restart if no DB override exists.
+
+When disabled, scoring uses the raw Pass 1 response (same as `extraction_method: disabled` in logs).
+
+## Related code
+
+- `evaluator/answer_extractor.py` — Pass 2 prompts and validation
+- `evaluator/strategies/two_pass.py` — Evaluator strategy
+- `tests/test_answer_extractor.py` — Unit tests for format validation
diff --git a/evaluator/answer_extractor.py b/evaluator/answer_extractor.py
@@ -187,8 +187,18 @@ class AnswerExtractor:
 
     def __init__(self):
         self.client = llm_client
-        self.enabled = getattr(config, 'TWO_PASS_ENABLED', True)
         self.temperature = getattr(config, 'TWO_PASS_TEMPERATURE', 0.0)
+
+    def is_enabled(self) -> bool:
+        """Whether Pass 2 extraction runs (DB setting overrides env default)."""
+        try:
+            from models.db import db
+            stored = db.get_setting('two_pass_enabled', None)
+            if stored is not None:
+                return stored == '1'
+        except Exception:
+            pass
+        return getattr(config, 'TWO_PASS_ENABLED', True)
 
     def extract(self, domain: str, level: int, response: str, question: str = "") -> Dict[str, Any]:
         """
@@ -217,7 +227,7 @@ def extract(self, domain: str, level: int, response: str, question: str = "") ->
             }
         """
         # Check if two-pass is enabled
-        if not self.enabled:
+        if not self.is_enabled():
             return {
                 "success": True,
                 "extracted": response,

diff --git a/routes/evaluation.py b/routes/evaluation.py
@@ -1,13 +1,16 @@
 import json
+import os
 import queue
-from flask import Blueprint, render_template, jsonify, request
+from flask import Blueprint, render_template, jsonify, request, abort
 
 from evaluator.engine import evaluation_engine
 from models.db import db
 import config
 
 evaluation_bp = Blueprint('evaluation', __name__)
 
+_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
 
 @evaluation_bp.route('/evaluate/domains')
 def evaluate_domains():
@@ -21,6 +24,17 @@ def evaluate_evaluators():
     return render_template('evaluate_evaluators.html')
 
 
+@evaluation_bp.route('/evaluate/docs/two-pass')
+def evaluate_two_pass_docs():
+    """Serve two-pass evaluation documentation (markdown source)."""
+    doc_path = os.path.join(_ROOT, 'docs', 'two-pass-evaluation.md')
+    if not os.path.isfile(doc_path):
+        abort(404)
+    with open(doc_path, encoding='utf-8') as f:
+        body = f.read()
+    return render_template('evaluate_doc.html', title='Two-Pass Evaluation', body=body)
+
+
 @evaluation_bp.route('/evaluate')
 def evaluate():
     """LLM Evaluation runner page"""

diff --git a/routes/settings.py b/routes/settings.py
@@ -476,6 +476,22 @@ def api_sync_tests():
 
 # ---- App settings toggles ----
 
+@settings_bp.route('/api/settings/two-pass-enabled', methods=['GET', 'PUT'])
+def api_two_pass_enabled():
+    """Get or set global two-pass (Pass 2) answer extraction for evaluation."""
+    from models.db import db
+    import config as app_config
+
+    default = '1' if getattr(app_config, 'TWO_PASS_ENABLED', True) else '0'
+    if request.method == 'PUT':
+        data = request.get_json() or {}
+        enabled = '1' if data.get('enabled', False) else '0'
+        db.set_setting('two_pass_enabled', enabled)
+        return jsonify({'success': True, 'enabled': enabled == '1'})
+    val = db.get_setting('two_pass_enabled', default)
+    return jsonify({'enabled': val == '1'})
+
+
 @settings_bp.route('/api/settings/public-history', methods=['GET', 'PUT'])
 def api_public_history():
     """Get or set the public history page toggle."""

diff --git a/templates/evaluate_doc.html b/templates/evaluate_doc.html
@@ -0,0 +1,14 @@
+{% extends "base.html" %}
+
+{% block content %}
+<div class="settings-container" style="max-width: 900px; margin: 0 auto; padding: 20px;">
+    {% include "partials/evaluate_subnav.html" %}
+    <div class="settings-header" style="margin-bottom: 16px; display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 8px;">
+        <h2 style="margin: 0;">{{ title }}</h2>
+        <a href="/evaluate/evaluators" class="text-sm text-blue-600 dark:text-blue-400 hover:underline">← Back to evaluators</a>
+    </div>
+    <article class="doc-body bg-white dark:bg-gray-800 border border-gray-200 dark:border-gray-700 rounded-lg p-6 shadow-sm">
+        <pre class="whitespace-pre-wrap text-sm leading-relaxed text-gray-800 dark:text-gray-200 font-sans m-0">{{ body }}</pre>
+    </article>
+</div>
+{% endblock %}
diff --git a/templates/evaluate_evaluators.html b/templates/evaluate_evaluators.html
@@ -1,4 +1,5 @@
 {% extends "base.html" %}
+{% from 'partials/toggle.html' import toggle %}
 
 {% block content %}
 <div class="settings-container">
@@ -8,6 +9,27 @@
         <h2>⚖️ Evaluators</h2>
     </div>
 
+    <div class="two-pass-panel evaluator-card" style="flex-direction: column; align-items: stretch; gap: 12px;">
+        <div style="display: flex; justify-content: space-between; align-items: flex-start; flex-wrap: wrap; gap: 12px;">
+            <div>
+                <h4 style="margin: 0 0 6px;">Two-pass extraction (Pass 2)</h4>
+                <p style="margin: 0; font-size: 14px; color: #6b7280;">
+                    After the model answers a benchmark prompt, a second LLM call extracts a clean final answer
+                    (number, ya/tidak, SQL, etc.) before scoring. Used by the built-in <code>two_pass</code> evaluator
+                    and domains such as math and reasoning.
+                </p>
+            </div>
+            <label class="flex items-center gap-3 cursor-pointer select-none shrink-0">
+                {{ toggle(id='two-pass-enabled-toggle', checked=true) }}
+                <span class="text-sm font-medium text-gray-700 dark:text-gray-200">Enabled</span>
+            </label>
+        </div>
+        <p style="margin: 0; font-size: 13px; color: #6b7280;">
+            <a href="/evaluate/docs/two-pass" class="text-blue-600 dark:text-blue-400 hover:underline">Read the guide</a>
+            · Env fallback: <code>TWO_PASS_ENABLED</code>, <code>TWO_PASS_TEMPERATURE</code>
+        </p>
+    </div>
+
     <div class="evaluators-list" id="evaluators-list">
         <!-- Evaluators will be loaded here -->
     </div>
@@ -127,6 +149,27 @@ <h3 id="evaluator-modal-title" class="m-0 text-gray-800 dark:text-gray-100">Add
     color: #7c3aed;
 }
 
+.evaluator-type.predefined {
+    background: #d1fae5;
+    color: #047857;
+}
+
+.evaluator-badge-pass2 {
+    display: inline-block;
+    padding: 2px 8px;
+    border-radius: 4px;
+    font-size: 11px;
+    background: #fef3c7;
+    color: #b45309;
+    margin-left: 6px;
+    font-weight: 600;
+}
+
+html.dark .evaluator-badge-pass2 {
+    background: #422006;
+    color: #fcd34d;
+}
+
 /* Modal Styles */
 .modal {
     display: none;
@@ -287,7 +330,10 @@ <h3 id="evaluator-modal-title" class="m-0 text-gray-800 dark:text-gray-100">Add
     container.innerHTML = evaluators.map(evaluator => `
         <div class="evaluator-card">
             <div class="evaluator-info">
-                <h4>${evaluator.name} <span class="evaluator-type ${evaluator.type}">${evaluator.type}</span></h4>
+                <h4>${evaluator.name}
+                    <span class="evaluator-type ${evaluator.type}">${evaluator.type}</span>
+                    ${evaluator.uses_pass2 || evaluator.id === 'two_pass' ? '<span class="evaluator-badge-pass2">Pass 2</span>' : ''}
+                </h4>
                 <p>${evaluator.description || 'No description'}</p>
             </div>
             <div class="evaluator-actions">
@@ -400,8 +446,33 @@ <h4>${evaluator.name} <span class="evaluator-type ${evaluator.type}">${evaluator
     }
 }
 
+async function loadTwoPassSetting() {
+    const toggleEl = document.getElementById('two-pass-enabled-toggle');
+    if (!toggleEl) return;
+    try {
+        const data = await apiGet('/api/settings/two-pass-enabled');
+        toggleEl.checked = !!data.enabled;
+    } catch (error) {
+        console.error('Error loading two-pass setting:', error);
+    }
+}
+
+async function saveTwoPassSetting(enabled) {
+    try {
+        await apiPut('/api/settings/two-pass-enabled', { enabled });
+    } catch (error) {
+        console.error('Error saving two-pass setting:', error);
+        alert('Could not save two-pass setting.');
+    }
+}
+
 document.addEventListener('DOMContentLoaded', () => {
     loadEvaluators();
+    loadTwoPassSetting();
+    const twoPassToggle = document.getElementById('two-pass-enabled-toggle');
+    if (twoPassToggle) {
+        twoPassToggle.addEventListener('change', () => saveTwoPassSetting(twoPassToggle.checked));
+    }
 });
 </script>
 {% endblock %}
diff --git a/tests/test_answer_extractor.py b/tests/test_answer_extractor.py
@@ -6,6 +6,19 @@
 from evaluator.answer_extractor import answer_extractor
 
 
+class TestTwoPassEnabled:
+    """Runtime toggle for Pass 2 extraction."""
+
+    def test_is_enabled_respects_config_default(self, monkeypatch):
+        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', True)
+        # No DB in unit test — should fall back to config
+        assert answer_extractor.is_enabled() is True
+
+    def test_is_enabled_when_disabled_in_config(self, monkeypatch):
+        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', False)
+        assert answer_extractor.is_enabled() is False
+
+
 class TestFormatValidation:
     """Test format validation for PASS 2 output"""