Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions docs/two-pass-evaluation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Two-Pass LLM Evaluation

## Overview

Evonic's evaluation runner can score model answers in two passes:

1. **Pass 1** — The model answers the benchmark prompt (often with reasoning in Indonesian or English).
2. **Pass 2** — A second LLM call extracts only the final answer in a strict format (number, `ya`/`tidak`, SQL, and so on).

Pass 2 makes scoring reliable when Pass 1 is verbose or formatted inconsistently.

## When it runs

Two-pass extraction is used by the built-in **Two-Pass Evaluator** (`two_pass`), which is the default for domains such as **math**, **reasoning**, and **health** (see `evaluator/domain_evaluators.py` and `test_definitions/evaluators/two_pass.json`).

Custom evaluators can set `"uses_pass2": true` in their JSON definition to opt in.

## Flow

```
Prompt → LLM (Pass 1) → raw response
raw response → extraction prompt (Pass 2) → clean answer
clean answer → domain scorer
```

If Pass 2 output does not match the expected format, the extractor tries regex fallbacks on the Pass 1 text before marking extraction as failed.

## Configuration

| Setting | Env variable | Default |
|--------|--------------|---------|
| Enable Pass 2 globally | `TWO_PASS_ENABLED` | `1` (on) |
| Extraction temperature | `TWO_PASS_TEMPERATURE` | `0.0` |
| UI override (persisted) | System → Evaluators page | Falls back to env |

The UI toggle writes `two_pass_enabled` to the app settings store and takes effect on the next evaluation without restarting the server.

## Result details

Evaluation results include a `pass2` object when extraction ran:

- `success`, `format`, `extracted_answer`
- `prompt`, `raw_output`, optional `thinking`
- `error` when format validation failed

These fields appear in the evaluation runner, history detail view, and API JSON.

## Disabling two-pass

- Turn off **Two-pass extraction** on `/evaluate/evaluators`, or
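- Set `TWO_PASS_ENABLED=0` in `.env` and restart the server. This only takes effect when no DB override exists, because a value saved through the UI toggle takes precedence over the env default.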

When disabled, scoring uses the raw Pass 1 response (same as `extraction_method: disabled` in logs).

## Related code

- `evaluator/answer_extractor.py` — Pass 2 prompts and validation
- `evaluator/strategies/two_pass.py` — Evaluator strategy
- `tests/test_answer_extractor.py` — Unit tests for format validation
14 changes: 12 additions & 2 deletions evaluator/answer_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,18 @@ class AnswerExtractor:

def __init__(self):
    """Set up the extractor from the module-level client and ``config``.

    ``enabled`` and ``temperature`` are read from ``config`` once, at
    construction time; a missing attribute falls back to ``True`` and
    ``0.0`` respectively.
    """
    # Optional config attributes: getattr supplies the documented defaults.
    self.temperature = getattr(config, 'TWO_PASS_TEMPERATURE', 0.0)
    self.enabled = getattr(config, 'TWO_PASS_ENABLED', True)
    # LLM client object referenced from module scope.
    self.client = llm_client

def is_enabled(self) -> bool:
    """Return whether Pass 2 extraction should run.

    A ``two_pass_enabled`` value stored in the settings DB takes precedence:
    extraction is enabled only when that value equals ``'1'``. When nothing
    is stored, or the settings store cannot be imported or read, the result
    falls back to ``config.TWO_PASS_ENABLED`` (``True`` if that attribute is
    missing).

    Returns:
        bool: ``True`` when Pass 2 extraction is enabled.
    """
    try:
        # Local import: resolved on each call rather than at module import.
        from models.db import db
        stored = db.get_setting('two_pass_enabled', None)
    except Exception:
        # Best-effort lookup: the config default must still apply when the
        # settings store is unavailable, but leave a trace instead of
        # silently discarding the failure.
        import logging
        logging.getLogger(__name__).debug(
            "two_pass_enabled lookup failed; using config default",
            exc_info=True,
        )
        stored = None

    if stored is not None:
        return stored == '1'
    # bool() keeps the annotated return type even if config holds a non-bool.
    return bool(getattr(config, 'TWO_PASS_ENABLED', True))

def extract(self, domain: str, level: int, response: str, question: str = "") -> Dict[str, Any]:
"""
Expand Down Expand Up @@ -217,7 +227,7 @@ def extract(self, domain: str, level: int, response: str, question: str = "") ->
}
"""
# Check if two-pass is enabled
if not self.enabled:
if not self.is_enabled():
return {
"success": True,
"extracted": response,
Expand Down
16 changes: 15 additions & 1 deletion routes/evaluation.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import json
import os
import queue
from flask import Blueprint, render_template, jsonify, request
from flask import Blueprint, render_template, jsonify, request, abort

from evaluator.engine import evaluation_engine
from models.db import db
import config

evaluation_bp = Blueprint('evaluation', __name__)

_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


@evaluation_bp.route('/evaluate/domains')
def evaluate_domains():
Expand All @@ -21,6 +24,17 @@ def evaluate_evaluators():
return render_template('evaluate_evaluators.html')


@evaluation_bp.route('/evaluate/docs/two-pass')
def evaluate_two_pass_docs():
    """Render the two-pass evaluation guide from its markdown source file.

    Responds with 404 when ``docs/two-pass-evaluation.md`` is not a regular
    file under the project root; otherwise the file's text is passed to the
    ``evaluate_doc.html`` template as ``body``.
    """
    markdown_file = os.path.join(_ROOT, 'docs', 'two-pass-evaluation.md')
    if os.path.isfile(markdown_file):
        with open(markdown_file, encoding='utf-8') as handle:
            markdown_source = handle.read()
        return render_template(
            'evaluate_doc.html',
            title='Two-Pass Evaluation',
            body=markdown_source,
        )
    # Missing guide: abort() raises, so nothing is returned from here.
    abort(404)


@evaluation_bp.route('/evaluate')
def evaluate():
"""LLM Evaluation runner page"""
Expand Down
16 changes: 16 additions & 0 deletions routes/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,22 @@ def api_sync_tests():

# ---- App settings toggles ----

@settings_bp.route('/api/settings/two-pass-enabled', methods=['GET', 'PUT'])
def api_two_pass_enabled():
    """Get or set global two-pass (Pass 2) answer extraction for evaluation.

    GET returns ``{"enabled": bool}``. The ``two_pass_enabled`` setting is
    read with a default derived from ``config.TWO_PASS_ENABLED`` (``True``
    when that attribute is missing).

    PUT expects a JSON object such as ``{"enabled": true}``, stores the flag
    as ``'1'`` or ``'0'`` and returns ``{"success": true, "enabled": bool}``.
    A missing ``enabled`` key is treated as ``false``. String values are
    interpreted textually (``"1"``, ``"true"``, ``"yes"``, ``"on"`` enable;
    any other string disables) so that ``"false"`` or ``"0"`` no longer
    count as truthy. A JSON body that is not an object is rejected with
    HTTP 400 instead of failing with an unhandled ``AttributeError``.
    """
    from models.db import db
    import config as app_config

    if request.method == 'PUT':
        payload = request.get_json() or {}
        if not isinstance(payload, dict):
            return jsonify({'success': False, 'error': 'Expected a JSON object.'}), 400

        flag = payload.get('enabled', False)
        if isinstance(flag, str):
            # Non-empty strings are always truthy; compare their text instead.
            flag = flag.strip().lower() in ('1', 'true', 'yes', 'on')

        enabled = '1' if flag else '0'
        db.set_setting('two_pass_enabled', enabled)
        return jsonify({'success': True, 'enabled': enabled == '1'})

    # Only the GET path needs the config-derived default.
    default = '1' if getattr(app_config, 'TWO_PASS_ENABLED', True) else '0'
    val = db.get_setting('two_pass_enabled', default)
    return jsonify({'enabled': val == '1'})


@settings_bp.route('/api/settings/public-history', methods=['GET', 'PUT'])
def api_public_history():
"""Get or set the public history page toggle."""
Expand Down
14 changes: 14 additions & 0 deletions templates/evaluate_doc.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{% extends "base.html" %}

{% block content %}
<div class="settings-container" style="max-width: 900px; margin: 0 auto; padding: 20px;">
{% include "partials/evaluate_subnav.html" %}
<div class="settings-header" style="margin-bottom: 16px; display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 8px;">
<h2 style="margin: 0;">{{ title }}</h2>
<a href="/evaluate/evaluators" class="text-sm text-blue-600 dark:text-blue-400 hover:underline">← Back to evaluators</a>
</div>
<article class="doc-body bg-white dark:bg-gray-800 border border-gray-200 dark:border-gray-700 rounded-lg p-6 shadow-sm">
<pre class="whitespace-pre-wrap text-sm leading-relaxed text-gray-800 dark:text-gray-200 font-sans m-0">{{ body }}</pre>
</article>
</div>
{% endblock %}
73 changes: 72 additions & 1 deletion templates/evaluate_evaluators.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{% extends "base.html" %}
{% from 'partials/toggle.html' import toggle %}

{% block content %}
<div class="settings-container">
Expand All @@ -8,6 +9,27 @@
<h2>⚖️ Evaluators</h2>
</div>

<div class="two-pass-panel evaluator-card" style="flex-direction: column; align-items: stretch; gap: 12px;">
<div style="display: flex; justify-content: space-between; align-items: flex-start; flex-wrap: wrap; gap: 12px;">
<div>
<h4 style="margin: 0 0 6px;">Two-pass extraction (Pass 2)</h4>
<p style="margin: 0; font-size: 14px; color: #6b7280;">
After the model answers a benchmark prompt, a second LLM call extracts a clean final answer
(number, ya/tidak, SQL, etc.) before scoring. Used by the built-in <code>two_pass</code> evaluator
and domains such as math and reasoning.
</p>
</div>
<label class="flex items-center gap-3 cursor-pointer select-none shrink-0">
{{ toggle(id='two-pass-enabled-toggle', checked=true) }}
<span class="text-sm font-medium text-gray-700 dark:text-gray-200">Enabled</span>
</label>
</div>
<p style="margin: 0; font-size: 13px; color: #6b7280;">
<a href="/evaluate/docs/two-pass" class="text-blue-600 dark:text-blue-400 hover:underline">Read the guide</a>
· Env fallback: <code>TWO_PASS_ENABLED</code>, <code>TWO_PASS_TEMPERATURE</code>
</p>
</div>

<div class="evaluators-list" id="evaluators-list">
<!-- Evaluators will be loaded here -->
</div>
Expand Down Expand Up @@ -127,6 +149,27 @@ <h3 id="evaluator-modal-title" class="m-0 text-gray-800 dark:text-gray-100">Add
color: #7c3aed;
}

.evaluator-type.predefined {
background: #d1fae5;
color: #047857;
}

.evaluator-badge-pass2 {
display: inline-block;
padding: 2px 8px;
border-radius: 4px;
font-size: 11px;
background: #fef3c7;
color: #b45309;
margin-left: 6px;
font-weight: 600;
}

html.dark .evaluator-badge-pass2 {
background: #422006;
color: #fcd34d;
}

/* Modal Styles */
.modal {
display: none;
Expand Down Expand Up @@ -287,7 +330,10 @@ <h3 id="evaluator-modal-title" class="m-0 text-gray-800 dark:text-gray-100">Add
container.innerHTML = evaluators.map(evaluator => `
<div class="evaluator-card">
<div class="evaluator-info">
<h4>${evaluator.name} <span class="evaluator-type ${evaluator.type}">${evaluator.type}</span></h4>
<h4>${evaluator.name}
<span class="evaluator-type ${evaluator.type}">${evaluator.type}</span>
${evaluator.uses_pass2 || evaluator.id === 'two_pass' ? '<span class="evaluator-badge-pass2">Pass 2</span>' : ''}
</h4>
<p>${evaluator.description || 'No description'}</p>
</div>
<div class="evaluator-actions">
Expand Down Expand Up @@ -400,8 +446,33 @@ <h4>${evaluator.name} <span class="evaluator-type ${evaluator.type}">${evaluator
}
}

/**
 * Fetch the stored two-pass flag and reflect it in the page toggle.
 *
 * Does nothing when the toggle element is not on the page. A failed request
 * is logged to the console and leaves the toggle as rendered.
 */
async function loadTwoPassSetting() {
    const checkbox = document.getElementById('two-pass-enabled-toggle');
    if (checkbox) {
        try {
            const response = await apiGet('/api/settings/two-pass-enabled');
            // Coerce to a strict boolean before assigning to `checked`.
            checkbox.checked = Boolean(response.enabled);
        } catch (error) {
            console.error('Error loading two-pass setting:', error);
        }
    }
}

/**
 * Persist the global two-pass toggle via PUT /api/settings/two-pass-enabled.
 *
 * If the request fails, the error is logged, the toggle is flipped back so
 * the UI does not keep showing a state that was never saved, and the user
 * is alerted.
 *
 * @param {boolean} enabled - Desired state of Pass 2 extraction.
 * @returns {Promise<void>}
 */
async function saveTwoPassSetting(enabled) {
    try {
        await apiPut('/api/settings/two-pass-enabled', { enabled });
    } catch (error) {
        console.error('Error saving two-pass setting:', error);
        // Assigning `checked` from script does not fire a `change` event,
        // so this rollback cannot trigger another save attempt.
        const toggleEl = document.getElementById('two-pass-enabled-toggle');
        if (toggleEl) {
            toggleEl.checked = !enabled;
        }
        alert('Could not save two-pass setting.');
    }
}

// Page bootstrap: load the evaluator list and the two-pass flag, then save
// the flag whenever the user flips the toggle.
document.addEventListener('DOMContentLoaded', function () {
    loadEvaluators();
    loadTwoPassSetting();

    const twoPassToggle = document.getElementById('two-pass-enabled-toggle');
    if (!twoPassToggle) {
        return;
    }
    twoPassToggle.addEventListener('change', function () {
        saveTwoPassSetting(twoPassToggle.checked);
    });
});
</script>
{% endblock %}
13 changes: 13 additions & 0 deletions tests/test_answer_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@
from evaluator.answer_extractor import answer_extractor


class TestTwoPassEnabled:
    """Runtime toggle for Pass 2 extraction."""

    def test_is_enabled_respects_config_default(self, monkeypatch):
        """``is_enabled()`` is True when ``TWO_PASS_ENABLED`` is patched to True."""
        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', True)
        # No DB in unit test — should fall back to config
        # NOTE(review): is_enabled() checks the `two_pass_enabled` DB setting
        # before config, so this assumes the lookup yields no stored value in
        # the test environment — confirm, or isolate the settings store.
        assert answer_extractor.is_enabled() is True

    def test_is_enabled_when_disabled_in_config(self, monkeypatch):
        """``is_enabled()`` is False when ``TWO_PASS_ENABLED`` is patched to False."""
        monkeypatch.setattr('evaluator.answer_extractor.config.TWO_PASS_ENABLED', False)
        # NOTE(review): same assumption as above — a stored '1' in the DB
        # would override this config value; verify the test DB state.
        assert answer_extractor.is_enabled() is False


class TestFormatValidation:
"""Test format validation for PASS 2 output"""

Expand Down
Loading