diff --git a/2.0/problems/duckdb_e2e_query_optimization/config.yaml b/2.0/problems/duckdb_e2e_query_optimization/config.yaml
index be09ae9f..c9c4f02e 100644
--- a/2.0/problems/duckdb_e2e_query_optimization/config.yaml
+++ b/2.0/problems/duckdb_e2e_query_optimization/config.yaml
@@ -36,9 +36,12 @@ environment:
   build_timeout_seconds: 7200
 evaluation:
   scale_factor: 1
+  agent_scale_factors:
+    - 1
   benchmark_repetitions: 3
   build_timeout_seconds: 7200
   query_timeout_seconds: 300
+  sqllogictest_timeout_seconds: 600
   duckdb_memory_limit: "6GB"
   duckdb_temp_limit: "2GB"
   child_memory_mb: 12288
diff --git a/2.0/problems/duckdb_e2e_query_optimization/evaluator.py b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py
index d5ce2148..240d962a 100644
--- a/2.0/problems/duckdb_e2e_query_optimization/evaluator.py
+++ b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py
@@ -63,6 +63,11 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool:
 
 BUILD_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_BUILD_TIMEOUT", "build_timeout_seconds", 3600)
 QUERY_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_QUERY_TIMEOUT", "query_timeout_seconds", 300)
+SQLLOGICTEST_TIMEOUT_SECONDS = _config_int(  # timeout applied to each sqllogictest file run
+    "FRONTIER_DUCKDB_SQLLOGICTEST_TIMEOUT",
+    "sqllogictest_timeout_seconds",
+    600,
+)
 DUCKDB_MEMORY_LIMIT = _config_str("FRONTIER_DUCKDB_MEMORY_LIMIT", "duckdb_memory_limit", "6GB")
 DUCKDB_TEMP_LIMIT = _config_str("FRONTIER_DUCKDB_TEMP_LIMIT", "duckdb_temp_limit", "2GB")
 CHILD_MEMORY_LIMIT_MB = _config_int("FRONTIER_DUCKDB_CHILD_MEMORY_MB", "child_memory_mb", 12288)
@@ -76,7 +81,15 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool:
 
 BENCHMARK_RUNNER_REL = Path("build/release/benchmark/benchmark_runner")
 DUCKDB_SHELL_REL = Path("build/release/duckdb")
+DUCKDB_UNITTEST_REL = Path("build/release/test/unittest")
 DEFAULT_CORRECTNESS_QUERIES = tuple(range(1, 23))
+FINAL_SQLLOGICTEST_FILES = (  # smoke set run one file at a time, final role only
+    "test/sql/join/inner/test_join.test",
+    "test/sql/join/inner/test_inner_join_filter_pushdown.test",
+    "test/sql/join/semianti/semijoin.test",
+    "test/sql/filter/test_transitive_filters.test",
+    "test/sql/aggregate/distinct/test_distinct.test",
+)
 
 STRONGLY_ALLOWED_PATTERNS = (
     "src/optimizer/**",
@@ -480,12 +493,15 @@ def _parse_csv_or_list(raw: Any) -> tuple[str, ...]:
     return ()
 
 
-def evaluation_scale_factors() -> tuple[str, ...]:
-    configured = _parse_csv_or_list(EVALUATION_CONFIG.get("scale_factors"))
+def evaluation_scale_factors(*, final_role: bool) -> tuple[str, ...]:
+    config_key = "scale_factors" if final_role else "agent_scale_factors"
+    configured = _parse_csv_or_list(EVALUATION_CONFIG.get(config_key))
     if configured:
         raw_values = configured
-    else:
+    elif final_role:
         raw_values = (PUBLIC_SCALE_FACTOR, *DEFAULT_HIDDEN_SCALE_FACTORS)
+    else:
+        raw_values = (PUBLIC_SCALE_FACTOR,)
     seen: set[str] = set()
     result: list[str] = []
     for value in raw_values:
@@ -522,16 +538,19 @@ def restore_prebuilt_source(source_dir: Path, env: dict[str, str]) -> bool:
     return bool(status_before.strip()) or bool(clean.strip())
 
 
-def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
+def build_duckdb(source_dir: Path, env: dict[str, str], *, include_unittest: bool = False) -> None:
     build_env = dict(env)
     build_env["GEN"] = "ninja"
     build_env["BUILD_BENCHMARK"] = "1"
     build_env["BUILD_EXTENSIONS"] = "tpch"
-    build_env.setdefault("BUILD_UNITTESTS", "0")
+    build_env.setdefault("BUILD_UNITTESTS", "1" if include_unittest else "0")
     build_env.setdefault("DISABLE_UNITY", "1")
     build_env.setdefault("DISABLE_PARQUET", "1")
     build_env.setdefault("BUILD_JEMALLOC", "0")
     build_env.setdefault("CMAKE_BUILD_PARALLEL_LEVEL", "1")
+    targets = ["duckdb", "benchmark_runner"]
+    if include_unittest:
+        targets.append("unittest")
     run_checked(
         [
             "cmake",
@@ -540,8 +559,7 @@ def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
             "--config",
             "Release",
             "--target",
-            "duckdb",
-            "benchmark_runner",
+            *targets,
         ],
         cwd=source_dir,
         env=build_env,
@@ -563,6 +581,13 @@ def duckdb_shell(source_dir: Path) -> Path:
     return shell
 
 
+def duckdb_unittest(source_dir: Path) -> Path:
+    """Return the unittest runner path under source_dir; raise FileNotFoundError if absent."""
+    if (candidate := source_dir / DUCKDB_UNITTEST_REL).exists():
+        return candidate
+    raise FileNotFoundError(f"DuckDB unittest runner not found at {candidate}")
+
+
 def benchmark_env(base_env: dict[str, str], tmp_root: Path) -> dict[str, str]:
     env = dict(base_env)
     env["DUCKDB_BENCHMARK_MEMORY_LIMIT"] = DUCKDB_MEMORY_LIMIT
@@ -584,6 +609,30 @@ def settings_sql(temp_dir: Path) -> str:
     )
 
 
+def is_final_submission_role() -> bool:
+    return os.environ.get("FRONTIER_SUBMISSION_ROLE", "agent") == "final"  # unset => "agent"
+
+
+def run_final_sqllogictest_smoke(
+    source_dir: Path,
+    *,
+    env: dict[str, str],
+    metrics: dict[str, Any],
+) -> None:
+    """Run every FINAL_SQLLOGICTEST_FILES entry with the unittest runner; record metrics.
+
+    The metrics keys are only written once all files have been run.
+    """
+    unittest_bin = str(duckdb_unittest(source_dir))
+    started = time.perf_counter()
+    for relative_test in FINAL_SQLLOGICTEST_FILES:
+        smoke_cmd = [unittest_bin, relative_test]
+        run_checked(smoke_cmd, cwd=source_dir, env=env, timeout=SQLLOGICTEST_TIMEOUT_SECONDS)
+    metrics["final_sqllogictest"] = 1
+    metrics["final_sqllogictest_count"] = len(FINAL_SQLLOGICTEST_FILES)
+    metrics["final_sqllogictest_seconds"] = round(time.perf_counter() - started, 3)
+
+
 def run_duckdb_sql(
     shell: Path,
     database: Path,
@@ -865,6 +914,8 @@ def safe_failed_command(cmd: Any) -> str:
         return " ".join(str(part) for part in cmd[:3])
     if executable == "cmake":
         return "cmake build"
+    if executable == "unittest":
+        return "duckdb sqllogictest"
     if executable in {"duckdb", "benchmark_runner"}:
         return executable
     return executable or "subprocess"
@@ -877,7 +928,9 @@ def safe_exception(exc: Exception) -> str:
 
 
 def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
-    scale_factors = evaluation_scale_factors()
+    final_role = is_final_submission_role()
+    metrics["submission_role"] = "final" if final_role else "agent"
+    scale_factors = evaluation_scale_factors(final_role=final_role)
     benchmark_queries = benchmark_query_numbers()
     correctness_query_set = correctness_queries()
     correctness_cases = shuffled(
@@ -907,11 +960,13 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
             metrics["used_prebuilt_empty_patch"] = 1
             if restored_source:
                 metrics["rebuilt_after_source_restore"] = 1
-                build_duckdb(patched_source, env)
+                build_duckdb(patched_source, env, include_unittest=final_role)
+            elif final_role:
+                build_duckdb(patched_source, env, include_unittest=True)
         else:
             run_checked(["git", "apply", "--check", str(patch_path)], cwd=patched_source, env=env, timeout=60)
             run_checked(["git", "apply", str(patch_path)], cwd=patched_source, env=env, timeout=60)
-            build_duckdb(patched_source, env)
+            build_duckdb(patched_source, env, include_unittest=final_role)
 
         vanilla_source = DEFAULT_VANILLA_SOURCE if DEFAULT_VANILLA_SOURCE.exists() else DEFAULT_CLEAN_SOURCE
         mismatches: dict[str, str] = {}
@@ -934,6 +989,9 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
             metrics["correctness_mismatch_count"] = len(mismatches)
             return _invalid("patched DuckDB produced incorrect TPC-H results", metrics)
 
+        if final_role:
+            run_final_sqllogictest_smoke(patched_source, env=env, metrics=metrics)
+
         if USE_BENCHMARK_RUNNER:
             vanilla_runner = benchmark_runner(vanilla_source)
             patched_runner = benchmark_runner(patched_source)
diff --git a/2.0/problems/duckdb_e2e_query_optimization/readme b/2.0/problems/duckdb_e2e_query_optimization/readme
index d6d9cefa..b04af52a 100644
--- a/2.0/problems/duckdb_e2e_query_optimization/readme
+++ b/2.0/problems/duckdb_e2e_query_optimization/readme
@@ -69,8 +69,11 @@ penalized before performance is considered.
 
 The experimental evaluator currently encodes the patch policy, timing
 orchestration, and vanilla-vs-patched TPC-H result comparison inside the custom
-judge image. DuckDB's broader SQLLogicTest/unit-test tooling is a natural future
-hardening step, but the current score path already gates on query results before
+judge image. During iterative asynchronous submissions, the judge keeps feedback
+focused on the public scale-factor TPC-H gate so agents can submit early and
+continue working while evaluation runs. During final verification, the judge
+uses the broader hidden scale-factor set and also runs a small DuckDB
+SQLLogicTest smoke set covering join, filter, and aggregate behavior before
 timing is considered.
 
 ## Scoring
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
index ac9de00b..30ef2cfc 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
@@ -15,6 +15,7 @@
 import traceback
 import threading
 import secrets
+import uuid
 from collections import deque
 from datetime import datetime, timezone
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
@@ -291,6 +292,19 @@ def evaluate_archive(archive_b64: str, *, submission_role: str = "agent") -> dic
         return evaluate_path(root, submission_role=submission_role)
 
 
+def load_best_submission_payload(submission_uuid: str) -> tuple[dict[str, Any], str]:
+    """Load the stored best payload re-stamped for a final-role run; also return its kind."""
+    if not BEST_SUBMISSION_PAYLOAD.exists():
+        raise FileNotFoundError("best submission payload not found")
+    stored = json.loads(BEST_SUBMISSION_PAYLOAD.read_text(encoding="utf-8"))
+    if not isinstance(stored, dict):
+        raise ValueError("best submission payload is invalid")
+    # Force the final role and the caller's UUID over whatever was stored.
+    stamped = {**stored, "submission_role": "final", "submission_uuid": submission_uuid}
+    kind = str(stamped.get("submission_kind", "file"))
+    return stamped, kind
+
+
 def validate_payload(payload: dict[str, Any], *, allow_final: bool, role_token: str = "") -> tuple[str, str, str]:
     submission_uuid = str(payload.get("submission_uuid") or "")
     if not submission_uuid:
@@ -432,6 +446,9 @@ def do_POST(self) -> None:
             submission_uuid = parsed.path.removeprefix("/submission/").removesuffix("/cancel").strip("/")
             self.handle_cancel(submission_uuid)
             return
+        if parsed.path == "/evaluate_best":
+            self.handle_evaluate_best()
+            return
         if parsed.path != "/evaluate":
             self._write_json(404, {"status": "error", "error": "not found"})
             return
@@ -513,6 +530,61 @@ def read_json_body(self) -> dict[str, Any]:
             raise ValueError("request body must be a JSON object")
         return payload
 
+    def handle_evaluate_best(self) -> None:
+        """Rerun the stored best submission under the final role and reply with the result.
+
+        Replies 503 while the judge is not ready. Otherwise requires
+        ``submission_role == "final"`` and the matching role-token header; a failure while
+        validating or evaluating becomes a zero-score ``"error"`` result with HTTP 200.
+        """
+        if not READY:
+            self._write_json(
+                503,
+                {
+                    "status": "error",
+                    "score": 0.0,
+                    "score_unbounded": 0.0,
+                    "message": "judge is not ready",
+                    "health": READY_PAYLOAD,
+                },
+            )
+            return
+
+        submission_uuid = ""
+        submission_kind = "file"
+        try:
+            request_payload = self.read_json_body()
+            if request_payload.get("submission_role") != "final":
+                raise PermissionError("best-submission rerun must use final role")
+            if not secrets.compare_digest(
+                self.headers.get("X-Frontier-CS-Role-Token", ""),
+                FINAL_ROLE_TOKEN,
+            ):
+                raise PermissionError("final evaluation role is verifier-only")
+            submission_uuid = str(request_payload.get("submission_uuid") or uuid.uuid4())
+            payload, submission_kind = load_best_submission_payload(submission_uuid)
+            result = run_payload(payload, submission_role="final")
+        except Exception:
+            # Boundary handler: keep details in the judge log, return a generic failure.
+            print(traceback.format_exc(), flush=True)
+            result = {
+                "status": "error",
+                "score": 0.0,
+                "score_unbounded": 0.0,
+                "message": "best iterative evaluation failed",
+            }
+        # Log and reply outside the try so a logging or socket failure after a
+        # successful rerun is not re-recorded as a failed evaluation.
+        log_submission(
+            {
+                "submission_uuid": submission_uuid,
+                "submission_role": "final",
+                "submission_kind": submission_kind,
+                **result,
+            }
+        )
+        self._write_json(200, result)
+
     def handle_submit(self) -> None:
         if not READY:
             self._write_json(503, {"status": "error", "error": "judge is not ready", "health": READY_PAYLOAD})
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py
index 4a3850ba..e9fc977b 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py
@@ -311,6 +311,19 @@ def evaluate_with_judge(payload: dict) -> dict:
     return result
 
 
+def evaluate_best_with_judge() -> dict | None:
+    """Ask the judge to rerun its stored best submission; None unless status is "done"."""
+    wait_for_judge()
+    request = {
+        "submission_kind": "file",
+        "submission_uuid": str(uuid.uuid4()),
+        "submission_role": "final",
+    }
+    outcome = post_json(f"{JUDGE_URL}/evaluate_best", request)
+    # Only a "done" status counts as a usable rerun result.
+    return outcome if outcome.get("status") == "done" else None
+
+
 def load_submission_path(config: dict) -> Path:
     if config.get("kind") == "directory":
         return Path(config.get("path") or APP_PATH)
@@ -342,46 +355,45 @@ def write_result(result: dict) -> None:
 
 
 def main() -> None:
-    def write_best_submission_reward(reason: str) -> bool:
-        if best is None:
-            return False
-        score_raw = float(best.get("score", 0.0))
-        reward = score_raw / 100.0
-        score_unbounded = float(best.get("score_unbounded", score_raw))
-        metrics = best.get("metrics", {})
-        if not isinstance(metrics, dict):
-            metrics = {}
-        print(f"Using best iterative submission after {reason}: reward={reward:.4f}")
-        write_reward(
-            reward,
-            f"Using best iterative submission after {reason}: {best.get('message', '')}",
-            {
-                "score": score_raw,
-                "score_unbounded": score_unbounded,
-                "best_submission_reward": reward,
-                "used_best_submission": 1,
-                **metrics,
-            },
+    def write_best_final_result(best_result: dict, reason: str) -> None:
+        metrics = dict(best_result.get("metrics", {}) or {})
+        metrics["used_best_agent_artifact"] = 1
+        best_result["metrics"] = metrics
+        best_result["message"] = (
+            f"Using {describe_best_agent_payload()} after {reason}: "
+            f"{best_result.get('message', '')}"
         )
+        write_result(best_result)
+
+    def try_write_best_final_result(reason: str, final_key: tuple[float, float] | None = None) -> bool:
+        try:
+            best_result = evaluate_best_with_judge()
+            copy_judge_artifacts()
+        except Exception as exc:
+            print(f"WARN: failed to rerun best iterative artifact: {exc}")
+            best_result = None
+        if best_result is None:
+            return False
+        if final_key is not None and result_score_key(best_result) <= final_key:
+            return False
+        write_best_final_result(best_result, reason)
         return True
 
-    best: dict | None = None
     try:
         wait_for_judge()
         wait_for_async_submissions()
         copy_judge_artifacts()
-        best = best_submission()
         config = load_submission_config()
         solution_path = load_submission_path(config)
         if not solution_path.exists():
             print(f"ERROR: {solution_path} not found")
-            if write_best_submission_reward(f"{solution_path} not found"):
+            if try_write_best_final_result(f"{solution_path} not found"):
                 return
             write_reward(0.0, f"{solution_path} not found")
             return
         if solution_path.is_file() and not solution_path.read_text(encoding="utf-8").strip():
             print(f"ERROR: {solution_path} is empty")
-            if write_best_submission_reward(f"{solution_path} is empty"):
+            if try_write_best_final_result(f"{solution_path} is empty"):
                 return
             write_reward(0.0, f"{solution_path} is empty")
             return
@@ -389,46 +401,13 @@ def write_best_submission_reward(reason: str) -> bool:
         final_result = evaluate_with_judge(build_judge_payload(solution_path, config))
         copy_judge_artifacts()
         final_key = result_score_key(final_result)
-        best_payload = load_best_agent_payload()
-        if best_payload is not None:
-            try:
-                best_result = evaluate_with_judge(best_payload)
-                copy_judge_artifacts()
-                if result_score_key(best_result) > final_key:
-                    metrics = dict(best_result.get("metrics", {}) or {})
-                    metrics["used_best_agent_artifact"] = 1
-                    best_result["metrics"] = metrics
-                    best_result["message"] = (
-                        f"Using {describe_best_agent_payload()} after full-suite rerun: "
-                        f"{best_result.get('message', '')}"
-                    )
-                    write_result(best_result)
-                    return
-            except Exception as exc:
-                print(f"WARN: failed to rerun best iterative artifact: {exc}")
-        if best is not None and result_score_key(best) > final_key:
-            write_best_submission_reward("final solution scored below best submission")
+        if try_write_best_final_result("full-suite rerun", final_key):
             return
         write_result(final_result)
     except Exception as exc:
         print(traceback.format_exc())
         copy_judge_artifacts()
-        best_payload = load_best_agent_payload()
-        if best_payload is not None:
-            try:
-                best_result = evaluate_with_judge(best_payload)
-                metrics = dict(best_result.get("metrics", {}) or {})
-                metrics["used_best_agent_artifact"] = 1
-                best_result["metrics"] = metrics
-                best_result["message"] = (
-                    f"Using {describe_best_agent_payload()} after final evaluation failed: "
-                    f"{best_result.get('message', '')}"
-                )
-                write_result(best_result)
-                return
-            except Exception as best_exc:
-                print(f"WARN: failed to rerun best iterative artifact: {best_exc}")
-        if write_best_submission_reward(f"final evaluation failed: {exc}"):
+        if try_write_best_final_result("final evaluation failed"):
             return
         write_reward(0.0, f"Evaluation failed: {exc}")