diff --git a/2.0/problems/duckdb_e2e_query_optimization/config.yaml b/2.0/problems/duckdb_e2e_query_optimization/config.yaml index be09ae9f..c9c4f02e 100644 --- a/2.0/problems/duckdb_e2e_query_optimization/config.yaml +++ b/2.0/problems/duckdb_e2e_query_optimization/config.yaml @@ -36,9 +36,12 @@ environment: build_timeout_seconds: 7200 evaluation: scale_factor: 1 + agent_scale_factors: + - 1 benchmark_repetitions: 3 build_timeout_seconds: 7200 query_timeout_seconds: 300 + sqllogictest_timeout_seconds: 600 duckdb_memory_limit: "6GB" duckdb_temp_limit: "2GB" child_memory_mb: 12288 diff --git a/2.0/problems/duckdb_e2e_query_optimization/evaluator.py b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py index d5ce2148..240d962a 100644 --- a/2.0/problems/duckdb_e2e_query_optimization/evaluator.py +++ b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py @@ -63,6 +63,11 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool: BUILD_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_BUILD_TIMEOUT", "build_timeout_seconds", 3600) QUERY_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_QUERY_TIMEOUT", "query_timeout_seconds", 300) +SQLLOGICTEST_TIMEOUT_SECONDS = _config_int( + "FRONTIER_DUCKDB_SQLLOGICTEST_TIMEOUT", + "sqllogictest_timeout_seconds", + 600, +) DUCKDB_MEMORY_LIMIT = _config_str("FRONTIER_DUCKDB_MEMORY_LIMIT", "duckdb_memory_limit", "6GB") DUCKDB_TEMP_LIMIT = _config_str("FRONTIER_DUCKDB_TEMP_LIMIT", "duckdb_temp_limit", "2GB") CHILD_MEMORY_LIMIT_MB = _config_int("FRONTIER_DUCKDB_CHILD_MEMORY_MB", "child_memory_mb", 12288) @@ -76,7 +81,15 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool: BENCHMARK_RUNNER_REL = Path("build/release/benchmark/benchmark_runner") DUCKDB_SHELL_REL = Path("build/release/duckdb") +DUCKDB_UNITTEST_REL = Path("build/release/test/unittest") DEFAULT_CORRECTNESS_QUERIES = tuple(range(1, 23)) +FINAL_SQLLOGICTEST_FILES = ( + "test/sql/join/inner/test_join.test", + "test/sql/join/inner/test_inner_join_filter_pushdown.test", + "test/sql/join/semianti/semijoin.test", + "test/sql/filter/test_transitive_filters.test", + "test/sql/aggregate/distinct/test_distinct.test", +) STRONGLY_ALLOWED_PATTERNS = ( "src/optimizer/**", @@ -480,12 +493,15 @@ def _parse_csv_or_list(raw: Any) -> tuple[str, ...]: return () -def evaluation_scale_factors() -> tuple[str, ...]: - configured = _parse_csv_or_list(EVALUATION_CONFIG.get("scale_factors")) +def evaluation_scale_factors(*, final_role: bool) -> tuple[str, ...]: + config_key = "scale_factors" if final_role else "agent_scale_factors" + configured = _parse_csv_or_list(EVALUATION_CONFIG.get(config_key)) if configured: raw_values = configured - else: + elif final_role: raw_values = (PUBLIC_SCALE_FACTOR, *DEFAULT_HIDDEN_SCALE_FACTORS) + else: + raw_values = (PUBLIC_SCALE_FACTOR,) seen: set[str] = set() result: list[str] = [] for value in raw_values: @@ -522,16 +538,19 @@ def restore_prebuilt_source(source_dir: Path, env: dict[str, str]) -> bool: return bool(status_before.strip()) or bool(clean.strip()) -def build_duckdb(source_dir: Path, env: dict[str, str]) -> None: +def build_duckdb(source_dir: Path, env: dict[str, str], *, include_unittest: bool = False) -> None: build_env = dict(env) build_env["GEN"] = "ninja" build_env["BUILD_BENCHMARK"] = "1" build_env["BUILD_EXTENSIONS"] = "tpch" - build_env.setdefault("BUILD_UNITTESTS", "0") + build_env.setdefault("BUILD_UNITTESTS", "1" if include_unittest else "0") build_env.setdefault("DISABLE_UNITY", "1") build_env.setdefault("DISABLE_PARQUET", "1") build_env.setdefault("BUILD_JEMALLOC", "0") build_env.setdefault("CMAKE_BUILD_PARALLEL_LEVEL", "1") + targets = ["duckdb", "benchmark_runner"] + if include_unittest: + targets.append("unittest") run_checked( [ "cmake", @@ -540,8 +559,7 @@ def build_duckdb(source_dir: Path, env: dict[str, str]) -> None: "--config", "Release", "--target", - "duckdb", - "benchmark_runner", + *targets, ], cwd=source_dir, env=build_env, @@ -563,6 +581,13 @@ def duckdb_shell(source_dir: Path) -> Path: return shell +def duckdb_unittest(source_dir: Path) -> Path: + runner = source_dir / DUCKDB_UNITTEST_REL + if not runner.exists(): + raise FileNotFoundError(f"DuckDB unittest runner not found at {runner}") + return runner + + def benchmark_env(base_env: dict[str, str], tmp_root: Path) -> dict[str, str]: env = dict(base_env) env["DUCKDB_BENCHMARK_MEMORY_LIMIT"] = DUCKDB_MEMORY_LIMIT @@ -584,6 +609,30 @@ def settings_sql(temp_dir: Path) -> str: ) +def is_final_submission_role() -> bool: + return os.environ.get("FRONTIER_SUBMISSION_ROLE", "agent") == "final" + + +def run_final_sqllogictest_smoke( + source_dir: Path, + *, + env: dict[str, str], + metrics: dict[str, Any], +) -> None: + runner = duckdb_unittest(source_dir) + start = time.perf_counter() + for test_file in FINAL_SQLLOGICTEST_FILES: + run_checked( + [str(runner), test_file], + cwd=source_dir, + env=env, + timeout=SQLLOGICTEST_TIMEOUT_SECONDS, + ) + metrics["final_sqllogictest"] = 1 + metrics["final_sqllogictest_count"] = len(FINAL_SQLLOGICTEST_FILES) + metrics["final_sqllogictest_seconds"] = round(time.perf_counter() - start, 3) + + def run_duckdb_sql( shell: Path, database: Path, @@ -865,6 +914,8 @@ def safe_failed_command(cmd: Any) -> str: return " ".join(str(part) for part in cmd[:3]) if executable == "cmake": return "cmake build" + if executable == "unittest": + return "duckdb sqllogictest" if executable in {"duckdb", "benchmark_runner"}: return executable return executable or "subprocess" @@ -877,7 +928,9 @@ def safe_exception(exc: Exception) -> str: def full_evaluation(patch_path: Path, metrics: dict[str, Any]): - scale_factors = evaluation_scale_factors() + final_role = is_final_submission_role() + metrics["submission_role"] = "final" if final_role else "agent" + scale_factors = evaluation_scale_factors(final_role=final_role) benchmark_queries = benchmark_query_numbers() correctness_query_set = correctness_queries() correctness_cases = shuffled( @@ -907,11 +960,13 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]): metrics["used_prebuilt_empty_patch"] = 1 if restored_source: metrics["rebuilt_after_source_restore"] = 1 - build_duckdb(patched_source, env) + build_duckdb(patched_source, env, include_unittest=final_role) + elif final_role: + build_duckdb(patched_source, env, include_unittest=True) else: run_checked(["git", "apply", "--check", str(patch_path)], cwd=patched_source, env=env, timeout=60) run_checked(["git", "apply", str(patch_path)], cwd=patched_source, env=env, timeout=60) - build_duckdb(patched_source, env) + build_duckdb(patched_source, env, include_unittest=final_role) vanilla_source = DEFAULT_VANILLA_SOURCE if DEFAULT_VANILLA_SOURCE.exists() else DEFAULT_CLEAN_SOURCE mismatches: dict[str, str] = {} @@ -934,6 +989,9 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]): metrics["correctness_mismatch_count"] = len(mismatches) return _invalid("patched DuckDB produced incorrect TPC-H results", metrics) + if final_role: + run_final_sqllogictest_smoke(patched_source, env=env, metrics=metrics) + if USE_BENCHMARK_RUNNER: vanilla_runner = benchmark_runner(vanilla_source) patched_runner = benchmark_runner(patched_source) diff --git a/2.0/problems/duckdb_e2e_query_optimization/readme b/2.0/problems/duckdb_e2e_query_optimization/readme index d6d9cefa..b04af52a 100644 --- a/2.0/problems/duckdb_e2e_query_optimization/readme +++ b/2.0/problems/duckdb_e2e_query_optimization/readme @@ -69,8 +69,11 @@ penalized before performance is considered. The experimental evaluator currently encodes the patch policy, timing orchestration, and vanilla-vs-patched TPC-H result comparison inside the custom -judge image. DuckDB's broader SQLLogicTest/unit-test tooling is a natural future -hardening step, but the current score path already gates on query results before +judge image. During iterative asynchronous submissions, the judge keeps feedback +focused on the public scale-factor TPC-H gate so agents can submit early and +continue working while evaluation runs. During final verification, the judge +uses the broader hidden scale-factor set and also runs a small DuckDB +SQLLogicTest smoke set covering join, filter, and aggregate behavior before timing is considered. ## Scoring diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py index ac9de00b..30ef2cfc 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py @@ -15,6 +15,7 @@ import traceback import threading import secrets +import uuid from collections import deque from datetime import datetime, timezone from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer @@ -291,6 +292,19 @@ def evaluate_archive(archive_b64: str, *, submission_role: str = "agent") -> dic return evaluate_path(root, submission_role=submission_role) +def load_best_submission_payload(submission_uuid: str) -> tuple[dict[str, Any], str]: + if not BEST_SUBMISSION_PAYLOAD.exists(): + raise FileNotFoundError("best submission payload not found") + payload = json.loads(BEST_SUBMISSION_PAYLOAD.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise ValueError("best submission payload is invalid") + payload = dict(payload) + payload["submission_role"] = "final" + payload["submission_uuid"] = submission_uuid + submission_kind = str(payload.get("submission_kind", "file")) + return payload, submission_kind + + def validate_payload(payload: dict[str, Any], *, allow_final: bool, role_token: str = "") -> tuple[str, str, str]: submission_uuid = str(payload.get("submission_uuid") or "") if not submission_uuid: @@ -432,6 +446,9 @@ def do_POST(self) -> None: submission_uuid = parsed.path.removeprefix("/submission/").removesuffix("/cancel").strip("/") self.handle_cancel(submission_uuid) return + if parsed.path == "/evaluate_best": + self.handle_evaluate_best() + return if parsed.path != "/evaluate": self._write_json(404, {"status": "error", "error": "not found"}) return @@ -513,6 +530,61 @@ def read_json_body(self) -> dict[str, Any]: raise ValueError("request body must be a JSON object") return payload + def handle_evaluate_best(self) -> None: + if not READY: + self._write_json( + 503, + { + "status": "error", + "score": 0.0, + "score_unbounded": 0.0, + "message": "judge is not ready", + "health": READY_PAYLOAD, + }, + ) + return + + submission_uuid = "" + submission_kind = "file" + try: + request_payload = self.read_json_body() + if request_payload.get("submission_role") != "final": + raise PermissionError("best-submission rerun must use final role") + if not secrets.compare_digest( + self.headers.get("X-Frontier-CS-Role-Token", ""), + FINAL_ROLE_TOKEN, + ): + raise PermissionError("final evaluation role is verifier-only") + submission_uuid = str(request_payload.get("submission_uuid") or uuid.uuid4()) + payload, submission_kind = load_best_submission_payload(submission_uuid) + result = run_payload(payload, submission_role="final") + log_submission( + { + "submission_uuid": submission_uuid, + "submission_role": "final", + "submission_kind": submission_kind, + **result, + } + ) + self._write_json(200, result) + except Exception: + print(traceback.format_exc(), flush=True) + result = { + "status": "error", + "score": 0.0, + "score_unbounded": 0.0, + "message": "best iterative evaluation failed", + } + log_submission( + { + "submission_uuid": submission_uuid, + "submission_role": "final", + "submission_kind": submission_kind, + **result, + } + ) + self._write_json(200, result) + def handle_submit(self) -> None: if not READY: self._write_json(503, {"status": "error", "error": "judge is not ready", "health": READY_PAYLOAD}) diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py index 4a3850ba..e9fc977b 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py @@ -311,6 +311,19 @@ def evaluate_with_judge(payload: dict) -> dict: return result +def evaluate_best_with_judge() -> dict | None: + wait_for_judge() + payload = { + "submission_kind": "file", + "submission_uuid": str(uuid.uuid4()), + "submission_role": "final", + } + result = post_json(f"{JUDGE_URL}/evaluate_best", payload) + if result.get("status") != "done": + return None + return result + + def load_submission_path(config: dict) -> Path: if config.get("kind") == "directory": return Path(config.get("path") or APP_PATH) @@ -342,46 +355,45 @@ def write_result(result: dict) -> None: def main() -> None: - def write_best_submission_reward(reason: str) -> bool: - if best is None: - return False - score_raw = float(best.get("score", 0.0)) - reward = score_raw / 100.0 - score_unbounded = float(best.get("score_unbounded", score_raw)) - metrics = best.get("metrics", {}) - if not isinstance(metrics, dict): - metrics = {} - print(f"Using best iterative submission after {reason}: reward={reward:.4f}") - write_reward( - reward, - f"Using best iterative submission after {reason}: {best.get('message', '')}", - { - "score": score_raw, - "score_unbounded": score_unbounded, - "best_submission_reward": reward, - "used_best_submission": 1, - **metrics, - }, + def write_best_final_result(best_result: dict, reason: str) -> None: + metrics = dict(best_result.get("metrics", {}) or {}) + metrics["used_best_agent_artifact"] = 1 + best_result["metrics"] = metrics + best_result["message"] = ( + f"Using {describe_best_agent_payload()} after {reason}: " + f"{best_result.get('message', '')}" ) + write_result(best_result) + + def try_write_best_final_result(reason: str, final_key: tuple[float, float] | None = None) -> bool: + try: + best_result = evaluate_best_with_judge() + copy_judge_artifacts() + except Exception as exc: + print(f"WARN: failed to rerun best iterative artifact: {exc}") + best_result = None + if best_result is None: + return False + if final_key is not None and result_score_key(best_result) <= final_key: + return False + write_best_final_result(best_result, reason) return True - best: dict | None = None try: wait_for_judge() wait_for_async_submissions() copy_judge_artifacts() - best = best_submission() config = load_submission_config() solution_path = load_submission_path(config) if not solution_path.exists(): print(f"ERROR: {solution_path} not found") - if write_best_submission_reward(f"{solution_path} not found"): + if try_write_best_final_result(f"{solution_path} not found"): return write_reward(0.0, f"{solution_path} not found") return if solution_path.is_file() and not solution_path.read_text(encoding="utf-8").strip(): print(f"ERROR: {solution_path} is empty") - if write_best_submission_reward(f"{solution_path} is empty"): + if try_write_best_final_result(f"{solution_path} is empty"): return write_reward(0.0, f"{solution_path} is empty") return @@ -389,46 +401,13 @@ def write_best_submission_reward(reason: str) -> bool: final_result = evaluate_with_judge(build_judge_payload(solution_path, config)) copy_judge_artifacts() final_key = result_score_key(final_result) - best_payload = load_best_agent_payload() - if best_payload is not None: - try: - best_result = evaluate_with_judge(best_payload) - copy_judge_artifacts() - if result_score_key(best_result) > final_key: - metrics = dict(best_result.get("metrics", {}) or {}) - metrics["used_best_agent_artifact"] = 1 - best_result["metrics"] = metrics - best_result["message"] = ( - f"Using {describe_best_agent_payload()} after full-suite rerun: " - f"{best_result.get('message', '')}" - ) - write_result(best_result) - return - except Exception as exc: - print(f"WARN: failed to rerun best iterative artifact: {exc}") - if best is not None and result_score_key(best) > final_key: - write_best_submission_reward("final solution scored below best submission") + if try_write_best_final_result("full-suite rerun", final_key): return write_result(final_result) except Exception as exc: print(traceback.format_exc()) copy_judge_artifacts() - best_payload = load_best_agent_payload() - if best_payload is not None: - try: - best_result = evaluate_with_judge(best_payload) - metrics = dict(best_result.get("metrics", {}) or {}) - metrics["used_best_agent_artifact"] = 1 - best_result["metrics"] = metrics - best_result["message"] = ( - f"Using {describe_best_agent_payload()} after final evaluation failed: " - f"{best_result.get('message', '')}" - ) - write_result(best_result) - return - except Exception as best_exc: - print(f"WARN: failed to rerun best iterative artifact: {best_exc}") - if write_best_submission_reward(f"final evaluation failed: {exc}"): + if try_write_best_final_result("final evaluation failed"): return write_reward(0.0, f"Evaluation failed: {exc}")