From 56f188a63a47212f4d864551136b109eaecf7192 Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Thu, 21 May 2026 11:25:01 +0800 Subject: [PATCH 1/7] SWE Bench doc fix --- docs/source_en/extended_benchmark/agent/swe_bench.md | 5 ++++- docs/source_zh_cn/extended_benchmark/agent/swe_bench.md | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source_en/extended_benchmark/agent/swe_bench.md b/docs/source_en/extended_benchmark/agent/swe_bench.md index 80ab57c6..59e4865e 100644 --- a/docs/source_en/extended_benchmark/agent/swe_bench.md +++ b/docs/source_en/extended_benchmark/agent/swe_bench.md @@ -28,7 +28,10 @@ Before running, make sure the following dependencies are available: 1) Install `mini-swe-agent` (required for infer) ```bash -pip install mini-swe-agent +git clone https://github.com/AISBench/mini-swe-agent.git +cd mini-swe-agent +pip install -e . +cd - ``` 2) Install the SWE-bench harness (required for eval) diff --git a/docs/source_zh_cn/extended_benchmark/agent/swe_bench.md b/docs/source_zh_cn/extended_benchmark/agent/swe_bench.md index 0f9d5ee3..731113de 100644 --- a/docs/source_zh_cn/extended_benchmark/agent/swe_bench.md +++ b/docs/source_zh_cn/extended_benchmark/agent/swe_bench.md @@ -30,7 +30,10 @@ SWE-bench是一个基准测试,用于评估大语言模型在从GitHub收集 1) 安装 `mini-swe-agent`(infer 依赖) ```bash -pip install mini-swe-agent +git clone https://github.com/AISBench/mini-swe-agent.git +cd mini-swe-agent +pip install -e . +cd - ``` 2) 安装 SWE-bench harness(eval 依赖) From eb0c6a5009220910c537a6defc494db5b1eaf838 Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Sat, 30 May 2026 09:56:40 +0800 Subject: [PATCH 2/7] support run terminal-bench-2 with harbor --- ais_bench/benchmark/summarizers/__init__.py | 1 + ais_bench/benchmark/summarizers/harbor.py | 173 ++++++++ .../tasks/custom_tasks/harbor_task.py | 397 ++++++++++++++++++ .../tasks/custom_tasks/tau2_bench_task.py | 2 +- .../harbor_terminal_bench_2_task.py | 84 ++++ 5 files changed, 656 insertions(+), 1 deletion(-) create mode 100644 ais_bench/benchmark/summarizers/harbor.py create mode 100644 ais_bench/benchmark/tasks/custom_tasks/harbor_task.py create mode 100644 ais_bench/configs/agent_example/harbor_terminal_bench_2_task.py diff --git a/ais_bench/benchmark/summarizers/__init__.py b/ais_bench/benchmark/summarizers/__init__.py index 756d1a53..79fb0026 100644 --- a/ais_bench/benchmark/summarizers/__init__.py +++ b/ais_bench/benchmark/summarizers/__init__.py @@ -4,3 +4,4 @@ from ais_bench.benchmark.summarizers.default_perf import DefaultPerfSummarizer # noqa: F401 from ais_bench.benchmark.summarizers.vbench import VBenchSummarizer # noqa: F401 from ais_bench.benchmark.summarizers.swebench import SWEBenchSummarizer # noqa: F401 +from ais_bench.benchmark.summarizers.harbor import HarborSummarizer # noqa: F401 \ No newline at end of file diff --git a/ais_bench/benchmark/summarizers/harbor.py b/ais_bench/benchmark/summarizers/harbor.py new file mode 100644 index 00000000..930371be --- /dev/null +++ b/ais_bench/benchmark/summarizers/harbor.py @@ -0,0 +1,173 @@ +# flake8: noqa +# yapf: disable +import functools +import os.path as osp +from typing import Any, Dict, List + +import mmengine +import tabulate +from mmengine import ConfigDict + +from ais_bench.benchmark.summarizers.default import DefaultSummarizer +from ais_bench.benchmark.utils.logging.logger import AISLogger +from ais_bench.benchmark.utils.core.abbr import dataset_abbr_from_cfg, model_abbr_from_cfg + + +METRIC_WHITELIST = ['avg_score', 'score', 'accuracy', 'n_errors', 'n_total_trials'] +METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'type', 'reward_distribution', 'exception_distribution', 'pass_at_k', 'details'] + + +class HarborSummarizer(DefaultSummarizer): + """Summarizer for Harbor benchmark results.""" + + def _pick_up_results(self): + raw_results: Dict[str, Dict[str, Any]] = {} + parsed_results: Dict[str, Dict[str, Dict[str, float]]] = {} + dataset_metrics: Dict[str, List[str]] = {} + dataset_eval_mode: Dict[str, str] = {} + + for model in self.model_cfgs: + model_abbr = model_abbr_from_cfg(model) + parsed_results.setdefault(model_abbr, {}) + raw_results.setdefault(model_abbr, {}) + for dataset in self.dataset_cfgs: + dataset_abbr = dataset_abbr_from_cfg(dataset) + filepath = osp.join(self.work_dir, 'results', model_abbr, f'{dataset_abbr}.json') + + if not osp.exists(filepath): + continue + + result = mmengine.load(filepath) + raw_results[model_abbr][dataset_abbr] = result + + _rst, _dm = {}, [] + for metric, score in result.items(): + if metric in METRIC_BLACKLIST: + continue + if isinstance(score, (int, float)): + _rst[metric] = score + _dm.append(metric) + elif isinstance(score, str): + _rst[metric] = score + _dm.append(metric) + + if len(_rst) == 0: + continue + + _dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST)) + dataset_metrics[dataset_abbr] = _dm + parsed_results[model_abbr][dataset_abbr] = _rst + dataset_eval_mode[dataset_abbr] = 'gen' + + return raw_results, parsed_results, dataset_metrics, dataset_eval_mode + + def _print_harbor_details(self, raw_results: Dict[str, Dict[str, Any]]): + for model_abbr in self.model_abbrs: + if model_abbr not in raw_results: + continue + for dataset_abbr, result in raw_results[model_abbr].items(): + if 'reward_distribution' in result or 'exception_distribution' in result: + print('') + print('=' * 60) + print(f'Dataset: {dataset_abbr}') + print(f'Model: {model_abbr}') + print('=' * 60) + + if 'total_count' in result: + print(f'Total Count: {result["total_count"]}') + if 'n_errors' in result: + print(f'Errors: {result["n_errors"]}') + if 'avg_score' in result: + print(f'Avg Score: {result["avg_score"]}') + + if 'reward_distribution' in result and result['reward_distribution']: + print('\nReward Distribution:') + table_data = [[item['score'], item['count']] for item in result['reward_distribution']] + print(tabulate.tabulate(table_data, headers=['Score', 'Count'], tablefmt='grid')) + + if 'exception_distribution' in result and result['exception_distribution']: + print('\nException Distribution:') + table_data = [[item['exception_type'], item['count']] for item in result['exception_distribution']] + print(tabulate.tabulate(table_data, headers=['Exception', 'Count'], tablefmt='grid')) + + if 'pass_at_k' in result and result['pass_at_k']: + print('\nPass@k:') + table_data = [[k, f'{v:.4f}'] for k, v in sorted(result['pass_at_k'].items())] + print(tabulate.tabulate(table_data, headers=['k', 'Pass Rate'], tablefmt='grid')) + + print('') + + def summarize(self, time_str=None, subjective_scores=None, dataset_score_container=None, required_dataset_abbrs=None): + self._update_dataset_abbrs() + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results() + + self._print_harbor_details(raw_results) + + dataset_abbrs = [] + for dataset_abbr in self.dataset_abbrs: + if dataset_abbr in dataset_metrics: + for metric in dataset_metrics[dataset_abbr]: + if metric in ("correct_count", "total_count"): + continue + dataset_abbrs.append((dataset_abbr, metric)) + else: + dataset_abbrs.append((dataset_abbr, None)) + + has_total_count = False + for dataset_abbr in dataset_metrics: + if 'total_count' in dataset_metrics[dataset_abbr]: + has_total_count = True + break + + table = [] + header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs + if has_total_count: + header = ['dataset', 'version', 'metric', 'mode', 'total_count'] + self.model_abbrs + table.append(header) + + for dataset_abbr, metric in dataset_abbrs: + for model_abbr in self.model_abbrs: + if metric is None: + for k in parsed_results.get(model_abbr, {}).get(dataset_abbr, {}).keys(): + row = [dataset_abbr, 'a39421', k, dataset_eval_mode.get(dataset_abbr, 'gen')] + if has_total_count: + row.insert(4, raw_results[model_abbr][dataset_abbr].get('total_count', '-')) + row.append(parsed_results[model_abbr][dataset_abbr][k]) + table.append(row) + else: + if dataset_abbr in parsed_results[model_abbr] and metric in parsed_results[model_abbr][dataset_abbr]: + row = [dataset_abbr, 'a39421', metric, dataset_eval_mode.get(dataset_abbr, 'gen')] + if has_total_count: + row.insert(4, raw_results[model_abbr][dataset_abbr].get('total_count', '-')) + row.append(parsed_results[model_abbr][dataset_abbr][metric]) + table.append(row) + + for sg in self.summary_groups: + for model_abbr in self.model_abbrs: + if sg['name'] in parsed_results[model_abbr]: + row = [sg['name'], sg.get('version', '-'), sg.get('metric', 'naive_average'), dataset_eval_mode.get(sg['name'], 'gen')] + if has_total_count: + row.insert(4, len(sg['subsets'])) + row.extend([parsed_results[m].get(sg['name'], {}).get(sg.get('metric', 'naive_average'), '-') for m in self.model_abbrs]) + table.append(row) + + print('') + print(tabulate.tabulate(table[1:], headers=table[0], tablefmt='grid')) + print('') + + summary_dir = osp.join(self.work_dir, 'summary') + mmengine.mkdir_or_exist(summary_dir) + + time_str = time_str or mmengine.utils.TimeStub.now().time_str + summary_txt = osp.join(summary_dir, f'summary_{time_str}.txt') + summary_csv = osp.join(summary_dir, f'summary_{time_str}.csv') + + print(f'write summary to {summary_txt}') + with open(summary_txt, 'w', encoding='utf-8') as f: + f.write(tabulate.tabulate(table[1:], headers=table[0], tablefmt='grid')) + + print(f'write csv to {summary_csv}') + with open(summary_csv, 'w', encoding='utf-8') as out: + out.write(tabulate.tabulate(table[1:], headers=table[0], tablefmt='csv')) + + return parsed_results \ No newline at end of file diff --git a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py new file mode 100644 index 00000000..e2700028 --- /dev/null +++ b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py @@ -0,0 +1,397 @@ +import argparse +import copy +import json +import os +import os.path as osp +import re +import shutil +import signal +import sys +import threading +import time +from pathlib import Path +from typing import Any + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist +from tqdm import tqdm + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.core.abbr import task_abbr_from_cfg +from ais_bench.benchmark.utils.logging import AISLogger +from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError +from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES + +DEFAULT_FAKE_API_KEY = "fake_api_key" + + +def parse_kwargs(kwargs_list: list[str] | None) -> dict[str, Any]: + if not kwargs_list: + return {} + result = {} + for kwarg in kwargs_list: + if "=" in kwarg: + key, value = kwarg.split("=", 1) + result[key] = value + return result + + +def parse_env_vars(env_list: list[str] | None) -> dict[str, str]: + if not env_list: + return {} + result = {} + for env_var in env_list: + if "=" in env_var: + key, value = env_var.split("=", 1) + result[key] = value + return result + + +@TASKS.register_module() +class HarborTask(BaseTask): + name_prefix = "HarborTask" + log_subdir = "logs/eval" + output_subdir = "results" + + def __init__(self, cfg: ConfigDict) -> None: + super().__init__(cfg) + self.captured_metrics = None + self.job_dir = None + self.job_result = None + self.job = None + + def get_command(self, cfg_path, template) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + return f"{python} {script_path} {cfg_path}" + + def run(self, task_state_manager: TaskStateManager): + self.logger.info(f"Task {task_abbr_from_cfg(self.cfg)}") + self.task_state_manager = task_state_manager + + self._set_api_key() + self._prepare_out_dir() + + job, job_result = self._run_harbor_job() + + self._dump_eval_results(job, job_result) + + def _set_api_key(self): + api_key = self.cfg["models"][0].get("api_key") + if api_key is None: + api_key = DEFAULT_FAKE_API_KEY + os.environ["OPENAI_API_KEY"] = api_key + + def _prepare_out_dir(self): + self.out_dir = osp.join( + self.work_dir, self.output_subdir, self.cfg["models"][0]["abbr"] + ) + mkdir_or_exist(osp.join(self.out_dir, self.cfg["datasets"][0][0]["abbr"])) + self.out_detail_dir = osp.join( + self.out_dir, + self.cfg["datasets"][0][0]["abbr"], + ) + mkdir_or_exist(Path(self.out_detail_dir)) + + def _run_harbor_job(self): + from harbor.cli.utils import run_async + from harbor.job import Job + from harbor.models.job.config import ( + AgentConfig, + DatasetConfig, + JobConfig, + ) + from harbor.models.agent.name import AgentName + from harbor.models.environment_type import EnvironmentType + + args = self.cfg["datasets"][0][0]["args"] + + config = JobConfig() + + config.job_name = "details" + config.jobs_dir = Path(self.out_detail_dir) + + if args.get("n_attempts"): + config.n_attempts = args["n_attempts"] + if args.get("timeout_multiplier"): + config.timeout_multiplier = args["timeout_multiplier"] + if args.get("agent_timeout_multiplier"): + config.agent_timeout_multiplier = args["agent_timeout_multiplier"] + if args.get("verifier_timeout_multiplier"): + config.verifier_timeout_multiplier = args["verifier_timeout_multiplier"] + if args.get("agent_setup_timeout_multiplier"): + config.agent_setup_timeout_multiplier = args["agent_setup_timeout_multiplier"] + if args.get("environment_build_timeout_multiplier"): + config.environment_build_timeout_multiplier = args["environment_build_timeout_multiplier"] + if args.get("debug"): + config.debug = args["debug"] + + if args.get("n_concurrent_trials"): + config.n_concurrent_trials = args["n_concurrent_trials"] + if args.get("quiet"): + config.quiet = args["quiet"] + if args.get("max_retries"): + config.retry.max_retries = args["max_retries"] + if args.get("retry_include_exceptions"): + config.retry.include_exceptions = set(args["retry_include_exceptions"]) + if args.get("retry_exclude_exceptions"): + config.retry.exclude_exceptions = set(args["retry_exclude_exceptions"]) + + agent_config = self.cfg["models"][0] + agent_kwargs = agent_config.get("agent_kwargs") or {} + agent_env = agent_config.get("agent_env") or {} + + agent_name = AgentName(agent_config.get("agent_name", "oracle")) + model_names = agent_config.get("model_names") + if model_names: + config.agents = [ + AgentConfig( + name=agent_name, + model_name=model_name, + kwargs=agent_kwargs, + env=agent_env, + ) + for model_name in model_names + ] + else: + config.agents = [ + AgentConfig( + name=agent_name, + kwargs=agent_kwargs, + env=agent_env, + ) + ] + + if args.get("environment_type"): + config.environment.type = EnvironmentType(args["environment_type"]) + if args.get("environment_force_build") is not None: + config.environment.force_build = args["environment_force_build"] + if args.get("environment_delete") is not None: + config.environment.delete = args["environment_delete"] + + if args.get("disable_verification"): + config.verifier.disable = True + if args.get("verifier_env"): + config.verifier.env.update(parse_env_vars(args["verifier_env"])) + + reuse_timestamp = None + if self.work_dir: + details_dir = Path(self.work_dir) / "details" + config_path = details_dir / "config.json" + if config_path.exists(): + return self._resume_job(details_dir) + + if args.get("path"): + config.datasets = [DatasetConfig( + path=Path(args["path"]), + task_names=args.get("task_names"), + exclude_task_names=args.get("exclude_task_names"), + n_tasks=args.get("n_tasks"), + )] + elif args.get("dataset_name_version"): + name = args["dataset_name_version"] + version = None + if "@" in name: + name, version = name.split("@", 1) + config.datasets = [ + DatasetConfig( + name=name, + version=version, + task_names=args.get("task_names"), + exclude_task_names=args.get("exclude_task_names"), + n_tasks=args.get("n_tasks"), + ) + ] + + self.logger.info(f"Harbor Job Config: {config}") + + total_tasks = self._get_task_count(config) + if args.get("n_attempts", 1) > 1: + total_tasks *= args["n_attempts"] + + return self._run_with_tqdm(config, total_tasks) + + def _get_task_count(self, config) -> int: + from harbor.cli.utils import run_async + + async def _count(): + count = 0 + for dataset_config in config.datasets: + task_configs = await dataset_config.get_task_configs( + disable_verification=config.verifier.disable + ) + count += len(task_configs) + return count + + return run_async(_count()) + + def _resume_job(self, job_path): + from harbor.job import Job + from harbor.cli.utils import run_async + + async def _resume(): + job_dir = Path(job_path) + config_path = job_dir / "config.json" + if not config_path.exists(): + raise ValueError(f"Config file not found: {config_path}") + from harbor.models.job.config import JobConfig + config = JobConfig.model_validate_json(config_path.read_text()) + job = await Job.create(config) + return job, await job.run() + + return run_async(_resume()) + + def _run_with_tqdm(self, config, total_tasks): + from harbor.job import Job + from harbor.cli.utils import run_async + + pbar = tqdm(total=total_tasks, desc="Running Harbor Job", unit="task") + completed = 0 + + if self.task_state_manager: + self.task_state_manager.update_task_state({ + "status": "running", + "total_count": total_tasks, + "progress_description": "Running Harbor Job", + "finish_count": 0, + }) + + def monitor_progress(): + nonlocal completed + while True: + if self.job and self.job.job_dir: + trial_count = len(list(self.job.job_dir.glob("trial_*"))) + if trial_count > completed: + pbar.update(trial_count - completed) + completed = trial_count + if self.task_state_manager: + self.task_state_manager.update_task_state({ + "finish_count": completed, + }) + time.sleep(0.5) + if completed >= total_tasks: + pbar.update(total_tasks - pbar.n) + pbar.close() + break + + monitor_thread = threading.Thread(target=monitor_progress, daemon=True) + monitor_thread.start() + + def _handle_sigterm(signum, frame): + raise KeyboardInterrupt + + signal.signal(signal.SIGTERM, _handle_sigterm) + + try: + async def _run_job(): + job = await Job.create(config) + return job, await job.run() + + self.job, self.job_result = run_async(_run_job()) + self.logger.info("Harbor job completed, waiting for monitor thread...") + monitor_thread.join(timeout=5) + self.logger.info("Monitor thread joined") + finally: + pbar.close() + if self.task_state_manager: + self.task_state_manager.update_task_state({ + "finish_count": total_tasks, + }) + + return self.job, self.job_result + + def _dump_eval_results(self, job, job_result): + args = self.cfg["datasets"][0][0]["args"] + task_abbr = self.cfg["datasets"][0][0]["abbr"] + + if job_result is None: + self.logger.error(UTILS_CODES.UNKNOWN_ERROR, "No job result captured.") + return + + out_json = osp.join(self.out_dir, f"{task_abbr}.json") + + total_count = self._get_task_count(job.config) + all_rewards = [] + n_errors = 0 + reward_distribution = {} + exception_distribution = {} + + for trial_result in job_result.trial_results or []: + if trial_result.exception_info is not None: + n_errors += 1 + exc_type = trial_result.exception_info.exception_type + exception_distribution[exc_type] = exception_distribution.get(exc_type, 0) + 1 + elif trial_result.verifier_result and trial_result.verifier_result.rewards: + for key, value in trial_result.verifier_result.rewards.items(): + all_rewards.append(value) + score_key = str(value) + reward_distribution[score_key] = reward_distribution.get(score_key, 0) + 1 + + total_reward = sum(all_rewards) if all_rewards else 0.0 + avg_reward = (total_reward / len(all_rewards)) if all_rewards else 0.0 + + pass_at_k = {} + if job_result.stats and job_result.stats.evals: + for evals_key, eval_stats in job_result.stats.evals.items(): + if eval_stats.pass_at_k: + pass_at_k = eval_stats.pass_at_k + break + + results = { + "total_count": total_count, + "n_errors": n_errors, + "avg_score": round(avg_reward, 4), + "reward_distribution": [{"score": float(k), "count": v} for k, v in sorted(reward_distribution.items(), key=lambda x: float(x[0]), reverse=True)], + "exception_distribution": [{"exception_type": k, "count": v} for k, v in sorted(exception_distribution.items(), key=lambda x: x[1], reverse=True)], + "n_total_trials": job_result.n_total_trials, + "pass_at_k": pass_at_k, + } + + with open(out_json, "w") as f: + json.dump(results, f, indent=4) + + self.logger.info(f"Evaluation results saved to {out_json}") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Harbor Benchmark Task") + parser.add_argument("config", help="Config file path") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join(HarborTask.log_subdir, f"{task_abbr_from_cfg(cfg)}.out"), + } + ) + + start_time = time.perf_counter() + try: + inferencer = HarborTask(cfg) + inferencer.run(task_state_manager) + except Exception as e: + task_state_manager.update_task_state({"status": "error"}) + raise e + + end_time = time.perf_counter() + logger.info(f"Harbor benchmark task time elapsed: {end_time - start_time:.2f}s") + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() \ No newline at end of file diff --git a/ais_bench/benchmark/tasks/custom_tasks/tau2_bench_task.py b/ais_bench/benchmark/tasks/custom_tasks/tau2_bench_task.py index 6c8da3ae..00ef4dfe 100644 --- a/ais_bench/benchmark/tasks/custom_tasks/tau2_bench_task.py +++ b/ais_bench/benchmark/tasks/custom_tasks/tau2_bench_task.py @@ -252,7 +252,7 @@ def parse_args(): if __name__ == '__main__': - logger = AISLogger(__name__) + logger = AISLogger() args = parse_args() cfg = Config.fromfile(args.config) task_state_manager = TaskStateManager( diff --git a/ais_bench/configs/agent_example/harbor_terminal_bench_2_task.py b/ais_bench/configs/agent_example/harbor_terminal_bench_2_task.py new file mode 100644 index 00000000..0fa27e63 --- /dev/null +++ b/ais_bench/configs/agent_example/harbor_terminal_bench_2_task.py @@ -0,0 +1,84 @@ +from mmengine.config import read_base +from ais_bench.benchmark.tasks.custom_tasks.harbor_task import HarborTask +from ais_bench.benchmark.tasks.base import EmptyTask +from ais_bench.benchmark.summarizers.harbor import HarborSummarizer + +with read_base(): + from ais_bench.benchmark.configs.summarizers.example import summarizer + +models = [ + dict( + abbr="terminus-2", + agent_name="terminus-2", # -a/--agent: Agent名称 (terminus-2, claude-code, openhands等) + model_names=["hosted_vllm/qwen3"], # -m/--model: 模型名称, hosted_vllm/{模型名称} + agent_kwargs={ # --ak/--agent-kwarg: Agent额外参数 + "api_base": "http://0.0.0.0:8080/v1", # terminus-2需要api_base连接推理服务,例如填"http://0.0.0.0:8080/v1"会访问"http://0.0.0.0:8080/v1/chat/completions" + "model_info": { # 模型token限制和成本信息 + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + }, + }, + agent_env=None, # --ae/--agent-env: 传递给agent的环境变量 + ) +] + +datasets = [] + +sub_tasks = ["terminal-bench-2"] +for task in sub_tasks: + datasets.append( + dict( + abbr=f'harbor_{task}', + args=dict( + n_attempts=1, # -k/--n-attempts: 每个trial的尝试次数 + timeout_multiplier=1.0, # --timeout-multiplier: 超时倍数(所有超时乘以此系数) + agent_timeout_multiplier=None, # --agent-timeout-multiplier: Agent执行超时倍数(覆盖timeout-multiplier) + verifier_timeout_multiplier=None, # --verifier-timeout-multiplier: 验证器超时倍数 + agent_setup_timeout_multiplier=None, # --agent-setup-timeout-multiplier: Agent设置超时倍数 + environment_build_timeout_multiplier=None, # --environment-build-timeout-multiplier: 环境构建超时倍数 + debug=False, # --debug: 启用调试日志 + n_concurrent_trials=5, # -n/--n-concurrent: 并发运行的trial数量 + quiet=False, # -q/--quiet: 静默模式 + max_retries=0, # -r/--max-retries: 最大重试次数 + retry_include_exceptions=None, # --retry-include: 需要重试的异常类型列表 + retry_exclude_exceptions=[ # --retry-exclude: 不需要重试的异常类型列表 + # "AgentTimeoutError", + # "VerifierTimeoutError", + # "RewardFileNotFoundError", + "RewardFileEmptyError", + "VerifierOutputParseError", + ], + environment_type="docker", # -e/--env: 环境类型 (docker, daytona, e2b, modal) + environment_force_build=False, # --force-build/--no-force-build: 是否强制重建环境 + environment_delete=False, # --delete/--no-delete: 完成后是否删除环境 + path="/path/to/terminal-bench-2/", # -p/--path: 本地数据集路径 + dataset_name_version=None, # -d/--dataset: 远程数据集名称@版本 + task_names=None, # --include-task-name: 包含的任务名称(支持glob模式)例如 ["task_name1", "task_name2"] + exclude_task_names=None, # --exclude-task-name: 排除的任务名称 + n_tasks=None, # --n-tasks: 最大任务数量 + disable_verification=False, # --disable-verification: 禁用验证器 + verifier_env=None, # --ve/--verifier-env: 验证器环境变量 + yes=True, # -y/--yes: 自动确认环境变量提示 + env_file=None, # --env-file: .env文件路径 + ), + ) + ) + +infer = dict( + runner=dict( + task=dict(type=EmptyTask) + ), +) + +eval = dict( + runner=dict( + task=dict(type=HarborTask) + ), +) + +summarizer = dict( + attr="accuracy", + type=HarborSummarizer, +) \ No newline at end of file From c009a488e0ddb7706eea7038c6f0cef3e02de2e2 Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Sat, 30 May 2026 10:12:45 +0800 Subject: [PATCH 3/7] review fix --- .../tasks/custom_tasks/harbor_task.py | 145 +++++++++--------- 1 file changed, 69 insertions(+), 76 deletions(-) diff --git a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py index e2700028..3ac1d53b 100644 --- a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py +++ b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py @@ -4,7 +4,6 @@ import os import os.path as osp import re -import shutil import signal import sys import threading @@ -26,28 +25,6 @@ DEFAULT_FAKE_API_KEY = "fake_api_key" -def parse_kwargs(kwargs_list: list[str] | None) -> dict[str, Any]: - if not kwargs_list: - return {} - result = {} - for kwarg in kwargs_list: - if "=" in kwarg: - key, value = kwarg.split("=", 1) - result[key] = value - return result - - -def parse_env_vars(env_list: list[str] | None) -> dict[str, str]: - if not env_list: - return {} - result = {} - for env_var in env_list: - if "=" in env_var: - key, value = env_var.split("=", 1) - result[key] = value - return result - - @TASKS.register_module() class HarborTask(BaseTask): name_prefix = "HarborTask" @@ -79,19 +56,20 @@ def run(self, task_state_manager: TaskStateManager): self._dump_eval_results(job, job_result) def _set_api_key(self): - api_key = self.cfg["models"][0].get("api_key") + api_key = self.model_cfg.get("api_key") if api_key is None: api_key = DEFAULT_FAKE_API_KEY os.environ["OPENAI_API_KEY"] = api_key def _prepare_out_dir(self): + dataset_cfg = self.dataset_cfgs[0] self.out_dir = osp.join( - self.work_dir, self.output_subdir, self.cfg["models"][0]["abbr"] + self.work_dir, self.output_subdir, self.model_cfg["abbr"] ) - mkdir_or_exist(osp.join(self.out_dir, self.cfg["datasets"][0][0]["abbr"])) + mkdir_or_exist(osp.join(self.out_dir, dataset_cfg["abbr"])) self.out_detail_dir = osp.join( self.out_dir, - self.cfg["datasets"][0][0]["abbr"], + dataset_cfg["abbr"], ) mkdir_or_exist(Path(self.out_detail_dir)) @@ -106,7 +84,8 @@ def _run_harbor_job(self): from harbor.models.agent.name import AgentName from harbor.models.environment_type import EnvironmentType - args = self.cfg["datasets"][0][0]["args"] + dataset_cfg = self.dataset_cfgs[0] + args = dataset_cfg.get("args") or {} config = JobConfig() @@ -139,12 +118,11 @@ def _run_harbor_job(self): if args.get("retry_exclude_exceptions"): config.retry.exclude_exceptions = set(args["retry_exclude_exceptions"]) - agent_config = self.cfg["models"][0] - agent_kwargs = agent_config.get("agent_kwargs") or {} - agent_env = agent_config.get("agent_env") or {} + agent_kwargs = self.model_cfg.get("agent_kwargs") or {} + agent_env = self.model_cfg.get("agent_env") or {} - agent_name = AgentName(agent_config.get("agent_name", "oracle")) - model_names = agent_config.get("model_names") + agent_name = AgentName(self.model_cfg.get("agent_name", "oracle")) + model_names = self.model_cfg.get("model_names") if model_names: config.agents = [ AgentConfig( @@ -174,22 +152,24 @@ def _run_harbor_job(self): if args.get("disable_verification"): config.verifier.disable = True if args.get("verifier_env"): - config.verifier.env.update(parse_env_vars(args["verifier_env"])) + env_list = args["verifier_env"] + if isinstance(env_list, list): + config.verifier.env.update({k: v for k, v in (e.split("=", 1) for e in env_list if "=" in e)}) - reuse_timestamp = None - if self.work_dir: - details_dir = Path(self.work_dir) / "details" - config_path = details_dir / "config.json" - if config_path.exists(): - return self._resume_job(details_dir) + details_dir = Path(self.work_dir) / "details" + config_path = details_dir / "config.json" + if config_path.exists(): + return self._resume_job(details_dir) if args.get("path"): - config.datasets = [DatasetConfig( - path=Path(args["path"]), - task_names=args.get("task_names"), - exclude_task_names=args.get("exclude_task_names"), - n_tasks=args.get("n_tasks"), - )] + config.datasets = [ + DatasetConfig( + path=Path(args["path"]), + task_names=args.get("task_names"), + exclude_task_names=args.get("exclude_task_names"), + n_tasks=args.get("n_tasks"), + ) + ] elif args.get("dataset_name_version"): name = args["dataset_name_version"] version = None @@ -228,8 +208,8 @@ async def _count(): return run_async(_count()) def _resume_job(self, job_path): - from harbor.job import Job from harbor.cli.utils import run_async + from harbor.job import Job async def _resume(): job_dir = Path(job_path) @@ -244,67 +224,68 @@ async def _resume(): return run_async(_resume()) def _run_with_tqdm(self, config, total_tasks): - from harbor.job import Job from harbor.cli.utils import run_async + from harbor.job import Job pbar = tqdm(total=total_tasks, desc="Running Harbor Job", unit="task") completed = 0 + stop_event = threading.Event() if self.task_state_manager: - self.task_state_manager.update_task_state({ - "status": "running", - "total_count": total_tasks, - "progress_description": "Running Harbor Job", - "finish_count": 0, - }) + self.task_state_manager.update_task_state( + { + "status": "running", + "total_count": total_tasks, + "progress_description": "Running Harbor Job", + "finish_count": 0, + } + ) def monitor_progress(): nonlocal completed - while True: + while not stop_event.is_set(): if self.job and self.job.job_dir: trial_count = len(list(self.job.job_dir.glob("trial_*"))) if trial_count > completed: pbar.update(trial_count - completed) completed = trial_count if self.task_state_manager: - self.task_state_manager.update_task_state({ - "finish_count": completed, - }) - time.sleep(0.5) - if completed >= total_tasks: - pbar.update(total_tasks - pbar.n) - pbar.close() - break + self.task_state_manager.update_task_state( + {"finish_count": completed} + ) + stop_event.wait(0.5) + pbar.close() monitor_thread = threading.Thread(target=monitor_progress, daemon=True) monitor_thread.start() def _handle_sigterm(signum, frame): + stop_event.set() raise KeyboardInterrupt signal.signal(signal.SIGTERM, _handle_sigterm) try: + async def _run_job(): job = await Job.create(config) return job, await job.run() self.job, self.job_result = run_async(_run_job()) - self.logger.info("Harbor job completed, waiting for monitor thread...") - monitor_thread.join(timeout=5) - self.logger.info("Monitor thread joined") finally: + stop_event.set() + monitor_thread.join(timeout=5) pbar.close() if self.task_state_manager: - self.task_state_manager.update_task_state({ - "finish_count": total_tasks, - }) + self.task_state_manager.update_task_state( + {"finish_count": total_tasks} + ) return self.job, self.job_result def _dump_eval_results(self, job, job_result): - args = self.cfg["datasets"][0][0]["args"] - task_abbr = self.cfg["datasets"][0][0]["abbr"] + dataset_cfg = self.dataset_cfgs[0] + task_abbr = dataset_cfg["abbr"] if job_result is None: self.logger.error(UTILS_CODES.UNKNOWN_ERROR, "No job result captured.") @@ -343,8 +324,18 @@ def _dump_eval_results(self, job, job_result): "total_count": total_count, "n_errors": n_errors, "avg_score": round(avg_reward, 4), - "reward_distribution": [{"score": float(k), "count": v} for k, v in sorted(reward_distribution.items(), key=lambda x: float(x[0]), reverse=True)], - "exception_distribution": [{"exception_type": k, "count": v} for k, v in sorted(exception_distribution.items(), key=lambda x: x[1], reverse=True)], + "reward_distribution": [ + {"score": float(k), "count": v} + for k, v in sorted( + reward_distribution.items(), key=lambda x: float(x[0]), reverse=True + ) + ], + "exception_distribution": [ + {"exception_type": k, "count": v} + for k, v in sorted( + exception_distribution.items(), key=lambda x: x[1], reverse=True + ) + ], "n_total_trials": job_result.n_total_trials, "pass_at_k": pass_at_k, } @@ -363,7 +354,7 @@ def parse_args(): if __name__ == "__main__": - logger = AISLogger() + logger = AISLogger(__name__) args = parse_args() cfg = Config.fromfile(args.config) @@ -379,7 +370,9 @@ def parse_args(): task_state_manager.update_task_state( { "status": "start", - "task_log_path": os.path.join(HarborTask.log_subdir, f"{task_abbr_from_cfg(cfg)}.out"), + "task_log_path": os.path.join( + HarborTask.log_subdir, f"{task_abbr_from_cfg(cfg)}.out" + ), } ) @@ -394,4 +387,4 @@ def parse_args(): end_time = time.perf_counter() logger.info(f"Harbor benchmark task time elapsed: {end_time - start_time:.2f}s") task_state_manager.update_task_state({"status": "finish"}) - manager_t.join() \ No newline at end of file + manager_t.join() From 52e62ce83a6add9ed851a0eb4f5ce1c2651cf7ac Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Sat, 30 May 2026 10:16:37 +0800 Subject: [PATCH 4/7] add harbor dependencies --- requirements/datasets/harbor.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements/datasets/harbor.txt diff --git a/requirements/datasets/harbor.txt b/requirements/datasets/harbor.txt new file mode 100644 index 00000000..0361dad2 --- /dev/null +++ b/requirements/datasets/harbor.txt @@ -0,0 +1 @@ +harbor==0.6.1 \ No newline at end of file From 5959db1f2466ca39e8cdb83d03dd7ae6313e66ef Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Sat, 30 May 2026 14:07:53 +0800 Subject: [PATCH 5/7] fix total score --- ais_bench/benchmark/tasks/custom_tasks/harbor_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py index 3ac1d53b..f7e0813f 100644 --- a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py +++ b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py @@ -311,7 +311,7 @@ def _dump_eval_results(self, job, job_result): reward_distribution[score_key] = reward_distribution.get(score_key, 0) + 1 total_reward = sum(all_rewards) if all_rewards else 0.0 - avg_reward = (total_reward / len(all_rewards)) if all_rewards else 0.0 + avg_reward = (total_reward / total_count) if total_count > 0 else 0.0 pass_at_k = {} if job_result.stats and job_result.stats.evals: From 37f248c6de0a416f9f157ca21550555b0bcb5411 Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Sat, 30 May 2026 14:20:24 +0800 Subject: [PATCH 6/7] fix total score --- ais_bench/benchmark/tasks/custom_tasks/harbor_task.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py index f7e0813f..9d5f0baf 100644 --- a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py +++ b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py @@ -156,10 +156,10 @@ def _run_harbor_job(self): if isinstance(env_list, list): config.verifier.env.update({k: v for k, v in (e.split("=", 1) for e in env_list if "=" in e)}) - details_dir = Path(self.work_dir) / "details" - config_path = details_dir / "config.json" + existing_job_dir = Path(self.out_detail_dir) / config.job_name + config_path = existing_job_dir / "config.json" if config_path.exists(): - return self._resume_job(details_dir) + return self._resume_job(existing_job_dir) if args.get("path"): config.datasets = [ @@ -218,6 +218,9 @@ async def _resume(): raise ValueError(f"Config file not found: {config_path}") from harbor.models.job.config import JobConfig config = JobConfig.model_validate_json(config_path.read_text()) + self.logger.info(f"Resuming job from {job_dir}") + self.logger.info(f"Config jobs_dir: {config.jobs_dir}, job_name: {config.job_name}") + self.logger.info(f"Expected job_dir: {config.jobs_dir / config.job_name}") job = await Job.create(config) return job, await job.run() From f340630a533f34dec2843edef0e55aa53ad6decc Mon Sep 17 00:00:00 2001 From: SJTUyh Date: Sat, 30 May 2026 14:39:10 +0800 Subject: [PATCH 7/7] fix total score --- ais_bench/benchmark/tasks/custom_tasks/harbor_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py index 9d5f0baf..f5e18809 100644 --- a/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py +++ b/ais_bench/benchmark/tasks/custom_tasks/harbor_task.py @@ -314,7 +314,7 @@ def _dump_eval_results(self, job, job_result): reward_distribution[score_key] = reward_distribution.get(score_key, 0) + 1 total_reward = sum(all_rewards) if all_rewards else 0.0 - avg_reward = (total_reward / total_count) if total_count > 0 else 0.0 + avg_reward = (total_reward / job_result.n_total_trials) if job_result.n_total_trials > 0 else 0.0 pass_at_k = {} if job_result.stats and job_result.stats.evals: