# flake8: noqa
# yapf: disable
import csv
import functools
import os.path as osp
from datetime import datetime
from typing import Any, Dict, List

import mmengine
import tabulate
from mmengine import ConfigDict

from ais_bench.benchmark.summarizers.default import DefaultSummarizer
from ais_bench.benchmark.utils.logging.logger import AISLogger
from ais_bench.benchmark.utils.core.abbr import dataset_abbr_from_cfg, model_abbr_from_cfg


METRIC_WHITELIST = ['avg_score', 'score', 'accuracy', 'n_errors', 'n_total_trials']
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'type', 'reward_distribution', 'exception_distribution', 'pass_at_k', 'details']

# Metrics that never get a table row of their own; `total_count` is rendered
# as a dedicated column instead.
_HIDDEN_ROW_METRICS = ('correct_count', 'total_count')
# Fixed value written to the 'version' column of every dataset row.
_DATASET_VERSION = 'a39421'


def _metric_rank(metric):
    """Return the sort rank of *metric*: whitelist position, or last if unlisted."""
    if metric in METRIC_WHITELIST:
        return METRIC_WHITELIST.index(metric)
    return len(METRIC_WHITELIST)


def _pass_at_k_sort_key(item):
    """Order pass@k entries numerically by k.

    Keys loaded from JSON are strings, so a plain sort would give
    "1", "10", "2". Non-numeric keys sort after the numeric ones.
    """
    k = item[0]
    try:
        return (0, int(k), '')
    except (TypeError, ValueError):
        return (1, 0, str(k))


class HarborSummarizer(DefaultSummarizer):
    """Summarizer for Harbor benchmark results."""

    def _pick_up_results(self):
        """Load ``results/<model>/<dataset>.json`` for every model/dataset pair.

        Returns:
            A 4-tuple ``(raw_results, parsed_results, dataset_metrics,
            dataset_eval_mode)``. ``raw_results`` holds the loaded JSON
            unchanged; ``parsed_results`` keeps only the non-blacklisted
            int/float/str entries; ``dataset_metrics`` lists those metric
            names with whitelisted ones first; ``dataset_eval_mode`` is
            always ``'gen'``.
        """
        raw_results: Dict[str, Dict[str, Any]] = {}
        parsed_results: Dict[str, Dict[str, Dict[str, float]]] = {}
        dataset_metrics: Dict[str, List[str]] = {}
        dataset_eval_mode: Dict[str, str] = {}

        for model in self.model_cfgs:
            model_abbr = model_abbr_from_cfg(model)
            parsed_results.setdefault(model_abbr, {})
            raw_results.setdefault(model_abbr, {})
            for dataset in self.dataset_cfgs:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                filepath = osp.join(self.work_dir, 'results', model_abbr, f'{dataset_abbr}.json')
                if not osp.exists(filepath):
                    continue

                result = mmengine.load(filepath)
                raw_results[model_abbr][dataset_abbr] = result

                # Only flat values can be shown in a table cell; nested
                # structures are printed by _print_harbor_details instead.
                scalars = {
                    metric: score
                    for metric, score in result.items()
                    if metric not in METRIC_BLACKLIST and isinstance(score, (int, float, str))
                }
                if not scalars:
                    continue

                # sorted() is stable, so unlisted metrics keep file order.
                dataset_metrics[dataset_abbr] = sorted(scalars, key=_metric_rank)
                parsed_results[model_abbr][dataset_abbr] = scalars
                dataset_eval_mode[dataset_abbr] = 'gen'

        return raw_results, parsed_results, dataset_metrics, dataset_eval_mode

    def _print_harbor_details(self, raw_results: Dict[str, Dict[str, Any]]):
        """Print reward / exception / pass@k breakdowns for each result file.

        Results that contain neither ``reward_distribution`` nor
        ``exception_distribution`` are skipped.
        """
        for model_abbr in self.model_abbrs:
            for dataset_abbr, result in raw_results.get(model_abbr, {}).items():
                if 'reward_distribution' not in result and 'exception_distribution' not in result:
                    continue

                print('')
                print('=' * 60)
                print(f'Dataset: {dataset_abbr}')
                print(f'Model: {model_abbr}')
                print('=' * 60)

                for key, label in (('total_count', 'Total Count'),
                                   ('n_errors', 'Errors'),
                                   ('avg_score', 'Avg Score')):
                    if key in result:
                        print(f'{label}: {result[key]}')

                if result.get('reward_distribution'):
                    print('\nReward Distribution:')
                    table_data = [[item['score'], item['count']] for item in result['reward_distribution']]
                    print(tabulate.tabulate(table_data, headers=['Score', 'Count'], tablefmt='grid'))

                if result.get('exception_distribution'):
                    print('\nException Distribution:')
                    table_data = [[item['exception_type'], item['count']] for item in result['exception_distribution']]
                    print(tabulate.tabulate(table_data, headers=['Exception', 'Count'], tablefmt='grid'))

                if result.get('pass_at_k'):
                    print('\nPass@k:')
                    ordered = sorted(result['pass_at_k'].items(), key=_pass_at_k_sort_key)
                    table_data = [[k, f'{v:.4f}'] for k, v in ordered]
                    print(tabulate.tabulate(table_data, headers=['k', 'Pass Rate'], tablefmt='grid'))

                print('')

    def _build_table(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode):
        """Build the summary table: a header row followed by one row per metric.

        Every row carries one score cell per entry of ``self.model_abbrs``
        (``'-'`` where a model has no value), so rows always line up with the
        header regardless of how many models are summarized.
        """
        has_total_count = any('total_count' in metrics for metrics in dataset_metrics.values())

        header = ['dataset', 'version', 'metric', 'mode']
        if has_total_count:
            header.append('total_count')
        table = [header + list(self.model_abbrs)]

        for dataset_abbr in self.dataset_abbrs:
            per_model = [parsed_results.get(m, {}).get(dataset_abbr, {}) for m in self.model_abbrs]
            if dataset_abbr in dataset_metrics:
                metrics = [m for m in dataset_metrics[dataset_abbr] if m not in _HIDDEN_ROW_METRICS]
            else:
                # No recorded metric order: fall back to whatever keys the
                # parsed results hold, first-seen order, without duplicates.
                metrics = list(dict.fromkeys(k for scores in per_model for k in scores))

            # total_count comes from the first model that has a result file.
            total_count = '-'
            for model_abbr in self.model_abbrs:
                raw = raw_results.get(model_abbr, {}).get(dataset_abbr)
                if raw is not None:
                    total_count = raw.get('total_count', '-')
                    break

            for metric in metrics:
                if not any(metric in scores for scores in per_model):
                    continue
                row = [dataset_abbr, _DATASET_VERSION, metric, dataset_eval_mode.get(dataset_abbr, 'gen')]
                if has_total_count:
                    row.append(total_count)
                row.extend(scores.get(metric, '-') for scores in per_model)
                table.append(row)

        for sg in self.summary_groups:
            name = sg['name']
            # One row per group (not one per model) if any model reports it.
            if not any(name in parsed_results.get(m, {}) for m in self.model_abbrs):
                continue
            metric = sg.get('metric', 'naive_average')
            row = [name, sg.get('version', '-'), metric, dataset_eval_mode.get(name, 'gen')]
            if has_total_count:
                row.append(len(sg['subsets']))
            row.extend(parsed_results.get(m, {}).get(name, {}).get(metric, '-') for m in self.model_abbrs)
            table.append(row)

        return table

    def summarize(self, time_str=None, subjective_scores=None, dataset_score_container=None, required_dataset_abbrs=None):
        """Print the Harbor summary and write it to ``<work_dir>/summary``.

        Args:
            time_str: Suffix for the output file names; defaults to the
                current local time formatted as ``%Y%m%d_%H%M%S``.
            subjective_scores: Unused by this method.
            dataset_score_container: Unused by this method.
            required_dataset_abbrs: Unused by this method.

        Returns:
            The ``parsed_results`` mapping produced by ``_pick_up_results``.
        """
        self._update_dataset_abbrs()
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()

        self._print_harbor_details(raw_results)

        table = self._build_table(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)
        rendered = tabulate.tabulate(table[1:], headers=table[0], tablefmt='grid')

        print('')
        print(rendered)
        print('')

        summary_dir = osp.join(self.work_dir, 'summary')
        mmengine.mkdir_or_exist(summary_dir)

        time_str = time_str or datetime.now().strftime('%Y%m%d_%H%M%S')
        summary_txt = osp.join(summary_dir, f'summary_{time_str}.txt')
        summary_csv = osp.join(summary_dir, f'summary_{time_str}.csv')

        print(f'write summary to {summary_txt}')
        with open(summary_txt, 'w', encoding='utf-8') as f:
            f.write(rendered)

        print(f'write csv to {summary_csv}')
        # tabulate has no 'csv' format (unknown names fall back to 'simple'),
        # so the csv module is used to emit real comma-separated output.
        with open(summary_csv, 'w', encoding='utf-8', newline='') as out:
            csv.writer(out).writerows(table)

        return parsed_results
import argparse
import copy
import json
import os
import os.path as osp
import re
import signal
import sys
import threading
import time
from collections import Counter
from pathlib import Path
from typing import Any

from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from tqdm import tqdm

from ais_bench.benchmark.registry import TASKS
from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager
from ais_bench.benchmark.utils.core.abbr import task_abbr_from_cfg
from ais_bench.benchmark.utils.logging import AISLogger
from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError
from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES

DEFAULT_FAKE_API_KEY = "fake_api_key"

# Dataset ``args`` keys copied verbatim onto the JobConfig attribute of the
# same name when they hold a truthy value.
_JOB_CONFIG_OPTIONS = (
    "n_attempts",
    "timeout_multiplier",
    "agent_timeout_multiplier",
    "verifier_timeout_multiplier",
    "agent_setup_timeout_multiplier",
    "environment_build_timeout_multiplier",
    "debug",
    "n_concurrent_trials",
    "quiet",
)


@TASKS.register_module()
class HarborTask(BaseTask):
    """Evaluation task that runs a Harbor job and stores aggregated results."""

    name_prefix = "HarborTask"
    log_subdir = "logs/eval"
    output_subdir = "results"

    def __init__(self, cfg: ConfigDict) -> None:
        super().__init__(cfg)
        self.captured_metrics = None
        self.job_dir = None
        self.job_result = None
        self.job = None

    def get_command(self, cfg_path, template) -> str:
        """Return the shell command that runs this module as a script on *cfg_path*."""
        sys.path.append(os.getcwd())
        script_path = __file__
        python = sys.executable
        return f"{python} {script_path} {cfg_path}"

    def run(self, task_state_manager: TaskStateManager):
        """Run (or resume) the Harbor job and dump the aggregated results."""
        self.logger.info(f"Task {task_abbr_from_cfg(self.cfg)}")
        self.task_state_manager = task_state_manager

        self._set_api_key()
        self._prepare_out_dir()

        job, job_result = self._run_harbor_job()

        self._dump_eval_results(job, job_result)

    def _set_api_key(self):
        """Export ``OPENAI_API_KEY`` for the agent process.

        A key configured on the model always wins. Without one, the
        placeholder is used only if the variable is not already set, so a
        key exported by the user is not replaced by the fake value.
        """
        api_key = self.model_cfg.get("api_key")
        if api_key is not None:
            os.environ["OPENAI_API_KEY"] = api_key
        else:
            os.environ.setdefault("OPENAI_API_KEY", DEFAULT_FAKE_API_KEY)

    def _prepare_out_dir(self):
        """Create ``<work_dir>/results/<model>/<dataset>`` and remember both paths."""
        dataset_cfg = self.dataset_cfgs[0]
        self.out_dir = osp.join(
            self.work_dir, self.output_subdir, self.model_cfg["abbr"]
        )
        self.out_detail_dir = osp.join(self.out_dir, dataset_cfg["abbr"])
        mkdir_or_exist(self.out_detail_dir)

    def _run_harbor_job(self):
        """Build a Harbor ``JobConfig`` from the configs and run the job.

        If ``<out_detail_dir>/details/config.json`` already exists the job is
        resumed from that file instead of being started from scratch.

        Returns:
            ``(job, job_result)`` as produced by the run or resume path.
        """
        from harbor.models.job.config import (
            AgentConfig,
            DatasetConfig,
            JobConfig,
        )
        from harbor.models.agent.name import AgentName
        from harbor.models.environment_type import EnvironmentType

        dataset_cfg = self.dataset_cfgs[0]
        args = dataset_cfg.get("args") or {}

        config = JobConfig()
        config.job_name = "details"
        config.jobs_dir = Path(self.out_detail_dir)

        # Falsy values (None, 0, False, "") leave the JobConfig default as is.
        for option in _JOB_CONFIG_OPTIONS:
            value = args.get(option)
            if value:
                setattr(config, option, value)

        if args.get("max_retries"):
            config.retry.max_retries = args["max_retries"]
        if args.get("retry_include_exceptions"):
            config.retry.include_exceptions = set(args["retry_include_exceptions"])
        if args.get("retry_exclude_exceptions"):
            config.retry.exclude_exceptions = set(args["retry_exclude_exceptions"])

        agent_kwargs = self.model_cfg.get("agent_kwargs") or {}
        agent_env = self.model_cfg.get("agent_env") or {}
        agent_name = AgentName(self.model_cfg.get("agent_name", "oracle"))
        model_names = self.model_cfg.get("model_names")
        if model_names:
            config.agents = [
                AgentConfig(
                    name=agent_name,
                    model_name=model_name,
                    kwargs=agent_kwargs,
                    env=agent_env,
                )
                for model_name in model_names
            ]
        else:
            config.agents = [
                AgentConfig(
                    name=agent_name,
                    kwargs=agent_kwargs,
                    env=agent_env,
                )
            ]

        if args.get("environment_type"):
            config.environment.type = EnvironmentType(args["environment_type"])
        # These two are real booleans, so only None means "not configured".
        if args.get("environment_force_build") is not None:
            config.environment.force_build = args["environment_force_build"]
        if args.get("environment_delete") is not None:
            config.environment.delete = args["environment_delete"]

        if args.get("disable_verification"):
            config.verifier.disable = True
        env_list = args.get("verifier_env")
        if env_list and isinstance(env_list, list):
            # Entries are "KEY=VALUE"; entries without "=" are ignored.
            config.verifier.env.update(
                dict(entry.split("=", 1) for entry in env_list if "=" in entry)
            )

        existing_job_dir = Path(self.out_detail_dir) / config.job_name
        if (existing_job_dir / "config.json").exists():
            return self._resume_job(existing_job_dir)

        # NOTE(review): when neither "path" nor "dataset_name_version" is
        # given, config.datasets keeps its JobConfig default -- confirm
        # whether that should be rejected as a configuration error.
        if args.get("path"):
            config.datasets = [
                DatasetConfig(
                    path=Path(args["path"]),
                    task_names=args.get("task_names"),
                    exclude_task_names=args.get("exclude_task_names"),
                    n_tasks=args.get("n_tasks"),
                )
            ]
        elif args.get("dataset_name_version"):
            name, _, version = args["dataset_name_version"].partition("@")
            config.datasets = [
                DatasetConfig(
                    name=name,
                    version=version or None if "@" in args["dataset_name_version"] else None,
                    task_names=args.get("task_names"),
                    exclude_task_names=args.get("exclude_task_names"),
                    n_tasks=args.get("n_tasks"),
                )
            ] if False else [
                DatasetConfig(
                    name=name,
                    version=(
                        args["dataset_name_version"].split("@", 1)[1]
                        if "@" in args["dataset_name_version"]
                        else None
                    ),
                    task_names=args.get("task_names"),
                    exclude_task_names=args.get("exclude_task_names"),
                    n_tasks=args.get("n_tasks"),
                )
            ]

        self.logger.info(f"Harbor Job Config: {config}")

        total_tasks = self._get_task_count(config)
        # `n_attempts` may be present but None; treat that as a single attempt
        # instead of comparing None with an int.
        n_attempts = args.get("n_attempts") or 1
        if n_attempts > 1:
            total_tasks *= n_attempts

        return self._run_with_tqdm(config, total_tasks)

    def _get_task_count(self, config) -> int:
        """Return the number of task configs across all datasets of *config*."""
        from harbor.cli.utils import run_async

        async def _count():
            count = 0
            for dataset_config in config.datasets:
                task_configs = await dataset_config.get_task_configs(
                    disable_verification=config.verifier.disable
                )
                count += len(task_configs)
            return count

        return run_async(_count())

    def _resume_job(self, job_path):
        """Re-create and run the job described by ``<job_path>/config.json``.

        Raises:
            ValueError: If the config file does not exist.
        """
        from harbor.cli.utils import run_async
        from harbor.job import Job
        from harbor.models.job.config import JobConfig

        async def _resume():
            job_dir = Path(job_path)
            config_path = job_dir / "config.json"
            if not config_path.exists():
                raise ValueError(f"Config file not found: {config_path}")
            config = JobConfig.model_validate_json(config_path.read_text())
            self.logger.info(f"Resuming job from {job_dir}")
            self.logger.info(f"Config jobs_dir: {config.jobs_dir}, job_name: {config.job_name}")
            self.logger.info(f"Expected job_dir: {config.jobs_dir / config.job_name}")
            job = await Job.create(config)
            return job, await job.run()

        return run_async(_resume())

    def _run_with_tqdm(self, config, total_tasks):
        """Run a new job while a background thread reports progress.

        Progress is estimated by counting ``trial_*`` entries in the job
        directory every 0.5 s and is mirrored to the task state manager.

        Returns:
            ``(job, job_result)``; both are also stored on ``self``.
        """
        from harbor.cli.utils import run_async
        from harbor.job import Job

        pbar = tqdm(total=total_tasks, desc="Running Harbor Job", unit="task")
        completed = 0
        stop_event = threading.Event()

        if self.task_state_manager:
            self.task_state_manager.update_task_state(
                {
                    "status": "running",
                    "total_count": total_tasks,
                    "progress_description": "Running Harbor Job",
                    "finish_count": 0,
                }
            )

        def monitor_progress():
            nonlocal completed
            while not stop_event.is_set():
                job = self.job
                if job is not None and job.job_dir:
                    # NOTE(review): assumes Harbor names trial directories
                    # "trial_*" -- confirm against the Harbor job layout.
                    trial_count = len(list(job.job_dir.glob("trial_*")))
                    if trial_count > completed:
                        pbar.update(trial_count - completed)
                        completed = trial_count
                        if self.task_state_manager:
                            self.task_state_manager.update_task_state(
                                {"finish_count": completed}
                            )
                stop_event.wait(0.5)

        monitor_thread = threading.Thread(target=monitor_progress, daemon=True)
        monitor_thread.start()

        def _handle_sigterm(signum, frame):
            stop_event.set()
            raise KeyboardInterrupt

        # signal.signal() may only be called from the main thread; elsewhere
        # the job simply runs without the SIGTERM -> KeyboardInterrupt bridge.
        in_main_thread = threading.current_thread() is threading.main_thread()
        previous_handler = None
        if in_main_thread:
            previous_handler = signal.signal(signal.SIGTERM, _handle_sigterm)

        async def _run_job():
            job = await Job.create(config)
            # Publish the job before running it: the monitor thread reads
            # self.job, so assigning it only after run() returns would leave
            # the progress bar frozen for the whole run.
            self.job = job
            return job, await job.run()

        try:
            self.job, self.job_result = run_async(_run_job())
        finally:
            stop_event.set()
            monitor_thread.join(timeout=5)
            pbar.close()
            # previous_handler is None when the old handler was not installed
            # from Python; there is nothing that can be restored in that case.
            if in_main_thread and previous_handler is not None:
                signal.signal(signal.SIGTERM, previous_handler)
            if self.task_state_manager:
                self.task_state_manager.update_task_state(
                    {"finish_count": total_tasks}
                )

        return self.job, self.job_result

    def _dump_eval_results(self, job, job_result):
        """Aggregate trial results and write them to ``<out_dir>/<dataset>.json``.

        The file contains the task count, the number of trials that raised,
        the average reward over ``job_result.n_total_trials``, reward and
        exception histograms, and the first non-empty pass@k mapping found in
        ``job_result.stats.evals``. Nothing is written when *job_result* is
        ``None``.
        """
        dataset_cfg = self.dataset_cfgs[0]
        task_abbr = dataset_cfg["abbr"]

        if job_result is None:
            self.logger.error(UTILS_CODES.UNKNOWN_ERROR, "No job result captured.")
            return

        out_json = osp.join(self.out_dir, f"{task_abbr}.json")
        total_count = self._get_task_count(job.config)

        # Rewards are bucketed by their float value so that 1 and 1.0 share a
        # bucket instead of producing two entries that both print as 1.0.
        reward_counts = Counter()
        exception_counts = Counter()
        total_reward = 0.0
        for trial_result in job_result.trial_results or []:
            if trial_result.exception_info is not None:
                exception_counts[trial_result.exception_info.exception_type] += 1
            elif trial_result.verifier_result and trial_result.verifier_result.rewards:
                for value in trial_result.verifier_result.rewards.values():
                    total_reward += value
                    reward_counts[float(value)] += 1

        n_total_trials = job_result.n_total_trials
        avg_reward = (total_reward / n_total_trials) if n_total_trials > 0 else 0.0

        pass_at_k = {}
        if job_result.stats and job_result.stats.evals:
            for eval_stats in job_result.stats.evals.values():
                if eval_stats.pass_at_k:
                    pass_at_k = eval_stats.pass_at_k
                    break

        results = {
            "total_count": total_count,
            "n_errors": sum(exception_counts.values()),
            "avg_score": round(avg_reward, 4),
            "reward_distribution": [
                {"score": score, "count": count}
                for score, count in sorted(reward_counts.items(), reverse=True)
            ],
            "exception_distribution": [
                {"exception_type": exc_type, "count": count}
                for exc_type, count in exception_counts.most_common()
            ],
            "n_total_trials": n_total_trials,
            "pass_at_k": pass_at_k,
        }

        with open(out_json, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

        self.logger.info(f"Evaluation results saved to {out_json}")


def parse_args():
    """Parse the command line: a single positional config file path."""
    parser = argparse.ArgumentParser(description="Harbor Benchmark Task")
    parser.add_argument("config", help="Config file path")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    # Constructed without a name, matching the other task entry points
    # (tau2_bench_task uses AISLogger() as well).
    logger = AISLogger()
    args = parse_args()
    cfg = Config.fromfile(args.config)

    task_state_manager = TaskStateManager(
        tmp_path=os.path.join(cfg["work_dir"], "status_tmp"),
        task_name=task_abbr_from_cfg(cfg),
        is_debug=cfg["cli_args"]["debug"],
    )

    manager_t = threading.Thread(target=task_state_manager.launch, args=())
    manager_t.start()

    task_state_manager.update_task_state(
        {
            "status": "start",
            "task_log_path": os.path.join(
                HarborTask.log_subdir, f"{task_abbr_from_cfg(cfg)}.out"
            ),
        }
    )

    start_time = time.perf_counter()
    try:
        inferencer = HarborTask(cfg)
        inferencer.run(task_state_manager)
    except Exception:
        task_state_manager.update_task_state({"status": "error"})
        raise

    end_time = time.perf_counter()
    logger.info(f"Harbor benchmark task time elapsed: {end_time - start_time:.2f}s")
    task_state_manager.update_task_state({"status": "finish"})
    manager_t.join()
# Example configuration: run the Terminal-Bench 2 dataset through Harbor with
# the terminus-2 agent. Inference is a no-op (EmptyTask); all the work happens
# in the eval stage (HarborTask). The trailing comments name the Harbor CLI
# flag each option corresponds to.
from mmengine.config import read_base
from ais_bench.benchmark.tasks.custom_tasks.harbor_task import HarborTask
from ais_bench.benchmark.tasks.base import EmptyTask
from ais_bench.benchmark.summarizers.harbor import HarborSummarizer

with read_base():
    from ais_bench.benchmark.configs.summarizers.example import summarizer

models = [
    dict(
        abbr="terminus-2",
        agent_name="terminus-2",  # -a/--agent: agent name (terminus-2, claude-code, openhands, etc.)
        model_names=["hosted_vllm/qwen3"],  # -m/--model: model name, in the form hosted_vllm/{model name}
        agent_kwargs={  # --ak/--agent-kwarg: extra agent arguments
            "api_base": "http://0.0.0.0:8080/v1",  # terminus-2 needs api_base to reach the inference service; e.g. "http://0.0.0.0:8080/v1" makes it call "http://0.0.0.0:8080/v1/chat/completions"
            "model_info": {  # model token limits and cost information
                "max_input_tokens": 128000,
                "max_output_tokens": 4096,
                "input_cost_per_token": 0.0,
                "output_cost_per_token": 0.0,
            },
        },
        agent_env=None,  # --ae/--agent-env: environment variables passed to the agent
    )
]

datasets = []

sub_tasks = ["terminal-bench-2"]
for task in sub_tasks:
    datasets.append(
        dict(
            abbr=f'harbor_{task}',
            args=dict(
                n_attempts=1,  # -k/--n-attempts: number of attempts per trial
                timeout_multiplier=1.0,  # --timeout-multiplier: timeout multiplier (every timeout is scaled by this factor)
                agent_timeout_multiplier=None,  # --agent-timeout-multiplier: agent execution timeout multiplier (overrides timeout-multiplier)
                verifier_timeout_multiplier=None,  # --verifier-timeout-multiplier: verifier timeout multiplier
                agent_setup_timeout_multiplier=None,  # --agent-setup-timeout-multiplier: agent setup timeout multiplier
                environment_build_timeout_multiplier=None,  # --environment-build-timeout-multiplier: environment build timeout multiplier
                debug=False,  # --debug: enable debug logging
                n_concurrent_trials=5,  # -n/--n-concurrent: number of trials run concurrently
                quiet=False,  # -q/--quiet: quiet mode
                max_retries=0,  # -r/--max-retries: maximum number of retries
                retry_include_exceptions=None,  # --retry-include: exception types that should be retried
                retry_exclude_exceptions=[  # --retry-exclude: exception types that should not be retried
                    # "AgentTimeoutError",
                    # "VerifierTimeoutError",
                    # "RewardFileNotFoundError",
                    "RewardFileEmptyError",
                    "VerifierOutputParseError",
                ],
                environment_type="docker",  # -e/--env: environment type (docker, daytona, e2b, modal)
                environment_force_build=False,  # --force-build/--no-force-build: whether to force an environment rebuild
                environment_delete=False,  # --delete/--no-delete: whether to delete the environment when finished
                path="/path/to/terminal-bench-2/",  # -p/--path: local dataset path
                dataset_name_version=None,  # -d/--dataset: remote dataset as name@version
                task_names=None,  # --include-task-name: task names to include (glob patterns supported), e.g. ["task_name1", "task_name2"]
                exclude_task_names=None,  # --exclude-task-name: task names to exclude
                n_tasks=None,  # --n-tasks: maximum number of tasks
                disable_verification=False,  # --disable-verification: disable the verifier
                verifier_env=None,  # --ve/--verifier-env: verifier environment variables
                yes=True,  # -y/--yes: automatically confirm environment-variable prompts
                env_file=None,  # --env-file: path to a .env file
            ),
        )
    )

# Inference stage is a placeholder; Harbor drives the agent during evaluation.
infer = dict(
    runner=dict(
        task=dict(type=EmptyTask)
    ),
)

eval = dict(
    runner=dict(
        task=dict(type=HarborTask)
    ),
)

# Replaces the `summarizer` imported above with the Harbor-specific one.
summarizer = dict(
    attr="accuracy",
    type=HarborSummarizer,
)