diff --git a/ais_bench/benchmark/configs/datasets/mmlu_pro/README.md b/ais_bench/benchmark/configs/datasets/mmlu_pro/README.md index 1f18821d..c1b88243 100644 --- a/ais_bench/benchmark/configs/datasets/mmlu_pro/README.md +++ b/ais_bench/benchmark/configs/datasets/mmlu_pro/README.md @@ -23,9 +23,9 @@ rm mmlu_pro.zip ``` ## 可用数据集任务 -### mmlu_pro_gen_0_shot_str +### mmlu_pro_gen_0_shot_cot_str #### 基本信息 |任务名称|简介|评估指标|few-shot|prompt格式|对应源码配置文件路径| | --- | --- | --- | --- | --- | --- | -|mmlu_pro_gen_0_shot_str|mmlu-pro数据集生成式任务|pass@1|0-shot|字符串格式|[mmlu_pro_gen_0_shot_str.py](mmlu_pro_gen_0_shot_str.py)| +|mmlu_pro_gen_0_shot_cot_str|mmlu-pro数据集生成式任务|pass@1|0-shot|字符串格式|[mmlu_pro_gen_0_shot_cot_str.py](mmlu_pro_gen_0_shot_cot_str.py)| |mmlu_pro_gen_5_shot_str|mmlu-pro数据集生成式任务|pass@1|0-shot|字符串格式|[mmlu_pro_gen_5_shot_str.py](mmlu_pro_gen_5_shot_str.py)| diff --git a/ais_bench/benchmark/configs/datasets/mmlu_pro/README_en.md b/ais_bench/benchmark/configs/datasets/mmlu_pro/README_en.md index 857b90f0..0c4681d9 100644 --- a/ais_bench/benchmark/configs/datasets/mmlu_pro/README_en.md +++ b/ais_bench/benchmark/configs/datasets/mmlu_pro/README_en.md @@ -23,11 +23,11 @@ rm mmlu_pro.zip ``` ## Available Dataset Tasks -### mmlu_pro_gen_0_shot_str +### mmlu_pro_gen_0_shot_cot_str #### Basic Information | Task Name | Introduction | Evaluation Metric | Few-Shot | Prompt Format | Corresponding Source Code Configuration File Path | | --- | --- | --- | --- | --- | --- | -| mmlu_pro_gen_0_shot_str | Generative task for the mmlu-pro dataset | pass@1 | 0-shot | String format | [mmlu_pro_gen_0_shot_str.py](mmlu_pro_gen_0_shot_str.py) | +| mmlu_pro_gen_0_shot_cot_str | Generative task for the mmlu-pro dataset | pass@1 | 0-shot | String format | [mmlu_pro_gen_0_shot_cot_str.py](mmlu_pro_gen_0_shot_cot_str.py) | | mmlu_pro_gen_5_shot_str | Generative task for the mmlu-pro dataset | pass@1 | 5-shot | String format | [mmlu_pro_gen_5_shot_str.py](mmlu_pro_gen_5_shot_str.py) | diff --git a/ais_bench/benchmark/configs/datasets/mmlu_pro/mmlu_pro_gen_0_shot_str.py b/ais_bench/benchmark/configs/datasets/mmlu_pro/mmlu_pro_gen_0_shot_cot_str.py similarity index 97% rename from ais_bench/benchmark/configs/datasets/mmlu_pro/mmlu_pro_gen_0_shot_str.py rename to ais_bench/benchmark/configs/datasets/mmlu_pro/mmlu_pro_gen_0_shot_cot_str.py index 3b8d3d2c..bb15912b 100644 --- a/ais_bench/benchmark/configs/datasets/mmlu_pro/mmlu_pro_gen_0_shot_str.py +++ b/ais_bench/benchmark/configs/datasets/mmlu_pro/mmlu_pro_gen_0_shot_cot_str.py @@ -44,7 +44,7 @@ evaluator=dict(type=AccEvaluator), pred_postprocessor=dict( type=match_answer_pattern, - answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])') + answer_pattern=r'(?i)\bANSWER\s*:\s*([A-P])\b') ) mmlu_pro_datasets.append( diff --git a/ais_bench/benchmark/datasets/mmlu_pro.py b/ais_bench/benchmark/datasets/mmlu_pro.py index f27f63c7..92b65e6b 100644 --- a/ais_bench/benchmark/datasets/mmlu_pro.py +++ b/ais_bench/benchmark/datasets/mmlu_pro.py @@ -43,12 +43,16 @@ class MMLUProBaseEvaluator(BaseEvaluator): def is_equal(self, pred, refer): try: - refer_option, refer_string = refer.split('. ') + # Handle exact match first + if pred.strip() == refer.strip(): + return True + # Limit split to 1 to avoid ValueError when refer contains multiple '. ' + refer_option, refer_string = refer.split('. ', 1) if pred in CHOICES and refer_option == pred: return True elif refer_string.strip() == pred: return True - else : + else: return False except Exception: pass diff --git a/ais_bench/configs/api_examples/all_dataset_configs.py b/ais_bench/configs/api_examples/all_dataset_configs.py index aa24c823..aa6cfedb 100644 --- a/ais_bench/configs/api_examples/all_dataset_configs.py +++ b/ais_bench/configs/api_examples/all_dataset_configs.py @@ -33,7 +33,7 @@ from ais_bench.benchmark.configs.datasets.mmlu.mmlu_gen_5_shot_str import mmlu_datasets as mmlu_5_shot_str # mmlu_pro - from ais_bench.benchmark.configs.datasets.mmlu_pro.mmlu_pro_gen_0_shot_str import mmlu_pro_datasets as mmlu_pro_0_shot_str + from ais_bench.benchmark.configs.datasets.mmlu_pro.mmlu_pro_gen_0_shot_cot_str import mmlu_pro_datasets as mmlu_pro_0_shot_str from ais_bench.benchmark.configs.datasets.mmlu_pro.mmlu_pro_gen_5_shot_str import mmlu_pro_datasets as mmlu_pro_5_shot_str # boolq