【Test-Cases】: add UT coverage for core datasets (aime, realworldqa, math, gsm8k, gpqa, dapo_math)#315
Conversation
…8k, gpqa, dapo_math) - test_aime2026.py: new file with 4 tests for Aime2026Dataset - test_dapo_math.py: new file with 31 tests covering boxed functions, normalization, extractors, postprocessors and evaluators - test_aime_datasets.py: +4 tests (multi-record, field mapping, JDG class) - test_gsm8k.py: +4 tests (get_action, agent score paths) - test_gpqa.py: +7 tests (evaluator edge cases, postprocess, multi-row) - test_realworldqa.py: +5 tests (colon variants, tabs, dict refs) - test_math.py: +52 tests (boxed edge cases, normalize branches, strip_string v1/v2 paths, is_equiv versions, agent evaluator paths) Total: ~107 new tests, all passing pytest.
There was a problem hiding this comment.
Code Review
This pull request significantly expands the unit test coverage for various dataset loaders, postprocessors, and evaluators, including AIME (2024, 2025, 2026), DAPO Math, GPQA, GSM8K, MATH, and RealworldQA. The review feedback focuses on improving test quality and consistency: it suggests using new_callable=mock_open directly in @patch decorators to simplify file mocking, avoiding __new__ to bypass initialization in tests, inheriting from unittest.TestCase in test_dapo_math.py for consistency, and moving inline imports of AISBenchDataContentError to the top of the files.
| @patch("builtins.open") | ||
| def test_aime2026_load(self, mock_open_file, mock_get_path): | ||
| line = '{"question": "What is 1+1?", "answer": "2"}' | ||
| m = mock_open(read_data=line + "\n") | ||
| mock_open_file.return_value = m.return_value |
There was a problem hiding this comment.
Using new_callable=mock_open directly in the @patch decorator is cleaner and avoids manual assignment of mock_open_file.return_value = m.return_value.
| @patch("builtins.open") | |
| def test_aime2026_load(self, mock_open_file, mock_get_path): | |
| line = '{"question": "What is 1+1?", "answer": "2"}' | |
| m = mock_open(read_data=line + "\n") | |
| mock_open_file.return_value = m.return_value | |
| @patch("builtins.open", new_callable=mock_open, read_data='{"question": "What is 1+1?", "answer": "2"}\n') | |
| def test_aime2026_load(self, mock_open_file, mock_get_path): |
| @patch("builtins.open") | ||
| def test_aime2024_multiple_records(self, mock_open_file, mock_get_path): | ||
| line1 = '{"origin_prompt": "Q1?", "gold_answer": "1"}' | ||
| line2 = '{"origin_prompt": "Q2?", "gold_answer": "2"}' | ||
| m = mock_open(read_data=line1 + "\n" + line2 + "\n") | ||
| mock_open_file.return_value = m.return_value |
There was a problem hiding this comment.
Using new_callable=mock_open directly in the @patch decorator is cleaner and avoids manual assignment of mock_open_file.return_value = m.return_value.
| @patch("builtins.open") | |
| def test_aime2024_multiple_records(self, mock_open_file, mock_get_path): | |
| line1 = '{"origin_prompt": "Q1?", "gold_answer": "1"}' | |
| line2 = '{"origin_prompt": "Q2?", "gold_answer": "2"}' | |
| m = mock_open(read_data=line1 + "\n" + line2 + "\n") | |
| mock_open_file.return_value = m.return_value | |
| @patch("builtins.open", new_callable=mock_open, read_data='{"origin_prompt": "Q1?", "gold_answer": "1"}\n{"origin_prompt": "Q2?", "gold_answer": "2"}\n') | |
| def test_aime2024_multiple_records(self, mock_open_file, mock_get_path): |
| def test_aime2025_jdg_dataset_class(self): | ||
| instance = Aime2025JDGDataset.__new__(Aime2025JDGDataset) | ||
| result = instance._get_dataset_class() | ||
| self.assertIs(result, Aime2025Dataset) |
There was a problem hiding this comment.
Using __new__ to bypass __init__ is generally considered a code smell in unit tests. Since _get_dataset_class is a method that does not access self, you can call it directly on the class by passing None as self.
| def test_aime2025_jdg_dataset_class(self): | |
| instance = Aime2025JDGDataset.__new__(Aime2025JDGDataset) | |
| result = instance._get_dataset_class() | |
| self.assertIs(result, Aime2025Dataset) | |
| def test_aime2025_jdg_dataset_class(self): | |
| result = Aime2025JDGDataset._get_dataset_class(None) | |
| self.assertIs(result, Aime2025Dataset) |
| from ais_bench.benchmark.utils.logging.exceptions import AISBenchDataContentError | ||
|
|
||
|
|
||
| class TestBoxedFunctions: |
| @patch("builtins.open") | ||
| def test_dataset_multiple_rows(self, mock_open_file, mock_get_path): | ||
| content = ( | ||
| "h0,h1,h2,h3,h4,h5,h6,Question,A,B,C,D\n" | ||
| ",,,,,,,Q1,opt1_a,opt1_b,opt1_c,opt1_d\n" | ||
| ",,,,,,,Q2,opt2_a,opt2_b,opt2_c,opt2_d\n" | ||
| ) | ||
| m = mock_open(read_data=content) | ||
| mock_open_file.return_value = m.return_value |
There was a problem hiding this comment.
Using new_callable=mock_open directly in the @patch decorator is cleaner and avoids manual assignment of mock_open_file.return_value = m.return_value.
| @patch("builtins.open") | |
| def test_dataset_multiple_rows(self, mock_open_file, mock_get_path): | |
| content = ( | |
| "h0,h1,h2,h3,h4,h5,h6,Question,A,B,C,D\n" | |
| ",,,,,,,Q1,opt1_a,opt1_b,opt1_c,opt1_d\n" | |
| ",,,,,,,Q2,opt2_a,opt2_b,opt2_c,opt2_d\n" | |
| ) | |
| m = mock_open(read_data=content) | |
| mock_open_file.return_value = m.return_value | |
| @patch("builtins.open", new_callable=mock_open, read_data="h0,h1,h2,h3,h4,h5,h6,Question,A,B,C,D\n,,,,,,,Q1,opt1_a,opt1_b,opt1_c,opt1_d\n,,,,,,,Q2,opt2_a,opt2_b,opt2_c,opt2_d\n") | |
| def test_dataset_multiple_rows(self, mock_open_file, mock_get_path): |
| def test_extract_boxed_answer_fbox(self): | ||
| from ais_bench.benchmark.utils.logging.exceptions import AISBenchDataContentError | ||
| text = "The answer is \\fbox{99}" |
There was a problem hiding this comment.
Please move the import of AISBenchDataContentError to the top of the file to avoid redundant inline imports across multiple test methods.
| def test_extract_boxed_answer_fbox(self): | |
| from ais_bench.benchmark.utils.logging.exceptions import AISBenchDataContentError | |
| text = "The answer is \\fbox{99}" | |
| def test_extract_boxed_answer_fbox(self): | |
| text = "The answer is \\fbox{99}" |
| def test_remove_right_units_multiple_splits(self): | ||
| from ais_bench.benchmark.utils.logging.exceptions import AISBenchDataContentError | ||
| evaluator = MATHEvaluator() |
There was a problem hiding this comment.
Please move the import of AISBenchDataContentError to the top of the file to avoid redundant inline imports across multiple test methods.
| def test_remove_right_units_multiple_splits(self): | |
| from ais_bench.benchmark.utils.logging.exceptions import AISBenchDataContentError | |
| evaluator = MATHEvaluator() | |
| def test_remove_right_units_multiple_splits(self): | |
| evaluator = MATHEvaluator() |
概述
补充核心数据集的单元测试,提升 UT 覆盖率。新增约 107 个测试,全部通过 pytest 验证。
变更内容
新建文件
test_aime2026.pytest_dapo_math.py增强文件
test_aime_datasets.pytest_gsm8k.pytest_gpqa.pytest_realworldqa.pytest_math.py测试结果
关联 Issue
提升核心数据集(AIME、RealworldQA、MATH、GSM8K、GPQA、DAPO-Math)的单元测试覆盖率。