Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
from ais_bench.benchmark.datasets import MMStarDataset, MMStarEvaluator
from ais_bench.benchmark.utils.postprocess.text_postprocessors import last_option_postprocess


mmstar_reader_cfg = dict(
Expand All @@ -26,7 +27,8 @@
)

mmstar_eval_cfg = dict(
evaluator=dict(type=MMStarEvaluator)
evaluator=dict(type=MMStarEvaluator),
pred_postprocessor=dict(type=last_option_postprocess, options="ABCD"),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using last_option_postprocess is risky for multiple-choice evaluation, especially if the model provides reasoning (Chain-of-Thought). This function extracts the last occurrence of any character in the options string ("ABCD"). If the model mentions another option after stating its answer (e.g., "The answer is A, because option B is incorrect"), this will incorrectly return 'B' as the prediction.

Consider using first_option_postprocess or a more specific regex-based postprocessor that targets a final answer pattern (e.g., ANSWER: [A-D]).

)

mmstar_datasets = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
from ais_bench.benchmark.datasets import MMStarDataset, MMStarEvaluator
from ais_bench.benchmark.utils.postprocess.text_postprocessors import last_option_postprocess


mmstar_reader_cfg = dict(
Expand Down Expand Up @@ -29,7 +30,8 @@
)

mmstar_eval_cfg = dict(
evaluator=dict(type=MMStarEvaluator)
evaluator=dict(type=MMStarEvaluator),
pred_postprocessor=dict(type=last_option_postprocess, options="ABCD"),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

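Using last_option_postprocess is risky here. Although the prompt asks for the answer on the last line, the postprocessor returns the last option letter that appears anywhere in the output, so any mention of another option letter after the final answer (in reasoning that follows it or in a trailing sentence) will cause this postprocessor to extract the wrong letter.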

Since the prompt explicitly asks for the ANSWER: [LETTER] format, it would be much safer to use a postprocessor that specifically extracts the letter following that prefix.

)

mmstar_datasets = [
Expand Down
129 changes: 77 additions & 52 deletions ais_bench/benchmark/datasets/mmstar.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,70 @@
import json
import os
import re
import string
import pandas as pd
import numpy as np

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

from ais_bench.benchmark.datasets import build_choices, can_infer, dump_image, split_MMMU
from ais_bench.benchmark.datasets.utils.datasets import get_data_path, toliststr
from ais_bench.benchmark.openicl import BaseEvaluator
from ais_bench.benchmark.registry import LOAD_DATASET
from ais_bench.benchmark.datasets.utils.datasets import get_data_path, toliststr
from ais_bench.benchmark.utils.logging import AISLogger
from ais_bench.benchmark.datasets import dump_image, split_MMMU, build_choices, can_infer
from ais_bench.benchmark.utils.prompt import AIS_CONTENT_TAG, AIS_TEXT_START, AIS_IMAGE_START
from ais_bench.benchmark.utils.prompt import AIS_CONTENT_TAG, AIS_IMAGE_START, AIS_TEXT_START

from .base import BaseDataset

IMAGE_MAP_LEN = 64
logger = AISLogger()


def extract_options_from_question(question_text:str):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The type hint for question_text is missing a space after the colon, which deviates from PEP 8 style guidelines.

Suggested change
def extract_options_from_question(question_text:str):
def extract_options_from_question(question_text: str):

options = {}
if "Options:" in question_text:
options_part = question_text.split("Options:")[1].strip()
pattern = r"([A-Z]):\s*([^,]+(?:,\s*[^,]+)*?)(?=(?:,\s*[A-Z]:|$))"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The regex pattern is fragile because it strictly relies on commas as separators between options. If the dataset uses newlines, periods, or just spaces (e.g., A: Option 1 B: Option 2), this pattern will fail to correctly extract individual options: with no comma before the next option letter, the entire remainder of the text is captured as the content of the first option. Additionally, if an option contains a comma that isn't followed by an option letter, the non-greedy match might behave unexpectedly.


matches = re.findall(pattern, options_part)
for letter, content in matches:
content = content.strip()
if content.endswith("."):
content = content[:-1]
options[letter] = content

return options


@LOAD_DATASET.register_module()
class MMStarDataset(BaseDataset):

@staticmethod
def load(path):
path = get_data_path(path)
image_root_path = os.path.join(os.path.dirname(path), "MMStar_images")
logger.info(f"Convert base64 to image and save it in {image_root_path}")
skip_noimg = True
data = pd.read_csv(path, sep='\t')
if skip_noimg and 'image' in data:
data = data[~pd.isna(data['image'])]

data = pd.read_csv(path, sep="\t")
if skip_noimg and "image" in data:
data = data[~pd.isna(data["image"])]
# The image field can store the base64 encoded image or another question index (for saving space)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
if "image" in data:
data["image"] = [str(x) for x in data["image"]]
image_map = {x: y for x, y in zip(data["index"], data["image"])}
for k in image_map:
if len(image_map[k]) <= IMAGE_MAP_LEN:
idx = image_map[k]
image_map[k] = image_map[idx]

images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
images = [toliststr(image_map[k]) for k in data["index"]]
data["image"] = [x[0] if len(x) == 1 else x for x in images]
if "image_path" in data:
paths = [toliststr(x) for x in data["image_path"]]
data["image_path"] = [x[0] if len(x) == 1 else x for x in paths]

if np.all([isinstance(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
if np.all([isinstance(x, int) for x in data["index"]]):
data["index"] = [int(x) for x in data["index"]]

sheet_indices = list(range(0, len(data), 1))
data = data.iloc[sheet_indices]
Expand All @@ -61,59 +78,67 @@ def load(path):
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
options_prompt = "Options:\n"
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
# get text prompt
prompt = ''
options_prompt += f"{key}. {item}\n"

hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
# get text prompt
prompt = ""
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f"Hint: {hint}\n"
prompt += line["question"]
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
prompt += "Please select the correct answer from the options above. \n"
# add image info
if isinstance(tgt_path, list):
tgt_path = tgt_path[0]

content = AIS_IMAGE_START + tgt_path + AIS_CONTENT_TAG \
+ AIS_TEXT_START + prompt + AIS_CONTENT_TAG
choices = build_choices(line)
dataset.append({"content": content,
"answer": {'choices': json.dumps(choices),
'answer': line['answer'],
'split': line.get('split'),
'l2-category': line.get('l2-category'),
'category': line.get('category')}})

content = (
AIS_IMAGE_START
+ tgt_path
+ AIS_CONTENT_TAG
+ AIS_TEXT_START
+ prompt
+ AIS_CONTENT_TAG
)
choices = build_choices(extract_options_from_question(line['question']))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using extract_options_from_question(line['question']) to build choices is unreliable and likely the cause of incorrect evaluations. If the regex fails to match the question text (which is highly probable given its restrictive nature and the fact that line['question'] may not even contain the options in that specific format), choices will be empty. This leads to can_infer failing and a resulting score of 0. You should use the options dictionary defined at line 82, which is derived from the structured columns of the dataset and is already used to build the prompt.

Suggested change
choices = build_choices(extract_options_from_question(line['question']))
choices = build_choices(options)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This change introduces a regression. By calling extract_options_from_question directly, you are ignoring the options already extracted from the TSV columns (lines 77-81). If a dataset provides options in columns but does not include the "Options:" string in the question text, choices will be empty, leading to a score of 0.

You should prioritize the options from the columns and only fallback to parsing the question text if they are missing.

Suggested change
choices = build_choices(extract_options_from_question(line['question']))
choices = build_choices(options if options else extract_options_from_question(line['question']))

dataset.append(
{
"content": content,
"answer": {
"choices": json.dumps(choices),
"answer": line["answer"],
"split": line.get("split"),
"l2-category": line.get("l2-category"),
"category": line.get("category"),
},
}
)
return Dataset.from_list(dataset)

class MMStarEvaluator(BaseEvaluator):

class MMStarEvaluator(BaseEvaluator):
def score(self, predictions, references):
result = {}
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
return {"error": "predictions and references have different length"}
details = []
overall_key = 'Overall'
overall_key = "Overall"
for pred, refer in zip(predictions, references):
detail = {'pred': pred, 'answer': refer, 'correct': False}
choices = json.loads(refer['choices'])
detail = {"pred": pred, "answer": refer, "correct": False}
choices = json.loads(refer["choices"])
infer_res = can_infer(pred, choices)
key_category = refer['category']
score = 1 if infer_res == refer['answer'] else 0

key_category = refer["category"]
score = 1 if infer_res == refer["answer"] else 0
if score == 1:
detail['correct'] = True
detail["correct"] = True
details.append(detail)
result.setdefault(overall_key, []).append(score)
result.setdefault(key_category, []).append(score)
for key in result:
result[key] = 100 * sum(result[key]) / len(result[key])
result['details'] = details
result["details"] = details
return result