mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-25 17:10:42 +00:00
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
This commit is contained in:
parent
22884d2bf7
commit
d84e3c70b7
16 changed files with 270 additions and 143 deletions
|
|
@ -80,15 +80,19 @@ class AI2D(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
|
|||
|
|
@ -31,7 +31,12 @@ class BLINK(EvalBase):
|
|||
except Exception as e:
|
||||
print(f"Warning: Could not load BLINK: {e}")
|
||||
try:
|
||||
tasks = ["Counting", "Spatial_Relation", "Object_Localization", "Visual_Similarity"]
|
||||
tasks = [
|
||||
"Counting",
|
||||
"Spatial_Relation",
|
||||
"Object_Localization",
|
||||
"Visual_Similarity",
|
||||
]
|
||||
all_data = []
|
||||
for t in tasks:
|
||||
try:
|
||||
|
|
@ -88,15 +93,19 @@ class BLINK(EvalBase):
|
|||
|
||||
content = []
|
||||
for img_b64 in images:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
@ -130,9 +139,12 @@ class BLINK(EvalBase):
|
|||
answer = data_item.get("answer", "")
|
||||
|
||||
num_choices = sum(
|
||||
1 for letter in ascii_uppercase[:6]
|
||||
if letter in data_item and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str) and data_item[letter].strip()
|
||||
1
|
||||
for letter in ascii_uppercase[:6]
|
||||
if letter in data_item
|
||||
and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str)
|
||||
and data_item[letter].strip()
|
||||
)
|
||||
num_choices = max(num_choices, 4)
|
||||
|
||||
|
|
|
|||
|
|
@ -32,7 +32,9 @@ class CountBench(EvalBase):
|
|||
return list(dataset)
|
||||
except Exception:
|
||||
try:
|
||||
dataset = load_dataset("google-research/countbenchqa", split="train")
|
||||
dataset = load_dataset(
|
||||
"google-research/countbenchqa", split="train"
|
||||
)
|
||||
print(f"Loaded {len(dataset)} examples from CountBench (train)")
|
||||
return list(dataset)
|
||||
except Exception:
|
||||
|
|
@ -58,17 +60,19 @@ class CountBench(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_number(self, response: str) -> Optional[str]:
|
||||
"""Extract a number from the response."""
|
||||
numbers = re.findall(r'\b(\d+)\b', response)
|
||||
numbers = re.findall(r"\b(\d+)\b", response)
|
||||
if numbers:
|
||||
return numbers[0]
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -44,7 +44,9 @@ Example of expected JSON response format:
|
|||
print(f"Warning: Could not load DynaMath: {e}")
|
||||
try:
|
||||
# Try sample_variant1 explicitly
|
||||
dataset = load_dataset("DynaMath/DynaMath_Sample", split="sample_variant1")
|
||||
dataset = load_dataset(
|
||||
"DynaMath/DynaMath_Sample", split="sample_variant1"
|
||||
)
|
||||
print(f"Loaded {len(dataset)} examples from DynaMath (sample_variant1)")
|
||||
return list(dataset)
|
||||
except Exception:
|
||||
|
|
@ -88,10 +90,12 @@ Example of expected JSON response format:
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
|
@ -100,14 +104,14 @@ Example of expected JSON response format:
|
|||
"""Preprocess response to extract JSON."""
|
||||
response = str(response)
|
||||
if 0 <= response.find("{") < response.rfind("}"):
|
||||
response = response[response.find("{"): response.rfind("}") + 1]
|
||||
response = response[response.find("{") : response.rfind("}") + 1]
|
||||
response = response.replace("\\", "").replace("\\n", "\n")
|
||||
return response
|
||||
|
||||
def transfer_pi(self, value: str) -> float:
|
||||
"""Convert pi symbol to numeric value."""
|
||||
if "\u03c0" in value:
|
||||
parts = value.split('\u03c0')
|
||||
parts = value.split("\u03c0")
|
||||
return float(parts[0]) * np.pi
|
||||
return float(value)
|
||||
|
||||
|
|
@ -116,7 +120,7 @@ Example of expected JSON response format:
|
|||
if answer_type == "float":
|
||||
if answer.isdigit():
|
||||
return True, str(float(answer))
|
||||
parts = answer.split(' ')
|
||||
parts = answer.split(" ")
|
||||
answer = parts[0]
|
||||
try:
|
||||
result = self.transfer_pi(answer)
|
||||
|
|
@ -136,7 +140,9 @@ Example of expected JSON response format:
|
|||
else:
|
||||
return True, answer
|
||||
|
||||
def extract_answer(self, response: str, answer_type: str) -> Tuple[bool, Optional[str]]:
|
||||
def extract_answer(
|
||||
self, response: str, answer_type: str
|
||||
) -> Tuple[bool, Optional[str]]:
|
||||
"""Extract answer from response."""
|
||||
processed = self.preprocess_response(response)
|
||||
|
||||
|
|
@ -156,7 +162,7 @@ Example of expected JSON response format:
|
|||
if ch in response.upper()[:20]:
|
||||
return True, ch
|
||||
elif answer_type == "float":
|
||||
numbers = re.findall(r'-?\d+\.?\d*', response)
|
||||
numbers = re.findall(r"-?\d+\.?\d*", response)
|
||||
if numbers:
|
||||
try:
|
||||
return True, str(float(numbers[0]))
|
||||
|
|
@ -186,7 +192,10 @@ Example of expected JSON response format:
|
|||
|
||||
else:
|
||||
# Free form: substring match
|
||||
return extracted.lower() in answer.lower() or answer.lower() in extracted.lower()
|
||||
return (
|
||||
extracted.lower() in answer.lower()
|
||||
or answer.lower() in extracted.lower()
|
||||
)
|
||||
|
||||
async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]:
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -36,7 +36,9 @@ class HallusionBench(EvalBase):
|
|||
except Exception:
|
||||
pass
|
||||
if all_data:
|
||||
print(f"Loaded {len(all_data)} examples from HallusionBench (combined)")
|
||||
print(
|
||||
f"Loaded {len(all_data)} examples from HallusionBench (combined)"
|
||||
)
|
||||
return all_data
|
||||
raise ValueError(f"Could not load HallusionBench dataset: {e}")
|
||||
except Exception:
|
||||
|
|
@ -62,10 +64,12 @@ class HallusionBench(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
|
@ -79,8 +83,8 @@ class HallusionBench(EvalBase):
|
|||
if response_lower.startswith("no"):
|
||||
return "No"
|
||||
|
||||
yes_patterns = [r'\byes\b', r'\btrue\b', r'\bcorrect\b']
|
||||
no_patterns = [r'\bno\b', r'\bfalse\b', r'\bincorrect\b']
|
||||
yes_patterns = [r"\byes\b", r"\btrue\b", r"\bcorrect\b"]
|
||||
no_patterns = [r"\bno\b", r"\bfalse\b", r"\bincorrect\b"]
|
||||
|
||||
for pattern in yes_patterns:
|
||||
if re.search(pattern, response_lower):
|
||||
|
|
|
|||
|
|
@ -77,15 +77,19 @@ class MMBench(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
|
|||
|
|
@ -29,14 +29,36 @@ class MMMU(EvalBase):
|
|||
dataset = load_dataset("MMMU/MMMU", subset, split=split)
|
||||
else:
|
||||
subjects = [
|
||||
"Accounting", "Agriculture", "Architecture_and_Engineering",
|
||||
"Art", "Art_Theory", "Basic_Medical_Science", "Biology",
|
||||
"Chemistry", "Clinical_Medicine", "Computer_Science",
|
||||
"Design", "Diagnostics_and_Laboratory_Medicine", "Economics",
|
||||
"Electronics", "Energy_and_Power", "Finance", "Geography",
|
||||
"History", "Literature", "Manage", "Marketing", "Materials",
|
||||
"Math", "Mechanical_Engineering", "Music", "Pharmacy",
|
||||
"Physics", "Psychology", "Public_Health", "Sociology"
|
||||
"Accounting",
|
||||
"Agriculture",
|
||||
"Architecture_and_Engineering",
|
||||
"Art",
|
||||
"Art_Theory",
|
||||
"Basic_Medical_Science",
|
||||
"Biology",
|
||||
"Chemistry",
|
||||
"Clinical_Medicine",
|
||||
"Computer_Science",
|
||||
"Design",
|
||||
"Diagnostics_and_Laboratory_Medicine",
|
||||
"Economics",
|
||||
"Electronics",
|
||||
"Energy_and_Power",
|
||||
"Finance",
|
||||
"Geography",
|
||||
"History",
|
||||
"Literature",
|
||||
"Manage",
|
||||
"Marketing",
|
||||
"Materials",
|
||||
"Math",
|
||||
"Mechanical_Engineering",
|
||||
"Music",
|
||||
"Pharmacy",
|
||||
"Physics",
|
||||
"Psychology",
|
||||
"Public_Health",
|
||||
"Sociology",
|
||||
]
|
||||
all_data = []
|
||||
for subj in subjects:
|
||||
|
|
@ -80,24 +102,28 @@ class MMMU(EvalBase):
|
|||
options = []
|
||||
|
||||
if options:
|
||||
options_text = "\n".join([
|
||||
f"({ascii_uppercase[i]}) {opt}" for i, opt in enumerate(options)
|
||||
])
|
||||
options_text = "\n".join(
|
||||
[f"({ascii_uppercase[i]}) {opt}" for i, opt in enumerate(options)]
|
||||
)
|
||||
prompt = f"Question: {question}\n\nOptions:\n{options_text}\n\nPlease select the correct answer from the options above."
|
||||
else:
|
||||
prompt = f"Question: {question}\n\nProvide your answer."
|
||||
|
||||
content = []
|
||||
for img_b64 in images:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
"""Extract answer letter from response."""
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
|
|
|
|||
|
|
@ -39,7 +39,9 @@ class MMMUPro(EvalBase):
|
|||
except Exception as e:
|
||||
print(f"Error loading MMMU-Pro: {e}")
|
||||
try:
|
||||
dataset = load_dataset("MMMU/MMMU_Pro", "standard (10 options)", split="test")
|
||||
dataset = load_dataset(
|
||||
"MMMU/MMMU_Pro", "standard (10 options)", split="test"
|
||||
)
|
||||
print(f"Loaded {len(dataset)} examples from MMMU-Pro (test)")
|
||||
return list(dataset)
|
||||
except Exception:
|
||||
|
|
@ -80,9 +82,9 @@ class MMMUPro(EvalBase):
|
|||
prompt = "Answer the following multiple-choice question in the image. Answer directly with the option letter from the given choices."
|
||||
else:
|
||||
if options:
|
||||
options_text = "\n".join([
|
||||
f"{ascii_uppercase[i]}. {opt}" for i, opt in enumerate(options)
|
||||
])
|
||||
options_text = "\n".join(
|
||||
[f"{ascii_uppercase[i]}. {opt}" for i, opt in enumerate(options)]
|
||||
)
|
||||
prompt = f"Question: {question}\n\nOptions:\n{options_text}\n\n"
|
||||
|
||||
if variant == "cot":
|
||||
|
|
@ -93,30 +95,37 @@ class MMMUPro(EvalBase):
|
|||
"Think step by step before answering."
|
||||
)
|
||||
else:
|
||||
prompt += "Answer directly with the option letter from the given choices."
|
||||
prompt += (
|
||||
"Answer directly with the option letter from the given choices."
|
||||
)
|
||||
else:
|
||||
prompt = f"Question: {question}\n\nProvide your answer."
|
||||
|
||||
content = []
|
||||
for img_b64 in images:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer_cot(self, response: str) -> Optional[str]:
|
||||
"""Extract answer from COT response format 'Answer: X'."""
|
||||
lines = response.strip().split('\n')
|
||||
lines = response.strip().split("\n")
|
||||
lines = [x.strip() for x in lines]
|
||||
|
||||
for line in reversed(lines):
|
||||
if line.startswith('Answer:'):
|
||||
if line.startswith("Answer:"):
|
||||
rest = line[7:].strip()
|
||||
from collections import Counter
|
||||
letter_counts = Counter(ch for ch in rest.upper() if ch in ascii_uppercase[:10])
|
||||
|
||||
letter_counts = Counter(
|
||||
ch for ch in rest.upper() if ch in ascii_uppercase[:10]
|
||||
)
|
||||
if len(letter_counts) == 1:
|
||||
return list(letter_counts.keys())[0]
|
||||
elif letter_counts:
|
||||
|
|
@ -125,7 +134,9 @@ class MMMUPro(EvalBase):
|
|||
return ch
|
||||
return None
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
"""Extract answer letter from response."""
|
||||
variant = getattr(self, "variant", "standard")
|
||||
|
||||
|
|
|
|||
|
|
@ -68,15 +68,19 @@ class MMStar(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
@ -110,9 +114,12 @@ class MMStar(EvalBase):
|
|||
answer = data_item.get("answer", "")
|
||||
|
||||
num_choices = sum(
|
||||
1 for letter in ascii_uppercase[:6]
|
||||
if letter in data_item and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str) and data_item[letter].strip()
|
||||
1
|
||||
for letter in ascii_uppercase[:6]
|
||||
if letter in data_item
|
||||
and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str)
|
||||
and data_item[letter].strip()
|
||||
)
|
||||
num_choices = max(num_choices, 4)
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,9 @@ class MMTBench(EvalBase):
|
|||
print(f"Warning: Full download failed, using streaming: {e}")
|
||||
# Fallback to streaming if full download fails (known column mismatch issue)
|
||||
try:
|
||||
dataset = load_dataset("OpenGVLab/MMT-Bench", split=split, streaming=True)
|
||||
dataset = load_dataset(
|
||||
"OpenGVLab/MMT-Bench", split=split, streaming=True
|
||||
)
|
||||
if max_samples:
|
||||
data = list(dataset.take(max_samples))
|
||||
else:
|
||||
|
|
@ -46,7 +48,9 @@ class MMTBench(EvalBase):
|
|||
data.append(item)
|
||||
if i % 5000 == 0 and i > 0:
|
||||
print(f" Streamed {i} samples...")
|
||||
print(f"Loaded {len(data)} examples from MMT-Bench ({split}, streaming)")
|
||||
print(
|
||||
f"Loaded {len(data)} examples from MMT-Bench ({split}, streaming)"
|
||||
)
|
||||
return data
|
||||
except Exception:
|
||||
raise ValueError(f"Could not load MMT-Bench dataset: {e}")
|
||||
|
|
@ -92,15 +96,19 @@ class MMTBench(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
@ -134,9 +142,12 @@ class MMTBench(EvalBase):
|
|||
answer = data_item.get("answer", "")
|
||||
|
||||
num_choices = sum(
|
||||
1 for letter in ascii_uppercase[:8]
|
||||
if letter in data_item and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str) and data_item[letter].strip()
|
||||
1
|
||||
for letter in ascii_uppercase[:8]
|
||||
if letter in data_item
|
||||
and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str)
|
||||
and data_item[letter].strip()
|
||||
)
|
||||
num_choices = max(num_choices, 4)
|
||||
|
||||
|
|
|
|||
|
|
@ -51,10 +51,12 @@ class MMVet(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": question})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
|
|
|||
|
|
@ -76,15 +76,19 @@ class MMVP(EvalBase):
|
|||
|
||||
content = []
|
||||
for img_b64 in images:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_b64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
@ -118,9 +122,12 @@ class MMVP(EvalBase):
|
|||
answer = data_item.get("answer", "")
|
||||
|
||||
num_choices = sum(
|
||||
1 for letter in ascii_uppercase[:4]
|
||||
if letter in data_item and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str) and data_item[letter].strip()
|
||||
1
|
||||
for letter in ascii_uppercase[:4]
|
||||
if letter in data_item
|
||||
and data_item[letter] is not None
|
||||
and isinstance(data_item[letter], str)
|
||||
and data_item[letter].strip()
|
||||
)
|
||||
num_choices = max(num_choices, 2)
|
||||
|
||||
|
|
|
|||
|
|
@ -17,16 +17,16 @@ class OCRBench(EvalBase):
|
|||
|
||||
# Categories and their scoring
|
||||
CATEGORIES = [
|
||||
'Regular Text Recognition',
|
||||
'Irregular Text Recognition',
|
||||
'Artistic Text Recognition',
|
||||
'Handwriting Recognition',
|
||||
'Digit String Recognition',
|
||||
'Non-Semantic Text Recognition',
|
||||
'Scene Text-centric VQA',
|
||||
'Doc-oriented VQA',
|
||||
'Key Information Extraction',
|
||||
'Handwritten Mathematical Expression Recognition',
|
||||
"Regular Text Recognition",
|
||||
"Irregular Text Recognition",
|
||||
"Artistic Text Recognition",
|
||||
"Handwriting Recognition",
|
||||
"Digit String Recognition",
|
||||
"Non-Semantic Text Recognition",
|
||||
"Scene Text-centric VQA",
|
||||
"Doc-oriented VQA",
|
||||
"Key Information Extraction",
|
||||
"Handwritten Mathematical Expression Recognition",
|
||||
]
|
||||
|
||||
def setup_data(self) -> list:
|
||||
|
|
@ -65,10 +65,12 @@ class OCRBench(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
|
@ -77,16 +79,16 @@ class OCRBench(EvalBase):
|
|||
"""Category-specific scoring for OCR tasks."""
|
||||
predict = prediction.strip()
|
||||
|
||||
if category == 'Handwritten Mathematical Expression Recognition':
|
||||
predict_clean = predict.replace('\n', ' ').replace(' ', '')
|
||||
if category == "Handwritten Mathematical Expression Recognition":
|
||||
predict_clean = predict.replace("\n", " ").replace(" ", "")
|
||||
for answer in answers:
|
||||
answer_clean = answer.strip().replace('\n', ' ').replace(' ', '')
|
||||
answer_clean = answer.strip().replace("\n", " ").replace(" ", "")
|
||||
if answer_clean in predict_clean:
|
||||
return True
|
||||
else:
|
||||
predict_lower = predict.lower().replace('\n', ' ')
|
||||
predict_lower = predict.lower().replace("\n", " ")
|
||||
for answer in answers:
|
||||
answer_lower = answer.lower().strip().replace('\n', ' ')
|
||||
answer_lower = answer.lower().strip().replace("\n", " ")
|
||||
if answer_lower in predict_lower:
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -53,10 +53,12 @@ class POPE(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
|
@ -70,8 +72,8 @@ class POPE(EvalBase):
|
|||
if response_lower.startswith("no"):
|
||||
return "No"
|
||||
|
||||
yes_patterns = [r'\byes\b', r'\btrue\b', r'\bcorrect\b', r'\baffirmative\b']
|
||||
no_patterns = [r'\bno\b', r'\bfalse\b', r'\bincorrect\b', r'\bnegative\b']
|
||||
yes_patterns = [r"\byes\b", r"\btrue\b", r"\bcorrect\b", r"\baffirmative\b"]
|
||||
no_patterns = [r"\bno\b", r"\bfalse\b", r"\bincorrect\b", r"\bnegative\b"]
|
||||
|
||||
for pattern in yes_patterns:
|
||||
if re.search(pattern, response_lower):
|
||||
|
|
|
|||
|
|
@ -40,12 +40,16 @@ class SEEDBench2Plus(EvalBase):
|
|||
except Exception as e:
|
||||
print(f"Warning: Could not load SEED-Bench2: {e}")
|
||||
try:
|
||||
dataset = load_dataset("lmms-lab/SEED-Bench", split=split, streaming=True)
|
||||
dataset = load_dataset(
|
||||
"lmms-lab/SEED-Bench", split=split, streaming=True
|
||||
)
|
||||
if max_samples:
|
||||
data = list(dataset.take(max_samples))
|
||||
else:
|
||||
data = list(dataset.take(1000))
|
||||
print(f"Loaded {len(data)} examples from SEED-Bench ({split}, streaming)")
|
||||
print(
|
||||
f"Loaded {len(data)} examples from SEED-Bench ({split}, streaming)"
|
||||
)
|
||||
return data
|
||||
except Exception:
|
||||
raise ValueError(f"Could not load SEED-Bench2-Plus dataset: {e}")
|
||||
|
|
@ -103,15 +107,19 @@ class SEEDBench2Plus(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_answer(self, response: str, num_choices: int) -> Tuple[Optional[str], str]:
|
||||
def extract_answer(
|
||||
self, response: str, num_choices: int
|
||||
) -> Tuple[Optional[str], str]:
|
||||
valid_letters = set(ascii_uppercase[:num_choices])
|
||||
|
||||
letter, method = extract_letter_from_answer_tag(response, valid_letters)
|
||||
|
|
@ -154,7 +162,8 @@ class SEEDBench2Plus(EvalBase):
|
|||
num_choices = len(choices) if choices else 4
|
||||
if num_choices == 0:
|
||||
num_choices = sum(
|
||||
1 for letter in ascii_uppercase[:6]
|
||||
1
|
||||
for letter in ascii_uppercase[:6]
|
||||
if letter in data_item and data_item[letter] is not None
|
||||
)
|
||||
num_choices = max(num_choices, 4)
|
||||
|
|
@ -168,7 +177,9 @@ class SEEDBench2Plus(EvalBase):
|
|||
sample = {
|
||||
"id": data_item.get("index", data_item.get("question_id", "")),
|
||||
"question": data_item.get("question", "")[:200],
|
||||
"category": data_item.get("question_type_id", data_item.get("category", "")),
|
||||
"category": data_item.get(
|
||||
"question_type_id", data_item.get("category", "")
|
||||
),
|
||||
"answer": answer,
|
||||
"prediction": extracted,
|
||||
"raw_response": response[:500],
|
||||
|
|
|
|||
|
|
@ -65,20 +65,29 @@ class VLMBlind(EvalBase):
|
|||
|
||||
content = []
|
||||
if image_base64:
|
||||
content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
})
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||||
}
|
||||
)
|
||||
content.append({"type": "text", "text": prompt})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def extract_and_score(self, response: str, answer: str, task: str) -> Tuple[bool, str]:
|
||||
def extract_and_score(
|
||||
self, response: str, answer: str, task: str
|
||||
) -> Tuple[bool, str]:
|
||||
"""Task-specific answer extraction and scoring."""
|
||||
response_lower = response.lower().strip()
|
||||
answer_lower = str(answer).lower().strip()
|
||||
|
||||
if task in ["Subway Connections", "Nested Squares", "Line Plot Intersections", "Circled Letter"]:
|
||||
if task in [
|
||||
"Subway Connections",
|
||||
"Nested Squares",
|
||||
"Line Plot Intersections",
|
||||
"Circled Letter",
|
||||
]:
|
||||
match = re.search(r"\{([^}]+)\}", response)
|
||||
if match:
|
||||
extracted = match.group(1).strip().lower()
|
||||
|
|
@ -140,7 +149,9 @@ class VLMBlind(EvalBase):
|
|||
|
||||
sample = {
|
||||
"id": data_item.get("index", data_item.get("id", "")),
|
||||
"question": data_item.get("prompt", data_item.get("question", ""))[:200],
|
||||
"question": data_item.get("prompt", data_item.get("question", ""))[
|
||||
:200
|
||||
],
|
||||
"task": task,
|
||||
"answer": answer,
|
||||
"prediction": extracted,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue