[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
This commit is contained in:
pre-commit-ci[bot] 2026-01-23 00:49:46 +00:00
parent 0a16fafadb
commit 655faa775c
27 changed files with 134 additions and 66 deletions

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -102,7 +102,9 @@ class AI2D(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -115,7 +115,9 @@ class BLINK(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -8,10 +8,10 @@ from pathlib import Path
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class ChartQA(EvalBase):
@ -142,7 +142,9 @@ Question: {query}"""
# Non-numeric: exact match (case-insensitive)
return pred.lower() == ans.lower()
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
@ -183,6 +185,8 @@ Question: {query}"""
if __name__ == "__main__":
asyncio.run(
eval_runner(
ChartQA(subset="human", relaxed_tolerance=0.05, temperature=0.0, max_tokens=2048)
ChartQA(
subset="human", relaxed_tolerance=0.05, temperature=0.0, max_tokens=2048
)
)
)

View file

@ -10,10 +10,10 @@ from typing import Dict, List, Optional, Tuple
import openai
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
DESCRIPTIVE_CATEGORIES = {
1: "Information Extraction",
@ -270,7 +270,9 @@ class CharXiv(EvalBase):
inst_category = item.get("inst_category", 1)
return REASONING_CATEGORIES.get(inst_category, "Text-in-Chart")
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
mode = getattr(self, "mode", "descriptive")

View file

@ -7,10 +7,10 @@ import re
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class CountBench(EvalBase):
@ -97,7 +97,9 @@ class CountBench(EvalBase):
return False
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -5,10 +5,10 @@ import re
from typing import List, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class DocVQA(EvalBase):
@ -142,7 +142,9 @@ Provide only the answer, as concisely as possible."""
return previous_row[-1]
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -10,10 +10,10 @@ from typing import List, Optional, Tuple
import numpy as np
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class DynaMath(EvalBase):
@ -197,7 +197,9 @@ Example of expected JSON response format:
or answer.lower() in extracted.lower()
)
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
@ -240,6 +242,8 @@ Example of expected JSON response format:
if __name__ == "__main__":
asyncio.run(
eval_runner(
DynaMath(split="test", use_json_format=True, temperature=0.0, max_tokens=1024)
DynaMath(
split="test", use_json_format=True, temperature=0.0, max_tokens=1024
)
)
)

View file

@ -7,10 +7,10 @@ import re
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class HallusionBench(EvalBase):
@ -96,7 +96,9 @@ class HallusionBench(EvalBase):
return "Unknown"
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)
@ -140,4 +142,6 @@ class HallusionBench(EvalBase):
if __name__ == "__main__":
asyncio.run(eval_runner(HallusionBench(split="test", temperature=0.0, max_tokens=64)))
asyncio.run(
eval_runner(HallusionBench(split="test", temperature=0.0, max_tokens=64))
)

View file

@ -5,10 +5,10 @@ import re
from typing import List, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class InfoVQA(EvalBase):
@ -127,7 +127,9 @@ Provide only the answer, as concisely as possible."""
return previous_row[-1]
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple
import openai
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
EXTRACTION_PROMPT_TEMPLATE = """You are a information extractor that extracts multiple choice letter answer choices \
from a paragraph that contains the answer choice and sometimes explaination of why that \
@ -195,7 +195,9 @@ Provide your answer as the letter(s) of the correct choice(s), e.g., A, B, C, D,
return pred_normalized == answer_normalized
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -9,10 +9,10 @@ from typing import List, Optional, Tuple
import openai
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
EXTRACT_ICL_EXAMPLES = [
"1.\nModel response: 'The perimeter of the sector is approximately (-2, 1)'\n"
@ -245,7 +245,9 @@ Judgement:"""
return False
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple
import openai
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
ICL_EXAMPLES = [
"""Hint: Please answer the question and provide the final answer at the end.
@ -263,7 +263,9 @@ Then extract the answer from the model response and type it at the end of the pr
return is_equal(prediction, answer)
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple
import openai
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
ICL_EXAMPLES = [
"""
@ -322,7 +322,9 @@ class MathVista(EvalBase):
return pred.lower() == ans.lower()
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -99,7 +99,9 @@ class MMBench(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)
@ -149,4 +151,10 @@ class MMBench(EvalBase):
if __name__ == "__main__":
asyncio.run(eval_runner(MMBench(split="dev", lang="en", version="v1.1", temperature=0.0, max_tokens=256)))
asyncio.run(
eval_runner(
MMBench(
split="dev", lang="en", version="v1.1", temperature=0.0, max_tokens=256
)
)
)

View file

@ -8,10 +8,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -134,7 +134,9 @@ class MMMU(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -8,10 +8,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -154,7 +154,9 @@ class MMMUPro(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)
@ -201,4 +203,8 @@ class MMMUPro(EvalBase):
if __name__ == "__main__":
asyncio.run(eval_runner(MMMUPro(split="test", variant="standard", temperature=0.0, max_tokens=1024)))
asyncio.run(
eval_runner(
MMMUPro(split="test", variant="standard", temperature=0.0, max_tokens=1024)
)
)

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -90,7 +90,9 @@ class MMStar(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -118,7 +118,9 @@ class MMTBench(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
@ -169,6 +171,4 @@ class MMTBench(EvalBase):
if __name__ == "__main__":
asyncio.run(
eval_runner(MMTBench(split="val", temperature=0.0, max_tokens=256))
)
asyncio.run(eval_runner(MMTBench(split="val", temperature=0.0, max_tokens=256)))

View file

@ -8,10 +8,10 @@ from typing import List, Optional, Tuple
import openai
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class MMVet(EvalBase):
@ -124,7 +124,9 @@ Output ONLY a single number between 0 and 1."""
return 0.5
return 0.0
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -98,7 +98,9 @@ class MMVP(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -6,10 +6,10 @@ import io
from typing import Dict, List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class OCRBench(EvalBase):
@ -94,7 +94,9 @@ class OCRBench(EvalBase):
return False
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -7,10 +7,10 @@ import re
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class POPE(EvalBase):
@ -85,7 +85,9 @@ class POPE(EvalBase):
return "Unknown"
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
completion = await self.chat_completion(server, messages)

View file

@ -4,10 +4,10 @@ import io
from typing import List, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class RealWorldQA(EvalBase):
@ -77,7 +77,9 @@ Provide a brief, direct answer."""
return False
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -7,10 +7,10 @@ from string import ascii_uppercase
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
from environments.eval_environments.eval_helpers import (
extract_letter_from_answer_tag,
extract_mcqa_answer_with_fallback,
@ -129,7 +129,9 @@ class SEEDBench2Plus(EvalBase):
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
return letter, method
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -6,11 +6,11 @@ import zipfile
from pathlib import Path
from typing import List, Tuple
from environments.eval_environments.eval import EvalBase, eval_runner
from huggingface_hub import hf_hub_download
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
DEFAULT_DATA_DIR = Path.home() / ".cache" / "visulogic_hf"
@ -139,7 +139,9 @@ Answer with only the letter (A, B, C, or D)."""
return False
return prediction.upper() == answer.upper()
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)

View file

@ -7,10 +7,10 @@ import re
from typing import List, Optional, Tuple
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class VLMBlind(EvalBase):
@ -120,7 +120,9 @@ class VLMBlind(EvalBase):
else:
return answer_lower in response_lower, response_lower[:50]
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
@ -160,6 +162,4 @@ class VLMBlind(EvalBase):
if __name__ == "__main__":
asyncio.run(
eval_runner(VLMBlind(split="test", temperature=0.0, max_tokens=512))
)
asyncio.run(eval_runner(VLMBlind(split="test", temperature=0.0, max_tokens=512)))

View file

@ -10,10 +10,10 @@ from typing import Dict, List, Tuple
import pandas as pd
from datasets import load_dataset
from environments.eval_environments.eval import EvalBase, eval_runner
from PIL import Image
from atroposlib.envs.server_handling.server_manager import ServerManager
from environments.eval_environments.eval import EvalBase, eval_runner
class WeMath(EvalBase):
@ -135,7 +135,9 @@ class WeMath(EvalBase):
return False
return prediction.upper() == answer.upper()
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
async def run_item(
self, server: ServerManager, data_item: dict
) -> Tuple[dict, dict]:
try:
messages = self.build_messages(data_item)
@ -365,5 +367,7 @@ def _compute_final_scores(total_counts: Dict, total_count: int = 525) -> Dict:
if __name__ == "__main__":
asyncio.run(
eval_runner(WeMath(split="testmini", use_cot=False, temperature=0.0, max_tokens=512))
eval_runner(
WeMath(split="testmini", use_cot=False, temperature=0.0, max_tokens=512)
)
)