mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
This commit is contained in:
parent
0a16fafadb
commit
655faa775c
27 changed files with 134 additions and 66 deletions
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -102,7 +102,9 @@ class AI2D(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -115,7 +115,9 @@ class BLINK(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ from pathlib import Path
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class ChartQA(EvalBase):
|
||||
|
|
@ -142,7 +142,9 @@ Question: {query}"""
|
|||
# Non-numeric: exact match (case-insensitive)
|
||||
return pred.lower() == ans.lower()
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
@ -183,6 +185,8 @@ Question: {query}"""
|
|||
if __name__ == "__main__":
|
||||
asyncio.run(
|
||||
eval_runner(
|
||||
ChartQA(subset="human", relaxed_tolerance=0.05, temperature=0.0, max_tokens=2048)
|
||||
ChartQA(
|
||||
subset="human", relaxed_tolerance=0.05, temperature=0.0, max_tokens=2048
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ from typing import Dict, List, Optional, Tuple
|
|||
|
||||
import openai
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
DESCRIPTIVE_CATEGORIES = {
|
||||
1: "Information Extraction",
|
||||
|
|
@ -270,7 +270,9 @@ class CharXiv(EvalBase):
|
|||
inst_category = item.get("inst_category", 1)
|
||||
return REASONING_CATEGORIES.get(inst_category, "Text-in-Chart")
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
mode = getattr(self, "mode", "descriptive")
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ import re
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class CountBench(EvalBase):
|
||||
|
|
@ -97,7 +97,9 @@ class CountBench(EvalBase):
|
|||
|
||||
return False
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ import re
|
|||
from typing import List, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class DocVQA(EvalBase):
|
||||
|
|
@ -142,7 +142,9 @@ Provide only the answer, as concisely as possible."""
|
|||
|
||||
return previous_row[-1]
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ from typing import List, Optional, Tuple
|
|||
|
||||
import numpy as np
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class DynaMath(EvalBase):
|
||||
|
|
@ -197,7 +197,9 @@ Example of expected JSON response format:
|
|||
or answer.lower() in extracted.lower()
|
||||
)
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
@ -240,6 +242,8 @@ Example of expected JSON response format:
|
|||
if __name__ == "__main__":
|
||||
asyncio.run(
|
||||
eval_runner(
|
||||
DynaMath(split="test", use_json_format=True, temperature=0.0, max_tokens=1024)
|
||||
DynaMath(
|
||||
split="test", use_json_format=True, temperature=0.0, max_tokens=1024
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ import re
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class HallusionBench(EvalBase):
|
||||
|
|
@ -96,7 +96,9 @@ class HallusionBench(EvalBase):
|
|||
|
||||
return "Unknown"
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
@ -140,4 +142,6 @@ class HallusionBench(EvalBase):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(eval_runner(HallusionBench(split="test", temperature=0.0, max_tokens=64)))
|
||||
asyncio.run(
|
||||
eval_runner(HallusionBench(split="test", temperature=0.0, max_tokens=64))
|
||||
)
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ import re
|
|||
from typing import List, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class InfoVQA(EvalBase):
|
||||
|
|
@ -127,7 +127,9 @@ Provide only the answer, as concisely as possible."""
|
|||
|
||||
return previous_row[-1]
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple
|
|||
|
||||
import openai
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
EXTRACTION_PROMPT_TEMPLATE = """You are a information extractor that extracts multiple choice letter answer choices \
|
||||
from a paragraph that contains the answer choice and sometimes explaination of why that \
|
||||
|
|
@ -195,7 +195,9 @@ Provide your answer as the letter(s) of the correct choice(s), e.g., A, B, C, D,
|
|||
|
||||
return pred_normalized == answer_normalized
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ from typing import List, Optional, Tuple
|
|||
|
||||
import openai
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
EXTRACT_ICL_EXAMPLES = [
|
||||
"1.\nModel response: 'The perimeter of the sector is approximately (-2, 1)'\n"
|
||||
|
|
@ -245,7 +245,9 @@ Judgement:"""
|
|||
|
||||
return False
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple
|
|||
|
||||
import openai
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
ICL_EXAMPLES = [
|
||||
"""Hint: Please answer the question and provide the final answer at the end.
|
||||
|
|
@ -263,7 +263,9 @@ Then extract the answer from the model response and type it at the end of the pr
|
|||
|
||||
return is_equal(prediction, answer)
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple
|
|||
|
||||
import openai
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
ICL_EXAMPLES = [
|
||||
"""
|
||||
|
|
@ -322,7 +322,9 @@ class MathVista(EvalBase):
|
|||
|
||||
return pred.lower() == ans.lower()
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -99,7 +99,9 @@ class MMBench(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
@ -149,4 +151,10 @@ class MMBench(EvalBase):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(eval_runner(MMBench(split="dev", lang="en", version="v1.1", temperature=0.0, max_tokens=256)))
|
||||
asyncio.run(
|
||||
eval_runner(
|
||||
MMBench(
|
||||
split="dev", lang="en", version="v1.1", temperature=0.0, max_tokens=256
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -134,7 +134,9 @@ class MMMU(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -154,7 +154,9 @@ class MMMUPro(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
@ -201,4 +203,8 @@ class MMMUPro(EvalBase):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(eval_runner(MMMUPro(split="test", variant="standard", temperature=0.0, max_tokens=1024)))
|
||||
asyncio.run(
|
||||
eval_runner(
|
||||
MMMUPro(split="test", variant="standard", temperature=0.0, max_tokens=1024)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -90,7 +90,9 @@ class MMStar(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -118,7 +118,9 @@ class MMTBench(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
@ -169,6 +171,4 @@ class MMTBench(EvalBase):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(
|
||||
eval_runner(MMTBench(split="val", temperature=0.0, max_tokens=256))
|
||||
)
|
||||
asyncio.run(eval_runner(MMTBench(split="val", temperature=0.0, max_tokens=256)))
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ from typing import List, Optional, Tuple
|
|||
|
||||
import openai
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class MMVet(EvalBase):
|
||||
|
|
@ -124,7 +124,9 @@ Output ONLY a single number between 0 and 1."""
|
|||
return 0.5
|
||||
return 0.0
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -98,7 +98,9 @@ class MMVP(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -6,10 +6,10 @@ import io
|
|||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class OCRBench(EvalBase):
|
||||
|
|
@ -94,7 +94,9 @@ class OCRBench(EvalBase):
|
|||
|
||||
return False
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ import re
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class POPE(EvalBase):
|
||||
|
|
@ -85,7 +85,9 @@ class POPE(EvalBase):
|
|||
|
||||
return "Unknown"
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
completion = await self.chat_completion(server, messages)
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@ import io
|
|||
from typing import List, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class RealWorldQA(EvalBase):
|
||||
|
|
@ -77,7 +77,9 @@ Provide a brief, direct answer."""
|
|||
|
||||
return False
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ from string import ascii_uppercase
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
extract_letter_from_answer_tag,
|
||||
extract_mcqa_answer_with_fallback,
|
||||
|
|
@ -129,7 +129,9 @@ class SEEDBench2Plus(EvalBase):
|
|||
letter, method = extract_mcqa_answer_with_fallback(response, num_choices)
|
||||
return letter, method
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@ import zipfile
|
|||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
DEFAULT_DATA_DIR = Path.home() / ".cache" / "visulogic_hf"
|
||||
|
||||
|
|
@ -139,7 +139,9 @@ Answer with only the letter (A, B, C, or D)."""
|
|||
return False
|
||||
return prediction.upper() == answer.upper()
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ import re
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class VLMBlind(EvalBase):
|
||||
|
|
@ -120,7 +120,9 @@ class VLMBlind(EvalBase):
|
|||
else:
|
||||
return answer_lower in response_lower, response_lower[:50]
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
@ -160,6 +162,4 @@ class VLMBlind(EvalBase):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(
|
||||
eval_runner(VLMBlind(split="test", temperature=0.0, max_tokens=512))
|
||||
)
|
||||
asyncio.run(eval_runner(VLMBlind(split="test", temperature=0.0, max_tokens=512)))
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ from typing import Dict, List, Tuple
|
|||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
from PIL import Image
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
from environments.eval_environments.eval import EvalBase, eval_runner
|
||||
|
||||
|
||||
class WeMath(EvalBase):
|
||||
|
|
@ -135,7 +135,9 @@ class WeMath(EvalBase):
|
|||
return False
|
||||
return prediction.upper() == answer.upper()
|
||||
|
||||
async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]:
|
||||
async def run_item(
|
||||
self, server: ServerManager, data_item: dict
|
||||
) -> Tuple[dict, dict]:
|
||||
try:
|
||||
messages = self.build_messages(data_item)
|
||||
|
||||
|
|
@ -365,5 +367,7 @@ def _compute_final_scores(total_counts: Dict, total_count: int = 525) -> Dict:
|
|||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(
|
||||
eval_runner(WeMath(split="testmini", use_cot=False, temperature=0.0, max_tokens=512))
|
||||
eval_runner(
|
||||
WeMath(split="testmini", use_cot=False, temperature=0.0, max_tokens=512)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue