diff --git a/environments/eval_environments/eval_base.py b/environments/eval_environments/eval_base.py deleted file mode 100644 index cab054c5..00000000 --- a/environments/eval_environments/eval_base.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Base class for evaluation environments. - -based on PR #290 for eval-only environments. -""" - -import json -import logging -import os -import time -from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Tuple - -import jsonlines -from openai import AsyncOpenAI -from openai.types.chat import ChatCompletion -from tqdm.asyncio import tqdm_asyncio - -logger = logging.getLogger(__name__) - - -def evaluate_log( - metrics: Dict, - eval_dir: Optional[str] = None, - task_name: Optional[str] = None, - model_name: Optional[str] = None, - start_time: Optional[float] = None, - end_time: Optional[float] = None, - generation_parameters: Optional[Dict] = None, - samples: Optional[List[Dict]] = None, - verbose: bool = True, -): - if eval_dir is None: - logger.warning("eval_dir is not set, skipping evaluation logging") - return - - os.makedirs(eval_dir, exist_ok=True) - filepath = os.path.join(eval_dir, "metrics.json") - - if start_time is None: - start_time = time.time() - if end_time is None: - end_time = time.time() - if generation_parameters is None: - generation_parameters = {} - - if verbose: - print(f"\n{'='*60}") - print(f" {task_name}") - print(f"{'='*60}") - for key, value in metrics.items(): - if isinstance(value, float): - print(f" {key}: {value:.4f}") - else: - print(f" {key}: {value}") - print(f" Time: {end_time - start_time:.1f}s") - print(f"{'='*60}\n") - - task_key = f"atropos|{task_name}|0" - eval_result = { - "config_general": { - "model_name": model_name, - "total_evaluation_time_seconds": str(end_time - start_time), - "generation_parameters": generation_parameters, - }, - "results": { - task_key: metrics, - "all": metrics, - }, - } - - with open(filepath, "w") as f: - json.dump(eval_result, f, indent=2) - - print(f"Evaluation results saved to {filepath}") - - if samples: - samples_filepath = os.path.join(eval_dir, "samples.jsonl") - with jsonlines.open(samples_filepath, "w") as writer: - for sample in samples: - writer.write(sample) - print(f"Evaluation samples saved to {samples_filepath}") - - -class EvalBase(ABC): - """ - Base class for evaluation environments. - - Subclasses must implement: - - setup_data(): Returns list of data items to evaluate - - run_item(client, data_item): Process one item, returns (metrics_dict, sample) - """ - - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - self.data = self.setup_data() - - def get_generation_params(self) -> dict: - return { - "temperature": getattr(self, "temperature", 0.0), - "max_tokens": getattr(self, "max_tokens", 4096), - "n": getattr(self, "n", 1), - } - - async def chat_completion( - self, client: AsyncOpenAI, messages: List[dict] - ) -> ChatCompletion: - gen_params = self.get_generation_params() - return await client.chat.completions.create( - model=self.model_name, - messages=messages, - **gen_params, - ) - - @abstractmethod - def setup_data(self) -> list: - raise NotImplementedError - - @abstractmethod - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: - """ - Process a single data item. - - Returns: - Tuple[dict, dict]: (metrics_dict, sample_dict) - - metrics_dict: keys like "accuracy" with numeric values - - sample_dict: the sample data for logging - """ - raise NotImplementedError - - async def __call__(self, client: AsyncOpenAI): - start_time = time.time() - - task_coros = [self.run_item(client, item) for item in self.data] - task_results = await tqdm_asyncio.gather( - *task_coros, desc=f"Evaluating {self.__class__.__name__}" - ) - - end_time = time.time() - - metrics_list = [result[0] for result in task_results] - samples = [result[1] for result in task_results] - - keys = list(metrics_list[0].keys()) - metrics = { - key: sum(result[key] for result in metrics_list) / len(metrics_list) - for key in keys - } - - task_name = self.__class__.__name__ - - evaluate_log( - metrics, - eval_dir=getattr(self, "eval_dir", None), - task_name=task_name, - model_name=self.model_name, - start_time=start_time, - end_time=end_time, - generation_parameters=self.get_generation_params(), - samples=samples, - verbose=True, - ) - - return metrics - - -async def eval_runner(eval_cls, **eval_kwargs): - """ - CLI runner for evaluation environments. - - Usage in __main__: - if __name__ == "__main__": - import asyncio - from eval_base import eval_runner - asyncio.run(eval_runner(MyEval, temperature=0.0, max_tokens=4096)) - """ - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--base-url", - type=str, - default="http://localhost:8000/v1", - help="Base URL for OpenAI-compatible API", - ) - parser.add_argument( - "--model-name", - type=str, - required=True, - help="Model name", - ) - parser.add_argument( - "--api-key", - type=str, - default="x", - help="API key (use 'x' for local servers)", - ) - parser.add_argument( - "--eval-dir", - type=str, - default=None, - help="Directory to save evaluation results", - ) - - args, _ = parser.parse_known_args() - - client = AsyncOpenAI( - base_url=args.base_url, - api_key=args.api_key, - ) - - eval_kwargs["model_name"] = args.model_name - eval_kwargs["eval_dir"] = args.eval_dir - - eval_env = eval_cls(**eval_kwargs) - return await eval_env(client) diff --git a/environments/eval_environments/vision_evals/README.md b/environments/eval_environments/vision_evals/README.md new file mode 100644 index 00000000..9e6d7919 --- /dev/null +++ b/environments/eval_environments/vision_evals/README.md @@ -0,0 +1,216 @@ +# Vision Evaluation Benchmarks + +This folder contains 27 vision and multimodal benchmarks for evaluating vision-language models. The implementations follow VLMEvalKit patterns where applicable and use the Atropos Eval class for consistent async evaluation. + +## Benchmarks + +| Benchmark | What it Tests | Dataset | Scoring | +|-----------|---------------|---------|---------| +| MMMU | Multi-discipline academic QA | MMMU/MMMU | MCQ accuracy | +| MMMU-Pro | Harder MMMU with 10 choices | MMMU/MMMU_Pro | MCQ accuracy | +| MMBench | General multimodal understanding | lmms-lab/MMBench | MCQ accuracy | +| MMStar | Expert-level multimodal QA | Lin-Chen/MMStar | MCQ accuracy | +| MMVet | Open-ended VLM capabilities | lmms-lab/MMVet | GPT scoring | +| MMVP | CLIP blindspot detection | MMVP/MMVP | MCQ accuracy | +| AI2D | Scientific diagram understanding | lmms-lab/ai2d | MCQ accuracy | +| BLINK | Visual perception tasks | BLINK-Benchmark/BLINK | MCQ accuracy | +| ChartQA | Chart question answering | ahmed-masry/ChartQA | Relaxed accuracy | +| CharXiv | Scientific chart understanding | princeton-nlp/CharXiv | GPT judge | +| CountBench | Object counting | nielsr/countbench | Numeric match | +| DocVQA | Document understanding | lmms-lab/DocVQA | ANLS score | +| DynaMath | Dynamic math reasoning | DynaMath/DynaMath_Sample | JSON extraction | +| HallusionBench | Visual hallucination detection | lmms-lab/HallusionBench | Yes/No accuracy | +| InfoVQA | Infographic QA | lmms-lab/InfoVQA | ANLS score | +| LogicVista | Visual logical reasoning | Yijia-Xiao/LogicVista | GPT extraction | +| MathVerse | Visual math (multi-version) | AI4Math/MathVerse | GPT extraction + scoring | +| MathVision | Visual math problems | MathLLMs/MathVision | GPT extraction | +| MathVista | Visual math reasoning | AI4Math/MathVista | GPT extraction | +| MMT-Bench | Multi-task multimodal | OpenGVLab/MMT-Bench | MCQ accuracy | +| OCRBench | OCR capabilities | echo840/OCRBench | Substring match | +| POPE | Object hallucination | lmms-lab/POPE | Yes/No accuracy | +| RealWorldQA | Real-world visual QA | xai-org/RealworldQA | Fuzzy match | +| SEED-Bench2 | Visual understanding | lmms-lab/SEED-Bench-2 | MCQ accuracy | +| VisuLogic | Visual logic puzzles | visulogic dataset | MCQ accuracy | +| VLMBlind | Basic visual perception | XAI/vlmsareblind | Task-specific | +| WeMath | Visual math with 4D metrics | We-Math/We-Math | 4D scoring | + +## Running an Evaluation + +All benchmarks use the same CLI pattern: + +```bash +python mmmu_environment.py \ + --model-name "gpt-4o" \ + --server-url "https://api.openai.com/v1" +``` + +For local models with vLLM or Ollama: + +```bash +python mmbench_environment.py \ + --model-name "Qwen/Qwen2-VL-7B-Instruct" \ + --server-url "http://localhost:8000/v1" +``` + +The evaluations use the `ServerManager` from atroposlib for making API calls. + +## Comparison with VLMEvalKit + +These implementations are aligned with VLMEvalKit where it makes sense, but simplified for standalone use. Here are the key differences and similarities: + +### Scoring Methods + +**ChartQA** uses relaxed accuracy with 5% tolerance. Percentages are converted to decimals before comparison (5% becomes 0.05). This matches VLMEvalKit behavior in `vqa_eval.py`. + +**DocVQA and InfoVQA** use ANLS (Average Normalized Levenshtein Similarity) with a 0.5 threshold. This is the standard metric from the original papers. + +**MathVista** uses GPT-based answer extraction with 5 in-context learning examples. There is a prefetch mechanism that tries regex first before calling GPT. The extraction prompt and ICL examples are taken from VLMEvalKit. + +**MathVerse** uses two-stage GPT evaluation. First GPT extracts the answer from the response, then GPT judges whether the extracted answer matches the ground truth. This matches the VLMEvalKit approach. + +**WeMath** computes 4-dimensional metrics beyond simple accuracy: +- IK (Insufficient Knowledge): wrong on steps AND wrong on multi +- IG (Inadequate Generalization): right on steps BUT wrong on multi +- CM (Complete Mastery): right on steps AND right on multi +- RM (Rote Memorization): wrong on steps BUT right on multi + +**MMVet** uses GPT to score open-ended responses on a 0-1 scale. Without an API key it falls back to substring matching. + +**OCRBench** uses category-specific scoring. For handwritten math expressions it compares without spaces. For other categories it does case-insensitive substring matching. + +### What We Changed + +**Simpler data loading**: We use HuggingFace datasets directly instead of VLMEvalKit's TSV preprocessing. This makes the code easier to understand but may load data slightly differently. + +**Async evaluation**: Everything runs async with tqdm progress bars. VLMEvalKit uses synchronous evaluation by default. + +**No circular evaluation**: VLMEvalKit supports "circular" MCQ evaluation where options are rotated and the model must get all rotations correct. We do not implement this, which means our MCQ scores may be slightly higher than VLMEvalKit on some benchmarks. + +**Unified CLI**: All benchmarks use the same `eval_runner` CLI instead of VLMEvalKit's `run.py` with config files. + +### Expected Score Differences + +Due to the differences above, you should expect: + +- MCQ benchmarks (MMMU, MMBench, MMStar, AI2D): Within 1-2% of VLMEvalKit +- VQA benchmarks (DocVQA, ChartQA): Very close, same scoring methods +- Math benchmarks (MathVista, MathVerse): Within 2-3%, depends on GPT extraction +- Open-ended (MMVet): Can vary more, depends on GPT judge prompts + +## Benchmark Details + +### General Multimodal Understanding + +**MMMU** tests multi-discipline academic knowledge across 30 subjects from accounting to physics. Questions require understanding images and domain knowledge. The validation split has about 900 questions. + +**MMMU-Pro** is a harder version with 10 answer choices instead of 4. It has three variants: standard (10 options), standard_4 (4 options), and vision (question in image). + +**MMBench** is a comprehensive benchmark covering perception, reasoning, and knowledge. It has English and Chinese versions. + +**MMStar** focuses on expert-level questions that require both visual understanding and specialized knowledge. + +**SEED-Bench2** tests visual understanding across many categories including scene understanding, instance identity, and spatial relations. The dataset is large (24k samples) so we stream by default and limit to 1000 samples. + +**MMT-Bench** is a multi-task benchmark covering 32 different task types. Good for testing breadth of capabilities. + +### Document and Chart Understanding + +**DocVQA** tests understanding of document images like forms, receipts, and scientific papers. Uses ANLS scoring which allows for minor OCR errors. + +**InfoVQA** is similar to DocVQA but focuses on infographics with more complex layouts. + +**ChartQA** tests chart reading. Has human and augmented subsets. The human subset is harder. Uses relaxed accuracy (5% tolerance for numbers). + +**CharXiv** focuses on scientific charts from arXiv papers. Uses GPT as a judge with grading queries from the dataset. + +**OCRBench** tests pure OCR capabilities across 10 categories from regular text to handwritten math expressions. + +### Math and Reasoning + +**MathVista** is a visual math benchmark with multiple question types (free form, multiple choice) and answer types (integer, float, text, list). Uses the dataset's built-in query prompts. + +**MathVerse** has problems at different visual complexity levels from "text dominant" to "vision only". Uses two-stage GPT evaluation. + +**MathVision** is another visual math benchmark. Uses GPT extraction with fallback to regex. + +**DynaMath** tests dynamic math reasoning with JSON-formatted outputs. Has subject and difficulty level breakdowns. + +**WeMath** provides detailed 4D metrics to understand where models fail. Good for diagnosing reasoning vs memorization issues. + +**LogicVista** tests visual logical reasoning with 5 skill types. Supports multi-letter answers where multiple options can be correct. + +**VisuLogic** tests visual logic with diagram-based puzzles. + +### Perception and Hallucination + +**POPE** tests object hallucination with yes/no questions about whether objects exist in images. Has random, popular, and adversarial variants. + +**HallusionBench** tests visual hallucinations more broadly. Questions are designed to trick models into seeing things that are not there. + +**MMVP** tests visual perception on cases where CLIP-based models tend to fail. Useful for understanding encoder limitations. + +**BLINK** tests basic visual perception like counting, spatial relations, and similarity. Models often struggle on these "easy" tasks. + +**VLMBlind** (VLMs Are Blind) tests very basic visual tasks that humans find trivial but VLMs often fail. Includes counting grid cells, finding circled letters, and counting Olympic rings. + +**CountBench** is a simple object counting benchmark. + +### Real World + +**RealWorldQA** tests understanding of real-world images from XAI. Uses fuzzy matching for answers. + +**AI2D** tests understanding of scientific diagrams from AI2 (Allen Institute). Good for testing diagram reasoning. + +## GPT Judge Configuration + +Several benchmarks use GPT for answer extraction or scoring. To enable this: + +```bash +export OPENAI_API_KEY="your-key" +``` + +You can also configure the judge model when instantiating: + +```python +eval_env = MathVista( + use_gpt_extraction=True, + judge_model="gpt-4o-mini", + judge_base_url="https://api.openai.com/v1", +) +asyncio.run(eval_runner(eval_env)) +``` + +Without an API key, benchmarks fall back to regex-based extraction which is less accurate but free. + +## Output Format + +Results are saved to the eval directory: + +``` +eval_results/ + metrics.json # Overall scores + samples.jsonl # Per-item predictions +``` + +The metrics.json file contains accuracy and other metrics depending on the benchmark. The samples.jsonl file has one line per question with the prediction, answer, and whether it was correct. + +## Adding New Benchmarks + +To add a new vision benchmark: + +1. Create a new file like `new_benchmark_environment.py` +2. Inherit from `EvalBase` +3. Implement `setup_data()` to load the dataset +4. Implement `run_item(self, server: ServerManager, data_item: dict)` to process one item +5. Use `await self.chat_completion(server, messages)` for API calls +6. Add image encoding using the standard `encode_image()` pattern + +See any existing benchmark for a template. The MMMU implementation is a good starting point for MCQ benchmarks. DocVQA is a good template for VQA benchmarks. + +## References + +- VLMEvalKit: https://github.com/open-compass/VLMEvalKit +- OpenVLM Leaderboard: https://huggingface.co/spaces/opencompass/open_vlm_leaderboard +- MMMU: https://mmmu-benchmark.github.io/ +- MathVista: https://mathvista.github.io/ +- DocVQA: https://www.docvqa.org/ diff --git a/environments/eval_environments/ai2d_environment.py b/environments/eval_environments/vision_evals/ai2d_environment.py similarity index 88% rename from environments/eval_environments/ai2d_environment.py rename to environments/eval_environments/vision_evals/ai2d_environment.py index 8ebce645..0d3f37f4 100644 --- a/environments/eval_environments/ai2d_environment.py +++ b/environments/eval_environments/vision_evals/ai2d_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -102,17 +102,10 @@ class AI2D(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -161,11 +154,4 @@ class AI2D(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - AI2D, - split="test", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(AI2D(split="test", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/blink_environment.py b/environments/eval_environments/vision_evals/blink_environment.py similarity index 89% rename from environments/eval_environments/blink_environment.py rename to environments/eval_environments/vision_evals/blink_environment.py index 6c1a6a96..fadfd768 100644 --- a/environments/eval_environments/blink_environment.py +++ b/environments/eval_environments/vision_evals/blink_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -115,17 +115,10 @@ class BLINK(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -172,11 +165,4 @@ class BLINK(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - BLINK, - split="val", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(BLINK(split="val", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/chartqa_environment.py b/environments/eval_environments/vision_evals/chartqa_environment.py similarity index 90% rename from environments/eval_environments/chartqa_environment.py rename to environments/eval_environments/vision_evals/chartqa_environment.py index 785e01ac..77a570e8 100644 --- a/environments/eval_environments/chartqa_environment.py +++ b/environments/eval_environments/vision_evals/chartqa_environment.py @@ -8,10 +8,10 @@ from pathlib import Path from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class ChartQA(EvalBase): @@ -142,17 +142,11 @@ Question: {query}""" # Non-numeric: exact match (case-insensitive) return pred.lower() == ans.lower() - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -189,10 +183,6 @@ Question: {query}""" if __name__ == "__main__": asyncio.run( eval_runner( - ChartQA, - subset="human", - relaxed_tolerance=0.05, - temperature=0.0, - max_tokens=2048, + ChartQA(subset="human", relaxed_tolerance=0.05, temperature=0.0, max_tokens=2048) ) ) diff --git a/environments/eval_environments/charxiv_environment.py b/environments/eval_environments/vision_evals/charxiv_environment.py similarity index 94% rename from environments/eval_environments/charxiv_environment.py rename to environments/eval_environments/vision_evals/charxiv_environment.py index 52ebec70..3deaeae2 100644 --- a/environments/eval_environments/charxiv_environment.py +++ b/environments/eval_environments/vision_evals/charxiv_environment.py @@ -10,10 +10,10 @@ from typing import Dict, List, Optional, Tuple import openai from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner DESCRIPTIVE_CATEGORIES = { 1: "Information Extraction", @@ -270,18 +270,12 @@ class CharXiv(EvalBase): inst_category = item.get("inst_category", 1) return REASONING_CATEGORIES.get(inst_category, "Text-in-Chart") - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) mode = getattr(self, "mode", "descriptive") - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0, "score": 0.0}, {"error": "Empty response"} @@ -374,12 +368,13 @@ def compute_category_metrics(samples: List[dict]) -> Dict: if __name__ == "__main__": asyncio.run( eval_runner( - CharXiv, - mode="descriptive", # or "reasoning" - split="validation", - use_gpt_judge=True, - judge_model="gpt-4o-mini", - temperature=0.0, - max_tokens=1024, + CharXiv( + mode="descriptive", # or "reasoning" + split="validation", + use_gpt_judge=True, + judge_model="gpt-4o-mini", + temperature=0.0, + max_tokens=1024, + ) ) ) diff --git a/environments/eval_environments/countbench_environment.py b/environments/eval_environments/vision_evals/countbench_environment.py similarity index 86% rename from environments/eval_environments/countbench_environment.py rename to environments/eval_environments/vision_evals/countbench_environment.py index fc291f31..62ebd622 100644 --- a/environments/eval_environments/countbench_environment.py +++ b/environments/eval_environments/vision_evals/countbench_environment.py @@ -7,10 +7,10 @@ import re from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class CountBench(EvalBase): @@ -97,17 +97,10 @@ class CountBench(EvalBase): return False - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -139,11 +132,4 @@ class CountBench(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - CountBench, - split="test", - temperature=0.0, - max_tokens=64, - ) - ) + asyncio.run(eval_runner(CountBench(split="test", temperature=0.0, max_tokens=64))) diff --git a/environments/eval_environments/docvqa_environment.py b/environments/eval_environments/vision_evals/docvqa_environment.py similarity index 89% rename from environments/eval_environments/docvqa_environment.py rename to environments/eval_environments/vision_evals/docvqa_environment.py index 0d64c1a0..b781604f 100644 --- a/environments/eval_environments/docvqa_environment.py +++ b/environments/eval_environments/vision_evals/docvqa_environment.py @@ -5,10 +5,10 @@ import re from typing import List, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class DocVQA(EvalBase): @@ -142,17 +142,10 @@ Provide only the answer, as concisely as possible.""" return previous_row[-1] - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0, "anls": 0.0}, {"error": "Empty response"} @@ -194,11 +187,4 @@ Provide only the answer, as concisely as possible.""" if __name__ == "__main__": - asyncio.run( - eval_runner( - DocVQA, - split="test", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(DocVQA(split="test", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/dynamath_environment.py b/environments/eval_environments/vision_evals/dynamath_environment.py similarity index 93% rename from environments/eval_environments/dynamath_environment.py rename to environments/eval_environments/vision_evals/dynamath_environment.py index dca125ce..f9f18c27 100644 --- a/environments/eval_environments/dynamath_environment.py +++ b/environments/eval_environments/vision_evals/dynamath_environment.py @@ -10,10 +10,10 @@ from typing import List, Optional, Tuple import numpy as np from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class DynaMath(EvalBase): @@ -197,17 +197,11 @@ Example of expected JSON response format: or answer.lower() in extracted.lower() ) - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -246,10 +240,6 @@ Example of expected JSON response format: if __name__ == "__main__": asyncio.run( eval_runner( - DynaMath, - split="test", - use_json_format=True, - temperature=0.0, - max_tokens=1024, + DynaMath(split="test", use_json_format=True, temperature=0.0, max_tokens=1024) ) ) diff --git a/environments/eval_environments/hallusionbench_environment.py b/environments/eval_environments/vision_evals/hallusionbench_environment.py similarity index 87% rename from environments/eval_environments/hallusionbench_environment.py rename to environments/eval_environments/vision_evals/hallusionbench_environment.py index 1fdb147e..3abe2b6b 100644 --- a/environments/eval_environments/hallusionbench_environment.py +++ b/environments/eval_environments/vision_evals/hallusionbench_environment.py @@ -7,10 +7,10 @@ import re from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class HallusionBench(EvalBase): @@ -96,17 +96,10 @@ class HallusionBench(EvalBase): return "Unknown" - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -147,11 +140,4 @@ class HallusionBench(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - HallusionBench, - split="test", - temperature=0.0, - max_tokens=64, - ) - ) + asyncio.run(eval_runner(HallusionBench(split="test", temperature=0.0, max_tokens=64))) diff --git a/environments/eval_environments/infovqa_environment.py b/environments/eval_environments/vision_evals/infovqa_environment.py similarity index 88% rename from environments/eval_environments/infovqa_environment.py rename to environments/eval_environments/vision_evals/infovqa_environment.py index 077e7db5..c320182f 100644 --- a/environments/eval_environments/infovqa_environment.py +++ b/environments/eval_environments/vision_evals/infovqa_environment.py @@ -5,10 +5,10 @@ import re from typing import List, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class InfoVQA(EvalBase): @@ -127,17 +127,10 @@ Provide only the answer, as concisely as possible.""" return previous_row[-1] - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0, "anls": 0.0}, {"error": "Empty response"} @@ -178,11 +171,4 @@ Provide only the answer, as concisely as possible.""" if __name__ == "__main__": - asyncio.run( - eval_runner( - InfoVQA, - split="test", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(InfoVQA(split="test", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/logicvista_environment.py b/environments/eval_environments/vision_evals/logicvista_environment.py similarity index 92% rename from environments/eval_environments/logicvista_environment.py rename to environments/eval_environments/vision_evals/logicvista_environment.py index 388b42b2..f20277e8 100644 --- a/environments/eval_environments/logicvista_environment.py +++ b/environments/eval_environments/vision_evals/logicvista_environment.py @@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple import openai from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner EXTRACTION_PROMPT_TEMPLATE = """You are a information extractor that extracts multiple choice letter answer choices \ from a paragraph that contains the answer choice and sometimes explaination of why that \ @@ -195,17 +195,11 @@ Provide your answer as the letter(s) of the correct choice(s), e.g., A, B, C, D, return pred_normalized == answer_normalized - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0, "hit": 0}, {"error": "Empty response"} @@ -296,11 +290,12 @@ def compute_skill_metrics(samples: List[dict]) -> Dict: if __name__ == "__main__": asyncio.run( eval_runner( - LogicVista, - split="test", - use_gpt_extraction=True, - judge_model="gpt-4o-mini", - temperature=0.0, - max_tokens=512, + LogicVista( + split="test", + use_gpt_extraction=True, + judge_model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + ) ) ) diff --git a/environments/eval_environments/mathverse_environment.py b/environments/eval_environments/vision_evals/mathverse_environment.py similarity index 94% rename from environments/eval_environments/mathverse_environment.py rename to environments/eval_environments/vision_evals/mathverse_environment.py index 74a1948d..d4dfc478 100644 --- a/environments/eval_environments/mathverse_environment.py +++ b/environments/eval_environments/vision_evals/mathverse_environment.py @@ -9,10 +9,10 @@ from typing import List, Optional, Tuple import openai from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner EXTRACT_ICL_EXAMPLES = [ "1.\nModel response: 'The perimeter of the sector is approximately (-2, 1)'\n" @@ -245,17 +245,11 @@ Judgement:""" return False - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -328,12 +322,13 @@ Judgement:""" if __name__ == "__main__": asyncio.run( eval_runner( - MathVerse, - split="testmini", - use_cot=False, - use_gpt_evaluation=True, - judge_model="gpt-4o-mini", - temperature=0.0, - max_tokens=2048, + MathVerse( + split="testmini", + use_cot=False, + use_gpt_evaluation=True, + judge_model="gpt-4o-mini", + temperature=0.0, + max_tokens=2048, + ) ) ) diff --git a/environments/eval_environments/mathvision_environment.py b/environments/eval_environments/vision_evals/mathvision_environment.py similarity index 94% rename from environments/eval_environments/mathvision_environment.py rename to environments/eval_environments/vision_evals/mathvision_environment.py index 9df0913a..dda9fd10 100644 --- a/environments/eval_environments/mathvision_environment.py +++ b/environments/eval_environments/vision_evals/mathvision_environment.py @@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple import openai from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner ICL_EXAMPLES = [ """Hint: Please answer the question and provide the final answer at the end. @@ -263,17 +263,11 @@ Then extract the answer from the model response and type it at the end of the pr return is_equal(prediction, answer) - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -335,11 +329,12 @@ Then extract the answer from the model response and type it at the end of the pr if __name__ == "__main__": asyncio.run( eval_runner( - MathVision, - split="testmini", - use_gpt_extraction=True, - judge_model="gpt-4o-mini", - temperature=0.0, - max_tokens=2048, + MathVision( + split="testmini", + use_gpt_extraction=True, + judge_model="gpt-4o-mini", + temperature=0.0, + max_tokens=2048, + ) ) ) diff --git a/environments/eval_environments/mathvista_environment.py b/environments/eval_environments/vision_evals/mathvista_environment.py similarity index 95% rename from environments/eval_environments/mathvista_environment.py rename to environments/eval_environments/vision_evals/mathvista_environment.py index 1a39f70d..be1f9779 100644 --- a/environments/eval_environments/mathvista_environment.py +++ b/environments/eval_environments/vision_evals/mathvista_environment.py @@ -9,10 +9,10 @@ from typing import Dict, List, Optional, Tuple import openai from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner ICL_EXAMPLES = [ """ @@ -322,17 +322,11 @@ class MathVista(EvalBase): return pred.lower() == ans.lower() - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -400,12 +394,13 @@ class MathVista(EvalBase): if __name__ == "__main__": asyncio.run( eval_runner( - MathVista, - split="testmini", - use_query=True, - use_gpt_extraction=True, - judge_model="gpt-4o-mini", - temperature=0.0, - max_tokens=4096, + MathVista( + split="testmini", + use_query=True, + use_gpt_extraction=True, + judge_model="gpt-4o-mini", + temperature=0.0, + max_tokens=4096, + ) ) ) diff --git a/environments/eval_environments/mmbench_environment.py b/environments/eval_environments/vision_evals/mmbench_environment.py similarity index 87% rename from environments/eval_environments/mmbench_environment.py rename to environments/eval_environments/vision_evals/mmbench_environment.py index 60453b44..fe647039 100644 --- a/environments/eval_environments/mmbench_environment.py +++ b/environments/eval_environments/vision_evals/mmbench_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -99,17 +99,10 @@ class MMBench(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -156,13 +149,4 @@ class MMBench(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - MMBench, - split="dev", - lang="en", - version="v1.1", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(MMBench(split="dev", lang="en", version="v1.1", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/mmmu_environment.py b/environments/eval_environments/vision_evals/mmmu_environment.py similarity index 89% rename from environments/eval_environments/mmmu_environment.py rename to environments/eval_environments/vision_evals/mmmu_environment.py index cfcfb77e..9939da1c 100644 --- a/environments/eval_environments/mmmu_environment.py +++ b/environments/eval_environments/vision_evals/mmmu_environment.py @@ -8,10 +8,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -134,17 +134,10 @@ class MMMU(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -188,11 +181,4 @@ class MMMU(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - MMMU, - split="validation", - temperature=0.0, - max_tokens=1024, - ) - ) + asyncio.run(eval_runner(MMMU(split="validation", temperature=0.0, max_tokens=1024))) diff --git a/environments/eval_environments/mmmu_pro_environment.py b/environments/eval_environments/vision_evals/mmmu_pro_environment.py similarity index 91% rename from environments/eval_environments/mmmu_pro_environment.py rename to environments/eval_environments/vision_evals/mmmu_pro_environment.py index 711b5d6f..950644d6 100644 --- a/environments/eval_environments/mmmu_pro_environment.py +++ b/environments/eval_environments/vision_evals/mmmu_pro_environment.py @@ -8,10 +8,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -154,17 +154,10 @@ class MMMUPro(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -208,12 +201,4 @@ class MMMUPro(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - MMMUPro, - split="test", - variant="standard", - temperature=0.0, - max_tokens=1024, - ) - ) + asyncio.run(eval_runner(MMMUPro(split="test", variant="standard", temperature=0.0, max_tokens=1024))) diff --git a/environments/eval_environments/mmstar_environment.py b/environments/eval_environments/vision_evals/mmstar_environment.py similarity index 87% rename from environments/eval_environments/mmstar_environment.py rename to environments/eval_environments/vision_evals/mmstar_environment.py index b0ddb21d..7512e2d9 100644 --- a/environments/eval_environments/mmstar_environment.py +++ b/environments/eval_environments/vision_evals/mmstar_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -90,17 +90,10 @@ class MMStar(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -147,11 +140,4 @@ class MMStar(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - MMStar, - split="val", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(MMStar(split="val", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/mmt_bench_environment.py b/environments/eval_environments/vision_evals/mmt_bench_environment.py similarity index 90% rename from environments/eval_environments/mmt_bench_environment.py rename to environments/eval_environments/vision_evals/mmt_bench_environment.py index f4e9e43d..84e4f03f 100644 --- a/environments/eval_environments/mmt_bench_environment.py +++ b/environments/eval_environments/vision_evals/mmt_bench_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -118,17 +118,11 @@ class MMTBench(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -176,10 +170,5 @@ class MMTBench(EvalBase): if __name__ == "__main__": asyncio.run( - eval_runner( - MMTBench, - split="val", - temperature=0.0, - max_tokens=256, - ) + eval_runner(MMTBench(split="val", temperature=0.0, max_tokens=256)) ) diff --git a/environments/eval_environments/mmvet_environment.py b/environments/eval_environments/vision_evals/mmvet_environment.py similarity index 89% rename from environments/eval_environments/mmvet_environment.py rename to environments/eval_environments/vision_evals/mmvet_environment.py index be878b78..c85ba794 100644 --- a/environments/eval_environments/mmvet_environment.py +++ b/environments/eval_environments/vision_evals/mmvet_environment.py @@ -8,10 +8,10 @@ from typing import List, Optional, Tuple import openai from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class MMVet(EvalBase): @@ -124,17 +124,11 @@ Output ONLY a single number between 0 and 1.""" return 0.5 return 0.0 - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -179,11 +173,12 @@ Output ONLY a single number between 0 and 1.""" if __name__ == "__main__": asyncio.run( eval_runner( - MMVet, - split="test", - use_gpt_scoring=True, - judge_model="gpt-4o-mini", - temperature=0.0, - max_tokens=512, + MMVet( + split="test", + use_gpt_scoring=True, + judge_model="gpt-4o-mini", + temperature=0.0, + max_tokens=512, + ) ) ) diff --git a/environments/eval_environments/mmvp_environment.py b/environments/eval_environments/vision_evals/mmvp_environment.py similarity index 88% rename from environments/eval_environments/mmvp_environment.py rename to environments/eval_environments/vision_evals/mmvp_environment.py index 62712061..70a7e823 100644 --- a/environments/eval_environments/mmvp_environment.py +++ b/environments/eval_environments/vision_evals/mmvp_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -98,17 +98,10 @@ class MMVP(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -155,11 +148,4 @@ class MMVP(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - MMVP, - split="test", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(MMVP(split="test", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/ocrbench_environment.py b/environments/eval_environments/vision_evals/ocrbench_environment.py similarity index 87% rename from environments/eval_environments/ocrbench_environment.py rename to environments/eval_environments/vision_evals/ocrbench_environment.py index cec6e689..2b0e13f1 100644 --- a/environments/eval_environments/ocrbench_environment.py +++ b/environments/eval_environments/vision_evals/ocrbench_environment.py @@ -6,10 +6,10 @@ import io from typing import Dict, List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class OCRBench(EvalBase): @@ -94,17 +94,10 @@ class OCRBench(EvalBase): return False - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -144,11 +137,4 @@ class OCRBench(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - OCRBench, - split="test", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(OCRBench(split="test", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/pope_environment.py b/environments/eval_environments/vision_evals/pope_environment.py similarity index 86% rename from environments/eval_environments/pope_environment.py rename to environments/eval_environments/vision_evals/pope_environment.py index ada7b47f..00c9f529 100644 --- a/environments/eval_environments/pope_environment.py +++ b/environments/eval_environments/vision_evals/pope_environment.py @@ -7,10 +7,10 @@ import re from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class POPE(EvalBase): @@ -85,17 +85,10 @@ class POPE(EvalBase): return "Unknown" - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -136,11 +129,4 @@ class POPE(EvalBase): if __name__ == "__main__": - asyncio.run( - eval_runner( - POPE, - split="test", - temperature=0.0, - max_tokens=64, - ) - ) + asyncio.run(eval_runner(POPE(split="test", temperature=0.0, max_tokens=64))) diff --git a/environments/eval_environments/realworldqa_environment.py b/environments/eval_environments/vision_evals/realworldqa_environment.py similarity index 83% rename from environments/eval_environments/realworldqa_environment.py rename to environments/eval_environments/vision_evals/realworldqa_environment.py index c1b9bf3a..4265c696 100644 --- a/environments/eval_environments/realworldqa_environment.py +++ b/environments/eval_environments/vision_evals/realworldqa_environment.py @@ -4,10 +4,10 @@ import io from typing import List, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class RealWorldQA(EvalBase): @@ -77,17 +77,11 @@ Provide a brief, direct answer.""" return False - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -122,11 +116,4 @@ Provide a brief, direct answer.""" if __name__ == "__main__": - asyncio.run( - eval_runner( - RealWorldQA, - split="test", - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(RealWorldQA(split="test", temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/seedbench2_plus_environment.py b/environments/eval_environments/vision_evals/seedbench2_plus_environment.py similarity index 91% rename from environments/eval_environments/seedbench2_plus_environment.py rename to environments/eval_environments/vision_evals/seedbench2_plus_environment.py index 300dcb58..fbb9c8ca 100644 --- a/environments/eval_environments/seedbench2_plus_environment.py +++ b/environments/eval_environments/vision_evals/seedbench2_plus_environment.py @@ -7,10 +7,10 @@ from string import ascii_uppercase from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner from environments.eval_environments.eval_helpers import ( extract_letter_from_answer_tag, extract_mcqa_answer_with_fallback, @@ -129,17 +129,11 @@ class SEEDBench2Plus(EvalBase): letter, method = extract_mcqa_answer_with_fallback(response, num_choices) return letter, method - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -195,10 +189,5 @@ class SEEDBench2Plus(EvalBase): if __name__ == "__main__": asyncio.run( - eval_runner( - SEEDBench2Plus, - split="test", - temperature=0.0, - max_tokens=256, - ) + eval_runner(SEEDBench2Plus(split="test", temperature=0.0, max_tokens=256)) ) diff --git a/environments/eval_environments/visulogic_environment.py b/environments/eval_environments/vision_evals/visulogic_environment.py similarity index 89% rename from environments/eval_environments/visulogic_environment.py rename to environments/eval_environments/vision_evals/visulogic_environment.py index 6f63f1d3..ddd72a59 100644 --- a/environments/eval_environments/visulogic_environment.py +++ b/environments/eval_environments/vision_evals/visulogic_environment.py @@ -7,10 +7,10 @@ from pathlib import Path from typing import List, Tuple from huggingface_hub import hf_hub_download -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner DEFAULT_DATA_DIR = Path.home() / ".cache" / "visulogic_hf" @@ -139,17 +139,11 @@ Answer with only the letter (A, B, C, or D).""" return False return prediction.upper() == answer.upper() - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -185,10 +179,4 @@ Answer with only the letter (A, B, C, or D).""" if __name__ == "__main__": - asyncio.run( - eval_runner( - VisuLogic, - temperature=0.0, - max_tokens=256, - ) - ) + asyncio.run(eval_runner(VisuLogic(temperature=0.0, max_tokens=256))) diff --git a/environments/eval_environments/vlmblind_environment.py b/environments/eval_environments/vision_evals/vlmblind_environment.py similarity index 90% rename from environments/eval_environments/vlmblind_environment.py rename to environments/eval_environments/vision_evals/vlmblind_environment.py index cd988342..9cb19cb6 100644 --- a/environments/eval_environments/vlmblind_environment.py +++ b/environments/eval_environments/vision_evals/vlmblind_environment.py @@ -7,10 +7,10 @@ import re from typing import List, Optional, Tuple from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class VLMBlind(EvalBase): @@ -120,17 +120,11 @@ class VLMBlind(EvalBase): else: return answer_lower in response_lower, response_lower[:50] - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0}, {"error": "Empty response"} @@ -167,10 +161,5 @@ class VLMBlind(EvalBase): if __name__ == "__main__": asyncio.run( - eval_runner( - VLMBlind, - split="test", - temperature=0.0, - max_tokens=512, - ) + eval_runner(VLMBlind(split="test", temperature=0.0, max_tokens=512)) ) diff --git a/environments/eval_environments/wemath_environment.py b/environments/eval_environments/vision_evals/wemath_environment.py similarity index 94% rename from environments/eval_environments/wemath_environment.py rename to environments/eval_environments/vision_evals/wemath_environment.py index 0aaeb70f..3ddc9336 100644 --- a/environments/eval_environments/wemath_environment.py +++ b/environments/eval_environments/vision_evals/wemath_environment.py @@ -10,10 +10,10 @@ from typing import Dict, List, Tuple import pandas as pd from datasets import load_dataset -from openai import AsyncOpenAI from PIL import Image -from environments.eval_environments.eval_base import EvalBase, eval_runner +from atroposlib.envs.server_handling.server_manager import ServerManager +from environments.eval_environments.eval import EvalBase, eval_runner class WeMath(EvalBase): @@ -135,17 +135,11 @@ class WeMath(EvalBase): return False return prediction.upper() == answer.upper() - async def run_item(self, client: AsyncOpenAI, data_item: dict) -> Tuple[dict, dict]: + async def run_item(self, server: ServerManager, data_item: dict) -> Tuple[dict, dict]: try: messages = self.build_messages(data_item) - gen_params = self.get_generation_params() - completion = await client.chat.completions.create( - model=self.model_name, - messages=messages, - temperature=gen_params["temperature"], - max_tokens=gen_params["max_tokens"], - ) + completion = await self.chat_completion(server, messages) if not completion.choices: return {"accuracy": 0.0, "hit": 0}, {"error": "Empty response"} @@ -371,11 +365,5 @@ def _compute_final_scores(total_counts: Dict, total_count: int = 525) -> Dict: if __name__ == "__main__": asyncio.run( - eval_runner( - WeMath, - split="testmini", - use_cot=False, - temperature=0.0, - max_tokens=512, - ) + eval_runner(WeMath(split="testmini", use_cot=False, temperature=0.0, max_tokens=512)) )