diff --git a/eval/eval.py b/eval/eval.py
index bbfc585d..4eb1e12d 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -1,171 +1,136 @@
import argparse
import asyncio
import json
+import logging
import os
-import re
-import time
+from dataclasses import asdict
from datetime import datetime
from typing import Any
-from openai import AsyncOpenAI
-from tqdm.asyncio import tqdm_asyncio
+import aiohttp
+from eval_config import EvalConfig
+from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential
-from reasoning_gym.factory import create_dataset
-from reasoning_gym.utils import SYSTEM_PROMPTS
+import reasoning_gym
+from reasoning_gym.utils import extract_answer
-class AsyncOpenRouterEvaluator:
- def __init__(self, model: str, max_concurrent: int = 10):
- self.client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY"))
+class OpenRouterEvaluator:
+ def __init__(self, model: str, config: EvalConfig):
+ self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}")
+ self.config = config
+ self.output_dir = f"{config.eval_dir}/{config.category}"
+ os.makedirs(self.output_dir, exist_ok=True)
+ self.base_url = "https://openrouter.ai/api/v1/chat/completions"
+ self.api_key = os.getenv("OPENROUTER_API_KEY")
self.model = model
- self.extra_headers = {}
- self.max_concurrent = max_concurrent
- self.semaphore = asyncio.Semaphore(max_concurrent)
+ self.headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"),
+ "X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"),
+ "Content-Type": "application/json",
+ }
+ self.semaphore = asyncio.Semaphore(10) # Control concurrency
- async def get_model_response(self, prompt: str) -> str:
- """Get response from the model via OpenRouter API with rate limiting."""
- async with self.semaphore:
- try:
- completion = await self.client.chat.completions.create(
- extra_headers=self.extra_headers,
- model=self.model,
- messages=[
- {"role": "system", "content": SYSTEM_PROMPTS["default"]},
- {"role": "user", "content": prompt},
- ],
- )
- return completion.choices[0].message.content
- except Exception as e:
- print(f"Error calling OpenRouter API: {str(e)}")
- raise
+ def save_results(self, results: list[dict[str, Any]], dataset, dataset_name) -> dict[str, Any]:
+ file_name = f"{self.output_dir}/{dataset_name}.json"
+ total_score = sum(r["score"] for r in results)
- def parse_model_response(self, response: str) -> str:
- """Gather the final answer between the and tags."""
- match = re.search(r"(.*?)", response, re.DOTALL)
- return match.group(1).strip() if match else response
-
- async def process_single_question(self, entry: dict, dataset) -> dict:
- """Process a single question and return the result."""
- response = await self.get_model_response(entry["question"])
- answer = self.parse_model_response(response)
- score = dataset.score_answer(answer=answer, entry=entry)
-
- return {
- "question": entry["question"],
- "expected_answer": entry["answer"],
- "model_answer": answer,
- "full_model_response": response,
- "score": score,
- "metadata": entry["metadata"],
+ metrics = {
+ "dataset_name": dataset_name,
+ "model": self.model,
+ "size": dataset.size,
+ "provider": self.config.provider,
+ "average_score": total_score / len(results) if results else 0,
+ "total_examples": len(results),
+ "timestamp": datetime.now().isoformat(),
+ "config": asdict(dataset.config),
+ "results": results,
}
- async def evaluate_dataset(self, dataset_config: dict[str, Any]) -> dict[str, Any]:
- """Evaluate a single dataset with concurrent question processing."""
- dataset_name = dataset_config.pop("name")
- print(f"\nEvaluating dataset: {dataset_name}")
+ with open(file_name, "w") as f:
+ json.dump(metrics, f, indent=2)
+ return metrics
- try:
- # Create dataset with its specific configuration
- data = create_dataset(dataset_name, **dataset_config)
- all_entries = list(data)
+ async def get_model_response(self, session: aiohttp.ClientSession, prompt: str) -> str:
+ payload = {
+ "model": self.model,
+ "messages": [
+ {"role": self.config.developer_role, "content": self.config.developer_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ "provider": {"order": ["Nebius"], "allow_fallbacks": False},
+ }
- # Process all questions concurrently
- tasks = [self.process_single_question(entry, data) for entry in all_entries]
+ async for attempt in AsyncRetrying(
+ stop=stop_after_attempt(20),
+ wait=wait_exponential(multiplier=1, min=1, max=60),
+ retry=retry_if_exception_type(
+ (aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError, ValueError)
+ ),
+ ):
+ with attempt:
+ async with session.post(self.base_url, json=payload) as response:
+ data = await response.json()
- # Use tqdm to track progress
- results = await tqdm_asyncio.gather(*tasks, desc=f"Processing {dataset_name}")
+ if not data:
+ raise ValueError("Empty response")
- # Calculate aggregate metrics
- total_score = sum(r["score"] for r in results)
- metrics = {
- "dataset_name": dataset_name,
- "model": self.model,
- "size": len(data),
- "average_score": total_score / len(results) if results else 0,
- "total_examples": len(results),
- "timestamp": datetime.now().isoformat(),
- "config": dataset_config,
+ if not data.get("choices"):
+ raise ValueError("Missing choices in response")
+
+ return data["choices"][0]["message"]["content"]
+
+ raise Exception("Failed to get valid response after retries")
+
+ async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> dict[str, Any]:
+ """Process a single entry with concurrency control."""
+ async with self.semaphore:
+ response = await self.get_model_response(session, entry["question"])
+ model_answer = extract_answer(response)
+ score = dataset.score_answer(answer=model_answer, entry=entry)
+
+ return {
+ "question": entry["question"],
+ "expected_answer": str(entry["answer"]),
+ "model_answer": model_answer,
+ "full_model_response": response,
+ "score": score,
+ "metadata": str(entry["metadata"]),
}
- return {"metrics": metrics, "results": results}
+ async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> dict[str, Any]:
+ """Evaluate a single dataset asynchronously."""
+ self.logger.info(f"\nEvaluating dataset: {dataset_name}")
+ dataset = reasoning_gym.create_dataset(
+ dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed
+ )
- except Exception as e:
- print(f"Error evaluating dataset {dataset_name}: {str(e)}")
- return None
-
- async def evaluate_datasets(self, dataset_configs: list[dict[str, Any]]) -> list[dict[str, Any]]:
- """Evaluate multiple datasets concurrently."""
- tasks = [self.evaluate_dataset(config) for config in dataset_configs]
-
- # Process all datasets concurrently
+ tasks = [self.process_entry(session, dataset, entry) for entry in dataset]
results = await asyncio.gather(*tasks)
- return [r for r in results if r is not None]
+ return self.save_results(results, dataset, dataset_name)
+
+ async def evaluate_datasets(self) -> list[dict[str, Any]]:
+ """Main async evaluation entry point."""
+ all_results = []
+ async with aiohttp.ClientSession(headers=self.headers) as session:
+ return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
-async def main_async():
+async def async_main():
parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
- parser.add_argument("--model", required=True, help="Model to evaluate")
- parser.add_argument("--config", required=True, help="Path to JSON configuration file")
- parser.add_argument("--output-dir", default="results", help="Output directory")
- parser.add_argument("--max-concurrent", type=int, default=10, help="Maximum number of concurrent API calls")
-
+ parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
args = parser.parse_args()
- # Create output directory if it doesn't exist
- os.makedirs(args.output_dir, exist_ok=True)
+ config = EvalConfig.from_yaml(args.yaml)
+ evaluator = OpenRouterEvaluator(model=config.model, config=config)
+ results = await evaluator.evaluate_datasets()
- # Load dataset configurations
- with open(args.config, "r") as f:
- dataset_configs = json.load(f)
-
- evaluator = AsyncOpenRouterEvaluator(model=args.model, max_concurrent=args.max_concurrent)
-
- eval_start_time = time.time()
- all_results = await evaluator.evaluate_datasets(dataset_configs)
- print(f"Time taken to collect evaluation data: {time.time() - eval_start_time:.2f} seconds")
- # Save results
- output_file = os.path.join(
- args.output_dir, f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
- )
-
- with open(output_file, "w") as f:
- json.dump(all_results, f, indent=2)
-
- # Create and save summary
- summary = []
- for result in all_results:
- metrics = result["metrics"]
- summary_entry = {
- "dataset_name": metrics["dataset_name"],
- "model": metrics["model"],
- "average_score": metrics["average_score"],
- "total_examples": metrics["total_examples"],
- "timestamp": metrics["timestamp"],
- "config": metrics["config"],
- }
- summary.append(summary_entry)
-
- summary_file = os.path.join(
- args.output_dir, f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
- )
-
- with open(summary_file, "w") as f:
- json.dump(summary, f, indent=2)
-
- # Print summary
- print("\nEvaluation Summary:")
- for entry in summary:
- print(f"\nDataset: {entry['dataset_name']}")
- print(f"Average Score: {entry['average_score']:.2%}")
- print(f"Total Examples: {entry['total_examples']}")
-
- print(f"\nDetailed results saved to: {output_file}")
- print(f"Summary saved to: {summary_file}")
-
-
-def main():
- asyncio.run(main_async())
+ output_dir = f"{config.eval_dir}/{config.category}"
+ os.makedirs(output_dir, exist_ok=True)
+ with open(f"{output_dir}/summary.json", "w") as f:
+ json.dump(results, f, indent=2)
if __name__ == "__main__":
- main()
+ asyncio.run(async_main())
diff --git a/eval/eval.sh b/eval/eval.sh
deleted file mode 100755
index 1d2a0beb..00000000
--- a/eval/eval.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Check if OPENROUTER_API_KEY is set
-if [ -z "$OPENROUTER_API_KEY" ]; then
- echo "Error: OPENROUTER_API_KEY environment variable is not set"
- echo "Please set it using: export OPENROUTER_API_KEY=your-api-key"
- exit 1
-fi
-
-# Configuration
-OUTPUT_DIR="results"
-
-# List of models to evaluate
-MODELS=(
- "google/gemini-2.0-flash-001"
-)
-
-# Create output directory
-mkdir -p "$OUTPUT_DIR"
-
-# Run evaluations
-for model in "${MODELS[@]}"; do
- echo "Evaluating $model..."
- python eval.py \
- --model "$model" \
- --config "eval_basic.json" \
- --output-dir "$OUTPUT_DIR"
-done
-
-echo "All evaluations completed!"
diff --git a/eval/eval_basic.json b/eval/eval_basic.json
deleted file mode 100644
index 7a739f99..00000000
--- a/eval/eval_basic.json
+++ /dev/null
@@ -1,31 +0,0 @@
-[
- {
- "name": "letter_counting",
- "min_words": 5,
- "max_words": 15,
- "size": 50,
- "seed": 42
- },
- {
- "name": "propositional_logic",
- "size": 50,
- "seed": 42
- },
- {
- "name": "leg_counting",
- "min_animals": 3,
- "max_animals": 8,
- "size": 50,
- "seed": 42
- },
- {
- "name": "group_anagrams",
- "size": 50,
- "seed": 42
- },
- {
- "name": "spell_backward",
- "size": 50,
- "seed": 42
- }
- ]
diff --git a/eval/r1/eval_config.py b/eval/eval_config.py
similarity index 100%
rename from eval/r1/eval_config.py
rename to eval/eval_config.py
diff --git a/eval/r1/__init__.py b/eval/r1/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/eval/r1/eval.py b/eval/r1/eval.py
deleted file mode 100644
index 4eb1e12d..00000000
--- a/eval/r1/eval.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import argparse
-import asyncio
-import json
-import logging
-import os
-from dataclasses import asdict
-from datetime import datetime
-from typing import Any
-
-import aiohttp
-from eval_config import EvalConfig
-from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential
-
-import reasoning_gym
-from reasoning_gym.utils import extract_answer
-
-
-class OpenRouterEvaluator:
- def __init__(self, model: str, config: EvalConfig):
- self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}")
- self.config = config
- self.output_dir = f"{config.eval_dir}/{config.category}"
- os.makedirs(self.output_dir, exist_ok=True)
- self.base_url = "https://openrouter.ai/api/v1/chat/completions"
- self.api_key = os.getenv("OPENROUTER_API_KEY")
- self.model = model
- self.headers = {
- "Authorization": f"Bearer {self.api_key}",
- "HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"),
- "X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"),
- "Content-Type": "application/json",
- }
- self.semaphore = asyncio.Semaphore(10) # Control concurrency
-
- def save_results(self, results: list[dict[str, Any]], dataset, dataset_name) -> dict[str, Any]:
- file_name = f"{self.output_dir}/{dataset_name}.json"
- total_score = sum(r["score"] for r in results)
-
- metrics = {
- "dataset_name": dataset_name,
- "model": self.model,
- "size": dataset.size,
- "provider": self.config.provider,
- "average_score": total_score / len(results) if results else 0,
- "total_examples": len(results),
- "timestamp": datetime.now().isoformat(),
- "config": asdict(dataset.config),
- "results": results,
- }
-
- with open(file_name, "w") as f:
- json.dump(metrics, f, indent=2)
- return metrics
-
- async def get_model_response(self, session: aiohttp.ClientSession, prompt: str) -> str:
- payload = {
- "model": self.model,
- "messages": [
- {"role": self.config.developer_role, "content": self.config.developer_prompt},
- {"role": "user", "content": prompt},
- ],
- "provider": {"order": ["Nebius"], "allow_fallbacks": False},
- }
-
- async for attempt in AsyncRetrying(
- stop=stop_after_attempt(20),
- wait=wait_exponential(multiplier=1, min=1, max=60),
- retry=retry_if_exception_type(
- (aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError, ValueError)
- ),
- ):
- with attempt:
- async with session.post(self.base_url, json=payload) as response:
- data = await response.json()
-
- if not data:
- raise ValueError("Empty response")
-
- if not data.get("choices"):
- raise ValueError("Missing choices in response")
-
- return data["choices"][0]["message"]["content"]
-
- raise Exception("Failed to get valid response after retries")
-
- async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> dict[str, Any]:
- """Process a single entry with concurrency control."""
- async with self.semaphore:
- response = await self.get_model_response(session, entry["question"])
- model_answer = extract_answer(response)
- score = dataset.score_answer(answer=model_answer, entry=entry)
-
- return {
- "question": entry["question"],
- "expected_answer": str(entry["answer"]),
- "model_answer": model_answer,
- "full_model_response": response,
- "score": score,
- "metadata": str(entry["metadata"]),
- }
-
- async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> dict[str, Any]:
- """Evaluate a single dataset asynchronously."""
- self.logger.info(f"\nEvaluating dataset: {dataset_name}")
- dataset = reasoning_gym.create_dataset(
- dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed
- )
-
- tasks = [self.process_entry(session, dataset, entry) for entry in dataset]
- results = await asyncio.gather(*tasks)
- return self.save_results(results, dataset, dataset_name)
-
- async def evaluate_datasets(self) -> list[dict[str, Any]]:
- """Main async evaluation entry point."""
- all_results = []
- async with aiohttp.ClientSession(headers=self.headers) as session:
- return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
-
-
-async def async_main():
- parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
- parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
- args = parser.parse_args()
-
- config = EvalConfig.from_yaml(args.yaml)
- evaluator = OpenRouterEvaluator(model=config.model, config=config)
- results = await evaluator.evaluate_datasets()
-
- output_dir = f"{config.eval_dir}/{config.category}"
- os.makedirs(output_dir, exist_ok=True)
- with open(f"{output_dir}/summary.json", "w") as f:
- json.dump(results, f, indent=2)
-
-
-if __name__ == "__main__":
- asyncio.run(async_main())
diff --git a/eval/requirements-eval.txt b/eval/requirements-eval.txt
index cfd4254e..91567b6a 100644
--- a/eval/requirements-eval.txt
+++ b/eval/requirements-eval.txt
@@ -1,3 +1,2 @@
-openai>=1.64.0
aiohttp>=3.11.13
tenacity>=9.0.0
diff --git a/eval/r1/yaml/algebra.yaml b/eval/yaml/algebra.yaml
similarity index 100%
rename from eval/r1/yaml/algebra.yaml
rename to eval/yaml/algebra.yaml
diff --git a/eval/r1/yaml/algorithmic.yaml b/eval/yaml/algorithmic.yaml
similarity index 100%
rename from eval/r1/yaml/algorithmic.yaml
rename to eval/yaml/algorithmic.yaml
diff --git a/eval/r1/yaml/cognition.yaml b/eval/yaml/cognition.yaml
similarity index 100%
rename from eval/r1/yaml/cognition.yaml
rename to eval/yaml/cognition.yaml
diff --git a/eval/r1/yaml/logic.yaml b/eval/yaml/logic.yaml
similarity index 100%
rename from eval/r1/yaml/logic.yaml
rename to eval/yaml/logic.yaml
diff --git a/eval/r1/yaml/test.yaml b/eval/yaml/test.yaml
similarity index 100%
rename from eval/r1/yaml/test.yaml
rename to eval/yaml/test.yaml
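
Usage sketch: with eval.sh and eval_basic.json removed, runs are driven by a YAML config passed via --yaml. A minimal example, assuming the script is invoked from the eval/ directory (so the top-level "from eval_config import EvalConfig" import resolves) and reusing the renamed yaml/test.yaml config:

    # OPENROUTER_API_KEY is read from the environment, as before
    export OPENROUTER_API_KEY=your-api-key
    cd eval
    python eval.py --yaml yaml/test.yaml

Per-dataset results are written to {eval_dir}/{category}/{dataset_name}.json, with an aggregate summary.json in the same directory.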