# reasoning-gym/eval/r1/eval.py
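"""Evaluate a model on reasoning-gym datasets via the OpenRouter API.

Usage:
    OPENROUTER_API_KEY=<key> python eval.py --yaml <config.yaml>

The config schema is defined in eval_config.py. Judging only from the fields
this script reads, a config looks roughly like the following (illustrative
sketch; all values are hypothetical):

    model: deepseek/deepseek-r1
    provider: Nebius
    category: example_category
    eval_dir: results
    dataset_size: 50
    dataset_seed: 42
    developer_role: system
    developer_prompt: Solve the following problem.
    datasets:
      - example_dataset
"""
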
import argparse
import json
import logging
import os
from dataclasses import asdict
from datetime import datetime
from typing import Any, Dict, List

import requests
from eval_config import EvalConfig
from requests.exceptions import RequestException
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

import reasoning_gym
from reasoning_gym.utils import extract_answer


class OpenRouterEvaluator:
    """Runs reasoning-gym evaluations against a model served through OpenRouter."""

    def __init__(self, model: str, config: EvalConfig):
        self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}")
        self.config = config
        self.output_dir = f"{config.eval_dir}/{config.category}"
        os.makedirs(self.output_dir, exist_ok=True)
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise ValueError("OPENROUTER_API_KEY environment variable is not set")
        self.model = model
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"),
            "X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"),
            "Content-Type": "application/json",
        }

    def save_results(self, results: List[Dict[str, Any]], dataset, dataset_name) -> Dict[str, Any]:
        """Write per-dataset metrics, along with the raw results, to a JSON file."""
        file_name = f"{self.output_dir}/{dataset_name}.json"
        total_score = sum(r["score"] for r in results)
        metrics = {
            "dataset_name": dataset_name,
            "model": self.model,
            "size": dataset.size,
            "provider": self.config.provider,
            "average_score": total_score / len(results) if results else 0,
            "total_examples": len(results),
            "timestamp": datetime.now().isoformat(),
            "config": asdict(dataset.config),
            "results": results,  # keep raw results so metrics can be recalculated later
        }
        with open(file_name, "w") as f:
            json.dump(metrics, f, indent=2)
        return metrics

    def prepare_payload(self, prompt: str) -> Dict[str, Any]:
        """Build the chat-completion request payload for a single prompt."""
        messages = [
            {"role": self.config.developer_role, "content": self.config.developer_prompt},
            {"role": "user", "content": prompt},
        ]
        return {
            "model": self.model,
            "messages": messages,
            # Pin a single provider and disable fallbacks so every response
            # comes from the same backend.
            "provider": {"order": ["Nebius"], "allow_fallbacks": False},
        }
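
    # Retry transient request failures: up to 5 attempts with exponential
    # backoff, waiting between 4 and 60 seconds between tries.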
    @retry(
        retry=retry_if_exception_type(RequestException),
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=4, max=60),
    )
    def get_model_response(self, prompt: str) -> str:
        """Get a response from the model via the OpenRouter API."""
        payload = self.prepare_payload(prompt)
        try:
            response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise RequestException(
                f"API request failed: {e}", {"endpoint": self.base_url, "model": self.model}
            ) from e
        # OpenRouter returns an OpenAI-style completion object.
        return response.json()["choices"][0]["message"]["content"]

    def evaluate_datasets(self) -> List[Dict[str, Any]]:
        """Evaluate model on multiple datasets with their respective configurations."""
        all_results = []
        for dataset_name in self.config.datasets:
            self.logger.info(f"Evaluating dataset: {dataset_name}")
            # Create dataset with its specific configuration
            dataset = reasoning_gym.create_dataset(
                dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed
            )
            results = []
            for i, entry in enumerate(dataset):
                print(f"On example {i + 1} of {len(dataset)}")
                response = self.get_model_response(entry["question"])
                model_answer = extract_answer(response)
                score = dataset.score_answer(answer=model_answer, entry=entry)
                result = {
                    "question": entry["question"],
                    "expected_answer": str(entry["answer"]),
                    "model_answer": model_answer,
                    "score": score,
                    "metadata": str(entry["metadata"]),
                }
                results.append(result)
            metrics = self.save_results(results, dataset, dataset_name)
            all_results.append({"metrics": metrics, "results": results})
        return all_results


def main():
    parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
    parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
    args = parser.parse_args()

    config = EvalConfig.from_yaml(args.yaml)
    output_dir = f"{config.eval_dir}/{config.category}"
    os.makedirs(output_dir, exist_ok=True)

    evaluator = OpenRouterEvaluator(model=config.model, config=config)
    all_results = evaluator.evaluate_datasets()
    with open(f"{output_dir}/summary.json", "w") as f:
        json.dump(all_results, f, indent=2)


if __name__ == "__main__":
    main()