added r1 evaluation logic

joesharratt1229 2025-02-11 03:46:56 +00:00
parent 0657222a8f
commit 42e02640a3
6 changed files with 208 additions and 0 deletions

eval/r1/eval.py (new file, 134 lines)

@@ -0,0 +1,134 @@
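"""Evaluate OpenRouter-hosted models on reasoning_gym datasets and write JSON metrics."""
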
import argparse
import json
import logging
import os
from dataclasses import asdict
from datetime import datetime
from typing import Any, Dict, List

import requests
from eval_config import EvalConfig
from requests.exceptions import RequestException
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

import reasoning_gym
from reasoning_gym.utils import extract_answer


class OpenRouterEvaluator:
def __init__(self, model: str, config: EvalConfig):
self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}")
self.config = config
self.output_dir = f"{config.eval_dir}/{config.category}"
os.makedirs(self.output_dir, exist_ok=True)
self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        # Required; without it, requests go out with "Authorization: Bearer None"
        self.api_key = os.getenv("OPENROUTER_API_KEY")
self.model = model
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"),
"X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"),
"Content-Type": "application/json",
}

    def save_results(self, results: List[Dict[str, Any]], dataset, dataset_name: str) -> Dict[str, Any]:
        """Write per-dataset metrics to disk and return them."""
file_name = f"{self.output_dir}/{dataset_name}.json"
total_score = sum(r["score"] for r in results)
metrics = {
"dataset_name": dataset_name,
"model": self.model,
"size": dataset.size,
"provider": self.config.provider,
"average_score": total_score / len(results) if results else 0,
"total_examples": len(results),
"timestamp": datetime.now().isoformat(),
"config": asdict(dataset.config),
"results": results,
}
with open(file_name, "w") as f:
json.dump(metrics, f, indent=2)
return metrics

    def prepare_messages(self, prompt: str) -> Dict[str, Any]:
        """Build the chat-completion payload (messages plus provider routing) for the OpenRouter API."""
messages = [
{"role": self.config.developer_role, "content": self.config.developer_prompt},
{"role": "user", "content": prompt},
]
payload = {"model": self.model, "messages": messages, "provider": {"order": ["Nebius"]}}
return payload

    # Retry transient API failures with exponential backoff (4s to 60s, up to 5 attempts)
    @retry(
retry=retry_if_exception_type(RequestException),
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=60),
)
def get_model_response(self, prompt: str) -> str:
"""Get response from the model via OpenRouter API."""
payload = self.prepare_messages(prompt)
try:
response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
response.raise_for_status()
        except RequestException as e:
raise RequestException(
f"API request failed: {str(e)}", {"endpoint": self.base_url, "model": self.model}
) from e
return response.json()["choices"][0]["message"]["content"]

    def evaluate_datasets(self) -> List[Dict[str, Any]]:
"""Evaluate model on multiple datasets with their respective configurations."""
all_results = []
for dataset_name in self.config.datasets:
self.logger.info(f"\nEvaluating dataset: {dataset_name}")
# Create dataset with its specific configuration
dataset = reasoning_gym.create_dataset(
dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed
)
results = []
for entry in dataset:
response = self.get_model_response(entry["question"])
model_answer = extract_answer(response)
score = dataset.score_answer(answer=model_answer, entry=entry)
result = {
"question": entry["question"],
"expected_answer": entry["answer"],
"model_answer": model_answer,
"score": score,
"metadata": entry["metadata"],
}
results.append(result)
            metrics = self.save_results(results, dataset, dataset_name)
all_results.append({"metrics": metrics, "results": results})
return all_results


def main():
parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
args = parser.parse_args()
config = EvalConfig.from_yaml(args.yaml)
output_dir = f"{config.eval_dir}/{config.category}"
os.makedirs(output_dir, exist_ok=True)
evaluator = OpenRouterEvaluator(model=config.model, config=config)
all_results = evaluator.evaluate_datasets()
with open(f"{output_dir}/summary.json", "w") as f:
json.dump(all_results, f, indent=2)


if __name__ == "__main__":
main()
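
For reference, a minimal sketch of the YAML file this script consumes. The field names mirror the EvalConfig attributes referenced in the code (model, provider, category, datasets, dataset_size, dataset_seed, eval_dir, developer_role, developer_prompt); the file path, model id, dataset name, and values below are illustrative assumptions, not part of this commit.

# hypothetical configs/r1.yaml -- keys inferred from the EvalConfig fields
# read in eval.py; all values are placeholders
model: deepseek/deepseek-r1
provider: Nebius
category: algebra
eval_dir: results
dataset_size: 50
dataset_seed: 42
datasets:
  - basic_arithmetic   # any registered reasoning_gym dataset name
developer_role: system
developer_prompt: "Reason step by step and put the final answer in <answer></answer> tags."

With OPENROUTER_API_KEY exported, the entry point would be run as python eval/r1/eval.py --yaml configs/r1.yaml; per-dataset results are written to <eval_dir>/<category>/<dataset_name>.json and the combined run to <eval_dir>/<category>/summary.json.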