updated async impl and added r1

This commit is contained in:
joesharratt1229 2025-02-13 03:51:01 +00:00
parent 1a3728ec3a
commit b2e3ccf3d6
2 changed files with 77 additions and 72 deletions

View file

@@ -1,4 +1,5 @@
import argparse import argparse
import asyncio
import json import json
import logging import logging
import os import os
@@ -6,10 +7,9 @@ from dataclasses import asdict
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List from typing import Any, Dict, List
import requests import aiohttp
from eval_config import EvalConfig from eval_config import EvalConfig
from requests.exceptions import RequestException from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
import reasoning_gym import reasoning_gym
from reasoning_gym.utils import extract_answer from reasoning_gym.utils import extract_answer
@@ -30,9 +30,9 @@ class OpenRouterEvaluator:
"X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"), "X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"),
"Content-Type": "application/json", "Content-Type": "application/json",
} }
self.semaphore = asyncio.Semaphore(10) # Control concurrency
def save_results(self, results: List[Dict[str, Any]], dataset, dataset_name) -> Dict[str, Any]: def save_results(self, results: List[Dict[str, Any]], dataset, dataset_name) -> Dict[str, Any]:
file_name = f"{self.output_dir}/{dataset_name}.json" file_name = f"{self.output_dir}/{dataset_name}.json"
total_score = sum(r["score"] for r in results) total_score = sum(r["score"] for r in results)
@@ -45,7 +45,7 @@ class OpenRouterEvaluator:
"total_examples": len(results), "total_examples": len(results),
"timestamp": datetime.now().isoformat(), "timestamp": datetime.now().isoformat(),
"config": asdict(dataset.config), "config": asdict(dataset.config),
"results": results, # save results to allow for performance recalculation "results": results,
} }
with open(file_name, "w") as f: with open(file_name, "w") as f:
@@ -53,87 +53,93 @@ class OpenRouterEvaluator:
return metrics return metrics
def prepare_messages(self, prompt: str) -> List[Dict[str, str]]: def prepare_messages(self, prompt: str) -> List[Dict[str, str]]:
messages = [ return {
"model": self.model,
"messages": [
{"role": self.config.developer_role, "content": self.config.developer_prompt}, {"role": self.config.developer_role, "content": self.config.developer_prompt},
{"role": "user", "content": prompt}, {"role": "user", "content": prompt},
] ],
"provider": {"order": ["Nebius"], "allow_fallbacks": False},
}
async def get_model_response(self, session: aiohttp.ClientSession, prompt: str) -> str:
payload = { payload = {
"model": self.model, "model": self.model,
"messages": messages, "messages": [
"provider": {"order": ["Nebius"], "allow_fallbacks": False}, {"role": self.config.developer_role, "content": self.config.developer_prompt},
} # make sure only one provider is used {"role": "user", "content": prompt},
],
}
return payload async for attempt in AsyncRetrying(
stop=stop_after_attempt(20),
wait=wait_exponential(multiplier=1, min=1, max=60),
retry=retry_if_exception_type(
(aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError, ValueError)
),
):
with attempt:
async with session.post(self.base_url, json=payload) as response:
data = await response.json()
@retry( if not data:
retry=retry_if_exception_type(RequestException), raise ValueError("Empty response")
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=60),
)
def get_model_response(self, prompt: str) -> str:
"""Get response from the model via OpenRouter API."""
payload = self.prepare_messages(prompt) if not data.get("choices"):
try: raise ValueError("Missing choices in response")
response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
response.raise_for_status()
except requests.exceptions.RequestException as e:
raise RequestException(
f"API request failed: {str(e)}", {"endpoint": self.base_url, "model": self.model}
) from e
return response.json()["choices"][0]["message"]["content"]
def evaluate_datasets(self) -> List[Dict[str, Any]]: return data["choices"][0]["message"]["content"]
"""Evaluate model on multiple datasets with their respective configurations."""
all_results = []
for dataset_name in self.config.datasets: raise Exception("Failed to get valid response after retries")
self.logger.info(f"\nEvaluating dataset: {dataset_name}")
# Create dataset with its specific configuration async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> Dict[str, Any]:
dataset = reasoning_gym.create_dataset( """Process a single entry with concurrency control."""
dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed async with self.semaphore:
) response = await self.get_model_response(session, entry["question"])
results = []
for i, entry in enumerate(dataset):
print(f"On example {i+1} of {len(dataset)}")
response = self.get_model_response(entry["question"])
model_answer = extract_answer(response) model_answer = extract_answer(response)
score = dataset.score_answer(answer=model_answer, entry=entry) score = dataset.score_answer(answer=model_answer, entry=entry)
print(f"Question: {entry['question']}")
result = { return {
"question": entry["question"], "question": entry["question"],
"expected_answer": str(entry["answer"]), "expected_answer": str(entry["answer"]),
"model_answer": model_answer, "model_answer": model_answer,
"score": score, "score": score,
"metadata": str(entry["metadata"]), "metadata": str(entry["metadata"]),
} }
results.append(result)
metrics = self.save_results(results, dataset, dataset_name) async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> Dict[str, Any]:
"""Evaluate a single dataset asynchronously."""
self.logger.info(f"\nEvaluating dataset: {dataset_name}")
dataset = reasoning_gym.create_dataset(
dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed
)
all_results.append({"metrics": metrics, "results": results}) tasks = [self.process_entry(session, dataset, entry) for entry in dataset]
results = await asyncio.gather(*tasks)
return self.save_results(results, dataset, dataset_name)
return all_results async def evaluate_datasets(self) -> List[Dict[str, Any]]:
"""Main async evaluation entry point."""
all_results = []
async with aiohttp.ClientSession(headers=self.headers) as session:
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
def main(): async def async_main():
parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets") parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
parser.add_argument("--yaml", required=True, help="Path to YAML configuration file") parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
args = parser.parse_args() args = parser.parse_args()
config = EvalConfig.from_yaml(args.yaml) config = EvalConfig.from_yaml(args.yaml)
evaluator = OpenRouterEvaluator(model=config.model, config=config)
results = await evaluator.evaluate_datasets()
output_dir = f"{config.eval_dir}/{config.category}" output_dir = f"{config.eval_dir}/{config.category}"
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
evaluator = OpenRouterEvaluator(model=config.model, config=config)
all_results = evaluator.evaluate_datasets()
with open(f"{output_dir}/summary.json", "w") as f: with open(f"{output_dir}/summary.json", "w") as f:
json.dump(all_results, f, indent=2) json.dump(results, f, indent=2)
if __name__ == "__main__": if __name__ == "__main__":
main() asyncio.run(async_main())

View file

@@ -1,7 +1,6 @@
model: deepseek/deepseek-r1 model: deepseek/deepseek-r1
category: algorithmic category: algorithmic
datasets: datasets:
- base_conversion
- binary_matrix - binary_matrix
- caesar_cipher - caesar_cipher
- group_anagrams - group_anagrams