import argparse
import json
import os
from datetime import datetime
from typing import Any, Dict, List

from openai import OpenAI

from reasoning_gym.factory import DATASETS, create_dataset


class OpenRouterEvaluator:
    """Evaluate a single model on reasoning_gym datasets through OpenRouter."""

    def __init__(self, model: str):
        """Create the API client.

        Args:
            model: Model identifier passed as ``model`` on every completion
                request.
        """
        # The API key is read from the OPENROUTER_API_KEY environment variable.
        self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY"))
        self.model = model
        self.extra_headers = {}

    def get_model_response(self, prompt: str) -> str:
        """Get response from the model via OpenRouter API.

        Args:
            prompt: Text sent as the single user message.

        Returns:
            The message content of the first returned choice.

        Raises:
            Exception: Any error from the API call is printed and re-raised
                unchanged.
        """
        try:
            completion = self.client.chat.completions.create(
                extra_headers=self.extra_headers,
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"Error calling OpenRouter API: {str(e)}")
            raise

    def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Evaluate model on multiple datasets with their respective configurations.

        Args:
            dataset_configs: One dict per dataset. The ``"name"`` key selects
                the dataset; every other key is forwarded to
                ``create_dataset`` as a keyword argument. The dicts are not
                modified.

        Returns:
            One ``{"metrics": ..., "results": ...}`` dict per dataset that was
            evaluated. A dataset whose creation or evaluation raises is
            reported on stdout and omitted from the returned list.

        Raises:
            KeyError: If a configuration has no ``"name"`` key.
        """
        all_results = []

        for dataset_config in dataset_configs:
            dataset_name = dataset_config["name"]
            # Build the keyword arguments from a filtered copy so the caller's
            # configuration keeps its "name" key and can be reused afterwards.
            dataset_kwargs = {key: value for key, value in dataset_config.items() if key != "name"}
            print(f"\nEvaluating dataset: {dataset_name}")

            try:
                # Create dataset with its specific configuration
                data = create_dataset(dataset_name, **dataset_kwargs)
                results = []

                for entry in data:
                    try:
                        response = self.get_model_response(entry["question"])
                        score = data.score_answer(answer=response, entry=entry)

                        result = {
                            "question": entry["question"],
                            "expected_answer": entry["answer"],
                            "model_answer": response,
                            "score": score,
                            "metadata": entry["metadata"],
                        }
                        results.append(result)
                        print(f"Processed question {len(results)}/{len(data)}. Score: {score}")
                    except Exception as e:
                        # Best effort: a failed question is reported and left
                        # out, so the average below covers only the questions
                        # that were scored.
                        print(f"Error processing question: {entry['question']}")
                        print(f"Error: {str(e)}")

                # Calculate aggregate metrics
                total_score = sum(r["score"] for r in results)
                metrics = {
                    "dataset_name": dataset_name,
                    "model": self.model,
                    "size": len(data),
                    "average_score": total_score / len(results) if results else 0,
                    "total_examples": len(results),
                    "timestamp": datetime.now().isoformat(),
                    "config": dataset_kwargs,
                }

                all_results.append({"metrics": metrics, "results": results})

            except Exception as e:
                print(f"Error evaluating dataset {dataset_name}: {str(e)}")
                continue

        return all_results


def main():
    """Parse the command line, run the evaluation and write the result files.

    Writes two JSON files into ``--output-dir``: the detailed per-question
    results and a per-dataset summary. Both names carry the same model name
    and timestamp.
    """
    parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
    parser.add_argument("--model", required=True, help="Model to evaluate")
    parser.add_argument("--config", required=True, help="Path to JSON configuration file")
    parser.add_argument("--output-dir", default="results", help="Output directory")

    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    # Load dataset configurations
    with open(args.config, "r", encoding="utf-8") as f:
        dataset_configs = json.load(f)

    evaluator = OpenRouterEvaluator(model=args.model)
    all_results = evaluator.evaluate_datasets(dataset_configs)

    # Compute the file-name suffix once so the detailed and summary files of
    # one run always share the same timestamp.
    file_suffix = f"{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    # Save detailed results
    output_file = os.path.join(args.output_dir, f"evaluation_{file_suffix}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)

    # Create summary
    summary_keys = ("dataset_name", "model", "average_score", "total_examples", "timestamp", "config")
    summary = [{key: result["metrics"][key] for key in summary_keys} for result in all_results]

    # Save summary to a separate file
    summary_file = os.path.join(args.output_dir, f"summary_{file_suffix}")
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    # Print summary
    print("\nEvaluation Summary:")
    for entry in summary:
        print(f"\nDataset: {entry['dataset_name']}")
        print(f"Average Score: {entry['average_score']:.2%}")
        print(f"Total Examples: {entry['total_examples']}")

    print(f"\nDetailed results saved to: {output_file}")
    print(f"Summary saved to: {summary_file}")


if __name__ == "__main__":
    main()