mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-26 17:13:17 +00:00
Eval script consolidation (#238)
The script now supports:
- YAML and JSON configurations
- Dataset-specific parameters
- Overriding configuration via the command line
- Detailed logging and error handling
This commit is contained in:
parent
8a66d2a216
commit
850c1cf6f4
40 changed files with 1111 additions and 670 deletions
563
eval/eval.py
563
eval/eval.py
|
|
@ -1,16 +1,40 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Evaluation script for reasoning gym datasets.
|
||||
|
||||
This script evaluates LLM performance on reasoning gym datasets using the OpenRouter API.
|
||||
|
||||
Usage:
|
||||
python eval.py --config config.yaml [options]
|
||||
|
||||
Options:
|
||||
--model MODEL Override model specified in config
|
||||
--output-dir DIR Override output directory specified in config
|
||||
--max-concurrent NUM Maximum number of concurrent API calls
|
||||
--save-metadata Save entry metadata in results
|
||||
--full-results Save the full results file
|
||||
--verbose Print detailed model responses
|
||||
--debug Enable debug logging
|
||||
|
||||
Environment variables:
|
||||
OPENROUTER_API_KEY Required API key for OpenRouter
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import OrderedDict
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from typing import Any, Union
|
||||
|
||||
import aiohttp
|
||||
from eval_config import EvalConfig
|
||||
from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential
|
||||
from eval_config import CategoryConfig, DatasetConfig, EvalConfig
|
||||
from openai import AsyncOpenAI
|
||||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
||||
import reasoning_gym
|
||||
from reasoning_gym.utils import extract_answer
|
||||
|
|
@ -22,130 +46,473 @@ logging.basicConfig(
|
|||
handlers=[logging.StreamHandler()],
|
||||
)
|
||||
|
||||
# httpx logging will be configured in the AsyncModelEvaluator class
|
||||
# based on the debug flag
|
||||
|
||||
class OpenRouterEvaluator:
|
||||
def __init__(self, model: str, config: EvalConfig, api_key: str):
|
||||
self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}")
|
||||
|
||||
def get_git_hash() -> str:
    """Return the current git commit hash, or ``"unknown"`` if unavailable.

    Runs ``git rev-parse HEAD`` in the current working directory so that
    evaluation results can be tied to the code revision that produced them.
    The lookup is best-effort and never raises for the expected failure
    modes (git not installed, not inside a repository, undecodable output).

    Returns:
        The stripped output of ``git rev-parse HEAD``, or ``"unknown"``.
    """
    cmd = ["git", "rev-parse", "HEAD"]
    try:
        # stderr is captured so git's "not a git repository" message does not
        # leak onto the console.
        return subprocess.check_output(cmd, text=True, stderr=subprocess.PIPE).strip()
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # OSError: git executable missing or not runnable.
        # SubprocessError: git exited non-zero (e.g. not a repository).
        return "unknown"
|
||||
|
||||
|
||||
class AsyncModelEvaluator:
|
||||
"""Evaluates models on reasoning datasets with async API calls via OpenRouter."""
|
||||
|
||||
def __init__(self, config: EvalConfig, verbose: bool = False, debug: bool = False):
|
||||
"""Initialize the evaluator with configuration.
|
||||
|
||||
Args:
|
||||
config: Evaluation configuration
|
||||
verbose: Whether to print detailed model responses
|
||||
debug: Whether to enable debug logging
|
||||
"""
|
||||
self.config = config
|
||||
self.output_dir = f"{config.eval_dir}/{config.category}"
|
||||
os.makedirs(self.output_dir, exist_ok=True)
|
||||
self.base_url = "https://openrouter.ai/api/v1/chat/completions"
|
||||
self.api_key = api_key
|
||||
self.model = model
|
||||
self.headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"),
|
||||
"X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"),
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
self.semaphore = asyncio.Semaphore(15) # Control concurrency
|
||||
self.verbose = verbose
|
||||
self.debug = debug
|
||||
|
||||
def save_results(self, results: list[dict[str, Any]], dataset, dataset_name) -> dict[str, Any]:
|
||||
file_name = f"{self.output_dir}/{dataset_name}.json"
|
||||
total_score = sum(r["score"] for r in results)
|
||||
# Set up logging
|
||||
self.logger = logging.getLogger("AsyncModelEvaluator")
|
||||
if debug:
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
# Enable httpx logs in debug mode
|
||||
logging.getLogger("httpx").setLevel(logging.INFO)
|
||||
else:
|
||||
# Suppress httpx logs in normal mode
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
|
||||
metrics = {
|
||||
"dataset_name": dataset_name,
|
||||
"model": self.model,
|
||||
"size": dataset.size,
|
||||
"provider": self.config.provider,
|
||||
"average_score": total_score / len(results) if results else 0,
|
||||
"total_examples": len(results),
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"config": asdict(dataset.config),
|
||||
"results": results,
|
||||
}
|
||||
# Set up OpenRouter API client
|
||||
api_key = os.getenv("OPENROUTER_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENROUTER_API_KEY environment variable is not set")
|
||||
|
||||
with open(file_name, "w") as f:
|
||||
json.dump(metrics, f, indent=2)
|
||||
return metrics
|
||||
self.client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
|
||||
|
||||
async def get_model_response(self, session: aiohttp.ClientSession, prompt: str) -> str:
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{"role": self.config.developer_role, "content": self.config.developer_prompt},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
"provider": {"order": [self.config.provider], "allow_fallbacks": False},
|
||||
}
|
||||
# Concurrency control
|
||||
self.semaphore = asyncio.Semaphore(config.max_concurrent)
|
||||
|
||||
async for attempt in AsyncRetrying(
|
||||
stop=stop_after_attempt(20),
|
||||
wait=wait_exponential(multiplier=1, min=1, max=60),
|
||||
retry=retry_if_exception_type(
|
||||
(aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError, ValueError)
|
||||
),
|
||||
):
|
||||
with attempt:
|
||||
async with session.post(self.base_url, json=payload) as response:
|
||||
data = await response.json()
|
||||
# Metadata
|
||||
self.git_hash = get_git_hash()
|
||||
self.start_time = datetime.now()
|
||||
|
||||
if not data:
|
||||
raise ValueError("Empty response")
|
||||
async def get_model_response(self, prompt: str) -> str:
|
||||
"""Get response from model with retry logic via OpenRouter.
|
||||
|
||||
if not data.get("choices"):
|
||||
raise ValueError("Missing choices in response")
|
||||
Args:
|
||||
prompt: The prompt to send to the model
|
||||
|
||||
return data["choices"][0]["message"]["content"]
|
||||
Returns:
|
||||
The model's response text
|
||||
|
||||
raise Exception("Failed to get valid response after retries")
|
||||
Raises:
|
||||
Exception: If all retries fail
|
||||
"""
|
||||
max_retries = 10
|
||||
base_delay = 1.0
|
||||
max_delay = 60.0
|
||||
backoff_factor = 2.0
|
||||
|
||||
async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> dict[str, Any]:
|
||||
"""Process a single entry with concurrency control."""
|
||||
async with self.semaphore:
|
||||
response = await self.get_model_response(session, entry["question"])
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
async with self.semaphore:
|
||||
# Prepare API call parameters
|
||||
params = {
|
||||
"model": self.config.model,
|
||||
"messages": [
|
||||
{"role": self.config.system_role, "content": self.config.system_prompt},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
}
|
||||
|
||||
# Add provider configuration if specified
|
||||
if self.config.provider:
|
||||
params["extra_body"] = {"provider": {"order": [self.config.provider], "allow_fallbacks": False}}
|
||||
|
||||
completion = await self.client.chat.completions.create(**params)
|
||||
response = completion.choices[0].message.content
|
||||
|
||||
if self.verbose:
|
||||
self.logger.info(f"Prompt: {prompt}")
|
||||
self.logger.info(f"Response: {response}")
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
delay = min(max_delay, base_delay * (backoff_factor**attempt))
|
||||
self.logger.warning(f"Attempt {attempt+1}/{max_retries} failed: {str(e)}")
|
||||
self.logger.warning(f"Retrying in {delay:.2f} seconds...")
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
raise Exception(f"Failed to get model response after {max_retries} attempts")
|
||||
|
||||
async def process_entry(
|
||||
self, dataset: reasoning_gym.dataset.ProceduralDataset, entry: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""Process a single dataset entry.
|
||||
|
||||
Args:
|
||||
dataset: The dataset instance
|
||||
entry: The entry to process
|
||||
|
||||
Returns:
|
||||
Dict with processing results
|
||||
"""
|
||||
try:
|
||||
response = await self.get_model_response(entry["question"])
|
||||
model_answer = extract_answer(response)
|
||||
score = dataset.score_answer(answer=model_answer, entry=entry)
|
||||
|
||||
print(f"answer: {model_answer}, score: {score}")
|
||||
if self.verbose:
|
||||
print(f"Question: {entry['question']}")
|
||||
print(f"Expected: {entry['answer']}")
|
||||
print(f"Answer: {model_answer}")
|
||||
print(f"Score: {score}")
|
||||
print("-" * 40)
|
||||
|
||||
return {
|
||||
result = {
|
||||
"question": entry["question"],
|
||||
"expected_answer": str(entry["answer"]),
|
||||
"model_answer": model_answer,
|
||||
"full_model_response": response,
|
||||
"score": score,
|
||||
"metadata": str(entry["metadata"]),
|
||||
}
|
||||
|
||||
async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> dict[str, Any]:
|
||||
"""Evaluate a single dataset asynchronously."""
|
||||
self.logger.info(f"\nEvaluating dataset: {dataset_name}")
|
||||
dataset = reasoning_gym.create_dataset(
|
||||
dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed
|
||||
)
|
||||
# Only include metadata if configured to do so
|
||||
if self.config.save_metadata:
|
||||
result["metadata"] = entry["metadata"]
|
||||
|
||||
tasks = [self.process_entry(session, dataset, entry) for entry in dataset]
|
||||
results = await asyncio.gather(*tasks)
|
||||
return self.save_results(results, dataset, dataset_name)
|
||||
return result
|
||||
|
||||
async def evaluate_datasets(self) -> list[dict[str, Any]]:
|
||||
"""Main async evaluation entry point."""
|
||||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||||
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error processing entry: {str(e)}")
|
||||
result = {
|
||||
"question": entry["question"],
|
||||
"expected_answer": str(entry["answer"]),
|
||||
"model_answer": "ERROR",
|
||||
"full_model_response": f"Error: {str(e)}",
|
||||
"score": 0.0,
|
||||
"error": str(e),
|
||||
}
|
||||
|
||||
# Only include metadata if configured to do so
|
||||
if self.config.save_metadata:
|
||||
result["metadata"] = entry["metadata"]
|
||||
|
||||
return result
|
||||
|
||||
async def evaluate_dataset(self, category_name: str, dataset_config: DatasetConfig) -> dict[str, Any]:
    """Evaluate a single dataset.

    Builds the dataset via ``reasoning_gym.create_dataset``, processes every
    entry concurrently through ``self.process_entry`` (with a progress bar),
    and averages the per-entry scores.

    Args:
        category_name: Name of the category the dataset belongs to
        dataset_config: Configuration for the dataset (name, size, seed, params)

    Returns:
        Dict with keys ``name``, ``category``, ``average_score``,
        ``total_examples``, ``config`` and ``results``. If anything fails,
        the dict additionally carries ``error`` and reports a score of 0.0
        with an empty ``results`` list instead of raising.
    """
    dataset_name = dataset_config.dataset
    self.logger.info(f"Evaluating dataset: {dataset_name}")

    # Tolerate a config whose params are unset so the error path below can
    # never fail while building its own report.
    raw_params = dataset_config.params or {}

    # Configuration echoed back in both the success and the error result.
    reported_config = {"size": dataset_config.size, "seed": dataset_config.seed, **raw_params}

    try:
        # Collect constructor arguments; a nested 'params' dict is flattened
        # into the top level rather than passed through as a keyword.
        dataset_params = {}
        for key, value in raw_params.items():
            if key != "params":
                dataset_params[key] = value
            elif isinstance(value, dict):
                dataset_params.update(value)

        # Explicit size and seed take precedence over values from params.
        if dataset_config.size is not None:
            dataset_params["size"] = dataset_config.size
        if dataset_config.seed is not None:
            dataset_params["seed"] = dataset_config.seed

        dataset = reasoning_gym.create_dataset(dataset_name, **dataset_params)

        # Materialize the entries, then process them with a progress bar.
        all_entries = list(dataset)
        tasks = [self.process_entry(dataset, entry) for entry in all_entries]
        results = await tqdm_asyncio.gather(*tasks, desc=f"Processing {dataset_name}", leave=True)

        total_score = sum(r["score"] for r in results)
        average_score = total_score / len(results) if results else 0

        return {
            "name": dataset_name,
            "category": category_name,
            "average_score": average_score,
            "total_examples": len(results),
            "config": reported_config,
            "results": results,
        }

    except Exception as e:
        # Deliberate boundary: one broken dataset must not abort the whole run.
        self.logger.error(f"Error evaluating dataset {dataset_name}: {str(e)}")
        return {
            "name": dataset_name,
            "category": category_name,
            "average_score": 0.0,
            "total_examples": 0,
            "config": reported_config,
            "error": str(e),
            "results": [],
        }
|
||||
|
||||
async def evaluate_category(self, category_config: CategoryConfig) -> dict[str, Any]:
    """Evaluate all datasets in a category.

    Every dataset of the category is scheduled at once and awaited with
    ``asyncio.gather``, so the returned list keeps the order of
    ``category_config.datasets``.

    Args:
        category_config: Configuration for the category

    Returns:
        Dict with the category ``name`` and the list of per-dataset results
        under ``datasets``.
    """
    category_name = category_config.category
    self.logger.info(f"Evaluating category: {category_name}")

    tasks = [self.evaluate_dataset(category_name, dataset_config) for dataset_config in category_config.datasets]
    dataset_results = await asyncio.gather(*tasks)

    return {
        "name": category_name,
        "datasets": dataset_results,
    }
|
||||
|
||||
async def evaluate_all(self) -> dict[str, Any]:
    """Evaluate all categories and datasets.

    Categories are evaluated concurrently; ``asyncio.gather`` returns them in
    the order of ``self.config.categories``.

    Returns:
        Dict with ``metadata`` (timestamp, model, provider, git hash and
        duration in seconds), the per-category results under ``categories``,
        and the ``summary`` produced by ``self.generate_summary``.
    """
    self.logger.info(f"Starting evaluation of {len(self.config.categories)} categories")

    tasks = [self.evaluate_category(category) for category in self.config.categories]
    category_results = await asyncio.gather(*tasks)

    results = {
        "metadata": {
            "timestamp": self.start_time.isoformat(),
            "model": self.config.model,
            "provider": self.config.provider,
            "git_hash": self.git_hash,
            # Measured from self.start_time to the moment all categories finished.
            "duration_seconds": (datetime.now() - self.start_time).total_seconds(),
        },
        "categories": category_results,
    }

    # The summary is derived from the assembled results, so it is attached last.
    results["summary"] = self.generate_summary(results)

    return results
|
||||
|
||||
def generate_summary(self, results: dict[str, Any]) -> dict[str, Union[int, OrderedDict]]:
    """Generate a summary of evaluation results in the original configuration order.

    Each configured dataset contributes exactly one entry to the totals. A
    dataset that has no matching result is listed with a score of 0.0 and
    adds no examples.

    Args:
        results: The full evaluation results (must contain ``categories``)

    Returns:
        Dict with ``total_datasets``, ``total_examples`` and
        ``dataset_scores`` (an ``OrderedDict`` of dataset name -> average
        score, in configuration order).
    """
    # Index results once by (category, dataset). The first occurrence wins,
    # so a repeated category entry in the results can never be counted twice.
    evaluated = {}
    for category in results["categories"]:
        for dataset in category["datasets"]:
            evaluated.setdefault((category["name"], dataset["name"]), dataset)

    summary = {
        "total_datasets": 0,
        "total_examples": 0,
        "dataset_scores": OrderedDict(),
    }

    # Walk the configuration, not the results, so the output follows config order.
    for category_config in self.config.categories:
        for dataset_config in category_config.datasets:
            dataset_name = dataset_config.dataset
            dataset = evaluated.get((category_config.category, dataset_name))

            summary["total_datasets"] += 1
            if dataset is None:
                # No result for this dataset (error): report it with score 0.
                summary["dataset_scores"][dataset_name] = 0.0
            else:
                summary["dataset_scores"][dataset_name] = dataset["average_score"]
                summary["total_examples"] += dataset["total_examples"]

    return summary
|
||||
|
||||
def save_results(self, results: dict[str, Any]) -> tuple[Union[str, None], str]:
    """Save evaluation results to files.

    Creates ``<output_dir>/<model>_<timestamp>/`` (slashes in the model name
    become underscores) and writes:

    * ``results.json`` - the full results, only if ``save_full_results`` is set
    * ``summary.json`` - the summary plus timestamp, git hash, model,
      provider and duration
    * ``<category>/<dataset>.json`` - one file per evaluated dataset

    Args:
        results: The evaluation results to save

    Returns:
        Tuple of (results_path, summary_path); ``results_path`` is ``None``
        when the full results file was not written.
    """
    timestamp = self.start_time.strftime("%Y%m%d_%H%M%S")
    model_name = self.config.model.replace("/", "_")

    # Directory name is made of the model and the run's start time only.
    output_dir = Path(self.config.output_dir) / f"{model_name}_{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)

    results_path = None

    # The full results file is optional.
    if self.config.save_full_results:
        results_path = output_dir / "results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

    # Work on a copy so the caller's results["summary"] is left untouched.
    summary_data = results["summary"].copy()
    summary_data["timestamp"] = self.start_time.isoformat()
    summary_data["git_hash"] = self.git_hash
    summary_data["model"] = self.config.model
    summary_data["provider"] = self.config.provider
    summary_data["duration_seconds"] = results["metadata"]["duration_seconds"]

    summary_path = output_dir / "summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, indent=2)

    # One JSON file per dataset, grouped in a directory per category.
    for category in results["categories"]:
        category_dir = output_dir / category["name"]
        category_dir.mkdir(parents=True, exist_ok=True)

        for dataset in category["datasets"]:
            dataset_path = category_dir / f"{dataset['name']}.json"
            with open(dataset_path, "w", encoding="utf-8") as f:
                json.dump(dataset, f, indent=2)

    return str(results_path) if results_path else None, str(summary_path)
|
||||
|
||||
def print_summary(self, results: dict[str, Any]) -> None:
    """Print a summary of evaluation results to the console.

    Prints the run metadata, then one line per entry of
    ``results["summary"]["dataset_scores"]`` with its score as a percentage
    and its number of examples, then the totals.

    Args:
        results: The evaluation results
    """
    summary = results["summary"]

    # Index example counts once instead of rescanning all results for every
    # printed line. Precedence is unchanged: within a category the first
    # dataset with a given name counts; across categories the last one does.
    examples_by_name = {}
    for category in results["categories"]:
        first_in_category = {}
        for dataset in category["datasets"]:
            first_in_category.setdefault(dataset["name"], dataset["total_examples"])
        examples_by_name.update(first_in_category)

    print("\nEvaluation Summary:")
    print("------------------")
    print(f"Model: {self.config.model}")
    print(f"Provider: {self.config.provider}")
    print(f"Git Hash: {self.git_hash}")
    print(f"Duration: {results['metadata']['duration_seconds']:.2f} seconds")
    print()

    print("Dataset Scores (in configuration order):")
    for dataset_name, score in summary["dataset_scores"].items():
        # Datasets without a result entry are reported with 0 examples.
        examples = examples_by_name.get(dataset_name, 0)
        print(f" {dataset_name}: {score:.1%} ({examples} examples)")

    print()
    print(f"Total datasets: {summary['total_datasets']}")
    print(f"Total examples: {summary['total_examples']}")
|
||||
|
||||
|
||||
async def async_main():
|
||||
api_key = os.getenv("OPENROUTER_API_KEY")
|
||||
if not api_key:
|
||||
print("Error: OPENROUTER_API_KEY environment variable is not set")
|
||||
print("Please set it using: export OPENROUTER_API_KEY=your-api-key")
|
||||
exit(1)
|
||||
|
||||
async def main_async():
|
||||
"""Main async function."""
|
||||
parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
|
||||
parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
|
||||
parser.add_argument("--config", required=True, help="Path to configuration file (YAML or JSON)")
|
||||
parser.add_argument("--model", help="Override model specified in config")
|
||||
parser.add_argument("--output-dir", help="Override output directory specified in config")
|
||||
parser.add_argument("--max-concurrent", type=int, help="Maximum number of concurrent API calls")
|
||||
parser.add_argument("--save-metadata", action="store_true", help="Save entry metadata in results")
|
||||
parser.add_argument("--full-results", action="store_true", help="Save the full results file")
|
||||
parser.add_argument("--verbose", action="store_true", help="Print detailed model responses")
|
||||
parser.add_argument("--debug", action="store_true", help="Enable debug logging")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
config = EvalConfig.from_yaml(args.yaml)
|
||||
evaluator = OpenRouterEvaluator(model=config.model, config=config, api_key=api_key)
|
||||
results = await evaluator.evaluate_datasets()
|
||||
# Check for required API key
|
||||
if not os.getenv("OPENROUTER_API_KEY"):
|
||||
print("Error: OPENROUTER_API_KEY environment variable is not set")
|
||||
print("Please set it using: export OPENROUTER_API_KEY=your-api-key")
|
||||
return 1
|
||||
|
||||
output_dir = f"{config.eval_dir}/{config.category}"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
with open(f"{output_dir}/summary.json", "w") as f:
|
||||
json.dump(results, f, indent=2)
|
||||
# Load configuration
|
||||
config_path = args.config
|
||||
if config_path.endswith(".yaml") or config_path.endswith(".yml"):
|
||||
config = EvalConfig.from_yaml(config_path)
|
||||
elif config_path.endswith(".json"):
|
||||
config = EvalConfig.from_json(config_path)
|
||||
else:
|
||||
print("Error: Configuration file must be YAML or JSON")
|
||||
return 1
|
||||
|
||||
# Apply command line overrides
|
||||
if args.model:
|
||||
config.model = args.model
|
||||
if args.output_dir:
|
||||
config.output_dir = args.output_dir
|
||||
if args.max_concurrent:
|
||||
config.max_concurrent = args.max_concurrent
|
||||
if args.save_metadata:
|
||||
config.save_metadata = True
|
||||
if args.full_results:
|
||||
config.save_full_results = True
|
||||
|
||||
# Create evaluator
|
||||
evaluator = AsyncModelEvaluator(config=config, verbose=args.verbose, debug=args.debug)
|
||||
|
||||
# Run evaluation
|
||||
try:
|
||||
results = await evaluator.evaluate_all()
|
||||
|
||||
# Save and print results
|
||||
results_path, summary_path = evaluator.save_results(results)
|
||||
evaluator.print_summary(results)
|
||||
|
||||
if results_path:
|
||||
print(f"\nResults saved to: {results_path}")
|
||||
print(f"Summary saved to: {summary_path}")
|
||||
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f"Error during evaluation: {str(e)}")
|
||||
if args.debug:
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
def main():
    """Entry point.

    Runs ``main_async`` to completion on a fresh event loop and exits the
    process with the status code it returns.
    """
    exit_code = asyncio.run(main_async())
    sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(async_main())
|
||||
main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue