diff --git a/eval/README.md b/eval/README.md index 3bf8d750..29e6527b 100644 --- a/eval/README.md +++ b/eval/README.md @@ -34,108 +34,100 @@ export OPENROUTER_API_KEY=your-api-key ``` -4. Prepare your dataset configuration in YAML format (see examples in `yaml//algorithmic.yaml` e.g `yaml/r1/algorithmic.yaml`): +4. Prepare your evaluation configuration in YAML or JSON format (see example in `example_config.yaml`): + ```yaml -model: model-name -provider: provider-name -category: category-name -datasets: - - dataset1 - - dataset2 -eval_dir: results/model-name -dataset_size: 50 -dataset_seed: 42 -developer_role: system +# Example configuration +model: "meta-llama/llama-3.3-70b-instruct" +provider: "Hyperbolic" # Optional, can be omitted +output_dir: "results" +max_concurrent: 10 +default_size: 20 # Default size for all datasets +default_seed: 42 # Default seed for all datasets -``` -For example the following file will run an evaluation for deepseek r1 for algorithmic datasets. -``` yaml -model: deepseek/deepseek-r1 -provider: Nebius -category: algorithmic -datasets: - - ab - - base_conversion - - binary_matrix - - caesar_cipher - - count_primes - - game_of_life - - graph_color - - group_anagrams - - isomorphic_strings - - letter_counting - - letter_jumble - - manipulate_matrix - - number_filtering - - number_sorting - - palindrome - - pool_matrix - - ransom_note - - rotate_matrix - - sentence_reordering - - spell_backward - - spiral_matrix - - string_insertion - - string_manipulation - - string_synthesis - - word_ladder - - word_sequence_reversal - - word_sorting -eval_dir: results/deepseek-r1 -dataset_size: 50 -dataset_seed: 45 -developer_role: system +categories: + - category: "algebra" + datasets: + - dataset: "complex_arithmetic" + params: + min_real: -10 + max_real: 10 + min_imag: -10 + max_imag: 10 + - category: "arithmetic" + datasets: + - dataset: "chain_sum" + size: 12 + seed: 43 + params: + min_digits: 2 + allow_negation: true + + - dataset: "products" + 
size: 10 + seed: 43 + params: + min_digits: 2 + allow_negation: true ``` - The following would run Claude 3.5 on the algorithmic dataset. +For example, to evaluate Claude 3.5 Sonnet on algorithmic datasets: + ```yaml -model: anthropic/claude-3.5-sonnet -category: algorithmic -provider: Anthropic -datasets: - - count_primes - - game_of_life - - graph_color - - group_anagrams - - isomorphic_strings - - letter_counting - - letter_jumble - - manipulate_matrix - - number_filtering - - number_sorting - - palindrome - - pool_matrix - - ransom_note - - rotate_matrix - - sentence_reordering - - spell_backward - - spiral_matrix - - string_insertion - - string_manipulation - - string_synthesis - - word_ladder - - word_sequence_reversal - - word_sorting -eval_dir: results/claude-3.5-sonnet -dataset_size: 50 -dataset_seed: 45 -developer_role: system +model: "anthropic/claude-3.5-sonnet" +provider: "Anthropic" +output_dir: "results" +max_concurrent: 5 +default_size: 50 +default_seed: 45 + +categories: + - category: "algorithmic" + datasets: + - dataset: "count_primes" + - dataset: "game_of_life" + - dataset: "graph_color" + - dataset: "isomorphic_strings" + - dataset: "letter_jumble" + - dataset: "rotate_matrix" + - dataset: "sentence_reordering" + - dataset: "string_manipulation" + - dataset: "word_ladder" + - dataset: "word_sorting" ``` -Here you specify individual model and provider ### Running Evaluations -To run evaluations +To run evaluations: + +```bash +python eval.py --config configs/your_config.yaml ``` -python eval.py --yaml + +For example: + +```bash +python eval.py --config example_config.yaml --full-results ``` -e.g -``` -python eval.py --yaml yaml/r1/algorithmic.yaml -``` -To run r1 evaluations on algorithmic.yaml -The results of individual model on a dataset will be stored in a new folder in the directory E.g `r1/algorithmic/proposition_logic.json`. -Please upload records of your results to [reasoning-gym-eval](https://github.com/open-thought/reasoning-gym-eval). 
+The results will be stored in a directory named after the model and timestamp, containing: +- `summary.json` - Summary of all results +- `results.json` - Full results (if `--full-results` is specified) +- Individual dataset results in category subdirectories + +For example: +``` +results/ +└── meta-llama_llama-3.3-70b-instruct_20250227_162030/ + ├── summary.json + ├── results.json + ├── algebra/ + │ └── complex_arithmetic.json + └── arithmetic/ + ├── chain_sum.json + └── products.json +``` + +Please upload your results to [reasoning-gym-eval](https://github.com/open-thought/reasoning-gym-eval). diff --git a/eval/__init__.py b/eval/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/eval/eval.py b/eval/eval.py index 72bf1140..32293a0c 100755 --- a/eval/eval.py +++ b/eval/eval.py @@ -1,16 +1,40 @@ #!/usr/bin/env python +""" +Evaluation script for reasoning gym datasets. + +This script evaluates LLM performance on reasoning gym datasets using the OpenRouter API. + +Usage: + python eval.py --config config.yaml [options] + +Options: + --model MODEL Override model specified in config + --output-dir DIR Override output directory specified in config + --max-concurrent NUM Maximum number of concurrent API calls + --save-metadata Save entry metadata in results + --full-results Save the full results file + --verbose Print detailed model responses + --debug Enable debug logging + +Environment variables: + OPENROUTER_API_KEY Required API key for OpenRouter +""" + import argparse import asyncio import json import logging import os -from dataclasses import asdict +import subprocess +import sys +from collections import OrderedDict from datetime import datetime -from typing import Any +from pathlib import Path +from typing import Any, Union -import aiohttp -from eval_config import EvalConfig -from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential +from eval_config import CategoryConfig, DatasetConfig, EvalConfig 
+from openai import AsyncOpenAI +from tqdm.asyncio import tqdm_asyncio import reasoning_gym from reasoning_gym.utils import extract_answer @@ -22,130 +46,473 @@ logging.basicConfig( handlers=[logging.StreamHandler()], ) +# httpx logging will be configured in the AsyncModelEvaluator class +# based on the debug flag -class OpenRouterEvaluator: - def __init__(self, model: str, config: EvalConfig, api_key: str): - self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}") + +def get_git_hash() -> str: + """Get current git hash for reproducibility.""" + cmd = ["git", "rev-parse", "HEAD"] + try: + return subprocess.check_output(cmd, text=True, stderr=subprocess.PIPE).strip() + except Exception: + return "unknown" + + +class AsyncModelEvaluator: + """Evaluates models on reasoning datasets with async API calls via OpenRouter.""" + + def __init__(self, config: EvalConfig, verbose: bool = False, debug: bool = False): + """Initialize the evaluator with configuration. + + Args: + config: Evaluation configuration + verbose: Whether to print detailed model responses + debug: Whether to enable debug logging + """ self.config = config - self.output_dir = f"{config.eval_dir}/{config.category}" - os.makedirs(self.output_dir, exist_ok=True) - self.base_url = "https://openrouter.ai/api/v1/chat/completions" - self.api_key = api_key - self.model = model - self.headers = { - "Authorization": f"Bearer {self.api_key}", - "HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"), - "X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"), - "Content-Type": "application/json", - } - self.semaphore = asyncio.Semaphore(15) # Control concurrency + self.verbose = verbose + self.debug = debug - def save_results(self, results: list[dict[str, Any]], dataset, dataset_name) -> dict[str, Any]: - file_name = f"{self.output_dir}/{dataset_name}.json" - total_score = sum(r["score"] for r in results) + # Set up logging + self.logger = logging.getLogger("AsyncModelEvaluator") + if debug: + 
self.logger.setLevel(logging.DEBUG) + # Enable httpx logs in debug mode + logging.getLogger("httpx").setLevel(logging.INFO) + else: + # Suppress httpx logs in normal mode + logging.getLogger("httpx").setLevel(logging.WARNING) - metrics = { - "dataset_name": dataset_name, - "model": self.model, - "size": dataset.size, - "provider": self.config.provider, - "average_score": total_score / len(results) if results else 0, - "total_examples": len(results), - "timestamp": datetime.now().isoformat(), - "config": asdict(dataset.config), - "results": results, - } + # Set up OpenRouter API client + api_key = os.getenv("OPENROUTER_API_KEY") + if not api_key: + raise ValueError("OPENROUTER_API_KEY environment variable is not set") - with open(file_name, "w") as f: - json.dump(metrics, f, indent=2) - return metrics + self.client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key) - async def get_model_response(self, session: aiohttp.ClientSession, prompt: str) -> str: - payload = { - "model": self.model, - "messages": [ - {"role": self.config.developer_role, "content": self.config.developer_prompt}, - {"role": "user", "content": prompt}, - ], - "provider": {"order": [self.config.provider], "allow_fallbacks": False}, - } + # Concurrency control + self.semaphore = asyncio.Semaphore(config.max_concurrent) - async for attempt in AsyncRetrying( - stop=stop_after_attempt(20), - wait=wait_exponential(multiplier=1, min=1, max=60), - retry=retry_if_exception_type( - (aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError, ValueError) - ), - ): - with attempt: - async with session.post(self.base_url, json=payload) as response: - data = await response.json() + # Metadata + self.git_hash = get_git_hash() + self.start_time = datetime.now() - if not data: - raise ValueError("Empty response") + async def get_model_response(self, prompt: str) -> str: + """Get response from model with retry logic via OpenRouter. 
- if not data.get("choices"): - raise ValueError("Missing choices in response") + Args: + prompt: The prompt to send to the model - return data["choices"][0]["message"]["content"] + Returns: + The model's response text - raise Exception("Failed to get valid response after retries") + Raises: + Exception: If all retries fail + """ + max_retries = 10 + base_delay = 1.0 + max_delay = 60.0 + backoff_factor = 2.0 - async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> dict[str, Any]: - """Process a single entry with concurrency control.""" - async with self.semaphore: - response = await self.get_model_response(session, entry["question"]) + for attempt in range(max_retries): + try: + async with self.semaphore: + # Prepare API call parameters + params = { + "model": self.config.model, + "messages": [ + {"role": self.config.system_role, "content": self.config.system_prompt}, + {"role": "user", "content": prompt}, + ], + } + + # Add provider configuration if specified + if self.config.provider: + params["extra_body"] = {"provider": {"order": [self.config.provider], "allow_fallbacks": False}} + + completion = await self.client.chat.completions.create(**params) + response = completion.choices[0].message.content + + if self.verbose: + self.logger.info(f"Prompt: {prompt}") + self.logger.info(f"Response: {response}") + + return response + + except Exception as e: + delay = min(max_delay, base_delay * (backoff_factor**attempt)) + self.logger.warning(f"Attempt {attempt+1}/{max_retries} failed: {str(e)}") + self.logger.warning(f"Retrying in {delay:.2f} seconds...") + await asyncio.sleep(delay) + + raise Exception(f"Failed to get model response after {max_retries} attempts") + + async def process_entry( + self, dataset: reasoning_gym.dataset.ProceduralDataset, entry: dict[str, Any] + ) -> dict[str, Any]: + """Process a single dataset entry. 
+ + Args: + dataset: The dataset instance + entry: The entry to process + + Returns: + Dict with processing results + """ + try: + response = await self.get_model_response(entry["question"]) model_answer = extract_answer(response) score = dataset.score_answer(answer=model_answer, entry=entry) - print(f"answer: {model_answer}, score: {score}") + if self.verbose: + print(f"Question: {entry['question']}") + print(f"Expected: {entry['answer']}") + print(f"Answer: {model_answer}") + print(f"Score: {score}") + print("-" * 40) - return { + result = { "question": entry["question"], "expected_answer": str(entry["answer"]), "model_answer": model_answer, "full_model_response": response, "score": score, - "metadata": str(entry["metadata"]), } - async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> dict[str, Any]: - """Evaluate a single dataset asynchronously.""" - self.logger.info(f"\nEvaluating dataset: {dataset_name}") - dataset = reasoning_gym.create_dataset( - dataset_name, size=self.config.dataset_size, seed=self.config.dataset_seed - ) + # Only include metadata if configured to do so + if self.config.save_metadata: + result["metadata"] = entry["metadata"] - tasks = [self.process_entry(session, dataset, entry) for entry in dataset] - results = await asyncio.gather(*tasks) - return self.save_results(results, dataset, dataset_name) + return result - async def evaluate_datasets(self) -> list[dict[str, Any]]: - """Main async evaluation entry point.""" - async with aiohttp.ClientSession(headers=self.headers) as session: - return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets)) + except Exception as e: + self.logger.error(f"Error processing entry: {str(e)}") + result = { + "question": entry["question"], + "expected_answer": str(entry["answer"]), + "model_answer": "ERROR", + "full_model_response": f"Error: {str(e)}", + "score": 0.0, + "error": str(e), + } + + # Only include metadata if configured to 
do so + if self.config.save_metadata: + result["metadata"] = entry["metadata"] + + return result + + async def evaluate_dataset(self, category_name: str, dataset_config: DatasetConfig) -> dict[str, Any]: + """Evaluate a single dataset. + + Args: + category_name: Name of the category + dataset_config: Configuration for the dataset + + Returns: + Dict with evaluation results + """ + dataset_name = dataset_config.dataset + self.logger.info(f"Evaluating dataset: {dataset_name}") + + try: + # Create dataset with all parameters + dataset_params = {} + + # Add all parameters from the config params dictionary + # Make sure we don't have a nested 'params' dictionary + for k, v in dataset_config.params.items(): + if k != "params": + dataset_params[k] = v + elif isinstance(v, dict): + # If there's a nested params dict, flatten it + dataset_params.update(v) + + # Add size and seed if they're not None + if dataset_config.size is not None: + dataset_params["size"] = dataset_config.size + if dataset_config.seed is not None: + dataset_params["seed"] = dataset_config.seed + + dataset = reasoning_gym.create_dataset(dataset_name, **dataset_params) + + # Get all entries + all_entries = list(dataset) + + # Process entries with progress bar + tasks = [self.process_entry(dataset, entry) for entry in all_entries] + results = await tqdm_asyncio.gather(*tasks, desc=f"Processing {dataset_name}", leave=True) + + # Calculate metrics + total_score = sum(r["score"] for r in results) + average_score = total_score / len(results) if results else 0 + + return { + "name": dataset_name, + "category": category_name, + "average_score": average_score, + "total_examples": len(results), + "config": {"size": dataset_config.size, "seed": dataset_config.seed, **dataset_config.params}, + "results": results, + } + + except Exception as e: + self.logger.error(f"Error evaluating dataset {dataset_name}: {str(e)}") + return { + "name": dataset_name, + "category": category_name, + "average_score": 0.0, + 
"total_examples": 0, + "config": {"size": dataset_config.size, "seed": dataset_config.seed, **dataset_config.params}, + "error": str(e), + "results": [], + } + + async def evaluate_category(self, category_config: CategoryConfig) -> dict[str, Any]: + """Evaluate all datasets in a category. + + Args: + category_config: Configuration for the category + + Returns: + Dict with category evaluation results + """ + category_name = category_config.category + self.logger.info(f"Evaluating category: {category_name}") + + tasks = [self.evaluate_dataset(category_name, dataset_config) for dataset_config in category_config.datasets] + + dataset_results = await asyncio.gather(*tasks) + + return { + "name": category_name, + "datasets": dataset_results, + } + + async def evaluate_all(self) -> dict[str, Any]: + """Evaluate all categories and datasets. + + Returns: + Dict with all evaluation results and summary + """ + self.logger.info(f"Starting evaluation of {len(self.config.categories)} categories") + + tasks = [self.evaluate_category(category) for category in self.config.categories] + category_results = await asyncio.gather(*tasks) + + # Generate results structure + results = { + "metadata": { + "timestamp": self.start_time.isoformat(), + "model": self.config.model, + "provider": self.config.provider, + "git_hash": self.git_hash, + "duration_seconds": (datetime.now() - self.start_time).total_seconds(), + }, + "categories": category_results, + } + + # Generate summary + results["summary"] = self.generate_summary(results) + + return results + + def generate_summary(self, results: dict[str, Any]) -> dict[str, Union[int, OrderedDict]]: + """Generate a summary of evaluation results in the original configuration order. 
+ + Args: + results: The full evaluation results + + Returns: + Dict with summary information + """ + summary = { + "total_datasets": 0, + "total_examples": 0, + "dataset_scores": OrderedDict(), + } + + # Iterate through categories and datasets in the original order from config + for category_config in self.config.categories: + for dataset_config in category_config.datasets: + dataset_name = dataset_config.dataset + dataset_found = False + + # Find corresponding results + for category in results["categories"]: + if category["name"] == category_config.category: + for dataset in category["datasets"]: + if dataset["name"] == dataset_name: + # Add to summary in original order + summary["dataset_scores"][dataset_name] = dataset["average_score"] + summary["total_datasets"] += 1 + summary["total_examples"] += dataset["total_examples"] + dataset_found = True + break + + # If dataset wasn't found in results (error), add with score 0 + if not dataset_found: + summary["dataset_scores"][dataset_name] = 0.0 + summary["total_datasets"] += 1 + + return summary + + def save_results(self, results: dict[str, Any]) -> tuple[str, str]: + """Save evaluation results to files. 
+ + Args: + results: The evaluation results to save + + Returns: + Tuple of (results_path, summary_path) + """ + # Create output directory with timestamp + timestamp = self.start_time.strftime("%Y%m%d_%H%M%S") + model_name = self.config.model.replace("/", "_") + + # Format directory name with model and timestamp only + output_dir = Path(self.config.output_dir) / f"{model_name}_{timestamp}" + output_dir.mkdir(parents=True, exist_ok=True) + + results_path = None + + # Save full results if configured to do so + if self.config.save_full_results: + results_path = output_dir / "results.json" + with open(results_path, "w") as f: + json.dump(results, f, indent=2) + + # Add timestamp, git hash, model, provider, and duration to summary + summary_data = results["summary"].copy() + summary_data["timestamp"] = self.start_time.isoformat() + summary_data["git_hash"] = self.git_hash + summary_data["model"] = self.config.model + summary_data["provider"] = self.config.provider + summary_data["duration_seconds"] = results["metadata"]["duration_seconds"] + + # Save summary + summary_path = output_dir / "summary.json" + with open(summary_path, "w") as f: + json.dump(summary_data, f, indent=2) + + # Save individual dataset results + for category in results["categories"]: + category_dir = output_dir / category["name"] + category_dir.mkdir(exist_ok=True) + + for dataset in category["datasets"]: + dataset_path = category_dir / f"{dataset['name']}.json" + with open(dataset_path, "w") as f: + json.dump(dataset, f, indent=2) + + return str(results_path) if results_path else None, str(summary_path) + + def print_summary(self, results: dict[str, Any]) -> None: + """Print a summary of evaluation results to the console. 
+ + Args: + results: The evaluation results + """ + summary = results["summary"] + + print("\nEvaluation Summary:") + print("------------------") + print(f"Model: {self.config.model}") + print(f"Provider: {self.config.provider}") + print(f"Git Hash: {self.git_hash}") + print(f"Duration: {results['metadata']['duration_seconds']:.2f} seconds") + print() + + print("Dataset Scores (in configuration order):") + for dataset_name, score in summary["dataset_scores"].items(): + # Find the number of examples for this dataset + examples = 0 + for category in results["categories"]: + for dataset in category["datasets"]: + if dataset["name"] == dataset_name: + examples = dataset["total_examples"] + break + + print(f" {dataset_name}: {score:.1%} ({examples} examples)") + + print() + print(f"Total datasets: {summary['total_datasets']}") + print(f"Total examples: {summary['total_examples']}") -async def async_main(): - api_key = os.getenv("OPENROUTER_API_KEY") - if not api_key: - print("Error: OPENROUTER_API_KEY environment variable is not set") - print("Please set it using: export OPENROUTER_API_KEY=your-api-key") - exit(1) - +async def main_async(): + """Main async function.""" parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets") - parser.add_argument("--yaml", required=True, help="Path to YAML configuration file") + parser.add_argument("--config", required=True, help="Path to configuration file (YAML or JSON)") + parser.add_argument("--model", help="Override model specified in config") + parser.add_argument("--output-dir", help="Override output directory specified in config") + parser.add_argument("--max-concurrent", type=int, help="Maximum number of concurrent API calls") + parser.add_argument("--save-metadata", action="store_true", help="Save entry metadata in results") + parser.add_argument("--full-results", action="store_true", help="Save the full results file") + parser.add_argument("--verbose", action="store_true", help="Print detailed 
model responses") + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + args = parser.parse_args() - config = EvalConfig.from_yaml(args.yaml) - evaluator = OpenRouterEvaluator(model=config.model, config=config, api_key=api_key) - results = await evaluator.evaluate_datasets() + # Check for required API key + if not os.getenv("OPENROUTER_API_KEY"): + print("Error: OPENROUTER_API_KEY environment variable is not set") + print("Please set it using: export OPENROUTER_API_KEY=your-api-key") + return 1 - output_dir = f"{config.eval_dir}/{config.category}" - os.makedirs(output_dir, exist_ok=True) - with open(f"{output_dir}/summary.json", "w") as f: - json.dump(results, f, indent=2) + # Load configuration + config_path = args.config + if config_path.endswith(".yaml") or config_path.endswith(".yml"): + config = EvalConfig.from_yaml(config_path) + elif config_path.endswith(".json"): + config = EvalConfig.from_json(config_path) + else: + print("Error: Configuration file must be YAML or JSON") + return 1 + + # Apply command line overrides + if args.model: + config.model = args.model + if args.output_dir: + config.output_dir = args.output_dir + if args.max_concurrent: + config.max_concurrent = args.max_concurrent + if args.save_metadata: + config.save_metadata = True + if args.full_results: + config.save_full_results = True + + # Create evaluator + evaluator = AsyncModelEvaluator(config=config, verbose=args.verbose, debug=args.debug) + + # Run evaluation + try: + results = await evaluator.evaluate_all() + + # Save and print results + results_path, summary_path = evaluator.save_results(results) + evaluator.print_summary(results) + + if results_path: + print(f"\nResults saved to: {results_path}") + print(f"Summary saved to: {summary_path}") + + return 0 + except Exception as e: + print(f"Error during evaluation: {str(e)}") + if args.debug: + import traceback + + traceback.print_exc() + return 1 + + +def main(): + """Entry point.""" + exit_code = 
asyncio.run(main_async()) + sys.exit(exit_code) if __name__ == "__main__": - asyncio.run(async_main()) + main() diff --git a/eval/eval_config.py b/eval/eval_config.py index c92016b3..6099069c 100644 --- a/eval/eval_config.py +++ b/eval/eval_config.py @@ -1,25 +1,139 @@ -from dataclasses import dataclass -from typing import Union +"""Configuration classes for the evaluation script""" + +import json +import re +from dataclasses import dataclass, field +from typing import Any, Optional import yaml from reasoning_gym.utils import SYSTEM_PROMPTS +def is_valid_unix_filename(filename: str) -> bool: + """ + Check for shell-safe filenames. + Only allows alphanumeric characters, hyphens, and underscores. + """ + if not filename: + return False + return bool(re.match(r"^[a-zA-Z0-9_-]+$", filename)) + + +@dataclass +class DatasetConfig: + """Configuration for a specific dataset""" + + dataset: str + size: int = 500 + seed: Optional[int] = None + # Allow any additional dataset-specific parameters + params: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class CategoryConfig: + """Configuration for a category of datasets""" + + category: str + datasets: list[DatasetConfig] + + @dataclass class EvalConfig: - category: str - datasets: Union[str, list[str]] - eval_dir: str - dataset_size: int - dataset_seed: int + """Global evaluation configuration""" + model: str - provider: str - developer_role: str = "system" - developer_prompt: str = SYSTEM_PROMPTS["DeepSeekZero"] + provider: Optional[str] = None + system_prompt: str = SYSTEM_PROMPTS["default"] + system_role: str = "system" + output_dir: str = "results" + max_concurrent: int = 10 + default_size: int = 500 + default_seed: Optional[int] = None + save_metadata: bool = False + save_full_results: bool = False + categories: list[CategoryConfig] = field(default_factory=list) @classmethod - def from_yaml(cls, yaml_path: str): + def from_json(cls, json_path: str) -> "EvalConfig": + """Load configuration from JSON file""" + 
with open(json_path, "r") as f: + config_data = json.load(f) + + return cls._process_config_data(config_data) + + @classmethod + def from_yaml(cls, yaml_path: str) -> "EvalConfig": + """Load configuration from YAML file""" with open(yaml_path, "r") as f: - config = yaml.safe_load(f) - return cls(**config) + config_data = yaml.safe_load(f) + + return cls._process_config_data(config_data) + + @classmethod + def _process_config_data(cls, config_data: dict[str, Any]) -> "EvalConfig": + """Process configuration data from either JSON or YAML""" + # Extract categories + categories_data = config_data.pop("categories", []) + categories = [] + + for category_data in categories_data: + category_name = category_data.get("category") + if not is_valid_unix_filename(category_name): + raise ValueError( + f"Invalid category name '{category_name}'. Category names must be valid Unix filenames." + ) + + # Process datasets in this category + datasets_data = category_data.get("datasets", []) + datasets = [] + + for dataset_data in datasets_data: + # A bare string is shorthand for {"dataset": <name>}; use the same key that is read below + if isinstance(dataset_data, str): + dataset_data = {"dataset": dataset_data} + + # Extract dataset name + dataset_name = dataset_data.get("dataset") + + # Extract size and seed with defaults + size = dataset_data.get("size", config_data.get("default_size", 500)) + seed = dataset_data.get("seed", config_data.get("default_seed")) + + # Extract all other parameters (everything except dataset, size, and seed) + # If there's a nested 'params' dictionary, use its contents directly + params = {} + for k, v in dataset_data.items(): + if k not in ["dataset", "size", "seed"]: + if k == "params" and isinstance(v, dict): + # Flatten nested params dictionary + params.update(v) + else: + params[k] = v + + # Create dataset config + dataset_config = DatasetConfig( + dataset=dataset_name, + size=size, + seed=seed, + params=params, + ) + datasets.append(dataset_config) + + # Create category config +
category_config = CategoryConfig(category=category_name, datasets=datasets) + categories.append(category_config) + + # Create main config + return cls( + model=config_data.get("model"), + provider=config_data.get("provider"), + system_prompt=config_data.get("system_prompt", SYSTEM_PROMPTS["default"]), + system_role=config_data.get("system_role", "system"), + output_dir=config_data.get("output_dir", "results"), + max_concurrent=config_data.get("max_concurrent", 10), + save_metadata=config_data.get("save_metadata", False), + save_full_results=config_data.get("save_full_results", False), + categories=categories, + ) diff --git a/eval/example_config.json b/eval/example_config.json new file mode 100644 index 00000000..ae83a2f6 --- /dev/null +++ b/eval/example_config.json @@ -0,0 +1,47 @@ +{ + "model": "meta-llama/llama-3.3-70b-instruct", + "provider": "Hyperbolic", + "output_dir": "results", + "max_concurrent": 10, + "default_size": 20, + "default_seed": 42, + "categories": [ + { + "category": "algebra", + "datasets": [ + { + "dataset": "complex_arithmetic", + "params": { + "min_real": -10, + "max_real": 10, + "min_imag": -10, + "max_imag": 10 + } + } + ] + }, + { + "category": "arithmetic", + "datasets": [ + { + "dataset": "products", + "size": 10, + "seed": 43, + "params": { + "min_digits": 2, + "allow_negation": true + } + }, + { + "dataset": "chain_sum", + "size": 12, + "seed": 43, + "params": { + "min_digits": 2, + "allow_negation": true + } + } + ] + } + ] +} diff --git a/eval/example_config.yaml b/eval/example_config.yaml new file mode 100644 index 00000000..31cb909f --- /dev/null +++ b/eval/example_config.yaml @@ -0,0 +1,33 @@ +# Example configuration for the evaluation script +model: "meta-llama/llama-3.3-70b-instruct" +provider: "Hyperbolic" +output_dir: "results" +max_concurrent: 10 +default_size: 20 # Default size for all datasets +default_seed: 42 # Default seed for all datasets + +categories: + - category: "algebra" + datasets: + - dataset: 
"complex_arithmetic" + params: + min_real: -10 + max_real: 10 + min_imag: -10 + max_imag: 10 + + - category: "arithmetic" + datasets: + - dataset: "products" + size: 10 + seed: 43 + params: + min_digits: 2 + allow_negation: true + + - dataset: "chain_sum" + size: 12 + seed: 43 + params: + min_digits: 2 + allow_negation: true diff --git a/eval/requirements-eval.txt b/eval/requirements-eval.txt index 91567b6a..8650b14a 100644 --- a/eval/requirements-eval.txt +++ b/eval/requirements-eval.txt @@ -1,2 +1,3 @@ -aiohttp>=3.11.13 -tenacity>=9.0.0 +openai>=1.64.0 +PyYAML>=6.0 +tqdm>=4.66.0 diff --git a/eval/scripts/run_llama-3.3-70-instruct_all.sh b/eval/scripts/run_llama-3.3-70-instruct_all.sh deleted file mode 100755 index 08b355ec..00000000 --- a/eval/scripts/run_llama-3.3-70-instruct_all.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# run this script from the parent directory -./eval.py --yaml yaml/llama-3.3-70b-instruct/algebra.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/algorithmic.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/arc.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/arithmetic.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/code.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/cognition.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/games.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/geometry.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/graphs.yaml -./eval.py --yaml yaml/llama-3.3-70b-instruct/logic.yaml diff --git a/eval/yaml/anthropic/algorithmic.yaml b/eval/yaml/anthropic/algorithmic.yaml deleted file mode 100644 index fd612212..00000000 --- a/eval/yaml/anthropic/algorithmic.yaml +++ /dev/null @@ -1,31 +0,0 @@ -model: anthropic/claude-3.5-sonnet -category: algorithmic -provider: Anthropic -datasets: - - count_primes - - game_of_life - - graph_color - - group_anagrams - - isomorphic_strings - - letter_counting - - letter_jumble - - manipulate_matrix - - number_filtering - - number_sorting - - palindrome - - 
pool_matrix - - ransom_note - - rotate_matrix - - sentence_reordering - - spell_backward - - spiral_matrix - - string_insertion - - string_manipulation - - string_synthesis - - word_ladder - - word_sequence_reversal - - word_sorting -eval_dir: eval/sonnet-3.5 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/claude-3.5-sonnet.yaml b/eval/yaml/claude-3.5-sonnet.yaml new file mode 100644 index 00000000..82f4d43b --- /dev/null +++ b/eval/yaml/claude-3.5-sonnet.yaml @@ -0,0 +1,34 @@ +# Combined configuration for Claude 3.5 Sonnet +model: "anthropic/claude-3.5-sonnet" +provider: "Anthropic" +output_dir: "results" +max_concurrent: 10 +default_size: 50 +default_seed: 45 + +categories: + - category: "algorithmic" + datasets: + - dataset: "count_primes" + - dataset: "game_of_life" + - dataset: "graph_color" + - dataset: "group_anagrams" + - dataset: "isomorphic_strings" + - dataset: "letter_counting" + - dataset: "letter_jumble" + - dataset: "manipulate_matrix" + - dataset: "number_filtering" + - dataset: "number_sorting" + - dataset: "palindrome" + - dataset: "pool_matrix" + - dataset: "ransom_note" + - dataset: "rotate_matrix" + - dataset: "sentence_reordering" + - dataset: "spell_backward" + - dataset: "spiral_matrix" + - dataset: "string_insertion" + - dataset: "string_manipulation" + - dataset: "string_synthesis" + - dataset: "word_ladder" + - dataset: "word_sequence_reversal" + - dataset: "word_sorting" diff --git a/eval/yaml/deepseek-r1.yaml b/eval/yaml/deepseek-r1.yaml new file mode 100644 index 00000000..b0232494 --- /dev/null +++ b/eval/yaml/deepseek-r1.yaml @@ -0,0 +1,61 @@ +# Combined configuration for deepseek-r1 +model: "deepseek/deepseek-r1" +provider: "Nebius" +output_dir: "results" +max_concurrent: 10 +default_size: 50 +default_seed: 45 + +categories: + - category: "algebra" + datasets: + - dataset: "intermediate_integration" + - dataset: "polynomial_equations" + - dataset: "polynomial_multiplication" + - dataset: 
"simple_equations" + - dataset: "simple_integration" + - dataset: "complex_arithmetic" + + - category: "algorithmic" + datasets: + - dataset: "ab" + - dataset: "base_conversion" + - dataset: "binary_matrix" + - dataset: "caesar_cipher" + - dataset: "count_primes" + - dataset: "game_of_life" + - dataset: "graph_color" + - dataset: "group_anagrams" + - dataset: "isomorphic_strings" + - dataset: "letter_counting" + - dataset: "letter_jumble" + - dataset: "manipulate_matrix" + - dataset: "number_filtering" + - dataset: "number_sorting" + - dataset: "palindrome" + - dataset: "pool_matrix" + - dataset: "ransom_note" + - dataset: "rotate_matrix" + - dataset: "sentence_reordering" + - dataset: "spell_backward" + - dataset: "spiral_matrix" + - dataset: "string_insertion" + - dataset: "string_manipulation" + - dataset: "string_synthesis" + - dataset: "word_ladder" + - dataset: "word_sequence_reversal" + - dataset: "word_sorting" + + - category: "cognition" + datasets: + - dataset: "color_cube_rotation" + - dataset: "figlet_font" + - dataset: "number_sequence" + - dataset: "rubiks_cube" + + - category: "logic" + datasets: + - dataset: "propositional_logic" + - dataset: "self_reference" + - dataset: "syllogism" + - dataset: "zebra_puzzles" diff --git a/eval/yaml/example.yaml b/eval/yaml/example.yaml deleted file mode 100644 index 0722bd8e..00000000 --- a/eval/yaml/example.yaml +++ /dev/null @@ -1,9 +0,0 @@ -model: anthropic/claude-3.7-sonnet # find model id: https://openrouter.ai/models -provider: Anthropic -category: test -datasets: - - YOUR_DATASET_NAME -eval_dir: results/test -dataset_size: 100 -dataset_seed: 42 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct.yaml b/eval/yaml/llama-3.3-70b-instruct.yaml new file mode 100644 index 00000000..b42ab9bc --- /dev/null +++ b/eval/yaml/llama-3.3-70b-instruct.yaml @@ -0,0 +1,130 @@ +# Combined configuration for llama-3.3-70b-instruct +model: "meta-llama/llama-3.3-70b-instruct" +provider: "Hyperbolic" 
+output_dir: "results" +max_concurrent: 10 +default_size: 50 +default_seed: 45 + +categories: + - category: "algebra" + datasets: + - dataset: "intermediate_integration" + - dataset: "polynomial_equations" + - dataset: "polynomial_multiplication" + - dataset: "simple_equations" + - dataset: "simple_integration" + - dataset: "complex_arithmetic" + + - category: "algorithmic" + datasets: + - dataset: "ab" + - dataset: "base_conversion" + - dataset: "binary_alternation" + - dataset: "binary_matrix" + - dataset: "caesar_cipher" + - dataset: "count_primes" + - dataset: "cryptarithm" + - dataset: "game_of_life" + - dataset: "graph_color" + - dataset: "group_anagrams" + - dataset: "isomorphic_strings" + - dataset: "jugs" + - dataset: "letter_counting" + - dataset: "letter_jumble" + - dataset: "manipulate_matrix" + - dataset: "number_filtering" + - dataset: "number_sorting" + - dataset: "palindrome" + - dataset: "palindrome_partitioning" + - dataset: "pool_matrix" + - dataset: "ransom_note" + - dataset: "rotate_matrix" + - dataset: "rotten_oranges" + - dataset: "sentence_reordering" + - dataset: "spell_backward" + - dataset: "spiral_matrix" + - dataset: "string_insertion" + - dataset: "string_manipulation" + - dataset: "string_splitting" + - dataset: "string_synthesis" + - dataset: "word_ladder" + - dataset: "word_sequence_reversal" + - dataset: "word_sorting" + + - category: "arc" + datasets: + - dataset: "arc_1d" + - dataset: "arc_agi" + - dataset: "rearc" + + - category: "arithmetic" + datasets: + - dataset: "basic_arithmetic" + - dataset: "bitwise_arithmetic" + - dataset: "calendar_arithmetic" + - dataset: "chain_sum" + - dataset: "count_bits" + - dataset: "decimal_arithmetic" + - dataset: "decimal_chain_sum" + - dataset: "dice" + - dataset: "fraction_simplification" + - dataset: "gcd" + - dataset: "gsm_symbolic" + - dataset: "lcm" + - dataset: "leg_counting" + - dataset: "number_format" + - dataset: "power_function" + - dataset: "prime_factorization" + - dataset: 
"products" + - dataset: "time_intervals" + + - category: "code" + datasets: + - dataset: "bf" + + - category: "cognition" + datasets: + - dataset: "color_cube_rotation" + - dataset: "figlet_font" + - dataset: "needle_haystack" + - dataset: "number_sequence" + - dataset: "rectangle_count" + - dataset: "rubiks_cube" + + - category: "games" + datasets: + - dataset: "countdown" + - dataset: "emoji_mystery" + - dataset: "futoshuki" + - dataset: "knight_swap" + - dataset: "maze" + - dataset: "mini_sudoku" + - dataset: "n_queens" + - dataset: "sokoban" + - dataset: "sudoku" + - dataset: "tower_of_hanoi" + - dataset: "tsumego" + + - category: "geometry" + datasets: + - dataset: "simple_geometry" + - dataset: "advanced_geometry" + + - category: "graphs" + datasets: + - dataset: "course_schedule" + - dataset: "family_relationships" + - dataset: "largest_island" + - dataset: "list_functions" + - dataset: "quantum_lock" + - dataset: "shortest_path" + + - category: "logic" + datasets: + - dataset: "aiw" + - dataset: "circuit_logic" + - dataset: "propositional_logic" + - dataset: "self_reference" + - dataset: "syllogism" + - dataset: "zebra_puzzles" diff --git a/eval/yaml/llama-3.3-70b-instruct/algebra.yaml b/eval/yaml/llama-3.3-70b-instruct/algebra.yaml deleted file mode 100644 index 13da48ea..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/algebra.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: algebra -datasets: - - intermediate_integration - - polynomial_equations - - polynomial_multiplication - - simple_equations - - simple_integration - - complex_arithmetic -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 42 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml b/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml deleted file mode 100644 index 5291bc7d..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml +++ /dev/null @@ -1,41 +0,0 @@ 
-model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: algorithmic -datasets: - - ab - - base_conversion - - binary_alternation - - binary_matrix - - caesar_cipher - - count_primes - - cryptarithm - - game_of_life - - graph_color - - group_anagrams - - isomorphic_strings - - jugs - - letter_counting - - letter_jumble - - manipulate_matrix - - number_filtering - - number_sorting - - palindrome - - palindrome_partitioning - - pool_matrix - - ransom_note - - rotate_matrix - - rotten_oranges - - sentence_reordering - - spell_backward - - spiral_matrix - - string_insertion - - string_manipulation - - string_splitting - - string_synthesis - - word_ladder - - word_sequence_reversal - - word_sorting -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/arc.yaml b/eval/yaml/llama-3.3-70b-instruct/arc.yaml deleted file mode 100644 index 50ca22b4..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/arc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: arc -datasets: - - arc_1d - - arc_agi - - rearc -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/arithmetic.yaml b/eval/yaml/llama-3.3-70b-instruct/arithmetic.yaml deleted file mode 100644 index 63e8e53f..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/arithmetic.yaml +++ /dev/null @@ -1,26 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: arithmetic -datasets: - - basic_arithmetic - - bitwise_arithmetic - - calendar_arithmetic - - chain_sum - - count_bits - - decimal_arithmetic - - decimal_chain_sum - - dice - - fraction_simplification - - gcd - - gsm_symbolic - - lcm - - leg_counting - - number_format - - power_function - - prime_factorization - - products - - time_intervals -eval_dir: results/llama-3.3-70b-instruct 
-dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/code.yaml b/eval/yaml/llama-3.3-70b-instruct/code.yaml deleted file mode 100644 index eb17fe1e..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/code.yaml +++ /dev/null @@ -1,9 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: code -datasets: - - bf -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/cognition.yaml b/eval/yaml/llama-3.3-70b-instruct/cognition.yaml deleted file mode 100644 index 9ae33127..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/cognition.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: cognition -datasets: - - color_cube_rotation - - figlet_font - - needle_haystack - - number_sequence - - rectangle_count - - rubiks_cube -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/games.yaml b/eval/yaml/llama-3.3-70b-instruct/games.yaml deleted file mode 100644 index 1620da14..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/games.yaml +++ /dev/null @@ -1,19 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: games -datasets: - - countdown - - emoji_mystery - - futoshuki - - knight_swap - - maze - - mini_sudoku - - n_queens - - sokoban - - sudoku - - tower_of_hanoi - - tsumego -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/geometry.yaml b/eval/yaml/llama-3.3-70b-instruct/geometry.yaml deleted file mode 100644 index a3b8fd0d..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/geometry.yaml +++ /dev/null @@ -1,10 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: geometry -datasets: - - simple_geometry - - 
advanced_geometry -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/graphs.yaml b/eval/yaml/llama-3.3-70b-instruct/graphs.yaml deleted file mode 100644 index c9414474..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/graphs.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: graphs -datasets: - - course_schedule - - family_relationships - - largest_island - - list_functions - - quantum_lock - - shortest_path -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/logic.yaml b/eval/yaml/llama-3.3-70b-instruct/logic.yaml deleted file mode 100644 index 9d2e126d..00000000 --- a/eval/yaml/llama-3.3-70b-instruct/logic.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: meta-llama/llama-3.3-70b-instruct -provider: Hyperbolic -category: logic -datasets: - - aiw - - circuit_logic - - propositional_logic - - self_reference - - syllogism - - zebra_puzzles -eval_dir: results/llama-3.3-70b-instruct -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3.yaml b/eval/yaml/openai-o3.yaml new file mode 100644 index 00000000..87574df0 --- /dev/null +++ b/eval/yaml/openai-o3.yaml @@ -0,0 +1,126 @@ +# Combined configuration for openai/o3-mini +model: "openai/o3-mini" +provider: "OpenAI" +output_dir: "results" +max_concurrent: 10 +default_size: 50 +default_seed: 45 + +categories: + - category: "algebra" + datasets: + - dataset: "complex_arithmetic" + - dataset: "intermediate_integration" + - dataset: "polynomial_equations" + - dataset: "polynomial_multiplication" + - dataset: "simple_equations" + - dataset: "simple_integration" + + - category: "algorithmic" + datasets: + - dataset: "ab" + - dataset: "binary_alternation" + - dataset: "base_conversion" + - dataset: "binary_matrix" + - dataset: "caesar_cipher" + - 
dataset: "count_primes" + - dataset: "cryptarithm" + - dataset: "game_of_life" + - dataset: "graph_color" + - dataset: "group_anagrams" + - dataset: "isomorphic_strings" + - dataset: "letter_counting" + - dataset: "letter_jumble" + - dataset: "manipulate_matrix" + - dataset: "number_filtering" + - dataset: "number_sorting" + - dataset: "palindrome" + - dataset: "pool_matrix" + - dataset: "ransom_note" + - dataset: "rotate_matrix" + - dataset: "sentence_reordering" + - dataset: "spell_backward" + - dataset: "spiral_matrix" + - dataset: "string_insertion" + - dataset: "string_manipulation" + - dataset: "string_synthesis" + - dataset: "word_ladder" + - dataset: "word_sequence_reversal" + - dataset: "word_sorting" + + - category: "arc" + datasets: + - dataset: "arc_1d" + - dataset: "arc_agi" + - dataset: "rearc" + + - category: "arithmetic" + datasets: + - dataset: "basic_arithmetic" + - dataset: "bitwise_arithmetic" + - dataset: "calendar_arithmetic" + - dataset: "chain_sum" + - dataset: "count_bits" + - dataset: "decimal_arithmetic" + - dataset: "decimal_chain_sum" + - dataset: "dice" + - dataset: "fraction_simplification" + - dataset: "gcd" + - dataset: "gsm_symbolic" + - dataset: "lcm" + - dataset: "leg_counting" + - dataset: "number_format" + - dataset: "power_function" + - dataset: "prime_factorization" + - dataset: "products" + - dataset: "time_intervals" + + - category: "code" + datasets: + - dataset: "bf" + + - category: "cognition" + datasets: + - dataset: "color_cube_rotation" + - dataset: "figlet_font" + - dataset: "needle_haystack" + - dataset: "number_sequence" + - dataset: "rectangle_count" + - dataset: "rubiks_cube" + + - category: "games" + datasets: + - dataset: "countdown" + - dataset: "emoji_mystery" + - dataset: "futoshuki" + - dataset: "knight_swap" + - dataset: "maze" + - dataset: "mini_sudoku" + - dataset: "n_queens" + - dataset: "sokoban" + - dataset: "sudoku" + - dataset: "tower_of_hanoi" + - dataset: "tsumego" + + - category: "geometry" + 
datasets: + - dataset: "simple_geometry" + - dataset: "advanced_geometry" + + - category: "graphs" + datasets: + - dataset: "course_schedule" + - dataset: "family_relationships" + - dataset: "largest_island" + - dataset: "list_functions" + - dataset: "quantum_lock" + - dataset: "shortest_path" + + - category: "logic" + datasets: + - dataset: "aiw" + - dataset: "circuit_logic" + - dataset: "propositional_logic" + - dataset: "self_reference" + - dataset: "syllogism" + - dataset: "zebra_puzzles" diff --git a/eval/yaml/openai-o3/algebra.yaml b/eval/yaml/openai-o3/algebra.yaml deleted file mode 100644 index aee04d20..00000000 --- a/eval/yaml/openai-o3/algebra.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: openai/o3-mini -category: algebra -provider: OpenAI -datasets: - - complex_arithmetic - - intermediate_integration - - polynomial_equations - - polynomial_multiplication - - simple_equations - - simple_integration -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/algorithmic.yaml b/eval/yaml/openai-o3/algorithmic.yaml deleted file mode 100644 index 295afc7c..00000000 --- a/eval/yaml/openai-o3/algorithmic.yaml +++ /dev/null @@ -1,37 +0,0 @@ -model: openai/o3-mini -category: algorithmic -provider: OpenAI -datasets: - - ab - - binary_alternation - - base_conversion - - binary_matrix - - caesar_cipher - - count_primes - - cryptarithm - - game_of_life - - graph_color - - group_anagrams - - isomorphic_strings - - letter_counting - - letter_jumble - - manipulate_matrix - - number_filtering - - number_sorting - - palindrome - - pool_matrix - - ransom_note - - rotate_matrix - - sentence_reordering - - spell_backward - - spiral_matrix - - string_insertion - - string_manipulation - - string_synthesis - - word_ladder - - word_sequence_reversal - - word_sorting -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/arc.yaml 
b/eval/yaml/openai-o3/arc.yaml deleted file mode 100644 index 992fc8a9..00000000 --- a/eval/yaml/openai-o3/arc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -model: openai/o3-mini -category: arc -provider: OpenAI -datasets: - - arc_1d - - arc_agi - - rearc -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/arithmetic.yaml b/eval/yaml/openai-o3/arithmetic.yaml deleted file mode 100644 index 67f56e59..00000000 --- a/eval/yaml/openai-o3/arithmetic.yaml +++ /dev/null @@ -1,26 +0,0 @@ -model: openai/o3-mini -category: arithmetic -provider: OpenAI -datasets: - - basic_arithmetic - - bitwise_arithmetic - - calendar_arithmetic - - chain_sum - - count_bits - - decimal_arithmetic - - decimal_chain_sum - - dice - - fraction_simplification - - gcd - - gsm_symbolic - - lcm - - leg_counting - - number_format - - power_function - - prime_factorization - - products - - time_intervals -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/code.yaml b/eval/yaml/openai-o3/code.yaml deleted file mode 100644 index 8fa83d73..00000000 --- a/eval/yaml/openai-o3/code.yaml +++ /dev/null @@ -1,9 +0,0 @@ -model: openai/o3-mini -category: code -provider: OpenAI -datasets: - - bf -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/cognition.yaml b/eval/yaml/openai-o3/cognition.yaml deleted file mode 100644 index e5cb510d..00000000 --- a/eval/yaml/openai-o3/cognition.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: openai/o3-mini -category: cognition -provider: OpenAI -datasets: - - color_cube_rotation - - figlet_font - - needle_haystack - - number_sequence - - rectangle_count - - rubiks_cube -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/games.yaml b/eval/yaml/openai-o3/games.yaml deleted file mode 100644 index f01c13c1..00000000 
--- a/eval/yaml/openai-o3/games.yaml +++ /dev/null @@ -1,19 +0,0 @@ -model: openai/o3-mini -category: games -provider: OpenAI -datasets: - - countdown - - emoji_mystery - - futoshuki - - knight_swap - - maze - - mini_sudoku - - n_queens - - sokoban - - sudoku - - tower_of_hanoi - - tsumego -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/geometry.yaml b/eval/yaml/openai-o3/geometry.yaml deleted file mode 100644 index 57a40a6d..00000000 --- a/eval/yaml/openai-o3/geometry.yaml +++ /dev/null @@ -1,10 +0,0 @@ -model: openai/o3-mini -category: geometry -provider: OpenAI -datasets: - - simple_geometry - - advanced_geometry -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/graphs.yaml b/eval/yaml/openai-o3/graphs.yaml deleted file mode 100644 index 371b4459..00000000 --- a/eval/yaml/openai-o3/graphs.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: openai/o3-mini -category: graphs -provider: OpenAI -datasets: - - course_schedule - - family_relationships - - largest_island - - list_functions - - quantum_lock - - shortest_path -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/openai-o3/logic.yaml b/eval/yaml/openai-o3/logic.yaml deleted file mode 100644 index a941b54d..00000000 --- a/eval/yaml/openai-o3/logic.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: openai/o3-mini -category: logic -provider: OpenAI -datasets: - - aiw - - circuit_logic - - propositional_logic - - self_reference - - syllogism - - zebra_puzzles -eval_dir: results/openai-03 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/r1/algebra.yaml b/eval/yaml/r1/algebra.yaml deleted file mode 100644 index a4939285..00000000 --- a/eval/yaml/r1/algebra.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model: deepseek/deepseek-r1 -provider: Nebius -category: algebra -datasets: - - 
intermediate_integration - - polynomial_equations - - polynomial_multiplication - - simple_equations - - simple_integration - - complex_arithmetic -eval_dir: results/r1 -dataset_size: 50 -dataset_seed: 42 -developer_role: system diff --git a/eval/yaml/r1/algorithmic.yaml b/eval/yaml/r1/algorithmic.yaml deleted file mode 100644 index 55374af6..00000000 --- a/eval/yaml/r1/algorithmic.yaml +++ /dev/null @@ -1,35 +0,0 @@ -model: deepseek/deepseek-r1 -provider: Nebius -category: algorithmic -datasets: - - ab - - base_conversion - - binary_matrix - - caesar_cipher - - count_primes - - game_of_life - - graph_color - - group_anagrams - - isomorphic_strings - - letter_counting - - letter_jumble - - manipulate_matrix - - number_filtering - - number_sorting - - palindrome - - pool_matrix - - ransom_note - - rotate_matrix - - sentence_reordering - - spell_backward - - spiral_matrix - - string_insertion - - string_manipulation - - string_synthesis - - word_ladder - - word_sequence_reversal - - word_sorting -eval_dir: results/r1 -dataset_size: 50 -dataset_seed: 45 -developer_role: system diff --git a/eval/yaml/r1/cognition.yaml b/eval/yaml/r1/cognition.yaml deleted file mode 100644 index dec09f72..00000000 --- a/eval/yaml/r1/cognition.yaml +++ /dev/null @@ -1,12 +0,0 @@ -model: deepseek/deepseek-r1 -provider: Nebius -category: cognition -datasets: - - color_cube_rotation - - figlet_font - - number_sequence - - rubiks_cube -eval_dir: results/r1 -dataset_size: 50 -dataset_seed: 42 -developer_role: system diff --git a/eval/yaml/r1/logic.yaml b/eval/yaml/r1/logic.yaml deleted file mode 100644 index 45eef787..00000000 --- a/eval/yaml/r1/logic.yaml +++ /dev/null @@ -1,12 +0,0 @@ -model: deepseek/deepseek-r1 -provider: Nebius -category: logic -datasets: - - propositional_logic - - self_reference - - syllogism - - zebra_puzzles -eval_dir: results/r1 -dataset_size: 50 -dataset_seed: 42 -developer_role: system diff --git a/reasoning_gym/dataset.py b/reasoning_gym/dataset.py index 
f4d263cd..0733ac0b 100644 --- a/reasoning_gym/dataset.py +++ b/reasoning_gym/dataset.py @@ -37,7 +37,7 @@ class ProceduralDataset(ABC, Sized, Iterable[dict[str, Any]]): return item @abstractmethod - def __getitem__(self, idx: int) -> dict: + def __getitem__(self, idx: int) -> dict[str, Any]: """Generate a single dataset item Args: diff --git a/reasoning_gym/utils.py b/reasoning_gym/utils.py index 02dbe461..70d4a343 100644 --- a/reasoning_gym/utils.py +++ b/reasoning_gym/utils.py @@ -15,6 +15,7 @@ Once you have thought about the reasoning process, provide the answer in the fol <answer>answer here</answer> Do not explain your reasoning inside the answer tags, provide only the final answer. When an example is provided, you should strictly follow the format of the output/answer in that example. """, + "simple": "You are a helpful assistant that answers questions accurately and concisely. When asked to solve a problem, show your work step by step. Provide your final answer between <answer> and </answer> tags.", }