Generate eval config tool (#240)

* feat: Add generate_config.py script to create eval configurations
This commit is contained in:
Andreas Köpf 2025-02-27 21:40:53 +01:00 committed by GitHub
parent 850c1cf6f4
commit 5b8d1b5175
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 858 additions and 338 deletions

View file

@ -97,6 +97,22 @@ categories:
- dataset: "word_sorting" - dataset: "word_sorting"
``` ```
### Generating Configurations
You can generate a configuration file with all registered datasets using the `generate_config.py` script:
```bash
python generate_config.py --output my_config.yaml --model "anthropic/claude-3.5-sonnet" --provider "Anthropic" --size 50 --seed 42
```
Options:
- `--output`: Output YAML file path (default: all_datasets.yaml)
- `--model`: Model name (default: openai/gpt-4)
- `--provider`: Provider name (default: None)
- `--size`: Default dataset size (default: 100)
- `--seed`: Default dataset seed (default: 42)
- `--include-params`: Include each dataset's default configuration parameters, except `seed` and `size` (default: False)
### Running Evaluations ### Running Evaluations
To run evaluations: To run evaluations:

View file

@ -27,7 +27,6 @@ import logging
import os import os
import subprocess import subprocess
import sys import sys
from collections import OrderedDict
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Any, Union from typing import Any, Union
@ -318,7 +317,7 @@ class AsyncModelEvaluator:
return results return results
def generate_summary(self, results: dict[str, Any]) -> dict[str, Union[int, OrderedDict]]: def generate_summary(self, results: dict[str, Any]) -> dict[str, Any]:
"""Generate a summary of evaluation results in the original configuration order. """Generate a summary of evaluation results in the original configuration order.
Args: Args:
@ -330,7 +329,7 @@ class AsyncModelEvaluator:
summary = { summary = {
"total_datasets": 0, "total_datasets": 0,
"total_examples": 0, "total_examples": 0,
"dataset_scores": OrderedDict(), "dataset_scores": {},
} }
# Iterate through categories and datasets in the original order from config # Iterate through categories and datasets in the original order from config

112
eval/generate_config.py Normal file
View file

@ -0,0 +1,112 @@
#!/usr/bin/env python
"""
Configuration generator for reasoning-gym evaluation.
This script generates a YAML configuration file with all registered datasets
from reasoning_gym, organized by category.
Usage:
python generate_config.py [options]
Options:
--output OUTPUT Output YAML file path (default: all_datasets.yaml)
--model MODEL Model name (default: openai/gpt-4)
--provider PROVIDER Provider name (default: None)
--size SIZE Default dataset size (default: 100)
--seed SEED Default dataset seed (default: 42)
--include-params Include all configuration parameters (default: False)
"""
import argparse
import inspect
from collections import defaultdict
from dataclasses import fields
import yaml
from reasoning_gym.factory import DATASETS
def extract_category(module_name):
    """Return the category segment of a dotted dataset module path.

    The second dotted component is treated as the category, following the
    layout ``reasoning_gym.{category}.dataset_name``. Paths with fewer than
    three components fall back to ``"other"``.
    """
    segments = module_name.split(".")
    return segments[1] if len(segments) >= 3 else "other"
def generate_config(model, provider, size, seed, include_params):
    """Build an evaluation configuration covering every registered dataset.

    Each entry of ``DATASETS`` is grouped under the category derived from the
    module of its dataset class (see ``extract_category``). Categories are
    emitted sorted by name; datasets keep the registry's iteration order
    within their category.

    Args:
        model: Value stored under the ``model`` key.
        provider: Value stored under the ``provider`` key (may be ``None``).
        size: Value stored under ``default_size``.
        seed: Value stored under ``default_seed``.
        include_params: When true, each dataset entry additionally gets a
            ``params`` mapping with the defaults declared on its config
            dataclass, excluding ``seed`` and ``size``. Fields that declare
            no default at all are omitted.

    Returns:
        A plain ``dict`` with the keys ``model``, ``provider``,
        ``output_dir``, ``max_concurrent``, ``default_size``,
        ``default_seed`` and ``categories`` (in that order).
    """
    # Dataclass fields signal "no default" with dataclasses.MISSING, not with
    # inspect.Parameter.empty; comparing against the latter never filtered
    # anything and leaked the MISSING sentinel into the generated params.
    from dataclasses import MISSING

    # Group datasets by category
    categories = defaultdict(list)
    for dataset_name, (dataset_cls, config_cls) in DATASETS.items():
        category = extract_category(dataset_cls.__module__)
        dataset_entry = {"dataset": dataset_name}

        # Optionally include the config class's default parameter values
        if include_params:
            params = {}
            for field in fields(config_cls):
                # seed and size are handled by the top-level defaults
                if field.name in ("seed", "size"):
                    continue
                if field.default is not MISSING:
                    params[field.name] = field.default
                elif field.default_factory is not MISSING:
                    # Defaults declared through a factory are defaults too
                    params[field.name] = field.default_factory()
            if params:
                dataset_entry["params"] = params

        categories[category].append(dataset_entry)

    # Key order matters: the caller dumps this dict with sort_keys=False
    return {
        "model": model,
        "provider": provider,
        "output_dir": "results",
        "max_concurrent": 10,
        "default_size": size,
        "default_seed": seed,
        "categories": [
            {"category": category_name, "datasets": datasets}
            for category_name, datasets in sorted(categories.items())
        ],
    }
def main():
    """Command-line entry point.

    Parses the generator options, builds the configuration with
    ``generate_config``, writes it to the requested YAML file, and prints a
    short summary (dataset count and category names) to stdout.
    """
    cli = argparse.ArgumentParser(description="Generate evaluation configuration with all datasets")
    cli.add_argument("--output", default="all_datasets.yaml", help="Output YAML file path")
    cli.add_argument("--model", default="openai/gpt-4", help="Model name")
    cli.add_argument("--provider", default=None, help="Provider name")
    cli.add_argument("--size", type=int, default=100, help="Default dataset size")
    cli.add_argument("--seed", type=int, default=42, help="Default dataset seed")
    cli.add_argument("--include-params", action="store_true", help="Include all configuration parameters")
    opts = cli.parse_args()

    # Build the configuration from the parsed options
    config = generate_config(
        model=opts.model,
        provider=opts.provider,
        size=opts.size,
        seed=opts.seed,
        include_params=opts.include_params,
    )

    # Keep the insertion order of keys so the YAML mirrors the config layout
    with open(opts.output, "w") as out_file:
        yaml.dump(config, out_file, default_flow_style=False, sort_keys=False)

    category_list = config["categories"]
    dataset_total = sum(len(cat["datasets"]) for cat in category_list)
    print(f"Configuration with {dataset_total} datasets written to {opts.output}")

    category_names = ", ".join(cat["category"] for cat in category_list)
    print(f"Categories: {category_names}")


if __name__ == "__main__":
    main()

View file

@ -1,34 +1,124 @@
# Combined configuration for Claude 3.5 Sonnet model: anthropic/claude-3.5-sonnet
model: "anthropic/claude-3.5-sonnet" provider: Anthropic
provider: "Anthropic" output_dir: results
output_dir: "results"
max_concurrent: 10 max_concurrent: 10
default_size: 50 default_size: 50
default_seed: 45 default_seed: 45
categories: categories:
- category: "algorithmic" - category: algebra
datasets: datasets:
- dataset: "count_primes" - dataset: complex_arithmetic
- dataset: "game_of_life" - dataset: intermediate_integration
- dataset: "graph_color" - dataset: polynomial_equations
- dataset: "group_anagrams" - dataset: polynomial_multiplication
- dataset: "isomorphic_strings" - dataset: simple_equations
- dataset: "letter_counting" - dataset: simple_integration
- dataset: "letter_jumble" - category: algorithmic
- dataset: "manipulate_matrix" datasets:
- dataset: "number_filtering" - dataset: ab
- dataset: "number_sorting" - dataset: base_conversion
- dataset: "palindrome" - dataset: binary_alternation
- dataset: "pool_matrix" - dataset: binary_matrix
- dataset: "ransom_note" - dataset: caesar_cipher
- dataset: "rotate_matrix" - dataset: count_primes
- dataset: "sentence_reordering" - dataset: cryptarithm
- dataset: "spell_backward" - dataset: game_of_life
- dataset: "spiral_matrix" - dataset: graph_color
- dataset: "string_insertion" - dataset: group_anagrams
- dataset: "string_manipulation" - dataset: isomorphic_strings
- dataset: "string_synthesis" - dataset: jugs
- dataset: "word_ladder" - dataset: letter_counting
- dataset: "word_sequence_reversal" - dataset: letter_jumble
- dataset: "word_sorting" - dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
- dataset: rotate_matrix
- dataset: rotten_oranges
- dataset: sentence_reordering
- dataset: spell_backward
- dataset: spiral_matrix
- dataset: string_insertion
- dataset: string_manipulation
- dataset: string_splitting
- dataset: string_synthesis
- dataset: word_ladder
- dataset: word_sequence_reversal
- dataset: word_sorting
- category: arc
datasets:
- dataset: arc_1d
- dataset: arc_agi
- dataset: rearc
- category: arithmetic
datasets:
- dataset: basic_arithmetic
- dataset: bitwise_arithmetic
- dataset: calendar_arithmetic
- dataset: chain_sum
- dataset: count_bits
- dataset: decimal_arithmetic
- dataset: decimal_chain_sum
- dataset: dice
- dataset: fraction_simplification
- dataset: gcd
- dataset: gsm_symbolic
- dataset: lcm
- dataset: leg_counting
- dataset: number_format
- dataset: power_function
- dataset: prime_factorization
- dataset: products
- dataset: time_intervals
- category: code
datasets:
- dataset: bf
- dataset: codeio
- category: cognition
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
- dataset: rubiks_cube
- category: games
datasets:
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
- dataset: tower_of_hanoi
- dataset: tsumego
- category: geometry
datasets:
- dataset: advanced_geometry
- dataset: simple_geometry
- category: graphs
datasets:
- dataset: course_schedule
- dataset: family_relationships
- dataset: largest_island
- dataset: quantum_lock
- dataset: shortest_path
- category: induction
datasets:
- dataset: list_functions
- category: logic
datasets:
- dataset: aiw
- dataset: circuit_logic
- dataset: knights_knaves
- dataset: propositional_logic
- dataset: self_reference
- dataset: syllogism
- dataset: zebra_puzzles

View file

@ -0,0 +1,124 @@
model: anthropic/claude-3.7-sonnet:thinking
provider: Anthropic
output_dir: results
max_concurrent: 10
default_size: 50
default_seed: 45
categories:
- category: algebra
datasets:
- dataset: complex_arithmetic
- dataset: intermediate_integration
- dataset: polynomial_equations
- dataset: polynomial_multiplication
- dataset: simple_equations
- dataset: simple_integration
- category: algorithmic
datasets:
- dataset: ab
- dataset: base_conversion
- dataset: binary_alternation
- dataset: binary_matrix
- dataset: caesar_cipher
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
- dataset: jugs
- dataset: letter_counting
- dataset: letter_jumble
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
- dataset: rotate_matrix
- dataset: rotten_oranges
- dataset: sentence_reordering
- dataset: spell_backward
- dataset: spiral_matrix
- dataset: string_insertion
- dataset: string_manipulation
- dataset: string_splitting
- dataset: string_synthesis
- dataset: word_ladder
- dataset: word_sequence_reversal
- dataset: word_sorting
- category: arc
datasets:
- dataset: arc_1d
- dataset: arc_agi
- dataset: rearc
- category: arithmetic
datasets:
- dataset: basic_arithmetic
- dataset: bitwise_arithmetic
- dataset: calendar_arithmetic
- dataset: chain_sum
- dataset: count_bits
- dataset: decimal_arithmetic
- dataset: decimal_chain_sum
- dataset: dice
- dataset: fraction_simplification
- dataset: gcd
- dataset: gsm_symbolic
- dataset: lcm
- dataset: leg_counting
- dataset: number_format
- dataset: power_function
- dataset: prime_factorization
- dataset: products
- dataset: time_intervals
- category: code
datasets:
- dataset: bf
- dataset: codeio
- category: cognition
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
- dataset: rubiks_cube
- category: games
datasets:
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
- dataset: tower_of_hanoi
- dataset: tsumego
- category: geometry
datasets:
- dataset: advanced_geometry
- dataset: simple_geometry
- category: graphs
datasets:
- dataset: course_schedule
- dataset: family_relationships
- dataset: largest_island
- dataset: quantum_lock
- dataset: shortest_path
- category: induction
datasets:
- dataset: list_functions
- category: logic
datasets:
- dataset: aiw
- dataset: circuit_logic
- dataset: knights_knaves
- dataset: propositional_logic
- dataset: self_reference
- dataset: syllogism
- dataset: zebra_puzzles

View file

@ -1,61 +1,124 @@
# Combined configuration for deepseek-r1 model: deepseek/deepseek-r1
model: "deepseek/deepseek-r1" provider: Nebius
provider: "Nebius" output_dir: results
output_dir: "results"
max_concurrent: 10 max_concurrent: 10
default_size: 50 default_size: 50
default_seed: 45 default_seed: 45
categories: categories:
- category: "algebra" - category: algebra
datasets: datasets:
- dataset: "intermediate_integration" - dataset: complex_arithmetic
- dataset: "polynomial_equations" - dataset: intermediate_integration
- dataset: "polynomial_multiplication" - dataset: polynomial_equations
- dataset: "simple_equations" - dataset: polynomial_multiplication
- dataset: "simple_integration" - dataset: simple_equations
- dataset: "complex_arithmetic" - dataset: simple_integration
- category: algorithmic
- category: "algorithmic" datasets:
datasets: - dataset: ab
- dataset: "ab" - dataset: base_conversion
- dataset: "base_conversion" - dataset: binary_alternation
- dataset: "binary_matrix" - dataset: binary_matrix
- dataset: "caesar_cipher" - dataset: caesar_cipher
- dataset: "count_primes" - dataset: count_primes
- dataset: "game_of_life" - dataset: cryptarithm
- dataset: "graph_color" - dataset: game_of_life
- dataset: "group_anagrams" - dataset: graph_color
- dataset: "isomorphic_strings" - dataset: group_anagrams
- dataset: "letter_counting" - dataset: isomorphic_strings
- dataset: "letter_jumble" - dataset: jugs
- dataset: "manipulate_matrix" - dataset: letter_counting
- dataset: "number_filtering" - dataset: letter_jumble
- dataset: "number_sorting" - dataset: manipulate_matrix
- dataset: "palindrome" - dataset: number_filtering
- dataset: "pool_matrix" - dataset: number_sorting
- dataset: "ransom_note" - dataset: palindrome
- dataset: "rotate_matrix" - dataset: palindrome_partitioning
- dataset: "sentence_reordering" - dataset: pool_matrix
- dataset: "spell_backward" - dataset: ransom_note
- dataset: "spiral_matrix" - dataset: rotate_matrix
- dataset: "string_insertion" - dataset: rotten_oranges
- dataset: "string_manipulation" - dataset: sentence_reordering
- dataset: "string_synthesis" - dataset: spell_backward
- dataset: "word_ladder" - dataset: spiral_matrix
- dataset: "word_sequence_reversal" - dataset: string_insertion
- dataset: "word_sorting" - dataset: string_manipulation
- dataset: string_splitting
- category: "cognition" - dataset: string_synthesis
datasets: - dataset: word_ladder
- dataset: "color_cube_rotation" - dataset: word_sequence_reversal
- dataset: "figlet_font" - dataset: word_sorting
- dataset: "number_sequence" - category: arc
- dataset: "rubiks_cube" datasets:
- dataset: arc_1d
- category: "logic" - dataset: arc_agi
datasets: - dataset: rearc
- dataset: "propositional_logic" - category: arithmetic
- dataset: "self_reference" datasets:
- dataset: "syllogism" - dataset: basic_arithmetic
- dataset: "zebra_puzzles" - dataset: bitwise_arithmetic
- dataset: calendar_arithmetic
- dataset: chain_sum
- dataset: count_bits
- dataset: decimal_arithmetic
- dataset: decimal_chain_sum
- dataset: dice
- dataset: fraction_simplification
- dataset: gcd
- dataset: gsm_symbolic
- dataset: lcm
- dataset: leg_counting
- dataset: number_format
- dataset: power_function
- dataset: prime_factorization
- dataset: products
- dataset: time_intervals
- category: code
datasets:
- dataset: bf
- dataset: codeio
- category: cognition
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
- dataset: rubiks_cube
- category: games
datasets:
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
- dataset: tower_of_hanoi
- dataset: tsumego
- category: geometry
datasets:
- dataset: advanced_geometry
- dataset: simple_geometry
- category: graphs
datasets:
- dataset: course_schedule
- dataset: family_relationships
- dataset: largest_island
- dataset: quantum_lock
- dataset: shortest_path
- category: induction
datasets:
- dataset: list_functions
- category: logic
datasets:
- dataset: aiw
- dataset: circuit_logic
- dataset: knights_knaves
- dataset: propositional_logic
- dataset: self_reference
- dataset: syllogism
- dataset: zebra_puzzles

View file

@ -1,130 +1,124 @@
# Combined configuration for llama-3.3-70b-instruct model: meta-llama/llama-3.3-70b-instruct
model: "meta-llama/llama-3.3-70b-instruct" provider: Hyperbolic
provider: "Hyperbolic" output_dir: results
output_dir: "results"
max_concurrent: 10 max_concurrent: 10
default_size: 50 default_size: 50
default_seed: 45 default_seed: 45
categories: categories:
- category: "algebra" - category: algebra
datasets: datasets:
- dataset: "intermediate_integration" - dataset: complex_arithmetic
- dataset: "polynomial_equations" - dataset: intermediate_integration
- dataset: "polynomial_multiplication" - dataset: polynomial_equations
- dataset: "simple_equations" - dataset: polynomial_multiplication
- dataset: "simple_integration" - dataset: simple_equations
- dataset: "complex_arithmetic" - dataset: simple_integration
- category: algorithmic
- category: "algorithmic" datasets:
datasets: - dataset: ab
- dataset: "ab" - dataset: base_conversion
- dataset: "base_conversion" - dataset: binary_alternation
- dataset: "binary_alternation" - dataset: binary_matrix
- dataset: "binary_matrix" - dataset: caesar_cipher
- dataset: "caesar_cipher" - dataset: count_primes
- dataset: "count_primes" - dataset: cryptarithm
- dataset: "cryptarithm" - dataset: game_of_life
- dataset: "game_of_life" - dataset: graph_color
- dataset: "graph_color" - dataset: group_anagrams
- dataset: "group_anagrams" - dataset: isomorphic_strings
- dataset: "isomorphic_strings" - dataset: jugs
- dataset: "jugs" - dataset: letter_counting
- dataset: "letter_counting" - dataset: letter_jumble
- dataset: "letter_jumble" - dataset: manipulate_matrix
- dataset: "manipulate_matrix" - dataset: number_filtering
- dataset: "number_filtering" - dataset: number_sorting
- dataset: "number_sorting" - dataset: palindrome
- dataset: "palindrome" - dataset: palindrome_partitioning
- dataset: "palindrome_partitioning" - dataset: pool_matrix
- dataset: "pool_matrix" - dataset: ransom_note
- dataset: "ransom_note" - dataset: rotate_matrix
- dataset: "rotate_matrix" - dataset: rotten_oranges
- dataset: "rotten_oranges" - dataset: sentence_reordering
- dataset: "sentence_reordering" - dataset: spell_backward
- dataset: "spell_backward" - dataset: spiral_matrix
- dataset: "spiral_matrix" - dataset: string_insertion
- dataset: "string_insertion" - dataset: string_manipulation
- dataset: "string_manipulation" - dataset: string_splitting
- dataset: "string_splitting" - dataset: string_synthesis
- dataset: "string_synthesis" - dataset: word_ladder
- dataset: "word_ladder" - dataset: word_sequence_reversal
- dataset: "word_sequence_reversal" - dataset: word_sorting
- dataset: "word_sorting" - category: arc
datasets:
- category: "arc" - dataset: arc_1d
datasets: - dataset: arc_agi
- dataset: "arc_1d" - dataset: rearc
- dataset: "arc_agi" - category: arithmetic
- dataset: "rearc" datasets:
- dataset: basic_arithmetic
- category: "arithmetic" - dataset: bitwise_arithmetic
datasets: - dataset: calendar_arithmetic
- dataset: "basic_arithmetic" - dataset: chain_sum
- dataset: "bitwise_arithmetic" - dataset: count_bits
- dataset: "calendar_arithmetic" - dataset: decimal_arithmetic
- dataset: "chain_sum" - dataset: decimal_chain_sum
- dataset: "count_bits" - dataset: dice
- dataset: "decimal_arithmetic" - dataset: fraction_simplification
- dataset: "decimal_chain_sum" - dataset: gcd
- dataset: "dice" - dataset: gsm_symbolic
- dataset: "fraction_simplification" - dataset: lcm
- dataset: "gcd" - dataset: leg_counting
- dataset: "gsm_symbolic" - dataset: number_format
- dataset: "lcm" - dataset: power_function
- dataset: "leg_counting" - dataset: prime_factorization
- dataset: "number_format" - dataset: products
- dataset: "power_function" - dataset: time_intervals
- dataset: "prime_factorization" - category: code
- dataset: "products" datasets:
- dataset: "time_intervals" - dataset: bf
- dataset: codeio
- category: "code" - category: cognition
datasets: datasets:
- dataset: "bf" - dataset: color_cube_rotation
- dataset: figlet_font
- category: "cognition" - dataset: needle_haystack
datasets: - dataset: number_sequence
- dataset: "color_cube_rotation" - dataset: rectangle_count
- dataset: "figlet_font" - dataset: rubiks_cube
- dataset: "needle_haystack" - category: games
- dataset: "number_sequence" datasets:
- dataset: "rectangle_count" - dataset: countdown
- dataset: "rubiks_cube" - dataset: emoji_mystery
- dataset: futoshiki
- category: "games" - dataset: knight_swap
datasets: - dataset: maze
- dataset: "countdown" - dataset: mini_sudoku
- dataset: "emoji_mystery" - dataset: n_queens
- dataset: "futoshuki" - dataset: rush_hour
- dataset: "knight_swap" - dataset: sokoban
- dataset: "maze" - dataset: sudoku
- dataset: "mini_sudoku" - dataset: tower_of_hanoi
- dataset: "n_queens" - dataset: tsumego
- dataset: "sokoban" - category: geometry
- dataset: "sudoku" datasets:
- dataset: "tower_of_hanoi" - dataset: advanced_geometry
- dataset: "tsumego" - dataset: simple_geometry
- category: graphs
- category: "geometry" datasets:
datasets: - dataset: course_schedule
- dataset: "simple_geometry" - dataset: family_relationships
- dataset: "advanced_geometry" - dataset: largest_island
- dataset: quantum_lock
- category: "graphs" - dataset: shortest_path
datasets: - category: induction
- dataset: "course_schedule" datasets:
- dataset: "family_relationships" - dataset: list_functions
- dataset: "largest_island" - category: logic
- dataset: "list_functions" datasets:
- dataset: "quantum_lock" - dataset: aiw
- dataset: "shortest_path" - dataset: circuit_logic
- dataset: knights_knaves
- category: "logic" - dataset: propositional_logic
datasets: - dataset: self_reference
- dataset: "aiw" - dataset: syllogism
- dataset: "circuit_logic" - dataset: zebra_puzzles
- dataset: "propositional_logic"
- dataset: "self_reference"
- dataset: "syllogism"
- dataset: "zebra_puzzles"

124
eval/yaml/openai-o1.yaml Normal file
View file

@ -0,0 +1,124 @@
model: openai/o1
provider: OpenAI
output_dir: results
max_concurrent: 10
default_size: 50
default_seed: 45
categories:
- category: algebra
datasets:
- dataset: complex_arithmetic
- dataset: intermediate_integration
- dataset: polynomial_equations
- dataset: polynomial_multiplication
- dataset: simple_equations
- dataset: simple_integration
- category: algorithmic
datasets:
- dataset: ab
- dataset: base_conversion
- dataset: binary_alternation
- dataset: binary_matrix
- dataset: caesar_cipher
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
- dataset: jugs
- dataset: letter_counting
- dataset: letter_jumble
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
- dataset: rotate_matrix
- dataset: rotten_oranges
- dataset: sentence_reordering
- dataset: spell_backward
- dataset: spiral_matrix
- dataset: string_insertion
- dataset: string_manipulation
- dataset: string_splitting
- dataset: string_synthesis
- dataset: word_ladder
- dataset: word_sequence_reversal
- dataset: word_sorting
- category: arc
datasets:
- dataset: arc_1d
- dataset: arc_agi
- dataset: rearc
- category: arithmetic
datasets:
- dataset: basic_arithmetic
- dataset: bitwise_arithmetic
- dataset: calendar_arithmetic
- dataset: chain_sum
- dataset: count_bits
- dataset: decimal_arithmetic
- dataset: decimal_chain_sum
- dataset: dice
- dataset: fraction_simplification
- dataset: gcd
- dataset: gsm_symbolic
- dataset: lcm
- dataset: leg_counting
- dataset: number_format
- dataset: power_function
- dataset: prime_factorization
- dataset: products
- dataset: time_intervals
- category: code
datasets:
- dataset: bf
- dataset: codeio
- category: cognition
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
- dataset: rubiks_cube
- category: games
datasets:
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
- dataset: tower_of_hanoi
- dataset: tsumego
- category: geometry
datasets:
- dataset: advanced_geometry
- dataset: simple_geometry
- category: graphs
datasets:
- dataset: course_schedule
- dataset: family_relationships
- dataset: largest_island
- dataset: quantum_lock
- dataset: shortest_path
- category: induction
datasets:
- dataset: list_functions
- category: logic
datasets:
- dataset: aiw
- dataset: circuit_logic
- dataset: knights_knaves
- dataset: propositional_logic
- dataset: self_reference
- dataset: syllogism
- dataset: zebra_puzzles

View file

@ -1,126 +1,124 @@
# Combined configuration for openai/o3-mini model: openai/o3-mini
model: "openai/o3-mini" provider: OpenAI
provider: "OpenAI" output_dir: results
output_dir: "results"
max_concurrent: 10 max_concurrent: 10
default_size: 50 default_size: 50
default_seed: 45 default_seed: 45
categories: categories:
- category: "algebra" - category: algebra
datasets: datasets:
- dataset: "complex_arithmetic" - dataset: complex_arithmetic
- dataset: "intermediate_integration" - dataset: intermediate_integration
- dataset: "polynomial_equations" - dataset: polynomial_equations
- dataset: "polynomial_multiplication" - dataset: polynomial_multiplication
- dataset: "simple_equations" - dataset: simple_equations
- dataset: "simple_integration" - dataset: simple_integration
- category: algorithmic
- category: "algorithmic" datasets:
datasets: - dataset: ab
- dataset: "ab" - dataset: base_conversion
- dataset: "binary_alternation" - dataset: binary_alternation
- dataset: "base_conversion" - dataset: binary_matrix
- dataset: "binary_matrix" - dataset: caesar_cipher
- dataset: "caesar_cipher" - dataset: count_primes
- dataset: "count_primes" - dataset: cryptarithm
- dataset: "cryptarithm" - dataset: game_of_life
- dataset: "game_of_life" - dataset: graph_color
- dataset: "graph_color" - dataset: group_anagrams
- dataset: "group_anagrams" - dataset: isomorphic_strings
- dataset: "isomorphic_strings" - dataset: jugs
- dataset: "letter_counting" - dataset: letter_counting
- dataset: "letter_jumble" - dataset: letter_jumble
- dataset: "manipulate_matrix" - dataset: manipulate_matrix
- dataset: "number_filtering" - dataset: number_filtering
- dataset: "number_sorting" - dataset: number_sorting
- dataset: "palindrome" - dataset: palindrome
- dataset: "pool_matrix" - dataset: palindrome_partitioning
- dataset: "ransom_note" - dataset: pool_matrix
- dataset: "rotate_matrix" - dataset: ransom_note
- dataset: "sentence_reordering" - dataset: rotate_matrix
- dataset: "spell_backward" - dataset: rotten_oranges
- dataset: "spiral_matrix" - dataset: sentence_reordering
- dataset: "string_insertion" - dataset: spell_backward
- dataset: "string_manipulation" - dataset: spiral_matrix
- dataset: "string_synthesis" - dataset: string_insertion
- dataset: "word_ladder" - dataset: string_manipulation
- dataset: "word_sequence_reversal" - dataset: string_splitting
- dataset: "word_sorting" - dataset: string_synthesis
- dataset: word_ladder
- category: "arc" - dataset: word_sequence_reversal
datasets: - dataset: word_sorting
- dataset: "arc_1d" - category: arc
- dataset: "arc_agi" datasets:
- dataset: "rearc" - dataset: arc_1d
- dataset: arc_agi
- category: "arithmetic" - dataset: rearc
datasets: - category: arithmetic
- dataset: "basic_arithmetic" datasets:
- dataset: "bitwise_arithmetic" - dataset: basic_arithmetic
- dataset: "calendar_arithmetic" - dataset: bitwise_arithmetic
- dataset: "chain_sum" - dataset: calendar_arithmetic
- dataset: "count_bits" - dataset: chain_sum
- dataset: "decimal_arithmetic" - dataset: count_bits
- dataset: "decimal_chain_sum" - dataset: decimal_arithmetic
- dataset: "dice" - dataset: decimal_chain_sum
- dataset: "fraction_simplification" - dataset: dice
- dataset: "gcd" - dataset: fraction_simplification
- dataset: "gsm_symbolic" - dataset: gcd
- dataset: "lcm" - dataset: gsm_symbolic
- dataset: "leg_counting" - dataset: lcm
- dataset: "number_format" - dataset: leg_counting
- dataset: "power_function" - dataset: number_format
- dataset: "prime_factorization" - dataset: power_function
- dataset: "products" - dataset: prime_factorization
- dataset: "time_intervals" - dataset: products
- dataset: time_intervals
- category: "code" - category: code
datasets: datasets:
- dataset: "bf" - dataset: bf
- dataset: codeio
- category: "cognition" - category: cognition
datasets: datasets:
- dataset: "color_cube_rotation" - dataset: color_cube_rotation
- dataset: "figlet_font" - dataset: figlet_font
- dataset: "needle_haystack" - dataset: needle_haystack
- dataset: "number_sequence" - dataset: number_sequence
- dataset: "rectangle_count" - dataset: rectangle_count
- dataset: "rubiks_cube" - dataset: rubiks_cube
- category: games
- category: "games" datasets:
datasets: - dataset: countdown
- dataset: "countdown" - dataset: emoji_mystery
- dataset: "emoji_mystery" - dataset: futoshiki
- dataset: "futoshuki" - dataset: knight_swap
- dataset: "knight_swap" - dataset: maze
- dataset: "maze" - dataset: mini_sudoku
- dataset: "mini_sudoku" - dataset: n_queens
- dataset: "n_queens" - dataset: rush_hour
- dataset: "sokoban" - dataset: sokoban
- dataset: "sudoku" - dataset: sudoku
- dataset: "tower_of_hanoi" - dataset: tower_of_hanoi
- dataset: "tsumego" - dataset: tsumego
- category: geometry
- category: "geometry" datasets:
datasets: - dataset: advanced_geometry
- dataset: "simple_geometry" - dataset: simple_geometry
- dataset: "advanced_geometry" - category: graphs
datasets:
- category: "graphs" - dataset: course_schedule
datasets: - dataset: family_relationships
- dataset: "course_schedule" - dataset: largest_island
- dataset: "family_relationships" - dataset: quantum_lock
- dataset: "largest_island" - dataset: shortest_path
- dataset: "list_functions" - category: induction
- dataset: "quantum_lock" datasets:
- dataset: "shortest_path" - dataset: list_functions
- category: logic
- category: "logic" datasets:
datasets: - dataset: aiw
- dataset: "aiw" - dataset: circuit_logic
- dataset: "circuit_logic" - dataset: knights_knaves
- dataset: "propositional_logic" - dataset: propositional_logic
- dataset: "self_reference" - dataset: self_reference
- dataset: "syllogism" - dataset: syllogism
- dataset: "zebra_puzzles" - dataset: zebra_puzzles