mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Generate eval config tool (#240)
* feat: Add generate_config.py script to create eval configurations
This commit is contained in:
parent
850c1cf6f4
commit
5b8d1b5175
9 changed files with 858 additions and 338 deletions
|
|
@ -97,6 +97,22 @@ categories:
|
||||||
- dataset: "word_sorting"
|
- dataset: "word_sorting"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Generating Configurations
|
||||||
|
|
||||||
|
You can generate a configuration file with all registered datasets using the `generate_config.py` script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python generate_config.py --output my_config.yaml --model "anthropic/claude-3.5-sonnet" --provider "Anthropic" --size 50 --seed 42
|
||||||
|
```
|
||||||
|
|
||||||
|
Options:
|
||||||
|
- `--output`: Output YAML file path (default: all_datasets.yaml)
|
||||||
|
- `--model`: Model name (default: openai/gpt-4)
|
||||||
|
- `--provider`: Provider name (default: None)
|
||||||
|
- `--size`: Default dataset size (default: 100)
|
||||||
|
- `--seed`: Default dataset seed (default: 42)
|
||||||
|
- `--include-params`: Include all configuration parameters (default: False)
|
||||||
|
|
||||||
### Running Evaluations
|
### Running Evaluations
|
||||||
|
|
||||||
To run evaluations:
|
To run evaluations:
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,6 @@ import logging
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from collections import OrderedDict
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
@ -318,7 +317,7 @@ class AsyncModelEvaluator:
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def generate_summary(self, results: dict[str, Any]) -> dict[str, Union[int, OrderedDict]]:
|
def generate_summary(self, results: dict[str, Any]) -> dict[str, Any]:
|
||||||
"""Generate a summary of evaluation results in the original configuration order.
|
"""Generate a summary of evaluation results in the original configuration order.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -330,7 +329,7 @@ class AsyncModelEvaluator:
|
||||||
summary = {
|
summary = {
|
||||||
"total_datasets": 0,
|
"total_datasets": 0,
|
||||||
"total_examples": 0,
|
"total_examples": 0,
|
||||||
"dataset_scores": OrderedDict(),
|
"dataset_scores": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Iterate through categories and datasets in the original order from config
|
# Iterate through categories and datasets in the original order from config
|
||||||
|
|
|
||||||
112
eval/generate_config.py
Normal file
112
eval/generate_config.py
Normal file
|
|
@ -0,0 +1,112 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Configuration generator for reasoning-gym evaluation.
|
||||||
|
|
||||||
|
This script generates a YAML configuration file with all registered datasets
|
||||||
|
from reasoning_gym, organized by category.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python generate_config.py [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--output OUTPUT Output YAML file path (default: all_datasets.yaml)
|
||||||
|
--model MODEL Model name (default: openai/gpt-4)
|
||||||
|
--provider PROVIDER Provider name (default: None)
|
||||||
|
--size SIZE Default dataset size (default: 100)
|
||||||
|
--seed SEED Default dataset seed (default: 42)
|
||||||
|
--include-params Include all configuration parameters (default: False)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import inspect
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import fields
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from reasoning_gym.factory import DATASETS
|
||||||
|
|
||||||
|
|
||||||
|
def extract_category(module_name):
    """Return the category segment of a dotted dataset module path.

    A path with at least three dot-separated components, such as
    ``reasoning_gym.<category>.<dataset_name>``, yields its second
    component. Any shorter path is reported as ``"other"``.
    """
    segments = module_name.split(".")
    return segments[1] if len(segments) >= 3 else "other"
|
||||||
|
|
||||||
|
|
||||||
|
def generate_config(model, provider, size, seed, include_params):
    """Build the evaluation configuration covering every registered dataset.

    Args:
        model: Model identifier stored under the ``model`` key.
        provider: Provider name stored under the ``provider`` key (may be None).
        size: Value stored under ``default_size``.
        seed: Value stored under ``default_seed``.
        include_params: When true, each dataset entry additionally receives a
            ``params`` mapping holding the explicit default values declared on
            its config dataclass (``seed`` and ``size`` excluded).

    Returns:
        A dict with the top-level settings and a ``categories`` list sorted by
        category name. Each category keeps its dataset entries in the order in
        which ``DATASETS`` yields them.
    """
    # dataclasses marks "no explicit default" with its own MISSING sentinel;
    # comparing against inspect.Parameter.empty never matches, which let the
    # sentinel object leak into the generated params. Imported locally so the
    # block does not depend on a change to the file-level imports.
    from dataclasses import MISSING

    # Group datasets by category
    categories = defaultdict(list)

    for dataset_name, (dataset_cls, config_cls) in DATASETS.items():
        # The category is derived from the module that defines the dataset class
        category = extract_category(dataset_cls.__module__)

        dataset_entry = {"dataset": dataset_name}

        if include_params:
            # seed and size are covered by default_seed / default_size.
            # Fields without a plain default (required fields and those using
            # default_factory) report MISSING and are left out, so the dataset
            # falls back to its own defaults for them.
            params = {
                field.name: field.default
                for field in fields(config_cls)
                if field.name not in ("seed", "size") and field.default is not MISSING
            }

            if params:
                dataset_entry["params"] = params

        categories[category].append(dataset_entry)

    # Create configuration structure
    config = {
        "model": model,
        "provider": provider,
        "output_dir": "results",
        "max_concurrent": 10,
        "default_size": size,
        "default_seed": seed,
        "categories": [],
    }

    # Categories are emitted in alphabetical order for a stable file layout
    for category_name, datasets in sorted(categories.items()):
        config["categories"].append({"category": category_name, "datasets": datasets})

    return config
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command line, generate the configuration and write it as YAML.

    After writing the file named by ``--output``, prints the number of
    datasets written and the list of category names.
    """
    cli = argparse.ArgumentParser(description="Generate evaluation configuration with all datasets")
    cli.add_argument("--output", default="all_datasets.yaml", help="Output YAML file path")
    cli.add_argument("--model", default="openai/gpt-4", help="Model name")
    cli.add_argument("--provider", default=None, help="Provider name")
    cli.add_argument("--size", type=int, default=100, help="Default dataset size")
    cli.add_argument("--seed", type=int, default=42, help="Default dataset seed")
    cli.add_argument("--include-params", action="store_true", help="Include all configuration parameters")

    options = cli.parse_args()

    # Build the in-memory configuration from the parsed options
    config = generate_config(
        model=options.model,
        provider=options.provider,
        size=options.size,
        seed=options.seed,
        include_params=options.include_params,
    )

    # sort_keys=False keeps the top-level keys in the order they were inserted
    with open(options.output, "w") as stream:
        yaml.dump(config, stream, default_flow_style=False, sort_keys=False)

    category_entries = config["categories"]
    dataset_total = sum(len(entry["datasets"]) for entry in category_entries)
    category_names = ", ".join(entry["category"] for entry in category_entries)

    print(f"Configuration with {dataset_total} datasets written to {options.output}")
    print(f"Categories: {category_names}")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -1,34 +1,124 @@
|
||||||
# Combined configuration for Claude 3.5 Sonnet
|
model: anthropic/claude-3.5-sonnet
|
||||||
model: "anthropic/claude-3.5-sonnet"
|
provider: Anthropic
|
||||||
provider: "Anthropic"
|
output_dir: results
|
||||||
output_dir: "results"
|
|
||||||
max_concurrent: 10
|
max_concurrent: 10
|
||||||
default_size: 50
|
default_size: 50
|
||||||
default_seed: 45
|
default_seed: 45
|
||||||
|
|
||||||
categories:
|
categories:
|
||||||
- category: "algorithmic"
|
- category: algebra
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "count_primes"
|
- dataset: complex_arithmetic
|
||||||
- dataset: "game_of_life"
|
- dataset: intermediate_integration
|
||||||
- dataset: "graph_color"
|
- dataset: polynomial_equations
|
||||||
- dataset: "group_anagrams"
|
- dataset: polynomial_multiplication
|
||||||
- dataset: "isomorphic_strings"
|
- dataset: simple_equations
|
||||||
- dataset: "letter_counting"
|
- dataset: simple_integration
|
||||||
- dataset: "letter_jumble"
|
- category: algorithmic
|
||||||
- dataset: "manipulate_matrix"
|
datasets:
|
||||||
- dataset: "number_filtering"
|
- dataset: ab
|
||||||
- dataset: "number_sorting"
|
- dataset: base_conversion
|
||||||
- dataset: "palindrome"
|
- dataset: binary_alternation
|
||||||
- dataset: "pool_matrix"
|
- dataset: binary_matrix
|
||||||
- dataset: "ransom_note"
|
- dataset: caesar_cipher
|
||||||
- dataset: "rotate_matrix"
|
- dataset: count_primes
|
||||||
- dataset: "sentence_reordering"
|
- dataset: cryptarithm
|
||||||
- dataset: "spell_backward"
|
- dataset: game_of_life
|
||||||
- dataset: "spiral_matrix"
|
- dataset: graph_color
|
||||||
- dataset: "string_insertion"
|
- dataset: group_anagrams
|
||||||
- dataset: "string_manipulation"
|
- dataset: isomorphic_strings
|
||||||
- dataset: "string_synthesis"
|
- dataset: jugs
|
||||||
- dataset: "word_ladder"
|
- dataset: letter_counting
|
||||||
- dataset: "word_sequence_reversal"
|
- dataset: letter_jumble
|
||||||
- dataset: "word_sorting"
|
- dataset: manipulate_matrix
|
||||||
|
- dataset: number_filtering
|
||||||
|
- dataset: number_sorting
|
||||||
|
- dataset: palindrome
|
||||||
|
- dataset: palindrome_partitioning
|
||||||
|
- dataset: pool_matrix
|
||||||
|
- dataset: ransom_note
|
||||||
|
- dataset: rotate_matrix
|
||||||
|
- dataset: rotten_oranges
|
||||||
|
- dataset: sentence_reordering
|
||||||
|
- dataset: spell_backward
|
||||||
|
- dataset: spiral_matrix
|
||||||
|
- dataset: string_insertion
|
||||||
|
- dataset: string_manipulation
|
||||||
|
- dataset: string_splitting
|
||||||
|
- dataset: string_synthesis
|
||||||
|
- dataset: word_ladder
|
||||||
|
- dataset: word_sequence_reversal
|
||||||
|
- dataset: word_sorting
|
||||||
|
- category: arc
|
||||||
|
datasets:
|
||||||
|
- dataset: arc_1d
|
||||||
|
- dataset: arc_agi
|
||||||
|
- dataset: rearc
|
||||||
|
- category: arithmetic
|
||||||
|
datasets:
|
||||||
|
- dataset: basic_arithmetic
|
||||||
|
- dataset: bitwise_arithmetic
|
||||||
|
- dataset: calendar_arithmetic
|
||||||
|
- dataset: chain_sum
|
||||||
|
- dataset: count_bits
|
||||||
|
- dataset: decimal_arithmetic
|
||||||
|
- dataset: decimal_chain_sum
|
||||||
|
- dataset: dice
|
||||||
|
- dataset: fraction_simplification
|
||||||
|
- dataset: gcd
|
||||||
|
- dataset: gsm_symbolic
|
||||||
|
- dataset: lcm
|
||||||
|
- dataset: leg_counting
|
||||||
|
- dataset: number_format
|
||||||
|
- dataset: power_function
|
||||||
|
- dataset: prime_factorization
|
||||||
|
- dataset: products
|
||||||
|
- dataset: time_intervals
|
||||||
|
- category: code
|
||||||
|
datasets:
|
||||||
|
- dataset: bf
|
||||||
|
- dataset: codeio
|
||||||
|
- category: cognition
|
||||||
|
datasets:
|
||||||
|
- dataset: color_cube_rotation
|
||||||
|
- dataset: figlet_font
|
||||||
|
- dataset: needle_haystack
|
||||||
|
- dataset: number_sequence
|
||||||
|
- dataset: rectangle_count
|
||||||
|
- dataset: rubiks_cube
|
||||||
|
- category: games
|
||||||
|
datasets:
|
||||||
|
- dataset: countdown
|
||||||
|
- dataset: emoji_mystery
|
||||||
|
- dataset: futoshiki
|
||||||
|
- dataset: knight_swap
|
||||||
|
- dataset: maze
|
||||||
|
- dataset: mini_sudoku
|
||||||
|
- dataset: n_queens
|
||||||
|
- dataset: rush_hour
|
||||||
|
- dataset: sokoban
|
||||||
|
- dataset: sudoku
|
||||||
|
- dataset: tower_of_hanoi
|
||||||
|
- dataset: tsumego
|
||||||
|
- category: geometry
|
||||||
|
datasets:
|
||||||
|
- dataset: advanced_geometry
|
||||||
|
- dataset: simple_geometry
|
||||||
|
- category: graphs
|
||||||
|
datasets:
|
||||||
|
- dataset: course_schedule
|
||||||
|
- dataset: family_relationships
|
||||||
|
- dataset: largest_island
|
||||||
|
- dataset: quantum_lock
|
||||||
|
- dataset: shortest_path
|
||||||
|
- category: induction
|
||||||
|
datasets:
|
||||||
|
- dataset: list_functions
|
||||||
|
- category: logic
|
||||||
|
datasets:
|
||||||
|
- dataset: aiw
|
||||||
|
- dataset: circuit_logic
|
||||||
|
- dataset: knights_knaves
|
||||||
|
- dataset: propositional_logic
|
||||||
|
- dataset: self_reference
|
||||||
|
- dataset: syllogism
|
||||||
|
- dataset: zebra_puzzles
|
||||||
|
|
|
||||||
124
eval/yaml/claude-3.7-sonnet_thinking.yaml
Normal file
124
eval/yaml/claude-3.7-sonnet_thinking.yaml
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
model: anthropic/claude-3.7-sonnet:thinking
|
||||||
|
provider: Anthropic
|
||||||
|
output_dir: results
|
||||||
|
max_concurrent: 10
|
||||||
|
default_size: 50
|
||||||
|
default_seed: 45
|
||||||
|
categories:
|
||||||
|
- category: algebra
|
||||||
|
datasets:
|
||||||
|
- dataset: complex_arithmetic
|
||||||
|
- dataset: intermediate_integration
|
||||||
|
- dataset: polynomial_equations
|
||||||
|
- dataset: polynomial_multiplication
|
||||||
|
- dataset: simple_equations
|
||||||
|
- dataset: simple_integration
|
||||||
|
- category: algorithmic
|
||||||
|
datasets:
|
||||||
|
- dataset: ab
|
||||||
|
- dataset: base_conversion
|
||||||
|
- dataset: binary_alternation
|
||||||
|
- dataset: binary_matrix
|
||||||
|
- dataset: caesar_cipher
|
||||||
|
- dataset: count_primes
|
||||||
|
- dataset: cryptarithm
|
||||||
|
- dataset: game_of_life
|
||||||
|
- dataset: graph_color
|
||||||
|
- dataset: group_anagrams
|
||||||
|
- dataset: isomorphic_strings
|
||||||
|
- dataset: jugs
|
||||||
|
- dataset: letter_counting
|
||||||
|
- dataset: letter_jumble
|
||||||
|
- dataset: manipulate_matrix
|
||||||
|
- dataset: number_filtering
|
||||||
|
- dataset: number_sorting
|
||||||
|
- dataset: palindrome
|
||||||
|
- dataset: palindrome_partitioning
|
||||||
|
- dataset: pool_matrix
|
||||||
|
- dataset: ransom_note
|
||||||
|
- dataset: rotate_matrix
|
||||||
|
- dataset: rotten_oranges
|
||||||
|
- dataset: sentence_reordering
|
||||||
|
- dataset: spell_backward
|
||||||
|
- dataset: spiral_matrix
|
||||||
|
- dataset: string_insertion
|
||||||
|
- dataset: string_manipulation
|
||||||
|
- dataset: string_splitting
|
||||||
|
- dataset: string_synthesis
|
||||||
|
- dataset: word_ladder
|
||||||
|
- dataset: word_sequence_reversal
|
||||||
|
- dataset: word_sorting
|
||||||
|
- category: arc
|
||||||
|
datasets:
|
||||||
|
- dataset: arc_1d
|
||||||
|
- dataset: arc_agi
|
||||||
|
- dataset: rearc
|
||||||
|
- category: arithmetic
|
||||||
|
datasets:
|
||||||
|
- dataset: basic_arithmetic
|
||||||
|
- dataset: bitwise_arithmetic
|
||||||
|
- dataset: calendar_arithmetic
|
||||||
|
- dataset: chain_sum
|
||||||
|
- dataset: count_bits
|
||||||
|
- dataset: decimal_arithmetic
|
||||||
|
- dataset: decimal_chain_sum
|
||||||
|
- dataset: dice
|
||||||
|
- dataset: fraction_simplification
|
||||||
|
- dataset: gcd
|
||||||
|
- dataset: gsm_symbolic
|
||||||
|
- dataset: lcm
|
||||||
|
- dataset: leg_counting
|
||||||
|
- dataset: number_format
|
||||||
|
- dataset: power_function
|
||||||
|
- dataset: prime_factorization
|
||||||
|
- dataset: products
|
||||||
|
- dataset: time_intervals
|
||||||
|
- category: code
|
||||||
|
datasets:
|
||||||
|
- dataset: bf
|
||||||
|
- dataset: codeio
|
||||||
|
- category: cognition
|
||||||
|
datasets:
|
||||||
|
- dataset: color_cube_rotation
|
||||||
|
- dataset: figlet_font
|
||||||
|
- dataset: needle_haystack
|
||||||
|
- dataset: number_sequence
|
||||||
|
- dataset: rectangle_count
|
||||||
|
- dataset: rubiks_cube
|
||||||
|
- category: games
|
||||||
|
datasets:
|
||||||
|
- dataset: countdown
|
||||||
|
- dataset: emoji_mystery
|
||||||
|
- dataset: futoshiki
|
||||||
|
- dataset: knight_swap
|
||||||
|
- dataset: maze
|
||||||
|
- dataset: mini_sudoku
|
||||||
|
- dataset: n_queens
|
||||||
|
- dataset: rush_hour
|
||||||
|
- dataset: sokoban
|
||||||
|
- dataset: sudoku
|
||||||
|
- dataset: tower_of_hanoi
|
||||||
|
- dataset: tsumego
|
||||||
|
- category: geometry
|
||||||
|
datasets:
|
||||||
|
- dataset: advanced_geometry
|
||||||
|
- dataset: simple_geometry
|
||||||
|
- category: graphs
|
||||||
|
datasets:
|
||||||
|
- dataset: course_schedule
|
||||||
|
- dataset: family_relationships
|
||||||
|
- dataset: largest_island
|
||||||
|
- dataset: quantum_lock
|
||||||
|
- dataset: shortest_path
|
||||||
|
- category: induction
|
||||||
|
datasets:
|
||||||
|
- dataset: list_functions
|
||||||
|
- category: logic
|
||||||
|
datasets:
|
||||||
|
- dataset: aiw
|
||||||
|
- dataset: circuit_logic
|
||||||
|
- dataset: knights_knaves
|
||||||
|
- dataset: propositional_logic
|
||||||
|
- dataset: self_reference
|
||||||
|
- dataset: syllogism
|
||||||
|
- dataset: zebra_puzzles
|
||||||
|
|
@ -1,61 +1,124 @@
|
||||||
# Combined configuration for deepseek-r1
|
model: deepseek/deepseek-r1
|
||||||
model: "deepseek/deepseek-r1"
|
provider: Nebius
|
||||||
provider: "Nebius"
|
output_dir: results
|
||||||
output_dir: "results"
|
|
||||||
max_concurrent: 10
|
max_concurrent: 10
|
||||||
default_size: 50
|
default_size: 50
|
||||||
default_seed: 45
|
default_seed: 45
|
||||||
|
|
||||||
categories:
|
categories:
|
||||||
- category: "algebra"
|
- category: algebra
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "intermediate_integration"
|
- dataset: complex_arithmetic
|
||||||
- dataset: "polynomial_equations"
|
- dataset: intermediate_integration
|
||||||
- dataset: "polynomial_multiplication"
|
- dataset: polynomial_equations
|
||||||
- dataset: "simple_equations"
|
- dataset: polynomial_multiplication
|
||||||
- dataset: "simple_integration"
|
- dataset: simple_equations
|
||||||
- dataset: "complex_arithmetic"
|
- dataset: simple_integration
|
||||||
|
- category: algorithmic
|
||||||
- category: "algorithmic"
|
datasets:
|
||||||
datasets:
|
- dataset: ab
|
||||||
- dataset: "ab"
|
- dataset: base_conversion
|
||||||
- dataset: "base_conversion"
|
- dataset: binary_alternation
|
||||||
- dataset: "binary_matrix"
|
- dataset: binary_matrix
|
||||||
- dataset: "caesar_cipher"
|
- dataset: caesar_cipher
|
||||||
- dataset: "count_primes"
|
- dataset: count_primes
|
||||||
- dataset: "game_of_life"
|
- dataset: cryptarithm
|
||||||
- dataset: "graph_color"
|
- dataset: game_of_life
|
||||||
- dataset: "group_anagrams"
|
- dataset: graph_color
|
||||||
- dataset: "isomorphic_strings"
|
- dataset: group_anagrams
|
||||||
- dataset: "letter_counting"
|
- dataset: isomorphic_strings
|
||||||
- dataset: "letter_jumble"
|
- dataset: jugs
|
||||||
- dataset: "manipulate_matrix"
|
- dataset: letter_counting
|
||||||
- dataset: "number_filtering"
|
- dataset: letter_jumble
|
||||||
- dataset: "number_sorting"
|
- dataset: manipulate_matrix
|
||||||
- dataset: "palindrome"
|
- dataset: number_filtering
|
||||||
- dataset: "pool_matrix"
|
- dataset: number_sorting
|
||||||
- dataset: "ransom_note"
|
- dataset: palindrome
|
||||||
- dataset: "rotate_matrix"
|
- dataset: palindrome_partitioning
|
||||||
- dataset: "sentence_reordering"
|
- dataset: pool_matrix
|
||||||
- dataset: "spell_backward"
|
- dataset: ransom_note
|
||||||
- dataset: "spiral_matrix"
|
- dataset: rotate_matrix
|
||||||
- dataset: "string_insertion"
|
- dataset: rotten_oranges
|
||||||
- dataset: "string_manipulation"
|
- dataset: sentence_reordering
|
||||||
- dataset: "string_synthesis"
|
- dataset: spell_backward
|
||||||
- dataset: "word_ladder"
|
- dataset: spiral_matrix
|
||||||
- dataset: "word_sequence_reversal"
|
- dataset: string_insertion
|
||||||
- dataset: "word_sorting"
|
- dataset: string_manipulation
|
||||||
|
- dataset: string_splitting
|
||||||
- category: "cognition"
|
- dataset: string_synthesis
|
||||||
datasets:
|
- dataset: word_ladder
|
||||||
- dataset: "color_cube_rotation"
|
- dataset: word_sequence_reversal
|
||||||
- dataset: "figlet_font"
|
- dataset: word_sorting
|
||||||
- dataset: "number_sequence"
|
- category: arc
|
||||||
- dataset: "rubiks_cube"
|
datasets:
|
||||||
|
- dataset: arc_1d
|
||||||
- category: "logic"
|
- dataset: arc_agi
|
||||||
datasets:
|
- dataset: rearc
|
||||||
- dataset: "propositional_logic"
|
- category: arithmetic
|
||||||
- dataset: "self_reference"
|
datasets:
|
||||||
- dataset: "syllogism"
|
- dataset: basic_arithmetic
|
||||||
- dataset: "zebra_puzzles"
|
- dataset: bitwise_arithmetic
|
||||||
|
- dataset: calendar_arithmetic
|
||||||
|
- dataset: chain_sum
|
||||||
|
- dataset: count_bits
|
||||||
|
- dataset: decimal_arithmetic
|
||||||
|
- dataset: decimal_chain_sum
|
||||||
|
- dataset: dice
|
||||||
|
- dataset: fraction_simplification
|
||||||
|
- dataset: gcd
|
||||||
|
- dataset: gsm_symbolic
|
||||||
|
- dataset: lcm
|
||||||
|
- dataset: leg_counting
|
||||||
|
- dataset: number_format
|
||||||
|
- dataset: power_function
|
||||||
|
- dataset: prime_factorization
|
||||||
|
- dataset: products
|
||||||
|
- dataset: time_intervals
|
||||||
|
- category: code
|
||||||
|
datasets:
|
||||||
|
- dataset: bf
|
||||||
|
- dataset: codeio
|
||||||
|
- category: cognition
|
||||||
|
datasets:
|
||||||
|
- dataset: color_cube_rotation
|
||||||
|
- dataset: figlet_font
|
||||||
|
- dataset: needle_haystack
|
||||||
|
- dataset: number_sequence
|
||||||
|
- dataset: rectangle_count
|
||||||
|
- dataset: rubiks_cube
|
||||||
|
- category: games
|
||||||
|
datasets:
|
||||||
|
- dataset: countdown
|
||||||
|
- dataset: emoji_mystery
|
||||||
|
- dataset: futoshiki
|
||||||
|
- dataset: knight_swap
|
||||||
|
- dataset: maze
|
||||||
|
- dataset: mini_sudoku
|
||||||
|
- dataset: n_queens
|
||||||
|
- dataset: rush_hour
|
||||||
|
- dataset: sokoban
|
||||||
|
- dataset: sudoku
|
||||||
|
- dataset: tower_of_hanoi
|
||||||
|
- dataset: tsumego
|
||||||
|
- category: geometry
|
||||||
|
datasets:
|
||||||
|
- dataset: advanced_geometry
|
||||||
|
- dataset: simple_geometry
|
||||||
|
- category: graphs
|
||||||
|
datasets:
|
||||||
|
- dataset: course_schedule
|
||||||
|
- dataset: family_relationships
|
||||||
|
- dataset: largest_island
|
||||||
|
- dataset: quantum_lock
|
||||||
|
- dataset: shortest_path
|
||||||
|
- category: induction
|
||||||
|
datasets:
|
||||||
|
- dataset: list_functions
|
||||||
|
- category: logic
|
||||||
|
datasets:
|
||||||
|
- dataset: aiw
|
||||||
|
- dataset: circuit_logic
|
||||||
|
- dataset: knights_knaves
|
||||||
|
- dataset: propositional_logic
|
||||||
|
- dataset: self_reference
|
||||||
|
- dataset: syllogism
|
||||||
|
- dataset: zebra_puzzles
|
||||||
|
|
|
||||||
|
|
@ -1,130 +1,124 @@
|
||||||
# Combined configuration for llama-3.3-70b-instruct
|
model: meta-llama/llama-3.3-70b-instruct
|
||||||
model: "meta-llama/llama-3.3-70b-instruct"
|
provider: Hyperbolic
|
||||||
provider: "Hyperbolic"
|
output_dir: results
|
||||||
output_dir: "results"
|
|
||||||
max_concurrent: 10
|
max_concurrent: 10
|
||||||
default_size: 50
|
default_size: 50
|
||||||
default_seed: 45
|
default_seed: 45
|
||||||
|
|
||||||
categories:
|
categories:
|
||||||
- category: "algebra"
|
- category: algebra
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "intermediate_integration"
|
- dataset: complex_arithmetic
|
||||||
- dataset: "polynomial_equations"
|
- dataset: intermediate_integration
|
||||||
- dataset: "polynomial_multiplication"
|
- dataset: polynomial_equations
|
||||||
- dataset: "simple_equations"
|
- dataset: polynomial_multiplication
|
||||||
- dataset: "simple_integration"
|
- dataset: simple_equations
|
||||||
- dataset: "complex_arithmetic"
|
- dataset: simple_integration
|
||||||
|
- category: algorithmic
|
||||||
- category: "algorithmic"
|
datasets:
|
||||||
datasets:
|
- dataset: ab
|
||||||
- dataset: "ab"
|
- dataset: base_conversion
|
||||||
- dataset: "base_conversion"
|
- dataset: binary_alternation
|
||||||
- dataset: "binary_alternation"
|
- dataset: binary_matrix
|
||||||
- dataset: "binary_matrix"
|
- dataset: caesar_cipher
|
||||||
- dataset: "caesar_cipher"
|
- dataset: count_primes
|
||||||
- dataset: "count_primes"
|
- dataset: cryptarithm
|
||||||
- dataset: "cryptarithm"
|
- dataset: game_of_life
|
||||||
- dataset: "game_of_life"
|
- dataset: graph_color
|
||||||
- dataset: "graph_color"
|
- dataset: group_anagrams
|
||||||
- dataset: "group_anagrams"
|
- dataset: isomorphic_strings
|
||||||
- dataset: "isomorphic_strings"
|
- dataset: jugs
|
||||||
- dataset: "jugs"
|
- dataset: letter_counting
|
||||||
- dataset: "letter_counting"
|
- dataset: letter_jumble
|
||||||
- dataset: "letter_jumble"
|
- dataset: manipulate_matrix
|
||||||
- dataset: "manipulate_matrix"
|
- dataset: number_filtering
|
||||||
- dataset: "number_filtering"
|
- dataset: number_sorting
|
||||||
- dataset: "number_sorting"
|
- dataset: palindrome
|
||||||
- dataset: "palindrome"
|
- dataset: palindrome_partitioning
|
||||||
- dataset: "palindrome_partitioning"
|
- dataset: pool_matrix
|
||||||
- dataset: "pool_matrix"
|
- dataset: ransom_note
|
||||||
- dataset: "ransom_note"
|
- dataset: rotate_matrix
|
||||||
- dataset: "rotate_matrix"
|
- dataset: rotten_oranges
|
||||||
- dataset: "rotten_oranges"
|
- dataset: sentence_reordering
|
||||||
- dataset: "sentence_reordering"
|
- dataset: spell_backward
|
||||||
- dataset: "spell_backward"
|
- dataset: spiral_matrix
|
||||||
- dataset: "spiral_matrix"
|
- dataset: string_insertion
|
||||||
- dataset: "string_insertion"
|
- dataset: string_manipulation
|
||||||
- dataset: "string_manipulation"
|
- dataset: string_splitting
|
||||||
- dataset: "string_splitting"
|
- dataset: string_synthesis
|
||||||
- dataset: "string_synthesis"
|
- dataset: word_ladder
|
||||||
- dataset: "word_ladder"
|
- dataset: word_sequence_reversal
|
||||||
- dataset: "word_sequence_reversal"
|
- dataset: word_sorting
|
||||||
- dataset: "word_sorting"
|
- category: arc
|
||||||
|
datasets:
|
||||||
- category: "arc"
|
- dataset: arc_1d
|
||||||
datasets:
|
- dataset: arc_agi
|
||||||
- dataset: "arc_1d"
|
- dataset: rearc
|
||||||
- dataset: "arc_agi"
|
- category: arithmetic
|
||||||
- dataset: "rearc"
|
datasets:
|
||||||
|
- dataset: basic_arithmetic
|
||||||
- category: "arithmetic"
|
- dataset: bitwise_arithmetic
|
||||||
datasets:
|
- dataset: calendar_arithmetic
|
||||||
- dataset: "basic_arithmetic"
|
- dataset: chain_sum
|
||||||
- dataset: "bitwise_arithmetic"
|
- dataset: count_bits
|
||||||
- dataset: "calendar_arithmetic"
|
- dataset: decimal_arithmetic
|
||||||
- dataset: "chain_sum"
|
- dataset: decimal_chain_sum
|
||||||
- dataset: "count_bits"
|
- dataset: dice
|
||||||
- dataset: "decimal_arithmetic"
|
- dataset: fraction_simplification
|
||||||
- dataset: "decimal_chain_sum"
|
- dataset: gcd
|
||||||
- dataset: "dice"
|
- dataset: gsm_symbolic
|
||||||
- dataset: "fraction_simplification"
|
- dataset: lcm
|
||||||
- dataset: "gcd"
|
- dataset: leg_counting
|
||||||
- dataset: "gsm_symbolic"
|
- dataset: number_format
|
||||||
- dataset: "lcm"
|
- dataset: power_function
|
||||||
- dataset: "leg_counting"
|
- dataset: prime_factorization
|
||||||
- dataset: "number_format"
|
- dataset: products
|
||||||
- dataset: "power_function"
|
- dataset: time_intervals
|
||||||
- dataset: "prime_factorization"
|
- category: code
|
||||||
- dataset: "products"
|
datasets:
|
||||||
- dataset: "time_intervals"
|
- dataset: bf
|
||||||
|
- dataset: codeio
|
||||||
- category: "code"
|
- category: cognition
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "bf"
|
- dataset: color_cube_rotation
|
||||||
|
- dataset: figlet_font
|
||||||
- category: "cognition"
|
- dataset: needle_haystack
|
||||||
datasets:
|
- dataset: number_sequence
|
||||||
- dataset: "color_cube_rotation"
|
- dataset: rectangle_count
|
||||||
- dataset: "figlet_font"
|
- dataset: rubiks_cube
|
||||||
- dataset: "needle_haystack"
|
- category: games
|
||||||
- dataset: "number_sequence"
|
datasets:
|
||||||
- dataset: "rectangle_count"
|
- dataset: countdown
|
||||||
- dataset: "rubiks_cube"
|
- dataset: emoji_mystery
|
||||||
|
- dataset: futoshiki
|
||||||
- category: "games"
|
- dataset: knight_swap
|
||||||
datasets:
|
- dataset: maze
|
||||||
- dataset: "countdown"
|
- dataset: mini_sudoku
|
||||||
- dataset: "emoji_mystery"
|
- dataset: n_queens
|
||||||
- dataset: "futoshuki"
|
- dataset: rush_hour
|
||||||
- dataset: "knight_swap"
|
- dataset: sokoban
|
||||||
- dataset: "maze"
|
- dataset: sudoku
|
||||||
- dataset: "mini_sudoku"
|
- dataset: tower_of_hanoi
|
||||||
- dataset: "n_queens"
|
- dataset: tsumego
|
||||||
- dataset: "sokoban"
|
- category: geometry
|
||||||
- dataset: "sudoku"
|
datasets:
|
||||||
- dataset: "tower_of_hanoi"
|
- dataset: advanced_geometry
|
||||||
- dataset: "tsumego"
|
- dataset: simple_geometry
|
||||||
|
- category: graphs
|
||||||
- category: "geometry"
|
datasets:
|
||||||
datasets:
|
- dataset: course_schedule
|
||||||
- dataset: "simple_geometry"
|
- dataset: family_relationships
|
||||||
- dataset: "advanced_geometry"
|
- dataset: largest_island
|
||||||
|
- dataset: quantum_lock
|
||||||
- category: "graphs"
|
- dataset: shortest_path
|
||||||
datasets:
|
- category: induction
|
||||||
- dataset: "course_schedule"
|
datasets:
|
||||||
- dataset: "family_relationships"
|
- dataset: list_functions
|
||||||
- dataset: "largest_island"
|
- category: logic
|
||||||
- dataset: "list_functions"
|
datasets:
|
||||||
- dataset: "quantum_lock"
|
- dataset: aiw
|
||||||
- dataset: "shortest_path"
|
- dataset: circuit_logic
|
||||||
|
- dataset: knights_knaves
|
||||||
- category: "logic"
|
- dataset: propositional_logic
|
||||||
datasets:
|
- dataset: self_reference
|
||||||
- dataset: "aiw"
|
- dataset: syllogism
|
||||||
- dataset: "circuit_logic"
|
- dataset: zebra_puzzles
|
||||||
- dataset: "propositional_logic"
|
|
||||||
- dataset: "self_reference"
|
|
||||||
- dataset: "syllogism"
|
|
||||||
- dataset: "zebra_puzzles"
|
|
||||||
|
|
|
||||||
124
eval/yaml/openai-o1.yaml
Normal file
124
eval/yaml/openai-o1.yaml
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
model: openai/o1
|
||||||
|
provider: OpenAI
|
||||||
|
output_dir: results
|
||||||
|
max_concurrent: 10
|
||||||
|
default_size: 50
|
||||||
|
default_seed: 45
|
||||||
|
categories:
|
||||||
|
- category: algebra
|
||||||
|
datasets:
|
||||||
|
- dataset: complex_arithmetic
|
||||||
|
- dataset: intermediate_integration
|
||||||
|
- dataset: polynomial_equations
|
||||||
|
- dataset: polynomial_multiplication
|
||||||
|
- dataset: simple_equations
|
||||||
|
- dataset: simple_integration
|
||||||
|
- category: algorithmic
|
||||||
|
datasets:
|
||||||
|
- dataset: ab
|
||||||
|
- dataset: base_conversion
|
||||||
|
- dataset: binary_alternation
|
||||||
|
- dataset: binary_matrix
|
||||||
|
- dataset: caesar_cipher
|
||||||
|
- dataset: count_primes
|
||||||
|
- dataset: cryptarithm
|
||||||
|
- dataset: game_of_life
|
||||||
|
- dataset: graph_color
|
||||||
|
- dataset: group_anagrams
|
||||||
|
- dataset: isomorphic_strings
|
||||||
|
- dataset: jugs
|
||||||
|
- dataset: letter_counting
|
||||||
|
- dataset: letter_jumble
|
||||||
|
- dataset: manipulate_matrix
|
||||||
|
- dataset: number_filtering
|
||||||
|
- dataset: number_sorting
|
||||||
|
- dataset: palindrome
|
||||||
|
- dataset: palindrome_partitioning
|
||||||
|
- dataset: pool_matrix
|
||||||
|
- dataset: ransom_note
|
||||||
|
- dataset: rotate_matrix
|
||||||
|
- dataset: rotten_oranges
|
||||||
|
- dataset: sentence_reordering
|
||||||
|
- dataset: spell_backward
|
||||||
|
- dataset: spiral_matrix
|
||||||
|
- dataset: string_insertion
|
||||||
|
- dataset: string_manipulation
|
||||||
|
- dataset: string_splitting
|
||||||
|
- dataset: string_synthesis
|
||||||
|
- dataset: word_ladder
|
||||||
|
- dataset: word_sequence_reversal
|
||||||
|
- dataset: word_sorting
|
||||||
|
- category: arc
|
||||||
|
datasets:
|
||||||
|
- dataset: arc_1d
|
||||||
|
- dataset: arc_agi
|
||||||
|
- dataset: rearc
|
||||||
|
- category: arithmetic
|
||||||
|
datasets:
|
||||||
|
- dataset: basic_arithmetic
|
||||||
|
- dataset: bitwise_arithmetic
|
||||||
|
- dataset: calendar_arithmetic
|
||||||
|
- dataset: chain_sum
|
||||||
|
- dataset: count_bits
|
||||||
|
- dataset: decimal_arithmetic
|
||||||
|
- dataset: decimal_chain_sum
|
||||||
|
- dataset: dice
|
||||||
|
- dataset: fraction_simplification
|
||||||
|
- dataset: gcd
|
||||||
|
- dataset: gsm_symbolic
|
||||||
|
- dataset: lcm
|
||||||
|
- dataset: leg_counting
|
||||||
|
- dataset: number_format
|
||||||
|
- dataset: power_function
|
||||||
|
- dataset: prime_factorization
|
||||||
|
- dataset: products
|
||||||
|
- dataset: time_intervals
|
||||||
|
- category: code
|
||||||
|
datasets:
|
||||||
|
- dataset: bf
|
||||||
|
- dataset: codeio
|
||||||
|
- category: cognition
|
||||||
|
datasets:
|
||||||
|
- dataset: color_cube_rotation
|
||||||
|
- dataset: figlet_font
|
||||||
|
- dataset: needle_haystack
|
||||||
|
- dataset: number_sequence
|
||||||
|
- dataset: rectangle_count
|
||||||
|
- dataset: rubiks_cube
|
||||||
|
- category: games
|
||||||
|
datasets:
|
||||||
|
- dataset: countdown
|
||||||
|
- dataset: emoji_mystery
|
||||||
|
- dataset: futoshiki
|
||||||
|
- dataset: knight_swap
|
||||||
|
- dataset: maze
|
||||||
|
- dataset: mini_sudoku
|
||||||
|
- dataset: n_queens
|
||||||
|
- dataset: rush_hour
|
||||||
|
- dataset: sokoban
|
||||||
|
- dataset: sudoku
|
||||||
|
- dataset: tower_of_hanoi
|
||||||
|
- dataset: tsumego
|
||||||
|
- category: geometry
|
||||||
|
datasets:
|
||||||
|
- dataset: advanced_geometry
|
||||||
|
- dataset: simple_geometry
|
||||||
|
- category: graphs
|
||||||
|
datasets:
|
||||||
|
- dataset: course_schedule
|
||||||
|
- dataset: family_relationships
|
||||||
|
- dataset: largest_island
|
||||||
|
- dataset: quantum_lock
|
||||||
|
- dataset: shortest_path
|
||||||
|
- category: induction
|
||||||
|
datasets:
|
||||||
|
- dataset: list_functions
|
||||||
|
- category: logic
|
||||||
|
datasets:
|
||||||
|
- dataset: aiw
|
||||||
|
- dataset: circuit_logic
|
||||||
|
- dataset: knights_knaves
|
||||||
|
- dataset: propositional_logic
|
||||||
|
- dataset: self_reference
|
||||||
|
- dataset: syllogism
|
||||||
|
- dataset: zebra_puzzles
|
||||||
|
|
@ -1,126 +1,124 @@
|
||||||
# Combined configuration for openai/o3-mini
|
model: openai/o3-mini
|
||||||
model: "openai/o3-mini"
|
provider: OpenAI
|
||||||
provider: "OpenAI"
|
output_dir: results
|
||||||
output_dir: "results"
|
|
||||||
max_concurrent: 10
|
max_concurrent: 10
|
||||||
default_size: 50
|
default_size: 50
|
||||||
default_seed: 45
|
default_seed: 45
|
||||||
|
|
||||||
categories:
|
categories:
|
||||||
- category: "algebra"
|
- category: algebra
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "complex_arithmetic"
|
- dataset: complex_arithmetic
|
||||||
- dataset: "intermediate_integration"
|
- dataset: intermediate_integration
|
||||||
- dataset: "polynomial_equations"
|
- dataset: polynomial_equations
|
||||||
- dataset: "polynomial_multiplication"
|
- dataset: polynomial_multiplication
|
||||||
- dataset: "simple_equations"
|
- dataset: simple_equations
|
||||||
- dataset: "simple_integration"
|
- dataset: simple_integration
|
||||||
|
- category: algorithmic
|
||||||
- category: "algorithmic"
|
datasets:
|
||||||
datasets:
|
- dataset: ab
|
||||||
- dataset: "ab"
|
- dataset: base_conversion
|
||||||
- dataset: "binary_alternation"
|
- dataset: binary_alternation
|
||||||
- dataset: "base_conversion"
|
- dataset: binary_matrix
|
||||||
- dataset: "binary_matrix"
|
- dataset: caesar_cipher
|
||||||
- dataset: "caesar_cipher"
|
- dataset: count_primes
|
||||||
- dataset: "count_primes"
|
- dataset: cryptarithm
|
||||||
- dataset: "cryptarithm"
|
- dataset: game_of_life
|
||||||
- dataset: "game_of_life"
|
- dataset: graph_color
|
||||||
- dataset: "graph_color"
|
- dataset: group_anagrams
|
||||||
- dataset: "group_anagrams"
|
- dataset: isomorphic_strings
|
||||||
- dataset: "isomorphic_strings"
|
- dataset: jugs
|
||||||
- dataset: "letter_counting"
|
- dataset: letter_counting
|
||||||
- dataset: "letter_jumble"
|
- dataset: letter_jumble
|
||||||
- dataset: "manipulate_matrix"
|
- dataset: manipulate_matrix
|
||||||
- dataset: "number_filtering"
|
- dataset: number_filtering
|
||||||
- dataset: "number_sorting"
|
- dataset: number_sorting
|
||||||
- dataset: "palindrome"
|
- dataset: palindrome
|
||||||
- dataset: "pool_matrix"
|
- dataset: palindrome_partitioning
|
||||||
- dataset: "ransom_note"
|
- dataset: pool_matrix
|
||||||
- dataset: "rotate_matrix"
|
- dataset: ransom_note
|
||||||
- dataset: "sentence_reordering"
|
- dataset: rotate_matrix
|
||||||
- dataset: "spell_backward"
|
- dataset: rotten_oranges
|
||||||
- dataset: "spiral_matrix"
|
- dataset: sentence_reordering
|
||||||
- dataset: "string_insertion"
|
- dataset: spell_backward
|
||||||
- dataset: "string_manipulation"
|
- dataset: spiral_matrix
|
||||||
- dataset: "string_synthesis"
|
- dataset: string_insertion
|
||||||
- dataset: "word_ladder"
|
- dataset: string_manipulation
|
||||||
- dataset: "word_sequence_reversal"
|
- dataset: string_splitting
|
||||||
- dataset: "word_sorting"
|
- dataset: string_synthesis
|
||||||
|
- dataset: word_ladder
|
||||||
- category: "arc"
|
- dataset: word_sequence_reversal
|
||||||
datasets:
|
- dataset: word_sorting
|
||||||
- dataset: "arc_1d"
|
- category: arc
|
||||||
- dataset: "arc_agi"
|
datasets:
|
||||||
- dataset: "rearc"
|
- dataset: arc_1d
|
||||||
|
- dataset: arc_agi
|
||||||
- category: "arithmetic"
|
- dataset: rearc
|
||||||
datasets:
|
- category: arithmetic
|
||||||
- dataset: "basic_arithmetic"
|
datasets:
|
||||||
- dataset: "bitwise_arithmetic"
|
- dataset: basic_arithmetic
|
||||||
- dataset: "calendar_arithmetic"
|
- dataset: bitwise_arithmetic
|
||||||
- dataset: "chain_sum"
|
- dataset: calendar_arithmetic
|
||||||
- dataset: "count_bits"
|
- dataset: chain_sum
|
||||||
- dataset: "decimal_arithmetic"
|
- dataset: count_bits
|
||||||
- dataset: "decimal_chain_sum"
|
- dataset: decimal_arithmetic
|
||||||
- dataset: "dice"
|
- dataset: decimal_chain_sum
|
||||||
- dataset: "fraction_simplification"
|
- dataset: dice
|
||||||
- dataset: "gcd"
|
- dataset: fraction_simplification
|
||||||
- dataset: "gsm_symbolic"
|
- dataset: gcd
|
||||||
- dataset: "lcm"
|
- dataset: gsm_symbolic
|
||||||
- dataset: "leg_counting"
|
- dataset: lcm
|
||||||
- dataset: "number_format"
|
- dataset: leg_counting
|
||||||
- dataset: "power_function"
|
- dataset: number_format
|
||||||
- dataset: "prime_factorization"
|
- dataset: power_function
|
||||||
- dataset: "products"
|
- dataset: prime_factorization
|
||||||
- dataset: "time_intervals"
|
- dataset: products
|
||||||
|
- dataset: time_intervals
|
||||||
- category: "code"
|
- category: code
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "bf"
|
- dataset: bf
|
||||||
|
- dataset: codeio
|
||||||
- category: "cognition"
|
- category: cognition
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: "color_cube_rotation"
|
- dataset: color_cube_rotation
|
||||||
- dataset: "figlet_font"
|
- dataset: figlet_font
|
||||||
- dataset: "needle_haystack"
|
- dataset: needle_haystack
|
||||||
- dataset: "number_sequence"
|
- dataset: number_sequence
|
||||||
- dataset: "rectangle_count"
|
- dataset: rectangle_count
|
||||||
- dataset: "rubiks_cube"
|
- dataset: rubiks_cube
|
||||||
|
- category: games
|
||||||
- category: "games"
|
datasets:
|
||||||
datasets:
|
- dataset: countdown
|
||||||
- dataset: "countdown"
|
- dataset: emoji_mystery
|
||||||
- dataset: "emoji_mystery"
|
- dataset: futoshiki
|
||||||
- dataset: "futoshuki"
|
- dataset: knight_swap
|
||||||
- dataset: "knight_swap"
|
- dataset: maze
|
||||||
- dataset: "maze"
|
- dataset: mini_sudoku
|
||||||
- dataset: "mini_sudoku"
|
- dataset: n_queens
|
||||||
- dataset: "n_queens"
|
- dataset: rush_hour
|
||||||
- dataset: "sokoban"
|
- dataset: sokoban
|
||||||
- dataset: "sudoku"
|
- dataset: sudoku
|
||||||
- dataset: "tower_of_hanoi"
|
- dataset: tower_of_hanoi
|
||||||
- dataset: "tsumego"
|
- dataset: tsumego
|
||||||
|
- category: geometry
|
||||||
- category: "geometry"
|
datasets:
|
||||||
datasets:
|
- dataset: advanced_geometry
|
||||||
- dataset: "simple_geometry"
|
- dataset: simple_geometry
|
||||||
- dataset: "advanced_geometry"
|
- category: graphs
|
||||||
|
datasets:
|
||||||
- category: "graphs"
|
- dataset: course_schedule
|
||||||
datasets:
|
- dataset: family_relationships
|
||||||
- dataset: "course_schedule"
|
- dataset: largest_island
|
||||||
- dataset: "family_relationships"
|
- dataset: quantum_lock
|
||||||
- dataset: "largest_island"
|
- dataset: shortest_path
|
||||||
- dataset: "list_functions"
|
- category: induction
|
||||||
- dataset: "quantum_lock"
|
datasets:
|
||||||
- dataset: "shortest_path"
|
- dataset: list_functions
|
||||||
|
- category: logic
|
||||||
- category: "logic"
|
datasets:
|
||||||
datasets:
|
- dataset: aiw
|
||||||
- dataset: "aiw"
|
- dataset: circuit_logic
|
||||||
- dataset: "circuit_logic"
|
- dataset: knights_knaves
|
||||||
- dataset: "propositional_logic"
|
- dataset: propositional_logic
|
||||||
- dataset: "self_reference"
|
- dataset: self_reference
|
||||||
- dataset: "syllogism"
|
- dataset: syllogism
|
||||||
- dataset: "zebra_puzzles"
|
- dataset: zebra_puzzles
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue