mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Eval script consolidation (#238)
The script now supports YAML and JSON configurations, dataset-specific parameters, overriding configuration via the command line, and detailed logging and error handling.
This commit is contained in:
parent
8a66d2a216
commit
850c1cf6f4
40 changed files with 1111 additions and 670 deletions
|
|
@ -1,31 +0,0 @@
|
|||
model: anthropic/claude-3.5-sonnet
|
||||
category: algorithmic
|
||||
provider: Anthropic
|
||||
datasets:
|
||||
- count_primes
|
||||
- game_of_life
|
||||
- graph_color
|
||||
- group_anagrams
|
||||
- isomorphic_strings
|
||||
- letter_counting
|
||||
- letter_jumble
|
||||
- manipulate_matrix
|
||||
- number_filtering
|
||||
- number_sorting
|
||||
- palindrome
|
||||
- pool_matrix
|
||||
- ransom_note
|
||||
- rotate_matrix
|
||||
- sentence_reordering
|
||||
- spell_backward
|
||||
- spiral_matrix
|
||||
- string_insertion
|
||||
- string_manipulation
|
||||
- string_synthesis
|
||||
- word_ladder
|
||||
- word_sequence_reversal
|
||||
- word_sorting
|
||||
eval_dir: eval/sonnet-3.5
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
34
eval/yaml/claude-3.5-sonnet.yaml
Normal file
34
eval/yaml/claude-3.5-sonnet.yaml
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
# Combined configuration for Claude 3.5 Sonnet
|
||||
model: "anthropic/claude-3.5-sonnet"
|
||||
provider: "Anthropic"
|
||||
output_dir: "results"
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
|
||||
categories:
|
||||
- category: "algorithmic"
|
||||
datasets:
|
||||
- dataset: "count_primes"
|
||||
- dataset: "game_of_life"
|
||||
- dataset: "graph_color"
|
||||
- dataset: "group_anagrams"
|
||||
- dataset: "isomorphic_strings"
|
||||
- dataset: "letter_counting"
|
||||
- dataset: "letter_jumble"
|
||||
- dataset: "manipulate_matrix"
|
||||
- dataset: "number_filtering"
|
||||
- dataset: "number_sorting"
|
||||
- dataset: "palindrome"
|
||||
- dataset: "pool_matrix"
|
||||
- dataset: "ransom_note"
|
||||
- dataset: "rotate_matrix"
|
||||
- dataset: "sentence_reordering"
|
||||
- dataset: "spell_backward"
|
||||
- dataset: "spiral_matrix"
|
||||
- dataset: "string_insertion"
|
||||
- dataset: "string_manipulation"
|
||||
- dataset: "string_synthesis"
|
||||
- dataset: "word_ladder"
|
||||
- dataset: "word_sequence_reversal"
|
||||
- dataset: "word_sorting"
|
||||
61
eval/yaml/deepseek-r1.yaml
Normal file
61
eval/yaml/deepseek-r1.yaml
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# Combined configuration for deepseek-r1
|
||||
model: "deepseek/deepseek-r1"
|
||||
provider: "Nebius"
|
||||
output_dir: "results"
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
|
||||
categories:
|
||||
- category: "algebra"
|
||||
datasets:
|
||||
- dataset: "intermediate_integration"
|
||||
- dataset: "polynomial_equations"
|
||||
- dataset: "polynomial_multiplication"
|
||||
- dataset: "simple_equations"
|
||||
- dataset: "simple_integration"
|
||||
- dataset: "complex_arithmetic"
|
||||
|
||||
- category: "algorithmic"
|
||||
datasets:
|
||||
- dataset: "ab"
|
||||
- dataset: "base_conversion"
|
||||
- dataset: "binary_matrix"
|
||||
- dataset: "caesar_cipher"
|
||||
- dataset: "count_primes"
|
||||
- dataset: "game_of_life"
|
||||
- dataset: "graph_color"
|
||||
- dataset: "group_anagrams"
|
||||
- dataset: "isomorphic_strings"
|
||||
- dataset: "letter_counting"
|
||||
- dataset: "letter_jumble"
|
||||
- dataset: "manipulate_matrix"
|
||||
- dataset: "number_filtering"
|
||||
- dataset: "number_sorting"
|
||||
- dataset: "palindrome"
|
||||
- dataset: "pool_matrix"
|
||||
- dataset: "ransom_note"
|
||||
- dataset: "rotate_matrix"
|
||||
- dataset: "sentence_reordering"
|
||||
- dataset: "spell_backward"
|
||||
- dataset: "spiral_matrix"
|
||||
- dataset: "string_insertion"
|
||||
- dataset: "string_manipulation"
|
||||
- dataset: "string_synthesis"
|
||||
- dataset: "word_ladder"
|
||||
- dataset: "word_sequence_reversal"
|
||||
- dataset: "word_sorting"
|
||||
|
||||
- category: "cognition"
|
||||
datasets:
|
||||
- dataset: "color_cube_rotation"
|
||||
- dataset: "figlet_font"
|
||||
- dataset: "number_sequence"
|
||||
- dataset: "rubiks_cube"
|
||||
|
||||
- category: "logic"
|
||||
datasets:
|
||||
- dataset: "propositional_logic"
|
||||
- dataset: "self_reference"
|
||||
- dataset: "syllogism"
|
||||
- dataset: "zebra_puzzles"
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
model: anthropic/claude-3.7-sonnet # find model id: https://openrouter.ai/models
|
||||
provider: Anthropic
|
||||
category: test
|
||||
datasets:
|
||||
- YOUR_DATASET_NAME
|
||||
eval_dir: results/test
|
||||
dataset_size: 100
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
130
eval/yaml/llama-3.3-70b-instruct.yaml
Normal file
130
eval/yaml/llama-3.3-70b-instruct.yaml
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
# Combined configuration for llama-3.3-70b-instruct
|
||||
model: "meta-llama/llama-3.3-70b-instruct"
|
||||
provider: "Hyperbolic"
|
||||
output_dir: "results"
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
|
||||
categories:
|
||||
- category: "algebra"
|
||||
datasets:
|
||||
- dataset: "intermediate_integration"
|
||||
- dataset: "polynomial_equations"
|
||||
- dataset: "polynomial_multiplication"
|
||||
- dataset: "simple_equations"
|
||||
- dataset: "simple_integration"
|
||||
- dataset: "complex_arithmetic"
|
||||
|
||||
- category: "algorithmic"
|
||||
datasets:
|
||||
- dataset: "ab"
|
||||
- dataset: "base_conversion"
|
||||
- dataset: "binary_alternation"
|
||||
- dataset: "binary_matrix"
|
||||
- dataset: "caesar_cipher"
|
||||
- dataset: "count_primes"
|
||||
- dataset: "cryptarithm"
|
||||
- dataset: "game_of_life"
|
||||
- dataset: "graph_color"
|
||||
- dataset: "group_anagrams"
|
||||
- dataset: "isomorphic_strings"
|
||||
- dataset: "jugs"
|
||||
- dataset: "letter_counting"
|
||||
- dataset: "letter_jumble"
|
||||
- dataset: "manipulate_matrix"
|
||||
- dataset: "number_filtering"
|
||||
- dataset: "number_sorting"
|
||||
- dataset: "palindrome"
|
||||
- dataset: "palindrome_partitioning"
|
||||
- dataset: "pool_matrix"
|
||||
- dataset: "ransom_note"
|
||||
- dataset: "rotate_matrix"
|
||||
- dataset: "rotten_oranges"
|
||||
- dataset: "sentence_reordering"
|
||||
- dataset: "spell_backward"
|
||||
- dataset: "spiral_matrix"
|
||||
- dataset: "string_insertion"
|
||||
- dataset: "string_manipulation"
|
||||
- dataset: "string_splitting"
|
||||
- dataset: "string_synthesis"
|
||||
- dataset: "word_ladder"
|
||||
- dataset: "word_sequence_reversal"
|
||||
- dataset: "word_sorting"
|
||||
|
||||
- category: "arc"
|
||||
datasets:
|
||||
- dataset: "arc_1d"
|
||||
- dataset: "arc_agi"
|
||||
- dataset: "rearc"
|
||||
|
||||
- category: "arithmetic"
|
||||
datasets:
|
||||
- dataset: "basic_arithmetic"
|
||||
- dataset: "bitwise_arithmetic"
|
||||
- dataset: "calendar_arithmetic"
|
||||
- dataset: "chain_sum"
|
||||
- dataset: "count_bits"
|
||||
- dataset: "decimal_arithmetic"
|
||||
- dataset: "decimal_chain_sum"
|
||||
- dataset: "dice"
|
||||
- dataset: "fraction_simplification"
|
||||
- dataset: "gcd"
|
||||
- dataset: "gsm_symbolic"
|
||||
- dataset: "lcm"
|
||||
- dataset: "leg_counting"
|
||||
- dataset: "number_format"
|
||||
- dataset: "power_function"
|
||||
- dataset: "prime_factorization"
|
||||
- dataset: "products"
|
||||
- dataset: "time_intervals"
|
||||
|
||||
- category: "code"
|
||||
datasets:
|
||||
- dataset: "bf"
|
||||
|
||||
- category: "cognition"
|
||||
datasets:
|
||||
- dataset: "color_cube_rotation"
|
||||
- dataset: "figlet_font"
|
||||
- dataset: "needle_haystack"
|
||||
- dataset: "number_sequence"
|
||||
- dataset: "rectangle_count"
|
||||
- dataset: "rubiks_cube"
|
||||
|
||||
- category: "games"
|
||||
datasets:
|
||||
- dataset: "countdown"
|
||||
- dataset: "emoji_mystery"
|
||||
- dataset: "futoshuki"
|
||||
- dataset: "knight_swap"
|
||||
- dataset: "maze"
|
||||
- dataset: "mini_sudoku"
|
||||
- dataset: "n_queens"
|
||||
- dataset: "sokoban"
|
||||
- dataset: "sudoku"
|
||||
- dataset: "tower_of_hanoi"
|
||||
- dataset: "tsumego"
|
||||
|
||||
- category: "geometry"
|
||||
datasets:
|
||||
- dataset: "simple_geometry"
|
||||
- dataset: "advanced_geometry"
|
||||
|
||||
- category: "graphs"
|
||||
datasets:
|
||||
- dataset: "course_schedule"
|
||||
- dataset: "family_relationships"
|
||||
- dataset: "largest_island"
|
||||
- dataset: "list_functions"
|
||||
- dataset: "quantum_lock"
|
||||
- dataset: "shortest_path"
|
||||
|
||||
- category: "logic"
|
||||
datasets:
|
||||
- dataset: "aiw"
|
||||
- dataset: "circuit_logic"
|
||||
- dataset: "propositional_logic"
|
||||
- dataset: "self_reference"
|
||||
- dataset: "syllogism"
|
||||
- dataset: "zebra_puzzles"
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: algebra
|
||||
datasets:
|
||||
- intermediate_integration
|
||||
- polynomial_equations
|
||||
- polynomial_multiplication
|
||||
- simple_equations
|
||||
- simple_integration
|
||||
- complex_arithmetic
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: algorithmic
|
||||
datasets:
|
||||
- ab
|
||||
- base_conversion
|
||||
- binary_alternation
|
||||
- binary_matrix
|
||||
- caesar_cipher
|
||||
- count_primes
|
||||
- cryptarithm
|
||||
- game_of_life
|
||||
- graph_color
|
||||
- group_anagrams
|
||||
- isomorphic_strings
|
||||
- jugs
|
||||
- letter_counting
|
||||
- letter_jumble
|
||||
- manipulate_matrix
|
||||
- number_filtering
|
||||
- number_sorting
|
||||
- palindrome
|
||||
- palindrome_partitioning
|
||||
- pool_matrix
|
||||
- ransom_note
|
||||
- rotate_matrix
|
||||
- rotten_oranges
|
||||
- sentence_reordering
|
||||
- spell_backward
|
||||
- spiral_matrix
|
||||
- string_insertion
|
||||
- string_manipulation
|
||||
- string_splitting
|
||||
- string_synthesis
|
||||
- word_ladder
|
||||
- word_sequence_reversal
|
||||
- word_sorting
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: arc
|
||||
datasets:
|
||||
- arc_1d
|
||||
- arc_agi
|
||||
- rearc
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: arithmetic
|
||||
datasets:
|
||||
- basic_arithmetic
|
||||
- bitwise_arithmetic
|
||||
- calendar_arithmetic
|
||||
- chain_sum
|
||||
- count_bits
|
||||
- decimal_arithmetic
|
||||
- decimal_chain_sum
|
||||
- dice
|
||||
- fraction_simplification
|
||||
- gcd
|
||||
- gsm_symbolic
|
||||
- lcm
|
||||
- leg_counting
|
||||
- number_format
|
||||
- power_function
|
||||
- prime_factorization
|
||||
- products
|
||||
- time_intervals
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: code
|
||||
datasets:
|
||||
- bf
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: cognition
|
||||
datasets:
|
||||
- color_cube_rotation
|
||||
- figlet_font
|
||||
- needle_haystack
|
||||
- number_sequence
|
||||
- rectangle_count
|
||||
- rubiks_cube
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: games
|
||||
datasets:
|
||||
- countdown
|
||||
- emoji_mystery
|
||||
- futoshuki
|
||||
- knight_swap
|
||||
- maze
|
||||
- mini_sudoku
|
||||
- n_queens
|
||||
- sokoban
|
||||
- sudoku
|
||||
- tower_of_hanoi
|
||||
- tsumego
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: geometry
|
||||
datasets:
|
||||
- simple_geometry
|
||||
- advanced_geometry
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: graphs
|
||||
datasets:
|
||||
- course_schedule
|
||||
- family_relationships
|
||||
- largest_island
|
||||
- list_functions
|
||||
- quantum_lock
|
||||
- shortest_path
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: meta-llama/llama-3.3-70b-instruct
|
||||
provider: Hyperbolic
|
||||
category: logic
|
||||
datasets:
|
||||
- aiw
|
||||
- circuit_logic
|
||||
- propositional_logic
|
||||
- self_reference
|
||||
- syllogism
|
||||
- zebra_puzzles
|
||||
eval_dir: results/llama-3.3-70b-instruct
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
126
eval/yaml/openai-o3.yaml
Normal file
126
eval/yaml/openai-o3.yaml
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
# Combined configuration for openai/o3-mini
|
||||
model: "openai/o3-mini"
|
||||
provider: "OpenAI"
|
||||
output_dir: "results"
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
|
||||
categories:
|
||||
- category: "algebra"
|
||||
datasets:
|
||||
- dataset: "complex_arithmetic"
|
||||
- dataset: "intermediate_integration"
|
||||
- dataset: "polynomial_equations"
|
||||
- dataset: "polynomial_multiplication"
|
||||
- dataset: "simple_equations"
|
||||
- dataset: "simple_integration"
|
||||
|
||||
- category: "algorithmic"
|
||||
datasets:
|
||||
- dataset: "ab"
|
||||
- dataset: "binary_alternation"
|
||||
- dataset: "base_conversion"
|
||||
- dataset: "binary_matrix"
|
||||
- dataset: "caesar_cipher"
|
||||
- dataset: "count_primes"
|
||||
- dataset: "cryptarithm"
|
||||
- dataset: "game_of_life"
|
||||
- dataset: "graph_color"
|
||||
- dataset: "group_anagrams"
|
||||
- dataset: "isomorphic_strings"
|
||||
- dataset: "letter_counting"
|
||||
- dataset: "letter_jumble"
|
||||
- dataset: "manipulate_matrix"
|
||||
- dataset: "number_filtering"
|
||||
- dataset: "number_sorting"
|
||||
- dataset: "palindrome"
|
||||
- dataset: "pool_matrix"
|
||||
- dataset: "ransom_note"
|
||||
- dataset: "rotate_matrix"
|
||||
- dataset: "sentence_reordering"
|
||||
- dataset: "spell_backward"
|
||||
- dataset: "spiral_matrix"
|
||||
- dataset: "string_insertion"
|
||||
- dataset: "string_manipulation"
|
||||
- dataset: "string_synthesis"
|
||||
- dataset: "word_ladder"
|
||||
- dataset: "word_sequence_reversal"
|
||||
- dataset: "word_sorting"
|
||||
|
||||
- category: "arc"
|
||||
datasets:
|
||||
- dataset: "arc_1d"
|
||||
- dataset: "arc_agi"
|
||||
- dataset: "rearc"
|
||||
|
||||
- category: "arithmetic"
|
||||
datasets:
|
||||
- dataset: "basic_arithmetic"
|
||||
- dataset: "bitwise_arithmetic"
|
||||
- dataset: "calendar_arithmetic"
|
||||
- dataset: "chain_sum"
|
||||
- dataset: "count_bits"
|
||||
- dataset: "decimal_arithmetic"
|
||||
- dataset: "decimal_chain_sum"
|
||||
- dataset: "dice"
|
||||
- dataset: "fraction_simplification"
|
||||
- dataset: "gcd"
|
||||
- dataset: "gsm_symbolic"
|
||||
- dataset: "lcm"
|
||||
- dataset: "leg_counting"
|
||||
- dataset: "number_format"
|
||||
- dataset: "power_function"
|
||||
- dataset: "prime_factorization"
|
||||
- dataset: "products"
|
||||
- dataset: "time_intervals"
|
||||
|
||||
- category: "code"
|
||||
datasets:
|
||||
- dataset: "bf"
|
||||
|
||||
- category: "cognition"
|
||||
datasets:
|
||||
- dataset: "color_cube_rotation"
|
||||
- dataset: "figlet_font"
|
||||
- dataset: "needle_haystack"
|
||||
- dataset: "number_sequence"
|
||||
- dataset: "rectangle_count"
|
||||
- dataset: "rubiks_cube"
|
||||
|
||||
- category: "games"
|
||||
datasets:
|
||||
- dataset: "countdown"
|
||||
- dataset: "emoji_mystery"
|
||||
- dataset: "futoshuki"
|
||||
- dataset: "knight_swap"
|
||||
- dataset: "maze"
|
||||
- dataset: "mini_sudoku"
|
||||
- dataset: "n_queens"
|
||||
- dataset: "sokoban"
|
||||
- dataset: "sudoku"
|
||||
- dataset: "tower_of_hanoi"
|
||||
- dataset: "tsumego"
|
||||
|
||||
- category: "geometry"
|
||||
datasets:
|
||||
- dataset: "simple_geometry"
|
||||
- dataset: "advanced_geometry"
|
||||
|
||||
- category: "graphs"
|
||||
datasets:
|
||||
- dataset: "course_schedule"
|
||||
- dataset: "family_relationships"
|
||||
- dataset: "largest_island"
|
||||
- dataset: "list_functions"
|
||||
- dataset: "quantum_lock"
|
||||
- dataset: "shortest_path"
|
||||
|
||||
- category: "logic"
|
||||
datasets:
|
||||
- dataset: "aiw"
|
||||
- dataset: "circuit_logic"
|
||||
- dataset: "propositional_logic"
|
||||
- dataset: "self_reference"
|
||||
- dataset: "syllogism"
|
||||
- dataset: "zebra_puzzles"
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: algebra
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- complex_arithmetic
|
||||
- intermediate_integration
|
||||
- polynomial_equations
|
||||
- polynomial_multiplication
|
||||
- simple_equations
|
||||
- simple_integration
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: algorithmic
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- ab
|
||||
- binary_alternation
|
||||
- base_conversion
|
||||
- binary_matrix
|
||||
- caesar_cipher
|
||||
- count_primes
|
||||
- cryptarithm
|
||||
- game_of_life
|
||||
- graph_color
|
||||
- group_anagrams
|
||||
- isomorphic_strings
|
||||
- letter_counting
|
||||
- letter_jumble
|
||||
- manipulate_matrix
|
||||
- number_filtering
|
||||
- number_sorting
|
||||
- palindrome
|
||||
- pool_matrix
|
||||
- ransom_note
|
||||
- rotate_matrix
|
||||
- sentence_reordering
|
||||
- spell_backward
|
||||
- spiral_matrix
|
||||
- string_insertion
|
||||
- string_manipulation
|
||||
- string_synthesis
|
||||
- word_ladder
|
||||
- word_sequence_reversal
|
||||
- word_sorting
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: arc
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- arc_1d
|
||||
- arc_agi
|
||||
- rearc
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: arithmetic
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- basic_arithmetic
|
||||
- bitwise_arithmetic
|
||||
- calendar_arithmetic
|
||||
- chain_sum
|
||||
- count_bits
|
||||
- decimal_arithmetic
|
||||
- decimal_chain_sum
|
||||
- dice
|
||||
- fraction_simplification
|
||||
- gcd
|
||||
- gsm_symbolic
|
||||
- lcm
|
||||
- leg_counting
|
||||
- number_format
|
||||
- power_function
|
||||
- prime_factorization
|
||||
- products
|
||||
- time_intervals
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: code
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- bf
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: cognition
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- color_cube_rotation
|
||||
- figlet_font
|
||||
- needle_haystack
|
||||
- number_sequence
|
||||
- rectangle_count
|
||||
- rubiks_cube
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: games
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- countdown
|
||||
- emoji_mystery
|
||||
- futoshuki
|
||||
- knight_swap
|
||||
- maze
|
||||
- mini_sudoku
|
||||
- n_queens
|
||||
- sokoban
|
||||
- sudoku
|
||||
- tower_of_hanoi
|
||||
- tsumego
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: geometry
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- simple_geometry
|
||||
- advanced_geometry
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: graphs
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- course_schedule
|
||||
- family_relationships
|
||||
- largest_island
|
||||
- list_functions
|
||||
- quantum_lock
|
||||
- shortest_path
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: openai/o3-mini
|
||||
category: logic
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- aiw
|
||||
- circuit_logic
|
||||
- propositional_logic
|
||||
- self_reference
|
||||
- syllogism
|
||||
- zebra_puzzles
|
||||
eval_dir: results/openai-03
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
category: algebra
|
||||
datasets:
|
||||
- intermediate_integration
|
||||
- polynomial_equations
|
||||
- polynomial_multiplication
|
||||
- simple_equations
|
||||
- simple_integration
|
||||
- complex_arithmetic
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
category: algorithmic
|
||||
datasets:
|
||||
- ab
|
||||
- base_conversion
|
||||
- binary_matrix
|
||||
- caesar_cipher
|
||||
- count_primes
|
||||
- game_of_life
|
||||
- graph_color
|
||||
- group_anagrams
|
||||
- isomorphic_strings
|
||||
- letter_counting
|
||||
- letter_jumble
|
||||
- manipulate_matrix
|
||||
- number_filtering
|
||||
- number_sorting
|
||||
- palindrome
|
||||
- pool_matrix
|
||||
- ransom_note
|
||||
- rotate_matrix
|
||||
- sentence_reordering
|
||||
- spell_backward
|
||||
- spiral_matrix
|
||||
- string_insertion
|
||||
- string_manipulation
|
||||
- string_synthesis
|
||||
- word_ladder
|
||||
- word_sequence_reversal
|
||||
- word_sorting
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
category: cognition
|
||||
datasets:
|
||||
- color_cube_rotation
|
||||
- figlet_font
|
||||
- number_sequence
|
||||
- rubiks_cube
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
category: logic
|
||||
datasets:
|
||||
- propositional_logic
|
||||
- self_reference
|
||||
- syllogism
|
||||
- zebra_puzzles
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
Loading…
Add table
Add a link
Reference in a new issue