mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
update eval yaml config files
This commit is contained in:
parent
a49463c323
commit
4109b5b72c
9 changed files with 79 additions and 6 deletions
|
|
@ -23,6 +23,7 @@ categories:
|
|||
- dataset: count_primes
|
||||
- dataset: cryptarithm
|
||||
- dataset: game_of_life
|
||||
- dataset: game_of_life_halting
|
||||
- dataset: graph_color
|
||||
- dataset: group_anagrams
|
||||
- dataset: isomorphic_strings
|
||||
|
|
@ -32,7 +33,7 @@ categories:
|
|||
- dataset: manipulate_matrix
|
||||
- dataset: number_filtering
|
||||
- dataset: number_sorting
|
||||
- dataset: palindrome
|
||||
- dataset: palindrome_generation
|
||||
- dataset: palindrome_partitioning
|
||||
- dataset: pool_matrix
|
||||
- dataset: ransom_note
|
||||
|
|
@ -81,6 +82,7 @@ categories:
|
|||
datasets:
|
||||
- dataset: color_cube_rotation
|
||||
- dataset: figlet_font
|
||||
- dataset: modulo_grid
|
||||
- dataset: needle_haystack
|
||||
- dataset: number_sequence
|
||||
- dataset: rectangle_count
|
||||
|
|
@ -91,9 +93,11 @@ categories:
|
|||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
- dataset: knight_swap
|
||||
- dataset: mahjong_puzzle
|
||||
- dataset: maze
|
||||
- dataset: mini_sudoku
|
||||
- dataset: n_queens
|
||||
- dataset: puzzle24
|
||||
- dataset: rush_hour
|
||||
- dataset: sokoban
|
||||
- dataset: sudoku
|
||||
|
|
@ -112,6 +116,7 @@ categories:
|
|||
- dataset: shortest_path
|
||||
- category: induction
|
||||
datasets:
|
||||
- dataset: acre
|
||||
- dataset: list_functions
|
||||
- category: logic
|
||||
datasets:
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ categories:
|
|||
- dataset: count_primes
|
||||
- dataset: cryptarithm
|
||||
- dataset: game_of_life
|
||||
- dataset: game_of_life_halting
|
||||
- dataset: graph_color
|
||||
- dataset: group_anagrams
|
||||
- dataset: isomorphic_strings
|
||||
|
|
@ -32,7 +33,7 @@ categories:
|
|||
- dataset: manipulate_matrix
|
||||
- dataset: number_filtering
|
||||
- dataset: number_sorting
|
||||
- dataset: palindrome
|
||||
- dataset: palindrome_generation
|
||||
- dataset: palindrome_partitioning
|
||||
- dataset: pool_matrix
|
||||
- dataset: ransom_note
|
||||
|
|
@ -81,6 +82,7 @@ categories:
|
|||
datasets:
|
||||
- dataset: color_cube_rotation
|
||||
- dataset: figlet_font
|
||||
- dataset: modulo_grid
|
||||
- dataset: needle_haystack
|
||||
- dataset: number_sequence
|
||||
- dataset: rectangle_count
|
||||
|
|
@ -91,9 +93,11 @@ categories:
|
|||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
- dataset: knight_swap
|
||||
- dataset: mahjong_puzzle
|
||||
- dataset: maze
|
||||
- dataset: mini_sudoku
|
||||
- dataset: n_queens
|
||||
- dataset: puzzle24
|
||||
- dataset: rush_hour
|
||||
- dataset: sokoban
|
||||
- dataset: sudoku
|
||||
|
|
@ -112,6 +116,7 @@ categories:
|
|||
- dataset: shortest_path
|
||||
- category: induction
|
||||
datasets:
|
||||
- dataset: acre
|
||||
- dataset: list_functions
|
||||
- category: logic
|
||||
datasets:
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ categories:
|
|||
- dataset: count_primes
|
||||
- dataset: cryptarithm
|
||||
- dataset: game_of_life
|
||||
- dataset: game_of_life_halting
|
||||
- dataset: graph_color
|
||||
- dataset: group_anagrams
|
||||
- dataset: isomorphic_strings
|
||||
|
|
@ -32,7 +33,7 @@ categories:
|
|||
- dataset: manipulate_matrix
|
||||
- dataset: number_filtering
|
||||
- dataset: number_sorting
|
||||
- dataset: palindrome
|
||||
- dataset: palindrome_generation
|
||||
- dataset: palindrome_partitioning
|
||||
- dataset: pool_matrix
|
||||
- dataset: ransom_note
|
||||
|
|
@ -81,6 +82,7 @@ categories:
|
|||
datasets:
|
||||
- dataset: color_cube_rotation
|
||||
- dataset: figlet_font
|
||||
- dataset: modulo_grid
|
||||
- dataset: needle_haystack
|
||||
- dataset: number_sequence
|
||||
- dataset: rectangle_count
|
||||
|
|
@ -91,9 +93,11 @@ categories:
|
|||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
- dataset: knight_swap
|
||||
- dataset: mahjong_puzzle
|
||||
- dataset: maze
|
||||
- dataset: mini_sudoku
|
||||
- dataset: n_queens
|
||||
- dataset: puzzle24
|
||||
- dataset: rush_hour
|
||||
- dataset: sokoban
|
||||
- dataset: sudoku
|
||||
|
|
@ -112,6 +116,7 @@ categories:
|
|||
- dataset: shortest_path
|
||||
- category: induction
|
||||
datasets:
|
||||
- dataset: acre
|
||||
- dataset: list_functions
|
||||
- category: logic
|
||||
datasets:
|
||||
|
|
|
|||
15
eval/yaml/deepseek-r1_algebra.yaml
Normal file
15
eval/yaml/deepseek-r1_algebra.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
output_dir: results/deepseek-r1_algebra
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
categories:
|
||||
- category: algebra
|
||||
datasets:
|
||||
- dataset: complex_arithmetic
|
||||
- dataset: intermediate_integration
|
||||
- dataset: polynomial_equations
|
||||
- dataset: polynomial_multiplication
|
||||
- dataset: simple_equations
|
||||
- dataset: simple_integration
|
||||
12
eval/yaml/deepseek-r1_arc.yaml
Normal file
12
eval/yaml/deepseek-r1_arc.yaml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
output_dir: results/deepseek-r1_arc
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
categories:
|
||||
- category: arc
|
||||
datasets:
|
||||
- dataset: arc_1d
|
||||
- dataset: arc_agi
|
||||
- dataset: rearc
|
||||
16
eval/yaml/deepseek-r1_logic.yaml
Normal file
16
eval/yaml/deepseek-r1_logic.yaml
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
output_dir: results/deepseek-r1_logic
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
categories:
|
||||
- category: logic
|
||||
datasets:
|
||||
- dataset: aiw
|
||||
- dataset: circuit_logic
|
||||
- dataset: knights_knaves
|
||||
- dataset: propositional_logic
|
||||
- dataset: self_reference
|
||||
- dataset: syllogism
|
||||
- dataset: zebra_puzzles
|
||||
|
|
@ -23,6 +23,7 @@ categories:
|
|||
- dataset: count_primes
|
||||
- dataset: cryptarithm
|
||||
- dataset: game_of_life
|
||||
- dataset: game_of_life_halting
|
||||
- dataset: graph_color
|
||||
- dataset: group_anagrams
|
||||
- dataset: isomorphic_strings
|
||||
|
|
@ -32,7 +33,7 @@ categories:
|
|||
- dataset: manipulate_matrix
|
||||
- dataset: number_filtering
|
||||
- dataset: number_sorting
|
||||
- dataset: palindrome
|
||||
- dataset: palindrome_generation
|
||||
- dataset: palindrome_partitioning
|
||||
- dataset: pool_matrix
|
||||
- dataset: ransom_note
|
||||
|
|
@ -81,6 +82,7 @@ categories:
|
|||
datasets:
|
||||
- dataset: color_cube_rotation
|
||||
- dataset: figlet_font
|
||||
- dataset: modulo_grid
|
||||
- dataset: needle_haystack
|
||||
- dataset: number_sequence
|
||||
- dataset: rectangle_count
|
||||
|
|
@ -91,9 +93,11 @@ categories:
|
|||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
- dataset: knight_swap
|
||||
- dataset: mahjong_puzzle
|
||||
- dataset: maze
|
||||
- dataset: mini_sudoku
|
||||
- dataset: n_queens
|
||||
- dataset: puzzle24
|
||||
- dataset: rush_hour
|
||||
- dataset: sokoban
|
||||
- dataset: sudoku
|
||||
|
|
@ -112,6 +116,7 @@ categories:
|
|||
- dataset: shortest_path
|
||||
- category: induction
|
||||
datasets:
|
||||
- dataset: acre
|
||||
- dataset: list_functions
|
||||
- category: logic
|
||||
datasets:
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ categories:
|
|||
- dataset: count_primes
|
||||
- dataset: cryptarithm
|
||||
- dataset: game_of_life
|
||||
- dataset: game_of_life_halting
|
||||
- dataset: graph_color
|
||||
- dataset: group_anagrams
|
||||
- dataset: isomorphic_strings
|
||||
|
|
@ -32,7 +33,7 @@ categories:
|
|||
- dataset: manipulate_matrix
|
||||
- dataset: number_filtering
|
||||
- dataset: number_sorting
|
||||
- dataset: palindrome
|
||||
- dataset: palindrome_generation
|
||||
- dataset: palindrome_partitioning
|
||||
- dataset: pool_matrix
|
||||
- dataset: ransom_note
|
||||
|
|
@ -81,6 +82,7 @@ categories:
|
|||
datasets:
|
||||
- dataset: color_cube_rotation
|
||||
- dataset: figlet_font
|
||||
- dataset: modulo_grid
|
||||
- dataset: needle_haystack
|
||||
- dataset: number_sequence
|
||||
- dataset: rectangle_count
|
||||
|
|
@ -91,9 +93,11 @@ categories:
|
|||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
- dataset: knight_swap
|
||||
- dataset: mahjong_puzzle
|
||||
- dataset: maze
|
||||
- dataset: mini_sudoku
|
||||
- dataset: n_queens
|
||||
- dataset: puzzle24
|
||||
- dataset: rush_hour
|
||||
- dataset: sokoban
|
||||
- dataset: sudoku
|
||||
|
|
@ -112,6 +116,7 @@ categories:
|
|||
- dataset: shortest_path
|
||||
- category: induction
|
||||
datasets:
|
||||
- dataset: acre
|
||||
- dataset: list_functions
|
||||
- category: logic
|
||||
datasets:
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ categories:
|
|||
- dataset: count_primes
|
||||
- dataset: cryptarithm
|
||||
- dataset: game_of_life
|
||||
- dataset: game_of_life_halting
|
||||
- dataset: graph_color
|
||||
- dataset: group_anagrams
|
||||
- dataset: isomorphic_strings
|
||||
|
|
@ -32,7 +33,7 @@ categories:
|
|||
- dataset: manipulate_matrix
|
||||
- dataset: number_filtering
|
||||
- dataset: number_sorting
|
||||
- dataset: palindrome
|
||||
- dataset: palindrome_generation
|
||||
- dataset: palindrome_partitioning
|
||||
- dataset: pool_matrix
|
||||
- dataset: ransom_note
|
||||
|
|
@ -81,6 +82,7 @@ categories:
|
|||
datasets:
|
||||
- dataset: color_cube_rotation
|
||||
- dataset: figlet_font
|
||||
- dataset: modulo_grid
|
||||
- dataset: needle_haystack
|
||||
- dataset: number_sequence
|
||||
- dataset: rectangle_count
|
||||
|
|
@ -91,9 +93,11 @@ categories:
|
|||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
- dataset: knight_swap
|
||||
- dataset: mahjong_puzzle
|
||||
- dataset: maze
|
||||
- dataset: mini_sudoku
|
||||
- dataset: n_queens
|
||||
- dataset: puzzle24
|
||||
- dataset: rush_hour
|
||||
- dataset: sokoban
|
||||
- dataset: sudoku
|
||||
|
|
@ -112,6 +116,7 @@ categories:
|
|||
- dataset: shortest_path
|
||||
- category: induction
|
||||
datasets:
|
||||
- dataset: acre
|
||||
- dataset: list_functions
|
||||
- category: logic
|
||||
datasets:
|
||||
Loading…
Add table
Add a link
Reference in a new issue