update eval yaml config files

This commit is contained in:
Andreas Koepf 2025-03-10 00:48:32 +01:00
parent a49463c323
commit 4109b5b72c
9 changed files with 79 additions and 6 deletions

View file

@ -23,6 +23,7 @@ categories:
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: game_of_life_halting
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
@ -32,7 +33,7 @@ categories:
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_generation
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
@ -81,6 +82,7 @@ categories:
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: modulo_grid
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
@ -91,9 +93,11 @@ categories:
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: mahjong_puzzle
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: puzzle24
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
@ -112,6 +116,7 @@ categories:
- dataset: shortest_path
- category: induction
datasets:
- dataset: acre
- dataset: list_functions
- category: logic
datasets:

View file

@ -23,6 +23,7 @@ categories:
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: game_of_life_halting
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
@ -32,7 +33,7 @@ categories:
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_generation
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
@ -81,6 +82,7 @@ categories:
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: modulo_grid
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
@ -91,9 +93,11 @@ categories:
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: mahjong_puzzle
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: puzzle24
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
@ -112,6 +116,7 @@ categories:
- dataset: shortest_path
- category: induction
datasets:
- dataset: acre
- dataset: list_functions
- category: logic
datasets:

View file

@ -23,6 +23,7 @@ categories:
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: game_of_life_halting
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
@ -32,7 +33,7 @@ categories:
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_generation
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
@ -81,6 +82,7 @@ categories:
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: modulo_grid
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
@ -91,9 +93,11 @@ categories:
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: mahjong_puzzle
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: puzzle24
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
@ -112,6 +116,7 @@ categories:
- dataset: shortest_path
- category: induction
datasets:
- dataset: acre
- dataset: list_functions
- category: logic
datasets:

View file

@ -0,0 +1,15 @@
# Evaluation config for model `deepseek/deepseek-r1`, limited to the
# `algebra` category. Results are written under `output_dir`.
model: deepseek/deepseek-r1
provider: Nebius
output_dir: results/deepseek-r1_algebra
max_concurrent: 10
# NOTE(review): presumably the fallback sample count and seed for dataset
# entries that do not set their own — confirm against the eval runner.
default_size: 50
default_seed: 45
categories:
  # `datasets` belongs to the category entry, so it is nested inside the
  # list item instead of sitting at the top level of the document.
  - category: algebra
    datasets:
      - dataset: complex_arithmetic
      - dataset: intermediate_integration
      - dataset: polynomial_equations
      - dataset: polynomial_multiplication
      - dataset: simple_equations
      - dataset: simple_integration

View file

@ -0,0 +1,12 @@
# Evaluation config for model `deepseek/deepseek-r1`, limited to the
# `arc` category. Results are written under `output_dir`.
model: deepseek/deepseek-r1
provider: Nebius
output_dir: results/deepseek-r1_arc
max_concurrent: 10
# NOTE(review): presumably the fallback sample count and seed for dataset
# entries that do not set their own — confirm against the eval runner.
default_size: 50
default_seed: 45
categories:
  # `datasets` belongs to the category entry, so it is nested inside the
  # list item instead of sitting at the top level of the document.
  - category: arc
    datasets:
      - dataset: arc_1d
      - dataset: arc_agi
      - dataset: rearc

View file

@ -0,0 +1,16 @@
# Evaluation config for model `deepseek/deepseek-r1`, limited to the
# `logic` category. Results are written under `output_dir`.
model: deepseek/deepseek-r1
provider: Nebius
output_dir: results/deepseek-r1_logic
max_concurrent: 10
# NOTE(review): presumably the fallback sample count and seed for dataset
# entries that do not set their own — confirm against the eval runner.
default_size: 50
default_seed: 45
categories:
  # `datasets` belongs to the category entry, so it is nested inside the
  # list item instead of sitting at the top level of the document.
  - category: logic
    datasets:
      - dataset: aiw
      - dataset: circuit_logic
      - dataset: knights_knaves
      - dataset: propositional_logic
      - dataset: self_reference
      - dataset: syllogism
      - dataset: zebra_puzzles

View file

@ -23,6 +23,7 @@ categories:
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: game_of_life_halting
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
@ -32,7 +33,7 @@ categories:
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_generation
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
@ -81,6 +82,7 @@ categories:
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: modulo_grid
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
@ -91,9 +93,11 @@ categories:
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: mahjong_puzzle
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: puzzle24
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
@ -112,6 +116,7 @@ categories:
- dataset: shortest_path
- category: induction
datasets:
- dataset: acre
- dataset: list_functions
- category: logic
datasets:

View file

@ -23,6 +23,7 @@ categories:
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: game_of_life_halting
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
@ -32,7 +33,7 @@ categories:
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_generation
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
@ -81,6 +82,7 @@ categories:
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: modulo_grid
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
@ -91,9 +93,11 @@ categories:
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: mahjong_puzzle
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: puzzle24
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
@ -112,6 +116,7 @@ categories:
- dataset: shortest_path
- category: induction
datasets:
- dataset: acre
- dataset: list_functions
- category: logic
datasets:

View file

@ -23,6 +23,7 @@ categories:
- dataset: count_primes
- dataset: cryptarithm
- dataset: game_of_life
- dataset: game_of_life_halting
- dataset: graph_color
- dataset: group_anagrams
- dataset: isomorphic_strings
@ -32,7 +33,7 @@ categories:
- dataset: manipulate_matrix
- dataset: number_filtering
- dataset: number_sorting
- dataset: palindrome
- dataset: palindrome_generation
- dataset: palindrome_partitioning
- dataset: pool_matrix
- dataset: ransom_note
@ -81,6 +82,7 @@ categories:
datasets:
- dataset: color_cube_rotation
- dataset: figlet_font
- dataset: modulo_grid
- dataset: needle_haystack
- dataset: number_sequence
- dataset: rectangle_count
@ -91,9 +93,11 @@ categories:
- dataset: emoji_mystery
- dataset: futoshiki
- dataset: knight_swap
- dataset: mahjong_puzzle
- dataset: maze
- dataset: mini_sudoku
- dataset: n_queens
- dataset: puzzle24
- dataset: rush_hour
- dataset: sokoban
- dataset: sudoku
@ -112,6 +116,7 @@ categories:
- dataset: shortest_path
- category: induction
datasets:
- dataset: acre
- dataset: list_functions
- category: logic
datasets: