Eval script consolidation (#238)

The script now supports:
   - YAML and JSON configurations
   - Dataset-specific parameters
   - Overriding configuration via command line
   - Detailed logging and error handling
This commit is contained in:
Andreas Köpf 2025-02-27 17:39:14 +01:00 committed by GitHub
parent 8a66d2a216
commit 850c1cf6f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
40 changed files with 1111 additions and 670 deletions

View file

@ -1,31 +0,0 @@
model: anthropic/claude-3.5-sonnet
category: algorithmic
provider: Anthropic
datasets:
- count_primes
- game_of_life
- graph_color
- group_anagrams
- isomorphic_strings
- letter_counting
- letter_jumble
- manipulate_matrix
- number_filtering
- number_sorting
- palindrome
- pool_matrix
- ransom_note
- rotate_matrix
- sentence_reordering
- spell_backward
- spiral_matrix
- string_insertion
- string_manipulation
- string_synthesis
- word_ladder
- word_sequence_reversal
- word_sorting
eval_dir: eval/sonnet-3.5
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -0,0 +1,34 @@
# Combined configuration for Claude 3.5 Sonnet
model: "anthropic/claude-3.5-sonnet"
provider: "Anthropic"
output_dir: "results"
max_concurrent: 10
default_size: 50
default_seed: 45
categories:
  - category: "algorithmic"
    datasets:
      - dataset: "count_primes"
      - dataset: "game_of_life"
      - dataset: "graph_color"
      - dataset: "group_anagrams"
      - dataset: "isomorphic_strings"
      - dataset: "letter_counting"
      - dataset: "letter_jumble"
      - dataset: "manipulate_matrix"
      - dataset: "number_filtering"
      - dataset: "number_sorting"
      - dataset: "palindrome"
      - dataset: "pool_matrix"
      - dataset: "ransom_note"
      - dataset: "rotate_matrix"
      - dataset: "sentence_reordering"
      - dataset: "spell_backward"
      - dataset: "spiral_matrix"
      - dataset: "string_insertion"
      - dataset: "string_manipulation"
      - dataset: "string_synthesis"
      - dataset: "word_ladder"
      - dataset: "word_sequence_reversal"
      - dataset: "word_sorting"

View file

@ -0,0 +1,61 @@
# Combined configuration for deepseek-r1
model: "deepseek/deepseek-r1"
provider: "Nebius"
output_dir: "results"
max_concurrent: 10
default_size: 50
default_seed: 45
categories:
  - category: "algebra"
    datasets:
      - dataset: "intermediate_integration"
      - dataset: "polynomial_equations"
      - dataset: "polynomial_multiplication"
      - dataset: "simple_equations"
      - dataset: "simple_integration"
      - dataset: "complex_arithmetic"
  - category: "algorithmic"
    datasets:
      - dataset: "ab"
      - dataset: "base_conversion"
      - dataset: "binary_matrix"
      - dataset: "caesar_cipher"
      - dataset: "count_primes"
      - dataset: "game_of_life"
      - dataset: "graph_color"
      - dataset: "group_anagrams"
      - dataset: "isomorphic_strings"
      - dataset: "letter_counting"
      - dataset: "letter_jumble"
      - dataset: "manipulate_matrix"
      - dataset: "number_filtering"
      - dataset: "number_sorting"
      - dataset: "palindrome"
      - dataset: "pool_matrix"
      - dataset: "ransom_note"
      - dataset: "rotate_matrix"
      - dataset: "sentence_reordering"
      - dataset: "spell_backward"
      - dataset: "spiral_matrix"
      - dataset: "string_insertion"
      - dataset: "string_manipulation"
      - dataset: "string_synthesis"
      - dataset: "word_ladder"
      - dataset: "word_sequence_reversal"
      - dataset: "word_sorting"
  - category: "cognition"
    datasets:
      - dataset: "color_cube_rotation"
      - dataset: "figlet_font"
      - dataset: "number_sequence"
      - dataset: "rubiks_cube"
  - category: "logic"
    datasets:
      - dataset: "propositional_logic"
      - dataset: "self_reference"
      - dataset: "syllogism"
      - dataset: "zebra_puzzles"

View file

@ -1,9 +0,0 @@
model: anthropic/claude-3.7-sonnet # find model id: https://openrouter.ai/models
provider: Anthropic
category: test
datasets:
- YOUR_DATASET_NAME
eval_dir: results/test
dataset_size: 100
dataset_seed: 42
developer_role: system

View file

@ -0,0 +1,130 @@
# Combined configuration for llama-3.3-70b-instruct
model: "meta-llama/llama-3.3-70b-instruct"
provider: "Hyperbolic"
output_dir: "results"
max_concurrent: 10
default_size: 50
default_seed: 45
categories:
  - category: "algebra"
    datasets:
      - dataset: "intermediate_integration"
      - dataset: "polynomial_equations"
      - dataset: "polynomial_multiplication"
      - dataset: "simple_equations"
      - dataset: "simple_integration"
      - dataset: "complex_arithmetic"
  - category: "algorithmic"
    datasets:
      - dataset: "ab"
      - dataset: "base_conversion"
      - dataset: "binary_alternation"
      - dataset: "binary_matrix"
      - dataset: "caesar_cipher"
      - dataset: "count_primes"
      - dataset: "cryptarithm"
      - dataset: "game_of_life"
      - dataset: "graph_color"
      - dataset: "group_anagrams"
      - dataset: "isomorphic_strings"
      - dataset: "jugs"
      - dataset: "letter_counting"
      - dataset: "letter_jumble"
      - dataset: "manipulate_matrix"
      - dataset: "number_filtering"
      - dataset: "number_sorting"
      - dataset: "palindrome"
      - dataset: "palindrome_partitioning"
      - dataset: "pool_matrix"
      - dataset: "ransom_note"
      - dataset: "rotate_matrix"
      - dataset: "rotten_oranges"
      - dataset: "sentence_reordering"
      - dataset: "spell_backward"
      - dataset: "spiral_matrix"
      - dataset: "string_insertion"
      - dataset: "string_manipulation"
      - dataset: "string_splitting"
      - dataset: "string_synthesis"
      - dataset: "word_ladder"
      - dataset: "word_sequence_reversal"
      - dataset: "word_sorting"
  - category: "arc"
    datasets:
      - dataset: "arc_1d"
      - dataset: "arc_agi"
      - dataset: "rearc"
  - category: "arithmetic"
    datasets:
      - dataset: "basic_arithmetic"
      - dataset: "bitwise_arithmetic"
      - dataset: "calendar_arithmetic"
      - dataset: "chain_sum"
      - dataset: "count_bits"
      - dataset: "decimal_arithmetic"
      - dataset: "decimal_chain_sum"
      - dataset: "dice"
      - dataset: "fraction_simplification"
      - dataset: "gcd"
      - dataset: "gsm_symbolic"
      - dataset: "lcm"
      - dataset: "leg_counting"
      - dataset: "number_format"
      - dataset: "power_function"
      - dataset: "prime_factorization"
      - dataset: "products"
      - dataset: "time_intervals"
  - category: "code"
    datasets:
      - dataset: "bf"
  - category: "cognition"
    datasets:
      - dataset: "color_cube_rotation"
      - dataset: "figlet_font"
      - dataset: "needle_haystack"
      - dataset: "number_sequence"
      - dataset: "rectangle_count"
      - dataset: "rubiks_cube"
  - category: "games"
    datasets:
      - dataset: "countdown"
      - dataset: "emoji_mystery"
      - dataset: "futoshuki"
      - dataset: "knight_swap"
      - dataset: "maze"
      - dataset: "mini_sudoku"
      - dataset: "n_queens"
      - dataset: "sokoban"
      - dataset: "sudoku"
      - dataset: "tower_of_hanoi"
      - dataset: "tsumego"
  - category: "geometry"
    datasets:
      - dataset: "simple_geometry"
      - dataset: "advanced_geometry"
  - category: "graphs"
    datasets:
      - dataset: "course_schedule"
      - dataset: "family_relationships"
      - dataset: "largest_island"
      - dataset: "list_functions"
      - dataset: "quantum_lock"
      - dataset: "shortest_path"
  - category: "logic"
    datasets:
      - dataset: "aiw"
      - dataset: "circuit_logic"
      - dataset: "propositional_logic"
      - dataset: "self_reference"
      - dataset: "syllogism"
      - dataset: "zebra_puzzles"

View file

@ -1,14 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: algebra
datasets:
- intermediate_integration
- polynomial_equations
- polynomial_multiplication
- simple_equations
- simple_integration
- complex_arithmetic
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -1,41 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: algorithmic
datasets:
- ab
- base_conversion
- binary_alternation
- binary_matrix
- caesar_cipher
- count_primes
- cryptarithm
- game_of_life
- graph_color
- group_anagrams
- isomorphic_strings
- jugs
- letter_counting
- letter_jumble
- manipulate_matrix
- number_filtering
- number_sorting
- palindrome
- palindrome_partitioning
- pool_matrix
- ransom_note
- rotate_matrix
- rotten_oranges
- sentence_reordering
- spell_backward
- spiral_matrix
- string_insertion
- string_manipulation
- string_splitting
- string_synthesis
- word_ladder
- word_sequence_reversal
- word_sorting
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,11 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: arc
datasets:
- arc_1d
- arc_agi
- rearc
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,26 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: arithmetic
datasets:
- basic_arithmetic
- bitwise_arithmetic
- calendar_arithmetic
- chain_sum
- count_bits
- decimal_arithmetic
- decimal_chain_sum
- dice
- fraction_simplification
- gcd
- gsm_symbolic
- lcm
- leg_counting
- number_format
- power_function
- prime_factorization
- products
- time_intervals
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,9 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: code
datasets:
- bf
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: cognition
datasets:
- color_cube_rotation
- figlet_font
- needle_haystack
- number_sequence
- rectangle_count
- rubiks_cube
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,19 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: games
datasets:
- countdown
- emoji_mystery
- futoshuki
- knight_swap
- maze
- mini_sudoku
- n_queens
- sokoban
- sudoku
- tower_of_hanoi
- tsumego
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,10 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: geometry
datasets:
- simple_geometry
- advanced_geometry
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: graphs
datasets:
- course_schedule
- family_relationships
- largest_island
- list_functions
- quantum_lock
- shortest_path
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: meta-llama/llama-3.3-70b-instruct
provider: Hyperbolic
category: logic
datasets:
- aiw
- circuit_logic
- propositional_logic
- self_reference
- syllogism
- zebra_puzzles
eval_dir: results/llama-3.3-70b-instruct
dataset_size: 50
dataset_seed: 45
developer_role: system

126
eval/yaml/openai-o3.yaml Normal file
View file

@ -0,0 +1,126 @@
# Combined configuration for openai/o3-mini
model: "openai/o3-mini"
provider: "OpenAI"
output_dir: "results"
max_concurrent: 10
default_size: 50
default_seed: 45
categories:
  - category: "algebra"
    datasets:
      - dataset: "complex_arithmetic"
      - dataset: "intermediate_integration"
      - dataset: "polynomial_equations"
      - dataset: "polynomial_multiplication"
      - dataset: "simple_equations"
      - dataset: "simple_integration"
  - category: "algorithmic"
    datasets:
      - dataset: "ab"
      - dataset: "binary_alternation"
      - dataset: "base_conversion"
      - dataset: "binary_matrix"
      - dataset: "caesar_cipher"
      - dataset: "count_primes"
      - dataset: "cryptarithm"
      - dataset: "game_of_life"
      - dataset: "graph_color"
      - dataset: "group_anagrams"
      - dataset: "isomorphic_strings"
      - dataset: "letter_counting"
      - dataset: "letter_jumble"
      - dataset: "manipulate_matrix"
      - dataset: "number_filtering"
      - dataset: "number_sorting"
      - dataset: "palindrome"
      - dataset: "pool_matrix"
      - dataset: "ransom_note"
      - dataset: "rotate_matrix"
      - dataset: "sentence_reordering"
      - dataset: "spell_backward"
      - dataset: "spiral_matrix"
      - dataset: "string_insertion"
      - dataset: "string_manipulation"
      - dataset: "string_synthesis"
      - dataset: "word_ladder"
      - dataset: "word_sequence_reversal"
      - dataset: "word_sorting"
  - category: "arc"
    datasets:
      - dataset: "arc_1d"
      - dataset: "arc_agi"
      - dataset: "rearc"
  - category: "arithmetic"
    datasets:
      - dataset: "basic_arithmetic"
      - dataset: "bitwise_arithmetic"
      - dataset: "calendar_arithmetic"
      - dataset: "chain_sum"
      - dataset: "count_bits"
      - dataset: "decimal_arithmetic"
      - dataset: "decimal_chain_sum"
      - dataset: "dice"
      - dataset: "fraction_simplification"
      - dataset: "gcd"
      - dataset: "gsm_symbolic"
      - dataset: "lcm"
      - dataset: "leg_counting"
      - dataset: "number_format"
      - dataset: "power_function"
      - dataset: "prime_factorization"
      - dataset: "products"
      - dataset: "time_intervals"
  - category: "code"
    datasets:
      - dataset: "bf"
  - category: "cognition"
    datasets:
      - dataset: "color_cube_rotation"
      - dataset: "figlet_font"
      - dataset: "needle_haystack"
      - dataset: "number_sequence"
      - dataset: "rectangle_count"
      - dataset: "rubiks_cube"
  - category: "games"
    datasets:
      - dataset: "countdown"
      - dataset: "emoji_mystery"
      - dataset: "futoshuki"
      - dataset: "knight_swap"
      - dataset: "maze"
      - dataset: "mini_sudoku"
      - dataset: "n_queens"
      - dataset: "sokoban"
      - dataset: "sudoku"
      - dataset: "tower_of_hanoi"
      - dataset: "tsumego"
  - category: "geometry"
    datasets:
      - dataset: "simple_geometry"
      - dataset: "advanced_geometry"
  - category: "graphs"
    datasets:
      - dataset: "course_schedule"
      - dataset: "family_relationships"
      - dataset: "largest_island"
      - dataset: "list_functions"
      - dataset: "quantum_lock"
      - dataset: "shortest_path"
  - category: "logic"
    datasets:
      - dataset: "aiw"
      - dataset: "circuit_logic"
      - dataset: "propositional_logic"
      - dataset: "self_reference"
      - dataset: "syllogism"
      - dataset: "zebra_puzzles"

View file

@ -1,14 +0,0 @@
model: openai/o3-mini
category: algebra
provider: OpenAI
datasets:
- complex_arithmetic
- intermediate_integration
- polynomial_equations
- polynomial_multiplication
- simple_equations
- simple_integration
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,37 +0,0 @@
model: openai/o3-mini
category: algorithmic
provider: OpenAI
datasets:
- ab
- binary_alternation
- base_conversion
- binary_matrix
- caesar_cipher
- count_primes
- cryptarithm
- game_of_life
- graph_color
- group_anagrams
- isomorphic_strings
- letter_counting
- letter_jumble
- manipulate_matrix
- number_filtering
- number_sorting
- palindrome
- pool_matrix
- ransom_note
- rotate_matrix
- sentence_reordering
- spell_backward
- spiral_matrix
- string_insertion
- string_manipulation
- string_synthesis
- word_ladder
- word_sequence_reversal
- word_sorting
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,11 +0,0 @@
model: openai/o3-mini
category: arc
provider: OpenAI
datasets:
- arc_1d
- arc_agi
- rearc
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,26 +0,0 @@
model: openai/o3-mini
category: arithmetic
provider: OpenAI
datasets:
- basic_arithmetic
- bitwise_arithmetic
- calendar_arithmetic
- chain_sum
- count_bits
- decimal_arithmetic
- decimal_chain_sum
- dice
- fraction_simplification
- gcd
- gsm_symbolic
- lcm
- leg_counting
- number_format
- power_function
- prime_factorization
- products
- time_intervals
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,9 +0,0 @@
model: openai/o3-mini
category: code
provider: OpenAI
datasets:
- bf
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: openai/o3-mini
category: cognition
provider: OpenAI
datasets:
- color_cube_rotation
- figlet_font
- needle_haystack
- number_sequence
- rectangle_count
- rubiks_cube
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,19 +0,0 @@
model: openai/o3-mini
category: games
provider: OpenAI
datasets:
- countdown
- emoji_mystery
- futoshuki
- knight_swap
- maze
- mini_sudoku
- n_queens
- sokoban
- sudoku
- tower_of_hanoi
- tsumego
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,10 +0,0 @@
model: openai/o3-mini
category: geometry
provider: OpenAI
datasets:
- simple_geometry
- advanced_geometry
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: openai/o3-mini
category: graphs
provider: OpenAI
datasets:
- course_schedule
- family_relationships
- largest_island
- list_functions
- quantum_lock
- shortest_path
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: openai/o3-mini
category: logic
provider: OpenAI
datasets:
- aiw
- circuit_logic
- propositional_logic
- self_reference
- syllogism
- zebra_puzzles
eval_dir: results/openai-03
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,14 +0,0 @@
model: deepseek/deepseek-r1
provider: Nebius
category: algebra
datasets:
- intermediate_integration
- polynomial_equations
- polynomial_multiplication
- simple_equations
- simple_integration
- complex_arithmetic
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -1,35 +0,0 @@
model: deepseek/deepseek-r1
provider: Nebius
category: algorithmic
datasets:
- ab
- base_conversion
- binary_matrix
- caesar_cipher
- count_primes
- game_of_life
- graph_color
- group_anagrams
- isomorphic_strings
- letter_counting
- letter_jumble
- manipulate_matrix
- number_filtering
- number_sorting
- palindrome
- pool_matrix
- ransom_note
- rotate_matrix
- sentence_reordering
- spell_backward
- spiral_matrix
- string_insertion
- string_manipulation
- string_synthesis
- word_ladder
- word_sequence_reversal
- word_sorting
eval_dir: results/r1
dataset_size: 50
dataset_seed: 45
developer_role: system

View file

@ -1,12 +0,0 @@
model: deepseek/deepseek-r1
provider: Nebius
category: cognition
datasets:
- color_cube_rotation
- figlet_font
- number_sequence
- rubiks_cube
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -1,12 +0,0 @@
model: deepseek/deepseek-r1
provider: Nebius
category: logic
datasets:
- propositional_logic
- self_reference
- syllogism
- zebra_puzzles
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system