(evals): Medium configs (#415)

* updated medium configs
* fix problematic curriculum values / small issues causing exceptions to be raised
* optimus alpha config
* all configs so far
* fix tests
2026-04-19 12:58:07 +00:00 · 2025-04-14 08:25:31 +02:00 · 2025-04-14 08:25:31 +02:00 · 290bfc4fdd
commit 290bfc4fdd
parent cd1a9ea58b
25 changed files with 7050 additions and 63 deletions
--- a/eval/yaml/medium/claude-3.5-sonnet.yaml
+++ b/eval/yaml/medium/claude-3.5-sonnet.yaml
@@ -109,7 +109,7 @@ categories:
  - dataset: jugs
    params:
      num_jugs: 4
-      difficulty: 50
+      difficulty: 10
  - dataset: letter_counting
    params:
      min_words: 25
@@ -152,10 +152,10 @@ categories:
      max_length: 100
  - dataset: palindrome_partitioning
    params:
-      min_string_len: 50
-      max_string_len: 100
-      min_substring_palindrome_len: 5
-      max_substring_palindrome_len: 10
+      min_string_len: 5
+      max_string_len: 15
+      min_substring_palindrome_len: 1
+      max_substring_palindrome_len: 5
  - dataset: pool_matrix
    params:
      min_rows: 25
@@ -234,8 +234,8 @@ categories:
      mirrors_weights: [0.2, 0.2, 0.2, 0.2, 0.2]
  - dataset: rearc
    params:
-      pso_difficulty_weights: [0, 0, 0, 1, 0, 0, 0, 0]
-      rng_difficulty_weights: [0, 0, 0, 1, 0, 0, 0, 0]
+      pso_difficulty_weights: [0, 0, 0, 1, 0, 0, 0]
+      rng_difficulty_weights: [0, 0, 0, 1, 0, 0, 0]
 - category: arithmetic
  datasets:
  - dataset: basic_arithmetic
@@ -361,8 +361,8 @@ categories:
      max_num_statements: 500
  - dataset: number_sequence
    params:
-      min_terms: 8
-      max_terms: 12
+      min_terms: 5
+      max_terms: 10
      min_value: -500
      max_value: 500
      max_complexity: 3
@@ -378,16 +378,16 @@ categories:
  datasets:
  - dataset: countdown
    params:
-      min_numbers: 6
+      min_numbers: 3
      max_numbers: 9
      min_target: 100
      max_target: 1000
      min_value: 1
-      max_value: 250
+      max_value: 100
  - dataset: emoji_mystery
    params:
-      min_words_in_sentence: 20
-      max_words_in_sentence: 40
+      min_words_in_sentence: 10
+      max_words_in_sentence: 30
  - dataset: futoshiki
    params:
      min_board_size: 6
@@ -410,8 +410,8 @@ categories:
    params:
      min_grid_size: 25
      max_grid_size: 50
-      min_dist: 25
-      max_dist: 50
+      min_dist: 10
+      max_dist: 15
  - dataset: mini_sudoku
    params:
      min_empty: 6