diff --git a/training/configs/intra_generalisation/games_qwen_3b.yaml b/training/configs/intra_generalisation/games_qwen_3b.yaml index 91e0725a..38229216 100644 --- a/training/configs/intra_generalisation/games_qwen_3b.yaml +++ b/training/configs/intra_generalisation/games_qwen_3b.yaml @@ -2,11 +2,11 @@ reasoning_gym: dataset_size: 20000 developer_prompt: DeepSeekZero datasets: - sudoku: + mini_sudoku: weight: 0.33 config: - min_empty: 30 - max_empty: 50 + min_empty: 8 + max_empty: 12 futoshiki: weight: 0.34 config: diff --git a/training/evaluations/eval_qwen_3b.yaml b/training/evaluations/eval_qwen_3b.yaml deleted file mode 100644 index 132989fb..00000000 --- a/training/evaluations/eval_qwen_3b.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Model configuration -model_path: Qwen/Qwen2.5-3B-Instruct # Change to the smaller model -max_tokens: 1024 # From max_response_length in training config -temperature: 0.7 # Lower temperature for more focused responses -top_p: 0.9 # From rollout top_p -developer_prompt: DeepSeekZero -developer_role: system # Standard role for system prompts - -# Output configuration -output_dir: results -save_metadata: true -save_full_results: true -eval_repeats: 3 - -# Categories and datasets to evaluate -categories: - - category: reasoning - datasets: - - dataset: decimal_chain_sum - size: 100 - seed: 42 - params: - min_terms: 2 - max_terms: 4 - min_digits: 1 - max_digits: 3 - min_decimal_places: 1 - max_decimal_places: 4 diff --git a/training/evaluations/eval_algebraic_composite.yaml b/training/evaluations/intra-generalisation/eval_algebraic_composite.yaml similarity index 100% rename from training/evaluations/eval_algebraic_composite.yaml rename to training/evaluations/intra-generalisation/eval_algebraic_composite.yaml diff --git a/training/evaluations/eval_algorithmic_composite.yaml b/training/evaluations/intra-generalisation/eval_algorithmic_composite.yaml similarity index 100% rename from training/evaluations/eval_algorithmic_composite.yaml rename to training/evaluations/intra-generalisation/eval_algorithmic_composite.yaml diff --git a/training/evaluations/eval_arithmetic_composite.yaml b/training/evaluations/intra-generalisation/eval_arithmetic_composite.yaml similarity index 100% rename from training/evaluations/eval_arithmetic_composite.yaml rename to training/evaluations/intra-generalisation/eval_arithmetic_composite.yaml diff --git a/training/evaluations/eval_cognition_composite.yaml b/training/evaluations/intra-generalisation/eval_cognition_composite.yaml similarity index 100% rename from training/evaluations/eval_cognition_composite.yaml rename to training/evaluations/intra-generalisation/eval_cognition_composite.yaml diff --git a/training/evaluations/eval_games_composite.yaml b/training/evaluations/intra-generalisation/eval_games_composite.yaml similarity index 68% rename from training/evaluations/eval_games_composite.yaml rename to training/evaluations/intra-generalisation/eval_games_composite.yaml index b183b6fd..a609a74b 100644 --- a/training/evaluations/eval_games_composite.yaml +++ b/training/evaluations/intra-generalisation/eval_games_composite.yaml @@ -1,12 +1,10 @@ -# Model configuration -model_path: ../utils/qwen3b_games -max_tokens: 1024 +model_path: ../utils/games +max_tokens: 2048 temperature: 0.6 # Lower temperature for more focused responses top_p: 0.9 # From rollout top_p developer_prompt: DeepSeekZero developer_role: system # Standard role for system prompts -# Output configuration output_dir: results save_metadata: true save_full_results: true @@ -16,9 +14,11 @@ eval_repeats: 3 categories: - category: reasoning datasets: - - dataset: mahjong_puzzle + - dataset: tower_of_hanoi size: 100 seed: 42 params: - min_num_rounds: 10 - max_num_rounds: 50 + min_disks: 3 + max_disks: 4 + min_pegs: 3 + max_pegs: 4