Feat/intragen experiments (#414)

* added curriculum

* readapted readme

* corrected small errors

* Delete eval/eval/r1/algorithmic/word_sorting.json

* removed redundant argument

* added spell

* removed duplicated fit

* changed config

* added composite changes

* added composite changes

* updated yaml

* added spell backward

* updated read me

* added qwen2.5

* added

* Add files via upload

* updated missing trainer func

* updated curr

* updated spell back

* updated correctness score func

* updated configs

* added local evals

* added updates

* updated datasets

* added fsdp to hf utility

* added algorithmic qwen 3b yaml

* updated read me

* updated configs

* added preappend token

* updated with thinking token

* updated test score board

* resolved comments

* added evaluation scripts

* removed results from pr

* added config

* added partial reward scoring

* added evaluation composites

* added training configs

* added games eval

* added rubriks cube

* resolved merge cinflicts

* added games config

* added latest eval configs

* updated strucutre

* Delete training/evaluations/eval_graphs_composite.yaml

---------

Co-authored-by: joesharratt1229 <joesharrat1229@gmail.com>
This commit is contained in:
joesharratt1229 2025-04-16 07:04:52 +01:00 committed by GitHub
parent 224532f12a
commit d0ef136d5b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 1331 additions and 48 deletions

View file

@ -170,7 +170,7 @@ class NumberSortingCurriculum(BaseCurriculum):
self._define_attributes( self._define_attributes(
RangeAttributeDefinition( RangeAttributeDefinition(
name="numbers", name="numbers",
levels=[10, 50, 100, 200], levels=[10, 100, 500, 1000],
description="How many numbers to sort", description="How many numbers to sort",
lower_field_name="min_numbers", lower_field_name="min_numbers",
upper_field_name="max_numbers", upper_field_name="max_numbers",

View file

@ -2,7 +2,7 @@ from dataclasses import dataclass
from random import Random from random import Random
from typing import Any, Literal, Optional from typing import Any, Literal, Optional
from ..coaching import BaseCurriculum, RangeAttributeDefinition from ..coaching import BaseCurriculum, RangeAttributeDefinition, ScalarAttributeDefinition
from ..factory import ProceduralDataset, register_dataset from ..factory import ProceduralDataset, register_dataset
DATASET_NAME = "basic_arithmetic" DATASET_NAME = "basic_arithmetic"
@ -250,17 +250,19 @@ class BasicArithmeticCurriculum(BaseCurriculum):
self._define_attributes( self._define_attributes(
RangeAttributeDefinition( RangeAttributeDefinition(
name="num_terms", name="num_terms",
levels=[2, 5, 10, 15], levels=[2, 3, 4, 5, 6],
description="Number of terms in the expression", description="Number of terms in the expression",
lower_field_name="min_terms", lower_field_name="min_terms",
upper_field_name="max_terms", upper_field_name="max_terms",
ensure_interval=False,
), ),
RangeAttributeDefinition( RangeAttributeDefinition(
name="num_digits", name="num_digits",
levels=[1, 2, 5, 10], levels=[1, 2, 3, 4],
description="Number of digits in the numbers", description="Number of digits in the numbers",
lower_field_name="min_digits", lower_field_name="min_digits",
upper_field_name="max_digits", upper_field_name="max_digits",
ensure_interval=False,
), ),
) )

View file

@ -16,6 +16,8 @@ Now, it's your turn. How many rectangles do you see in the grid below?
""" """
DATASET_NAME = "rectangle_count" DATASET_NAME = "rectangle_count"
CONST_TERM = 0.8
D = 5
def draw_rectangles_with_overlap(n, width, height, rng): def draw_rectangles_with_overlap(n, width, height, rng):
@ -132,22 +134,29 @@ class RectangleCountDataset(ProceduralDataset):
} }
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
"""Determine if the solution provided solves the RectangleCount task. """Determine if the solution provided solves the RectangleCount task,
awarding partial credit if the guess is close.
The function awards 1.0 for a correct answer.
Args:
answer (Optional[str]): The user's answer.
entry (dict[str, Any]): The original dataset entry containing the correct answer.
Returns: Returns:
float: The computed score between 0.0 and 1.0. float: A score between 0.0 and 1.0.
""" """
correct_str = entry["answer"].lower().replace("\n", "")
if isinstance(answer, str): try:
if answer.lower().replace("\n", "") == entry["answer"].lower().replace("\n", ""): correct_val = int(correct_str)
return 1.0 # Yay user_val = int(answer.strip())
return 0.0 except (ValueError, TypeError, AttributeError):
return 0.0
distance = abs(user_val - correct_val)
if distance == 0:
return 1.0
if distance >= D:
return 0.0
score = 1.0 - (distance / float(D))
score = CONST_TERM * score
return max(0.0, score)
class RectangleCountCurriculum(BaseCurriculum): class RectangleCountCurriculum(BaseCurriculum):

View file

@ -121,29 +121,49 @@ class RubiksCubeDataset(ProceduralDataset):
}, },
} }
def partial_score(self, cube: Cube) -> float:
"""
Returns a fraction between 0 and 1, indicating how many stickers are
correctly positioned (i.e., match the solved color for that face).
"""
total_stickers = 6 * (cube.size**2)
correct_stickers = 0
for face_index in range(6):
face = cube.faces[face_index]
solved_color = face[cube.size // 2][cube.size // 2].color
for row in range(cube.size):
for col in range(cube.size):
sticker = face[row][col]
if sticker.color == solved_color:
correct_stickers += 1
return correct_stickers / total_stickers
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
"""Determine if the solution provided solves the cube""" """Determine if the solution provided solves the cube, with partial rewards."""
reward = 0.0 # default reward reward = 0.0 # default
if answer is not None: if answer is not None:
# Reconstruct the test cube
eval_cube = Cube(entry["metadata"]["cube_size"]) eval_cube = Cube(entry["metadata"]["cube_size"])
eval_cube.rotate(entry["metadata"]["scramble_moves"]) eval_cube.rotate(entry["metadata"]["scramble_moves"])
# Test the solution
try: try:
expanded_answer = self.expand_moves(answer) expanded_answer = self.expand_moves(answer)
eval_cube.rotate(expanded_answer) eval_cube.rotate(expanded_answer)
solved = eval_cube.is_done()
# 3) Check if fully solved
solved = eval_cube.is_done()
if solved: if solved:
reward = 1.0 reward = 1.0
elif len(answer.strip()) > 0: # encourage non-empty answers
reward = 0.05 # Incorrect, but rotate could parse the answer
else: else:
reward = 0.01 partial = self.partial_score(eval_cube)
except:
reward = 0.01 # At least you tried
if len(answer.strip()) > 0:
reward = max(0.05, partial)
else:
reward = max(0.01, partial)
except:
reward = 0.01
return reward return reward
def remove_ansi(self, line): def remove_ansi(self, line):

View file

@ -99,6 +99,7 @@ Here is your puzzle:
"source_dataset": DATASET_NAME, "source_dataset": DATASET_NAME,
"source_index": idx, "source_index": idx,
"gamestr": gamestr, "gamestr": gamestr,
"source_dataset": DATASET_NAME,
"width": puzzle_data["width"], "width": puzzle_data["width"],
"height": puzzle_data["height"], "height": puzzle_data["height"],
"difficulty": { "difficulty": {

View file

@ -6,6 +6,7 @@ from reasoning_gym.arithmetic.basic_arithmetic import (
BasicArithmeticDatasetConfig, BasicArithmeticDatasetConfig,
eval_floordiv, eval_floordiv,
) )
from reasoning_gym.coaching.base_curriculum import DefaultCurriculumContext, RangeAttributeMode
def test_arithmetic_dataset_config_validation(): def test_arithmetic_dataset_config_validation():
@ -103,7 +104,7 @@ def test_basic_arithmetic_curriculum():
"""Test the BasicArithmeticCurriculum functionality""" """Test the BasicArithmeticCurriculum functionality"""
curriculum = BasicArithmeticCurriculum() curriculum = BasicArithmeticCurriculum()
base_value = {"size": 150, "seed": 1} base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1}
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value) base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value)
assert base_cfg.seed == 1 assert base_cfg.seed == 1
@ -115,7 +116,7 @@ def test_basic_arithmetic_curriculum():
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_digits") curriculum.increment_attr_level("num_digits")
increased_cfg = curriculum.generate_configuration(base_value) increased_cfg = curriculum.generate_configuration(base_value)
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 5 assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2
# Test decrementing attribute level for num_terms # Test decrementing attribute level for num_terms
@ -128,7 +129,7 @@ def test_basic_arithmetic_curriculum():
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
higher_level_cfg = curriculum.generate_configuration(base_value) higher_level_cfg = curriculum.generate_configuration(base_value)
assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 10 assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 4
assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2 assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2
# Test boundary conditions - trying to decrement below level 0 # Test boundary conditions - trying to decrement below level 0
@ -144,5 +145,26 @@ def test_basic_arithmetic_curriculum():
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_digits") curriculum.increment_attr_level("num_digits")
upper_bound_cfg = curriculum.generate_configuration(base_value) upper_bound_cfg = curriculum.generate_configuration(base_value)
assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 15 assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 6
assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 10 assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 4
def test_basic_arithmetic_curriculum_upper_bound():
curriculum = BasicArithmeticCurriculum()
base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1}
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(
base_value, context=DefaultCurriculumContext(mode=RangeAttributeMode.UPPER_BOUND)
)
assert base_cfg.seed == 1
assert base_cfg.size == 150
assert base_cfg.min_terms == 2 and base_cfg.max_terms == 2
assert base_cfg.min_digits == 1 and base_cfg.max_digits == 1
# Test incrementing attribute levels
curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_digits")
increased_cfg = curriculum.generate_configuration(base_value)
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2

View file

@ -56,6 +56,7 @@ def test_number_sorting_dataset_items():
# Verify number count constraints # Verify number count constraints
numbers = item["metadata"]["original_numbers"] numbers = item["metadata"]["original_numbers"]
print(numbers)
assert len(numbers) >= config.min_numbers assert len(numbers) >= config.min_numbers
assert len(numbers) <= config.max_numbers assert len(numbers) <= config.max_numbers
@ -99,7 +100,7 @@ def test_number_sorting_curriculum():
base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value) base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value)
assert base_cfg.seed == 1 assert base_cfg.seed == 1
assert base_cfg.size == 150 assert base_cfg.size == 150
assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 50 assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 100
assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1 assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1
assert base_cfg.min_value == -100 and base_cfg.max_value == 100 assert base_cfg.min_value == -100 and base_cfg.max_value == 100
@ -107,14 +108,14 @@ def test_number_sorting_curriculum():
curriculum.increment_attr_level("numbers") curriculum.increment_attr_level("numbers")
curriculum.increment_attr_level("decimals") curriculum.increment_attr_level("decimals")
increased_cfg = curriculum.generate_configuration(base_value) increased_cfg = curriculum.generate_configuration(base_value)
assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 100 assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 500
assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2 assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2
assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100 assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100
# test decrementing attribute level for numbers again # test decrementing attribute level for numbers again
curriculum.decrement_attr_level("numbers") curriculum.decrement_attr_level("numbers")
partially_decreased_cfg = curriculum.generate_configuration(base_value) partially_decreased_cfg = curriculum.generate_configuration(base_value)
assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 50 assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 100
assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2 assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2
assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100 assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100

View file

@ -55,9 +55,9 @@ def test_rubikscube_items():
assert dataset.score_answer(answer=None, entry=item) == 0.0 assert dataset.score_answer(answer=None, entry=item) == 0.0
if item["metadata"]["example_correct_answer"] != "R": if item["metadata"]["example_correct_answer"] != "R":
assert dataset.score_answer(answer="R", entry=item) == 0.05 assert dataset.score_answer(answer="R", entry=item) == 0.01
assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.05 assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.01
if len(item["metadata"]["example_correct_answer"]) > 0: if len(item["metadata"]["example_correct_answer"]) > 0:
assert dataset.score_answer(answer="", entry=item) == 0.01 assert dataset.score_answer(answer="", entry=item) == 0.01

View file

@ -87,6 +87,7 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_
From here you may to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you csn run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a hugginface checkpoint at `training/utils/qwen3b_500` (note that you have to convert to fsdp checkpoint to hf checkpoint for evaluation script to work as shown in the previous step). From here you may to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you csn run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a hugginface checkpoint at `training/utils/qwen3b_500` (note that you have to convert to fsdp checkpoint to hf checkpoint for evaluation script to work as shown in the previous step).
## Run the script ## Run the script
export VLLM_ATTENTION_BACKEND=XFORMERS
Navigate to evaluations directory: Navigate to evaluations directory:
``` ```
python evaluate_model.py --config path-to-yaml python evaluate_model.py --config path-to-yaml

View file

@ -0,0 +1,221 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
simple_equations:
weight: 0.5
config:
min_terms: 2
max_terms: 4
min_value: 1
max_value: 100
polynomial_multiplication:
weight: 0.5
config:
min_terms: 2
max_terms: 4
min_value: 1
max_value: 100
min_degree: 0
max_degree: 3
min_polynomials: 2
max_polynomials: 3
curriculum:
enabled: False
schedule:
automatic: True
update_steps: 30 # automatic curriculum updating after 50 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: 500 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not use for opensource
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_algebra_qwen_3b_composite
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,224 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
fraction_simplification:
weight: 0.33
config:
min_value: 1
max_value: 1000
min_factor: 1
max_factor: 100
gcd:
weight: 0.34
config:
min_numbers: 2 # Minimum numbers to find GCD of
max_numbers: 2 # Maximum numbers to find GCD of
min_value: 1 # Minimum value for each number
max_value: 1000 # Maximum value for each number
lcm:
weight: 0.33
config:
min_numbers: 2
max_numbers: 2
min_value: 1
max_value: 100
curriculum:
enabled: False
schedule:
automatic: True
update_steps: 30
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: 500 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not use for opensource
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_arithmetic_qwen_3b_composite
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,219 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
rubiks_cube:
weight: 0.33
config:
min_scramble_steps: 3
max_scramble_steps: 10
figlet_font:
weight: 0.34
config:
min_word_len: 3
max_word_len: 7
rectangle_count:
weight: 0.33
config:
max_rectangles: 10
width: 80
height: 80
curriculum:
enabled: False
schedule:
automatic: True
      update_steps: 30 # automatic curriculum update every 30 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: 500 # must be overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_cognition_qwen_3b_composite_test
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # one of: auto / disable / resume_path (requires resume_from_path)
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program at runtime
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,225 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
sudoku:
weight: 0.33
config:
min_empty: 30
max_empty: 50
futoshiki:
weight: 0.34
config:
min_board_size: 4 # Board will be NxN where N is this value
max_board_size: 9
min_difficulty: 0
max_difficulty: 3
sokoban:
weight: 0.33
config:
min_w: 6 # Minimum width of the puzzle
min_h: 6 # Minimum height of the puzzle
max_w: 10 # Maximum width of the puzzle
max_h: 10 # Maximum height of the puzzle
min_boxes: 4 # Minimum number of boxes
max_boxes: 10 # Maximum number of boxes
max_depth: 80 # Maximum search depth
curriculum:
enabled: False
schedule:
automatic: True
      update_steps: 30 # automatic curriculum update every 30 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: 500 # must be overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_games_qwen_3b_composite
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # one of: auto / disable / resume_path (requires resume_from_path)
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program at runtime
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,226 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
shortest_path:
weight: 0.33
config:
min_rows: 5
max_rows: 8
min_cols: 5
max_cols: 8
p_blocked: 0.4
largest_island:
weight: 0.34
config:
min_rows: 5
max_rows: 10
min_cols: 5
max_cols: 10
min_num_islands: 0
max_num_islands: 5
min_island_size: 0
max_island_size: 10
quantum_lock:
weight: 0.33
config:
difficulty: 10
curriculum:
enabled: False
schedule:
automatic: True
update_steps: 30 # automatic curriculum updating after 50 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: 500 # must be overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_games_qwen_3b_graphs
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # one of: auto / disable / resume_path (requires resume_from_path)
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program at runtime
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,28 @@
# Model configuration
model_path: ../utils/qwen3b_algebraic
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: simple_integration
size: 100
seed: 42
params:
min_terms: 2
max_terms: 5
min_degree: 1
max_degree: 10
min_bounds: 1
max_bounds: 10

View file

@ -1,8 +1,8 @@
# Model configuration
model_path: ../utils/qwen3b_algorithmic_500
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts

View file

@ -0,0 +1,24 @@
# Model configuration
model_path: ../utils/qwen_3b_arithmetic_100
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: prime_factorization
size: 100
seed: 42
params:
min_value: 2
max_value: 1000

View file

@ -0,0 +1,36 @@
# Model configuration
model_path: ../utils/qwen3b_cognition
max_tokens: 1024
temperature: 0.6 # Lower temperature for more focused responses
top_p: 0.9 # From rollout top_p
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: number_sequence
size: 100
seed: 42
params:
min_terms: 4 # Minimum visible terms
max_terms: 8 # Maximum visible terms
min_value: -100 # Minimum allowed number
max_value: 100 # Maximum allowed number
max_complexity: 3 # Maximum number of operations to combine
- dataset: modulo_grid
size: 100
seed: 42
params:
size_x: 20
size_y: 20
max_divisor: 20
max_target: 20
max_holes: 1

View file

@ -0,0 +1,24 @@
# Model configuration
model_path: ../utils/qwen3b_games
max_tokens: 1024
temperature: 0.6 # Lower temperature for more focused responses
top_p: 0.9 # From rollout top_p
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: mahjong_puzzle
size: 100
seed: 42
params:
min_num_rounds: 10
max_num_rounds: 50

View file

@ -16,13 +16,13 @@ eval_repeats: 3
categories:
  - category: reasoning
    datasets:
      - dataset: decimal_chain_sum
        size: 100
        seed: 42
        params:
          min_terms: 2
          max_terms: 4
          min_digits: 1
          max_digits: 3
          min_decimal_places: 1
          max_decimal_places: 4