diff --git a/reasoning_gym/algorithmic/number_sorting.py b/reasoning_gym/algorithmic/number_sorting.py index 67e3ba07..7406d96f 100644 --- a/reasoning_gym/algorithmic/number_sorting.py +++ b/reasoning_gym/algorithmic/number_sorting.py @@ -170,7 +170,7 @@ class NumberSortingCurriculum(BaseCurriculum): self._define_attributes( RangeAttributeDefinition( name="numbers", - levels=[10, 50, 100, 200], + levels=[10, 100, 500, 1000], description="How many numbers to sort", lower_field_name="min_numbers", upper_field_name="max_numbers", diff --git a/reasoning_gym/arithmetic/basic_arithmetic.py b/reasoning_gym/arithmetic/basic_arithmetic.py index c12cbb76..b7b1f950 100644 --- a/reasoning_gym/arithmetic/basic_arithmetic.py +++ b/reasoning_gym/arithmetic/basic_arithmetic.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from random import Random from typing import Any, Literal, Optional -from ..coaching import BaseCurriculum, RangeAttributeDefinition +from ..coaching import BaseCurriculum, RangeAttributeDefinition, ScalarAttributeDefinition from ..factory import ProceduralDataset, register_dataset DATASET_NAME = "basic_arithmetic" @@ -250,17 +250,19 @@ class BasicArithmeticCurriculum(BaseCurriculum): self._define_attributes( RangeAttributeDefinition( name="num_terms", - levels=[2, 5, 10, 15], + levels=[2, 3, 4, 5, 6], description="Number of terms in the expression", lower_field_name="min_terms", upper_field_name="max_terms", + ensure_interval=False, ), RangeAttributeDefinition( name="num_digits", - levels=[1, 2, 5, 10], + levels=[1, 2, 3, 4], description="Number of digits in the numbers", lower_field_name="min_digits", upper_field_name="max_digits", + ensure_interval=False, ), ) diff --git a/reasoning_gym/cognition/rectangle_count.py b/reasoning_gym/cognition/rectangle_count.py index 7539e5e0..8c11af17 100644 --- a/reasoning_gym/cognition/rectangle_count.py +++ b/reasoning_gym/cognition/rectangle_count.py @@ -16,6 +16,8 @@ Now, it's your turn. How many rectangles do you see in the grid below? """ DATASET_NAME = "rectangle_count" +CONST_TERM = 0.8 +D = 5 def draw_rectangles_with_overlap(n, width, height, rng): @@ -132,22 +134,29 @@ class RectangleCountDataset(ProceduralDataset): } def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: - """Determine if the solution provided solves the RectangleCount task. - - The function awards 1.0 for a correct answer. - - Args: - answer (Optional[str]): The user's answer. - entry (dict[str, Any]): The original dataset entry containing the correct answer. + """Determine if the solution provided solves the RectangleCount task, + awarding partial credit if the guess is close. Returns: - float: The computed score between 0.0 and 1.0. + float: A score between 0.0 and 1.0. 
""" + correct_str = entry["answer"].lower().replace("\n", "") - if isinstance(answer, str): - if answer.lower().replace("\n", "") == entry["answer"].lower().replace("\n", ""): - return 1.0 # Yay - return 0.0 + try: + correct_val = int(correct_str) + user_val = int(answer.strip()) + except (ValueError, TypeError, AttributeError): + return 0.0 + distance = abs(user_val - correct_val) + + if distance == 0: + return 1.0 + if distance >= D: + return 0.0 + + score = 1.0 - (distance / float(D)) + score = CONST_TERM * score + return max(0.0, score) class RectangleCountCurriculum(BaseCurriculum): diff --git a/reasoning_gym/cognition/rubiks_cube.py b/reasoning_gym/cognition/rubiks_cube.py index 44819be1..7624fdac 100644 --- a/reasoning_gym/cognition/rubiks_cube.py +++ b/reasoning_gym/cognition/rubiks_cube.py @@ -121,29 +121,49 @@ class RubiksCubeDataset(ProceduralDataset): }, } + def partial_score(self, cube: Cube) -> float: + """ + Returns a fraction between 0 and 1, indicating how many stickers are + correctly positioned (i.e., match the solved color for that face). + """ + total_stickers = 6 * (cube.size**2) + correct_stickers = 0 + + for face_index in range(6): + face = cube.faces[face_index] + + solved_color = face[cube.size // 2][cube.size // 2].color + for row in range(cube.size): + for col in range(cube.size): + sticker = face[row][col] + if sticker.color == solved_color: + correct_stickers += 1 + + return correct_stickers / total_stickers + def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: - """Determine if the solution provided solves the cube""" - reward = 0.0 # default reward + """Determine if the solution provided solves the cube, with partial rewards.""" + reward = 0.0 # default if answer is not None: - # Reconstruct the test cube eval_cube = Cube(entry["metadata"]["cube_size"]) eval_cube.rotate(entry["metadata"]["scramble_moves"]) - - # Test the solution try: expanded_answer = self.expand_moves(answer) eval_cube.rotate(expanded_answer) - solved = eval_cube.is_done() + # 3) Check if fully solved + solved = eval_cube.is_done() if solved: reward = 1.0 - elif len(answer.strip()) > 0: # encourage non-empty answers - reward = 0.05 # Incorrect, but rotate could parse the answer else: - reward = 0.01 - except: - reward = 0.01 # At least you tried + partial = self.partial_score(eval_cube) + if len(answer.strip()) > 0: + reward = max(0.05, partial) + else: + reward = max(0.01, partial) + except: + reward = 0.01 return reward def remove_ansi(self, line): diff --git a/reasoning_gym/games/sokoban.py b/reasoning_gym/games/sokoban.py index 09a9a96d..8a568694 100644 --- a/reasoning_gym/games/sokoban.py +++ b/reasoning_gym/games/sokoban.py @@ -99,6 +99,7 @@ Here is your puzzle: "source_dataset": DATASET_NAME, "source_index": idx, "gamestr": gamestr, + "source_dataset": DATASET_NAME, "width": puzzle_data["width"], "height": puzzle_data["height"], "difficulty": { diff --git a/tests/test_basic_arithmetic.py b/tests/test_basic_arithmetic.py index be2f6224..0101109f 100644 --- a/tests/test_basic_arithmetic.py +++ b/tests/test_basic_arithmetic.py @@ -6,6 +6,7 @@ from reasoning_gym.arithmetic.basic_arithmetic import ( BasicArithmeticDatasetConfig, eval_floordiv, ) +from reasoning_gym.coaching.base_curriculum import DefaultCurriculumContext, RangeAttributeMode def test_arithmetic_dataset_config_validation(): @@ -103,7 +104,7 @@ def test_basic_arithmetic_curriculum(): """Test the BasicArithmeticCurriculum functionality""" curriculum = BasicArithmeticCurriculum() - base_value = 
{"size": 150, "seed": 1} + base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1} base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value) assert base_cfg.seed == 1 @@ -115,7 +116,7 @@ def test_basic_arithmetic_curriculum(): curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_digits") increased_cfg = curriculum.generate_configuration(base_value) - assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 5 + assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3 assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 # Test decrementing attribute level for num_terms @@ -128,7 +129,7 @@ def test_basic_arithmetic_curriculum(): curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms") higher_level_cfg = curriculum.generate_configuration(base_value) - assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 10 + assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 4 assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2 # Test boundary conditions - trying to decrement below level 0 @@ -144,5 +145,26 @@ def test_basic_arithmetic_curriculum(): curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_digits") upper_bound_cfg = curriculum.generate_configuration(base_value) - assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 15 - assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 10 + assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 6 + assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 4 + + +def test_basic_arithmetic_curriculum_upper_bound(): + curriculum = BasicArithmeticCurriculum() + + base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1} + + base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration( + base_value, context=DefaultCurriculumContext(mode=RangeAttributeMode.UPPER_BOUND) + ) + assert base_cfg.seed == 1 + assert base_cfg.size == 150 + assert base_cfg.min_terms == 2 and base_cfg.max_terms == 2 + assert base_cfg.min_digits == 1 and base_cfg.max_digits == 1 + + # Test incrementing attribute levels + curriculum.increment_attr_level("num_terms") + curriculum.increment_attr_level("num_digits") + increased_cfg = curriculum.generate_configuration(base_value) + assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3 + assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 diff --git a/tests/test_number_sorting.py b/tests/test_number_sorting.py index 729c421d..6eae6265 100644 --- a/tests/test_number_sorting.py +++ b/tests/test_number_sorting.py @@ -56,6 +56,7 @@ def test_number_sorting_dataset_items(): # Verify number count constraints numbers = item["metadata"]["original_numbers"] + print(numbers) assert len(numbers) >= config.min_numbers assert len(numbers) <= config.max_numbers @@ -99,7 +100,7 @@ def test_number_sorting_curriculum(): base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value) assert base_cfg.seed == 1 assert base_cfg.size == 150 - assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 50 + assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 100 assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1 assert base_cfg.min_value == -100 and base_cfg.max_value == 100 @@ -107,14 +108,14 @@ def 
     curriculum.increment_attr_level("numbers")
     curriculum.increment_attr_level("decimals")
     increased_cfg = curriculum.generate_configuration(base_value)
-    assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 100
+    assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 500
     assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2
     assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100

     # test decrementing attribute level for numbers again
     curriculum.decrement_attr_level("numbers")
     partially_decreased_cfg = curriculum.generate_configuration(base_value)
-    assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 50
+    assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 100
     assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2
     assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100
diff --git a/tests/test_rubiks_cube.py b/tests/test_rubiks_cube.py
index 4b8f949b..f9a77efa 100644
--- a/tests/test_rubiks_cube.py
+++ b/tests/test_rubiks_cube.py
@@ -55,9 +55,9 @@ def test_rubikscube_items():
     assert dataset.score_answer(answer=None, entry=item) == 0.0

     if item["metadata"]["example_correct_answer"] != "R":
-        assert dataset.score_answer(answer="R", entry=item) == 0.05
+        assert dataset.score_answer(answer="R", entry=item) >= 0.05

-    assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.05
+    assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) >= 0.01

     if len(item["metadata"]["example_correct_answer"]) > 0:
         assert dataset.score_answer(answer="", entry=item) == 0.01
diff --git a/training/README.md b/training/README.md
index 6c740042..552b8fba 100644
--- a/training/README.md
+++ b/training/README.md
@@ -87,6 +87,7 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_

From here you may want to run evaluations of your trained model. In the `training/evaluations` directory there is a script `evaluate_model.py` which you can run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a YAML file. This evaluation can point to either a local or a remote model. For example, the configuration file `training/evaluations/eval_algorithmic_composite.yaml` specifies the path to a local model stored as a Hugging Face checkpoint at `training/utils/qwen3b_500` (note that you have to convert the FSDP checkpoint to a Hugging Face checkpoint for the evaluation script to work, as shown in the previous step).
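For reference, a minimal evaluation YAML might look like the following sketch (the field names mirror `training/evaluations/eval_algorithmic_composite.yaml` elsewhere in this change; the model path and dataset parameters are illustrative placeholders):

```yaml
# Model configuration: local HF checkpoint or remote model
model_path: ../utils/qwen3b_500
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system

# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3

# Categories and datasets to evaluate
categories:
  - category: reasoning
    datasets:
      - dataset: basic_arithmetic
        size: 100
        seed: 42
        params:
          min_terms: 2
          max_terms: 4
          min_digits: 1
          max_digits: 3
```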
## Run the script
+Set the vLLM attention backend before running: `export VLLM_ATTENTION_BACKEND=XFORMERS`.
Navigate to the evaluations directory:
```
python evaluate_model.py --config path-to-yaml
diff --git a/training/configs/intra_generalisation/algebra_qwen_3b.yaml b/training/configs/intra_generalisation/algebra_qwen_3b.yaml
new file mode 100644
index 00000000..1193ffb0
--- /dev/null
+++ b/training/configs/intra_generalisation/algebra_qwen_3b.yaml
@@ -0,0 +1,221 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    simple_equations:
+      weight: 0.5
+      config:
+        min_terms: 2
+        max_terms: 4
+        min_value: 1
+        max_value: 100
+    polynomial_multiplication:
+      weight: 0.5
+      config:
+        min_terms: 2
+        max_terms: 4
+        min_value: 1
+        max_value: 100
+        min_degree: 0
+        max_degree: 3
+        min_polynomials: 2
+        max_polynomials: 3
+curriculum:
+  enabled: False
+  schedule:
+    automatic: True
+    update_steps: 30 # automatic curriculum updating after 30 steps
+    last_k: 20
+    success_threshold: 0.70
+    failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 512
+  max_response_length: 1024
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 4
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0.
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_algebra_qwen_3b_composite + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/algorithmic_qwen_3b.yaml b/training/configs/intra_generalisation/algorithmic_qwen_3b.yaml similarity index 100% rename from training/configs/algorithmic_qwen_3b.yaml rename to training/configs/intra_generalisation/algorithmic_qwen_3b.yaml diff --git a/training/configs/intra_generalisation/arithmetic_qwen_3b.yaml b/training/configs/intra_generalisation/arithmetic_qwen_3b.yaml new file mode 100644 index 00000000..82199ae3 --- /dev/null +++ b/training/configs/intra_generalisation/arithmetic_qwen_3b.yaml @@ -0,0 +1,224 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + fraction_simplification: + weight: 0.33 + config: + min_value: 1 + max_value: 1000 + min_factor: 1 + max_factor: 100 + gcd: + weight: 0.34 + config: + min_numbers: 2 # Minimum numbers to find GCD of + max_numbers: 2 # Maximum numbers to find GCD of + min_value: 1 # Minimum value for each number + max_value: 1000 # Maximum value for each number + lcm: + weight: 0.33 + config: + min_numbers: 2 + max_numbers: 2 + min_value: 1 + max_value: 100 +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + 
train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_arithmetic_qwen_3b_composite + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. 
If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/intra_generalisation/cognition_qwen_3b.yaml b/training/configs/intra_generalisation/cognition_qwen_3b.yaml new file mode 100644 index 00000000..4b605923 --- /dev/null +++ b/training/configs/intra_generalisation/cognition_qwen_3b.yaml @@ -0,0 +1,219 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + rubiks_cube: + weight: 0.33 + config: + min_scramble_steps: 3 + max_scramble_steps: 10 + figlet_font: + weight: 0.34 + config: + min_word_len: 3 + max_word_len: 7 + rectangle_count: + weight: 0.33 + config: + max_rectangles: 10 + width: 80 + height: 80 +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + 
train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_cognition_qwen_3b_composite_test + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. 
If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/intra_generalisation/games_qwen_3b.yaml b/training/configs/intra_generalisation/games_qwen_3b.yaml new file mode 100644 index 00000000..91e0725a --- /dev/null +++ b/training/configs/intra_generalisation/games_qwen_3b.yaml @@ -0,0 +1,225 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + sudoku: + weight: 0.33 + config: + min_empty: 30 + max_empty: 50 + futoshiki: + weight: 0.34 + config: + min_board_size: 4 # Board will be NxN where N is this value + max_board_size: 9 + min_difficulty: 0 + max_difficulty: 3 + sokoban: + weight: 0.33 + config: + min_w: 6 # Minimum width of the puzzle + min_h: 6 # Minimum height of the puzzle + max_w: 10 # Maximum width of the puzzle + max_h: 10 # Maximum height of the puzzle + min_boxes: 4 # Minimum number of boxes + max_boxes: 10 # Maximum number of boxes + max_depth: 80 # Maximum search depth +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: 
cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. 
num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_games_qwen_3b_composite + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/intra_generalisation/graphs_qwen_3b.yaml b/training/configs/intra_generalisation/graphs_qwen_3b.yaml new file mode 100644 index 00000000..2bc5c740 --- /dev/null +++ b/training/configs/intra_generalisation/graphs_qwen_3b.yaml @@ -0,0 +1,226 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + shortest_path: + weight: 0.33 + config: + min_rows: 5 + max_rows: 8 + min_cols: 5 + max_cols: 8 + p_blocked: 0.4 + largest_island: + weight: 0.34 + config: + min_rows: 5 + max_rows: 10 + min_cols: 5 + max_cols: 10 + min_num_islands: 0 + max_num_islands: 5 + min_island_size: 0 + max_island_size: 10 + 
quantum_lock: + weight: 0.33 + config: + difficulty: 10 +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. 
num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_games_qwen_3b_graphs + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/evaluations/eval_algebraic_composite.yaml b/training/evaluations/eval_algebraic_composite.yaml new file mode 100644 index 00000000..f79514b4 --- /dev/null +++ b/training/evaluations/eval_algebraic_composite.yaml @@ -0,0 +1,28 @@ +# Model configuration +model_path: ../utils/qwen3b_algebraic +max_tokens: 1024 +temperature: 0.6 +top_p: 0.9 +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: 
simple_integration + size: 100 + seed: 42 + params: + min_terms: 2 + max_terms: 5 + min_degree: 1 + max_degree: 10 + min_bounds: 1 + max_bounds: 10 diff --git a/training/evaluations/eval_algorithmic_composite.yaml b/training/evaluations/eval_algorithmic_composite.yaml index 6eca3dda..1adba40f 100644 --- a/training/evaluations/eval_algorithmic_composite.yaml +++ b/training/evaluations/eval_algorithmic_composite.yaml @@ -1,8 +1,8 @@ # Model configuration -model_path: ../utils/qwen3b_500 # Change to the smaller model -max_tokens: 1024 # From max_response_length in training config -temperature: 0.7 # Lower temperature for more focused responses -top_p: 0.9 # From rollout top_p +model_path: ../utils/qwen3b_algorithmic_500 +max_tokens: 1024 +temperature: 0.6 +top_p: 0.9 developer_prompt: DeepSeekZero developer_role: system # Standard role for system prompts diff --git a/training/evaluations/eval_arithmetic_composite.yaml b/training/evaluations/eval_arithmetic_composite.yaml new file mode 100644 index 00000000..388cfdb3 --- /dev/null +++ b/training/evaluations/eval_arithmetic_composite.yaml @@ -0,0 +1,24 @@ +# Model configuration +model_path: ../utils/qwen_3b_arithmetic_100 +max_tokens: 1024 +temperature: 0.6 +top_p: 0.9 +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: prime_factorization + size: 100 + seed: 42 + params: + min_value: 2 + max_value: 1000 diff --git a/training/evaluations/eval_cognition_composite.yaml b/training/evaluations/eval_cognition_composite.yaml new file mode 100644 index 00000000..6b5a2279 --- /dev/null +++ b/training/evaluations/eval_cognition_composite.yaml @@ -0,0 +1,36 @@ +# Model configuration +model_path: ../utils/qwen3b_cognition +max_tokens: 1024 +temperature: 0.6 # Lower temperature for more focused responses +top_p: 0.9 # From rollout top_p +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: number_sequence + size: 100 + seed: 42 + params: + min_terms: 4 # Minimum visible terms + max_terms: 8 # Maximum visible terms + min_value: -100 # Minimum allowed number + max_value: 100 # Maximum allowed number + max_complexity: 3 # Maximum number of operations to combine + - dataset: modulo_grid + size: 100 + seed: 42 + params: + size_x: 20 + size_y: 20 + max_divisor: 20 + max_target: 20 + max_holes: 1 diff --git a/training/evaluations/eval_games_composite.yaml b/training/evaluations/eval_games_composite.yaml new file mode 100644 index 00000000..b183b6fd --- /dev/null +++ b/training/evaluations/eval_games_composite.yaml @@ -0,0 +1,24 @@ +# Model configuration +model_path: ../utils/qwen3b_games +max_tokens: 1024 +temperature: 0.6 # Lower temperature for more focused responses +top_p: 0.9 # From rollout top_p +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: mahjong_puzzle + size: 100 + seed: 42 + params: + min_num_rounds: 10 + 
max_num_rounds: 50 diff --git a/training/evaluations/eval_qwen_3b.yaml b/training/evaluations/eval_qwen_3b.yaml index 4069895d..132989fb 100644 --- a/training/evaluations/eval_qwen_3b.yaml +++ b/training/evaluations/eval_qwen_3b.yaml @@ -16,13 +16,13 @@ eval_repeats: 3 categories: - category: reasoning datasets: - - dataset: number_sorting + - dataset: decimal_chain_sum size: 100 seed: 42 params: - min_numbers: 3 - max_numbers: 10 - min_decimals: 0 - max_decimals: 2 - min_value: -100.0 - max_value: 100.0 + min_terms: 2 + max_terms: 4 + min_digits: 1 + max_digits: 3 + min_decimal_places: 1 + max_decimal_places: 4
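The partial-credit rule introduced in `rectangle_count.py` above is easy to sanity-check in isolation. The sketch below reuses the `CONST_TERM = 0.8` and `D = 5` constants from the diff; the `partial_credit` helper name is ours, not part of the library:

```python
# Standalone sketch of rectangle_count.score_answer's distance-based rule:
# full credit at distance 0, zero credit at distance >= D, and a linear
# falloff scaled by CONST_TERM in between.
CONST_TERM = 0.8
D = 5


def partial_credit(user_val: int, correct_val: int) -> float:
    distance = abs(user_val - correct_val)
    if distance == 0:
        return 1.0
    if distance >= D:
        return 0.0
    return CONST_TERM * (1.0 - distance / float(D))


if __name__ == "__main__":
    # With a correct count of 7, guesses 7..12 score 1.0, 0.64, 0.48, 0.32, 0.16, 0.0
    for guess in range(7, 13):
        print(guess, round(partial_credit(guess, 7), 2))
```

Note that the largest possible partial score is 0.64 (an off-by-one guess), so a near miss is always rewarded strictly less than an exact answer.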