diff --git a/reasoning_gym/algorithmic/spell_backward.py b/reasoning_gym/algorithmic/spell_backward.py
index f26f1d19..b0249fd8 100644
--- a/reasoning_gym/algorithmic/spell_backward.py
+++ b/reasoning_gym/algorithmic/spell_backward.py
@@ -19,6 +19,7 @@ class SpellBackwardConfig:
     min_word_len: int = 3 # Minimum word length
     max_word_len: int = 10 # Maximum word length
     seed: Optional[int] = None
+    data_file: str = "words3to10.txt"
     size: int = 500 # Virtual dataset size
 
     def validate(self) -> None:
@@ -34,7 +35,7 @@ class SpellBackwardDataset(ProceduralDataset):
         super().__init__(config=config, seed=config.seed, size=config.size)
 
         # Load and preprocess text
-        text = read_data_file("words3to10.txt")
+        text = read_data_file(self.config.data_file)
         self.words = [
             word.strip()
             for word in text.splitlines()
@@ -73,9 +74,9 @@ class SpellBackwardDataset(ProceduralDataset):
             if expected_answer == answer:
                 reward = 1.0
             else:
-                answer_len = len(answer)
+                answer_len = len(expected_answer)
                 for i in range(len(expected_answer)):
-                    if (i < len(expected_answer) and i < len(answer)) and expected_answer[i] == answer[i]:
+                    if i < len(expected_answer) and i < len(answer):
+                        if expected_answer[i] == answer[i]:
                             reward += 1 / answer_len
                         else:
@@ -96,7 +97,7 @@ class SpellBackwardCurriculum(BaseCurriculum):
         self._define_attributes(
             RangeAttributeDefinition(
                 name="word_len",
-                levels=list(range(3, 11)),
+                levels=list(range(3, 10, 1)),
                 description="Word length",
                 lower_field_name="min_word_len",
                 upper_field_name="max_word_len",
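Note: the scoring fix above normalizes partial credit by the length of the expected answer rather than the model's answer, which keeps the maximum attainable partial credit at 1.0 and stops answer length from skewing the denominator. A minimal standalone sketch of the fixed logic (the `partial_credit` helper is illustrative, not part of the patch):

```python
def partial_credit(expected: str, answer: str) -> float:
    """Fraction of positions in `expected` that `answer` matches exactly."""
    if expected == answer:
        return 1.0
    reward = 0.0
    for i in range(min(len(expected), len(answer))):
        if expected[i] == answer[i]:
            reward += 1 / len(expected)  # normalize by expected length, not answer length
    return reward


assert partial_credit("olleh", "olleh") == 1.0
assert abs(partial_credit("olleh", "ollez") - 0.8) < 1e-9  # 4 of 5 positions match
```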
diff --git a/reasoning_gym/coaching/curriculum_config.py b/reasoning_gym/coaching/curriculum_config.py
index 7b431204..452e29a3 100644
--- a/reasoning_gym/coaching/curriculum_config.py
+++ b/reasoning_gym/coaching/curriculum_config.py
@@ -54,7 +54,6 @@ class CurriculumExperimentConfig:
 
         if not isinstance(data, dict):
             raise ValueError("YAML data must contain a dictionary")
-
         if "curricula" not in data:
             raise ValueError("YAML data must contain a 'curricula' key")
 
diff --git a/reasoning_gym/coaching/experiment.py b/reasoning_gym/coaching/experiment.py
index bb6b74d0..53384e62 100644
--- a/reasoning_gym/coaching/experiment.py
+++ b/reasoning_gym/coaching/experiment.py
@@ -1,6 +1,6 @@
 """Experiment class combining dataset, scoreboard and curriculum."""
 
-from typing import Any, Optional
+from typing import Any, Literal, Optional
 
 from reasoning_gym.coaching.base_curriculum import CurriculumContext
 
@@ -27,7 +27,8 @@ class Experiment:
         entry = dataset[index]
         score = dataset.score_answer(answer, entry)
         metadata = entry["metadata"]
-        self.score_board.add_score(score, metadata, conversation)
+        score_board_metadata = {"difficulty": metadata["difficulty"], "source_dataset": metadata["source_dataset"]}
+        self.score_board.add_score(dataset_name, score, score_board_metadata, conversation)
         return score
 
     @classmethod
@@ -97,7 +98,15 @@ class CurriculumExperiment(Experiment):
         self.curriculum_config = config
         self.context = context
 
-    def update_difficulty(self):
+    def update_difficulty(self, dataset_name: str, method: Literal["increment", "decrement"]):
         """Update difficulty levels based on performance metrics"""
-        # TODO: Implement difficulty adjustment logic
-        pass
+        if method not in ["increment", "decrement"]:
+            raise ValueError(f"Invalid method: {method}")
+
+        if method == "increment":
+            self.curricula[dataset_name].increment_global_level()
+        elif method == "decrement":
+            self.curricula[dataset_name].decrement_global_level()
+
+        config = self.curricula[dataset_name].get_global_level()
+        self.composite.update_dataset_config(dataset_name, config)
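To illustrate how the new hook is meant to be driven: a training loop can periodically aggregate recent per-dataset scores and step the global level against the success/failure thresholds that appear in the YAML configs further down. This is a hypothetical driver sketch, not code from this patch; `experiment`, `last_n`, and the threshold values are assumptions, and it relies on the per-dataset `aggregate` introduced in score_board.py below:

```python
# Hypothetical curriculum driver; `experiment` is a CurriculumExperiment instance.
grouped = experiment.score_board.aggregate(last_n=20)  # dataset name -> GroupedScores
for dataset_name, group in grouped.items():
    scores = [s for score_list in group.scores.values() for s in score_list]
    if not scores:
        continue
    mean_score = sum(scores) / len(scores)
    if mean_score >= 0.70:    # success_threshold from the YAML configs
        experiment.update_difficulty(dataset_name, method="increment")
    elif mean_score <= 0.10:  # failure_threshold
        experiment.update_difficulty(dataset_name, method="decrement")
```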
""" # Start with empty list - key_items = [("source", metadata["source_dataset"]), ("idx", metadata["source_index"])] + key_items = [("source", metadata["source_dataset"])] # Add difficulty parameters or other metadata if "difficulty" in metadata: @@ -155,39 +161,52 @@ class ScoreBoard: items = metadata["difficulty"].items() else: # Use all metadata except source info - items = ((k, v) for k, v in metadata.items() if k not in ("source_dataset", "source_index")) + items = ((k, v) for k, v in metadata.items() if k not in ("source_dataset")) # Add remaining items in sorted order key_items.extend(sorted((str(k), v) for k, v in items)) return tuple(key_items) - def aggregate(self, last_n: Optional[int] = None) -> GroupedScores: - """Aggregate scores by difficulty parameters or full metadata if no difficulty present + def aggregate(self, last_n: Optional[int] = None) -> dict[str, GroupedScores]: + """Aggregate scores by dataset name and then by difficulty parameters Args: last_n: Optional number of most recent entries to consider - If None, use all entries + If None, use all entries Returns: - OrderedDict mapping difficulty parameter combinations to lists of scores - Keys are tuples of (param_name, value) pairs, sorted by param_name + Dictionary mapping dataset names to their respective GroupedScores objects + Each GroupedScores contains scores grouped by difficulty parameters for that dataset """ if not self.scores: - return GroupedScores(scores=OrderedDict(), total_scores=0) + return {} - # Determine start index for iteration - start_idx = max(0, len(self.scores) - last_n) if last_n is not None else 0 + # Create a nested structure: dataset -> parameter groups -> scores + result = {} - # Group scores by difficulty parameters without creating intermediate lists - result = OrderedDict() - for i in range(start_idx, len(self.scores)): - key = self._metadata_to_key(self.metadata[i]) - if key not in result: - result[key] = [] - result[key].append(self.scores[i]) + # Process each dataset + for dataset_name, dataset_scores in self.scores.items(): + # Determine start index for this dataset + dataset_len = len(dataset_scores) + start_idx = max(0, dataset_len - last_n) if last_n is not None else 0 - # Count total scores - total_scores = sum(len(scores) for scores in result.values()) + # Create OrderedDict for this dataset's parameter groupings + dataset_groups = OrderedDict() - return GroupedScores(scores=result, total_scores=total_scores) + # Process scores for this dataset + for i in range(start_idx, dataset_len): + # Get metadata for this score + metadata = self.metadata[dataset_name][i] + params = self._metadata_to_key(metadata) + + if params not in dataset_groups: + dataset_groups[params] = [] + + dataset_groups[params].append(dataset_scores[i]) + + # Create a GroupedScores object for this dataset + total_scores = sum(len(scores) for scores in dataset_groups.values()) + result[dataset_name] = GroupedScores(scores=dataset_groups, total_scores=total_scores) + + return result diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_1.5b_grpo.yaml similarity index 94% rename from training/configs/qwen2.5_3b_grpo.yaml rename to training/configs/qwen2.5_1.5b_grpo.yaml index 644653df..e2536bb3 100644 --- a/training/configs/qwen2.5_3b_grpo.yaml +++ b/training/configs/qwen2.5_1.5b_grpo.yaml @@ -1,19 +1,12 @@ reasoning_gym: - dataset_size: 20000 + dataset_size: 10000 developer_prompt: DeepSeekZero datasets: - mini_sudoku: - weight: 0.33 - config: - min_empty: 6 - futoshiki: - weight: 0.33 
diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_1.5b_grpo.yaml
similarity index 94%
rename from training/configs/qwen2.5_3b_grpo.yaml
rename to training/configs/qwen2.5_1.5b_grpo.yaml
index 644653df..e2536bb3 100644
--- a/training/configs/qwen2.5_3b_grpo.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo.yaml
@@ -1,19 +1,12 @@
 reasoning_gym:
-  dataset_size: 20000
+  dataset_size: 10000
   developer_prompt: DeepSeekZero
   datasets:
-    mini_sudoku:
-      weight: 0.33
-      config:
-        min_empty: 6
-    futoshiki:
-      weight: 0.33
-      config:
-        max_board_size: 5
-    sudoku:
-      weight: 0.34
-      config:
-        min_empty: 20
+    spell_backward:
+      weight: 1
+      config:
+        min_word_len: 3
+        max_word_len: 10
 curriculum:
   enabled: False
   schedule:
@@ -29,8 +22,10 @@ curriculum:
 reward:
   use_accuracy: True
   secondary_rewards:
-    - name: length
-      scaling_factor: 1.0
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
 
 data:
   tokenizer: null
@@ -46,7 +41,7 @@ data:
 actor_rollout_ref:
   hybrid_engine: True
   model:
-    path: Qwen/Qwen2.5-3B-Instruct
+    path: Qwen/Qwen2.5-1.5B-Instruct
     external_lib: null
     override_config: { }
     enable_gradient_checkpointing: True
@@ -72,7 +67,7 @@ actor_rollout_ref:
     lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null   # only useful for warmup with cosine
     warmup_style: constant  # select from constant/cosine
-    total_training_steps: -1  # must be override by program
+    total_training_steps: 200  # must be override by program
     fsdp_config:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
@@ -142,7 +137,7 @@ trainer:
   val_generations_to_log_to_wandb: 0
   nnodes: 1
   n_gpus_per_node: 2
-  save_freq: 100
+  save_freq: 50
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
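The reward block now mixes accuracy with two scaled secondary signals. How these are composed lives in the training code, not in this patch; a minimal sketch, assuming each secondary reward is computed in [0, 1] and added to the accuracy score after scaling:

```python
def total_reward(accuracy: float, cosine: float, fmt: float) -> float:
    # scaling factors from the YAML above: cosine 0.3, format 0.2 (additive combination assumed)
    return accuracy + 0.3 * cosine + 0.2 * fmt

print(total_reward(accuracy=1.0, cosine=0.5, fmt=1.0))  # 1.35
```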
diff --git a/training/configs/qwen2.5_3b_grpo_composite.yaml b/training/configs/qwen2.5_1.5b_grpo_composite.yaml
similarity index 82%
rename from training/configs/qwen2.5_3b_grpo_composite.yaml
rename to training/configs/qwen2.5_1.5b_grpo_composite.yaml
index ae7fae4b..6f0121ff 100644
--- a/training/configs/qwen2.5_3b_grpo_composite.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo_composite.yaml
@@ -2,26 +2,34 @@ reasoning_gym:
   dataset_size: 20000
   developer_prompt: DeepSeekZero
   datasets:
-    mini_sudoku:
-      weight: 0.33
-      config:
-        min_empty: 6
-    futoshiki:
-      weight: 0.33
-      config:
-        max_board_size: 5
-    sudoku:
-      weight: 0.34
-      config:
-        min_empty: 20
+    spell_backward:
+      weight: 0.33
+      config:
+        min_word_len: 3
+        max_word_len: 10
+    letter_counting:
+      weight: 0.34
+      config:
+        min_words: 5
+        max_words: 15
+    number_sorting:
+      weight: 0.33
+      config:
+        min_numbers: 3
+        max_numbers: 10
+        min_decimals: 0
+        max_decimals: 2
+        min_value: -100
+        max_value: 100
+
 curriculum:
   enabled: False
   schedule:
     automatic: True
     update_steps: 30 # automatic curriculum updating after 50 steps
     last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
+    success_threshold: 0.70
+    failure_threshold: 0.10
   curricula:
     spell_backward:
       attribute_levels:
@@ -29,6 +37,8 @@ curriculum:
 reward:
   use_accuracy: True
   secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
     - name: format
       scaling_factor: 0.2
 
@@ -40,21 +50,20 @@ data:
   max_prompt_length: 512
   max_response_length: 1024
   train_batch_size: 32
-  val_batch_size: 32
+  val_batch_size: 64
   return_raw_chat: True
   return_raw_input_ids: True
-
 actor_rollout_ref:
   hybrid_engine: True
   model:
-    path: Qwen/Qwen2.5-3B-Instruct
+    path: Qwen/Qwen2.5-1.5B-Instruct
     external_lib: null
     override_config: { }
     enable_gradient_checkpointing: True
     use_remove_padding: True
   actor:
     strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 8
     use_dynamic_bsz: False
@@ -70,10 +79,10 @@
     ulysses_sequence_parallel_size: 1 # sp size
     optim:
       lr: 1e-6
-      lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null   # only useful for warmup with cosine
-      warmup_style: constant  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 200 # must be override by program
     fsdp_config:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
@@ -101,13 +110,13 @@ actor_rollout_ref:
     response_length: ${data.max_response_length}
     # for vllm rollout
     dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
     ignore_eos: False
     enforce_eager: True
     free_cache_engine: True
     load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 16384
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 12288
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 160
@@ -118,7 +127,7 @@ actor_rollout_ref:
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
-    max_model_len: 16384
+    max_model_len: 12288
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -135,15 +144,15 @@ algorithm:
 verbose: True
 trainer:
   balance_batch: True
-  total_epochs: 5
+  total_epochs: 1
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_composite
+  experiment_name: verl_grpo_qwen_3b_composite
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 100
+  n_gpus_per_node: 2
+  save_freq: 50
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
@@ -154,13 +163,14 @@ trainer:
   del_local_ckpt_after_load: False
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
 
+
 critic:
   strategy: fsdp
   optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null   # only useful for warmup with cosine
-    warmup_style: cosine # select from constant/cosine
+    warmup_style: constant # select from constant/cosine
     total_training_steps: -1 # must be override by program
   model:
     path: ~/models/deepseek-llm-7b-chat
@@ -178,7 +188,7 @@ critic:
     fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 8
+  ppo_micro_batch_size_per_gpu: null
   forward_micro_batch_size: ${critic.ppo_micro_batch_size}
   forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
   use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
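A quick sanity check on the composite weights above, assuming (as in the rest of this patch) that the weights act as sampling proportions over the virtual dataset:

```python
weights = {"spell_backward": 0.33, "letter_counting": 0.34, "number_sorting": 0.33}
dataset_size = 20000
assert abs(sum(weights.values()) - 1.0) < 1e-9  # weights form a distribution
expected_counts = {name: round(w * dataset_size) for name, w in weights.items()}
print(expected_counts)  # {'spell_backward': 6600, 'letter_counting': 6800, 'number_sorting': 6600}
```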
diff --git a/training/configs/qwen2.5_1.5b_grpo_curr.yaml b/training/configs/qwen2.5_1.5b_grpo_curr.yaml
index 31c70423..f0ced4bd 100644
--- a/training/configs/qwen2.5_1.5b_grpo_curr.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo_curr.yaml
@@ -1,20 +1,31 @@
 reasoning_gym:
-  dataset_size: 10000
-  enable_curriculum_learning: True
+  dataset_size: 20000
   developer_prompt: DeepSeekZero
-reward:
-  secondary_rewards:
-    - name: format
-      scaling_factor: 0.5
+  datasets:
+    spell_backward:
+      weight: 1
+      config:
+        min_word_len: 3
+        max_word_len: 10
 curriculum:
   enabled: True
-  last_k: 30
-  success_threshold: 0.7
-  failure_threshold: 0.1
+  schedule:
+    automatic: False
+    update_steps: 30 # automatic curriculum update every 30 steps
+    last_k: 5120 # Minimum number of samples needed for the model to exceed a threshold - 20*num_generations*batch_size
+    success_threshold: 0.70
+    failure_threshold: 0.10
   curricula:
     spell_backward:
       attribute_levels:
         word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
 
 data:
   tokenizer: null
@@ -23,11 +34,10 @@ data:
   prompt_key: prompt
   max_prompt_length: 512
   max_response_length: 1024
-  train_batch_size: 64
+  train_batch_size: 32
   val_batch_size: 64
-  return_raw_input_ids: True  # This should be set to true when the tokenizer between policy and rm differs
   return_raw_chat: True
-
+  return_raw_input_ids: True
 actor_rollout_ref:
   hybrid_engine: True
   model:
@@ -38,9 +48,9 @@ actor_rollout_ref:
     use_remove_padding: True
   actor:
     strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 16
+    ppo_micro_batch_size_per_gpu: 8
     use_dynamic_bsz: False
     ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
     grad_clip: 1.0
@@ -57,7 +67,7 @@
       lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
       min_lr_ratio: null   # only useful for warmup with cosine
      warmup_style: constant  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      total_training_steps: 200  # must be override by program
     fsdp_config:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
@@ -78,7 +88,6 @@ actor_rollout_ref:
     ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
   rollout:
     name: vllm
-    max_model_len: 512
     temperature: 1.0
     top_k: -1 # 0 for hf rollout, -1 for vllm rollout
     top_p: 1
@@ -86,13 +95,13 @@
     response_length: ${data.max_response_length}
     # for vllm rollout
     dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
     ignore_eos: False
     enforce_eager: True
     free_cache_engine: True
     load_format: dummy_dtensor
     tensor_model_parallel_size: 2
-    max_num_batched_tokens: 8192
+    max_num_batched_tokens: 12288
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 160
@@ -103,6 +112,7 @@
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
+    max_model_len: 12288
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -119,15 +129,15 @@
 verbose: True
 trainer:
   balance_batch: True
-  total_epochs: 10
+  total_epochs: 1
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_curr
+  experiment_name: verl_grpo_qwen_3b_curr
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
   n_gpus_per_node: 2
-  save_freq: 100
+  save_freq: 50
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
@@ -136,7 +146,8 @@ trainer:
   default_hdfs_dir: null
   remove_previous_ckpt_in_save: False
   del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+  default_local_dir: /workspace/joe/checkpoints/checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
 critic:
   strategy: fsdp
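The `last_k: 5120` comment above encodes an arithmetic invariant worth making explicit: with `n: 8` rollout generations per prompt and `train_batch_size: 32`, twenty training steps produce exactly 5120 scored samples:

```python
num_generations = 8    # actor_rollout_ref.rollout.n
train_batch_size = 32  # data.train_batch_size
steps = 20
assert steps * num_generations * train_batch_size == 5120  # matches last_k
```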
diff --git a/training/configs/qwen2.5_3b_grpo_curr.yaml b/training/configs/qwen2.5_3b_grpo_curr.yaml
deleted file mode 100644
index 405b9170..00000000
--- a/training/configs/qwen2.5_3b_grpo_curr.yaml
+++ /dev/null
@@ -1,201 +0,0 @@
-reasoning_gym:
-  dataset_size: 10000
-  enable_curriculum_learning: True
-  developer_prompt: DeepSeekZero
-curriculum:
-  enabled: True
-  schedule:
-    automatic: True
-    update_steps: 30 # automatic curriculum updating after 50 steps
-    last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
-  curricula:
-    spell_backward:
-      attribute_levels:
-        word_len: 0
-reward:
-  use_accuracy: false
-  secondary_rewards:
-    - name: cosine
-      scaling_factor: 2
-    - name: format
-      scaling_factor: 0.5
-
-data:
-  tokenizer: null
-  train_files: train.parquet
-  val_files: test.parquet
-  prompt_key: prompt
-  max_prompt_length: 512
-  max_response_length: 1024
-  train_batch_size: 128
-  val_batch_size: 128
-  return_raw_chat: True
-  return_raw_input_ids: True
-
-actor_rollout_ref:
-  hybrid_engine: True
-  model:
-    path: Qwen/Qwen2.5-3B-Instruct
-    external_lib: null
-    override_config: { }
-    enable_gradient_checkpointing: True
-    use_remove_padding: True
-  actor:
-    strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
-    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 8
-    use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
-    grad_clip: 1.0
-    clip_ratio: 0.2
-    entropy_coeff: 0.001
-    use_kl_loss: True # True for GRPO
-    kl_loss_coef: 0.001 # for grpo
-    kl_loss_type: low_var_kl # for grpo
-    ppo_epochs: 1
-    shuffle: False
-    ulysses_sequence_parallel_size: 1 # sp size
-    optim:
-      lr: 1e-6
-      lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-      min_lr_ratio: 0.1   # only useful for warmup with cosine
-      warmup_style: cosine  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
-    fsdp_config:
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      param_offload: False
-      optimizer_offload: False
-      fsdp_size: -1
-  ref:
-    fsdp_config:
-      param_offload: True
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    max_model_len: 1024
-    temperature: 0.7
-    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
-    top_p: 1
-    prompt_length: ${data.max_prompt_length} # not use for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 8192
-    max_num_seqs: 1024
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    use_fire_sampling: False
-    # number of responses (i.e. num sample times)
-    n: 8 # > 1 for grpo
-    val_kwargs:
-      do_sample: True
-
-algorithm:
-  gamma: 1.0
-  lam: 1.0
-  adv_estimator: grpo
-  kl_penalty: kl # how to estimate kl divergence
-  kl_ctrl:
-    type: fixed
-    kl_coef: 0.001
-verbose: True
-trainer:
-  balance_batch: True
-  total_epochs: 5
-  total_training_steps: null
-  project_name: rg-test
-  experiment_name: verl_grpo_qwen_curr
-  logger: [ 'console', 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 50
-  # auto: find the last ckpt to resume. If can't find, start from scratch
-  resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
-  test_freq: 300
-  critic_warmup: 0
-  default_hdfs_dir: null
-  remove_previous_ckpt_in_save: False
-  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
-
-critic:
-  strategy: fsdp
-  optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-    min_lr_ratio: null   # only useful for warmup with cosine
-    warmup_style: cosine  # select from constant/cosine
-    total_training_steps: -1  # must be override by program
-  model:
-    path: ~/models/deepseek-llm-7b-chat
-    tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: { }
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: True
-    use_remove_padding: False
-    fsdp_config:
-      param_offload: False
-      optimizer_offload: False
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      fsdp_size: -1
-  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: null
-  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
-  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
-  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
-  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
-  ulysses_sequence_parallel_size: 1 # sp size
-  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
-  shuffle: ${actor_rollout_ref.actor.shuffle}
-  grad_clip: 1.0
-  cliprange_value: 0.5
-
-# Reward model not used for GRPO
-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path}
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  micro_batch_size: null
-  micro_batch_size_per_gpu: null
-  max_length: null
-  ulysses_sequence_parallel_size: 1
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}