From 73f7cc7a66d16778d3f0bcb4ff23e9377075e74b Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Tue, 25 Mar 2025 06:17:54 +0000
Subject: [PATCH] Use words3to10.txt in spell_backward; point GRPO config at it

---
 reasoning_gym/algorithmic/spell_backward.py |  2 +-
 training/configs/qwen2.5_3b_grpo.yaml       | 54 ++++++++++-----------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/reasoning_gym/algorithmic/spell_backward.py b/reasoning_gym/algorithmic/spell_backward.py
index e5627677..f7f4d843 100644
--- a/reasoning_gym/algorithmic/spell_backward.py
+++ b/reasoning_gym/algorithmic/spell_backward.py
@@ -34,7 +34,7 @@ class SpellBackwardDataset(ProceduralDataset):
         super().__init__(config=config, seed=config.seed, size=config.size)
 
         # Load and preprocess text
-        text = read_data_file("in_the_year_2889.txt")
+        text = read_data_file("words3to10.txt")
         self.words = [
             word.strip()
             for word in text.splitlines()
diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_3b_grpo.yaml
index a6271688..a279e85d 100644
--- a/training/configs/qwen2.5_3b_grpo.yaml
+++ b/training/configs/qwen2.5_3b_grpo.yaml
@@ -1,12 +1,12 @@
 reasoning_gym:
-  dataset_size: 10000
+  dataset_size: 20000
   developer_prompt: DeepSeekZero
   datasets:
-    figlet_font:
-      weight: 1
-      config:
-        min_word_len: 3
-        max_word_len: 10
+    spell_backward:
+        weight: 1
+        config:
+          min_word_len: 3
+          max_word_len: 10
 curriculum:
     enabled: False
     schedule:
@@ -20,20 +20,20 @@ curriculum:
         attribute_levels:
           word_len: 0
 reward:
-  use_accuracy: true
+  use_accuracy: True
   secondary_rewards:
    - name: format
-     scaling_factor: 0.5
+     scaling_factor: 0.2
 
 data:
   tokenizer: null
   train_files: train.parquet
   val_files: test.parquet
   prompt_key: prompt
-  max_prompt_length: 2048
-  max_response_length: 4096
-  train_batch_size: 128
-  val_batch_size: 128
+  max_prompt_length: 512
+  max_response_length: 1024
+  train_batch_size: 32
+  val_batch_size: 32
   return_raw_chat: True
   return_raw_input_ids: True
 
@@ -62,10 +62,10 @@ actor_rollout_ref:
     shuffle: False
     ulysses_sequence_parallel_size: 1 # sp size
     optim:
-      lr: 3e-4
-      lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-      min_lr_ratio: 0.1   # only useful for warmup with cosine
-      warmup_style: cosine # select from constant/cosine
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0  # the total steps will be injected during runtime
+      min_lr_ratio: null   # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
       total_training_steps: -1  # must be override by program
     fsdp_config:
       wrap_policy:
@@ -81,7 +81,7 @@ actor_rollout_ref:
         # transformer_layer_cls_to_wrap: None
         min_num_params: 0
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
+    log_prob_micro_batch_size_per_gpu: 160
     log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
     ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
@@ -90,7 +90,7 @@ actor_rollout_ref:
     temperature: 1.0
     top_k: -1 # 0 for hf rollout, -1 for vllm rollout
     top_p: 1
-    prompt_length: ${data.max_prompt_length} # not use for opensource
+    prompt_length: ${data.max_prompt_length}  # not used for open source
     response_length: ${data.max_response_length}
     # for vllm rollout
     dtype: bfloat16 # should align with FSDP
@@ -99,9 +99,8 @@ actor_rollout_ref:
     enforce_eager: True
     free_cache_engine: True
     load_format: dummy_dtensor
-    tensor_model_parallel_size: 2
-    max_model_len: 32768
-    max_num_batched_tokens: 32768
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 16384
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 160
@@ -112,6 +111,7 @@ actor_rollout_ref:
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
+    max_model_len: 16384
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -131,16 +131,16 @@ trainer:
   total_epochs: 5
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_figlet
+  experiment_name: verl_grpo_qwen_composite
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
-  n_gpus_per_node: 2
-  save_freq: 50
+  n_gpus_per_node: 4
+  save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
-  test_freq: 300
+  test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
   remove_previous_ckpt_in_save: False
@@ -151,7 +151,7 @@ critic:
   strategy: fsdp
   optim:
     lr: 1e-6
-    lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
+    lr_warmup_steps_ratio: 0  # the total steps will be injected during runtime
     min_lr_ratio: null   # only useful for warmup with cosine
     warmup_style: cosine  # select from constant/cosine
     total_training_steps: -1  # must be override by program
@@ -171,7 +171,7 @@ critic:
       fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: null
+  ppo_micro_batch_size_per_gpu: 8
   forward_micro_batch_size: ${critic.ppo_micro_batch_size}
   forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
   use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}