updated curr

2026-04-30 17:40:45 +00:00 · 2025-03-26 19:52:32 +00:00 · 2025-03-26 19:52:32 +00:00 · 74eca6c45b
commit 74eca6c45b
parent c952f31a61
1 changed file with 12 additions and 12 deletions
--- a/training/configs/qwen2.5_3b_grpo.yaml
+++ b/training/configs/qwen2.5_3b_grpo.yaml
@@ -1,7 +1,7 @@
 reasoning_gym:
  dataset_size: 20000
  developer_prompt: DeepSeekZero
-  datasets:  # Used if enable_curriculum_learning is False
+  datasets:
    mini_sudoku:
      weight: 0.33
      config:
@@ -29,8 +29,8 @@ curriculum:
 reward:
  use_accuracy: True
  secondary_rewards:
-   - name: format
-     scaling_factor: 0.2
+   - name: length
+     scaling_factor: 1.0

 data:
  tokenizer: null
@@ -38,9 +38,9 @@ data:
  val_files: test.parquet
  prompt_key: prompt
  max_prompt_length: 512
-  max_response_length: 2048
-  train_batch_size: 16
-  val_batch_size: 16
+  max_response_length: 1024
+  train_batch_size: 32
+  val_batch_size: 64
  return_raw_chat: True
  return_raw_input_ids: True
 actor_rollout_ref:
@@ -57,7 +57,7 @@ actor_rollout_ref:
    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: 8
    use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 20480 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 12288 # n * (${data.max_prompt_length} + ${data.max_response_length})
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.001
@@ -100,13 +100,13 @@ actor_rollout_ref:
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 2
-    max_num_batched_tokens: 16384
+    max_num_batched_tokens: 12288
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 160
@@ -117,7 +117,7 @@ actor_rollout_ref:
    # for hf rollout
    do_sample: True
    use_fire_sampling: False
-    max_model_len: 16384
+    max_model_len: 12288
    # number of responses (i.e. num sample times)
    n: 8 # > 1 for grpo
    val_kwargs:
@@ -134,7 +134,7 @@ algorithm:
 verbose: True
 trainer:
  balance_batch: True
-  total_epochs: 10
+  total_epochs: 1
  total_training_steps: null
  project_name: rg-test
  experiment_name: verl_grpo_qwen_3b
@@ -178,7 +178,7 @@ critic:
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 8
+  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}