updated configs

2026-05-02 17:45:58 +00:00 · 2025-03-28 00:05:58 +00:00 · 2025-03-28 00:05:58 +00:00 · 7368d6d313
commit 7368d6d313
parent cc0bacd8e1
8 changed files with 158 additions and 315 deletions
--- a/training/configs/qwen2.5_1.5b_grpo.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo.yaml
@ -1,19 +1,12 @@
 reasoning_gym:
-  dataset_size: 20000
+  dataset_size: 10000
  developer_prompt: DeepSeekZero
  datasets:
-    mini_sudoku:
-      weight: 0.33
-      config:
-        min_empty: 6
-    futoshiki:
-      weight: 0.33
-      config:
-        max_board_size: 5
-    sudoku:
-      weight: 0.34
-      config:
-        min_empty: 20
+    spell_backward:
+        weight: 1
+        config:
+          min_word_len: 3
+          max_word_len: 10
 curriculum:
    enabled: False
    schedule:
@ -29,8 +22,10 @@ curriculum:
 reward:
  use_accuracy: True
  secondary_rewards:
-   - name: length
-     scaling_factor: 1.0
+   - name: cosine
+     scaling_factor: 0.3
+   - name: format
+     scaling_factor: 0.2

 data:
  tokenizer: null
@ -46,7 +41,7 @@ data:
 actor_rollout_ref:
  hybrid_engine: True
  model:
-    path: Qwen/Qwen2.5-3B-Instruct
+    path: Qwen/Qwen2.5-1.5B-Instruct
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
@ -72,7 +67,7 @@ actor_rollout_ref:
      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null   # only useful for warmup with cosine
      warmup_style: constant  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      total_training_steps: 200  # must be overridden by program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
@ -142,7 +137,7 @@ trainer:
  val_generations_to_log_to_wandb: 0
  nnodes: 1
  n_gpus_per_node: 2
-  save_freq: 100
+  save_freq: 50
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  resume_from_path: False
--- a/training/configs/qwen2.5_1.5b_grpo_composite.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo_composite.yaml
@ -2,26 +2,34 @@ reasoning_gym:
  dataset_size: 20000
  developer_prompt: DeepSeekZero
  datasets:
-    mini_sudoku:
-      weight: 0.33
-      config:
-        min_empty: 6
-    futoshiki:
-      weight: 0.33
-      config:
-        max_board_size: 5
-    sudoku:
-      weight: 0.34
-      config:
-        min_empty: 20
+    spell_backward:
+        weight: 0.33
+        config:
+          min_word_len: 3
+          max_word_len: 10
+    letter_counting:
+        weight: 0.34
+        config:
+           min_words: 5
+           max_words: 15
+    number_sorting:
+        weight: 0.33
+        config:
+           min_numbers: 3
+           max_numbers: 10
+           min_decimals: 0
+           max_decimals: 2
+           min_value: -100
+           max_value: 100
+
 curriculum:
    enabled: False
    schedule:
      automatic: True
      update_steps: 30 # automatic curriculum updating after 50 steps
    last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
+    success_threshold: 0.70
+    failure_threshold: 0.10
    curricula:
      spell_backward:
        attribute_levels:
@ -29,6 +37,8 @@ curriculum:
 reward:
  use_accuracy: True
  secondary_rewards:
+   - name: cosine
+     scaling_factor: 0.3
   - name: format
     scaling_factor: 0.2

@ -40,21 +50,20 @@ data:
  max_prompt_length: 512
  max_response_length: 1024
  train_batch_size: 32
-  val_batch_size: 32
+  val_batch_size: 64
  return_raw_chat: True
  return_raw_input_ids: True
-
 actor_rollout_ref:
  hybrid_engine: True
  model:
-    path: Qwen/Qwen2.5-3B-Instruct
+    path: Qwen/Qwen2.5-1.5B-Instruct
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: True
  actor:
    strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: 8
    use_dynamic_bsz: False
@ -70,10 +79,10 @@ actor_rollout_ref:
    ulysses_sequence_parallel_size: 1 # sp size
    optim:
      lr: 1e-6
-      lr_warmup_steps_ratio: 0  # the total steps will be injected during runtime
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null   # only useful for warmup with cosine
-      warmup_style: constant # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      warmup_style: constant  # select from constant/cosine
+      total_training_steps: 200  # must be overridden by program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
@ -101,13 +110,13 @@ actor_rollout_ref:
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 16384
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 12288
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 160
@ -118,7 +127,7 @@ actor_rollout_ref:
    # for hf rollout
    do_sample: True
    use_fire_sampling: False
-    max_model_len: 16384
+    max_model_len: 12288
    # number of responses (i.e. num sample times)
    n: 8 # > 1 for grpo
    val_kwargs:
@ -135,15 +144,15 @@ algorithm:
 verbose: True
 trainer:
  balance_batch: True
-  total_epochs: 5
+  total_epochs: 1
  total_training_steps: null
  project_name: rg-test
-  experiment_name: verl_grpo_qwen_composite
+  experiment_name: verl_grpo_qwen_3b_composite
  logger: [ 'console', 'wandb' ]
  val_generations_to_log_to_wandb: 0
  nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 100
+  n_gpus_per_node: 2
+  save_freq: 50
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  resume_from_path: False
@ -154,13 +163,14 @@ trainer:
  del_local_ckpt_after_load: False
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

+
 critic:
  strategy: fsdp
  optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0  # the total steps will be injected during runtime
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
    min_lr_ratio: null   # only useful for warmup with cosine
-    warmup_style: cosine  # select from constant/cosine
+    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # must be override by program
  model:
    path: ~/models/deepseek-llm-7b-chat
@ -178,7 +188,7 @@ critic:
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 8
+  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
--- a/training/configs/qwen2.5_1.5b_grpo_curr.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo_curr.yaml
@ -1,20 +1,31 @@
 reasoning_gym:
-  dataset_size: 10000
-  enable_curriculum_learning: True
+  dataset_size: 20000
  developer_prompt: DeepSeekZero
-reward:
-  secondary_rewards:
-   - name: format
-     scaling_factor: 0.5
+  datasets:
+    spell_backward:
+        weight: 1
+        config:
+          min_word_len: 3
+          max_word_len: 10
 curriculum:
    enabled: True
-    last_k: 30
-    success_threshold: 0.7
-    failure_threshold: 0.1
+    schedule:
+      automatic: False
+      update_steps: 30 # automatic curriculum updating after 30 steps
+    last_k: 5120 # Minimum number of samples needed for the model to exceed a specific threshold: 20 * num_generations * batch_size
+    success_threshold: 0.70
+    failure_threshold: 0.10
    curricula:
      spell_backward:
        attribute_levels:
          word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+   - name: cosine
+     scaling_factor: 0.3
+   - name: format
+     scaling_factor: 0.2

 data:
  tokenizer: null
@ -23,11 +34,10 @@ data:
  prompt_key: prompt
  max_prompt_length: 512
  max_response_length: 1024
-  train_batch_size: 64
+  train_batch_size: 32
  val_batch_size: 64
-  return_raw_input_ids: True  # This should be set to true when the tokenizer between policy and rm differs
  return_raw_chat: True
-
+  return_raw_input_ids: True
 actor_rollout_ref:
  hybrid_engine: True
  model:
@ -38,9 +48,9 @@ actor_rollout_ref:
    use_remove_padding: True
  actor:
    strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 16
+    ppo_micro_batch_size_per_gpu: 8
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
@ -57,7 +67,7 @@ actor_rollout_ref:
      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null   # only useful for warmup with cosine
      warmup_style: constant  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      total_training_steps: 200  # must be overridden by program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
@ -78,7 +88,6 @@ actor_rollout_ref:
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
  rollout:
    name: vllm
-    max_model_len: 512
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
@ -86,13 +95,13 @@ actor_rollout_ref:
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 2
-    max_num_batched_tokens: 8192
+    max_num_batched_tokens: 12288
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 160
@ -103,6 +112,7 @@ actor_rollout_ref:
    # for hf rollout
    do_sample: True
    use_fire_sampling: False
+    max_model_len: 12288
    # number of responses (i.e. num sample times)
    n: 8 # > 1 for grpo
    val_kwargs:
@ -119,15 +129,15 @@ algorithm:
 verbose: True
 trainer:
  balance_batch: True
-  total_epochs: 10
+  total_epochs: 1
  total_training_steps: null
  project_name: rg-test
-  experiment_name: verl_grpo_qwen_curr
+  experiment_name: verl_grpo_qwen_3b_curr
  logger: [ 'console', 'wandb' ]
  val_generations_to_log_to_wandb: 0
  nnodes: 1
  n_gpus_per_node: 2
-  save_freq: 100
+  save_freq: 50
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  resume_from_path: False
@ -136,7 +146,8 @@ trainer:
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+  default_local_dir: /workspace/joe/checkpoints/checkpoints/${trainer.project_name}/${trainer.experiment_name}
+

 critic:
  strategy: fsdp
--- a/training/configs/qwen2.5_3b_grpo_curr.yaml
+++ b/training/configs/qwen2.5_3b_grpo_curr.yaml
@ -1,201 +0,0 @@
-reasoning_gym:
-  dataset_size: 10000
-  enable_curriculum_learning: True
-  developer_prompt: DeepSeekZero
-curriculum:
-    enabled: True
-    schedule:
-      automatic: True
-      update_steps: 30 # automatic curriculum updating after 50 steps
-    last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
-    curricula:
-      spell_backward:
-        attribute_levels:
-          word_len: 0
-reward:
-  use_accuracy: false
-  secondary_rewards:
-   - name: cosine
-     scaling_factor: 2
-   - name: format
-     scaling_factor: 0.5
-
-data:
-  tokenizer: null
-  train_files: train.parquet
-  val_files: test.parquet
-  prompt_key: prompt
-  max_prompt_length: 512
-  max_response_length: 1024
-  train_batch_size: 128
-  val_batch_size: 128
-  return_raw_chat: True
-  return_raw_input_ids: True
-
-actor_rollout_ref:
-  hybrid_engine: True
-  model:
-    path: Qwen/Qwen2.5-3B-Instruct
-    external_lib: null
-    override_config: { }
-    enable_gradient_checkpointing: True
-    use_remove_padding: True
-  actor:
-    strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
-    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 8
-    use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
-    grad_clip: 1.0
-    clip_ratio: 0.2
-    entropy_coeff: 0.001
-    use_kl_loss: True # True for GRPO
-    kl_loss_coef: 0.001 # for grpo
-    kl_loss_type: low_var_kl # for grpo
-    ppo_epochs: 1
-    shuffle: False
-    ulysses_sequence_parallel_size: 1 # sp size
-    optim:
-      lr: 1e-6
-      lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-      min_lr_ratio: 0.1   # only useful for warmup with cosine
-      warmup_style: cosine # select from constant/cosine
-      total_training_steps: -1  # must be override by program
-    fsdp_config:
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      param_offload: False
-      optimizer_offload: False
-      fsdp_size: -1
-  ref:
-    fsdp_config:
-      param_offload: True
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    max_model_len: 1024
-    temperature: 0.7
-    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
-    top_p: 1
-    prompt_length: ${data.max_prompt_length}  # not use for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 8192
-    max_num_seqs: 1024
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    use_fire_sampling: False
-    # number of responses (i.e. num sample times)
-    n: 8 # > 1 for grpo
-    val_kwargs:
-      do_sample: True
-
-algorithm:
-  gamma: 1.0
-  lam: 1.0
-  adv_estimator: grpo
-  kl_penalty: kl  # how to estimate kl divergence
-  kl_ctrl:
-    type: fixed
-    kl_coef: 0.001
-verbose: True
-trainer:
-  balance_batch: True
-  total_epochs: 5
-  total_training_steps: null
-  project_name: rg-test
-  experiment_name: verl_grpo_qwen_curr
-  logger: [ 'console', 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 50
-  # auto: find the last ckpt to resume. If can't find, start from scratch
-  resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
-  test_freq: 300
-  critic_warmup: 0
-  default_hdfs_dir: null
-  remove_previous_ckpt_in_save: False
-  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
-
-critic:
-  strategy: fsdp
-  optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-    min_lr_ratio: null   # only useful for warmup with cosine
-    warmup_style: cosine  # select from constant/cosine
-    total_training_steps: -1  # must be override by program
-  model:
-    path: ~/models/deepseek-llm-7b-chat
-    tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: { }
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: True
-    use_remove_padding: False
-    fsdp_config:
-      param_offload: False
-      optimizer_offload: False
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      fsdp_size: -1
-  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: null
-  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
-  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
-  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
-  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
-  ulysses_sequence_parallel_size: 1 # sp size
-  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
-  shuffle: ${actor_rollout_ref.actor.shuffle}
-  grad_clip: 1.0
-  cliprange_value: 0.5
-
-# Reward model not used for GRPO
-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path}
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  micro_batch_size: null
-  micro_batch_size_per_gpu: null
-  max_length: null
-  ulysses_sequence_parallel_size: 1
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}