diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_3b_grpo.yaml
index 6a35ad7e..667b97fd 100644
--- a/training/configs/qwen2.5_3b_grpo.yaml
+++ b/training/configs/qwen2.5_3b_grpo.yaml
@@ -21,13 +21,13 @@ curriculum:
   update_steps: 30 # automatic curriculum updating after 50 steps
   last_k: 20
   success_threshold: 0.7
-  failure_threshold: 0.1
+  failure_threshold: 0.10
   curricula:
     spell_backward:
       attribute_levels:
         word_len: 0
 reward:
-  use_accuracy: False
+  use_accuracy: True
   secondary_rewards:
     - name: format
       scaling_factor: 0.2
@@ -38,12 +38,11 @@ data:
   val_files: test.parquet
   prompt_key: prompt
   max_prompt_length: 512
-  max_response_length: 1024
-  train_batch_size: 32
-  val_batch_size: 32
+  max_response_length: 2048
+  train_batch_size: 16
+  val_batch_size: 16
   return_raw_chat: True
   return_raw_input_ids: True
-
 actor_rollout_ref:
   hybrid_engine: True
   model:
@@ -58,7 +57,7 @@ actor_rollout_ref:
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 8
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 20480 # n * ${data.max_prompt_length} + ${data.max_response_length}
     grad_clip: 1.0
     clip_ratio: 0.2
     entropy_coeff: 0.001
@@ -70,9 +69,9 @@ actor_rollout_ref:
     ulysses_sequence_parallel_size: 1 # sp size
     optim:
       lr: 1e-6
-      lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
       min_lr_ratio: null # only useful for warmup with cosine
-      warmup_style: constant # select from constant/cosine
+      warmup_style: constant  # select from constant/cosine
       total_training_steps: -1 # must be override by program
     fsdp_config:
       wrap_policy:
@@ -101,13 +100,14 @@ actor_rollout_ref:
     response_length: ${data.max_response_length}
     # for vllm rollout
     dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.8
+    gpu_memory_utilization: 0.6
     ignore_eos: False
     enforce_eager: True
     free_cache_engine: True
     load_format: dummy_dtensor
     tensor_model_parallel_size: 2
-    max_num_batched_tokens: 8192
+    max_num_batched_tokens: 16384
+    max_model_len: 16384
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 160
@@ -118,7 +118,6 @@ actor_rollout_ref:
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
-    max_model_len: 8192
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -135,10 +134,10 @@ algorithm:
   verbose: True
 trainer:
   balance_batch: True
-  total_epochs: 3
+  total_epochs: 10
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_composite
+  experiment_name: verl_grpo_qwen_3b
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
@@ -154,13 +153,14 @@ trainer:
   del_local_ckpt_after_load: False
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
 
+
 critic:
   strategy: fsdp
   optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null # only useful for warmup with cosine
-    warmup_style: constant # select from constant/cosine
+    warmup_style: constant  # select from constant/cosine
     total_training_steps: -1 # must be override by program
   model:
     path: ~/models/deepseek-llm-7b-chat
@@ -178,7 +178,7 @@ critic:
     fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 16
+  ppo_micro_batch_size_per_gpu: null
   forward_micro_batch_size: ${critic.ppo_micro_batch_size}
   forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
   use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -208,5 +208,4 @@ reward_model:
   max_length: null
   ulysses_sequence_parallel_size: 1
   use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
-
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}