diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_3b_grpo.yaml index b4a9bac1..14b4c852 100644 --- a/training/configs/qwen2.5_3b_grpo.yaml +++ b/training/configs/qwen2.5_3b_grpo.yaml @@ -37,7 +37,7 @@ data: val_files: test.parquet prompt_key: prompt max_prompt_length: 512 - max_response_length: 1024 + max_response_length: 4096 train_batch_size: 16 val_batch_size: 16 return_raw_input_ids: True # This should be set to true when the tokenizer between policy and rm differs @@ -57,7 +57,7 @@ actor_rollout_ref: ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 8 use_dynamic_bsz: False - ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + ppo_max_token_len_per_gpu: 36864 # n * (${data.max_prompt_length} + ${data.max_response_length}); here n=8: 8 * (512 + 4096) = 36864 grad_clip: 1.0 clip_ratio: 0.2 entropy_coeff: 0.001