diff --git a/training/configs/qwen2.5_3b_grpo_composite.yaml b/training/configs/qwen2.5_3b_grpo_composite.yaml index 0aabcfc3..1c51e0bc 100644 --- a/training/configs/qwen2.5_3b_grpo_composite.yaml +++ b/training/configs/qwen2.5_3b_grpo_composite.yaml @@ -118,6 +118,7 @@ actor_rollout_ref: # for hf rollout do_sample: True use_fire_sampling: False + max_model_len: 4096 # number of responses (i.e. num sample times) n: 8 # > 1 for grpo val_kwargs: @@ -188,7 +189,6 @@ critic: shuffle: ${actor_rollout_ref.actor.shuffle} grad_clip: 1.0 cliprange_value: 0.5 - max_model_len: 4096 # Reward model not used for GRPO reward_model: