diff --git a/training/configs/qwen2.5_3b_grpo_composite.yaml b/training/configs/qwen2.5_3b_grpo_composite.yaml
index 0aabcfc3..1c51e0bc 100644
--- a/training/configs/qwen2.5_3b_grpo_composite.yaml
+++ b/training/configs/qwen2.5_3b_grpo_composite.yaml
@@ -118,6 +118,7 @@ actor_rollout_ref:
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
+    max_model_len: 4096  # NOTE(review): presumably caps prompt + response tokens for rollout; confirm
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -188,7 +189,6 @@ critic:
   shuffle: ${actor_rollout_ref.actor.shuffle}
   grad_clip: 1.0
   cliprange_value: 0.5
-  max_model_len: 4096
 
 # Reward model not used for GRPO
 reward_model: