#!/usr/bin/env bash
# Launch a GRPO training run: invokes src/grpo_trainer.py with
# deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B as the starting model and
# chain_sum as the dataset, writing checkpoints under
# ./open_instruct_checkpoints/<exp_name>.
#
# Usage: run from the directory that contains src/ -- both the trainer path
# and the checkpoint directory are relative to the current directory.
# Requires bash (uses ${RANDOM}).
#
# Notes:
#   * --stop_strings '""' passes the literal two-character string "" to the
#     trainer (the single quotes keep the double quotes intact).
#   * NOTE(review): the experiment name says "qwen2.5_math" while the model is
#     a DeepSeek-R1 distill and the dataset is chain_sum -- confirm the name
#     is intentional before relying on it to identify runs.

# Must be a separate statement. Written as a prefix on the python command
# (exp_name=... python ...), the shell expands $exp_name in the arguments
# before the assignment takes effect, so --output_dir would get an empty name.
# ${RANDOM} appends a random integer so repeated launches get different
# output directories.
exp_name="0302_qwen2.5_math_grpo_fast1_${RANDOM}"

# Each continuation backslash must be the last character on its line and be
# preceded by a space; otherwise adjacent arguments are glued together.
python src/grpo_trainer.py \
  --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
  --beta 0.0 \
  --num_unique_prompts_rollout 16 \
  --num_samples_per_prompt_rollout 8 \
  --output_dir "./open_instruct_checkpoints/${exp_name}" \
  --save_freq 40 \
  --kl_estimator kl3 \
  --learning_rate 5e-7 \
  --max_token_length 256 \
  --max_prompt_token_length 128 \
  --response_length 256 \
  --pack_length 512 \
  --stop_strings '""' \
  --chat_template_name r1_simple_chat_postpend_think \
  --temperature 1.0 \
  --total_episodes 100000 \
  --deepspeed_stage 3 \
  --per_device_train_batch_size 1 \
  --num_mini_batches 1 \
  --num_learners_per_node 1 \
  --num_epochs 1 \
  --vllm_tensor_parallel_size 1 \
  --vllm_num_engines 1 \
  --vllm_enforce_eager true \
  --lr_scheduler_type linear \
  --seed 1 \
  --num_evals 100 \
  --dataset_name chain_sum \
  --gradient_checkpointing \
  --with_tracking \
  --single_gpu_mode false \
  --gather_whole_model false