#!/usr/bin/env bash
#
# Launches src/grpo_trainer.py with the
# deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B model on the chain_sum dataset.
#
# Usage: run from the directory that contains src/ (the trainer path below is
# relative to the current working directory).
#
# Output: checkpoints are written under
#   ./open_instruct_checkpoints/<exp_name>
# where <exp_name> ends in a $RANDOM suffix, so each invocation normally gets
# its own output directory. $RANDOM is a bash feature, hence the bash shebang;
# under a plain POSIX sh it may expand to an empty string.
#
# NOTE(review): the experiment name says "qwen2.5_math" while the model passed
# below is DeepSeek-R1-Distill-Qwen-1.5B and the dataset is chain_sum — confirm
# the name is intentional before relying on it to identify runs.
exp_name="0302_qwen2.5_math_grpo_fast1_${RANDOM}"

# The stop string is passed with its double quotes included (the single quotes
# keep them literal); presumably the trainer's argument parser expects that
# form — verify against grpo_trainer.py before changing it.
python src/grpo_trainer.py \
    --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --beta 0.0 \
    --num_unique_prompts_rollout 16 \
    --num_samples_per_prompt_rollout 8 \
    --output_dir "./open_instruct_checkpoints/${exp_name}" \
    --save_freq 40 \
    --kl_estimator kl3 \
    --learning_rate 5e-7 \
    --max_token_length 256 \
    --max_prompt_token_length 128 \
    --response_length 256 \
    --pack_length 512 \
    --stop_strings '"</answer>"' \
    --chat_template_name r1_simple_chat_postpend_think \
    --temperature 1.0 \
    --total_episodes 100000 \
    --deepspeed_stage 3 \
    --per_device_train_batch_size 1 \
    --num_mini_batches 1 \
    --num_learners_per_node 1 \
    --num_epochs 1 \
    --vllm_tensor_parallel_size 1 \
    --vllm_num_engines 1 \
    --vllm_enforce_eager true \
    --lr_scheduler_type linear \
    --seed 1 \
    --num_evals 100 \
    --dataset_name chain_sum \
    --gradient_checkpointing \
    --with_tracking \
    --single_gpu_mode false \
    --gather_whole_model false