reasoning-gym/examples/open-instruct/grpo_config.sh
joesharratt1229 9234aa77bf
Feat/open instruct example (#381)
* added open-instruct

* fixed hooks

* GRPO

---------

Co-authored-by: Andreas Koepf <andreas.koepf@provisio.com>
2025-03-17 23:20:11 +01:00

34 lines
1.1 KiB
Bash
Executable file

# Launches src/grpo_trainer.py with a fixed set of GRPO training flags.
# Requires bash (uses $RANDOM and an array). The trainer path is relative, so
# run this from the directory that contains src/.

# Experiment name; the $RANDOM suffix makes the checkpoint directory name
# differ from run to run.
exp_name="0302_qwen2.5_math_grpo_fast1_${RANDOM}"

# The flags live in an array so every flag and value is its own word no matter
# how the lines are spaced or indented (a backslash continuation glued directly
# to a value merges it with the next line), and so the list can carry comments.
grpo_args=(
  # Model, rollout sizes and checkpoint output.
  --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
  --beta 0.0
  --num_unique_prompts_rollout 16
  --num_samples_per_prompt_rollout 8
  --output_dir "./open_instruct_checkpoints/${exp_name}"
  --save_freq 40
  --kl_estimator kl3
  --learning_rate 5e-7

  # Length limits passed to the trainer.
  --max_token_length 256
  --max_prompt_token_length 128
  --response_length 256
  --pack_length 512

  # The single quotes are shell quoting; the inner double quotes are part of
  # the value the trainer receives.
  --stop_strings '"</answer>"'
  --chat_template_name r1_simple_chat_postpend_think
  --temperature 1.0
  --total_episodes 100000

  # Learner / batching settings.
  --deepspeed_stage 3
  --per_device_train_batch_size 1
  --num_mini_batches 1
  --num_learners_per_node 1
  --num_epochs 1

  # vLLM settings.
  --vllm_tensor_parallel_size 1
  --vllm_num_engines 1
  --vllm_enforce_eager true

  # Schedule, seed, evaluation count and dataset.
  --lr_scheduler_type linear
  --seed 1
  --num_evals 100
  --dataset_name chain_sum

  # Remaining switches.
  --gradient_checkpointing
  --with_tracking
  --single_gpu_mode false
  --gather_whole_model false
)

python src/grpo_trainer.py "${grpo_args[@]}"