mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
* added open-instruct * fixed hooks * GRPO --------- Co-authored-by: Andreas Koepf <andreas.koepf@provisio.com>
34 lines
1.1 KiB
Bash
Executable file
34 lines
1.1 KiB
Bash
Executable file
# Launches one GRPO training run by invoking src/grpo_trainer.py with a fixed
# set of command-line hyperparameters. The trainer path and the output
# directory are relative, so run this from the directory that contains src/.

# Run name used for the checkpoint directory. ${RANDOM} appends a
# pseudo-random integer so repeated launches are unlikely to share a directory.
# NOTE(review): the name says "qwen2.5_math" while --model_name_or_path below
# points at a DeepSeek-R1 distill checkpoint; confirm the label is intended.
exp_name="0302_qwen2.5_math_grpo_fast1_${RANDOM}"

# Every continued line must end in " \" (space, then backslash as the very
# last character): the shell deletes backslash-newline outright, so a missing
# space would fuse the value with the next flag.
# The single quotes around --stop_strings pass the inner double quotes through
# to the trainer literally (presumably it parses the value; verify against
# the trainer's argument handling).
python src/grpo_trainer.py \
  --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
  --beta 0.0 \
  --num_unique_prompts_rollout 16 \
  --num_samples_per_prompt_rollout 8 \
  --output_dir "./open_instruct_checkpoints/${exp_name}" \
  --save_freq 40 \
  --kl_estimator kl3 \
  --learning_rate 5e-7 \
  --max_token_length 256 \
  --max_prompt_token_length 128 \
  --response_length 256 \
  --pack_length 512 \
  --stop_strings '"</answer>"' \
  --chat_template_name r1_simple_chat_postpend_think \
  --temperature 1.0 \
  --total_episodes 100000 \
  --deepspeed_stage 3 \
  --per_device_train_batch_size 1 \
  --num_mini_batches 1 \
  --num_learners_per_node 1 \
  --num_epochs 1 \
  --vllm_tensor_parallel_size 1 \
  --vllm_num_engines 1 \
  --vllm_enforce_eager true \
  --lr_scheduler_type linear \
  --seed 1 \
  --num_evals 100 \
  --dataset_name chain_sum \
  --gradient_checkpointing \
  --with_tracking \
  --single_gpu_mode false \
  --gather_whole_model false