mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-22 16:49:06 +00:00
Feat/open instruct example (#381)
* added open-instruct
* fixed hooks
* GRPO

---------

Co-authored-by: Andreas Koepf <andreas.koepf@provisio.com>
This commit is contained in:
parent
eaef88b45b
commit
1da84a0b41
5 changed files with 629 additions and 0 deletions
34
examples/open-instruct/grpo_config.sh
Executable file
34
examples/open-instruct/grpo_config.sh
Executable file
|
|
@ -0,0 +1,34 @@
|
|||
#!/usr/bin/env bash
#
# Launches src/grpo_trainer.py with a fixed set of GRPO training options.
#
# Usage: run from the directory that contains src/grpo_trainer.py. Both the
# trainer path and the output directory are relative to the current
# working directory.
#
# Requires bash: the run name below relies on ${RANDOM}, which a plain
# POSIX sh does not provide.
set -euo pipefail

# Run name; the ${RANDOM} suffix (0-32767) gives each launch its own,
# very likely distinct, output directory.
exp_name="0302_qwen2.5_math_grpo_fast1_${RANDOM}"

# Options live in an array instead of one long backslash-continued command.
# A continuation backslash glued to a value (e.g. `1\`) merges that value
# with the next option when the following line has no leading whitespace;
# an array has no such failure mode.
trainer_args=(
  --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
  --beta 0.0
  --num_unique_prompts_rollout 16
  --num_samples_per_prompt_rollout 8
  # Quoted so the path stays a single argument.
  --output_dir "./open_instruct_checkpoints/${exp_name}"
  --save_freq 40
  --kl_estimator kl3
  --learning_rate 5e-7
  --max_token_length 256
  --max_prompt_token_length 128
  --response_length 256
  --pack_length 512
  # Single quotes keep the inner double quotes as part of the value, so the
  # trainer receives the literal text "</answer>" including the quotes.
  --stop_strings '"</answer>"'
  --chat_template_name r1_simple_chat_postpend_think
  --temperature 1.0
  --total_episodes 100000
  --deepspeed_stage 3
  --per_device_train_batch_size 1
  --num_mini_batches 1
  --num_learners_per_node 1
  --num_epochs 1
  --vllm_tensor_parallel_size 1
  --vllm_num_engines 1
  --vllm_enforce_eager true
  --lr_scheduler_type linear
  --seed 1
  --num_evals 100
  --dataset_name chain_sum
  --gradient_checkpointing
  --with_tracking
  --single_gpu_mode false
  --gather_whole_model false
)

python src/grpo_trainer.py "${trainer_args[@]}"
|
||||
Loading…
Add table
Add a link
Reference in a new issue