Feat/open instruct example (#381)

* added open-instruct * fixed hooks * GRPO --------- Co-authored-by: Andreas Koepf <andreas.koepf@provisio.com>
2026-04-22 16:49:06 +00:00 · 2025-03-17 22:20:11 +00:00 · 2025-03-17 22:20:11 +00:00 · 1da84a0b41
commit 1da84a0b41
parent eaef88b45b
5 changed files with 629 additions and 0 deletions
--- a/examples/open-instruct/grpo_config.sh
+++ b/examples/open-instruct/grpo_config.sh
@ -0,0 +1,34 @@
+exp_name="0302_qwen2.5_math_grpo_fast1_${RANDOM}"  # Run name for the trainer launch below; the ${RANDOM} suffix makes repeated launches unlikely to share an output directory.
+python src/grpo_trainer.py \
+    --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --beta 0.0 \
+    --num_unique_prompts_rollout 16 \
+    --num_samples_per_prompt_rollout 8 \
+    --output_dir "./open_instruct_checkpoints/${exp_name}" \
+    --save_freq 40 \
+    --kl_estimator kl3 \
+    --learning_rate 5e-7 \
+    --max_token_length 256 \
+    --max_prompt_token_length 128 \
+    --response_length 256 \
+    --pack_length 512 \
+    --stop_strings '"</answer>"' \
+    --chat_template_name r1_simple_chat_postpend_think \
+    --temperature 1.0 \
+    --total_episodes 100000 \
+    --deepspeed_stage 3 \
+    --per_device_train_batch_size 1 \
+    --num_mini_batches 1 \
+    --num_learners_per_node 1 \
+    --num_epochs 1 \
+    --vllm_tensor_parallel_size 1 \
+    --vllm_num_engines 1 \
+    --vllm_enforce_eager true \
+    --lr_scheduler_type linear \
+    --seed 1 \
+    --num_evals 100 \
+    --dataset_name chain_sum \
+    --gradient_checkpointing \
+    --with_tracking \
+    --single_gpu_mode false \
+    --gather_whole_model false