mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Test training with trl (#70)
* First TRL GRPO implementation
* Added config YAML file
* Added README and dependencies
* Updated reward format function
This commit is contained in:
parent
3f6b2fc807
commit
a8e11e71be
5 changed files with 287 additions and 0 deletions
37
examples/trl/config/grpo.yaml
Normal file
37
examples/trl/config/grpo.yaml
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#Model arguments
|
||||
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||
|
||||
|
||||
#script arguments
|
||||
dataset_name: chain_sum
|
||||
|
||||
#training arguments
|
||||
bf16: true
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id:
|
||||
seed: 42
|
||||
eval_seed: 101
|
||||
log_level: info
|
||||
logging_steps: 10
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine
|
||||
learning_rate: 2.0e-05
|
||||
max_prompt_length: 512
|
||||
max_completion_length: 1024
|
||||
num_generations: 8
|
||||
per_device_train_batch_size: 1
|
||||
per_device_eval_batch_size: 1
|
||||
overwrite_output_dir: true
|
||||
output_dir: data/Qwen-1.5B-GRPO
|
||||
train_size: 1000
|
||||
eval_size: 100
|
||||
num_train_epochs: 1
|
||||
max_steps: -1
|
||||
push_to_hub: true
|
||||
report_to: ['wandb']
|
||||
#do_eval: true
|
||||
#eval_strategy: steps
|
||||
#eval_steps: 100
|
||||
Loading…
Add table
Add a link
Reference in a new issue