Test training with trl (#70)

* first trl grpo implementation
* added config yaml file
* added read me and dependencies
* updated reward format func
This commit is contained in:
joesharratt1229 2025-02-07 06:42:32 +00:00 committed by GitHub
parent 3f6b2fc807
commit a8e11e71be
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 287 additions and 0 deletions

View file

@ -0,0 +1,37 @@
#Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
#script arguments
dataset_name: chain_sum
#training arguments
bf16: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id:
seed: 42
eval_seed: 101
log_level: info
logging_steps: 10
logging_strategy: steps
lr_scheduler_type: cosine
learning_rate: 2.0e-05
max_prompt_length: 512
max_completion_length: 1024
num_generations: 8
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
overwrite_output_dir: true
output_dir: data/Qwen-1.5B-GRPO
train_size: 1000
eval_size: 100
num_train_epochs: 1
max_steps: -1
push_to_hub: true
report_to: ['wandb']
#do_eval: true
#eval_strategy: steps
#eval_steps: 100