# Mirror of https://github.com/open-thought/reasoning-gym.git
# Synced 2026-04-19 12:58:07 +00:00
# Commit: first trl grpo implementation; added config yaml file;
#         added README and dependencies; updated reward format func
# 37 lines, 768 B, YAML
---
# Configuration for a GRPO training run. Keys are grouped into model,
# script, and training arguments; each key is consumed by the training
# script's argument parser (not visible from this file).

# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B

# Script arguments
dataset_name: chain_sum

# Training arguments
bf16: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
# use_reentrant must be nested under gradient_checkpointing_kwargs; at top
# level it would leave the kwargs mapping null and add a stray key.
gradient_checkpointing_kwargs:
  use_reentrant: false
# Explicit null (a bare `hub_model_id:` also parses as null).
# NOTE(review): push_to_hub is true below - confirm that the repository
# name the trainer falls back to is the intended upload target.
hub_model_id: null
seed: 42
eval_seed: 101
log_level: info
logging_steps: 10
logging_strategy: steps
lr_scheduler_type: cosine
learning_rate: 2.0e-05
max_prompt_length: 512
max_completion_length: 1024
num_generations: 8
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
overwrite_output_dir: true
output_dir: data/Qwen-1.5B-GRPO
train_size: 1000
eval_size: 100
num_train_epochs: 1
max_steps: -1
push_to_hub: true
report_to:
  - wandb

# Evaluation is currently disabled; uncomment all three keys to enable it.
# do_eval: true
# eval_strategy: steps
# eval_steps: 100