reasoning-gym/examples/trl/config/grpo.yaml
joesharratt1229 1c98584f28
Feat/unsloth example (#482)
* cleaned up examples

* updated failing hooks

* updated readme

* corrected linting checks
2025-06-28 17:04:38 +01:00

47 lines
945 B
YAML

# Reasoning Gym configs
dataset_size: 10000
developer_prompt: DeepSeekZero
developer_role: system
# `datasets` maps each dataset name to its per-dataset options. The entries
# must be indented beneath it; at column 0 `datasets` and `chain_sum` would
# both load as null and `weight` would become an unrelated top-level key.
datasets:
  chain_sum:
    # presumably the relative sampling weight -- confirm in the training script
    weight: 1
# Model configs from trl
# Attention backend requested for the model.
attn_implementation: 'flash_attention_2'
# Model identifier (Hub id or local path) handed to trl.
model_name_or_path: 'Qwen/Qwen2.5-1.5B-Instruct'
# GRPO trainer configs from trl

# Precision and vLLM-backed generation.
bf16: true
use_vllm: true
# Quoted so the embedded colon can never be mistaken for mapping syntax.
vllm_device: 'cuda:1'
vllm_gpu_memory_utilization: 0.9
log_level: info

# Gradient handling. `use_reentrant` is an option *of*
# `gradient_checkpointing_kwargs`, so it must be nested under it; left at
# column 0 the kwargs mapping loads as null and the option is a stray key.
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# Log on the first step and on every step thereafter.
logging_first_step: true
logging_steps: 1
logging_strategy: steps
# Learning-rate schedule.
# The rate is written with a mantissa dot on purpose: YAML 1.1 loaders such
# as PyYAML only resolve scientific notation as a float when it contains a
# `.`, so a bare `1e-06` would load as the string "1e-06".
learning_rate: 1.0e-6
lr_scheduler_type: constant_with_warmup
# Scheduler-specific options; must be nested so they load as a mapping
# instead of leaving `lr_scheduler_kwargs` null.
lr_scheduler_kwargs:
  num_warmup_steps: 10
# Prompt/completion token budgets and sampling settings.
max_prompt_length: 512
max_completion_length: 512
temperature: 0.6
# NOTE(review): TRL documents that the effective batch size must be evenly
# divisible by `num_generations` -- confirm this holds for the number of
# training processes this example is launched with.
num_generations: 16
per_device_train_batch_size: 8
seed: 42

# Run length. NOTE(review): Hugging Face documents that a positive
# `max_steps` overrides `num_train_epochs` -- confirm which one is intended
# to bound the run.
max_steps: 100
num_train_epochs: 1
# With max_steps: 100 this ratio corresponds to roughly 10 warmup steps.
warmup_ratio: 0.1

# Checkpointing: save every 50 steps, keep at most 5 checkpoints.
save_strategy: steps
save_steps: 50
save_total_limit: 5
overwrite_output_dir: true

# Experiment tracking backends.
report_to:
  - wandb