# reasoning-gym/training/qwen-math/recipes/Qwen2.5-3B-Instruct/grpo/model_curated_rg_math.yaml

# Post-training setup. The dataset and reward-function names are presumably
# resolved against ./tina/utils/constant.py -- check there before renaming.
model_post_train_type: grpo
model_post_train_dataset_name: curated_rg_math

# Three reward functions and three weights are listed; they are presumably
# paired by position (confirm against the training script).
rl_post_train_reward_funcs:
- tag_count_reward
- length
- accuracy
rl_post_train_reward_weights:
- 1.0  # tag_count_reward
- 1.0  # length
- 1.0  # accuracy


# Model configs from trl
attn_implementation: flash_attention_2
model_name_or_path: Qwen2.5-3B-Instruct

# PEFT is switched off, i.e. this recipe is a full fine-tune.
# NOTE(review): the lora_* values below are presumably unused while
# use_peft is false -- confirm before relying on them.
use_peft: false
lora_alpha: 128
lora_dropout: 0.05
lora_r: 32
lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
- down_proj
- up_proj
- gate_proj


# GRPO trainer configs from trl

# Precision and memory
bf16: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# vLLM settings
use_vllm: true
vllm_device: cuda:2
vllm_gpu_memory_utilization: 0.9

# Logging: step-based, every step, first step included
log_level: info
logging_strategy: steps
logging_steps: 1
logging_first_step: true

# Hugging Face Hub upload
push_to_hub: true
hub_private_repo: true
hub_strategy: every_save
# NOTE(review): this value looks like an account name rather than a
# repository id -- confirm how the training script expands it.
hub_model_id: starzmustdie

# Optimizer and LR schedule.
# The mantissa dot is deliberate: YAML 1.1 loaders such as PyYAML resolve
# `1e-06` to the string '1e-06', whereas `1.0e-06` is a float under both
# YAML 1.1 and 1.2.
learning_rate: 1.0e-06
lr_scheduler_type: constant_with_warmup
# NOTE(review): transformers' get_scheduler appears not to forward
# lr_scheduler_kwargs for constant_with_warmup; the warmup length would then
# come from warmup_steps / warmup_ratio instead. Confirm that the 60-step
# warmup below actually takes effect in the training script.
lr_scheduler_kwargs:
  num_warmup_steps: 60

# Sequence lengths, generations per prompt and batch size
max_prompt_length: 512
max_completion_length: 2048
num_generations: 8
per_device_train_batch_size: 8

# Run length. In Hugging Face TrainingArguments a positive max_steps
# overrides num_train_epochs -- presumably that applies here as well.
max_steps: 600
num_train_epochs: 1

# Output handling and experiment tracking
overwrite_output_dir: true
report_to:
- wandb

# Checkpointing: step-based saving, every 200 steps, with a cap of 100 on
# save_total_limit.
save_total_limit: 100
save_strategy: steps
save_steps: 200

# Generation temperature and random seed
temperature: 0.6
seed: 42