---
# Training configuration: Reasoning Gym dataset settings, the model to load,
# and GRPO trainer options. One key per line so that the file parses as a
# mapping (collapsed onto a single `#` line it is read as one big comment).

# Reasoning Gym configs
dataset_size: 10000
developer_prompt: DeepSeekZero
developer_role: system
# Datasets to draw from, each with a weight.
datasets:
  chain_sum:
    weight: 1

# Model configs from trl
model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
attn_implementation: flash_attention_2

# GRPO trainer configs from trl
bf16: true
use_vllm: true
vllm_device: cuda:1
vllm_gpu_memory_utilization: 0.9
log_level: info
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_first_step: true
logging_steps: 1
logging_strategy: steps
# Written with a decimal point in the mantissa on purpose: YAML 1.1 loaders
# (e.g. PyYAML) resolve a bare `1e-06` as a string instead of a float.
learning_rate: 1.0e-06
lr_scheduler_type: constant_with_warmup
# NOTE(review): warm-up is specified twice -- num_warmup_steps: 10 here and
# warmup_ratio: 0.1 below (0.1 * max_steps 100 = 10 steps). The two agree
# today; confirm which one the trainer honors before changing either.
lr_scheduler_kwargs:
  num_warmup_steps: 10
max_prompt_length: 512
max_completion_length: 512
max_steps: 100
num_generations: 16
num_train_epochs: 1
overwrite_output_dir: true
per_device_train_batch_size: 8
report_to:
  - wandb
save_strategy: steps
save_steps: 50
save_total_limit: 5
seed: 42
temperature: 0.6
warmup_ratio: 0.1