reasoning_gym: dataset_size: 10000 developer_prompt: DeepSeekZero enable_curriculum_learning: False datasets: # Used if enable_curriculum_learning is False mini_sudoku: weight: 0.33 config: min_empty: 6 futoshiki: weight: 0.33 config: max_board_size: 5 sudoku: weight: 0.34 config: min_empty: 20 curricula: leg_counting: attribute_levels: num_animals: 2 weight: 1.0 products: attribute_levels: num_terms: 4 num_digits: 4 weight: 1.0 chain_sum: attribute_levels: num_terms: 4 num_digits: 4 weight: 1.0 reward: format_reward: enable: True scaling_factor: 0.2 prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with , such as in the case of (distilled) r1 models length_reward: enable: True scaling_factor: 0.2 data: tokenizer: null train_files: train.parquet val_files: test.parquet prompt_key: prompt max_prompt_length: 512 max_response_length: 1024 train_batch_size: 16 val_batch_size: 16 return_raw_input_ids: True # This should be set to true when the tokenizer between policy and rm differs return_raw_chat: True actor_rollout_ref: hybrid_engine: True model: path: Qwen/Qwen2.5-1.5B-Instruct external_lib: null override_config: { } enable_gradient_checkpointing: True use_remove_padding: True actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 16 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 8 use_dynamic_bsz: False ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} grad_clip: 1.0 clip_ratio: 0.2 entropy_coeff: 0.001 use_kl_loss: True # True for GRPO kl_loss_coef: 0.001 # for grpo kl_loss_type: low_var_kl # for grpo ppo_epochs: 1 shuffle: False ulysses_sequence_parallel_size: 1 # sp size checkpoint: contents: ['model', 'hf_model', 'optimizer', 'extra'] optim: lr: 1e-6 lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime min_lr_ratio: null # only useful for warmup with cosine warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program fsdp_config: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 param_offload: False optimizer_offload: False fsdp_size: -1 ref: fsdp_config: param_offload: True wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 160 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size rollout: name: vllm temperature: 1.0 top_k: -1 # 0 for hf rollout, -1 for vllm rollout top_p: 1 prompt_length: ${data.max_prompt_length} # not use for opensource response_length: ${data.max_response_length} # for vllm rollout dtype: bfloat16 # should align with FSDP gpu_memory_utilization: 0.6 ignore_eos: False enforce_eager: True free_cache_engine: True load_format: dummy_dtensor tensor_model_parallel_size: 2 max_num_batched_tokens: 8192 max_num_seqs: 1024 max_model_len: 1024 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 160 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} disable_log_stats: True enable_chunked_prefill: True # could get higher throughput # for hf rollout do_sample: True use_fire_sampling: False # number of responses (i.e. num sample times) n: 8 # > 1 for grpo val_kwargs: do_sample: True algorithm: gamma: 1.0 lam: 1.0 adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed kl_coef: 0.001 trainer: balance_batch: True total_epochs: 10 total_training_steps: null project_name: rg-test experiment_name: verl_grpo_qwen2.5_1.5b logger: [ 'console', 'wandb' ] val_generations_to_log_to_wandb: 0 nnodes: 1 n_gpus_per_node: 2 save_freq: 100 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if resume_from_path: False test_freq: 100 critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} critic: strategy: fsdp optim: lr: 1e-5 lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime min_lr_ratio: null # only useful for warmup with cosine warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program model: path: ~/models/deepseek-llm-7b-chat tokenizer_path: ${actor_rollout_ref.model.path} override_config: { } external_lib: ${actor_rollout_ref.model.external_lib} enable_gradient_checkpointing: True use_remove_padding: False fsdp_config: param_offload: False optimizer_offload: False wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: null forward_micro_batch_size: ${critic.ppo_micro_batch_size} forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: 1 # sp size ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} shuffle: ${actor_rollout_ref.actor.shuffle} grad_clip: 1.0 cliprange_value: 0.5 # Reward model not used for GRPO reward_model: enable: False strategy: fsdp model: input_tokenizer: ${actor_rollout_ref.model.path} path: ~/models/FsfairX-LLaMA3-RM-v0.1 external_lib: ${actor_rollout_ref.model.external_lib} use_remove_padding: False fsdp_config: min_num_params: 0 param_offload: False fsdp_size: -1 micro_batch_size: null micro_batch_size_per_gpu: null max_length: null ulysses_sequence_parallel_size: 1 use_dynamic_bsz: ${critic.use_dynamic_bsz} forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}