diff --git a/training/configs/external_generalisation/math_curriculum_qwen_7b.yaml b/training/configs/external_generalisation/math_curriculum_qwen_7b.yaml
new file mode 100644
index 00000000..21cb27c3
--- /dev/null
+++ b/training/configs/external_generalisation/math_curriculum_qwen_7b.yaml
@@ -0,0 +1,242 @@
+reasoning_gym:
+  dataset_size: 50000
+  developer_prompt: DeepSeekZero
+  datasets:
+    dummy:
+      weight: 1
+
+curriculum:
+  enabled: True
+  schedule:
+    automatic: True
+    update_steps: 30  # update the curriculum automatically every 30 steps
+  last_k: 20
+  success_threshold: 0.70
+  failure_threshold: 0.10
+  curricula:
+    complex_arithmetic:
+      weight: 1
+    intermediate_integration:
+      weight: 1
+    polynomial_equations:
+      weight: 1
+    polynomial_multiplication:
+      weight: 1
+    simple_equations:
+      weight: 1
+    simple_integration:
+      weight: 1
+    propositional_logic:
+      weight: 1
+    advanced_geometry:
+      weight: 1
+    simple_geometry:
+      weight: 1
+    basic_arithmetic:
+      weight: 1
+    bitwise_arithmetic:
+      weight: 1
+    chain_sum:
+      weight: 1
+    decimal_arithmetic:
+      weight: 1
+    decimal_chain_sum:
+      weight: 1
+    fraction_simplification:
+      weight: 1
+    gcd:
+      weight: 1
+    lcm:
+      weight: 1
+    prime_factorization:
+      weight: 1
+
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+    - name: length
+      scaling_factor: 0.2
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-7B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp  # kept for backward compatibility
+    ppo_mini_batch_size: 32
+    ppo_micro_batch_size: null  # will be deprecated; use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 8
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 49152  # n * (${data.max_prompt_length} + ${data.max_response_length})
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True  # True for GRPO
+    kl_loss_coef: 0.001  # for GRPO
+    kl_loss_type: low_var_kl  # for GRPO
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1  # sequence-parallel size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected at runtime
+      min_lr_ratio: null  # only used for warmup with cosine
+      warmup_style: constant  # select from constant/cosine
+      total_training_steps: 500  # overridden by the program at runtime
+    fsdp_config:
+      model_dtype: bfloat16
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      model_dtype: bfloat16
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null  # will be deprecated; use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}  # sequence-parallel size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1  # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length}  # not used by the vllm rollout
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16  # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null  # will be deprecated; use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True  # can give higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses sampled per prompt
+    n: 8  # > 1 for GRPO
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl  # how to estimate the KL divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+
+verbose: True
+
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 1500
+  project_name: inter-domain-generalisation
+  experiment_name: math_curriculum_qwen_7b
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: resume from the last checkpoint if one exists, else start from scratch
+  resume_mode: auto  # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+# Critic not used for GRPO; kept with verl defaults
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected at runtime
+    min_lr_ratio: null  # only used for warmup with cosine
+    warmup_style: constant  # select from constant/cosine
+    total_training_steps: -1  # overridden by the program at runtime
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null  # will be deprecated; use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1  # sequence-parallel size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/configs/external_generalisation/math_v1.yaml b/training/configs/external_generalisation/math_qwen_3b.yaml
similarity index 100%
rename from training/configs/external_generalisation/math_v1.yaml
rename to training/configs/external_generalisation/math_qwen_3b.yaml
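
A few sketches of the mechanics this config relies on follow. First, the `curriculum.schedule` block: every `update_steps` (30) training steps, each dataset's accuracy over its `last_k` (20) most recent samples is compared against `success_threshold` (0.70) and `failure_threshold` (0.10) to decide whether to raise or lower difficulty. A minimal sketch of that decision rule, assuming a simplified tracker interface; `CurriculumTracker` and its methods are illustrative names, not the repository's actual curriculum API:

```python
from collections import defaultdict, deque


class CurriculumTracker:
    """Illustrative threshold logic behind `curriculum.schedule`; the
    repository's real curriculum implementation may differ in detail."""

    def __init__(self, last_k=20, success_threshold=0.70, failure_threshold=0.10):
        self.last_k = last_k
        self.success_threshold = success_threshold
        self.failure_threshold = failure_threshold
        # Rolling window of per-sample accuracies, one window per dataset.
        self.scores = defaultdict(lambda: deque(maxlen=last_k))

    def record(self, dataset: str, accuracy: float) -> None:
        self.scores[dataset].append(accuracy)

    def decide(self, dataset: str) -> int:
        """+1: advance difficulty, -1: regress, 0: stay put."""
        window = self.scores[dataset]
        if len(window) < self.last_k:
            return 0  # not enough evidence yet
        mean_acc = sum(window) / len(window)
        if mean_acc >= self.success_threshold:
            return 1
        if mean_acc <= self.failure_threshold:
            return -1
        return 0


# Invoked every `update_steps` (30) training steps, e.g.:
#   for name in curricula:
#       direction = tracker.decide(name)
#       # ...raise or lower that dataset's difficulty accordingly
```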
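The `reward` block combines a primary accuracy signal with two shaping terms, each scaled by `scaling_factor: 0.2`. A hedged sketch of how such a composite might be assembled; the exact format and length shaping functions in the repo may differ from the simple ones assumed here:

```python
def composite_reward(answer_correct: bool, well_formatted: bool,
                     response_len: int, max_len: int = 2048) -> float:
    """Composite matching the `reward` section: accuracy as the primary
    signal (use_accuracy: True), plus format and length secondary rewards,
    each weighted by scaling_factor = 0.2. Shaping details are assumptions."""
    reward = 1.0 if answer_correct else 0.0            # primary accuracy reward
    reward += 0.2 * (1.0 if well_formatted else 0.0)   # format reward, scaled
    reward += 0.2 * (1.0 - response_len / max_len)     # shorter answers score higher
    return reward
```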
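With `algorithm.adv_estimator: grpo` and `rollout.n: 8`, advantages are group-relative: the 8 responses sampled for each prompt are scored, and each reward is normalised against its own group's mean and standard deviation. No learned value function is needed, which is why the critic and reward-model sections above are inert. A minimal sketch of that estimator:

```python
import torch


def grpo_advantages(rewards: torch.Tensor, n: int = 8, eps: float = 1e-6) -> torch.Tensor:
    """Group-relative advantage estimate. `rewards` holds one scalar per
    sampled response, with the n responses for each prompt contiguous."""
    groups = rewards.view(-1, n)                  # [num_prompts, n]
    mean = groups.mean(dim=1, keepdim=True)
    std = groups.std(dim=1, keepdim=True)
    return ((groups - mean) / (std + eps)).view(-1)
```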
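Finally, the `${...}` references (e.g. `critic.ppo_epochs` mirroring `actor_rollout_ref.actor.ppo_epochs`) are OmegaConf-style interpolations. Assuming the trainer loads the file with OmegaConf, as verl-style configs typically are, they resolve on access:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("training/configs/external_generalisation/math_curriculum_qwen_7b.yaml")
# Interpolations resolve lazily: this reads actor_rollout_ref.actor.ppo_epochs.
print(cfg.critic.ppo_epochs)  # -> 1
```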