diff --git a/training/configs/external_generalisation/math_qwen_3b.yaml b/training/configs/external_generalisation/math_qwen_3b.yaml
deleted file mode 100644
index 7916584c..00000000
--- a/training/configs/external_generalisation/math_qwen_3b.yaml
+++ /dev/null
@@ -1,242 +0,0 @@
-reasoning_gym:
-  dataset_size: 50000
-  developer_prompt: DeepSeekZero
-  datasets:
-    complex_arithmetic:
-      weight: 1
-    intermediate_integration:
-      weight: 1
-    polynomial_equations:
-      weight: 1
-    polynomial_multiplication:
-      weight: 1
-    simple_equations:
-      weight: 1
-    simple_integration:
-      weight: 1
-    propositional_logic:
-      weight: 1
-    advanced_geometry:
-      weight: 1
-    simple_geometry:
-      weight: 1
-    basic_arithmetic:
-      weight: 1
-    bitwise_arithmetic:
-      weight: 1
-    chain_sum:
-      weight: 1
-    decimal_arithmetic:
-      weight: 1
-    decimal_chain_sum:
-      weight: 1
-    fraction_simplification:
-      weight: 1
-    gcd:
-      weight: 1
-    lcm:
-      weight: 1
-    prime_factorization:
-      weight: 1
-
-curriculum:
-  enabled: False
-  schedule:
-    automatic: True
-    update_steps: 30 # automatic curriculum update every 30 steps
-  last_k: 20
-  success_threshold: 0.70
-  failure_threshold: 0.10
-  curricula:
-    spell_backward:
-      attribute_levels:
-        word_len: 0
-reward:
-  use_accuracy: True
-  conditional_reward: False
-  secondary_rewards:
-    - name: format
-      scaling_factor: 0.2
-      kwargs:
-        preappend_thinking_token: False
-    - name: length
-      scaling_factor: 0.2
-
-data:
-  tokenizer: null
-  train_files: train.parquet
-  val_files: test.parquet
-  prompt_key: prompt
-  max_prompt_length: 4096
-  max_response_length: 2048
-  train_batch_size: 32
-  val_batch_size: 64
-  return_raw_chat: True
-  return_raw_input_ids: True
-
-actor_rollout_ref:
-  hybrid_engine: True
-  model:
-    path: Qwen/Qwen2.5-3B-Instruct
-    external_lib: null
-    override_config: { }
-    enable_gradient_checkpointing: True
-    use_remove_padding: True
-  actor:
-    strategy: fsdp # This is for backward-compatibility
-    ppo_mini_batch_size: 32
-    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 8
-    use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 49152 # n * (${data.max_prompt_length} + ${data.max_response_length})
-    grad_clip: 1.0
-    clip_ratio: 0.2
-    entropy_coeff: 0.001
-    use_kl_loss: True # True for GRPO
-    kl_loss_coef: 0.001 # for grpo
-    kl_loss_type: low_var_kl # for grpo
-    ppo_epochs: 1
-    shuffle: False
-    ulysses_sequence_parallel_size: 1 # sp size
-    optim:
-      lr: 1e-6
-      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
-      min_lr_ratio: null # only useful for warmup with cosine
-      warmup_style: constant # select from constant/cosine
-      total_training_steps: 500 # must be overridden by the program
-    fsdp_config:
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      param_offload: False
-      optimizer_offload: False
-      fsdp_size: -1
-  ref:
-    fsdp_config:
-      param_offload: True
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 160
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    temperature: 1.0
-    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
-    top_p: 1
-    prompt_length: ${data.max_prompt_length} # not used for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.7
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 12288
-    max_num_seqs: 1024
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 160
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    use_fire_sampling: False
-    max_model_len: 12288
-    # number of responses (i.e. num sample times)
-    n: 8 # > 1 for grpo
-    val_kwargs:
-      do_sample: True
-
-algorithm:
-  gamma: 1.0
-  lam: 1.0
-  adv_estimator: grpo
-  kl_penalty: kl # how to estimate kl divergence
-  kl_ctrl:
-    type: fixed
-    kl_coef: 0.001
-verbose: True
-trainer:
-  balance_batch: True
-  total_epochs: 1
-  total_training_steps: 1500
-  project_name: external-generalisation
-  experiment_name: math_qwen_3b
-  logger: [ 'console', 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 100
-  # auto: find the last ckpt to resume. If can't find, start from scratch
-  resume_mode: auto # or disable, or resume_path if resume_from_path is set
-  resume_from_path: False
-  test_freq: 100
-  critic_warmup: 0
-  default_hdfs_dir: null
-  remove_previous_ckpt_in_save: False
-  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
-
-
-critic:
-  strategy: fsdp
-  optim:
-    lr: 1e-5
-    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
-    min_lr_ratio: null # only useful for warmup with cosine
-    warmup_style: constant # select from constant/cosine
-    total_training_steps: -1 # must be overridden by the program
-  model:
-    path: ~/models/deepseek-llm-7b-chat
-    tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: { }
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: True
-    use_remove_padding: False
-    fsdp_config:
-      param_offload: False
-      optimizer_offload: False
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      fsdp_size: -1
-  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: null
-  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
-  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
-  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
-  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
-  ulysses_sequence_parallel_size: 1 # sp size
-  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
-  shuffle: ${actor_rollout_ref.actor.shuffle}
-  grad_clip: 1.0
-  cliprange_value: 0.5
-
-# Reward model not used for GRPO
-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path}
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  micro_batch_size: null
-  micro_batch_size_per_gpu: null
-  max_length: null
-  ulysses_sequence_parallel_size: 1
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}