diff --git a/training/README.md b/training/README.md
index 62225b6e..135e23d1 100644
--- a/training/README.md
+++ b/training/README.md
@@ -30,8 +30,7 @@ pip install -e .
 5. Install vLLM:
 ```bash
-pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
-
+pip install vllm==0.6.3
 ```
 6. Install flash attention
 ```
 
diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_3b_grpo.yaml
index a279e85d..00d6837d 100644
--- a/training/configs/qwen2.5_3b_grpo.yaml
+++ b/training/configs/qwen2.5_3b_grpo.yaml
@@ -1,20 +1,27 @@
 reasoning_gym:
   dataset_size: 20000
   developer_prompt: DeepSeekZero
-  datasets:
-    spell_backward:
-      weight: 1
-      config:
-        min_word_len: 3
-        max_word_len: 10
+  datasets: # Used if enable_curriculum_learning is False
+    mini_sudoku:
+      weight: 0.33
+      config:
+        min_empty: 6
+    futoshiki:
+      weight: 0.33
+      config:
+        max_board_size: 5
+    sudoku:
+      weight: 0.34
+      config:
+        min_empty: 20
   curriculum:
     enabled: False
     schedule:
       automatic: True
       update_steps: 30 # automatic curriculum updating after 50 steps
     last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
+    success_threshold: 0.70
+    failure_threshold: 0.10
   curricula:
     spell_backward:
       attribute_levels:
@@ -31,12 +38,11 @@ data:
   val_files: test.parquet
   prompt_key: prompt
   max_prompt_length: 512
-  max_response_length: 1024
-  train_batch_size: 32
-  val_batch_size: 32
+  max_response_length: 2048
+  train_batch_size: 16
+  val_batch_size: 16
   return_raw_chat: True
   return_raw_input_ids: True
-
 actor_rollout_ref:
   hybrid_engine: True
   model:
@@ -47,11 +53,11 @@ actor_rollout_ref:
     use_remove_padding: True
   actor:
     strategy: fsdp # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 8
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 20480 # n * (${data.max_prompt_length} + ${data.max_response_length})
     grad_clip: 1.0
     clip_ratio: 0.2
     entropy_coeff: 0.001
@@ -63,9 +69,9 @@ actor_rollout_ref:
     ulysses_sequence_parallel_size: 1 # sp size
     optim:
       lr: 1e-6
-      lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
       min_lr_ratio: null # only useful for warmup with cosine
-      warmup_style: constant # select from constant/cosine
+      warmup_style: constant # select from constant/cosine
       total_training_steps: -1 # must be override by program
     fsdp_config:
       wrap_policy:
@@ -99,7 +105,7 @@ actor_rollout_ref:
       enforce_eager: True
       free_cache_engine: True
       load_format: dummy_dtensor
-      tensor_model_parallel_size: 4
+      tensor_model_parallel_size: 2
       max_num_batched_tokens: 16384
       max_num_seqs: 1024
       log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
@@ -128,14 +134,14 @@ algorithm:
   verbose: True
 trainer:
   balance_batch: True
-  total_epochs: 5
+  total_epochs: 10
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_composite
+  experiment_name: verl_grpo_qwen_3b
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
-  n_gpus_per_node: 4
+  n_gpus_per_node: 2
   save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
@@ -147,13 +153,14 @@ trainer:
   del_local_ckpt_after_load: False
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
 critic:
   strategy: fsdp
   optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
     min_lr_ratio: null # only useful for warmup with cosine
-    warmup_style: cosine # select from constant/cosine
+    warmup_style: constant # select from constant/cosine
     total_training_steps: -1 # must be override by program
   model:
     path: ~/models/deepseek-llm-7b-chat
 
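After installing the pinned wheel from the README change, a quick sanity check that the interpreter actually picks up vLLM 0.6.3 (a minimal check using vLLM's standard `__version__` attribute; not part of the patch):

```python
# Minimal post-install check: the import must succeed and report the pinned build.
import vllm

assert vllm.__version__ == "0.6.3", f"unexpected vLLM build: {vllm.__version__}"
print("vLLM", vllm.__version__, "OK")
```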
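The new `datasets` block replaces the single `spell_backward` task with a 0.33/0.33/0.34 mixture of `mini_sudoku`, `futoshiki`, and `sudoku`. Below is a minimal sketch of an equivalent weighted mixture, assuming reasoning_gym's public `create_dataset` entry point; the sampling loop is illustrative only, the seed is arbitrary, and the config keys are copied verbatim from the YAML (whether `futoshiki` accepts `max_board_size` depends on the installed reasoning_gym version):

```python
import random

import reasoning_gym

# (name, weight, config) triples mirroring the YAML `datasets` block.
SPECS = [
    ("mini_sudoku", 0.33, {"min_empty": 6}),
    ("futoshiki",   0.33, {"max_board_size": 5}),
    ("sudoku",      0.34, {"min_empty": 20}),
]

# Build one procedural dataset per task, sized like `dataset_size: 20000`.
datasets = {
    name: reasoning_gym.create_dataset(name, size=20000, seed=42, **cfg)
    for name, _, cfg in SPECS
}

# Draw tasks in proportion to the configured weights.
rng = random.Random(0)
names = [s[0] for s in SPECS]
weights = [s[1] for s in SPECS]
for _ in range(5):
    name = rng.choices(names, weights=weights, k=1)[0]
    item = datasets[name][rng.randrange(20000)]
    print(f"{name}: {item['question'][:60]!r}")
```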
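For reference, the enlarged token budget is arithmetically consistent with the micro-batch setting: taking n = `ppo_micro_batch_size_per_gpu` = 8, n * (`max_prompt_length` + `max_response_length`) = 8 * (512 + 2048) = 20480, just as the old value was 8 * (512 + 1024) = 12288.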