diff --git a/reasoning_gym/games/sokoban.py b/reasoning_gym/games/sokoban.py
index 9c13ed1e..09a9a96d 100644
--- a/reasoning_gym/games/sokoban.py
+++ b/reasoning_gym/games/sokoban.py
@@ -96,12 +96,12 @@ Here is your puzzle:
 + gamestr,
             "answer": solution,
             "metadata": {
+                "source_dataset": DATASET_NAME,
+                "source_index": idx,
                 "gamestr": gamestr,
                 "width": puzzle_data["width"],
                 "height": puzzle_data["height"],
                 "difficulty": {
-                    "source_dataset": DATASET_NAME,
-                    "source_index": idx,
                     "width": (self.config.min_w, self.config.max_w),
                     "height": (self.config.min_h, self.config.max_h),
                 },
diff --git a/reasoning_gym/logic/syllogisms.py b/reasoning_gym/logic/syllogisms.py
index ce30e191..600f09bc 100644
--- a/reasoning_gym/logic/syllogisms.py
+++ b/reasoning_gym/logic/syllogisms.py
@@ -431,6 +431,8 @@ class SyllogismDataset(ProceduralDataset):
             "question": question,
             "answer": "Yes" if is_valid else "No",
             "metadata": {
+                "source_dataset": DATASET_NAME,
+                "source_index": idx,
                 "premise1": premise1_text,
                 "premise2": premise2_text,
                 "conclusion": conclusion_text,
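The two changes above promote source_dataset and source_index to the top level of each entry's metadata instead of nesting them inside the difficulty dict. A minimal sketch of the resulting entry shape for Sokoban (field values are illustrative, not taken from a real puzzle):

# Sketch of a sokoban entry after this change; all values illustrative.
entry = {
    "question": "...",         # puzzle prompt, including the grid
    "answer": "LDURRUDL",      # solution move sequence
    "metadata": {
        "source_dataset": "sokoban",  # DATASET_NAME
        "source_index": 42,           # idx passed to __getitem__
        "gamestr": "...",
        "width": 8,
        "height": 8,
        "difficulty": {
            # only the difficulty-defining ranges remain here
            "width": (6, 10),   # (min_w, max_w)
            "height": (6, 10),  # (min_h, max_h)
        },
    },
}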
diff --git a/training/configs/inter_generalisation/algebra_qwen_3b.yaml b/training/configs/inter_generalisation/algebra_qwen_3b.yaml
new file mode 100644
index 00000000..3b35a3c8
--- /dev/null
+++ b/training/configs/inter_generalisation/algebra_qwen_3b.yaml
@@ -0,0 +1,216 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    complex_arithmetic:
+      weight: 1
+    intermediate_integration:
+      weight: 1
+    polynomial_equations:
+      weight: 1
+    polynomial_multiplication:
+      weight: 1
+    simple_equations:
+      weight: 1
+    simple_integration:
+      weight: 1
+
+curriculum:
+  enabled: False
+  schedule:
+    automatic: True
+    update_steps: 30 # automatic curriculum update every 30 steps
+  last_k: 20
+  success_threshold: 0.70
+  failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+    - name: length
+      scaling_factor: 0.2
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 8
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 49152 # n * (${data.max_prompt_length} + ${data.max_response_length})
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 500 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for open-source models
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 500
+  project_name: inter-domain-generalisation
+  experiment_name: inter_reasoning_algebra_qwen_3b_composite
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume; if none is found, start from scratch
+  resume_mode: auto # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
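The reward block in this config composes a primary accuracy score with scaled secondary signals (format and length, each with scaling_factor 0.2). A rough sketch of how such a composition behaves, assuming each secondary reward is computed in [0, 1] before scaling; the helper below is illustrative, not the trainer's actual code:

def combined_reward(accuracy: float, format_ok: float, length_frac: float) -> float:
    """Illustrative composition: accuracy plus scaled secondary rewards.

    accuracy    - primary correctness score in [0, 1]
    format_ok   - format reward in [0, 1] (scaling_factor: 0.2)
    length_frac - length reward in [0, 1] (scaling_factor: 0.2)
    """
    return accuracy + 0.2 * format_ok + 0.2 * length_frac

# A fully correct, well-formatted answer scores above plain accuracy:
assert combined_reward(1.0, 1.0, 0.0) == 1.2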
diff --git a/training/configs/inter_generalisation/algorithmic_qwen_3b.yaml b/training/configs/inter_generalisation/algorithmic_qwen_3b.yaml
new file mode 100644
index 00000000..8c05fe0e
--- /dev/null
+++ b/training/configs/inter_generalisation/algorithmic_qwen_3b.yaml
@@ -0,0 +1,241 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    ab:
+      weight: 1
+    base_conversion:
+      weight: 1
+    binary_alternation:
+      weight: 1
+      config:
+        p_solvable: 0.9
+    binary_matrix:
+      weight: 1
+      config:
+        min_n: 2
+        max_n: 6
+    caesar_cipher:
+      weight: 1
+      config:
+        max_words: 10
+    cryptarithm:
+      weight: 1
+    isomorphic_strings:
+      weight: 1
+      config:
+        max_string_length: 8
+    jugs:
+      weight: 1
+      config:
+        difficulty: 6
+    rotate_matrix:
+      weight: 1
+      config:
+        min_n: 2
+        max_n: 6
+    string_manipulation:
+      weight: 1
+      config:
+        max_string_length: 15
+        max_num_rules: 6
+
+curriculum:
+  enabled: False
+  schedule:
+    automatic: True
+    update_steps: 30 # automatic curriculum update every 30 steps
+  last_k: 20
+  success_threshold: 0.70
+  failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+    - name: length
+      scaling_factor: 0.2
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 8
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 49152 # n * (${data.max_prompt_length} + ${data.max_response_length})
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 500 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for open-source models
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 500
+  project_name: inter-domain-generalisation
+  experiment_name: inter_reasoning_algorithmic_qwen_3b_composite
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume; if none is found, start from scratch
+  resume_mode: auto # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
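Unlike the algebra config, several datasets here carry a per-dataset config block. As the train_grpo.py change later in this diff shows, each entry becomes a DatasetSpec whose config defaults to {} when the block is absent. A sketch of the equivalent construction, with weights and options copied from the YAML above (the DatasetSpec import path is an assumption; adjust to wherever reasoning_gym exposes it):

import reasoning_gym
from reasoning_gym.composite import DatasetSpec  # assumed import path

specs = [
    DatasetSpec(name="binary_matrix", weight=1, config={"min_n": 2, "max_n": 6}),
    DatasetSpec(name="cryptarithm", weight=1, config={}),  # no config block in the YAML
]
train_source = reasoning_gym.create_dataset("composite", seed=1, size=20000, datasets=specs)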
diff --git a/training/configs/inter_generalisation/games_qwen_3b.yaml b/training/configs/inter_generalisation/games_qwen_3b.yaml
new file mode 100644
index 00000000..545a2bd3
--- /dev/null
+++ b/training/configs/inter_generalisation/games_qwen_3b.yaml
@@ -0,0 +1,222 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    futoshiki:
+      weight: 1
+    knight_swap:
+      weight: 1
+    mahjong_puzzle:
+      weight: 1
+    maze:
+      weight: 1
+    mini_sudoku:
+      weight: 1
+    n_queens:
+      weight: 1
+    rush_hour:
+      weight: 1
+    sokoban:
+      weight: 1
+    sudoku:
+      weight: 1
+    tsumego:
+      weight: 1
+
+curriculum:
+  enabled: False
+  schedule:
+    automatic: True
+    update_steps: 30 # automatic curriculum update every 30 steps
+  last_k: 20
+  success_threshold: 0.70
+  failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 8
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 49152 # n * (${data.max_prompt_length} + ${data.max_response_length})
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 500 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for open-source models
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 500
+  project_name: inter-domain-generalisation
+  experiment_name: inter_reasoning_games_qwen_3b_composite
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume; if none is found, start from scratch
+  resume_mode: auto # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
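All four configs use adv_estimator: grpo with n: 8 rollouts per prompt. As a reminder of what that estimator does, here is a minimal sketch of group-normalised advantages; this illustrates the general GRPO idea, not verl's exact implementation:

import statistics

def grpo_advantages(rewards: list[float], eps: float = 1e-6) -> list[float]:
    """Normalise each rollout's reward against its prompt group (n = 8 here)."""
    mean = statistics.mean(rewards)
    std = statistics.stdev(rewards)
    return [(r - mean) / (std + eps) for r in rewards]

# 8 sampled responses for one prompt: better-than-average ones get positive advantage.
print(grpo_advantages([1.2, 0.0, 0.2, 1.0, 0.0, 0.2, 1.2, 0.0]))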
diff --git a/training/configs/inter_generalisation/logic_qwen_3b.yaml b/training/configs/inter_generalisation/logic_qwen_3b.yaml
new file mode 100644
index 00000000..dbd64427
--- /dev/null
+++ b/training/configs/inter_generalisation/logic_qwen_3b.yaml
@@ -0,0 +1,218 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    aiw:
+      weight: 1
+    circuit_logic:
+      weight: 1
+    knights_knaves:
+      weight: 1
+    propositional_logic:
+      weight: 1
+    self_reference:
+      weight: 1
+    syllogism:
+      weight: 1
+    zebra_puzzles:
+      weight: 1
+
+curriculum:
+  enabled: False
+  schedule:
+    automatic: True
+    update_steps: 30 # automatic curriculum update every 30 steps
+  last_k: 20
+  success_threshold: 0.70
+  failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+    - name: length
+      scaling_factor: 0.2
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 4096
+  max_response_length: 2048
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 8
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 49152 # n * (${data.max_prompt_length} + ${data.max_response_length})
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+      min_lr_ratio: null # only useful for warmup with cosine
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 500 # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length} # not used for open-source models
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8 # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+verbose: True
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 500
+  project_name: inter-domain-generalisation
+  experiment_name: inter_reasoning_logic_qwen_3b_composite
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume; if none is found, start from scratch
+  resume_mode: auto # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
+    min_lr_ratio: null # only useful for warmup with cosine
+    warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/rewards/reward.py b/training/rewards/reward.py
index 4e741483..9988e7dd 100644
--- a/training/rewards/reward.py
+++ b/training/rewards/reward.py
@@ -1,6 +1,5 @@
 import math
 import re
-from typing import Any, Callable, Dict
 
 
 class RewardRegistry:
@@ -84,16 +83,10 @@ def compute_format_reward(solution_str: str, scaling_factor: float = 0.2, **kwar
 
 def length_reward(solution_str, scaling_factor, **kwargs):
     """Reward length appropriately based on correctness."""
     correctness_score = kwargs.get("correctness_score", 0.0)
-    epsilon = 1e-6
     max_score = kwargs.get("max_score", 1.0)
     max_output_length = kwargs.get("max_output_length", 1024)
-    generation_len = len(solution_str)
-    progress = min(generation_len / max_output_length, 1.0)
-
-    if correctness_score < max_score - epsilon:
-        length_reward = (max_score - correctness_score) * progress
-    else:
-        length_reward = -progress
-
+    progress = min(len(solution_str) / max_output_length, 1.0)
+    # for imperfect answers, incentivise longer ones
+    length_reward = (max_score - correctness_score) * progress
     return length_reward * scaling_factor
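The rewrite above removes the penalty branch for already-perfect answers: length_reward is now always (max_score - correctness_score) * progress, so an imperfect answer gains up to its remaining headroom as it uses more of the output budget, while a perfect answer (correctness_score == max_score) gets exactly zero rather than a negative reward. A quick worked check with the defaults (max_score = 1.0, max_output_length = 1024) and scaling_factor = 0.2:

# 512-character answer with correctness 0.5:
# progress      = min(512 / 1024, 1.0) = 0.5
# length_reward = (1.0 - 0.5) * 0.5 = 0.25 -> scaled: 0.25 * 0.2 = 0.05
assert abs(length_reward("x" * 512, 0.2, correctness_score=0.5) - 0.05) < 1e-9

# Perfect answer: (1.0 - 1.0) * progress = 0.0, regardless of length.
assert length_reward("x" * 512, 0.2, correctness_score=1.0) == 0.0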
diff --git a/training/train_grpo.py b/training/train_grpo.py
index 48a6a4d0..0ccf5b6f 100644
--- a/training/train_grpo.py
+++ b/training/train_grpo.py
@@ -36,13 +36,21 @@ def prepare_datasets(config, tokenizer) -> tuple[ReasoningGymDataset, ReasoningG
         val_data_source = CompositeDataset(config=replace(train_data_source.composite.config, seed=2))
     else:
         dataset_specs = [
-            DatasetSpec(name=name, weight=ds.weight, config=OmegaConf.to_container(ds.config, resolve=True))
+            DatasetSpec(
+                name=name,
+                weight=ds.weight,
+                config=OmegaConf.to_container(ds.config, resolve=True) if "config" in ds else {},
+            )
             for name, ds in config.reasoning_gym.datasets.items()
         ]
         train_data_source = reasoning_gym.create_dataset("composite", seed=1, size=dataset_size, datasets=dataset_specs)
         val_data_source = reasoning_gym.create_dataset("composite", seed=2, size=dataset_size, datasets=dataset_specs)
 
-    train_dataset = make_dataset(tokenizer, train_data_source, developer_prompt)
-    val_dataset = make_dataset(tokenizer, val_data_source, developer_prompt)
+    train_dataset = make_dataset(
+        tokenizer, train_data_source, developer_prompt, max_prompt_length=config.data.max_prompt_length
+    )
+    val_dataset = make_dataset(
+        tokenizer, val_data_source, developer_prompt, max_prompt_length=config.data.max_prompt_length
+    )
 
     return train_dataset, val_dataset
diff --git a/training/trainers/ray_grpo_trainer.py b/training/trainers/ray_grpo_trainer.py
index 1165c96e..7c414ccb 100644
--- a/training/trainers/ray_grpo_trainer.py
+++ b/training/trainers/ray_grpo_trainer.py
@@ -1,6 +1,7 @@
 # Adapted version of Bytedance code:
 # https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/verl/trainer/main_ppo.py
 
+import gc
 import uuid
 from copy import deepcopy
 
@@ -385,3 +386,4 @@ class RayGRPOTrainer(RayPPOTrainer):
                 return
 
             self.global_steps += 1
+            gc.collect()
diff --git a/training/utils/datasets.py b/training/utils/datasets.py
index 7575aac7..3c9cf0ae 100644
--- a/training/utils/datasets.py
+++ b/training/utils/datasets.py
@@ -100,17 +100,26 @@ def make_dataset(
     tokenizer,
     data_source: Experiment | ProceduralDataset,
     developer_prompt: str,
+    max_prompt_length: int = 2048,
 ) -> ReasoningGymDataset:
     """
     Create ReasoningGymDataset object using either a ProceduralDataset or Experiment as the underlying data source.
     """
-    kwargs = {
-        "tokenizer": tokenizer,
-        # "dataset_name": dataset_name,
-        "developer_prompt": developer_prompt,
-    }
     if isinstance(data_source, Experiment):
-        kwargs["experiment"] = data_source
+        return ReasoningGymDataset(
+            tokenizer=tokenizer,
+            experiment=data_source,
+            developer_prompt=developer_prompt,
+            developer_role="system",
+            max_prompt_length=max_prompt_length,
+            truncation="error",
+        )
     else:
-        kwargs["procedural_dataset"] = data_source
-        return ReasoningGymDataset(**kwargs)
+        return ReasoningGymDataset(
+            tokenizer=tokenizer,
+            procedural_dataset=data_source,
+            developer_prompt=developer_prompt,
+            developer_role="system",
+            max_prompt_length=max_prompt_length,
+            truncation="error",
+        )
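Taken together, the train_grpo.py and datasets.py changes thread config.data.max_prompt_length through to ReasoningGymDataset, which is now constructed with truncation="error", presumably so over-long prompts fail loudly instead of being silently cut. A sketch of the resulting call; the module import path, tokenizer choice, and developer prompt text are illustrative assumptions:

import reasoning_gym
from transformers import AutoTokenizer

from training.utils.datasets import make_dataset  # path per this diff; adjust to your layout

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
developer_prompt = "A conversation between User and Assistant..."  # placeholder
source = reasoning_gym.create_dataset("sokoban", seed=1, size=16)

# max_prompt_length now comes from the YAML (data.max_prompt_length: 4096)
# rather than the previous hard-coded default of 2048.
train_dataset = make_dataset(tokenizer, source, developer_prompt, max_prompt_length=4096)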