diff --git a/reasoning_gym/algorithmic/number_sorting.py b/reasoning_gym/algorithmic/number_sorting.py index 67e3ba07..7406d96f 100644 --- a/reasoning_gym/algorithmic/number_sorting.py +++ b/reasoning_gym/algorithmic/number_sorting.py @@ -170,7 +170,7 @@ class NumberSortingCurriculum(BaseCurriculum): self._define_attributes( RangeAttributeDefinition( name="numbers", - levels=[10, 50, 100, 200], + levels=[10, 100, 500, 1000], description="How many numbers to sort", lower_field_name="min_numbers", upper_field_name="max_numbers", diff --git a/reasoning_gym/arithmetic/basic_arithmetic.py b/reasoning_gym/arithmetic/basic_arithmetic.py index c12cbb76..b7b1f950 100644 --- a/reasoning_gym/arithmetic/basic_arithmetic.py +++ b/reasoning_gym/arithmetic/basic_arithmetic.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from random import Random from typing import Any, Literal, Optional -from ..coaching import BaseCurriculum, RangeAttributeDefinition +from ..coaching import BaseCurriculum, RangeAttributeDefinition, ScalarAttributeDefinition from ..factory import ProceduralDataset, register_dataset DATASET_NAME = "basic_arithmetic" @@ -250,17 +250,19 @@ class BasicArithmeticCurriculum(BaseCurriculum): self._define_attributes( RangeAttributeDefinition( name="num_terms", - levels=[2, 5, 10, 15], + levels=[2, 3, 4, 5, 6], description="Number of terms in the expression", lower_field_name="min_terms", upper_field_name="max_terms", + ensure_interval=False, ), RangeAttributeDefinition( name="num_digits", - levels=[1, 2, 5, 10], + levels=[1, 2, 3, 4], description="Number of digits in the numbers", lower_field_name="min_digits", upper_field_name="max_digits", + ensure_interval=False, ), ) diff --git a/reasoning_gym/cognition/rectangle_count.py b/reasoning_gym/cognition/rectangle_count.py index 7539e5e0..8c11af17 100644 --- a/reasoning_gym/cognition/rectangle_count.py +++ b/reasoning_gym/cognition/rectangle_count.py @@ -16,6 +16,8 @@ Now, it's your turn. How many rectangles do you see in the grid below? """ DATASET_NAME = "rectangle_count" +CONST_TERM = 0.8 +D = 5 def draw_rectangles_with_overlap(n, width, height, rng): @@ -132,22 +134,29 @@ class RectangleCountDataset(ProceduralDataset): } def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: - """Determine if the solution provided solves the RectangleCount task. - - The function awards 1.0 for a correct answer. - - Args: - answer (Optional[str]): The user's answer. - entry (dict[str, Any]): The original dataset entry containing the correct answer. + """Determine if the solution provided solves the RectangleCount task, + awarding partial credit if the guess is close. Returns: - float: The computed score between 0.0 and 1.0. + float: A score between 0.0 and 1.0. 
""" + correct_str = entry["answer"].lower().replace("\n", "") - if isinstance(answer, str): - if answer.lower().replace("\n", "") == entry["answer"].lower().replace("\n", ""): - return 1.0 # Yay - return 0.0 + try: + correct_val = int(correct_str) + user_val = int(answer.strip()) + except (ValueError, TypeError, AttributeError): + return 0.0 + distance = abs(user_val - correct_val) + + if distance == 0: + return 1.0 + if distance >= D: + return 0.0 + + score = 1.0 - (distance / float(D)) + score = CONST_TERM * score + return max(0.0, score) class RectangleCountCurriculum(BaseCurriculum): diff --git a/reasoning_gym/cognition/rubiks_cube.py b/reasoning_gym/cognition/rubiks_cube.py index 44819be1..7624fdac 100644 --- a/reasoning_gym/cognition/rubiks_cube.py +++ b/reasoning_gym/cognition/rubiks_cube.py @@ -121,29 +121,49 @@ class RubiksCubeDataset(ProceduralDataset): }, } + def partial_score(self, cube: Cube) -> float: + """ + Returns a fraction between 0 and 1, indicating how many stickers are + correctly positioned (i.e., match the solved color for that face). + """ + total_stickers = 6 * (cube.size**2) + correct_stickers = 0 + + for face_index in range(6): + face = cube.faces[face_index] + + solved_color = face[cube.size // 2][cube.size // 2].color + for row in range(cube.size): + for col in range(cube.size): + sticker = face[row][col] + if sticker.color == solved_color: + correct_stickers += 1 + + return correct_stickers / total_stickers + def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: - """Determine if the solution provided solves the cube""" - reward = 0.0 # default reward + """Determine if the solution provided solves the cube, with partial rewards.""" + reward = 0.0 # default if answer is not None: - # Reconstruct the test cube eval_cube = Cube(entry["metadata"]["cube_size"]) eval_cube.rotate(entry["metadata"]["scramble_moves"]) - - # Test the solution try: expanded_answer = self.expand_moves(answer) eval_cube.rotate(expanded_answer) - solved = eval_cube.is_done() + # 3) Check if fully solved + solved = eval_cube.is_done() if solved: reward = 1.0 - elif len(answer.strip()) > 0: # encourage non-empty answers - reward = 0.05 # Incorrect, but rotate could parse the answer else: - reward = 0.01 - except: - reward = 0.01 # At least you tried + partial = self.partial_score(eval_cube) + if len(answer.strip()) > 0: + reward = max(0.05, partial) + else: + reward = max(0.01, partial) + except: + reward = 0.01 return reward def remove_ansi(self, line): diff --git a/reasoning_gym/games/sokoban.py b/reasoning_gym/games/sokoban.py index 09a9a96d..8a568694 100644 --- a/reasoning_gym/games/sokoban.py +++ b/reasoning_gym/games/sokoban.py @@ -99,6 +99,7 @@ Here is your puzzle: "source_dataset": DATASET_NAME, "source_index": idx, "gamestr": gamestr, + "source_dataset": DATASET_NAME, "width": puzzle_data["width"], "height": puzzle_data["height"], "difficulty": { diff --git a/tests/test_basic_arithmetic.py b/tests/test_basic_arithmetic.py index be2f6224..0101109f 100644 --- a/tests/test_basic_arithmetic.py +++ b/tests/test_basic_arithmetic.py @@ -6,6 +6,7 @@ from reasoning_gym.arithmetic.basic_arithmetic import ( BasicArithmeticDatasetConfig, eval_floordiv, ) +from reasoning_gym.coaching.base_curriculum import DefaultCurriculumContext, RangeAttributeMode def test_arithmetic_dataset_config_validation(): @@ -103,7 +104,7 @@ def test_basic_arithmetic_curriculum(): """Test the BasicArithmeticCurriculum functionality""" curriculum = BasicArithmeticCurriculum() - base_value = 
{"size": 150, "seed": 1} + base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1} base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value) assert base_cfg.seed == 1 @@ -115,7 +116,7 @@ def test_basic_arithmetic_curriculum(): curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_digits") increased_cfg = curriculum.generate_configuration(base_value) - assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 5 + assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3 assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 # Test decrementing attribute level for num_terms @@ -128,7 +129,7 @@ def test_basic_arithmetic_curriculum(): curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms") higher_level_cfg = curriculum.generate_configuration(base_value) - assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 10 + assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 4 assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2 # Test boundary conditions - trying to decrement below level 0 @@ -144,5 +145,26 @@ def test_basic_arithmetic_curriculum(): curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_digits") upper_bound_cfg = curriculum.generate_configuration(base_value) - assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 15 - assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 10 + assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 6 + assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 4 + + +def test_basic_arithmetic_curriculum_upper_bound(): + curriculum = BasicArithmeticCurriculum() + + base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1} + + base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration( + base_value, context=DefaultCurriculumContext(mode=RangeAttributeMode.UPPER_BOUND) + ) + assert base_cfg.seed == 1 + assert base_cfg.size == 150 + assert base_cfg.min_terms == 2 and base_cfg.max_terms == 2 + assert base_cfg.min_digits == 1 and base_cfg.max_digits == 1 + + # Test incrementing attribute levels + curriculum.increment_attr_level("num_terms") + curriculum.increment_attr_level("num_digits") + increased_cfg = curriculum.generate_configuration(base_value) + assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3 + assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 diff --git a/tests/test_number_sorting.py b/tests/test_number_sorting.py index 729c421d..6eae6265 100644 --- a/tests/test_number_sorting.py +++ b/tests/test_number_sorting.py @@ -56,6 +56,7 @@ def test_number_sorting_dataset_items(): # Verify number count constraints numbers = item["metadata"]["original_numbers"] + print(numbers) assert len(numbers) >= config.min_numbers assert len(numbers) <= config.max_numbers @@ -99,7 +100,7 @@ def test_number_sorting_curriculum(): base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value) assert base_cfg.seed == 1 assert base_cfg.size == 150 - assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 50 + assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 100 assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1 assert base_cfg.min_value == -100 and base_cfg.max_value == 100 @@ -107,14 +108,14 @@ def 
     curriculum.increment_attr_level("numbers")
     curriculum.increment_attr_level("decimals")
     increased_cfg = curriculum.generate_configuration(base_value)
-    assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 100
+    assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 500
     assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2
     assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100

     # test decrementing attribute level for numbers again
     curriculum.decrement_attr_level("numbers")
     partially_decreased_cfg = curriculum.generate_configuration(base_value)
-    assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 50
+    assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 100
     assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2
     assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100
diff --git a/tests/test_rubiks_cube.py b/tests/test_rubiks_cube.py
index 4b8f949b..f9a77efa 100644
--- a/tests/test_rubiks_cube.py
+++ b/tests/test_rubiks_cube.py
@@ -55,9 +55,9 @@ def test_rubikscube_items():
     assert dataset.score_answer(answer=None, entry=item) == 0.0

     if item["metadata"]["example_correct_answer"] != "R":
-        assert dataset.score_answer(answer="R", entry=item) == 0.05
+        assert dataset.score_answer(answer="R", entry=item) >= 0.05

-    assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.05
+    assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) >= 0.01

     if len(item["metadata"]["example_correct_answer"]) > 0:
         assert dataset.score_answer(answer="", entry=item) == 0.01
diff --git a/training/README.md b/training/README.md
index 6c740042..552b8fba 100644
--- a/training/README.md
+++ b/training/README.md
@@ -87,6 +87,7 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_

From here you may want to run evaluations of your trained model. In the `training/evaluations` directory there is a script `evaluate_model.py` which you can run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a YAML file. This evaluation can point to either a local or a remote model. For example, the configuration file `training/evaluations/eval_algorithmic_composite.yaml` specifies the path to a local model stored as a Hugging Face checkpoint at `training/utils/qwen3b_500` (note that you have to convert the FSDP checkpoint to a Hugging Face checkpoint for the evaluation script to work, as shown in the previous step).
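For reference, a minimal evaluation YAML might look like the following sketch (the field names mirror `training/evaluations/eval_algorithmic_composite.yaml` elsewhere in this change; the model path and dataset parameters are illustrative placeholders):

```yaml
# Model configuration: local HF checkpoint or remote model
model_path: ../utils/qwen3b_500
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system

# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3

# Categories and datasets to evaluate
categories:
  - category: reasoning
    datasets:
      - dataset: basic_arithmetic
        size: 100
        seed: 42
        params:
          min_terms: 2
          max_terms: 4
          min_digits: 1
          max_digits: 3
```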
## Run the script
+Set the vLLM attention backend before running: `export VLLM_ATTENTION_BACKEND=XFORMERS`.
Navigate to the evaluations directory:
```
python evaluate_model.py --config path-to-yaml
diff --git a/training/configs/intra_generalisation/algebra_qwen_3b.yaml b/training/configs/intra_generalisation/algebra_qwen_3b.yaml
new file mode 100644
index 00000000..1193ffb0
--- /dev/null
+++ b/training/configs/intra_generalisation/algebra_qwen_3b.yaml
@@ -0,0 +1,221 @@
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+    simple_equations:
+      weight: 0.5
+      config:
+        min_terms: 2
+        max_terms: 4
+        min_value: 1
+        max_value: 100
+    polynomial_multiplication:
+      weight: 0.5
+      config:
+        min_terms: 2
+        max_terms: 4
+        min_value: 1
+        max_value: 100
+        min_degree: 0
+        max_degree: 3
+        min_polynomials: 2
+        max_polynomials: 3
+curriculum:
+  enabled: False
+  schedule:
+    automatic: True
+    update_steps: 30 # automatic curriculum updating after 30 steps
+    last_k: 20
+    success_threshold: 0.70
+    failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 512
+  max_response_length: 1024
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 4
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0.
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_algebra_qwen_3b_composite + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/algorithmic_qwen_3b.yaml b/training/configs/intra_generalisation/algorithmic_qwen_3b.yaml similarity index 100% rename from training/configs/algorithmic_qwen_3b.yaml rename to training/configs/intra_generalisation/algorithmic_qwen_3b.yaml diff --git a/training/configs/intra_generalisation/arithmetic_qwen_3b.yaml b/training/configs/intra_generalisation/arithmetic_qwen_3b.yaml new file mode 100644 index 00000000..82199ae3 --- /dev/null +++ b/training/configs/intra_generalisation/arithmetic_qwen_3b.yaml @@ -0,0 +1,224 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + fraction_simplification: + weight: 0.33 + config: + min_value: 1 + max_value: 1000 + min_factor: 1 + max_factor: 100 + gcd: + weight: 0.34 + config: + min_numbers: 2 # Minimum numbers to find GCD of + max_numbers: 2 # Maximum numbers to find GCD of + min_value: 1 # Minimum value for each number + max_value: 1000 # Maximum value for each number + lcm: + weight: 0.33 + config: + min_numbers: 2 + max_numbers: 2 + min_value: 1 + max_value: 100 +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + 
train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_arithmetic_qwen_3b_composite + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. 
If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/intra_generalisation/cognition_qwen_3b.yaml b/training/configs/intra_generalisation/cognition_qwen_3b.yaml new file mode 100644 index 00000000..4b605923 --- /dev/null +++ b/training/configs/intra_generalisation/cognition_qwen_3b.yaml @@ -0,0 +1,219 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + rubiks_cube: + weight: 0.33 + config: + min_scramble_steps: 3 + max_scramble_steps: 10 + figlet_font: + weight: 0.34 + config: + min_word_len: 3 + max_word_len: 7 + rectangle_count: + weight: 0.33 + config: + max_rectangles: 10 + width: 80 + height: 80 +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + 
train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_cognition_qwen_3b_composite_test + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. 
If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/intra_generalisation/games_qwen_3b.yaml b/training/configs/intra_generalisation/games_qwen_3b.yaml new file mode 100644 index 00000000..91e0725a --- /dev/null +++ b/training/configs/intra_generalisation/games_qwen_3b.yaml @@ -0,0 +1,225 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + sudoku: + weight: 0.33 + config: + min_empty: 30 + max_empty: 50 + futoshiki: + weight: 0.34 + config: + min_board_size: 4 # Board will be NxN where N is this value + max_board_size: 9 + min_difficulty: 0 + max_difficulty: 3 + sokoban: + weight: 0.33 + config: + min_w: 6 # Minimum width of the puzzle + min_h: 6 # Minimum height of the puzzle + max_w: 10 # Maximum width of the puzzle + max_h: 10 # Maximum height of the puzzle + min_boxes: 4 # Minimum number of boxes + max_boxes: 10 # Maximum number of boxes + max_depth: 80 # Maximum search depth +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: 
cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. 
num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_games_qwen_3b_composite + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/configs/intra_generalisation/graphs_qwen_3b.yaml b/training/configs/intra_generalisation/graphs_qwen_3b.yaml new file mode 100644 index 00000000..2bc5c740 --- /dev/null +++ b/training/configs/intra_generalisation/graphs_qwen_3b.yaml @@ -0,0 +1,226 @@ +reasoning_gym: + dataset_size: 20000 + developer_prompt: DeepSeekZero + datasets: + shortest_path: + weight: 0.33 + config: + min_rows: 5 + max_rows: 8 + min_cols: 5 + max_cols: 8 + p_blocked: 0.4 + largest_island: + weight: 0.34 + config: + min_rows: 5 + max_rows: 10 + min_cols: 5 + max_cols: 10 + min_num_islands: 0 + max_num_islands: 5 + min_island_size: 0 + max_island_size: 10 + 
quantum_lock: + weight: 0.33 + config: + difficulty: 10 +curriculum: + enabled: False + schedule: + automatic: True + update_steps: 30 # automatic curriculum updating after 50 steps + last_k: 20 + success_threshold: 0.70 + failure_threshold: 0.10 + curricula: + spell_backward: + attribute_levels: + word_len: 0 +reward: + use_accuracy: True + secondary_rewards: + - name: cosine + scaling_factor: 0.3 + - name: format + scaling_factor: 0.2 + kwargs: + preappend_thinking_token: False + +data: + tokenizer: null + train_files: train.parquet + val_files: test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 64 + return_raw_chat: True + return_raw_input_ids: True +actor_rollout_ref: + hybrid_engine: True + model: + path: Qwen/Qwen2.5-3B-Instruct + external_lib: null + override_config: { } + enable_gradient_checkpointing: True + use_remove_padding: True + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 16 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 4 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: True # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: 500 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: True + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.7 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 4 + max_num_batched_tokens: 12288 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 160 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + disable_log_stats: True + enable_chunked_prefill: True # could get higher throughput + # for hf rollout + do_sample: True + use_fire_sampling: False + max_model_len: 12288 + # number of responses (i.e. 
num sample times) + n: 8 # > 1 for grpo + val_kwargs: + do_sample: True + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 +verbose: True +trainer: + balance_batch: True + total_epochs: 1 + total_training_steps: 500 + project_name: rg-test + experiment_name: intra_reasoning_games_qwen_3b_graphs + logger: [ 'console', 'wandb' ] + val_generations_to_log_to_wandb: 0 + nnodes: 1 + n_gpus_per_node: 4 + save_freq: 100 + # auto: find the last ckpt to resume. If can't find, start from scratch + resume_mode: auto # or auto or resume_path if + resume_from_path: False + test_freq: 100 + critic_warmup: 0 + default_hdfs_dir: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: True + use_remove_padding: False + fsdp_config: + param_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +# Reward model not used for GRPO +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + fsdp_size: -1 + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} diff --git a/training/evaluations/eval_algebraic_composite.yaml b/training/evaluations/eval_algebraic_composite.yaml new file mode 100644 index 00000000..f79514b4 --- /dev/null +++ b/training/evaluations/eval_algebraic_composite.yaml @@ -0,0 +1,28 @@ +# Model configuration +model_path: ../utils/qwen3b_algebraic +max_tokens: 1024 +temperature: 0.6 +top_p: 0.9 +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: 
simple_integration + size: 100 + seed: 42 + params: + min_terms: 2 + max_terms: 5 + min_degree: 1 + max_degree: 10 + min_bounds: 1 + max_bounds: 10 diff --git a/training/evaluations/eval_algorithmic_composite.yaml b/training/evaluations/eval_algorithmic_composite.yaml index 6eca3dda..1adba40f 100644 --- a/training/evaluations/eval_algorithmic_composite.yaml +++ b/training/evaluations/eval_algorithmic_composite.yaml @@ -1,8 +1,8 @@ # Model configuration -model_path: ../utils/qwen3b_500 # Change to the smaller model -max_tokens: 1024 # From max_response_length in training config -temperature: 0.7 # Lower temperature for more focused responses -top_p: 0.9 # From rollout top_p +model_path: ../utils/qwen3b_algorithmic_500 +max_tokens: 1024 +temperature: 0.6 +top_p: 0.9 developer_prompt: DeepSeekZero developer_role: system # Standard role for system prompts diff --git a/training/evaluations/eval_arithmetic_composite.yaml b/training/evaluations/eval_arithmetic_composite.yaml new file mode 100644 index 00000000..388cfdb3 --- /dev/null +++ b/training/evaluations/eval_arithmetic_composite.yaml @@ -0,0 +1,24 @@ +# Model configuration +model_path: ../utils/qwen_3b_arithmetic_100 +max_tokens: 1024 +temperature: 0.6 +top_p: 0.9 +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: prime_factorization + size: 100 + seed: 42 + params: + min_value: 2 + max_value: 1000 diff --git a/training/evaluations/eval_cognition_composite.yaml b/training/evaluations/eval_cognition_composite.yaml new file mode 100644 index 00000000..6b5a2279 --- /dev/null +++ b/training/evaluations/eval_cognition_composite.yaml @@ -0,0 +1,36 @@ +# Model configuration +model_path: ../utils/qwen3b_cognition +max_tokens: 1024 +temperature: 0.6 # Lower temperature for more focused responses +top_p: 0.9 # From rollout top_p +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: number_sequence + size: 100 + seed: 42 + params: + min_terms: 4 # Minimum visible terms + max_terms: 8 # Maximum visible terms + min_value: -100 # Minimum allowed number + max_value: 100 # Maximum allowed number + max_complexity: 3 # Maximum number of operations to combine + - dataset: modulo_grid + size: 100 + seed: 42 + params: + size_x: 20 + size_y: 20 + max_divisor: 20 + max_target: 20 + max_holes: 1 diff --git a/training/evaluations/eval_games_composite.yaml b/training/evaluations/eval_games_composite.yaml new file mode 100644 index 00000000..b183b6fd --- /dev/null +++ b/training/evaluations/eval_games_composite.yaml @@ -0,0 +1,24 @@ +# Model configuration +model_path: ../utils/qwen3b_games +max_tokens: 1024 +temperature: 0.6 # Lower temperature for more focused responses +top_p: 0.9 # From rollout top_p +developer_prompt: DeepSeekZero +developer_role: system # Standard role for system prompts + +# Output configuration +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +# Categories and datasets to evaluate +categories: + - category: reasoning + datasets: + - dataset: mahjong_puzzle + size: 100 + seed: 42 + params: + min_num_rounds: 10 + 
max_num_rounds: 50 diff --git a/training/evaluations/eval_qwen_3b.yaml b/training/evaluations/eval_qwen_3b.yaml index 4069895d..132989fb 100644 --- a/training/evaluations/eval_qwen_3b.yaml +++ b/training/evaluations/eval_qwen_3b.yaml @@ -16,13 +16,13 @@ eval_repeats: 3 categories: - category: reasoning datasets: - - dataset: number_sorting + - dataset: decimal_chain_sum size: 100 seed: 42 params: - min_numbers: 3 - max_numbers: 10 - min_decimals: 0 - max_decimals: 2 - min_value: -100.0 - max_value: 100.0 + min_terms: 2 + max_terms: 4 + min_digits: 1 + max_digits: 3 + min_decimal_places: 1 + max_decimal_places: 4
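The partial-credit rule introduced in `rectangle_count.py` above is easy to sanity-check in isolation. The sketch below reuses the `CONST_TERM = 0.8` and `D = 5` constants from the diff; the `partial_credit` helper name is ours, not part of the library:

```python
# Standalone sketch of rectangle_count.score_answer's distance-based rule:
# full credit at distance 0, zero credit at distance >= D, and a linear
# falloff scaled by CONST_TERM in between.
CONST_TERM = 0.8
D = 5


def partial_credit(user_val: int, correct_val: int) -> float:
    distance = abs(user_val - correct_val)
    if distance == 0:
        return 1.0
    if distance >= D:
        return 0.0
    return CONST_TERM * (1.0 - distance / float(D))


if __name__ == "__main__":
    # With a correct count of 7, guesses 7..12 score 1.0, 0.64, 0.48, 0.32, 0.16, 0.0
    for guess in range(7, 13):
        print(guess, round(partial_credit(guess, 7), 2))
```

Note that the largest possible partial score is 0.64 (an off-by-one guess), so a near miss is always rewarded strictly less than an exact answer.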