mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Feat/intragen experiments (#414)
* added curriculum * readapted readme * corrected small errors * Delete eval/eval/r1/algorithmic/word_sorting.json * removed redundant argument * added spell * removed duplicated fit * changed config * added composite changes * added composite changes * updated yaml * added spell backward * updated read me * added qwen2.5 * added * Add files via upload * updated missing trainer func * updated curr * updated spell back * updated correctness score func * updated configs * added local evals * added updates * updated datasets * added fsdp to hf utility * added algorithmic qwen 3b yaml * updated read me * updated configs * added preappend token * updated with thinking token * updated test score board * resolved comments * added evaluation scripts * removed results from pr * added config * added partial reward scoring * added evaluation composites * added training configs * added games eval * added rubriks cube * resolved merge cinflicts * added games config * added latest eval configs * updated strucutre * Delete training/evaluations/eval_graphs_composite.yaml --------- Co-authored-by: joesharratt1229 <joesharrat1229@gmail.com>
This commit is contained in:
parent
224532f12a
commit
d0ef136d5b
21 changed files with 1331 additions and 48 deletions
|
|
@ -170,7 +170,7 @@ class NumberSortingCurriculum(BaseCurriculum):
|
||||||
self._define_attributes(
|
self._define_attributes(
|
||||||
RangeAttributeDefinition(
|
RangeAttributeDefinition(
|
||||||
name="numbers",
|
name="numbers",
|
||||||
levels=[10, 50, 100, 200],
|
levels=[10, 100, 500, 1000],
|
||||||
description="How many numbers to sort",
|
description="How many numbers to sort",
|
||||||
lower_field_name="min_numbers",
|
lower_field_name="min_numbers",
|
||||||
upper_field_name="max_numbers",
|
upper_field_name="max_numbers",
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ from dataclasses import dataclass
|
||||||
from random import Random
|
from random import Random
|
||||||
from typing import Any, Literal, Optional
|
from typing import Any, Literal, Optional
|
||||||
|
|
||||||
from ..coaching import BaseCurriculum, RangeAttributeDefinition
|
from ..coaching import BaseCurriculum, RangeAttributeDefinition, ScalarAttributeDefinition
|
||||||
from ..factory import ProceduralDataset, register_dataset
|
from ..factory import ProceduralDataset, register_dataset
|
||||||
|
|
||||||
DATASET_NAME = "basic_arithmetic"
|
DATASET_NAME = "basic_arithmetic"
|
||||||
|
|
@ -250,17 +250,19 @@ class BasicArithmeticCurriculum(BaseCurriculum):
|
||||||
self._define_attributes(
|
self._define_attributes(
|
||||||
RangeAttributeDefinition(
|
RangeAttributeDefinition(
|
||||||
name="num_terms",
|
name="num_terms",
|
||||||
levels=[2, 5, 10, 15],
|
levels=[2, 3, 4, 5, 6],
|
||||||
description="Number of terms in the expression",
|
description="Number of terms in the expression",
|
||||||
lower_field_name="min_terms",
|
lower_field_name="min_terms",
|
||||||
upper_field_name="max_terms",
|
upper_field_name="max_terms",
|
||||||
|
ensure_interval=False,
|
||||||
),
|
),
|
||||||
RangeAttributeDefinition(
|
RangeAttributeDefinition(
|
||||||
name="num_digits",
|
name="num_digits",
|
||||||
levels=[1, 2, 5, 10],
|
levels=[1, 2, 3, 4],
|
||||||
description="Number of digits in the numbers",
|
description="Number of digits in the numbers",
|
||||||
lower_field_name="min_digits",
|
lower_field_name="min_digits",
|
||||||
upper_field_name="max_digits",
|
upper_field_name="max_digits",
|
||||||
|
ensure_interval=False,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ Now, it's your turn. How many rectangles do you see in the grid below?
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DATASET_NAME = "rectangle_count"
|
DATASET_NAME = "rectangle_count"
|
||||||
|
CONST_TERM = 0.8
|
||||||
|
D = 5
|
||||||
|
|
||||||
|
|
||||||
def draw_rectangles_with_overlap(n, width, height, rng):
|
def draw_rectangles_with_overlap(n, width, height, rng):
|
||||||
|
|
@ -132,22 +134,29 @@ class RectangleCountDataset(ProceduralDataset):
|
||||||
}
|
}
|
||||||
|
|
||||||
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
|
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
|
||||||
"""Determine if the solution provided solves the RectangleCount task.
|
"""Determine if the solution provided solves the RectangleCount task,
|
||||||
|
awarding partial credit if the guess is close.
|
||||||
The function awards 1.0 for a correct answer.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
answer (Optional[str]): The user's answer.
|
|
||||||
entry (dict[str, Any]): The original dataset entry containing the correct answer.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
float: The computed score between 0.0 and 1.0.
|
float: A score between 0.0 and 1.0.
|
||||||
"""
|
"""
|
||||||
|
correct_str = entry["answer"].lower().replace("\n", "")
|
||||||
|
|
||||||
if isinstance(answer, str):
|
try:
|
||||||
if answer.lower().replace("\n", "") == entry["answer"].lower().replace("\n", ""):
|
correct_val = int(correct_str)
|
||||||
return 1.0 # Yay
|
user_val = int(answer.strip())
|
||||||
return 0.0
|
except (ValueError, TypeError, AttributeError):
|
||||||
|
return 0.0
|
||||||
|
distance = abs(user_val - correct_val)
|
||||||
|
|
||||||
|
if distance == 0:
|
||||||
|
return 1.0
|
||||||
|
if distance >= D:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
score = 1.0 - (distance / float(D))
|
||||||
|
score = CONST_TERM * score
|
||||||
|
return max(0.0, score)
|
||||||
|
|
||||||
|
|
||||||
class RectangleCountCurriculum(BaseCurriculum):
|
class RectangleCountCurriculum(BaseCurriculum):
|
||||||
|
|
|
||||||
|
|
@ -121,29 +121,49 @@ class RubiksCubeDataset(ProceduralDataset):
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def partial_score(self, cube: Cube) -> float:
|
||||||
|
"""
|
||||||
|
Returns a fraction between 0 and 1, indicating how many stickers are
|
||||||
|
correctly positioned (i.e., match the solved color for that face).
|
||||||
|
"""
|
||||||
|
total_stickers = 6 * (cube.size**2)
|
||||||
|
correct_stickers = 0
|
||||||
|
|
||||||
|
for face_index in range(6):
|
||||||
|
face = cube.faces[face_index]
|
||||||
|
|
||||||
|
solved_color = face[cube.size // 2][cube.size // 2].color
|
||||||
|
for row in range(cube.size):
|
||||||
|
for col in range(cube.size):
|
||||||
|
sticker = face[row][col]
|
||||||
|
if sticker.color == solved_color:
|
||||||
|
correct_stickers += 1
|
||||||
|
|
||||||
|
return correct_stickers / total_stickers
|
||||||
|
|
||||||
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
|
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
|
||||||
"""Determine if the solution provided solves the cube"""
|
"""Determine if the solution provided solves the cube, with partial rewards."""
|
||||||
reward = 0.0 # default reward
|
reward = 0.0 # default
|
||||||
if answer is not None:
|
if answer is not None:
|
||||||
# Reconstruct the test cube
|
|
||||||
eval_cube = Cube(entry["metadata"]["cube_size"])
|
eval_cube = Cube(entry["metadata"]["cube_size"])
|
||||||
eval_cube.rotate(entry["metadata"]["scramble_moves"])
|
eval_cube.rotate(entry["metadata"]["scramble_moves"])
|
||||||
|
|
||||||
# Test the solution
|
|
||||||
try:
|
try:
|
||||||
expanded_answer = self.expand_moves(answer)
|
expanded_answer = self.expand_moves(answer)
|
||||||
eval_cube.rotate(expanded_answer)
|
eval_cube.rotate(expanded_answer)
|
||||||
solved = eval_cube.is_done()
|
|
||||||
|
|
||||||
|
# 3) Check if fully solved
|
||||||
|
solved = eval_cube.is_done()
|
||||||
if solved:
|
if solved:
|
||||||
reward = 1.0
|
reward = 1.0
|
||||||
elif len(answer.strip()) > 0: # encourage non-empty answers
|
|
||||||
reward = 0.05 # Incorrect, but rotate could parse the answer
|
|
||||||
else:
|
else:
|
||||||
reward = 0.01
|
partial = self.partial_score(eval_cube)
|
||||||
except:
|
|
||||||
reward = 0.01 # At least you tried
|
|
||||||
|
|
||||||
|
if len(answer.strip()) > 0:
|
||||||
|
reward = max(0.05, partial)
|
||||||
|
else:
|
||||||
|
reward = max(0.01, partial)
|
||||||
|
except:
|
||||||
|
reward = 0.01
|
||||||
return reward
|
return reward
|
||||||
|
|
||||||
def remove_ansi(self, line):
|
def remove_ansi(self, line):
|
||||||
|
|
|
||||||
|
|
@ -99,6 +99,7 @@ Here is your puzzle:
|
||||||
"source_dataset": DATASET_NAME,
|
"source_dataset": DATASET_NAME,
|
||||||
"source_index": idx,
|
"source_index": idx,
|
||||||
"gamestr": gamestr,
|
"gamestr": gamestr,
|
||||||
|
"source_dataset": DATASET_NAME,
|
||||||
"width": puzzle_data["width"],
|
"width": puzzle_data["width"],
|
||||||
"height": puzzle_data["height"],
|
"height": puzzle_data["height"],
|
||||||
"difficulty": {
|
"difficulty": {
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ from reasoning_gym.arithmetic.basic_arithmetic import (
|
||||||
BasicArithmeticDatasetConfig,
|
BasicArithmeticDatasetConfig,
|
||||||
eval_floordiv,
|
eval_floordiv,
|
||||||
)
|
)
|
||||||
|
from reasoning_gym.coaching.base_curriculum import DefaultCurriculumContext, RangeAttributeMode
|
||||||
|
|
||||||
|
|
||||||
def test_arithmetic_dataset_config_validation():
|
def test_arithmetic_dataset_config_validation():
|
||||||
|
|
@ -103,7 +104,7 @@ def test_basic_arithmetic_curriculum():
|
||||||
"""Test the BasicArithmeticCurriculum functionality"""
|
"""Test the BasicArithmeticCurriculum functionality"""
|
||||||
curriculum = BasicArithmeticCurriculum()
|
curriculum = BasicArithmeticCurriculum()
|
||||||
|
|
||||||
base_value = {"size": 150, "seed": 1}
|
base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1}
|
||||||
|
|
||||||
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value)
|
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value)
|
||||||
assert base_cfg.seed == 1
|
assert base_cfg.seed == 1
|
||||||
|
|
@ -115,7 +116,7 @@ def test_basic_arithmetic_curriculum():
|
||||||
curriculum.increment_attr_level("num_terms")
|
curriculum.increment_attr_level("num_terms")
|
||||||
curriculum.increment_attr_level("num_digits")
|
curriculum.increment_attr_level("num_digits")
|
||||||
increased_cfg = curriculum.generate_configuration(base_value)
|
increased_cfg = curriculum.generate_configuration(base_value)
|
||||||
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 5
|
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3
|
||||||
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2
|
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2
|
||||||
|
|
||||||
# Test decrementing attribute level for num_terms
|
# Test decrementing attribute level for num_terms
|
||||||
|
|
@ -128,7 +129,7 @@ def test_basic_arithmetic_curriculum():
|
||||||
curriculum.increment_attr_level("num_terms")
|
curriculum.increment_attr_level("num_terms")
|
||||||
curriculum.increment_attr_level("num_terms")
|
curriculum.increment_attr_level("num_terms")
|
||||||
higher_level_cfg = curriculum.generate_configuration(base_value)
|
higher_level_cfg = curriculum.generate_configuration(base_value)
|
||||||
assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 10
|
assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 4
|
||||||
assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2
|
assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2
|
||||||
|
|
||||||
# Test boundary conditions - trying to decrement below level 0
|
# Test boundary conditions - trying to decrement below level 0
|
||||||
|
|
@ -144,5 +145,26 @@ def test_basic_arithmetic_curriculum():
|
||||||
curriculum.increment_attr_level("num_terms")
|
curriculum.increment_attr_level("num_terms")
|
||||||
curriculum.increment_attr_level("num_digits")
|
curriculum.increment_attr_level("num_digits")
|
||||||
upper_bound_cfg = curriculum.generate_configuration(base_value)
|
upper_bound_cfg = curriculum.generate_configuration(base_value)
|
||||||
assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 15
|
assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 6
|
||||||
assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 10
|
assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_basic_arithmetic_curriculum_upper_bound():
|
||||||
|
curriculum = BasicArithmeticCurriculum()
|
||||||
|
|
||||||
|
base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1}
|
||||||
|
|
||||||
|
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(
|
||||||
|
base_value, context=DefaultCurriculumContext(mode=RangeAttributeMode.UPPER_BOUND)
|
||||||
|
)
|
||||||
|
assert base_cfg.seed == 1
|
||||||
|
assert base_cfg.size == 150
|
||||||
|
assert base_cfg.min_terms == 2 and base_cfg.max_terms == 2
|
||||||
|
assert base_cfg.min_digits == 1 and base_cfg.max_digits == 1
|
||||||
|
|
||||||
|
# Test incrementing attribute levels
|
||||||
|
curriculum.increment_attr_level("num_terms")
|
||||||
|
curriculum.increment_attr_level("num_digits")
|
||||||
|
increased_cfg = curriculum.generate_configuration(base_value)
|
||||||
|
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3
|
||||||
|
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,7 @@ def test_number_sorting_dataset_items():
|
||||||
|
|
||||||
# Verify number count constraints
|
# Verify number count constraints
|
||||||
numbers = item["metadata"]["original_numbers"]
|
numbers = item["metadata"]["original_numbers"]
|
||||||
|
print(numbers)
|
||||||
assert len(numbers) >= config.min_numbers
|
assert len(numbers) >= config.min_numbers
|
||||||
assert len(numbers) <= config.max_numbers
|
assert len(numbers) <= config.max_numbers
|
||||||
|
|
||||||
|
|
@ -99,7 +100,7 @@ def test_number_sorting_curriculum():
|
||||||
base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value)
|
base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value)
|
||||||
assert base_cfg.seed == 1
|
assert base_cfg.seed == 1
|
||||||
assert base_cfg.size == 150
|
assert base_cfg.size == 150
|
||||||
assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 50
|
assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 100
|
||||||
assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1
|
assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1
|
||||||
assert base_cfg.min_value == -100 and base_cfg.max_value == 100
|
assert base_cfg.min_value == -100 and base_cfg.max_value == 100
|
||||||
|
|
||||||
|
|
@ -107,14 +108,14 @@ def test_number_sorting_curriculum():
|
||||||
curriculum.increment_attr_level("numbers")
|
curriculum.increment_attr_level("numbers")
|
||||||
curriculum.increment_attr_level("decimals")
|
curriculum.increment_attr_level("decimals")
|
||||||
increased_cfg = curriculum.generate_configuration(base_value)
|
increased_cfg = curriculum.generate_configuration(base_value)
|
||||||
assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 100
|
assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 500
|
||||||
assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2
|
assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2
|
||||||
assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100
|
assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100
|
||||||
|
|
||||||
# test decrementing attribute level for numbers again
|
# test decrementing attribute level for numbers again
|
||||||
curriculum.decrement_attr_level("numbers")
|
curriculum.decrement_attr_level("numbers")
|
||||||
partially_decreased_cfg = curriculum.generate_configuration(base_value)
|
partially_decreased_cfg = curriculum.generate_configuration(base_value)
|
||||||
assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 50
|
assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 100
|
||||||
assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2
|
assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2
|
||||||
assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100
|
assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -55,9 +55,9 @@ def test_rubikscube_items():
|
||||||
assert dataset.score_answer(answer=None, entry=item) == 0.0
|
assert dataset.score_answer(answer=None, entry=item) == 0.0
|
||||||
|
|
||||||
if item["metadata"]["example_correct_answer"] != "R":
|
if item["metadata"]["example_correct_answer"] != "R":
|
||||||
assert dataset.score_answer(answer="R", entry=item) == 0.05
|
assert dataset.score_answer(answer="R", entry=item) == 0.01
|
||||||
|
|
||||||
assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.05
|
assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.01
|
||||||
|
|
||||||
if len(item["metadata"]["example_correct_answer"]) > 0:
|
if len(item["metadata"]["example_correct_answer"]) > 0:
|
||||||
assert dataset.score_answer(answer="", entry=item) == 0.01
|
assert dataset.score_answer(answer="", entry=item) == 0.01
|
||||||
|
|
|
||||||
|
|
@ -87,6 +87,7 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_
|
||||||
From here you may want to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you can run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a huggingface checkpoint at `training/utils/qwen3b_500` (note that you have to convert the fsdp checkpoint to a hf checkpoint for the evaluation script to work, as shown in the previous step).
|
From here you may want to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you can run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a huggingface checkpoint at `training/utils/qwen3b_500` (note that you have to convert the fsdp checkpoint to a hf checkpoint for the evaluation script to work, as shown in the previous step).
|
||||||
|
|
||||||
## Run the script
|
## Run the script
|
||||||
|
export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||||
Navigate to evaluations directory:
|
Navigate to evaluations directory:
|
||||||
```
|
```
|
||||||
python evaluate_model.py --config path-to-yaml
|
python evaluate_model.py --config path-to-yaml
|
||||||
|
|
|
||||||
221
training/configs/intra_generalisation/algebra_qwen_3b.yaml
Normal file
221
training/configs/intra_generalisation/algebra_qwen_3b.yaml
Normal file
|
|
@ -0,0 +1,221 @@
|
||||||
|
reasoning_gym:
|
||||||
|
dataset_size: 20000
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
datasets:
|
||||||
|
simple_equations:
|
||||||
|
weight: 0.5
|
||||||
|
config:
|
||||||
|
min_terms: 2
|
||||||
|
max_terms: 4
|
||||||
|
min_value: 1
|
||||||
|
max_value: 100
|
||||||
|
polynomial_multiplication:
|
||||||
|
weight: 0.5
|
||||||
|
config:
|
||||||
|
min_terms: 2
|
||||||
|
max_terms: 4
|
||||||
|
min_value: 1
|
||||||
|
max_value: 100
|
||||||
|
min_degree: 0
|
||||||
|
max_degree: 3
|
||||||
|
min_polynomials: 2
|
||||||
|
max_polynomials: 3
|
||||||
|
curriculum:
|
||||||
|
enabled: False
|
||||||
|
schedule:
|
||||||
|
automatic: True
|
||||||
|
update_steps: 30 # automatic curriculum updating after 30 steps
|
||||||
|
last_k: 20
|
||||||
|
success_threshold: 0.70
|
||||||
|
failure_threshold: 0.10
|
||||||
|
curricula:
|
||||||
|
spell_backward:
|
||||||
|
attribute_levels:
|
||||||
|
word_len: 0
|
||||||
|
reward:
|
||||||
|
use_accuracy: True
|
||||||
|
secondary_rewards:
|
||||||
|
- name: cosine
|
||||||
|
scaling_factor: 0.3
|
||||||
|
- name: format
|
||||||
|
scaling_factor: 0.2
|
||||||
|
kwargs:
|
||||||
|
preappend_thinking_token: False
|
||||||
|
|
||||||
|
data:
|
||||||
|
tokenizer: null
|
||||||
|
train_files: train.parquet
|
||||||
|
val_files: test.parquet
|
||||||
|
prompt_key: prompt
|
||||||
|
max_prompt_length: 512
|
||||||
|
max_response_length: 1024
|
||||||
|
train_batch_size: 32
|
||||||
|
val_batch_size: 64
|
||||||
|
return_raw_chat: True
|
||||||
|
return_raw_input_ids: True
|
||||||
|
actor_rollout_ref:
|
||||||
|
hybrid_engine: True
|
||||||
|
model:
|
||||||
|
path: Qwen/Qwen2.5-3B-Instruct
|
||||||
|
external_lib: null
|
||||||
|
override_config: { }
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: True
|
||||||
|
actor:
|
||||||
|
strategy: fsdp # This is for backward-compatibility
|
||||||
|
ppo_mini_batch_size: 16
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: 4
|
||||||
|
use_dynamic_bsz: False
|
||||||
|
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
|
||||||
|
grad_clip: 1.0
|
||||||
|
clip_ratio: 0.2
|
||||||
|
entropy_coeff: 0.001
|
||||||
|
use_kl_loss: True # True for GRPO
|
||||||
|
kl_loss_coef: 0.001 # for grpo
|
||||||
|
kl_loss_type: low_var_kl # for grpo
|
||||||
|
ppo_epochs: 1
|
||||||
|
shuffle: False
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
optim:
|
||||||
|
lr: 1e-6
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: 500 # will be overridden by the program at runtime
|
||||||
|
fsdp_config:
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
ref:
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: True
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
|
||||||
|
rollout:
|
||||||
|
name: vllm
|
||||||
|
temperature: 1.0
|
||||||
|
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
|
||||||
|
top_p: 1
|
||||||
|
prompt_length: ${data.max_prompt_length} # not use for opensource
|
||||||
|
response_length: ${data.max_response_length}
|
||||||
|
# for vllm rollout
|
||||||
|
dtype: bfloat16 # should align with FSDP
|
||||||
|
gpu_memory_utilization: 0.7
|
||||||
|
ignore_eos: False
|
||||||
|
enforce_eager: True
|
||||||
|
free_cache_engine: True
|
||||||
|
load_format: dummy_dtensor
|
||||||
|
tensor_model_parallel_size: 4
|
||||||
|
max_num_batched_tokens: 12288
|
||||||
|
max_num_seqs: 1024
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
disable_log_stats: True
|
||||||
|
enable_chunked_prefill: True # could get higher throughput
|
||||||
|
# for hf rollout
|
||||||
|
do_sample: True
|
||||||
|
use_fire_sampling: False
|
||||||
|
max_model_len: 12288
|
||||||
|
# number of responses (i.e. num sample times)
|
||||||
|
n: 8 # > 1 for grpo
|
||||||
|
val_kwargs:
|
||||||
|
do_sample: True
|
||||||
|
|
||||||
|
algorithm:
|
||||||
|
gamma: 1.0
|
||||||
|
lam: 1.0
|
||||||
|
adv_estimator: grpo
|
||||||
|
kl_penalty: kl # how to estimate kl divergence
|
||||||
|
kl_ctrl:
|
||||||
|
type: fixed
|
||||||
|
kl_coef: 0.001
|
||||||
|
verbose: True
|
||||||
|
trainer:
|
||||||
|
balance_batch: True
|
||||||
|
total_epochs: 1
|
||||||
|
total_training_steps: 500
|
||||||
|
project_name: rg-test
|
||||||
|
experiment_name: intra_reasoning_algebra_qwen_3b_composite
|
||||||
|
logger: [ 'console', 'wandb' ]
|
||||||
|
val_generations_to_log_to_wandb: 0
|
||||||
|
nnodes: 1
|
||||||
|
n_gpus_per_node: 4
|
||||||
|
save_freq: 100
|
||||||
|
# auto: find the last ckpt to resume. If can't find, start from scratch
|
||||||
|
resume_mode: auto # one of: auto (resume last ckpt if found), disable, or resume_path
|
||||||
|
resume_from_path: False
|
||||||
|
test_freq: 100
|
||||||
|
critic_warmup: 0
|
||||||
|
default_hdfs_dir: null
|
||||||
|
remove_previous_ckpt_in_save: False
|
||||||
|
del_local_ckpt_after_load: False
|
||||||
|
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
|
||||||
|
|
||||||
|
|
||||||
|
critic:
|
||||||
|
strategy: fsdp
|
||||||
|
optim:
|
||||||
|
lr: 1e-5
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: -1 # will be overridden by the program at runtime
|
||||||
|
model:
|
||||||
|
path: ~/models/deepseek-llm-7b-chat
|
||||||
|
tokenizer_path: ${actor_rollout_ref.model.path}
|
||||||
|
override_config: { }
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
fsdp_size: -1
|
||||||
|
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: null
|
||||||
|
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
|
||||||
|
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
|
||||||
|
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
|
||||||
|
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
|
||||||
|
shuffle: ${actor_rollout_ref.actor.shuffle}
|
||||||
|
grad_clip: 1.0
|
||||||
|
cliprange_value: 0.5
|
||||||
|
|
||||||
|
# Reward model not used for GRPO
|
||||||
|
reward_model:
|
||||||
|
enable: False
|
||||||
|
strategy: fsdp
|
||||||
|
model:
|
||||||
|
input_tokenizer: ${actor_rollout_ref.model.path}
|
||||||
|
path: ~/models/FsfairX-LLaMA3-RM-v0.1
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
micro_batch_size: null
|
||||||
|
micro_batch_size_per_gpu: null
|
||||||
|
max_length: null
|
||||||
|
ulysses_sequence_parallel_size: 1
|
||||||
|
use_dynamic_bsz: ${critic.use_dynamic_bsz}
|
||||||
|
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
|
||||||
224
training/configs/intra_generalisation/arithmetic_qwen_3b.yaml
Normal file
224
training/configs/intra_generalisation/arithmetic_qwen_3b.yaml
Normal file
|
|
@ -0,0 +1,224 @@
|
||||||
|
reasoning_gym:
|
||||||
|
dataset_size: 20000
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
datasets:
|
||||||
|
fraction_simplification:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
min_value: 1
|
||||||
|
max_value: 1000
|
||||||
|
min_factor: 1
|
||||||
|
max_factor: 100
|
||||||
|
gcd:
|
||||||
|
weight: 0.34
|
||||||
|
config:
|
||||||
|
min_numbers: 2 # Minimum numbers to find GCD of
|
||||||
|
max_numbers: 2 # Maximum numbers to find GCD of
|
||||||
|
min_value: 1 # Minimum value for each number
|
||||||
|
max_value: 1000 # Maximum value for each number
|
||||||
|
lcm:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
min_numbers: 2
|
||||||
|
max_numbers: 2
|
||||||
|
min_value: 1
|
||||||
|
max_value: 100
|
||||||
|
curriculum:
|
||||||
|
enabled: False
|
||||||
|
schedule:
|
||||||
|
automatic: True
|
||||||
|
update_steps: 30
|
||||||
|
last_k: 20
|
||||||
|
success_threshold: 0.70
|
||||||
|
failure_threshold: 0.10
|
||||||
|
curricula:
|
||||||
|
spell_backward:
|
||||||
|
attribute_levels:
|
||||||
|
word_len: 0
|
||||||
|
reward:
|
||||||
|
use_accuracy: True
|
||||||
|
secondary_rewards:
|
||||||
|
- name: cosine
|
||||||
|
scaling_factor: 0.3
|
||||||
|
- name: format
|
||||||
|
scaling_factor: 0.2
|
||||||
|
kwargs:
|
||||||
|
preappend_thinking_token: False
|
||||||
|
|
||||||
|
data:
|
||||||
|
tokenizer: null
|
||||||
|
train_files: train.parquet
|
||||||
|
val_files: test.parquet
|
||||||
|
prompt_key: prompt
|
||||||
|
max_prompt_length: 512
|
||||||
|
max_response_length: 1024
|
||||||
|
train_batch_size: 32
|
||||||
|
val_batch_size: 64
|
||||||
|
return_raw_chat: True
|
||||||
|
return_raw_input_ids: True
|
||||||
|
actor_rollout_ref:
|
||||||
|
hybrid_engine: True
|
||||||
|
model:
|
||||||
|
path: Qwen/Qwen2.5-3B-Instruct
|
||||||
|
external_lib: null
|
||||||
|
override_config: { }
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: True
|
||||||
|
actor:
|
||||||
|
strategy: fsdp # This is for backward-compatibility
|
||||||
|
ppo_mini_batch_size: 16
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: 4
|
||||||
|
use_dynamic_bsz: False
|
||||||
|
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
|
||||||
|
grad_clip: 1.0
|
||||||
|
clip_ratio: 0.2
|
||||||
|
entropy_coeff: 0.001
|
||||||
|
use_kl_loss: True # True for GRPO
|
||||||
|
kl_loss_coef: 0.001 # for grpo
|
||||||
|
kl_loss_type: low_var_kl # for grpo
|
||||||
|
ppo_epochs: 1
|
||||||
|
shuffle: False
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
optim:
|
||||||
|
lr: 1e-6
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: 500 # must be override by program
|
||||||
|
fsdp_config:
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
ref:
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: True
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
|
||||||
|
rollout:
|
||||||
|
name: vllm
|
||||||
|
temperature: 1.0
|
||||||
|
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
|
||||||
|
top_p: 1
|
||||||
|
prompt_length: ${data.max_prompt_length} # not use for opensource
|
||||||
|
response_length: ${data.max_response_length}
|
||||||
|
# for vllm rollout
|
||||||
|
dtype: bfloat16 # should align with FSDP
|
||||||
|
gpu_memory_utilization: 0.7
|
||||||
|
ignore_eos: False
|
||||||
|
enforce_eager: True
|
||||||
|
free_cache_engine: True
|
||||||
|
load_format: dummy_dtensor
|
||||||
|
tensor_model_parallel_size: 4
|
||||||
|
max_num_batched_tokens: 12288
|
||||||
|
max_num_seqs: 1024
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
disable_log_stats: True
|
||||||
|
enable_chunked_prefill: True # could get higher throughput
|
||||||
|
# for hf rollout
|
||||||
|
do_sample: True
|
||||||
|
use_fire_sampling: False
|
||||||
|
max_model_len: 12288
|
||||||
|
# number of responses (i.e. num sample times)
|
||||||
|
n: 8 # > 1 for grpo
|
||||||
|
val_kwargs:
|
||||||
|
do_sample: True
|
||||||
|
|
||||||
|
algorithm:
|
||||||
|
gamma: 1.0
|
||||||
|
lam: 1.0
|
||||||
|
adv_estimator: grpo
|
||||||
|
kl_penalty: kl # how to estimate kl divergence
|
||||||
|
kl_ctrl:
|
||||||
|
type: fixed
|
||||||
|
kl_coef: 0.001
|
||||||
|
verbose: True
|
||||||
|
trainer:
|
||||||
|
balance_batch: True
|
||||||
|
total_epochs: 1
|
||||||
|
total_training_steps: 500
|
||||||
|
project_name: rg-test
|
||||||
|
experiment_name: intra_reasoning_arithmetic_qwen_3b_composite
|
||||||
|
logger: [ 'console', 'wandb' ]
|
||||||
|
val_generations_to_log_to_wandb: 0
|
||||||
|
nnodes: 1
|
||||||
|
n_gpus_per_node: 4
|
||||||
|
save_freq: 100
|
||||||
|
# auto: find the last ckpt to resume. If can't find, start from scratch
|
||||||
|
resume_mode: auto # or auto or resume_path if
|
||||||
|
resume_from_path: False
|
||||||
|
test_freq: 100
|
||||||
|
critic_warmup: 0
|
||||||
|
default_hdfs_dir: null
|
||||||
|
remove_previous_ckpt_in_save: False
|
||||||
|
del_local_ckpt_after_load: False
|
||||||
|
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
|
||||||
|
|
||||||
|
|
||||||
|
critic:
|
||||||
|
strategy: fsdp
|
||||||
|
optim:
|
||||||
|
lr: 1e-5
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: -1 # must be override by program
|
||||||
|
model:
|
||||||
|
path: ~/models/deepseek-llm-7b-chat
|
||||||
|
tokenizer_path: ${actor_rollout_ref.model.path}
|
||||||
|
override_config: { }
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
fsdp_size: -1
|
||||||
|
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: null
|
||||||
|
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
|
||||||
|
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
|
||||||
|
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
|
||||||
|
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
|
||||||
|
shuffle: ${actor_rollout_ref.actor.shuffle}
|
||||||
|
grad_clip: 1.0
|
||||||
|
cliprange_value: 0.5
|
||||||
|
|
||||||
|
# Reward model not used for GRPO
|
||||||
|
reward_model:
|
||||||
|
enable: False
|
||||||
|
strategy: fsdp
|
||||||
|
model:
|
||||||
|
input_tokenizer: ${actor_rollout_ref.model.path}
|
||||||
|
path: ~/models/FsfairX-LLaMA3-RM-v0.1
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
micro_batch_size: null
|
||||||
|
micro_batch_size_per_gpu: null
|
||||||
|
max_length: null
|
||||||
|
ulysses_sequence_parallel_size: 1
|
||||||
|
use_dynamic_bsz: ${critic.use_dynamic_bsz}
|
||||||
|
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
|
||||||
219
training/configs/intra_generalisation/cognition_qwen_3b.yaml
Normal file
219
training/configs/intra_generalisation/cognition_qwen_3b.yaml
Normal file
|
|
@ -0,0 +1,219 @@
|
||||||
|
reasoning_gym:
|
||||||
|
dataset_size: 20000
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
datasets:
|
||||||
|
rubiks_cube:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
min_scramble_steps: 3
|
||||||
|
max_scramble_steps: 10
|
||||||
|
figlet_font:
|
||||||
|
weight: 0.34
|
||||||
|
config:
|
||||||
|
min_word_len: 3
|
||||||
|
max_word_len: 7
|
||||||
|
rectangle_count:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
max_rectangles: 10
|
||||||
|
width: 80
|
||||||
|
height: 80
|
||||||
|
curriculum:
|
||||||
|
enabled: False
|
||||||
|
schedule:
|
||||||
|
automatic: True
|
||||||
|
update_steps: 30 # automatic curriculum updating after 50 steps
|
||||||
|
last_k: 20
|
||||||
|
success_threshold: 0.70
|
||||||
|
failure_threshold: 0.10
|
||||||
|
curricula:
|
||||||
|
spell_backward:
|
||||||
|
attribute_levels:
|
||||||
|
word_len: 0
|
||||||
|
reward:
|
||||||
|
use_accuracy: True
|
||||||
|
secondary_rewards:
|
||||||
|
- name: cosine
|
||||||
|
scaling_factor: 0.3
|
||||||
|
- name: format
|
||||||
|
scaling_factor: 0.2
|
||||||
|
kwargs:
|
||||||
|
preappend_thinking_token: False
|
||||||
|
|
||||||
|
data:
|
||||||
|
tokenizer: null
|
||||||
|
train_files: train.parquet
|
||||||
|
val_files: test.parquet
|
||||||
|
prompt_key: prompt
|
||||||
|
max_prompt_length: 512
|
||||||
|
max_response_length: 1024
|
||||||
|
train_batch_size: 32
|
||||||
|
val_batch_size: 64
|
||||||
|
return_raw_chat: True
|
||||||
|
return_raw_input_ids: True
|
||||||
|
actor_rollout_ref:
|
||||||
|
hybrid_engine: True
|
||||||
|
model:
|
||||||
|
path: Qwen/Qwen2.5-3B-Instruct
|
||||||
|
external_lib: null
|
||||||
|
override_config: { }
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: True
|
||||||
|
actor:
|
||||||
|
strategy: fsdp # This is for backward-compatibility
|
||||||
|
ppo_mini_batch_size: 16
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: 4
|
||||||
|
use_dynamic_bsz: False
|
||||||
|
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
|
||||||
|
grad_clip: 1.0
|
||||||
|
clip_ratio: 0.2
|
||||||
|
entropy_coeff: 0.001
|
||||||
|
use_kl_loss: True # True for GRPO
|
||||||
|
kl_loss_coef: 0.001 # for grpo
|
||||||
|
kl_loss_type: low_var_kl # for grpo
|
||||||
|
ppo_epochs: 1
|
||||||
|
shuffle: False
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
optim:
|
||||||
|
lr: 1e-6
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: 500 # must be override by program
|
||||||
|
fsdp_config:
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
ref:
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: True
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
|
||||||
|
rollout:
|
||||||
|
name: vllm
|
||||||
|
temperature: 1.0
|
||||||
|
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
|
||||||
|
top_p: 1
|
||||||
|
prompt_length: ${data.max_prompt_length} # not use for opensource
|
||||||
|
response_length: ${data.max_response_length}
|
||||||
|
# for vllm rollout
|
||||||
|
dtype: bfloat16 # should align with FSDP
|
||||||
|
gpu_memory_utilization: 0.7
|
||||||
|
ignore_eos: False
|
||||||
|
enforce_eager: True
|
||||||
|
free_cache_engine: True
|
||||||
|
load_format: dummy_dtensor
|
||||||
|
tensor_model_parallel_size: 4
|
||||||
|
max_num_batched_tokens: 12288
|
||||||
|
max_num_seqs: 1024
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
disable_log_stats: True
|
||||||
|
enable_chunked_prefill: True # could get higher throughput
|
||||||
|
# for hf rollout
|
||||||
|
do_sample: True
|
||||||
|
use_fire_sampling: False
|
||||||
|
max_model_len: 12288
|
||||||
|
# number of responses (i.e. num sample times)
|
||||||
|
n: 8 # > 1 for grpo
|
||||||
|
val_kwargs:
|
||||||
|
do_sample: True
|
||||||
|
|
||||||
|
algorithm:
|
||||||
|
gamma: 1.0
|
||||||
|
lam: 1.0
|
||||||
|
adv_estimator: grpo
|
||||||
|
kl_penalty: kl # how to estimate kl divergence
|
||||||
|
kl_ctrl:
|
||||||
|
type: fixed
|
||||||
|
kl_coef: 0.001
|
||||||
|
verbose: True
|
||||||
|
trainer:
|
||||||
|
balance_batch: True
|
||||||
|
total_epochs: 1
|
||||||
|
total_training_steps: 500
|
||||||
|
project_name: rg-test
|
||||||
|
experiment_name: intra_reasoning_cognition_qwen_3b_composite_test
|
||||||
|
logger: [ 'console', 'wandb' ]
|
||||||
|
val_generations_to_log_to_wandb: 0
|
||||||
|
nnodes: 1
|
||||||
|
n_gpus_per_node: 4
|
||||||
|
save_freq: 100
|
||||||
|
# auto: find the last ckpt to resume. If can't find, start from scratch
|
||||||
|
resume_mode: auto # or auto or resume_path if
|
||||||
|
resume_from_path: False
|
||||||
|
test_freq: 100
|
||||||
|
critic_warmup: 0
|
||||||
|
default_hdfs_dir: null
|
||||||
|
remove_previous_ckpt_in_save: False
|
||||||
|
del_local_ckpt_after_load: False
|
||||||
|
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
|
||||||
|
|
||||||
|
|
||||||
|
critic:
|
||||||
|
strategy: fsdp
|
||||||
|
optim:
|
||||||
|
lr: 1e-5
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: -1 # must be override by program
|
||||||
|
model:
|
||||||
|
path: ~/models/deepseek-llm-7b-chat
|
||||||
|
tokenizer_path: ${actor_rollout_ref.model.path}
|
||||||
|
override_config: { }
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
fsdp_size: -1
|
||||||
|
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: null
|
||||||
|
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
|
||||||
|
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
|
||||||
|
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
|
||||||
|
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
|
||||||
|
shuffle: ${actor_rollout_ref.actor.shuffle}
|
||||||
|
grad_clip: 1.0
|
||||||
|
cliprange_value: 0.5
|
||||||
|
|
||||||
|
# Reward model not used for GRPO
|
||||||
|
reward_model:
|
||||||
|
enable: False
|
||||||
|
strategy: fsdp
|
||||||
|
model:
|
||||||
|
input_tokenizer: ${actor_rollout_ref.model.path}
|
||||||
|
path: ~/models/FsfairX-LLaMA3-RM-v0.1
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
micro_batch_size: null
|
||||||
|
micro_batch_size_per_gpu: null
|
||||||
|
max_length: null
|
||||||
|
ulysses_sequence_parallel_size: 1
|
||||||
|
use_dynamic_bsz: ${critic.use_dynamic_bsz}
|
||||||
|
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
|
||||||
225
training/configs/intra_generalisation/games_qwen_3b.yaml
Normal file
225
training/configs/intra_generalisation/games_qwen_3b.yaml
Normal file
|
|
@ -0,0 +1,225 @@
|
||||||
|
reasoning_gym:
|
||||||
|
dataset_size: 20000
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
datasets:
|
||||||
|
sudoku:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
min_empty: 30
|
||||||
|
max_empty: 50
|
||||||
|
futoshiki:
|
||||||
|
weight: 0.34
|
||||||
|
config:
|
||||||
|
min_board_size: 4 # Board will be NxN where N is this value
|
||||||
|
max_board_size: 9
|
||||||
|
min_difficulty: 0
|
||||||
|
max_difficulty: 3
|
||||||
|
sokoban:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
min_w: 6 # Minimum width of the puzzle
|
||||||
|
min_h: 6 # Minimum height of the puzzle
|
||||||
|
max_w: 10 # Maximum width of the puzzle
|
||||||
|
max_h: 10 # Maximum height of the puzzle
|
||||||
|
min_boxes: 4 # Minimum number of boxes
|
||||||
|
max_boxes: 10 # Maximum number of boxes
|
||||||
|
max_depth: 80 # Maximum search depth
|
||||||
|
curriculum:
|
||||||
|
enabled: False
|
||||||
|
schedule:
|
||||||
|
automatic: True
|
||||||
|
update_steps: 30 # automatic curriculum updating after 50 steps
|
||||||
|
last_k: 20
|
||||||
|
success_threshold: 0.70
|
||||||
|
failure_threshold: 0.10
|
||||||
|
curricula:
|
||||||
|
spell_backward:
|
||||||
|
attribute_levels:
|
||||||
|
word_len: 0
|
||||||
|
reward:
|
||||||
|
use_accuracy: True
|
||||||
|
secondary_rewards:
|
||||||
|
- name: cosine
|
||||||
|
scaling_factor: 0.3
|
||||||
|
- name: format
|
||||||
|
scaling_factor: 0.2
|
||||||
|
kwargs:
|
||||||
|
preappend_thinking_token: False
|
||||||
|
|
||||||
|
data:
|
||||||
|
tokenizer: null
|
||||||
|
train_files: train.parquet
|
||||||
|
val_files: test.parquet
|
||||||
|
prompt_key: prompt
|
||||||
|
max_prompt_length: 512
|
||||||
|
max_response_length: 1024
|
||||||
|
train_batch_size: 32
|
||||||
|
val_batch_size: 64
|
||||||
|
return_raw_chat: True
|
||||||
|
return_raw_input_ids: True
|
||||||
|
actor_rollout_ref:
|
||||||
|
hybrid_engine: True
|
||||||
|
model:
|
||||||
|
path: Qwen/Qwen2.5-3B-Instruct
|
||||||
|
external_lib: null
|
||||||
|
override_config: { }
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: True
|
||||||
|
actor:
|
||||||
|
strategy: fsdp # This is for backward-compatibility
|
||||||
|
ppo_mini_batch_size: 16
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: 4
|
||||||
|
use_dynamic_bsz: False
|
||||||
|
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
|
||||||
|
grad_clip: 1.0
|
||||||
|
clip_ratio: 0.2
|
||||||
|
entropy_coeff: 0.001
|
||||||
|
use_kl_loss: True # True for GRPO
|
||||||
|
kl_loss_coef: 0.001 # for grpo
|
||||||
|
kl_loss_type: low_var_kl # for grpo
|
||||||
|
ppo_epochs: 1
|
||||||
|
shuffle: False
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
optim:
|
||||||
|
lr: 1e-6
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: 500 # must be override by program
|
||||||
|
fsdp_config:
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
ref:
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: True
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
|
||||||
|
rollout:
|
||||||
|
name: vllm
|
||||||
|
temperature: 1.0
|
||||||
|
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
|
||||||
|
top_p: 1
|
||||||
|
prompt_length: ${data.max_prompt_length} # not use for opensource
|
||||||
|
response_length: ${data.max_response_length}
|
||||||
|
# for vllm rollout
|
||||||
|
dtype: bfloat16 # should align with FSDP
|
||||||
|
gpu_memory_utilization: 0.7
|
||||||
|
ignore_eos: False
|
||||||
|
enforce_eager: True
|
||||||
|
free_cache_engine: True
|
||||||
|
load_format: dummy_dtensor
|
||||||
|
tensor_model_parallel_size: 4
|
||||||
|
max_num_batched_tokens: 12288
|
||||||
|
max_num_seqs: 1024
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
disable_log_stats: True
|
||||||
|
enable_chunked_prefill: True # could get higher throughput
|
||||||
|
# for hf rollout
|
||||||
|
do_sample: True
|
||||||
|
use_fire_sampling: False
|
||||||
|
max_model_len: 12288
|
||||||
|
# number of responses (i.e. num sample times)
|
||||||
|
n: 8 # > 1 for grpo
|
||||||
|
val_kwargs:
|
||||||
|
do_sample: True
|
||||||
|
|
||||||
|
algorithm:
|
||||||
|
gamma: 1.0
|
||||||
|
lam: 1.0
|
||||||
|
adv_estimator: grpo
|
||||||
|
kl_penalty: kl # how to estimate kl divergence
|
||||||
|
kl_ctrl:
|
||||||
|
type: fixed
|
||||||
|
kl_coef: 0.001
|
||||||
|
verbose: True
|
||||||
|
trainer:
|
||||||
|
balance_batch: True
|
||||||
|
total_epochs: 1
|
||||||
|
total_training_steps: 500
|
||||||
|
project_name: rg-test
|
||||||
|
experiment_name: intra_reasoning_games_qwen_3b_composite
|
||||||
|
logger: [ 'console', 'wandb' ]
|
||||||
|
val_generations_to_log_to_wandb: 0
|
||||||
|
nnodes: 1
|
||||||
|
n_gpus_per_node: 4
|
||||||
|
save_freq: 100
|
||||||
|
# auto: find the last ckpt to resume. If can't find, start from scratch
|
||||||
|
resume_mode: auto # or auto or resume_path if
|
||||||
|
resume_from_path: False
|
||||||
|
test_freq: 100
|
||||||
|
critic_warmup: 0
|
||||||
|
default_hdfs_dir: null
|
||||||
|
remove_previous_ckpt_in_save: False
|
||||||
|
del_local_ckpt_after_load: False
|
||||||
|
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
|
||||||
|
|
||||||
|
|
||||||
|
critic:
|
||||||
|
strategy: fsdp
|
||||||
|
optim:
|
||||||
|
lr: 1e-5
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: -1 # must be override by program
|
||||||
|
model:
|
||||||
|
path: ~/models/deepseek-llm-7b-chat
|
||||||
|
tokenizer_path: ${actor_rollout_ref.model.path}
|
||||||
|
override_config: { }
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
fsdp_size: -1
|
||||||
|
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: null
|
||||||
|
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
|
||||||
|
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
|
||||||
|
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
|
||||||
|
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
|
||||||
|
shuffle: ${actor_rollout_ref.actor.shuffle}
|
||||||
|
grad_clip: 1.0
|
||||||
|
cliprange_value: 0.5
|
||||||
|
|
||||||
|
# Reward model not used for GRPO
|
||||||
|
reward_model:
|
||||||
|
enable: False
|
||||||
|
strategy: fsdp
|
||||||
|
model:
|
||||||
|
input_tokenizer: ${actor_rollout_ref.model.path}
|
||||||
|
path: ~/models/FsfairX-LLaMA3-RM-v0.1
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
micro_batch_size: null
|
||||||
|
micro_batch_size_per_gpu: null
|
||||||
|
max_length: null
|
||||||
|
ulysses_sequence_parallel_size: 1
|
||||||
|
use_dynamic_bsz: ${critic.use_dynamic_bsz}
|
||||||
|
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
|
||||||
226
training/configs/intra_generalisation/graphs_qwen_3b.yaml
Normal file
226
training/configs/intra_generalisation/graphs_qwen_3b.yaml
Normal file
|
|
@ -0,0 +1,226 @@
|
||||||
|
reasoning_gym:
|
||||||
|
dataset_size: 20000
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
datasets:
|
||||||
|
shortest_path:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
min_rows: 5
|
||||||
|
max_rows: 8
|
||||||
|
min_cols: 5
|
||||||
|
max_cols: 8
|
||||||
|
p_blocked: 0.4
|
||||||
|
largest_island:
|
||||||
|
weight: 0.34
|
||||||
|
config:
|
||||||
|
min_rows: 5
|
||||||
|
max_rows: 10
|
||||||
|
min_cols: 5
|
||||||
|
max_cols: 10
|
||||||
|
min_num_islands: 0
|
||||||
|
max_num_islands: 5
|
||||||
|
min_island_size: 0
|
||||||
|
max_island_size: 10
|
||||||
|
quantum_lock:
|
||||||
|
weight: 0.33
|
||||||
|
config:
|
||||||
|
difficulty: 10
|
||||||
|
curriculum:
|
||||||
|
enabled: False
|
||||||
|
schedule:
|
||||||
|
automatic: True
|
||||||
|
update_steps: 30 # automatic curriculum updating after 50 steps
|
||||||
|
last_k: 20
|
||||||
|
success_threshold: 0.70
|
||||||
|
failure_threshold: 0.10
|
||||||
|
curricula:
|
||||||
|
spell_backward:
|
||||||
|
attribute_levels:
|
||||||
|
word_len: 0
|
||||||
|
reward:
|
||||||
|
use_accuracy: True
|
||||||
|
secondary_rewards:
|
||||||
|
- name: cosine
|
||||||
|
scaling_factor: 0.3
|
||||||
|
- name: format
|
||||||
|
scaling_factor: 0.2
|
||||||
|
kwargs:
|
||||||
|
preappend_thinking_token: False
|
||||||
|
|
||||||
|
data:
|
||||||
|
tokenizer: null
|
||||||
|
train_files: train.parquet
|
||||||
|
val_files: test.parquet
|
||||||
|
prompt_key: prompt
|
||||||
|
max_prompt_length: 512
|
||||||
|
max_response_length: 1024
|
||||||
|
train_batch_size: 32
|
||||||
|
val_batch_size: 64
|
||||||
|
return_raw_chat: True
|
||||||
|
return_raw_input_ids: True
|
||||||
|
actor_rollout_ref:
|
||||||
|
hybrid_engine: True
|
||||||
|
model:
|
||||||
|
path: Qwen/Qwen2.5-3B-Instruct
|
||||||
|
external_lib: null
|
||||||
|
override_config: { }
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: True
|
||||||
|
actor:
|
||||||
|
strategy: fsdp # This is for backward-compatibility
|
||||||
|
ppo_mini_batch_size: 16
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: 4
|
||||||
|
use_dynamic_bsz: False
|
||||||
|
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
|
||||||
|
grad_clip: 1.0
|
||||||
|
clip_ratio: 0.2
|
||||||
|
entropy_coeff: 0.001
|
||||||
|
use_kl_loss: True # True for GRPO
|
||||||
|
kl_loss_coef: 0.001 # for grpo
|
||||||
|
kl_loss_type: low_var_kl # for grpo
|
||||||
|
ppo_epochs: 1
|
||||||
|
shuffle: False
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
optim:
|
||||||
|
lr: 1e-6
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: 500 # must be override by program
|
||||||
|
fsdp_config:
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
ref:
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: True
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
|
||||||
|
rollout:
|
||||||
|
name: vllm
|
||||||
|
temperature: 1.0
|
||||||
|
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
|
||||||
|
top_p: 1
|
||||||
|
prompt_length: ${data.max_prompt_length} # not use for opensource
|
||||||
|
response_length: ${data.max_response_length}
|
||||||
|
# for vllm rollout
|
||||||
|
dtype: bfloat16 # should align with FSDP
|
||||||
|
gpu_memory_utilization: 0.7
|
||||||
|
ignore_eos: False
|
||||||
|
enforce_eager: True
|
||||||
|
free_cache_engine: True
|
||||||
|
load_format: dummy_dtensor
|
||||||
|
tensor_model_parallel_size: 4
|
||||||
|
max_num_batched_tokens: 12288
|
||||||
|
max_num_seqs: 1024
|
||||||
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
|
||||||
|
disable_log_stats: True
|
||||||
|
enable_chunked_prefill: True # could get higher throughput
|
||||||
|
# for hf rollout
|
||||||
|
do_sample: True
|
||||||
|
use_fire_sampling: False
|
||||||
|
max_model_len: 12288
|
||||||
|
# number of responses (i.e. num sample times)
|
||||||
|
n: 8 # > 1 for grpo
|
||||||
|
val_kwargs:
|
||||||
|
do_sample: True
|
||||||
|
|
||||||
|
algorithm:
|
||||||
|
gamma: 1.0
|
||||||
|
lam: 1.0
|
||||||
|
adv_estimator: grpo
|
||||||
|
kl_penalty: kl # how to estimate kl divergence
|
||||||
|
kl_ctrl:
|
||||||
|
type: fixed
|
||||||
|
kl_coef: 0.001
|
||||||
|
verbose: True
|
||||||
|
trainer:
|
||||||
|
balance_batch: True
|
||||||
|
total_epochs: 1
|
||||||
|
total_training_steps: 500
|
||||||
|
project_name: rg-test
|
||||||
|
experiment_name: intra_reasoning_games_qwen_3b_graphs
|
||||||
|
logger: [ 'console', 'wandb' ]
|
||||||
|
val_generations_to_log_to_wandb: 0
|
||||||
|
nnodes: 1
|
||||||
|
n_gpus_per_node: 4
|
||||||
|
save_freq: 100
|
||||||
|
# auto: find the last ckpt to resume. If can't find, start from scratch
|
||||||
|
resume_mode: auto # or auto or resume_path if
|
||||||
|
resume_from_path: False
|
||||||
|
test_freq: 100
|
||||||
|
critic_warmup: 0
|
||||||
|
default_hdfs_dir: null
|
||||||
|
remove_previous_ckpt_in_save: False
|
||||||
|
del_local_ckpt_after_load: False
|
||||||
|
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
|
||||||
|
|
||||||
|
|
||||||
|
critic:
|
||||||
|
strategy: fsdp
|
||||||
|
optim:
|
||||||
|
lr: 1e-5
|
||||||
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
min_lr_ratio: null # only useful for warmup with cosine
|
||||||
|
warmup_style: constant # select from constant/cosine
|
||||||
|
total_training_steps: -1 # must be override by program
|
||||||
|
model:
|
||||||
|
path: ~/models/deepseek-llm-7b-chat
|
||||||
|
tokenizer_path: ${actor_rollout_ref.model.path}
|
||||||
|
override_config: { }
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
enable_gradient_checkpointing: True
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
param_offload: False
|
||||||
|
optimizer_offload: False
|
||||||
|
wrap_policy:
|
||||||
|
# transformer_layer_cls_to_wrap: None
|
||||||
|
min_num_params: 0
|
||||||
|
fsdp_size: -1
|
||||||
|
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
|
||||||
|
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
|
||||||
|
ppo_micro_batch_size_per_gpu: null
|
||||||
|
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
|
||||||
|
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
|
||||||
|
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
|
||||||
|
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
|
||||||
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
|
||||||
|
shuffle: ${actor_rollout_ref.actor.shuffle}
|
||||||
|
grad_clip: 1.0
|
||||||
|
cliprange_value: 0.5
|
||||||
|
|
||||||
|
# Reward model not used for GRPO
|
||||||
|
reward_model:
|
||||||
|
enable: False
|
||||||
|
strategy: fsdp
|
||||||
|
model:
|
||||||
|
input_tokenizer: ${actor_rollout_ref.model.path}
|
||||||
|
path: ~/models/FsfairX-LLaMA3-RM-v0.1
|
||||||
|
external_lib: ${actor_rollout_ref.model.external_lib}
|
||||||
|
use_remove_padding: False
|
||||||
|
fsdp_config:
|
||||||
|
min_num_params: 0
|
||||||
|
param_offload: False
|
||||||
|
fsdp_size: -1
|
||||||
|
micro_batch_size: null
|
||||||
|
micro_batch_size_per_gpu: null
|
||||||
|
max_length: null
|
||||||
|
ulysses_sequence_parallel_size: 1
|
||||||
|
use_dynamic_bsz: ${critic.use_dynamic_bsz}
|
||||||
|
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
|
||||||
28
training/evaluations/eval_algebraic_composite.yaml
Normal file
28
training/evaluations/eval_algebraic_composite.yaml
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
# Model configuration
|
||||||
|
model_path: ../utils/qwen3b_algebraic
|
||||||
|
max_tokens: 1024
|
||||||
|
temperature: 0.6
|
||||||
|
top_p: 0.9
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
developer_role: system # Standard role for system prompts
|
||||||
|
|
||||||
|
# Output configuration
|
||||||
|
output_dir: results
|
||||||
|
save_metadata: true
|
||||||
|
save_full_results: true
|
||||||
|
eval_repeats: 3
|
||||||
|
|
||||||
|
# Categories and datasets to evaluate
|
||||||
|
categories:
|
||||||
|
- category: reasoning
|
||||||
|
datasets:
|
||||||
|
- dataset: simple_integration
|
||||||
|
size: 100
|
||||||
|
seed: 42
|
||||||
|
params:
|
||||||
|
min_terms: 2
|
||||||
|
max_terms: 5
|
||||||
|
min_degree: 1
|
||||||
|
max_degree: 10
|
||||||
|
min_bounds: 1
|
||||||
|
max_bounds: 10
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
# Model configuration
|
# Model configuration
|
||||||
model_path: ../utils/qwen3b_500 # Change to the smaller model
|
model_path: ../utils/qwen3b_algorithmic_500
|
||||||
max_tokens: 1024 # From max_response_length in training config
|
max_tokens: 1024
|
||||||
temperature: 0.7 # Lower temperature for more focused responses
|
temperature: 0.6
|
||||||
top_p: 0.9 # From rollout top_p
|
top_p: 0.9
|
||||||
developer_prompt: DeepSeekZero
|
developer_prompt: DeepSeekZero
|
||||||
developer_role: system # Standard role for system prompts
|
developer_role: system # Standard role for system prompts
|
||||||
|
|
||||||
|
|
|
||||||
24
training/evaluations/eval_arithmetic_composite.yaml
Normal file
24
training/evaluations/eval_arithmetic_composite.yaml
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Model configuration
|
||||||
|
model_path: ../utils/qwen_3b_arithmetic_100
|
||||||
|
max_tokens: 1024
|
||||||
|
temperature: 0.6
|
||||||
|
top_p: 0.9
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
developer_role: system # Standard role for system prompts
|
||||||
|
|
||||||
|
# Output configuration
|
||||||
|
output_dir: results
|
||||||
|
save_metadata: true
|
||||||
|
save_full_results: true
|
||||||
|
eval_repeats: 3
|
||||||
|
|
||||||
|
# Categories and datasets to evaluate
|
||||||
|
categories:
|
||||||
|
- category: reasoning
|
||||||
|
datasets:
|
||||||
|
- dataset: prime_factorization
|
||||||
|
size: 100
|
||||||
|
seed: 42
|
||||||
|
params:
|
||||||
|
min_value: 2
|
||||||
|
max_value: 1000
|
||||||
36
training/evaluations/eval_cognition_composite.yaml
Normal file
36
training/evaluations/eval_cognition_composite.yaml
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Model configuration
|
||||||
|
model_path: ../utils/qwen3b_cognition
|
||||||
|
max_tokens: 1024
|
||||||
|
temperature: 0.6 # Lower temperature for more focused responses
|
||||||
|
top_p: 0.9 # From rollout top_p
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
developer_role: system # Standard role for system prompts
|
||||||
|
|
||||||
|
# Output configuration
|
||||||
|
output_dir: results
|
||||||
|
save_metadata: true
|
||||||
|
save_full_results: true
|
||||||
|
eval_repeats: 3
|
||||||
|
|
||||||
|
# Categories and datasets to evaluate
|
||||||
|
categories:
|
||||||
|
- category: reasoning
|
||||||
|
datasets:
|
||||||
|
- dataset: number_sequence
|
||||||
|
size: 100
|
||||||
|
seed: 42
|
||||||
|
params:
|
||||||
|
min_terms: 4 # Minimum visible terms
|
||||||
|
max_terms: 8 # Maximum visible terms
|
||||||
|
min_value: -100 # Minimum allowed number
|
||||||
|
max_value: 100 # Maximum allowed number
|
||||||
|
max_complexity: 3 # Maximum number of operations to combine
|
||||||
|
- dataset: modulo_grid
|
||||||
|
size: 100
|
||||||
|
seed: 42
|
||||||
|
params:
|
||||||
|
size_x: 20
|
||||||
|
size_y: 20
|
||||||
|
max_divisor: 20
|
||||||
|
max_target: 20
|
||||||
|
max_holes: 1
|
||||||
24
training/evaluations/eval_games_composite.yaml
Normal file
24
training/evaluations/eval_games_composite.yaml
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Model configuration
|
||||||
|
model_path: ../utils/qwen3b_games
|
||||||
|
max_tokens: 1024
|
||||||
|
temperature: 0.6 # Lower temperature for more focused responses
|
||||||
|
top_p: 0.9 # From rollout top_p
|
||||||
|
developer_prompt: DeepSeekZero
|
||||||
|
developer_role: system # Standard role for system prompts
|
||||||
|
|
||||||
|
# Output configuration
|
||||||
|
output_dir: results
|
||||||
|
save_metadata: true
|
||||||
|
save_full_results: true
|
||||||
|
eval_repeats: 3
|
||||||
|
|
||||||
|
# Categories and datasets to evaluate
|
||||||
|
categories:
|
||||||
|
- category: reasoning
|
||||||
|
datasets:
|
||||||
|
- dataset: mahjong_puzzle
|
||||||
|
size: 100
|
||||||
|
seed: 42
|
||||||
|
params:
|
||||||
|
min_num_rounds: 10
|
||||||
|
max_num_rounds: 50
|
||||||
|
|
@ -16,13 +16,13 @@ eval_repeats: 3
|
||||||
categories:
|
categories:
|
||||||
- category: reasoning
|
- category: reasoning
|
||||||
datasets:
|
datasets:
|
||||||
- dataset: number_sorting
|
- dataset: decimal_chain_sum
|
||||||
size: 100
|
size: 100
|
||||||
seed: 42
|
seed: 42
|
||||||
params:
|
params:
|
||||||
min_numbers: 3
|
min_terms: 2
|
||||||
max_numbers: 10
|
max_terms: 4
|
||||||
min_decimals: 0
|
min_digits: 1
|
||||||
max_decimals: 2
|
max_digits: 3
|
||||||
min_value: -100.0
|
min_decimal_places: 1
|
||||||
max_value: 100.0
|
max_decimal_places: 4
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue