Feat/intragen experiments (#414)

* added curriculum

* readapted readme

* corrected small errors

* Delete eval/eval/r1/algorithmic/word_sorting.json

* removed redundant argument

* added spell

* removed duplicated fit

* changed config

* added composite changes

* added composite changes

* updated yaml

* added spell backward

* updated read me

* added qwen2.5

* added

* Add files via upload

* updated missing trainer func

* updated curr

* updated spell back

* updated correctness score func

* updated configs

* added local evals

* added updates

* updated datasets

* added fsdp to hf utility

* added algorithmic qwen 3b yaml

* updated read me

* updated configs

* added preappend token

* updated with thinking token

* updated test score board

* resolved comments

* added evaluation scripts

* removed results from pr

* added config

* added partial reward scoring

* added evaluation composites

* added training configs

* added games eval

* added rubriks cube

* resolved merge cinflicts

* added games config

* added latest eval configs

* updated strucutre

* Delete training/evaluations/eval_graphs_composite.yaml

---------

Co-authored-by: joesharratt1229 <joesharrat1229@gmail.com>
This commit is contained in:
joesharratt1229 2025-04-16 07:04:52 +01:00 committed by GitHub
parent 224532f12a
commit d0ef136d5b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 1331 additions and 48 deletions

View file

@ -170,7 +170,7 @@ class NumberSortingCurriculum(BaseCurriculum):
self._define_attributes( self._define_attributes(
RangeAttributeDefinition( RangeAttributeDefinition(
name="numbers", name="numbers",
levels=[10, 50, 100, 200], levels=[10, 100, 500, 1000],
description="How many numbers to sort", description="How many numbers to sort",
lower_field_name="min_numbers", lower_field_name="min_numbers",
upper_field_name="max_numbers", upper_field_name="max_numbers",

View file

@ -2,7 +2,7 @@ from dataclasses import dataclass
from random import Random from random import Random
from typing import Any, Literal, Optional from typing import Any, Literal, Optional
from ..coaching import BaseCurriculum, RangeAttributeDefinition from ..coaching import BaseCurriculum, RangeAttributeDefinition, ScalarAttributeDefinition
from ..factory import ProceduralDataset, register_dataset from ..factory import ProceduralDataset, register_dataset
DATASET_NAME = "basic_arithmetic" DATASET_NAME = "basic_arithmetic"
@ -250,17 +250,19 @@ class BasicArithmeticCurriculum(BaseCurriculum):
self._define_attributes( self._define_attributes(
RangeAttributeDefinition( RangeAttributeDefinition(
name="num_terms", name="num_terms",
levels=[2, 5, 10, 15], levels=[2, 3, 4, 5, 6],
description="Number of terms in the expression", description="Number of terms in the expression",
lower_field_name="min_terms", lower_field_name="min_terms",
upper_field_name="max_terms", upper_field_name="max_terms",
ensure_interval=False,
), ),
RangeAttributeDefinition( RangeAttributeDefinition(
name="num_digits", name="num_digits",
levels=[1, 2, 5, 10], levels=[1, 2, 3, 4],
description="Number of digits in the numbers", description="Number of digits in the numbers",
lower_field_name="min_digits", lower_field_name="min_digits",
upper_field_name="max_digits", upper_field_name="max_digits",
ensure_interval=False,
), ),
) )

View file

@ -16,6 +16,8 @@ Now, it's your turn. How many rectangles do you see in the grid below?
""" """
DATASET_NAME = "rectangle_count" DATASET_NAME = "rectangle_count"
CONST_TERM = 0.8
D = 5
def draw_rectangles_with_overlap(n, width, height, rng): def draw_rectangles_with_overlap(n, width, height, rng):
@ -132,22 +134,29 @@ class RectangleCountDataset(ProceduralDataset):
} }
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
"""Determine if the solution provided solves the RectangleCount task. """Determine if the solution provided solves the RectangleCount task,
awarding partial credit if the guess is close.
The function awards 1.0 for a correct answer.
Args:
answer (Optional[str]): The user's answer.
entry (dict[str, Any]): The original dataset entry containing the correct answer.
Returns: Returns:
float: The computed score between 0.0 and 1.0. float: A score between 0.0 and 1.0.
""" """
correct_str = entry["answer"].lower().replace("\n", "")
if isinstance(answer, str): try:
if answer.lower().replace("\n", "") == entry["answer"].lower().replace("\n", ""): correct_val = int(correct_str)
return 1.0 # Yay user_val = int(answer.strip())
return 0.0 except (ValueError, TypeError, AttributeError):
return 0.0
distance = abs(user_val - correct_val)
if distance == 0:
return 1.0
if distance >= D:
return 0.0
score = 1.0 - (distance / float(D))
score = CONST_TERM * score
return max(0.0, score)
class RectangleCountCurriculum(BaseCurriculum): class RectangleCountCurriculum(BaseCurriculum):

View file

@ -121,29 +121,49 @@ class RubiksCubeDataset(ProceduralDataset):
}, },
} }
def partial_score(self, cube: Cube) -> float:
"""
Returns a fraction between 0 and 1, indicating how many stickers are
correctly positioned (i.e., match the solved color for that face).
"""
total_stickers = 6 * (cube.size**2)
correct_stickers = 0
for face_index in range(6):
face = cube.faces[face_index]
solved_color = face[cube.size // 2][cube.size // 2].color
for row in range(cube.size):
for col in range(cube.size):
sticker = face[row][col]
if sticker.color == solved_color:
correct_stickers += 1
return correct_stickers / total_stickers
def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
"""Determine if the solution provided solves the cube""" """Determine if the solution provided solves the cube, with partial rewards."""
reward = 0.0 # default reward reward = 0.0 # default
if answer is not None: if answer is not None:
# Reconstruct the test cube
eval_cube = Cube(entry["metadata"]["cube_size"]) eval_cube = Cube(entry["metadata"]["cube_size"])
eval_cube.rotate(entry["metadata"]["scramble_moves"]) eval_cube.rotate(entry["metadata"]["scramble_moves"])
# Test the solution
try: try:
expanded_answer = self.expand_moves(answer) expanded_answer = self.expand_moves(answer)
eval_cube.rotate(expanded_answer) eval_cube.rotate(expanded_answer)
solved = eval_cube.is_done()
# 3) Check if fully solved
solved = eval_cube.is_done()
if solved: if solved:
reward = 1.0 reward = 1.0
elif len(answer.strip()) > 0: # encourage non-empty answers
reward = 0.05 # Incorrect, but rotate could parse the answer
else: else:
reward = 0.01 partial = self.partial_score(eval_cube)
except:
reward = 0.01 # At least you tried
if len(answer.strip()) > 0:
reward = max(0.05, partial)
else:
reward = max(0.01, partial)
except:
reward = 0.01
return reward return reward
def remove_ansi(self, line): def remove_ansi(self, line):

View file

@ -99,6 +99,7 @@ Here is your puzzle:
"source_dataset": DATASET_NAME, "source_dataset": DATASET_NAME,
"source_index": idx, "source_index": idx,
"gamestr": gamestr, "gamestr": gamestr,
"source_dataset": DATASET_NAME,
"width": puzzle_data["width"], "width": puzzle_data["width"],
"height": puzzle_data["height"], "height": puzzle_data["height"],
"difficulty": { "difficulty": {

View file

@ -6,6 +6,7 @@ from reasoning_gym.arithmetic.basic_arithmetic import (
BasicArithmeticDatasetConfig, BasicArithmeticDatasetConfig,
eval_floordiv, eval_floordiv,
) )
from reasoning_gym.coaching.base_curriculum import DefaultCurriculumContext, RangeAttributeMode
def test_arithmetic_dataset_config_validation(): def test_arithmetic_dataset_config_validation():
@ -103,7 +104,7 @@ def test_basic_arithmetic_curriculum():
"""Test the BasicArithmeticCurriculum functionality""" """Test the BasicArithmeticCurriculum functionality"""
curriculum = BasicArithmeticCurriculum() curriculum = BasicArithmeticCurriculum()
base_value = {"size": 150, "seed": 1} base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1}
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value) base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(base_value)
assert base_cfg.seed == 1 assert base_cfg.seed == 1
@ -115,7 +116,7 @@ def test_basic_arithmetic_curriculum():
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_digits") curriculum.increment_attr_level("num_digits")
increased_cfg = curriculum.generate_configuration(base_value) increased_cfg = curriculum.generate_configuration(base_value)
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 5 assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2
# Test decrementing attribute level for num_terms # Test decrementing attribute level for num_terms
@ -128,7 +129,7 @@ def test_basic_arithmetic_curriculum():
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
higher_level_cfg = curriculum.generate_configuration(base_value) higher_level_cfg = curriculum.generate_configuration(base_value)
assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 10 assert higher_level_cfg.min_terms == 2 and higher_level_cfg.max_terms == 4
assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2 assert higher_level_cfg.min_digits == 1 and higher_level_cfg.max_digits == 2
# Test boundary conditions - trying to decrement below level 0 # Test boundary conditions - trying to decrement below level 0
@ -144,5 +145,26 @@ def test_basic_arithmetic_curriculum():
curriculum.increment_attr_level("num_terms") curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_digits") curriculum.increment_attr_level("num_digits")
upper_bound_cfg = curriculum.generate_configuration(base_value) upper_bound_cfg = curriculum.generate_configuration(base_value)
assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 15 assert upper_bound_cfg.min_terms == 2 and upper_bound_cfg.max_terms == 6
assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 10 assert upper_bound_cfg.min_digits == 1 and upper_bound_cfg.max_digits == 4
def test_basic_arithmetic_curriculum_upper_bound():
curriculum = BasicArithmeticCurriculum()
base_value = {"size": 150, "seed": 1, "min_terms": 2, "max_terms": 2, "min_digits": 1, "max_digits": 1}
base_cfg: BasicArithmeticDatasetConfig = curriculum.generate_configuration(
base_value, context=DefaultCurriculumContext(mode=RangeAttributeMode.UPPER_BOUND)
)
assert base_cfg.seed == 1
assert base_cfg.size == 150
assert base_cfg.min_terms == 2 and base_cfg.max_terms == 2
assert base_cfg.min_digits == 1 and base_cfg.max_digits == 1
# Test incrementing attribute levels
curriculum.increment_attr_level("num_terms")
curriculum.increment_attr_level("num_digits")
increased_cfg = curriculum.generate_configuration(base_value)
assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3
assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2

View file

@ -56,6 +56,7 @@ def test_number_sorting_dataset_items():
# Verify number count constraints # Verify number count constraints
numbers = item["metadata"]["original_numbers"] numbers = item["metadata"]["original_numbers"]
print(numbers)
assert len(numbers) >= config.min_numbers assert len(numbers) >= config.min_numbers
assert len(numbers) <= config.max_numbers assert len(numbers) <= config.max_numbers
@ -99,7 +100,7 @@ def test_number_sorting_curriculum():
base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value) base_cfg: NumberSortingConfig = curriculum.generate_configuration(base_value)
assert base_cfg.seed == 1 assert base_cfg.seed == 1
assert base_cfg.size == 150 assert base_cfg.size == 150
assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 50 assert base_cfg.min_numbers == 10 and base_cfg.max_numbers == 100
assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1 assert base_cfg.min_decimals == 0 and base_cfg.max_decimals == 1
assert base_cfg.min_value == -100 and base_cfg.max_value == 100 assert base_cfg.min_value == -100 and base_cfg.max_value == 100
@ -107,14 +108,14 @@ def test_number_sorting_curriculum():
curriculum.increment_attr_level("numbers") curriculum.increment_attr_level("numbers")
curriculum.increment_attr_level("decimals") curriculum.increment_attr_level("decimals")
increased_cfg = curriculum.generate_configuration(base_value) increased_cfg = curriculum.generate_configuration(base_value)
assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 100 assert increased_cfg.min_numbers == 10 and increased_cfg.max_numbers == 500
assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2 assert increased_cfg.min_decimals == 0 and increased_cfg.max_decimals == 2
assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100 assert increased_cfg.min_value == -100 and increased_cfg.max_value == 100
# test decrementing attribute level for numbers again # test decrementing attribute level for numbers again
curriculum.decrement_attr_level("numbers") curriculum.decrement_attr_level("numbers")
partially_decreased_cfg = curriculum.generate_configuration(base_value) partially_decreased_cfg = curriculum.generate_configuration(base_value)
assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 50 assert partially_decreased_cfg.min_numbers == 10 and partially_decreased_cfg.max_numbers == 100
assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2 assert partially_decreased_cfg.min_decimals == 0 and partially_decreased_cfg.max_decimals == 2
assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100 assert partially_decreased_cfg.min_value == -100 and partially_decreased_cfg.max_value == 100

View file

@ -55,9 +55,9 @@ def test_rubikscube_items():
assert dataset.score_answer(answer=None, entry=item) == 0.0 assert dataset.score_answer(answer=None, entry=item) == 0.0
if item["metadata"]["example_correct_answer"] != "R": if item["metadata"]["example_correct_answer"] != "R":
assert dataset.score_answer(answer="R", entry=item) == 0.05 assert dataset.score_answer(answer="R", entry=item) == 0.01
assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.05 assert dataset.score_answer(answer="R2 R3 R4 R5 R'2 R'3", entry=item) == 0.01
if len(item["metadata"]["example_correct_answer"]) > 0: if len(item["metadata"]["example_correct_answer"]) > 0:
assert dataset.score_answer(answer="", entry=item) == 0.01 assert dataset.score_answer(answer="", entry=item) == 0.01

View file

@ -87,6 +87,7 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_
From here you may to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you csn run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a hugginface checkpoint at `training/utils/qwen3b_500` (note that you have to convert to fsdp checkpoint to hf checkpoint for evaluation script to work as shown in the previous step). From here you may to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you csn run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a hugginface checkpoint at `training/utils/qwen3b_500` (note that you have to convert to fsdp checkpoint to hf checkpoint for evaluation script to work as shown in the previous step).
## Run the script ## Run the script
export VLLM_ATTENTION_BACKEND=XFORMERS
Navigate to evaluations directory: Navigate to evaluations directory:
``` ```
python evaluate_model.py --config path-to-yaml python evaluate_model.py --config path-to-yaml

View file

@ -0,0 +1,221 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
simple_equations:
weight: 0.5
config:
min_terms: 2
max_terms: 4
min_value: 1
max_value: 100
polynomial_multiplication:
weight: 0.5
config:
min_terms: 2
max_terms: 4
min_value: 1
max_value: 100
min_degree: 0
max_degree: 3
min_polynomials: 2
max_polynomials: 3
curriculum:
enabled: False
schedule:
automatic: True
update_steps: 30 # automatic curriculum updating after 50 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: 500 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not use for opensource
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_algebra_qwen_3b_composite
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,224 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
fraction_simplification:
weight: 0.33
config:
min_value: 1
max_value: 1000
min_factor: 1
max_factor: 100
gcd:
weight: 0.34
config:
min_numbers: 2 # Minimum numbers to find GCD of
max_numbers: 2 # Maximum numbers to find GCD of
min_value: 1 # Minimum value for each number
max_value: 1000 # Maximum value for each number
lcm:
weight: 0.33
config:
min_numbers: 2
max_numbers: 2
min_value: 1
max_value: 100
curriculum:
enabled: False
schedule:
automatic: True
update_steps: 30
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: 500 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not use for opensource
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_arithmetic_qwen_3b_composite
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,219 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
rubiks_cube:
weight: 0.33
config:
min_scramble_steps: 3
max_scramble_steps: 10
figlet_font:
weight: 0.34
config:
min_word_len: 3
max_word_len: 7
rectangle_count:
weight: 0.33
config:
max_rectangles: 10
width: 80
height: 80
curriculum:
enabled: False
schedule:
automatic: True
      update_steps: 30 # automatic curriculum update every 30 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: 500 # must be overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_cognition_qwen_3b_composite_test
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # one of: auto / disable / resume_path (requires resume_from_path)
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program at runtime
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,225 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
sudoku:
weight: 0.33
config:
min_empty: 30
max_empty: 50
futoshiki:
weight: 0.34
config:
min_board_size: 4 # Board will be NxN where N is this value
max_board_size: 9
min_difficulty: 0
max_difficulty: 3
sokoban:
weight: 0.33
config:
min_w: 6 # Minimum width of the puzzle
min_h: 6 # Minimum height of the puzzle
max_w: 10 # Maximum width of the puzzle
max_h: 10 # Maximum height of the puzzle
min_boxes: 4 # Minimum number of boxes
max_boxes: 10 # Maximum number of boxes
max_depth: 80 # Maximum search depth
curriculum:
enabled: False
schedule:
automatic: True
      update_steps: 30 # automatic curriculum update every 30 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: 500 # must be overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_games_qwen_3b_composite
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # one of: auto / disable / resume_path (requires resume_from_path)
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program at runtime
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,226 @@
reasoning_gym:
dataset_size: 20000
developer_prompt: DeepSeekZero
datasets:
shortest_path:
weight: 0.33
config:
min_rows: 5
max_rows: 8
min_cols: 5
max_cols: 8
p_blocked: 0.4
largest_island:
weight: 0.34
config:
min_rows: 5
max_rows: 10
min_cols: 5
max_cols: 10
min_num_islands: 0
max_num_islands: 5
min_island_size: 0
max_island_size: 10
quantum_lock:
weight: 0.33
config:
difficulty: 10
curriculum:
enabled: False
schedule:
automatic: True
update_steps: 30 # automatic curriculum updating after 50 steps
last_k: 20
success_threshold: 0.70
failure_threshold: 0.10
curricula:
spell_backward:
attribute_levels:
word_len: 0
reward:
use_accuracy: True
secondary_rewards:
- name: cosine
scaling_factor: 0.3
- name: format
scaling_factor: 0.2
kwargs:
preappend_thinking_token: False
data:
tokenizer: null
train_files: train.parquet
val_files: test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 1024
train_batch_size: 32
val_batch_size: 64
return_raw_chat: True
return_raw_input_ids: True
actor_rollout_ref:
hybrid_engine: True
model:
path: Qwen/Qwen2.5-3B-Instruct
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 16
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: True # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: 500 # must be overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: True
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.7
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 4
max_num_batched_tokens: 12288
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
use_fire_sampling: False
max_model_len: 12288
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo
val_kwargs:
do_sample: True
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
verbose: True
trainer:
balance_batch: True
total_epochs: 1
total_training_steps: 500
project_name: rg-test
experiment_name: intra_reasoning_games_qwen_3b_graphs
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 4
save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # one of: auto / disable / resume_path (requires resume_from_path)
resume_from_path: False
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program at runtime
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
# Reward model not used for GRPO
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path}
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null
micro_batch_size_per_gpu: null
max_length: null
ulysses_sequence_parallel_size: 1
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

View file

@ -0,0 +1,28 @@
# Model configuration
model_path: ../utils/qwen3b_algebraic
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: simple_integration
size: 100
seed: 42
params:
min_terms: 2
max_terms: 5
min_degree: 1
max_degree: 10
min_bounds: 1
max_bounds: 10

View file

@ -1,8 +1,8 @@
# Model configuration
model_path: ../utils/qwen3b_algorithmic_500
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts

View file

@ -0,0 +1,24 @@
# Model configuration
model_path: ../utils/qwen_3b_arithmetic_100
max_tokens: 1024
temperature: 0.6
top_p: 0.9
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: prime_factorization
size: 100
seed: 42
params:
min_value: 2
max_value: 1000

View file

@ -0,0 +1,36 @@
# Model configuration
model_path: ../utils/qwen3b_cognition
max_tokens: 1024
temperature: 0.6 # Lower temperature for more focused responses
top_p: 0.9 # From rollout top_p
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: number_sequence
size: 100
seed: 42
params:
min_terms: 4 # Minimum visible terms
max_terms: 8 # Maximum visible terms
min_value: -100 # Minimum allowed number
max_value: 100 # Maximum allowed number
max_complexity: 3 # Maximum number of operations to combine
- dataset: modulo_grid
size: 100
seed: 42
params:
size_x: 20
size_y: 20
max_divisor: 20
max_target: 20
max_holes: 1

View file

@ -0,0 +1,24 @@
# Model configuration
model_path: ../utils/qwen3b_games
max_tokens: 1024
temperature: 0.6 # Lower temperature for more focused responses
top_p: 0.9 # From rollout top_p
developer_prompt: DeepSeekZero
developer_role: system # Standard role for system prompts
# Output configuration
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
# Categories and datasets to evaluate
categories:
- category: reasoning
datasets:
- dataset: mahjong_puzzle
size: 100
seed: 42
params:
min_num_rounds: 10
max_num_rounds: 50

View file

@ -16,13 +16,13 @@ eval_repeats: 3
categories:
  - category: reasoning
    datasets:
      - dataset: decimal_chain_sum
        size: 100
        seed: 42
        params:
          min_terms: 2
          max_terms: 4
          min_digits: 1
          max_digits: 3
          min_decimal_places: 1
          max_decimal_places: 4