mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
fix(training): Prepend <think> token in format reward (#396)
* prepend think token in format reward
* pre-commit fixes + fix some default values
* add checkpoint config
This commit is contained in:
parent
ed560a80f4
commit
695aad4dbc
4 changed files with 16 additions and 1 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -45,3 +45,8 @@ htmlcov/
|
||||||
# Jupyter Notebook
|
# Jupyter Notebook
|
||||||
.ipynb_checkpoints/
|
.ipynb_checkpoints/
|
||||||
.virtual_documents/
|
.virtual_documents/
|
||||||
|
|
||||||
|
# logs
|
||||||
|
wandb/
|
||||||
|
outputs/
|
||||||
|
*.log
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ reward:
|
||||||
format_reward:
|
format_reward:
|
||||||
enable: True
|
enable: True
|
||||||
scaling_factor: 0.2
|
scaling_factor: 0.2
|
||||||
|
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think> (e.g. distilled R1 models); the format reward then prepends <think> to the response before checking it
|
||||||
length_reward:
|
length_reward:
|
||||||
enable: True
|
enable: True
|
||||||
scaling_factor: 0.2
|
scaling_factor: 0.2
|
||||||
|
|
@ -75,6 +76,8 @@ actor_rollout_ref:
|
||||||
ppo_epochs: 1
|
ppo_epochs: 1
|
||||||
shuffle: False
|
shuffle: False
|
||||||
ulysses_sequence_parallel_size: 1 # sp size
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
checkpoint:
|
||||||
|
contents: ['model', 'hf_model', 'optimizer', 'extra']
|
||||||
optim:
|
optim:
|
||||||
lr: 1e-6
|
lr: 1e-6
|
||||||
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ reward:
|
||||||
format_reward:
|
format_reward:
|
||||||
enable: True
|
enable: True
|
||||||
scaling_factor: 0.2
|
scaling_factor: 0.2
|
||||||
|
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think> (e.g. distilled R1 models); the format reward then prepends <think> to the response before checking it
|
||||||
length_reward:
|
length_reward:
|
||||||
enable: True
|
enable: True
|
||||||
scaling_factor: 0.2
|
scaling_factor: 0.2
|
||||||
|
|
@ -75,6 +76,8 @@ actor_rollout_ref:
|
||||||
ppo_epochs: 1
|
ppo_epochs: 1
|
||||||
shuffle: False
|
shuffle: False
|
||||||
ulysses_sequence_parallel_size: 1 # sp size
|
ulysses_sequence_parallel_size: 1 # sp size
|
||||||
|
checkpoint:
|
||||||
|
contents: ['model', 'hf_model', 'optimizer', 'extra']
|
||||||
optim:
|
optim:
|
||||||
lr: 1e-6
|
lr: 1e-6
|
||||||
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||||
|
|
@ -116,6 +119,7 @@ actor_rollout_ref:
|
||||||
tensor_model_parallel_size: 2
|
tensor_model_parallel_size: 2
|
||||||
max_num_batched_tokens: 8192
|
max_num_batched_tokens: 8192
|
||||||
max_num_seqs: 1024
|
max_num_seqs: 1024
|
||||||
|
max_model_len: 1024
|
||||||
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
|
||||||
log_prob_micro_batch_size_per_gpu: 160
|
log_prob_micro_batch_size_per_gpu: 160
|
||||||
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
|
||||||
|
|
@ -144,7 +148,7 @@ trainer:
|
||||||
total_epochs: 10
|
total_epochs: 10
|
||||||
total_training_steps: null
|
total_training_steps: null
|
||||||
project_name: rg-test
|
project_name: rg-test
|
||||||
experiment_name: verl_grpo_llama3.1_1b
|
experiment_name: verl_grpo_qwen2.5_1.5b
|
||||||
logger: [ 'console', 'wandb' ]
|
logger: [ 'console', 'wandb' ]
|
||||||
val_generations_to_log_to_wandb: 0
|
val_generations_to_log_to_wandb: 0
|
||||||
nnodes: 1
|
nnodes: 1
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ class RayGRPOTrainer(RayPPOTrainer):
|
||||||
self.max_output_length = max_output_length
|
self.max_output_length = max_output_length
|
||||||
|
|
||||||
self.format_reward_scaling_factor = config.reward.format_reward.scaling_factor
|
self.format_reward_scaling_factor = config.reward.format_reward.scaling_factor
|
||||||
|
self.format_reward_prepend_think_token = config.reward.format_reward.prepend_think_token
|
||||||
self.length_reward_scaling_factor = config.reward.length_reward.scaling_factor
|
self.length_reward_scaling_factor = config.reward.length_reward.scaling_factor
|
||||||
|
|
||||||
train_reward_fn = lambda data: self._score_output(data, num_examine=0)
|
train_reward_fn = lambda data: self._score_output(data, num_examine=0)
|
||||||
|
|
@ -99,6 +100,8 @@ class RayGRPOTrainer(RayPPOTrainer):
|
||||||
|
|
||||||
def _compute_format_reward(self, solution_str: str) -> float:
|
def _compute_format_reward(self, solution_str: str) -> float:
|
||||||
"""Reward use of exactly one correctly structured <think> and <answer> block."""
|
"""Reward use of exactly one correctly structured <think> and <answer> block."""
|
||||||
|
if self.format_reward_prepend_think_token:
|
||||||
|
solution_str = "<think>" + solution_str
|
||||||
scaling_factor = self.format_reward_scaling_factor
|
scaling_factor = self.format_reward_scaling_factor
|
||||||
# check <think> and <answer> blocks are present
|
# check <think> and <answer> blocks are present
|
||||||
pattern = r"\s*<think>.*?</think>\s*<answer>.*?</answer>"
|
pattern = r"\s*<think>.*?</think>\s*<answer>.*?</answer>"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue