fix(training): Prepend <think> token in format reward (#396)

* prepend think token in format reward

* pre commit + fix some default vals

* add checkpoint config
This commit is contained in:
Zafir Stojanovski 2025-03-28 09:45:17 +01:00 committed by GitHub
parent ed560a80f4
commit 695aad4dbc
4 changed files with 16 additions and 1 deletion

5
.gitignore vendored
View file

@@ -45,3 +45,8 @@ htmlcov/
# Jupyter Notebook
.ipynb_checkpoints/
.virtual_documents/
# logs
wandb/
outputs/
*.log

View file

@@ -35,6 +35,7 @@ reward:
format_reward:
enable: True
scaling_factor: 0.2
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think>, such as in the case of (distilled) r1 models
length_reward:
enable: True
scaling_factor: 0.2
@@ -75,6 +76,8 @@ actor_rollout_ref:
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
checkpoint:
contents: ['model', 'hf_model', 'optimizer', 'extra']
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime

View file

@@ -35,6 +35,7 @@ reward:
format_reward:
enable: True
scaling_factor: 0.2
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think>, such as in the case of (distilled) r1 models
length_reward:
enable: True
scaling_factor: 0.2
@@ -75,6 +76,8 @@ actor_rollout_ref:
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
checkpoint:
contents: ['model', 'hf_model', 'optimizer', 'extra']
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
@@ -116,6 +119,7 @@ actor_rollout_ref:
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
max_model_len: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -144,7 +148,7 @@ trainer:
total_epochs: 10
total_training_steps: null
project_name: rg-test
-experiment_name: verl_grpo_llama3.1_1b
+experiment_name: verl_grpo_qwen2.5_1.5b
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1

View file

@@ -31,6 +31,7 @@ class RayGRPOTrainer(RayPPOTrainer):
self.max_output_length = max_output_length
self.format_reward_scaling_factor = config.reward.format_reward.scaling_factor
self.format_reward_prepend_think_token = config.reward.format_reward.prepend_think_token
self.length_reward_scaling_factor = config.reward.length_reward.scaling_factor
train_reward_fn = lambda data: self._score_output(data, num_examine=0)
@@ -99,6 +100,8 @@ class RayGRPOTrainer(RayPPOTrainer):
def _compute_format_reward(self, solution_str: str) -> float:
"""Reward use of exactly one correctly structured <think> and <answer> block."""
if self.format_reward_prepend_think_token:
solution_str = "<think>" + solution_str
scaling_factor = self.format_reward_scaling_factor
# check <think> and <answer> blocks are present
pattern = r"\s*<think>.*?</think>\s*<answer>.*?</answer>"