fix(training): Prepend <think> token in format reward (#396)

* prepend think token in format reward

* pre commit + fix some default vals

* add checkpoint config
This commit is contained in:
Zafir Stojanovski 2025-03-28 09:45:17 +01:00 committed by GitHub
parent ed560a80f4
commit 695aad4dbc
4 changed files with 16 additions and 1 deletion

5
.gitignore vendored
View file

@@ -45,3 +45,8 @@ htmlcov/
# Jupyter Notebook
.ipynb_checkpoints/
.virtual_documents/
# logs
wandb/
outputs/
*.log

View file

@@ -35,6 +35,7 @@ reward:
format_reward:
enable: True
scaling_factor: 0.2
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think>, such as in the case of (distilled) r1 models
length_reward:
enable: True
scaling_factor: 0.2
@@ -75,6 +76,8 @@ actor_rollout_ref:
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
checkpoint:
contents: ['model', 'hf_model', 'optimizer', 'extra']
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime

View file

@@ -35,6 +35,7 @@ reward:
format_reward:
enable: True
scaling_factor: 0.2
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think>, such as in the case of (distilled) r1 models
length_reward:
enable: True
scaling_factor: 0.2
@@ -75,6 +76,8 @@ actor_rollout_ref:
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
checkpoint:
contents: ['model', 'hf_model', 'optimizer', 'extra']
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
@@ -116,6 +119,7 @@ actor_rollout_ref:
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
max_model_len: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 160
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -144,7 +148,7 @@ trainer:
total_epochs: 10
total_training_steps: null
project_name: rg-test
-experiment_name: verl_grpo_llama3.1_1b
+experiment_name: verl_grpo_qwen2.5_1.5b
logger: [ 'console', 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1

View file

@@ -31,6 +31,7 @@ class RayGRPOTrainer(RayPPOTrainer):
self.max_output_length = max_output_length
self.format_reward_scaling_factor = config.reward.format_reward.scaling_factor
self.format_reward_prepend_think_token = config.reward.format_reward.prepend_think_token
self.length_reward_scaling_factor = config.reward.length_reward.scaling_factor
train_reward_fn = lambda data: self._score_output(data, num_examine=0)
@@ -99,6 +100,8 @@ class RayGRPOTrainer(RayPPOTrainer):
def _compute_format_reward(self, solution_str: str) -> float:
"""Reward use of exactly one correctly structured <think> and <answer> block."""
if self.format_reward_prepend_think_token:
solution_str = "<think>" + solution_str
scaling_factor = self.format_reward_scaling_factor
# check <think> and <answer> blocks are present
pattern = r"\s*<think>.*?</think>\s*<answer>.*?</answer>"