fix(training): Prepend <think> token in format reward (#396)

* prepend think token in format reward
* pre commit + fix some default vals
* add checkpoint config
2026-04-25 17:10:51 +00:00 · 2025-03-28 09:45:17 +01:00 · 2025-03-28 09:45:17 +01:00 · 695aad4dbc
commit 695aad4dbc
parent ed560a80f4
4 changed files with 16 additions and 1 deletions
--- a/training/configs/llama3.1_1b_grpo.yaml
+++ b/training/configs/llama3.1_1b_grpo.yaml
@@ -35,6 +35,7 @@ reward:
  format_reward:
    enable: True
    scaling_factor: 0.2
+    prepend_think_token: False  # Set to True only when the tokenizer's prompt template pre-fills the generation with <think>, such as in the case of (distilled) r1 models
  length_reward:
    enable: True
    scaling_factor: 0.2
@@ -75,6 +76,8 @@ actor_rollout_ref:
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1 # sp size
+    checkpoint:
+      contents: ['model', 'hf_model', 'optimizer', 'extra']
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime