mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-25 17:10:51 +00:00
fix(training): Prepend <think> token in format reward (#396)
* prepend think token in format reward * pre commit + fix some default vals * add checkpoint config
This commit is contained in:
parent
ed560a80f4
commit
695aad4dbc
4 changed files with 16 additions and 1 deletions
|
|
@@ -35,6 +35,7 @@ reward:
|
|||
format_reward:
|
||||
enable: True
|
||||
scaling_factor: 0.2
|
||||
prepend_think_token: False # Set to True only when the tokenizer's prompt template pre-fills the generation with <think>, such as in the case of (distilled) r1 models
|
||||
length_reward:
|
||||
enable: True
|
||||
scaling_factor: 0.2
|
||||
|
|
@@ -75,6 +76,8 @@ actor_rollout_ref:
|
|||
ppo_epochs: 1
|
||||
shuffle: False
|
||||
ulysses_sequence_parallel_size: 1 # sp size
|
||||
checkpoint:
|
||||
contents: ['model', 'hf_model', 'optimizer', 'extra']
|
||||
optim:
|
||||
lr: 1e-6
|
||||
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue