diff --git a/training/configs/algorithmic_qwen_3b.yaml b/training/configs/algorithmic_qwen_3b.yaml
index 49bbdcaa..99946ff7 100644
--- a/training/configs/algorithmic_qwen_3b.yaml
+++ b/training/configs/algorithmic_qwen_3b.yaml
@@ -41,6 +41,8 @@ reward:
       scaling_factor: 0.3
     - name: format
       scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
 
 data:
   tokenizer: null
diff --git a/training/rewards/reward.py b/training/rewards/reward.py
index 4501b720..4e741483 100644
--- a/training/rewards/reward.py
+++ b/training/rewards/reward.py
@@ -60,6 +60,10 @@ def cosine_scaled_reward(solution_str, scaling_factor, **kwargs):
 @reward_registry.register("format")
 def compute_format_reward(solution_str: str, scaling_factor: float = 0.2, **kwargs) -> float:
     """Reward use of exactly one correctly structured <think> and <answer> block."""
+    preappend_thinking_token = kwargs.get("preappend_thinking_token", False)
+    if preappend_thinking_token:
+        solution_str = "<think>" + solution_str
+
     pattern = r"<think>\s*.*?</think>\s*<answer>.*?</answer>"
     if not re.match(pattern, solution_str, re.DOTALL):
         return 0.0