diff --git a/environments/eval_environments/eval_helpers.py b/environments/eval_environments/eval_helpers.py
index 2a30fd9f..424ea791 100644
--- a/environments/eval_environments/eval_helpers.py
+++ b/environments/eval_environments/eval_helpers.py
@@ -92,6 +92,16 @@ THINK_CONTENT_INSIDE_PATTERN = re.compile(
r"<think>(.*?)</think>", re.DOTALL | re.IGNORECASE
)
+# Pre-compiled regex for scratchpad mode (alternative reasoning format)
+SCRATCHPAD_OPEN_PATTERN = re.compile(r"<\|start_of_scratchpad\|>")
+SCRATCHPAD_CLOSE_PATTERN = re.compile(r"<\|end_of_scratchpad\|>")
+SCRATCHPAD_CONTENT_AFTER_PATTERN = re.compile(
+ r"<\|end_of_scratchpad\|>\s*(.*)", re.DOTALL
+)
+SCRATCHPAD_CONTENT_INSIDE_PATTERN = re.compile(
+ r"<\|start_of_scratchpad\|>(.*?)<\|end_of_scratchpad\|>", re.DOTALL
+)
+
# Common prefixes that models use before stating their answer
# These will be stripped to help isolate the actual answer
@@ -460,10 +470,11 @@ def validate_thinking_format(
response: str, thinking_mode: bool = True
) -> Tuple[bool, str]:
"""
- Validate thinking format and extract content after </think> tags.
+ Validate thinking format and extract content after reasoning tags.
- In thinking mode, we expect exactly one pair of <think></think> tags.
- Returns the content after </think> for answer extraction.
+ In thinking mode, we expect exactly one pair of reasoning tags.
+ Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.
+ Returns the content after the closing tag for answer extraction.
Args:
response: The model's full response
@@ -475,34 +486,52 @@ def validate_thinking_format(
if not thinking_mode:
return True, response
- # Check for exactly one pair of think tags
+ # Try <think></think> tags first
think_open_count = len(THINK_OPEN_PATTERN.findall(response))
think_close_count = len(THINK_CLOSE_PATTERN.findall(response))
- if think_open_count != 1 or think_close_count != 1:
- return False, response
+ if think_open_count == 1 and think_close_count == 1:
+ # Extract content after </think> tags for answer extraction
+ match = THINK_CONTENT_AFTER_PATTERN.search(response)
+ if match:
+ return True, match.group(1).strip()
- # Extract content after </think> tags for answer extraction
- match = THINK_CONTENT_AFTER_PATTERN.search(response)
- if match:
- return True, match.group(1).strip()
- else:
- return False, response
+ # Try <|start_of_scratchpad|><|end_of_scratchpad|> tags
+ scratchpad_open_count = len(SCRATCHPAD_OPEN_PATTERN.findall(response))
+ scratchpad_close_count = len(SCRATCHPAD_CLOSE_PATTERN.findall(response))
+
+ if scratchpad_open_count == 1 and scratchpad_close_count == 1:
+ # Extract content after <|end_of_scratchpad|> tags for answer extraction
+ match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
+ if match:
+ return True, match.group(1).strip()
+
+ # No valid reasoning format found
+ return False, response
def extract_thinking_content(response: str) -> Optional[str]:
"""
- Extract the content inside <think></think> tags.
+ Extract the content inside reasoning tags.
+
+ Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.
Args:
response: The model's full response
Returns:
- Content inside think tags, or None if not found
+ Content inside reasoning tags, or None if not found
"""
+ # Try <think></think> tags first
match = THINK_CONTENT_INSIDE_PATTERN.search(response)
if match:
return match.group(1).strip()
+
+ # Try <|start_of_scratchpad|><|end_of_scratchpad|> tags
+ match = SCRATCHPAD_CONTENT_INSIDE_PATTERN.search(response)
+ if match:
+ return match.group(1).strip()
+
return None
@@ -568,17 +597,19 @@ def extract_reasoning_from_response(
2. reasoning_details[].text field (OpenRouter style for reasoning models)
3. reasoning field on the message (some providers)
4. <think></think> blocks in message content (Hermes style)
+ 5. <|start_of_scratchpad|><|end_of_scratchpad|> blocks (alternative format)
Args:
response: The ChatCompletion response object from the API
content: Optional message content string. If provided, will check for
- <think></think> blocks in addition to API fields.
+ reasoning tag blocks in addition to API fields.
Returns:
Tuple of (reasoning_content, source) where:
- reasoning_content: The extracted reasoning text, or None if not found
- source: String indicating where reasoning was found:
- "reasoning_content", "reasoning_details", "reasoning", "think_block", "none"
+ "reasoning_content", "reasoning_details", "reasoning", "think_block",
+ "scratchpad_block", or "none"
Example:
completion = await server.chat_completion(messages=messages)
@@ -631,6 +662,12 @@ def extract_reasoning_from_response(
if match:
return match.group(1).strip(), "think_block"
+ # Try <|start_of_scratchpad|> blocks in content (alternative reasoning format)
+ if content:
+ match = SCRATCHPAD_CONTENT_INSIDE_PATTERN.search(content)
+ if match:
+ return match.group(1).strip(), "scratchpad_block"
+
return None, "none"
@@ -1032,11 +1069,13 @@ def extract_first_boxed_answer(
Extract the first \\boxed{} answer from a response.
Follows the rule: only accept if there's exactly ONE boxed answer
- after the </think> tag (if thinking mode). Multiple boxed answers = failure.
+ after the reasoning tags (if thinking mode). Multiple boxed answers = failure.
+
+ Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.
Args:
response: The model's full response
- after_think: Whether to only look after </think> tags
+ after_think: Whether to only look after reasoning tags
debug: Whether to print debug information
Returns:
@@ -1044,13 +1083,18 @@ def extract_first_boxed_answer(
"""
# Get content to search
if after_think:
- # Extract content after </think>
+ # Try to extract content after </think> first
match = THINK_CONTENT_AFTER_PATTERN.search(response)
if match:
search_content = match.group(1)
else:
- # No think tags, use full response
- search_content = response
+ # Try <|end_of_scratchpad|> tags
+ match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
+ if match:
+ search_content = match.group(1)
+ else:
+ # No reasoning tags, use full response
+ search_content = response
else:
search_content = response
@@ -1306,13 +1350,18 @@ def score_math_answer(
Returns:
Tuple of (is_correct or None, method_used, has_multiple_boxed)
"""
- # Get content to score
+ # Get content to score (check for both think and scratchpad tags)
if after_think:
match = THINK_CONTENT_AFTER_PATTERN.search(response)
if match:
score_content = match.group(1)
else:
- score_content = response
+ # Try scratchpad tags
+ match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
+ if match:
+ score_content = match.group(1)
+ else:
+ score_content = response
else:
score_content = response
@@ -1391,13 +1440,18 @@ async def score_math_answer_async(
"""
import asyncio
- # Get content to score
+ # Get content to score (check for both think and scratchpad tags)
if after_think:
match = THINK_CONTENT_AFTER_PATTERN.search(response)
if match:
score_content = match.group(1)
else:
- score_content = response
+ # Try scratchpad tags
+ match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
+ if match:
+ score_content = match.group(1)
+ else:
+ score_content = response
else:
score_content = response
diff --git a/environments/eval_environments/ifeval_eval.py b/environments/eval_environments/ifeval_eval.py
index 2f0d6155..2c32801b 100644
--- a/environments/eval_environments/ifeval_eval.py
+++ b/environments/eval_environments/ifeval_eval.py
@@ -34,8 +34,11 @@ from typing import Any, Dict, List, Optional, Tuple
from datasets import load_dataset
from eval_helpers import (
create_system_content,
+ extract_reasoning_from_completion,
format_reasoning_debug_info,
get_default_thinking_prompt,
+ get_reasoning_token_usage,
+ validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -229,6 +232,7 @@ class IFEvalEnv(BaseEnv):
print(f" Max tokens: {self.config.eval_max_tokens}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
+ print(f" Reasoning effort: {self.config.reasoning_effort}")
if self.config.thinking_mode:
thinking_prompt = self._get_thinking_prompt()
if thinking_prompt:
@@ -274,29 +278,46 @@ class IFEvalEnv(BaseEnv):
self.iter = 0
def _validate_thinking_format(self, response: str) -> Tuple[bool, str]:
- """Validate thinking format and extract content after </think> tags."""
- if not self.config.thinking_mode:
- return True, response
+ """
+ Validate thinking format and extract content after reasoning tags.
- think_open_count = len(self._think_pattern.findall(response))
- think_close_count = len(self._think_close_pattern.findall(response))
-
- if think_open_count != 1 or think_close_count != 1:
- return False, response
-
- match = self._think_content_pattern.search(response)
- if match:
- return True, match.group(1).strip()
- else:
- return False, response
+ Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.
+ """
+ return validate_thinking_format(response, self.config.thinking_mode)
def _extract_thinking_content(self, response: str) -> Optional[str]:
- """Extract the content inside <think></think> tags."""
+ """Extract the content inside <think></think> tags (legacy method)."""
match = self._thinking_extract_pattern.search(response)
if match:
return match.group(1).strip()
return None
+ def _extract_reasoning_content(
+ self, completion: Any, model_response: str
+ ) -> Tuple[Optional[str], str]:
+ """
+ Extract reasoning content from completion using multiple methods.
+
+ This handles different reasoning formats from various providers:
+ 1. reasoning_content field (OpenAI reasoning models, some providers)
+ 2. reasoning_details[].text field (OpenRouter style)
+ 3. reasoning field on message
+ 4. <think></think> blocks in message content (Hermes style)
+ 5. <|start_of_scratchpad|><|end_of_scratchpad|> blocks
+
+ Args:
+ completion: The ChatCompletion response object
+ model_response: The message content string
+
+ Returns:
+ Tuple of (reasoning_content, source) where source indicates
+ where reasoning was found: "reasoning_content", "reasoning_details",
+ "reasoning", "think_block", "scratchpad_block", or "none"
+ """
+ # Use comprehensive extraction from eval_helpers
+ reasoning, source, _ = extract_reasoning_from_completion(completion)
+ return reasoning, source
+
def _preprocess_response(self, response: str) -> List[str]:
"""
Preprocess response for loose evaluation.
@@ -526,15 +547,22 @@ class IFEvalEnv(BaseEnv):
if not model_response:
return {"result": None, "sample": None}
- # Handle thinking mode - extract content after </think> for evaluation
+ # Handle thinking mode - extract content after reasoning tags for evaluation
thinking_format_valid, response_for_eval = self._validate_thinking_format(
model_response
)
- # Extract thinking content for logging
- thinking_content = None
- if self.config.thinking_mode:
- thinking_content = self._extract_thinking_content(model_response)
+ # Extract reasoning content using comprehensive method
+ # This handles multiple formats: reasoning_content field, reasoning_details,
+ # reasoning field, <think></think> blocks, and <|start_of_scratchpad|> blocks
+ # Extract reasoning content using comprehensive method
+ # Always extract, regardless of thinking_mode, since API reasoning may be available
+ thinking_content, reasoning_source = self._extract_reasoning_content(
+ completion, model_response
+ )
+ if self.config.full_debug and thinking_content:
+ print(f" [Reasoning] Found via: {reasoning_source}")
+ print(f" [Reasoning] Length: {len(thinking_content)} chars")
# Check instructions
check_result = self._check_instructions(
@@ -571,6 +599,7 @@ class IFEvalEnv(BaseEnv):
if thinking_content and len(thinking_content) > 500
else thinking_content
)
+ sample["reasoning_source"] = reasoning_source
if self.config.full_debug:
strict_status = "✓" if check_result["prompt_level_strict"] else "✗"
@@ -599,6 +628,7 @@ class IFEvalEnv(BaseEnv):
print(f" Total prompts: {len(self.all_eval_items)}")
print(f" Max tokens: {self.config.eval_max_tokens}")
print(f" Thinking mode: {self.config.thinking_mode}")
+ print(f" Reasoning effort: {self.config.reasoning_effort}")
print(f"{'='*60}\n")
try:
@@ -678,6 +708,15 @@ class IFEvalEnv(BaseEnv):
if self.config.thinking_mode:
thinking_utilization = sum(1 for s in samples if s.get("thinking_content"))
+ # Reasoning source statistics (tracks where reasoning was extracted from)
+ reasoning_sources = {}
+ if self.config.thinking_mode:
+ for sample in samples:
+ source = sample.get("reasoning_source", "none")
+ if source not in reasoning_sources:
+ reasoning_sources[source] = 0
+ reasoning_sources[source] += 1
+
# Build metrics dictionary
eval_metrics = {
"eval/prompt_level_strict_acc": prompt_strict_acc,
@@ -723,6 +762,16 @@ class IFEvalEnv(BaseEnv):
if self.config.thinking_mode:
print(f"Thinking Format Compliance: {thinking_format_compliance_rate:.4f}")
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
+
+ # Print reasoning source breakdown if thinking mode is enabled
+ if self.config.thinking_mode and reasoning_sources:
+ print("\nReasoning Source Breakdown:")
+ for source, count in sorted(
+ reasoning_sources.items(), key=lambda x: -x[1]
+ ):
+ pct = (count / total_count) * 100 if total_count > 0 else 0
+ print(f" {source}: {count} ({pct:.1f}%)")
+
print(f"{'='*60}\n")
# Log evaluation results
@@ -736,6 +785,7 @@ class IFEvalEnv(BaseEnv):
"temperature": self.config.eval_temperature,
"max_tokens": self.config.eval_max_tokens,
"thinking_mode": self.config.thinking_mode,
+ "reasoning_effort": self.config.reasoning_effort,
},
)
except Exception as e: