# Pre-compiled regex for <think> reasoning tags.
# NOTE(review): reconstructed — the angle-bracket tag tokens were stripped from
# the pasted patch text; confirm the exact tag spellings against the original file.
THINK_OPEN_PATTERN = re.compile(r"<think>", re.IGNORECASE)
THINK_CLOSE_PATTERN = re.compile(r"</think>", re.IGNORECASE)
THINK_CONTENT_AFTER_PATTERN = re.compile(r"</think>\s*(.*)", re.DOTALL | re.IGNORECASE)
THINK_CONTENT_INSIDE_PATTERN = re.compile(
    r"<think>(.*?)</think>", re.DOTALL | re.IGNORECASE
)

# Pre-compiled regex for scratchpad mode (alternative reasoning format).
# The <|...|> delimiters are literal text, so the pipe characters are escaped.
SCRATCHPAD_OPEN_PATTERN = re.compile(r"<\|start_of_scratchpad\|>")
SCRATCHPAD_CLOSE_PATTERN = re.compile(r"<\|end_of_scratchpad\|>")
SCRATCHPAD_CONTENT_AFTER_PATTERN = re.compile(
    r"<\|end_of_scratchpad\|>\s*(.*)", re.DOTALL
)
SCRATCHPAD_CONTENT_INSIDE_PATTERN = re.compile(
    r"<\|start_of_scratchpad\|>(.*?)<\|end_of_scratchpad\|>", re.DOTALL
)


def validate_thinking_format(
    response: str, thinking_mode: bool = True
) -> Tuple[bool, str]:
    """
    Validate thinking format and extract content after reasoning tags.

    In thinking mode, exactly one pair of reasoning tags is expected.
    Supports both <think></think> and
    <|start_of_scratchpad|><|end_of_scratchpad|> formats; the think
    format is tried first, the scratchpad format is the fallback.

    Args:
        response: The model's full response.
        thinking_mode: When False, the response is accepted verbatim.

    Returns:
        Tuple of (is_valid, content_for_answer_extraction). On success the
        second element is the stripped text after the closing tag; on
        failure the original response is returned unchanged.
    """
    if not thinking_mode:
        return True, response

    # Try <think> tags first: exactly one open and one close tag required.
    if (
        len(THINK_OPEN_PATTERN.findall(response)) == 1
        and len(THINK_CLOSE_PATTERN.findall(response)) == 1
    ):
        match = THINK_CONTENT_AFTER_PATTERN.search(response)
        if match:
            return True, match.group(1).strip()

    # Fall back to <|start_of_scratchpad|><|end_of_scratchpad|> tags.
    if (
        len(SCRATCHPAD_OPEN_PATTERN.findall(response)) == 1
        and len(SCRATCHPAD_CLOSE_PATTERN.findall(response)) == 1
    ):
        match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
        if match:
            return True, match.group(1).strip()

    # No valid reasoning format found.
    return False, response


def extract_thinking_content(response: str) -> Optional[str]:
    """
    Extract the content inside reasoning tags.

    Supports both <think></think> and
    <|start_of_scratchpad|><|end_of_scratchpad|> formats, tried in
    that order.

    Args:
        response: The model's full response.

    Returns:
        Stripped content inside the first matching pair of reasoning
        tags, or None if neither format is found.
    """
    for pattern in (THINK_CONTENT_INSIDE_PATTERN, SCRATCHPAD_CONTENT_INSIDE_PATTERN):
        match = pattern.search(response)
        if match:
            return match.group(1).strip()
    return None
Multiple boxed answers = failure. + after the reasoning tags (if thinking mode). Multiple boxed answers = failure. + + Supports both and <|start_of_scratchpad|><|end_of_scratchpad|> formats. Args: response: The model's full response - after_think: Whether to only look after tags + after_think: Whether to only look after reasoning tags debug: Whether to print debug information Returns: @@ -1044,13 +1083,18 @@ def extract_first_boxed_answer( """ # Get content to search if after_think: - # Extract content after + # Try to extract content after first match = THINK_CONTENT_AFTER_PATTERN.search(response) if match: search_content = match.group(1) else: - # No think tags, use full response - search_content = response + # Try <|end_of_scratchpad|> tags + match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response) + if match: + search_content = match.group(1) + else: + # No reasoning tags, use full response + search_content = response else: search_content = response @@ -1306,13 +1350,18 @@ def score_math_answer( Returns: Tuple of (is_correct or None, method_used, has_multiple_boxed) """ - # Get content to score + # Get content to score (check for both think and scratchpad tags) if after_think: match = THINK_CONTENT_AFTER_PATTERN.search(response) if match: score_content = match.group(1) else: - score_content = response + # Try scratchpad tags + match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response) + if match: + score_content = match.group(1) + else: + score_content = response else: score_content = response @@ -1391,13 +1440,18 @@ async def score_math_answer_async( """ import asyncio - # Get content to score + # Get content to score (check for both think and scratchpad tags) if after_think: match = THINK_CONTENT_AFTER_PATTERN.search(response) if match: score_content = match.group(1) else: - score_content = response + # Try scratchpad tags + match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response) + if match: + score_content = match.group(1) + else: + score_content = 
response else: score_content = response diff --git a/environments/eval_environments/ifeval_eval.py b/environments/eval_environments/ifeval_eval.py index 2f0d6155..2c32801b 100644 --- a/environments/eval_environments/ifeval_eval.py +++ b/environments/eval_environments/ifeval_eval.py @@ -34,8 +34,11 @@ from typing import Any, Dict, List, Optional, Tuple from datasets import load_dataset from eval_helpers import ( create_system_content, + extract_reasoning_from_completion, format_reasoning_debug_info, get_default_thinking_prompt, + get_reasoning_token_usage, + validate_thinking_format, ) from pydantic import Field from tqdm.asyncio import tqdm_asyncio @@ -229,6 +232,7 @@ class IFEvalEnv(BaseEnv): print(f" Max tokens: {self.config.eval_max_tokens}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") + print(f" Reasoning effort: {self.config.reasoning_effort}") if self.config.thinking_mode: thinking_prompt = self._get_thinking_prompt() if thinking_prompt: @@ -274,29 +278,46 @@ class IFEvalEnv(BaseEnv): self.iter = 0 def _validate_thinking_format(self, response: str) -> Tuple[bool, str]: - """Validate thinking format and extract content after tags.""" - if not self.config.thinking_mode: - return True, response + """ + Validate thinking format and extract content after reasoning tags. - think_open_count = len(self._think_pattern.findall(response)) - think_close_count = len(self._think_close_pattern.findall(response)) - - if think_open_count != 1 or think_close_count != 1: - return False, response - - match = self._think_content_pattern.search(response) - if match: - return True, match.group(1).strip() - else: - return False, response + Supports both and <|start_of_scratchpad|><|end_of_scratchpad|> formats. 
+ """ + return validate_thinking_format(response, self.config.thinking_mode) def _extract_thinking_content(self, response: str) -> Optional[str]: - """Extract the content inside tags.""" + """Extract the content inside tags (legacy method).""" match = self._thinking_extract_pattern.search(response) if match: return match.group(1).strip() return None + def _extract_reasoning_content( + self, completion: Any, model_response: str + ) -> Tuple[Optional[str], str]: + """ + Extract reasoning content from completion using multiple methods. + + This handles different reasoning formats from various providers: + 1. reasoning_content field (OpenAI reasoning models, some providers) + 2. reasoning_details[].text field (OpenRouter style) + 3. reasoning field on message + 4. blocks in message content (Hermes style) + 5. <|start_of_scratchpad|><|end_of_scratchpad|> blocks + + Args: + completion: The ChatCompletion response object + model_response: The message content string + + Returns: + Tuple of (reasoning_content, source) where source indicates + where reasoning was found: "reasoning_content", "reasoning_details", + "reasoning", "think_block", "scratchpad_block", or "none" + """ + # Use comprehensive extraction from eval_helpers + reasoning, source, _ = extract_reasoning_from_completion(completion) + return reasoning, source + def _preprocess_response(self, response: str) -> List[str]: """ Preprocess response for loose evaluation. 
@@ -526,15 +547,22 @@ class IFEvalEnv(BaseEnv): if not model_response: return {"result": None, "sample": None} - # Handle thinking mode - extract content after for evaluation + # Handle thinking mode - extract content after reasoning tags for evaluation thinking_format_valid, response_for_eval = self._validate_thinking_format( model_response ) - # Extract thinking content for logging - thinking_content = None - if self.config.thinking_mode: - thinking_content = self._extract_thinking_content(model_response) + # Extract reasoning content using comprehensive method + # This handles multiple formats: reasoning_content field, reasoning_details, + # reasoning field, blocks, and <|start_of_scratchpad|> blocks + # Extract reasoning content using comprehensive method + # Always extract, regardless of thinking_mode, since API reasoning may be available + thinking_content, reasoning_source = self._extract_reasoning_content( + completion, model_response + ) + if self.config.full_debug and thinking_content: + print(f" [Reasoning] Found via: {reasoning_source}") + print(f" [Reasoning] Length: {len(thinking_content)} chars") # Check instructions check_result = self._check_instructions( @@ -571,6 +599,7 @@ class IFEvalEnv(BaseEnv): if thinking_content and len(thinking_content) > 500 else thinking_content ) + sample["reasoning_source"] = reasoning_source if self.config.full_debug: strict_status = "✓" if check_result["prompt_level_strict"] else "✗" @@ -599,6 +628,7 @@ class IFEvalEnv(BaseEnv): print(f" Total prompts: {len(self.all_eval_items)}") print(f" Max tokens: {self.config.eval_max_tokens}") print(f" Thinking mode: {self.config.thinking_mode}") + print(f" Reasoning effort: {self.config.reasoning_effort}") print(f"{'='*60}\n") try: @@ -678,6 +708,15 @@ class IFEvalEnv(BaseEnv): if self.config.thinking_mode: thinking_utilization = sum(1 for s in samples if s.get("thinking_content")) + # Reasoning source statistics (tracks where reasoning was extracted from) + reasoning_sources 
= {} + if self.config.thinking_mode: + for sample in samples: + source = sample.get("reasoning_source", "none") + if source not in reasoning_sources: + reasoning_sources[source] = 0 + reasoning_sources[source] += 1 + # Build metrics dictionary eval_metrics = { "eval/prompt_level_strict_acc": prompt_strict_acc, @@ -723,6 +762,16 @@ class IFEvalEnv(BaseEnv): if self.config.thinking_mode: print(f"Thinking Format Compliance: {thinking_format_compliance_rate:.4f}") print(f"Thinking Utilization: {thinking_utilization}/{total_count}") + + # Print reasoning source breakdown if thinking mode is enabled + if self.config.thinking_mode and reasoning_sources: + print("\nReasoning Source Breakdown:") + for source, count in sorted( + reasoning_sources.items(), key=lambda x: -x[1] + ): + pct = (count / total_count) * 100 if total_count > 0 else 0 + print(f" {source}: {count} ({pct:.1f}%)") + print(f"{'='*60}\n") # Log evaluation results @@ -736,6 +785,7 @@ class IFEvalEnv(BaseEnv): "temperature": self.config.eval_temperature, "max_tokens": self.config.eval_max_tokens, "thinking_mode": self.config.thinking_mode, + "reasoning_effort": self.config.reasoning_effort, }, ) except Exception as e: