Merge branch 'main' into sid/verifiers

commit 7f28c52994
Siddharth Balyan 2026-01-16 11:50:27 +05:30, committed by GitHub
18 changed files with 1869 additions and 90 deletions


@@ -12,6 +12,7 @@ Includes:
- Math answer verification (using math_verify library)
- System prompt creation
- Results saving utilities
- Reasoning content extraction from various API response formats
"""
import json
@@ -19,7 +20,50 @@ import os
import re
from concurrent.futures import ProcessPoolExecutor
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Set, Tuple
# =============================================================================
# REASONING/THINKING PROMPTS
# =============================================================================
# Standard prompts for triggering reasoning mode in various models.
# These are NOT automatically injected - use explicitly when desired.
HERMES_REASONING_PROMPT = (
    "You are a deep thinking AI, you may use extremely long chains of thought to deeply "
    "consider the problem and deliberate with yourself via systematic reasoning processes "
    "to help come to a correct solution prior to answering. You should enclose your "
    "thoughts and internal monologue inside <think> </think> tags, and then provide your "
    "solution or response to the problem."
)
"""
Standard reasoning prompt for Hermes models.
This prompt triggers the model to use extended chain-of-thought reasoning
with explicit <think></think> tags. Use this when you want visible reasoning
in the response content.
Example usage:
    from eval_helpers import HERMES_REASONING_PROMPT

    messages = [
        {"role": "system", "content": HERMES_REASONING_PROMPT},
        {"role": "user", "content": question},
    ]
"""
HERMES_REASONING_PROMPT_WITH_ANSWER = (
    "You are a deep thinking AI, you may use extremely long chains of thought to deeply "
    "consider the problem and deliberate with yourself via systematic reasoning processes "
    "to help come to a correct solution prior to answering. You should enclose your "
    "thoughts and internal monologue inside <think> </think> tags, and then provide your "
    "solution or response to the problem. After your thinking, provide your final answer "
    "inside <answer></answer> tags."
)
"""
Standard reasoning prompt for Hermes models with explicit answer tag instruction.
Use this when you want the model to clearly separate reasoning from the final answer.
"""
# Try to import math_verify libraries (optional dependency for math evals)
try:
@@ -48,6 +92,16 @@ THINK_CONTENT_INSIDE_PATTERN = re.compile(
    r"<think>(.*?)</think>", re.DOTALL | re.IGNORECASE
)
# Pre-compiled regex for scratchpad mode (alternative reasoning format)
SCRATCHPAD_OPEN_PATTERN = re.compile(r"<\|start_of_scratchpad\|>")
SCRATCHPAD_CLOSE_PATTERN = re.compile(r"<\|end_of_scratchpad\|>")
SCRATCHPAD_CONTENT_AFTER_PATTERN = re.compile(
    r"<\|end_of_scratchpad\|>\s*(.*)", re.DOTALL
)
SCRATCHPAD_CONTENT_INSIDE_PATTERN = re.compile(
    r"<\|start_of_scratchpad\|>(.*?)<\|end_of_scratchpad\|>", re.DOTALL
)
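A quick illustration of what the two scratchpad patterns match (the sample text is invented for the example):

    sample = "<|start_of_scratchpad|>chain of thought<|end_of_scratchpad|> Final: 42"
    inside = SCRATCHPAD_CONTENT_INSIDE_PATTERN.search(sample).group(1)  # "chain of thought"
    after = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(sample).group(1)    # "Final: 42"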
# Common prefixes that models use before stating their answer
# These will be stripped to help isolate the actual answer
@@ -416,10 +470,11 @@ def validate_thinking_format(
    response: str, thinking_mode: bool = True
) -> Tuple[bool, str]:
    """
    Validate thinking format and extract content after reasoning tags.

    In thinking mode, we expect exactly one pair of reasoning tags.
    Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.
    Returns the content after the closing tag for answer extraction.

    Args:
        response: The model's full response
@@ -431,56 +486,393 @@ def validate_thinking_format(
    if not thinking_mode:
        return True, response

    # Try <think></think> tags first
    think_open_count = len(THINK_OPEN_PATTERN.findall(response))
    think_close_count = len(THINK_CLOSE_PATTERN.findall(response))

    if think_open_count == 1 and think_close_count == 1:
        # Extract content after </think> tags for answer extraction
        match = THINK_CONTENT_AFTER_PATTERN.search(response)
        if match:
            return True, match.group(1).strip()

    # Try <|start_of_scratchpad|><|end_of_scratchpad|> tags
    scratchpad_open_count = len(SCRATCHPAD_OPEN_PATTERN.findall(response))
    scratchpad_close_count = len(SCRATCHPAD_CLOSE_PATTERN.findall(response))

    if scratchpad_open_count == 1 and scratchpad_close_count == 1:
        # Extract content after the <|end_of_scratchpad|> tag for answer extraction
        match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
        if match:
            return True, match.group(1).strip()

    # No valid reasoning format found
    return False, response
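A sketch of the accepted and rejected shapes (inputs are made-up strings):

    from eval_helpers import validate_thinking_format

    ok, rest = validate_thinking_format("<think>reasoning...</think>Answer: B")
    # ok == True, rest == "Answer: B"

    ok, rest = validate_thinking_format(
        "<|start_of_scratchpad|>reasoning...<|end_of_scratchpad|>Answer: B"
    )
    # ok == True, rest == "Answer: B"

    ok, rest = validate_thinking_format("no reasoning tags here")
    # ok == False, rest is the unmodified response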
def extract_thinking_content(response: str) -> Optional[str]:
    """
    Extract the content inside reasoning tags.

    Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.

    Args:
        response: The model's full response

    Returns:
        Content inside reasoning tags, or None if not found
    """
    # Try <think></think> tags first
    match = THINK_CONTENT_INSIDE_PATTERN.search(response)
    if match:
        return match.group(1).strip()

    # Try <|start_of_scratchpad|><|end_of_scratchpad|> tags
    match = SCRATCHPAD_CONTENT_INSIDE_PATTERN.search(response)
    if match:
        return match.group(1).strip()

    return None
def get_default_thinking_prompt(custom_prompt: Optional[str] = None) -> Optional[str]:
    """
    Get the thinking system prompt.

    By default, returns None (no prompt injection). Pass a custom prompt or use
    HERMES_REASONING_PROMPT explicitly if you want reasoning prompt injection.

    Args:
        custom_prompt: Optional custom thinking prompt to use. If None, returns None.
            Use HERMES_REASONING_PROMPT for the standard Hermes prompt.

    Returns:
        The thinking prompt string, or None if no prompt specified.

    Example:
        # No prompt injection (default):
        prompt = get_default_thinking_prompt()  # Returns None

        # Use the Hermes reasoning prompt:
        from eval_helpers import HERMES_REASONING_PROMPT
        prompt = get_default_thinking_prompt(HERMES_REASONING_PROMPT)
    """
    return custom_prompt  # None means no prompt injection
def get_thinking_prompt_or_hermes(custom_prompt: Optional[str] = None) -> str:
    """
    Get the thinking prompt, defaulting to HERMES_REASONING_PROMPT if none is provided.

    Use this when you want to ensure a thinking prompt is always used.

    Args:
        custom_prompt: Optional custom thinking prompt. If None, uses HERMES_REASONING_PROMPT.

    Returns:
        The thinking prompt string (never None).
    """
    return custom_prompt if custom_prompt else HERMES_REASONING_PROMPT
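The two helpers differ only in their fallback; a minimal contrast (illustrative):

    from eval_helpers import (
        HERMES_REASONING_PROMPT,
        get_default_thinking_prompt,
        get_thinking_prompt_or_hermes,
    )

    assert get_default_thinking_prompt() is None  # no prompt injection by default
    assert get_thinking_prompt_or_hermes() == HERMES_REASONING_PROMPT  # always a prompt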
# =============================================================================
# REASONING CONTENT EXTRACTION
# =============================================================================
# Functions for extracting reasoning content from various API response formats.
# Different providers return reasoning in different ways:
#   - OpenRouter/Nebius: reasoning_details[].text or reasoning_content field
#   - Some providers: reasoning field in message
#   - Hermes/others: <think></think> blocks in message content


def extract_reasoning_from_response(
    response: Any,
    content: Optional[str] = None,
) -> Tuple[Optional[str], str]:
    """
    Extract reasoning content from various API response formats.

    This function handles multiple reasoning formats:
    1. reasoning_content field on the message (some providers)
    2. reasoning_details[].text field (OpenRouter style for reasoning models)
    3. reasoning field on the message (some providers)
    4. <think></think> blocks in message content (Hermes style)
    5. <|start_of_scratchpad|><|end_of_scratchpad|> blocks (alternative format)

    Args:
        response: The ChatCompletion response object from the API
        content: Optional message content string. If provided, will check for
            reasoning tag blocks in addition to API fields.

    Returns:
        Tuple of (reasoning_content, source) where:
        - reasoning_content: The extracted reasoning text, or None if not found
        - source: String indicating where reasoning was found:
          "reasoning_content", "reasoning_details", "reasoning", "think_block",
          "scratchpad_block", or "none"

    Example:
        completion = await server.chat_completion(messages=messages)
        message = completion.choices[0].message
        reasoning, source = extract_reasoning_from_response(
            completion.choices[0],
            content=message.content,
        )
        if reasoning:
            print(f"Found reasoning via {source}: {len(reasoning)} chars")
    """
    # Try reasoning_content field (some providers like certain OpenAI-compatible APIs)
    if hasattr(response, "reasoning_content") and response.reasoning_content:
        return response.reasoning_content, "reasoning_content"

    # Try message.reasoning_content if response is a Choice
    if hasattr(response, "message"):
        message = response.message
        if hasattr(message, "reasoning_content") and message.reasoning_content:
            return message.reasoning_content, "reasoning_content"
        if hasattr(message, "reasoning") and message.reasoning:
            return message.reasoning, "reasoning"

    # Try reasoning_details field (OpenRouter style)
    if hasattr(response, "reasoning_details") and response.reasoning_details:
        for detail in response.reasoning_details:
            if hasattr(detail, "text") and detail.text:
                return detail.text, "reasoning_details"
            # Some providers return dicts rather than attribute objects
            if isinstance(detail, dict) and detail.get("text"):
                return detail["text"], "reasoning_details"

    # Try message.reasoning_details if response is a Choice
    if hasattr(response, "message"):
        message = response.message
        if hasattr(message, "reasoning_details") and message.reasoning_details:
            for detail in message.reasoning_details:
                if hasattr(detail, "text") and detail.text:
                    return detail.text, "reasoning_details"
                if isinstance(detail, dict) and detail.get("text"):
                    return detail["text"], "reasoning_details"

    # Try reasoning field directly
    if hasattr(response, "reasoning") and response.reasoning:
        return response.reasoning, "reasoning"

    # Try <think> blocks in content (Hermes style)
    if content:
        match = THINK_CONTENT_INSIDE_PATTERN.search(content)
        if match:
            return match.group(1).strip(), "think_block"

    # Try <|start_of_scratchpad|> blocks in content (alternative reasoning format)
    if content:
        match = SCRATCHPAD_CONTENT_INSIDE_PATTERN.search(content)
        if match:
            return match.group(1).strip(), "scratchpad_block"

    return None, "none"
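Because the function only duck-types its input, it can be exercised with a stand-in object; a minimal sketch (SimpleNamespace is illustrative here, real Choice objects come from the API client):

    from types import SimpleNamespace

    from eval_helpers import extract_reasoning_from_response

    choice = SimpleNamespace(
        message=SimpleNamespace(
            reasoning_content=None,
            reasoning="step-by-step reasoning here",
            reasoning_details=None,
            content="Final answer: 42",
        )
    )
    reasoning, source = extract_reasoning_from_response(
        choice, content=choice.message.content
    )
    # reasoning == "step-by-step reasoning here", source == "reasoning"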
def extract_reasoning_from_completion(
    completion: Any,
    choice_idx: int = 0,
) -> Tuple[Optional[str], str, Optional[str]]:
    """
    Extract reasoning from a ChatCompletion object.

    Convenience wrapper around extract_reasoning_from_response that handles
    the common case of extracting from a ChatCompletion.

    Args:
        completion: The ChatCompletion response object
        choice_idx: Index of the choice to extract from (default 0)

    Returns:
        Tuple of (reasoning_content, source, message_content) where:
        - reasoning_content: The extracted reasoning text, or None
        - source: Where reasoning was found (see extract_reasoning_from_response)
        - message_content: The message content (for convenience)

    Example:
        completion = await server.chat_completion(messages=messages)
        reasoning, source, content = extract_reasoning_from_completion(completion)
    """
    if not completion or not completion.choices:
        return None, "none", None
    if choice_idx >= len(completion.choices):
        return None, "none", None

    choice = completion.choices[choice_idx]
    content = None
    if hasattr(choice, "message") and hasattr(choice.message, "content"):
        content = choice.message.content

    reasoning, source = extract_reasoning_from_response(choice, content)
    return reasoning, source, content
def get_reasoning_token_usage(completion: Any) -> Dict[str, Any]:
    """
    Extract reasoning token usage information from a ChatCompletion.

    This extracts token counts from the usage field, including reasoning-specific
    metrics when available (e.g., reasoning_tokens from OpenRouter/OpenAI).

    Known provider locations:
    - OpenAI: usage.completion_tokens_details.reasoning_tokens
    - OpenRouter (Claude, Hermes, DeepSeek, etc.): same location, plus provider/cost fields

    Args:
        completion: The ChatCompletion response object

    Returns:
        Dict with token usage info:
        - model: Model name used
        - completion_tokens: Total completion tokens
        - prompt_tokens: Input tokens
        - total_tokens: Total tokens used
        - reasoning_tokens: Reasoning/thinking tokens (if available)
        - cached_tokens: Cached prompt tokens (if available)
        - cost: API cost (if available, OpenRouter)
        - provider: Provider name (if available, OpenRouter)
        - has_reasoning_content: Whether the message contains a reasoning field

    Example:
        completion = await server.chat_completion(messages=messages)
        usage = get_reasoning_token_usage(completion)
        if config.full_debug:
            print(f" Reasoning tokens: {usage.get('reasoning_tokens', 'N/A')}")
    """
    result = {
        "model": None,
        "completion_tokens": None,
        "prompt_tokens": None,
        "total_tokens": None,
        "reasoning_tokens": None,
        "cached_tokens": None,
        "cost": None,
        "provider": None,
        "has_reasoning_content": False,
    }

    if not completion:
        return result

    # Extract model name
    if hasattr(completion, "model"):
        result["model"] = completion.model

    # Extract provider (OpenRouter includes this)
    if hasattr(completion, "provider"):
        result["provider"] = completion.provider

    # Check if the message has reasoning content
    if hasattr(completion, "choices") and completion.choices:
        msg = (
            completion.choices[0].message
            if hasattr(completion.choices[0], "message")
            else None
        )
        if msg:
            # Check for reasoning field (OpenRouter normalized field)
            if hasattr(msg, "reasoning") and msg.reasoning:
                result["has_reasoning_content"] = True
            # Check for reasoning_details (OpenRouter)
            elif hasattr(msg, "reasoning_details") and msg.reasoning_details:
                result["has_reasoning_content"] = True

    # Extract usage info
    if not hasattr(completion, "usage") or not completion.usage:
        return result

    usage = completion.usage
    result["completion_tokens"] = getattr(usage, "completion_tokens", None)
    result["prompt_tokens"] = getattr(usage, "prompt_tokens", None)
    result["total_tokens"] = getattr(usage, "total_tokens", None)

    # Extract cost (OpenRouter includes this)
    if hasattr(usage, "cost"):
        result["cost"] = usage.cost

    # Extract reasoning tokens from completion_tokens_details
    # (populated by OpenAI and by OpenRouter for Claude, Hermes, DeepSeek, etc.)
    if hasattr(usage, "completion_tokens_details") and usage.completion_tokens_details:
        details = usage.completion_tokens_details
        if hasattr(details, "reasoning_tokens"):
            result["reasoning_tokens"] = details.reasoning_tokens

    # Extract cached tokens from prompt_tokens_details (OpenRouter/OpenAI)
    if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
        details = usage.prompt_tokens_details
        if hasattr(details, "cached_tokens"):
            result["cached_tokens"] = details.cached_tokens

    return result
def format_reasoning_debug_info(
    completion: Any, reasoning_content: Optional[str] = None
) -> str:
    """
    Format reasoning debug information for logging.

    Use this in evals when full_debug is enabled to show reasoning token usage.

    Args:
        completion: The ChatCompletion response object
        reasoning_content: Optional pre-extracted reasoning content

    Returns:
        Formatted string with reasoning debug info

    Example:
        if self.config.full_debug:
            print(format_reasoning_debug_info(completion))
    """
    usage = get_reasoning_token_usage(completion)
    lines = [" [Reasoning/Token Debug Info]"]

    # Model and provider info
    if usage["model"]:
        lines.append(f" Model: {usage['model']}")
    if usage["provider"]:
        lines.append(f" Provider: {usage['provider']}")

    # Token counts
    if usage["prompt_tokens"] is not None:
        prompt_info = f" Prompt tokens: {usage['prompt_tokens']}"
        if usage["cached_tokens"]:
            prompt_info += f" (cached: {usage['cached_tokens']})"
        lines.append(prompt_info)
    if usage["completion_tokens"] is not None:
        lines.append(f" Completion tokens: {usage['completion_tokens']}")

    # Reasoning-specific info
    if usage["reasoning_tokens"] is not None:
        lines.append(f" Reasoning tokens: {usage['reasoning_tokens']}")
        if usage["completion_tokens"] and usage["completion_tokens"] > 0:
            pct = (usage["reasoning_tokens"] / usage["completion_tokens"]) * 100
            lines.append(f" Reasoning %: {pct:.1f}%")
    if usage["has_reasoning_content"]:
        lines.append(" Has reasoning content: Yes")

    # Cost info
    if usage["cost"] is not None:
        lines.append(f" Cost: ${usage['cost']:.6f}")

    # Total
    if usage["total_tokens"] is not None:
        lines.append(f" Total tokens: {usage['total_tokens']}")

    # Reasoning content length if provided
    if reasoning_content:
        lines.append(f" Reasoning content length: {len(reasoning_content)} chars")

    return "\n".join(lines)
# Fallback regex patterns for MCQA when answer tags don't work
@@ -677,11 +1069,13 @@ def extract_first_boxed_answer(
    Extract the first \\boxed{} answer from a response.

    Follows the rule: only accept if there's exactly ONE boxed answer
    after the reasoning tags (if thinking mode). Multiple boxed answers = failure.
    Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.

    Args:
        response: The model's full response
        after_think: Whether to only look after reasoning tags
        debug: Whether to print debug information

    Returns:
@@ -689,13 +1083,18 @@
    """
    # Get content to search
    if after_think:
        # Try to extract content after </think> first
        match = THINK_CONTENT_AFTER_PATTERN.search(response)
        if match:
            search_content = match.group(1)
        else:
            # Try <|end_of_scratchpad|> tags
            match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
            if match:
                search_content = match.group(1)
            else:
                # No reasoning tags, use full response
                search_content = response
    else:
        search_content = response
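Per the docstring rule, a single boxed answer after the tags is accepted and multiple are rejected; a behavior sketch (the return values follow the documented contract, which is implemented in the remainder of the function not shown in this hunk):

    from eval_helpers import extract_first_boxed_answer

    extract_first_boxed_answer(r"<think>...</think> \boxed{42}")            # -> "42"
    extract_first_boxed_answer(r"<think>...</think> \boxed{1} \boxed{2}")   # -> None (multiple)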
@@ -951,13 +1350,18 @@ def score_math_answer(
    Returns:
        Tuple of (is_correct or None, method_used, has_multiple_boxed)
    """
    # Get content to score (check for both think and scratchpad tags)
    if after_think:
        match = THINK_CONTENT_AFTER_PATTERN.search(response)
        if match:
            score_content = match.group(1)
        else:
            # Try scratchpad tags
            match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
            if match:
                score_content = match.group(1)
            else:
                score_content = response
    else:
        score_content = response
@@ -1036,13 +1440,18 @@ async def score_math_answer_async(
    """
    import asyncio

    # Get content to score (check for both think and scratchpad tags)
    if after_think:
        match = THINK_CONTENT_AFTER_PATTERN.search(response)
        if match:
            score_content = match.group(1)
        else:
            # Try scratchpad tags
            match = SCRATCHPAD_CONTENT_AFTER_PATTERN.search(response)
            if match:
                score_content = match.group(1)
            else:
                score_content = response
    else:
        score_content = response


@@ -34,7 +34,10 @@ from typing import Any, Dict, List, Optional, Tuple
from datasets import load_dataset
from eval_helpers import (
    create_system_content,
    extract_reasoning_from_completion,
    format_reasoning_debug_info,
    get_default_thinking_prompt,
    validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -228,8 +231,13 @@ class IFEvalEnv(BaseEnv):
        print(f" Max tokens: {self.config.eval_max_tokens}")
        print(f" Evaluation split: {self.config.eval_split}")
        print(f" Thinking mode: {self.config.thinking_mode}")
        print(f" Reasoning effort: {self.config.reasoning_effort}")
        if self.config.thinking_mode:
            thinking_prompt = self._get_thinking_prompt()
            if thinking_prompt:
                print(f" Thinking prompt: {thinking_prompt[:100]}...")
            else:
                print(" Thinking prompt: None (using API reasoning mode only)")

        # Load IFEval dataset
        try:
@@ -269,29 +277,46 @@
        self.iter = 0

    def _validate_thinking_format(self, response: str) -> Tuple[bool, str]:
        """
        Validate thinking format and extract content after reasoning tags.

        Supports both <think></think> and <|start_of_scratchpad|><|end_of_scratchpad|> formats.
        """
        return validate_thinking_format(response, self.config.thinking_mode)

    def _extract_thinking_content(self, response: str) -> Optional[str]:
        """Extract the content inside <think></think> tags (legacy method)."""
        match = self._thinking_extract_pattern.search(response)
        if match:
            return match.group(1).strip()
        return None

    def _extract_reasoning_content(
        self, completion: Any, model_response: str
    ) -> Tuple[Optional[str], str]:
        """
        Extract reasoning content from completion using multiple methods.

        This handles different reasoning formats from various providers:
        1. reasoning_content field (OpenAI reasoning models, some providers)
        2. reasoning_details[].text field (OpenRouter style)
        3. reasoning field on message
        4. <think></think> blocks in message content (Hermes style)
        5. <|start_of_scratchpad|><|end_of_scratchpad|> blocks

        Args:
            completion: The ChatCompletion response object
            model_response: The message content string

        Returns:
            Tuple of (reasoning_content, source) where source indicates
            where reasoning was found: "reasoning_content", "reasoning_details",
            "reasoning", "think_block", "scratchpad_block", or "none"
        """
        # Use the comprehensive extraction from eval_helpers
        reasoning, source, _ = extract_reasoning_from_completion(completion)
        return reasoning, source

    def _preprocess_response(self, response: str) -> List[str]:
        """
        Preprocess response for loose evaluation.
@@ -458,7 +483,32 @@
                if self.config.eval_max_tokens > 0:
                    completion_kwargs["max_tokens"] = self.config.eval_max_tokens

                if self.config.full_debug:
                    print(
                        f"\n [API Call] Sending request (attempt {attempt + 1})..."
                    )
                    print(
                        f" Temperature: {completion_kwargs.get('temperature')}"
                    )
                    print(
                        f" Max tokens: {completion_kwargs.get('max_tokens', 'not set (unlimited)')}"
                    )
                    print(f" Thinking mode: {self.config.thinking_mode}")
                    print(f" Reasoning effort: {self.config.reasoning_effort}")
                    # Show extra_body that will be injected by ServerManager
                    if self.config.thinking_mode or self.config.reasoning_effort:
                        print(
                            " (ServerManager will inject reasoning extra_body)"
                        )

                _api_start = time.time()
                completion = await self.server.chat_completion(**completion_kwargs)
                _api_elapsed = time.time() - _api_start

                # Log reasoning token usage if full_debug is enabled
                if self.config.full_debug and completion:
                    print(f" [API Response] Received in {_api_elapsed:.2f}s")
                    print(format_reasoning_debug_info(completion))

                if completion.choices and completion.choices[0].message.content:
                    model_response = completion.choices[0].message.content
@@ -496,15 +546,22 @@
        if not model_response:
            return {"result": None, "sample": None}

        # Handle thinking mode - extract content after reasoning tags for evaluation
        thinking_format_valid, response_for_eval = self._validate_thinking_format(
            model_response
        )

        # Extract reasoning content using the comprehensive method.
        # This handles multiple formats: reasoning_content field, reasoning_details,
        # reasoning field, <think></think> blocks, and <|start_of_scratchpad|> blocks.
        # Always extract, regardless of thinking_mode, since API reasoning may be available.
        thinking_content, reasoning_source = self._extract_reasoning_content(
            completion, model_response
        )
        if self.config.full_debug and thinking_content:
            print(f" [Reasoning] Found via: {reasoning_source}")
            print(f" [Reasoning] Length: {len(thinking_content)} chars")

        # Check instructions
        check_result = self._check_instructions(
@@ -541,6 +598,7 @@
            if thinking_content and len(thinking_content) > 500
            else thinking_content
        )
        sample["reasoning_source"] = reasoning_source

        if self.config.full_debug:
            strict_status = "✓" if check_result["prompt_level_strict"] else "✗"
@@ -569,6 +627,7 @@
        print(f" Total prompts: {len(self.all_eval_items)}")
        print(f" Max tokens: {self.config.eval_max_tokens}")
        print(f" Thinking mode: {self.config.thinking_mode}")
        print(f" Reasoning effort: {self.config.reasoning_effort}")
        print(f"{'='*60}\n")

        try:
@@ -648,6 +707,15 @@
        if self.config.thinking_mode:
            thinking_utilization = sum(1 for s in samples if s.get("thinking_content"))

        # Reasoning source statistics (tracks where reasoning was extracted from)
        reasoning_sources = {}
        if self.config.thinking_mode:
            for sample in samples:
                source = sample.get("reasoning_source", "none")
                if source not in reasoning_sources:
                    reasoning_sources[source] = 0
                reasoning_sources[source] += 1
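        # Note: the manual tally above is equivalent to collections.Counter;
        # an illustrative alternative (a sketch, not part of the original change):
        #
        #     from collections import Counter
        #
        #     if self.config.thinking_mode:
        #         reasoning_sources = dict(
        #             Counter(s.get("reasoning_source", "none") for s in samples)
        #         )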
        # Build metrics dictionary
        eval_metrics = {
            "eval/prompt_level_strict_acc": prompt_strict_acc,
@@ -693,6 +761,14 @@
        if self.config.thinking_mode:
            print(f"Thinking Format Compliance: {thinking_format_compliance_rate:.4f}")
            print(f"Thinking Utilization: {thinking_utilization}/{total_count}")

        # Print reasoning source breakdown if thinking mode is enabled
        if self.config.thinking_mode and reasoning_sources:
            print("\nReasoning Source Breakdown:")
            for source, count in sorted(reasoning_sources.items(), key=lambda x: -x[1]):
                pct = (count / total_count) * 100 if total_count > 0 else 0
                print(f" {source}: {count} ({pct:.1f}%)")

        print(f"{'='*60}\n")

        # Log evaluation results
@@ -706,6 +782,7 @@
                    "temperature": self.config.eval_temperature,
                    "max_tokens": self.config.eval_max_tokens,
                    "thinking_mode": self.config.thinking_mode,
                    "reasoning_effort": self.config.reasoning_effort,
                },
            )
        except Exception as e: