Add support for reasoning models and their variety of providers/endpoints

2026-04-19 12:57:58 +00:00 · 2025-12-30 00:23:00 +00:00 · 2025-12-30 00:23:00 +00:00 · 62fa51240c
commit 62fa51240c
parent 1c306d3b17
6 changed files with 1551 additions and 16 deletions
--- a/environments/eval_environments/eval_helpers.py
+++ b/environments/eval_environments/eval_helpers.py
@ -12,6 +12,7 @@ Includes:
 - Math answer verification (using math_verify library)
 - System prompt creation
 - Results saving utilities
+- Reasoning content extraction from various API response formats
 """

 import json
@ -19,7 +20,51 @@ import os
 import re
 from concurrent.futures import ProcessPoolExecutor
 from string import ascii_uppercase
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+
+# =============================================================================
+# REASONING/THINKING PROMPTS
+# =============================================================================
+# Standard prompts for triggering reasoning mode in various models.
+# These are NOT automatically injected - use explicitly when desired.
+
+HERMES_REASONING_PROMPT = (
+    "You are a deep thinking AI, you may use extremely long chains of thought "
+    "to deeply consider the problem and deliberate with yourself via systematic "
+    "reasoning processes to help come to a correct solution prior to answering. "
+    "You should enclose your thoughts and internal monologue inside <think> </think> "
+    "tags, and then provide your solution or response to the problem."
+)
+"""
+Standard reasoning prompt for Hermes models.
+
+Asks the model to reason at length inside explicit <think></think> tags before
+giving its response, so the reasoning is visible in the message content.
+
+Example usage:
+    from eval_helpers import HERMES_REASONING_PROMPT
+
+    messages = [
+        {"role": "system", "content": HERMES_REASONING_PROMPT},
+        {"role": "user", "content": question},
+    ]
+"""
+
+# Built on top of HERMES_REASONING_PROMPT so the shared wording lives in one place.
+HERMES_REASONING_PROMPT_WITH_ANSWER = (
+    HERMES_REASONING_PROMPT
+    + " After your thinking, provide your final answer inside <answer></answer> tags."
+)
+"""
+Hermes reasoning prompt plus an instruction to wrap the final answer in
+<answer></answer> tags.
+
+Use this when the final answer must be clearly separated from the reasoning.
+"""

 # Try to import math_verify libraries (optional dependency for math evals)
 try:
@ -462,25 +507,330 @@ def extract_thinking_content(response: str) -> Optional[str]:
    return None


-def get_default_thinking_prompt(custom_prompt: Optional[str] = None) -> str:
+def get_default_thinking_prompt(custom_prompt: Optional[str] = None) -> Optional[str]:
    """
    Get the thinking system prompt.

+    By default, returns None (no prompt injection). Pass a custom prompt or use
+    HERMES_REASONING_PROMPT explicitly if you want reasoning prompt injection.
+    An empty string counts as "no prompt" and is normalized to None, so callers
+    only ever need to test the result against None.
+
    Args:
-        custom_prompt: Optional custom thinking prompt to use instead of default
+        custom_prompt: Optional custom thinking prompt to use. If None or empty,
+                      None is returned. Use HERMES_REASONING_PROMPT for the
+                      standard Hermes prompt.

    Returns:
-        The thinking prompt string
+        The thinking prompt string, or None if no prompt was specified.
+
+    Example:
+        # No prompt injection (default):
+        prompt = get_default_thinking_prompt()  # Returns None
+
+        # Use Hermes reasoning prompt:
+        from eval_helpers import HERMES_REASONING_PROMPT
+        prompt = get_default_thinking_prompt(HERMES_REASONING_PROMPT)
    """
-    if custom_prompt:
-        return custom_prompt
+    # Falsy input ("" as well as None) means "no prompt injection"; this keeps the
+    # truthiness semantics of the check this function used before.
+    return custom_prompt or None

-    return (
-        "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
-        "problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
-        "solution prior to answering. You should enclose your thoughts and internal monologue inside <think> "
-        "</think> tags, and then provide your solution or response to the problem."
-    )
+
+def get_thinking_prompt_or_hermes(custom_prompt: Optional[str] = None) -> str:
+    """
+    Return the caller's thinking prompt, falling back to HERMES_REASONING_PROMPT.
+
+    Use this when a thinking prompt must always be present.
+
+    Args:
+        custom_prompt: Optional custom thinking prompt. When it is None or an
+                      empty string, HERMES_REASONING_PROMPT is used instead.
+
+    Returns:
+        The thinking prompt string (never None).
+    """
+    if custom_prompt:
+        return custom_prompt
+    return HERMES_REASONING_PROMPT
+
+
+# =============================================================================
+# REASONING CONTENT EXTRACTION
+# =============================================================================
+# Functions for extracting reasoning content from various API response formats.
+# Different providers return reasoning in different ways:
+# - OpenRouter/Nebius: reasoning_details[].text or reasoning_content field
+# - Some providers: reasoning field in message
+# - Hermes/others: <think></think> blocks in message content
+
+
+def _first_reasoning_detail_text(details: Any) -> Optional[str]:
+    """Return the first non-empty ``text`` found in a reasoning_details sequence, or None."""
+    for detail in details or ():
+        # Entries are read by attribute first (SDK-style objects) ...
+        text = getattr(detail, "text", None)
+        # ... and by key when the entry is a plain dict.
+        if not text and isinstance(detail, dict):
+            text = detail.get("text")
+        if text:
+            return text
+    return None
+
+
+def extract_reasoning_from_response(
+    response: Any,
+    content: Optional[str] = None,
+) -> Tuple[Optional[str], str]:
+    """
+    Extract reasoning content from various API response formats.
+
+    Sources are tried in this order and the first non-empty one wins:
+    1. response.reasoning_content
+    2. response.message.reasoning_content
+    3. response.message.reasoning
+    4. response.reasoning_details[].text (first entry with non-empty text)
+    5. response.message.reasoning_details[].text (first entry with non-empty text)
+    6. response.reasoning
+    7. <think></think> block in ``content`` (Hermes style)
+
+    Entries of reasoning_details may be objects with a ``text`` attribute or
+    dicts with a ``"text"`` key. Only the first non-empty entry is returned;
+    later entries are not concatenated.
+
+    Args:
+        response: A Choice (object with a ``message`` attribute) or a message-like
+                 object that carries the reasoning fields directly.
+        content: Optional message content string. If provided, it is searched
+                for a <think> block when none of the API fields held reasoning.
+
+    Returns:
+        Tuple of (reasoning_content, source) where:
+        - reasoning_content: The extracted reasoning text, or None if not found
+        - source: String indicating where reasoning was found:
+          "reasoning_content", "reasoning_details", "reasoning", "think_block", "none"
+
+    Example:
+        completion = await server.chat_completion(messages=messages)
+        message = completion.choices[0].message
+        reasoning, source = extract_reasoning_from_response(
+            completion.choices[0],
+            content=message.content
+        )
+        if reasoning:
+            print(f"Found reasoning via {source}: {len(reasoning)} chars")
+    """
+    # getattr with a None default also covers a missing or None message.
+    message = getattr(response, "message", None)
+
+    # Dedicated reasoning_content field, on the object itself or on its message
+    for holder in (response, message):
+        reasoning_content = getattr(holder, "reasoning_content", None)
+        if reasoning_content:
+            return reasoning_content, "reasoning_content"
+
+    # Message-level reasoning field takes precedence over any reasoning_details
+    message_reasoning = getattr(message, "reasoning", None)
+    if message_reasoning:
+        return message_reasoning, "reasoning"
+
+    # reasoning_details list (OpenRouter style), on the object itself or on its message
+    for holder in (response, message):
+        detail_text = _first_reasoning_detail_text(getattr(holder, "reasoning_details", None))
+        if detail_text:
+            return detail_text, "reasoning_details"
+
+    # reasoning field directly on the passed object
+    direct_reasoning = getattr(response, "reasoning", None)
+    if direct_reasoning:
+        return direct_reasoning, "reasoning"
+
+    # Last resort: <think> block embedded in the content (Hermes style)
+    if content:
+        match = THINK_CONTENT_INSIDE_PATTERN.search(content)
+        if match:
+            return match.group(1).strip(), "think_block"
+
+    return None, "none"
+
+
+def extract_reasoning_from_completion(
+    completion: Any,
+    choice_idx: int = 0,
+) -> Tuple[Optional[str], str, Optional[str]]:
+    """
+    Extract reasoning from a ChatCompletion object.
+
+    Convenience wrapper around extract_reasoning_from_response that picks one
+    choice, reads its message content, and runs the extraction on it.
+
+    Args:
+        completion: The ChatCompletion response object
+        choice_idx: Index of the choice to extract from (default 0). Negative
+                   indices count from the end, as with normal list indexing.
+
+    Returns:
+        Tuple of (reasoning_content, source, message_content) where:
+        - reasoning_content: The extracted reasoning text, or None
+        - source: Where reasoning was found (see extract_reasoning_from_response)
+        - message_content: The message content (for convenience)
+        (None, "none", None) is returned when the completion is empty, has no
+        choices, or choice_idx is out of range in either direction.
+
+    Example:
+        completion = await server.chat_completion(messages=messages)
+        reasoning, source, content = extract_reasoning_from_completion(completion)
+    """
+    nothing_found = (None, "none", None)
+
+    if not completion:
+        return nothing_found
+
+    # Tolerate objects without a choices attribute, matching get_reasoning_token_usage
+    choices = getattr(completion, "choices", None)
+    if not choices:
+        return nothing_found
+
+    # Reject indices that would raise IndexError, negative ones included
+    if not -len(choices) <= choice_idx < len(choices):
+        return nothing_found
+
+    choice = choices[choice_idx]
+    content = getattr(getattr(choice, "message", None), "content", None)
+
+    reasoning, source = extract_reasoning_from_response(choice, content)
+    return reasoning, source, content
+
+
+def get_reasoning_token_usage(completion: Any) -> Dict[str, Any]:
+    """
+    Extract reasoning token usage information from a ChatCompletion.
+
+    Reads token counts from the usage field, including reasoning-specific
+    metrics when the provider reports them. Fields that are absent from the
+    response are left as None.
+
+    Locations read:
+    - usage.completion_tokens_details.reasoning_tokens (OpenAI / OpenRouter style)
+    - usage.prompt_tokens_details.cached_tokens (OpenAI / OpenRouter style)
+    - usage.cost and completion.provider (OpenRouter style)
+
+    Args:
+        completion: The ChatCompletion response object
+
+    Returns:
+        Dict with token usage info:
+        - model: Model name used
+        - completion_tokens: Total completion tokens
+        - prompt_tokens: Input tokens
+        - total_tokens: Total tokens used
+        - reasoning_tokens: Reasoning/thinking tokens (if available)
+        - cached_tokens: Cached prompt tokens (if available)
+        - cost: API cost (if available, OpenRouter)
+        - provider: Provider name (if available, OpenRouter)
+        - has_reasoning_content: Whether the first choice's message carries a
+          non-empty reasoning, reasoning_content or reasoning_details field
+
+    Example:
+        completion = await server.chat_completion(messages=messages)
+        usage = get_reasoning_token_usage(completion)
+        if config.full_debug:
+            print(f"  Reasoning tokens: {usage.get('reasoning_tokens', 'N/A')}")
+    """
+    result: Dict[str, Any] = {
+        "model": None,
+        "completion_tokens": None,
+        "prompt_tokens": None,
+        "total_tokens": None,
+        "reasoning_tokens": None,
+        "cached_tokens": None,
+        "cost": None,
+        "provider": None,
+        "has_reasoning_content": False,
+    }
+
+    if not completion:
+        return result
+
+    result["model"] = getattr(completion, "model", None)
+    # Provider is an OpenRouter addition; absent elsewhere
+    result["provider"] = getattr(completion, "provider", None)
+
+    # Only the first choice is inspected for reasoning fields
+    choices = getattr(completion, "choices", None)
+    if choices:
+        msg = getattr(choices[0], "message", None)
+        # Same message-level fields that extract_reasoning_from_response reads,
+        # so the flag agrees with what the extractor can actually return.
+        result["has_reasoning_content"] = any(
+            getattr(msg, field, None)
+            for field in ("reasoning_content", "reasoning", "reasoning_details")
+        )
+
+    usage = getattr(completion, "usage", None)
+    if not usage:
+        return result
+
+    result["completion_tokens"] = getattr(usage, "completion_tokens", None)
+    result["prompt_tokens"] = getattr(usage, "prompt_tokens", None)
+    result["total_tokens"] = getattr(usage, "total_tokens", None)
+
+    # Cost is an OpenRouter addition; absent elsewhere
+    result["cost"] = getattr(usage, "cost", None)
+
+    # Reasoning tokens live under usage.completion_tokens_details
+    completion_details = getattr(usage, "completion_tokens_details", None)
+    if completion_details:
+        result["reasoning_tokens"] = getattr(completion_details, "reasoning_tokens", None)
+
+    # Cached prompt tokens live under usage.prompt_tokens_details
+    prompt_details = getattr(usage, "prompt_tokens_details", None)
+    if prompt_details:
+        result["cached_tokens"] = getattr(prompt_details, "cached_tokens", None)
+
+    return result
+
+
+def format_reasoning_debug_info(completion: Any, reasoning_content: Optional[str] = None) -> str:
+    """
+    Format reasoning debug information for logging.
+
+    Use this in evals when full_debug is enabled to show reasoning token usage.
+    Only fields that get_reasoning_token_usage could populate are printed.
+    Values that cannot be formatted numerically are printed as-is so that a
+    debug print never raises.
+
+    Args:
+        completion: The ChatCompletion response object
+        reasoning_content: Optional pre-extracted reasoning content; when given,
+                          its length in characters is appended
+
+    Returns:
+        Formatted multi-line string with reasoning debug info
+
+    Example:
+        if self.config.full_debug:
+            print(format_reasoning_debug_info(completion))
+    """
+    usage = get_reasoning_token_usage(completion)
+
+    lines = ["  [Reasoning/Token Debug Info]"]
+
+    # Model and provider info
+    if usage["model"]:
+        lines.append(f"    Model: {usage['model']}")
+    if usage["provider"]:
+        lines.append(f"    Provider: {usage['provider']}")
+
+    # Token counts
+    if usage["prompt_tokens"] is not None:
+        prompt_info = f"    Prompt tokens: {usage['prompt_tokens']}"
+        if usage["cached_tokens"]:
+            prompt_info += f" (cached: {usage['cached_tokens']})"
+        lines.append(prompt_info)
+
+    if usage["completion_tokens"] is not None:
+        lines.append(f"    Completion tokens: {usage['completion_tokens']}")
+
+    # Reasoning-specific info
+    if usage["reasoning_tokens"] is not None:
+        lines.append(f"    Reasoning tokens: {usage['reasoning_tokens']}")
+        # Truthiness check also protects against division by zero
+        if usage["completion_tokens"] and usage["completion_tokens"] > 0:
+            try:
+                pct = (usage["reasoning_tokens"] / usage["completion_tokens"]) * 100
+            except TypeError:
+                # Non-numeric token counts: skip the percentage rather than crash
+                pct = None
+            if pct is not None:
+                lines.append(f"    Reasoning %: {pct:.1f}%")
+
+    if usage["has_reasoning_content"]:
+        lines.append("    Has reasoning content: Yes")
+
+    # Cost info
+    cost = usage["cost"]
+    if cost is not None:
+        try:
+            lines.append(f"    Cost: ${cost:.6f}")
+        except (TypeError, ValueError):
+            # Cost reported in a non-numeric form (e.g. a string): show it verbatim
+            lines.append(f"    Cost: ${cost}")
+
+    # Total
+    if usage["total_tokens"] is not None:
+        lines.append(f"    Total tokens: {usage['total_tokens']}")
+
+    # Reasoning content length if provided
+    if reasoning_content:
+        lines.append(f"    Reasoning content length: {len(reasoning_content)} chars")
+
+    return "\n".join(lines)


 # Fallback regex patterns for MCQA when answer tags don't work