Add support for reasoning models and their variety of providers/endpoints

2026-04-19 12:57:58 +00:00 · 2025-12-30 00:23:00 +00:00 · 2025-12-30 00:23:00 +00:00 · 62fa51240c
commit 62fa51240c
parent 1c306d3b17
6 changed files with 1551 additions and 16 deletions
--- a/atroposlib/tests/test_reasoning_models.py
+++ b/atroposlib/tests/test_reasoning_models.py
@ -0,0 +1,920 @@
+"""
+Test file for reasoning model support across multiple providers.
+
+This test validates:
+1. ReasoningConfig builds correct extra_body for different providers
+2. Reasoning can be enabled and responses contain reasoning content
+3. Reasoning extraction works for various API response formats
+
+Providers tested:
+- OpenAI (gpt-5.2) - Uses reasoning_effort at top level
+- OpenRouter (anthropic/claude-opus-4.5, nousresearch/hermes-4-70B, deepseek/deepseek-v3.2)
+  - Uses nested reasoning object with enabled/effort/max_tokens
+
+Usage:
+    python -m pytest atroposlib/tests/test_reasoning_models.py -v
+    
+    Or run directly:
+    python atroposlib/tests/test_reasoning_models.py
+
+Note: This test requires valid API keys. Set them as environment variables or
+modify the constants below for testing.
+"""
+
+import asyncio
+import json
+import os
+import sys
+from datetime import datetime
+from typing import Optional
+
+# Add the project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+
+import openai
+
+from atroposlib.envs.server_handling.server_baseline import (
+    ReasoningConfig,
+    VALID_REASONING_EFFORTS,
+)
+from environments.eval_environments.eval_helpers import (
+    HERMES_REASONING_PROMPT,
+    HERMES_REASONING_PROMPT_WITH_ANSWER,
+    extract_reasoning_from_response,
+    extract_reasoning_from_completion,
+)
+
+
+# =============================================================================
+# API CONFIGURATION
+# =============================================================================
+# These are test credentials. For production, use environment variables.
+
+# API keys must be set via environment variables
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
+OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
+
+OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
+OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
+
+# Models to test on OpenRouter
+OPENROUTER_MODELS = [
+    "anthropic/claude-opus-4.5",
+    "nousresearch/hermes-4-70b",
+    "deepseek/deepseek-v3.2",
+]
+
+# Test prompt that should trigger reasoning
+TEST_PROMPT = "What is 15 * 23? Think step by step before giving your answer."
+
+# Log file for full ChatCompletion objects
+LOG_FILE = os.path.join(
+    os.path.dirname(__file__), 
+    "reasoning_test_results.log"
+)
+
+
+def log_to_file(message: str):
+    """Append message to log file."""
+    with open(LOG_FILE, "a") as f:
+        f.write(message + "\n")
+
+
+# =============================================================================
+# UNIT TESTS FOR ReasoningConfig
+# =============================================================================
+
+def test_reasoning_config_default():
+    """Test default ReasoningConfig is not active."""
+    config = ReasoningConfig()
+    assert not config.enabled
+    assert config.effort is None
+    assert config.max_tokens is None
+    assert not config.is_active()
+    assert config.build_extra_body() is None
+    print("✓ Default ReasoningConfig is inactive")
+
+
+def test_reasoning_config_enabled_only():
+    """Test ReasoningConfig with only enabled=True."""
+    config = ReasoningConfig(enabled=True)
+    assert config.enabled
+    assert config.is_active()
+    
+    # Test for non-OpenAI provider
+    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
+    assert extra_body == {"reasoning": {"enabled": True}}
+    
+    # Test for OpenAI provider
+    extra_body = config.build_extra_body("https://api.openai.com/v1")
+    assert extra_body == {"reasoning_effort": "medium"}
+    print("✓ ReasoningConfig with enabled=True works correctly")
+
+
+def test_reasoning_config_with_effort():
+    """Test ReasoningConfig with effort specified."""
+    config = ReasoningConfig(effort="high")
+    assert config.enabled  # Should be auto-enabled
+    assert config.effort == "high"
+    assert config.is_active()
+    
+    # Test for non-OpenAI provider
+    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
+    assert extra_body == {"reasoning": {"enabled": True, "effort": "high"}}
+    
+    # Test for OpenAI provider
+    extra_body = config.build_extra_body("https://api.openai.com/v1")
+    assert extra_body == {"reasoning_effort": "high"}
+    print("✓ ReasoningConfig with effort works correctly")
+
+
+def test_reasoning_config_with_max_tokens():
+    """Test ReasoningConfig with max_tokens specified."""
+    config = ReasoningConfig(max_tokens=4096)
+    assert config.enabled  # Should be auto-enabled
+    assert config.max_tokens == 4096
+    assert config.is_active()
+    
+    # Test for non-OpenAI provider
+    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
+    assert extra_body == {"reasoning": {"enabled": True, "max_tokens": 4096}}
+    
+    # Test for OpenAI provider (max_tokens not supported, falls back to medium)
+    extra_body = config.build_extra_body("https://api.openai.com/v1")
+    assert extra_body == {"reasoning_effort": "medium"}
+    print("✓ ReasoningConfig with max_tokens works correctly")
+
+
+def test_reasoning_config_full():
+    """Test ReasoningConfig with all options."""
+    config = ReasoningConfig(enabled=True, effort="xhigh", max_tokens=8192)
+    assert config.enabled
+    assert config.effort == "xhigh"
+    assert config.max_tokens == 8192
+    
+    # Test for non-OpenAI provider
+    # Note: OpenRouter only allows ONE of effort or max_tokens
+    # When both are set, effort takes priority
+    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
+    assert extra_body == {
+        "reasoning": {
+            "enabled": True,
+            "effort": "xhigh",
+            # max_tokens is NOT included when effort is specified (OpenRouter limitation)
+        }
+    }
+    
+    # Test for OpenAI provider (xhigh maps to high)
+    extra_body = config.build_extra_body("https://api.openai.com/v1")
+    assert extra_body == {"reasoning_effort": "high"}
+    print("✓ ReasoningConfig with full options works correctly")
+
+
+def test_reasoning_config_effort_mapping():
+    """Test that effort levels are correctly mapped for OpenAI."""
+    mappings = {
+        "none": "low",
+        "minimal": "low",
+        "low": "low",
+        "medium": "medium",
+        "high": "high",
+        "xhigh": "high",
+    }
+    
+    for our_effort, expected_openai in mappings.items():
+        config = ReasoningConfig(effort=our_effort)
+        extra_body = config.build_extra_body("https://api.openai.com/v1")
+        assert extra_body["reasoning_effort"] == expected_openai, \
+            f"Expected {our_effort} to map to {expected_openai}, got {extra_body}"
+    print("✓ Effort level mapping for OpenAI works correctly")
+
+
+def test_reasoning_config_invalid_effort():
+    """Test that invalid effort raises ValueError."""
+    try:
+        config = ReasoningConfig(effort="invalid")
+        assert False, "Should have raised ValueError"
+    except ValueError as e:
+        assert "Invalid reasoning_effort" in str(e)
+    print("✓ Invalid effort raises ValueError")
+
+
+def test_reasoning_config_invalid_max_tokens():
+    """Test that invalid max_tokens raises ValueError."""
+    # Too low
+    try:
+        config = ReasoningConfig(max_tokens=500)
+        assert False, "Should have raised ValueError for too low"
+    except ValueError as e:
+        assert "must be between 1024 and 32000" in str(e)
+    
+    # Too high
+    try:
+        config = ReasoningConfig(max_tokens=50000)
+        assert False, "Should have raised ValueError for too high"
+    except ValueError as e:
+        assert "must be between 1024 and 32000" in str(e)
+    print("✓ Invalid max_tokens raises ValueError")
+
+
+def test_hermes_prompts_defined():
+    """Test that Hermes prompts are properly defined."""
+    assert HERMES_REASONING_PROMPT is not None
+    assert "<think>" in HERMES_REASONING_PROMPT
+    assert "</think>" in HERMES_REASONING_PROMPT
+    
+    assert HERMES_REASONING_PROMPT_WITH_ANSWER is not None
+    assert "<answer>" in HERMES_REASONING_PROMPT_WITH_ANSWER
+    print("✓ Hermes prompts are properly defined")
+
+
+# =============================================================================
+# SERVER MANAGER INTEGRATION TESTS
+# =============================================================================
+
+def test_reasoning_config_from_env_config():
+    """Test ReasoningConfig.from_env_config() creates correct config."""
+    from atroposlib.envs.base import BaseEnvConfig
+    
+    # Test with thinking_mode only
+    env_config = BaseEnvConfig(
+        tokenizer_name="gpt2",
+        group_name="test",
+        run_name="test",
+        thinking_mode=True,
+    )
+    reasoning_config = ReasoningConfig.from_env_config(env_config)
+    assert reasoning_config.enabled == True
+    assert reasoning_config.effort is None
+    assert reasoning_config.max_tokens is None
+    print("✓ ReasoningConfig.from_env_config with thinking_mode=True works")
+    
+    # Test with reasoning_effort
+    env_config = BaseEnvConfig(
+        tokenizer_name="gpt2",
+        group_name="test",
+        run_name="test",
+        reasoning_effort="high",
+    )
+    reasoning_config = ReasoningConfig.from_env_config(env_config)
+    assert reasoning_config.enabled == True  # Auto-enabled because effort is set
+    assert reasoning_config.effort == "high"
+    print("✓ ReasoningConfig.from_env_config with reasoning_effort works")
+    
+    # Test with max_reasoning_tokens
+    env_config = BaseEnvConfig(
+        tokenizer_name="gpt2",
+        group_name="test",
+        run_name="test",
+        max_reasoning_tokens=8000,
+    )
+    reasoning_config = ReasoningConfig.from_env_config(env_config)
+    assert reasoning_config.enabled == True  # Auto-enabled because max_tokens is set
+    assert reasoning_config.max_tokens == 8000
+    print("✓ ReasoningConfig.from_env_config with max_reasoning_tokens works")
+    
+    # Test with all disabled (default)
+    env_config = BaseEnvConfig(
+        tokenizer_name="gpt2",
+        group_name="test",
+        run_name="test",
+    )
+    reasoning_config = ReasoningConfig.from_env_config(env_config)
+    assert reasoning_config.enabled == False
+    assert not reasoning_config.is_active()
+    print("✓ ReasoningConfig.from_env_config with defaults (disabled) works")
+
+
+def test_server_manager_builds_extra_body():
+    """Test ServerManager._build_extra_body() injects correct extra_body."""
+    from unittest.mock import MagicMock
+    from atroposlib.envs.server_handling.server_manager import ServerManager
+    from atroposlib.envs.server_handling.server_baseline import APIServerConfig
+    
+    # Create a mock server with OpenRouter base_url
+    openrouter_config = APIServerConfig(
+        model_name="nousresearch/hermes-4-70b",
+        base_url="https://openrouter.ai/api/v1",
+        api_key="test-key",
+        num_requests_for_eval=10,
+    )
+    
+    # Create ServerManager with reasoning config
+    reasoning_config = ReasoningConfig(enabled=True, effort="high")
+    
+    # We can't easily instantiate ServerManager without actual servers,
+    # so let's test the _build_extra_body logic directly
+    
+    # Test OpenRouter format
+    extra_body = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
+    assert "reasoning" in extra_body
+    assert extra_body["reasoning"]["enabled"] == True
+    assert extra_body["reasoning"]["effort"] == "high"
+    print("✓ ServerManager builds correct extra_body for OpenRouter")
+    
+    # Test OpenAI format
+    extra_body = reasoning_config.build_extra_body("https://api.openai.com/v1")
+    assert "reasoning_effort" in extra_body
+    assert extra_body["reasoning_effort"] == "high"
+    assert "reasoning" not in extra_body  # Should NOT have nested reasoning
+    print("✓ ServerManager builds correct extra_body for OpenAI")
+    
+    # Test Claude (anthropic) - should use max_tokens
+    claude_reasoning = ReasoningConfig(enabled=True, max_tokens=8000)
+    extra_body = claude_reasoning.build_extra_body("https://openrouter.ai/api/v1")
+    assert "reasoning" in extra_body
+    assert extra_body["reasoning"]["enabled"] == True
+    assert extra_body["reasoning"]["max_tokens"] == 8000
+    print("✓ ServerManager builds correct extra_body for Claude (max_tokens)")
+
+
+async def test_server_manager_injects_extra_body():
+    """
+    Integration test: Verify ServerManager actually injects extra_body in API calls.
+    
+    This test creates a real ServerManager and makes an actual API call to verify
+    the full flow works.
+    """
+    if not OPENROUTER_API_KEY:
+        print("⚠ Skipping ServerManager integration test - OPENROUTER_API_KEY not set")
+        return True
+    
+    from atroposlib.envs.server_handling.server_manager import ServerManager
+    from atroposlib.envs.server_handling.server_baseline import APIServerConfig
+    
+    # Create server config for OpenRouter
+    server_config = APIServerConfig(
+        model_name="nousresearch/hermes-4-70b",
+        base_url=OPENROUTER_BASE_URL,
+        api_key=OPENROUTER_API_KEY,
+        num_requests_for_eval=10,
+    )
+    
+    # Create reasoning config
+    reasoning_config = ReasoningConfig(enabled=True, effort="high")
+    
+    print("\n" + "="*60)
+    print("Testing ServerManager.chat_completion() with reasoning injection")
+    print("="*60)
+    
+    try:
+        # Create ServerManager with reasoning config (NOT in testing mode - we want real API call)
+        server_manager = ServerManager(
+            configs=[server_config],
+            reasoning_config=reasoning_config,
+            testing=False,  # Actually make the API call
+        )
+        
+        # Make a chat completion call
+        messages = [
+            {"role": "system", "content": HERMES_REASONING_PROMPT},
+            {"role": "user", "content": "What is 2 + 2? Think carefully."}
+        ]
+        
+        print(f"Making API call with reasoning config: enabled={reasoning_config.enabled}, effort={reasoning_config.effort}")
+        
+        completion = await server_manager.chat_completion(
+            messages=messages,
+            max_tokens=512,
+            temperature=0.7,
+        )
+        
+        # Verify response has reasoning
+        reasoning, source, content = extract_reasoning_from_completion(completion)
+        
+        print(f"Response received!")
+        print(f"Content: {content[:100]}..." if content and len(content) > 100 else f"Content: {content}")
+        print(f"Reasoning source: {source}")
+        print(f"Reasoning length: {len(reasoning) if reasoning else 0} chars")
+        
+        if reasoning:
+            print("✓ ServerManager.chat_completion() correctly injected reasoning extra_body")
+            return True
+        else:
+            print("⚠ Response received but no reasoning found (model may not support it)")
+            return True  # Still a pass - the injection worked, model just didn't return reasoning
+            
+    except Exception as e:
+        import traceback
+        print(f"✗ ServerManager test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_full_env_config_to_server_flow():
+    """
+    Test the complete flow from BaseEnvConfig to ServerManager reasoning injection.
+    
+    This verifies that:
+    1. BaseEnvConfig with reasoning fields creates properly
+    2. ReasoningConfig.from_env_config() works
+    3. The resulting config would inject correct extra_body
+    """
+    from atroposlib.envs.base import BaseEnvConfig
+    
+    print("\n" + "="*60)
+    print("Testing full BaseEnvConfig → ServerManager flow")
+    print("="*60)
+    
+    # Create a config like a user would
+    env_config = BaseEnvConfig(
+        tokenizer_name="gpt2",
+        group_name="test-reasoning",
+        run_name="test-run",
+        thinking_mode=True,
+        reasoning_effort="high",
+        max_reasoning_tokens=8000,
+    )
+    
+    print(f"Created BaseEnvConfig:")
+    print(f"  thinking_mode: {env_config.thinking_mode}")
+    print(f"  reasoning_effort: {env_config.reasoning_effort}")
+    print(f"  max_reasoning_tokens: {env_config.max_reasoning_tokens}")
+    
+    # Convert to ReasoningConfig (this happens in BaseEnv.__init__)
+    reasoning_config = ReasoningConfig.from_env_config(env_config)
+    
+    print(f"\nReasoningConfig created:")
+    print(f"  enabled: {reasoning_config.enabled}")
+    print(f"  effort: {reasoning_config.effort}")
+    print(f"  max_tokens: {reasoning_config.max_tokens}")
+    
+    # Verify the config would generate correct extra_body
+    # For OpenRouter
+    openrouter_extra = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
+    print(f"\nOpenRouter extra_body: {json.dumps(openrouter_extra, indent=2)}")
+    assert openrouter_extra["reasoning"]["enabled"] == True
+    assert openrouter_extra["reasoning"]["effort"] == "high"
+    # Note: max_tokens is NOT included when effort is set (OpenRouter limitation)
+    
+    # For OpenAI
+    openai_extra = reasoning_config.build_extra_body("https://api.openai.com/v1")
+    print(f"\nOpenAI extra_body: {json.dumps(openai_extra, indent=2)}")
+    assert openai_extra["reasoning_effort"] == "high"
+    
+    print("\n✓ Full BaseEnvConfig → ServerManager flow works correctly!")
+    return True
+
+
+# =============================================================================
+# INTEGRATION TESTS WITH REAL API CALLS
+# =============================================================================
+
+async def test_openrouter_reasoning(model: str, effort: str = "high"):
+    """
+    Test reasoning with an OpenRouter model.
+    
+    Args:
+        model: Model name to test
+        effort: Reasoning effort level
+    
+    Returns:
+        Dict with test results
+    """
+    print(f"\n{'='*60}")
+    print(f"Testing OpenRouter: {model}")
+    print(f"{'='*60}")
+    
+    client = openai.AsyncOpenAI(
+        api_key=OPENROUTER_API_KEY,
+        base_url=OPENROUTER_BASE_URL,
+    )
+    
+    # Build extra_body based on model type
+    # Claude models need max_tokens in reasoning dict, not effort
+    # See: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
+    is_claude = "claude" in model.lower() or "anthropic" in model.lower()
+    
+    if is_claude:
+        # Claude needs reasoning.max_tokens, and overall max_tokens must be higher
+        reasoning_max_tokens = 8000
+        overall_max_tokens = 0  # Must be > reasoning_max_tokens
+        extra_body = {
+            "reasoning": {
+                "max_tokens": reasoning_max_tokens
+            }
+        }
+    else:
+        # Other models use effort
+        config = ReasoningConfig(enabled=True, effort=effort)
+        extra_body = config.build_extra_body(OPENROUTER_BASE_URL)
+        overall_max_tokens = 0
+    
+    messages = [
+        {"role": "user", "content": TEST_PROMPT}
+    ]
+    
+    # For Hermes, also add the system prompt
+    if "hermes" in model.lower():
+        messages.insert(0, {"role": "system", "content": HERMES_REASONING_PROMPT})
+    
+    # Build the full request for logging
+    request_params = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": overall_max_tokens,
+        "temperature": 0.7,
+        "extra_body": extra_body,
+    }
+    
+    print(f"Request params:")
+    print(f"  model: {model}")
+    print(f"  max_tokens: {overall_max_tokens}")
+    print(f"  extra_body: {json.dumps(extra_body, indent=2)}")
+    
+    # Log the full request to file
+    log_to_file(f"\n{'='*70}")
+    log_to_file(f"MODEL: {model}")
+    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
+    log_to_file(f"{'='*70}")
+    log_to_file(f"\nREQUEST SENT:")
+    log_to_file(json.dumps(request_params, indent=2, default=str))
+    
+    try:
+        completion = await client.chat.completions.create(
+            model=model,
+            messages=messages,
+            max_tokens=overall_max_tokens,
+            temperature=0.7,
+            extra_body=extra_body,
+        )
+        
+        # Log full ChatCompletion object to file for inspection
+        log_to_file(f"\nRESPONSE RECEIVED:")
+        log_to_file(f"\nFULL CHATCOMPLETION OBJECT:")
+        log_to_file(str(completion))
+        log_to_file(f"\n{'='*40}")
+        
+        # Also log the choice and message separately for clarity
+        choice = completion.choices[0]
+        log_to_file(f"\nChoice object: {choice}")
+        log_to_file(f"\nMessage object: {choice.message}")
+        
+        # Log all attributes on the message
+        log_to_file(f"\nMessage attributes: {dir(choice.message)}")
+        
+        # Try to get model_dump if available (pydantic)
+        if hasattr(completion, "model_dump"):
+            log_to_file(f"\nCompletion model_dump():")
+            log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
+        
+        if hasattr(choice, "model_dump"):
+            log_to_file(f"\nChoice model_dump():")
+            log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
+            
+        if hasattr(choice.message, "model_dump"):
+            log_to_file(f"\nMessage model_dump():")
+            log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
+        
+        # Get the response content
+        content = completion.choices[0].message.content
+        log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
+        log_to_file(content if content else "(empty)")
+        
+        # Try to extract reasoning
+        reasoning, source = extract_reasoning_from_response(
+            completion.choices[0], 
+            content=content
+        )
+        
+        log_to_file(f"\nReasoning extraction result:")
+        log_to_file(f"  Source: {source}")
+        if reasoning:
+            log_to_file(f"  Length: {len(reasoning)} chars")
+            log_to_file(f"  Full reasoning content:")
+            log_to_file(reasoning)
+        else:
+            log_to_file("  No separate reasoning found")
+        
+        # Check for <think> blocks in content
+        has_think_block = "<think>" in content.lower() if content else False
+        log_to_file(f"  Has <think> block in content: {has_think_block}")
+        
+        # Check for reasoning_details in raw response
+        has_reasoning_details = hasattr(completion.choices[0], "reasoning_details")
+        has_reasoning_content = hasattr(completion.choices[0].message, "reasoning_content")
+        log_to_file(f"  Has reasoning_details attr: {has_reasoning_details}")
+        log_to_file(f"  Has reasoning_content attr: {has_reasoning_content}")
+        
+        # Try to access reasoning fields directly if they exist
+        if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
+            log_to_file(f"  message.reasoning_content: {choice.message.reasoning_content}")
+        if hasattr(choice.message, "reasoning") and choice.message.reasoning:
+            log_to_file(f"  message.reasoning: {choice.message.reasoning}")
+        if hasattr(choice, "reasoning_details") and choice.reasoning_details:
+            log_to_file(f"  choice.reasoning_details: {choice.reasoning_details}")
+        
+        log_to_file(f"\n{'='*70}\n")
+        
+        # Also print summary to console
+        print(f"\nResponse content ({len(content) if content else 0} chars)")
+        print(f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars")
+        print(f"(Full details logged to {LOG_FILE})")
+        
+        return {
+            "model": model,
+            "success": True,
+            "content_length": len(content) if content else 0,
+            "reasoning_source": source,
+            "reasoning_length": len(reasoning) if reasoning else 0,
+            "has_think_block": has_think_block,
+        }
+        
+    except Exception as e:
+        print(f"Error: {e}")
+        return {
+            "model": model,
+            "success": False,
+            "error": str(e),
+        }
+
+
+async def test_openai_reasoning(effort: str = "medium"):
+    """
+    Test reasoning with OpenAI official API.
+    
+    Args:
+        effort: Reasoning effort level
+    
+    Returns:
+        Dict with test results
+    """
+    print(f"\n{'='*60}")
+    print(f"Testing OpenAI: {OPENAI_MODEL}")
+    print(f"{'='*60}")
+    
+    client = openai.AsyncOpenAI(
+        api_key=OPENAI_API_KEY,
+        base_url=OPENAI_BASE_URL,
+    )
+    
+    # Build extra_body using our ReasoningConfig
+    config = ReasoningConfig(enabled=True, effort=effort)
+    extra_body = config.build_extra_body(OPENAI_BASE_URL)
+    
+    messages = [
+        {"role": "user", "content": TEST_PROMPT}
+    ]
+    
+    # Build the full request for logging
+    request_params = {
+        "model": OPENAI_MODEL,
+        "messages": messages,
+        "max_completion_tokens": 1024,
+        "extra_body": extra_body,
+    }
+    
+    print(f"Request params:")
+    print(f"  model: {OPENAI_MODEL}")
+    print(f"  max_completion_tokens: 1024")
+    print(f"  extra_body: {json.dumps(extra_body, indent=2)}")
+    
+    # Log the full request to file
+    log_to_file(f"\n{'='*70}")
+    log_to_file(f"MODEL: {OPENAI_MODEL} (OpenAI)")
+    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
+    log_to_file(f"{'='*70}")
+    log_to_file(f"\nREQUEST SENT:")
+    log_to_file(json.dumps(request_params, indent=2, default=str))
+    
+    try:
+        completion = await client.chat.completions.create(
+            model=OPENAI_MODEL,
+            messages=messages,
+            max_completion_tokens=1024,  # OpenAI reasoning models require this instead of max_tokens
+            # Note: OpenAI reasoning models only support temperature=1 (default)
+            extra_body=extra_body,
+        )
+        
+        # Log full ChatCompletion object to file for inspection
+        log_to_file(f"\nRESPONSE RECEIVED:")
+        log_to_file(f"\nFULL CHATCOMPLETION OBJECT:")
+        log_to_file(str(completion))
+        log_to_file(f"\n{'='*40}")
+        
+        # Also log the choice and message separately for clarity
+        choice = completion.choices[0]
+        log_to_file(f"\nChoice object: {choice}")
+        log_to_file(f"\nMessage object: {choice.message}")
+        
+        # Log all attributes on the message
+        log_to_file(f"\nMessage attributes: {dir(choice.message)}")
+        
+        # Try to get model_dump if available (pydantic)
+        if hasattr(completion, "model_dump"):
+            log_to_file(f"\nCompletion model_dump():")
+            log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
+        
+        if hasattr(choice, "model_dump"):
+            log_to_file(f"\nChoice model_dump():")
+            log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
+            
+        if hasattr(choice.message, "model_dump"):
+            log_to_file(f"\nMessage model_dump():")
+            log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
+        
+        # Get the response content
+        content = completion.choices[0].message.content
+        log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
+        log_to_file(content if content else "(empty)")
+        
+        # Try to extract reasoning
+        reasoning, source = extract_reasoning_from_response(
+            completion.choices[0], 
+            content=content
+        )
+        
+        log_to_file(f"\nReasoning extraction result:")
+        log_to_file(f"  Source: {source}")
+        if reasoning:
+            log_to_file(f"  Length: {len(reasoning)} chars")
+            log_to_file(f"  Full reasoning content:")
+            log_to_file(reasoning)
+        else:
+            log_to_file("  No separate reasoning found")
+        
+        # Check for <think> blocks in content (unlikely for OpenAI)
+        has_think_block = "<think>" in content.lower() if content else False
+        log_to_file(f"  Has <think> block in content: {has_think_block}")
+        
+        # Try to access reasoning fields directly if they exist
+        if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
+            log_to_file(f"  message.reasoning_content: {choice.message.reasoning_content}")
+        if hasattr(choice.message, "reasoning") and choice.message.reasoning:
+            log_to_file(f"  message.reasoning: {choice.message.reasoning}")
+        if hasattr(choice, "reasoning_details") and choice.reasoning_details:
+            log_to_file(f"  choice.reasoning_details: {choice.reasoning_details}")
+        
+        log_to_file(f"\n{'='*70}\n")
+        
+        # Also print summary to console
+        print(f"\nResponse content ({len(content) if content else 0} chars)")
+        print(f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars")
+        print(f"(Full details logged to {LOG_FILE})")
+        
+        return {
+            "model": OPENAI_MODEL,
+            "success": True,
+            "content_length": len(content) if content else 0,
+            "reasoning_source": source,
+            "reasoning_length": len(reasoning) if reasoning else 0,
+            "has_think_block": has_think_block,
+        }
+        
+    except Exception as e:
+        print(f"Error: {e}")
+        return {
+            "model": OPENAI_MODEL,
+            "success": False,
+            "error": str(e),
+        }
+
+
+async def run_all_integration_tests():
+    """Run all integration tests and summarize results."""
+    print("\n" + "="*70)
+    print("REASONING MODEL INTEGRATION TESTS")
+    print("="*70)
+    
+    # Check that API keys are set
+    missing_keys = []
+    if not OPENAI_API_KEY:
+        missing_keys.append("OPENAI_API_KEY")
+    if not OPENROUTER_API_KEY:
+        missing_keys.append("OPENROUTER_API_KEY")
+    
+    if missing_keys:
+        print(f"\n⚠ Missing required environment variables: {', '.join(missing_keys)}")
+        print("Set them before running integration tests:")
+        print("  export OPENAI_API_KEY='your-key-here'")
+        print("  export OPENROUTER_API_KEY='your-key-here'")
+        return False
+    
+    # Initialize log file
+    with open(LOG_FILE, "w") as f:
+        f.write(f"REASONING MODEL TEST RESULTS\n")
+        f.write(f"Generated: {datetime.now().isoformat()}\n")
+        f.write(f"{'='*70}\n\n")
+    
+    print(f"\nLogging full ChatCompletion objects to: {LOG_FILE}\n")
+    
+    results = []
+    
+    # Test OpenRouter models
+    for model in OPENROUTER_MODELS:
+        result = await test_openrouter_reasoning(model)
+        results.append(result)
+    
+    # Test OpenAI
+    result = await test_openai_reasoning()
+    results.append(result)
+    
+    # Summary
+    print("\n" + "="*70)
+    print("TEST SUMMARY")
+    print("="*70)
+    
+    for result in results:
+        status = "✓ PASS" if result.get("success") else "✗ FAIL"
+        model = result.get("model", "unknown")
+        
+        if result.get("success"):
+            reasoning_info = f"reasoning: {result.get('reasoning_source', 'none')}"
+            if result.get("reasoning_length", 0) > 0:
+                reasoning_info += f" ({result['reasoning_length']} chars)"
+            print(f"{status} | {model:40} | {reasoning_info}")
+        else:
+            print(f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}")
+    
+    # Check if any failed
+    failures = [r for r in results if not r.get("success")]
+    if failures:
+        print(f"\n{len(failures)} test(s) failed")
+        return False
+    else:
+        print(f"\nAll {len(results)} tests passed!")
+        return True
+
+
+# =============================================================================
+# MAIN
+# =============================================================================
+
+def run_unit_tests():
+    """Run all unit tests (no API calls)."""
+    print("\n" + "="*70)
+    print("UNIT TESTS")
+    print("="*70 + "\n")
+    
+    # ReasoningConfig unit tests
+    test_reasoning_config_default()
+    test_reasoning_config_enabled_only()
+    test_reasoning_config_with_effort()
+    test_reasoning_config_with_max_tokens()
+    test_reasoning_config_full()
+    test_reasoning_config_effort_mapping()
+    test_reasoning_config_invalid_effort()
+    test_reasoning_config_invalid_max_tokens()
+    test_hermes_prompts_defined()
+    
+    # ServerManager integration tests (no API calls)
+    test_reasoning_config_from_env_config()
+    test_server_manager_builds_extra_body()
+    test_full_env_config_to_server_flow()
+    
+    print("\n" + "="*70)
+    print("All unit tests passed!")
+    print("="*70)
+
+
+async def run_server_manager_integration_test():
+    """Run ServerManager integration test with real API call."""
+    print("\n" + "="*70)
+    print("SERVER MANAGER INTEGRATION TEST")
+    print("="*70)
+    
+    result = await test_server_manager_injects_extra_body()
+    
+    if result:
+        print("\n✓ ServerManager integration test passed!")
+    else:
+        print("\n✗ ServerManager integration test failed!")
+    
+    return result
+
+
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="Test reasoning model support")
+    parser.add_argument(
+        "--unit-only", 
+        action="store_true",
+        help="Only run unit tests (no API calls)"
+    )
+    parser.add_argument(
+        "--integration-only",
+        action="store_true", 
+        help="Only run integration tests (API calls to all providers)"
+    )
+    parser.add_argument(
+        "--server-manager-only",
+        action="store_true",
+        help="Only run ServerManager integration test (single API call)"
+    )
+    args = parser.parse_args()
+    
+    if args.integration_only:
+        asyncio.run(run_all_integration_tests())
+    elif args.unit_only:
+        run_unit_tests()
+    elif args.server_manager_only:
+        run_unit_tests()  # Run unit tests first
+        asyncio.run(run_server_manager_integration_test())
+    else:
+        # Run all tests
+        run_unit_tests()
+        asyncio.run(run_server_manager_integration_test())
+        asyncio.run(run_all_integration_tests())
+