"""
Test file for reasoning model support across multiple providers.
This test validates:
1. ReasoningConfig builds correct extra_body for different providers
2. Reasoning can be enabled and responses contain reasoning content
3. Reasoning extraction works for various API response formats
Providers tested:
- OpenAI (gpt-5.2) - Uses reasoning_effort at top level
- OpenRouter (anthropic/claude-opus-4.5, nousresearch/hermes-4-70B, deepseek/deepseek-v3.2)
- Uses nested reasoning object with enabled/effort/max_tokens
Usage:
python -m pytest atroposlib/tests/test_reasoning_models.py -v
Or run directly:
python atroposlib/tests/test_reasoning_models.py
Note: This test requires valid API keys. Set them as environment variables or
modify the constants below for testing.
"""
import asyncio
import json
import os
import sys
from datetime import datetime
import pytest
# Add the project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
import openai # noqa: E402
from atroposlib.envs.server_handling.server_baseline import ( # noqa: E402
ReasoningConfig,
)
from environments.eval_environments.eval_helpers import ( # noqa: E402
HERMES_REASONING_PROMPT,
HERMES_REASONING_PROMPT_WITH_ANSWER,
extract_reasoning_from_completion,
extract_reasoning_from_response,
)
# =============================================================================
# API CONFIGURATION
# =============================================================================
# These are test credentials. For production, use environment variables.
# API keys must be set via environment variables
# Keys resolve to None when unset; integration tests skip/abort in that case.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Default model/endpoint for the OpenAI provider tests (override via env).
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.environ.get(
    "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
)
# Models to test on OpenRouter (one per provider family: Anthropic, Nous, DeepSeek)
OPENROUTER_MODELS = [
    "anthropic/claude-opus-4.5",
    "nousresearch/hermes-4-70b",
    "deepseek/deepseek-v3.2",
]
# Test prompt that should trigger reasoning
TEST_PROMPT = "What is 15 * 23? Think step by step before giving your answer."
# Log file for full ChatCompletion objects; lives next to this test module.
LOG_FILE = os.path.join(os.path.dirname(__file__), "reasoning_test_results.log")
def log_to_file(message: str):
    """Append one line of diagnostics to the shared test log file."""
    with open(LOG_FILE, "a") as handle:
        handle.write(f"{message}\n")
# =============================================================================
# UNIT TESTS FOR ReasoningConfig
# =============================================================================
def test_reasoning_config_default():
    """A freshly constructed ReasoningConfig must be completely inactive."""
    cfg = ReasoningConfig()
    assert not cfg.enabled
    assert cfg.effort is None
    assert cfg.max_tokens is None
    # Inactive config contributes nothing to outgoing requests
    assert not cfg.is_reasoning_kwargs_active()
    assert cfg.build_extra_body() is None
    print("✓ Default ReasoningConfig is inactive")
def test_reasoning_config_enabled_only():
    """enabled=True alone yields provider-default reasoning payloads."""
    cfg = ReasoningConfig(enabled=True)
    assert cfg.enabled
    assert cfg.is_reasoning_kwargs_active()
    # OpenRouter-style providers get a nested reasoning object
    assert cfg.build_extra_body("https://openrouter.ai/api/v1") == {
        "reasoning": {"enabled": True}
    }
    # OpenAI gets a flat reasoning_effort defaulting to "medium"
    assert cfg.build_extra_body("https://api.openai.com/v1") == {
        "reasoning_effort": "medium"
    }
    print("✓ ReasoningConfig with enabled=True works correctly")
def test_reasoning_config_with_effort():
    """Supplying effort alone must auto-enable reasoning."""
    cfg = ReasoningConfig(effort="high")
    assert cfg.enabled  # auto-enabled by the effort setting
    assert cfg.effort == "high"
    assert cfg.is_reasoning_kwargs_active()
    # OpenRouter: nested reasoning object carries the effort level
    assert cfg.build_extra_body("https://openrouter.ai/api/v1") == {
        "reasoning": {"enabled": True, "effort": "high"}
    }
    # OpenAI: flat top-level reasoning_effort
    assert cfg.build_extra_body("https://api.openai.com/v1") == {
        "reasoning_effort": "high"
    }
    print("✓ ReasoningConfig with effort works correctly")
def test_reasoning_config_with_max_tokens():
    """Supplying max_tokens alone must auto-enable reasoning."""
    cfg = ReasoningConfig(max_tokens=4096)
    assert cfg.enabled  # auto-enabled by the token budget
    assert cfg.max_tokens == 4096
    assert cfg.is_reasoning_kwargs_active()
    # OpenRouter: the token budget rides inside the nested reasoning object
    assert cfg.build_extra_body("https://openrouter.ai/api/v1") == {
        "reasoning": {"enabled": True, "max_tokens": 4096}
    }
    # OpenAI does not support a reasoning token budget -> "medium" fallback
    assert cfg.build_extra_body("https://api.openai.com/v1") == {
        "reasoning_effort": "medium"
    }
    print("✓ ReasoningConfig with max_tokens works correctly")
def test_reasoning_config_full():
    """All options together: effort wins over max_tokens on OpenRouter."""
    cfg = ReasoningConfig(enabled=True, effort="xhigh", max_tokens=8192)
    assert cfg.enabled
    assert cfg.effort == "xhigh"
    assert cfg.max_tokens == 8192
    # OpenRouter accepts only ONE of effort/max_tokens; effort takes priority,
    # so max_tokens must be absent from the payload.
    body = cfg.build_extra_body("https://openrouter.ai/api/v1")
    assert body == {"reasoning": {"enabled": True, "effort": "xhigh"}}
    # OpenAI: effort is passed through directly
    body = cfg.build_extra_body("https://api.openai.com/v1")
    assert body == {"reasoning_effort": "xhigh"}
    print("✓ ReasoningConfig with full options works correctly")
def test_reasoning_config_effort_mapping():
    """Every supported effort level is forwarded 1:1 to OpenAI."""
    for level in ("none", "minimal", "low", "medium", "high", "xhigh"):
        body = ReasoningConfig(effort=level).build_extra_body(
            "https://api.openai.com/v1"
        )
        assert (
            body["reasoning_effort"] == level
        ), f"Expected {level} to pass through, got {body}"
    print("✓ Effort levels pass through correctly for OpenAI")
def test_reasoning_config_invalid_effort():
    """An unrecognized effort level must raise ValueError at construction.

    Uses pytest.raises instead of the try/`assert False`/except pattern:
    `assert` statements are stripped under ``python -O``, which would have
    silently passed this test; pytest.raises also lets unexpected exception
    types propagate and checks the message via its ``match`` regex.
    """
    with pytest.raises(ValueError, match="Invalid reasoning_effort"):
        ReasoningConfig(effort="invalid")
    print("✓ Invalid effort raises ValueError")
def test_reasoning_config_max_tokens_no_validation():
    """max_tokens is accepted unchecked — provider limits vary and drift.

    OpenRouter currently caps Anthropic at 1024-32000, while the native
    Anthropic API supports up to 128k extended thinking, so no range is
    enforced here to keep the config flexible.
    """
    # Exercise both a small budget and a native-Anthropic-scale one
    for budget in (500, 128000):
        cfg = ReasoningConfig(max_tokens=budget)
        assert cfg.max_tokens == budget
        assert cfg.enabled  # any budget auto-enables reasoning
    print("✓ max_tokens accepts any value (no range validation)")
def test_hermes_prompts_defined():
    """Test that Hermes prompts are properly defined.

    Bug fix: the previous assertions checked ``"" in prompt``, which is
    vacuously true for any string (the ``<think>`` tags had evidently been
    stripped from the source). The intended check is that the prompts
    actually instruct the model to emit <think>...</think> reasoning blocks.
    """
    assert HERMES_REASONING_PROMPT is not None
    assert "<think>" in HERMES_REASONING_PROMPT
    assert "</think>" in HERMES_REASONING_PROMPT
    assert HERMES_REASONING_PROMPT_WITH_ANSWER is not None
    assert "<think>" in HERMES_REASONING_PROMPT_WITH_ANSWER
    print("✓ Hermes prompts are properly defined")
# =============================================================================
# SERVER MANAGER INTEGRATION TESTS
# =============================================================================
def test_reasoning_config_from_env_config():
    """Test ReasoningConfig.from_env_config() creates correct config."""
    from atroposlib.envs.base import BaseEnvConfig

    # Shared boilerplate for every BaseEnvConfig constructed below
    base_kwargs = dict(tokenizer_name="gpt2", group_name="test", run_name="test")

    # thinking_mode alone enables reasoning with no effort/budget
    rc = ReasoningConfig.from_env_config(
        BaseEnvConfig(**base_kwargs, thinking_mode=True)
    )
    assert rc.enabled is True
    assert rc.effort is None
    assert rc.max_tokens is None
    print("✓ ReasoningConfig.from_env_config with thinking_mode=True works")

    # reasoning_effort auto-enables
    rc = ReasoningConfig.from_env_config(
        BaseEnvConfig(**base_kwargs, reasoning_effort="high")
    )
    assert rc.enabled is True
    assert rc.effort == "high"
    print("✓ ReasoningConfig.from_env_config with reasoning_effort works")

    # max_reasoning_tokens auto-enables
    rc = ReasoningConfig.from_env_config(
        BaseEnvConfig(**base_kwargs, max_reasoning_tokens=8000)
    )
    assert rc.enabled is True
    assert rc.max_tokens == 8000
    print("✓ ReasoningConfig.from_env_config with max_reasoning_tokens works")

    # no reasoning fields at all -> disabled config
    rc = ReasoningConfig.from_env_config(BaseEnvConfig(**base_kwargs))
    assert rc.enabled is False
    assert not rc.is_reasoning_kwargs_active()
    print("✓ ReasoningConfig.from_env_config with defaults (disabled) works")
def test_server_manager_builds_extra_body():
    """Test ReasoningConfig.build_extra_body() generates correct extra_body."""
    # ServerManager is hard to instantiate without live servers, so exercise
    # the build_extra_body logic it delegates to.
    rc = ReasoningConfig(enabled=True, effort="high")

    # OpenRouter format: nested reasoning object
    body = rc.build_extra_body("https://openrouter.ai/api/v1")
    assert "reasoning" in body
    assert body["reasoning"]["enabled"] is True
    assert body["reasoning"]["effort"] == "high"
    print("✓ ServerManager builds correct extra_body for OpenRouter")

    # OpenAI format: flat reasoning_effort, never the nested object
    body = rc.build_extra_body("https://api.openai.com/v1")
    assert "reasoning_effort" in body
    assert body["reasoning_effort"] == "high"
    assert "reasoning" not in body
    print("✓ ServerManager builds correct extra_body for OpenAI")

    # Claude-style budget: max_tokens flows through the nested object
    budget_rc = ReasoningConfig(enabled=True, max_tokens=8000)
    body = budget_rc.build_extra_body("https://openrouter.ai/api/v1")
    assert "reasoning" in body
    assert body["reasoning"]["enabled"] is True
    assert body["reasoning"]["max_tokens"] == 8000
    print("✓ ServerManager builds correct extra_body for Claude (max_tokens)")
async def test_server_manager_injects_extra_body():
    """
    Integration test: Verify ServerManager actually injects extra_body in API calls.
    This test creates a real ServerManager and makes an actual API call to verify
    the full flow works.

    Requires OPENROUTER_API_KEY; skips otherwise.

    NOTE(review): this is an ``async def`` test — under plain pytest it needs
    the pytest-asyncio (or anyio) plugin configured to actually execute;
    confirm the project's pytest configuration.
    """
    if not OPENROUTER_API_KEY:
        pytest.skip("OPENROUTER_API_KEY not set - skipping integration test")
    # Imported lazily so unit-test-only runs don't need the server stack
    from atroposlib.envs.server_handling.server_baseline import APIServerConfig
    from atroposlib.envs.server_handling.server_manager import ServerManager
    # Create server config for OpenRouter
    server_config = APIServerConfig(
        model_name="nousresearch/hermes-4-70b",
        base_url=OPENROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY,
        num_requests_for_eval=10,
    )
    # Create reasoning config
    reasoning_config = ReasoningConfig(enabled=True, effort="high")
    print("\n" + "=" * 60)
    print("Testing ServerManager.chat_completion() with reasoning injection")
    print("=" * 60)
    # Create ServerManager with reasoning config (NOT in testing mode - we want real API call)
    server_manager = ServerManager(
        configs=[server_config],
        reasoning_config=reasoning_config,
        testing=False,  # Actually make the API call
    )
    # Make a chat completion call; the Hermes system prompt asks for <think> blocks
    messages = [
        {"role": "system", "content": HERMES_REASONING_PROMPT},
        {"role": "user", "content": "What is 2 + 2? Think carefully."},
    ]
    print(
        f"Making API call: enabled={reasoning_config.enabled}, "
        f"effort={reasoning_config.effort}"
    )
    completion = await server_manager.chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )
    # Verify response has reasoning (helper returns (reasoning, source, content))
    reasoning, source, content = extract_reasoning_from_completion(completion)
    print("Response received!")
    # Truncate long content for console readability
    print(
        f"Content: {content[:100]}..."
        if content and len(content) > 100
        else f"Content: {content}"
    )
    print(f"Reasoning source: {source}")
    print(f"Reasoning length: {len(reasoning) if reasoning else 0} chars")
    # Assert we got a response (reasoning is optional - model may not support it)
    assert content is not None, "Expected response content"
    print("✓ ServerManager.chat_completion() correctly injected reasoning extra_body")
def test_full_env_config_to_server_flow():
    """
    Test the complete flow from BaseEnvConfig to ServerManager reasoning injection.
    This verifies that:
    1. BaseEnvConfig with reasoning fields creates properly
    2. ReasoningConfig.from_env_config() works
    3. The resulting config would inject correct extra_body
    """
    from atroposlib.envs.base import BaseEnvConfig

    print("\n" + "=" * 60)
    print("Testing full BaseEnvConfig → ServerManager flow")
    print("=" * 60)
    # Build the config exactly as an end user would
    cfg = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test-reasoning",
        run_name="test-run",
        thinking_mode=True,
        reasoning_effort="high",
        max_reasoning_tokens=8000,
    )
    print("Created BaseEnvConfig:")
    print(f" thinking_mode: {cfg.thinking_mode}")
    print(f" reasoning_effort: {cfg.reasoning_effort}")
    print(f" max_reasoning_tokens: {cfg.max_reasoning_tokens}")
    # This conversion normally happens inside BaseEnv.__init__
    rc = ReasoningConfig.from_env_config(cfg)
    print("\nReasoningConfig created:")
    print(f" enabled: {rc.enabled}")
    print(f" effort: {rc.effort}")
    print(f" max_tokens: {rc.max_tokens}")
    # OpenRouter payload: nested object; effort wins, so max_tokens is dropped
    or_body = rc.build_extra_body("https://openrouter.ai/api/v1")
    print(f"\nOpenRouter extra_body: {json.dumps(or_body, indent=2)}")
    assert or_body["reasoning"]["enabled"] is True
    assert or_body["reasoning"]["effort"] == "high"
    # OpenAI payload: flat reasoning_effort
    oai_body = rc.build_extra_body("https://api.openai.com/v1")
    print(f"\nOpenAI extra_body: {json.dumps(oai_body, indent=2)}")
    assert oai_body["reasoning_effort"] == "high"
    print("\n✓ Full BaseEnvConfig → ServerManager flow works correctly!")
# =============================================================================
# INTEGRATION TESTS WITH REAL API CALLS
# =============================================================================
async def _run_openrouter_reasoning_test(model: str, effort: str = "high"):
    """
    Run reasoning test with an OpenRouter model (helper function).
    Note: This is a helper function, not a pytest test. It's called by
    run_all_integration_tests() when running the script directly.
    Args:
        model: Model name to test
        effort: Reasoning effort level
    Returns:
        Dict with test results
    """
    print(f"\n{'='*60}")
    print(f"Testing OpenRouter: {model}")
    print(f"{'='*60}")
    client = openai.AsyncOpenAI(
        api_key=OPENROUTER_API_KEY,
        base_url=OPENROUTER_BASE_URL,
    )
    # Build extra_body based on model type
    # Claude models need max_tokens in reasoning dict, not effort
    # See: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
    is_claude = "claude" in model.lower() or "anthropic" in model.lower()
    if is_claude:
        # Claude needs reasoning.max_tokens, and the overall completion budget
        # must be strictly greater so tokens remain for the visible answer.
        # (Bug fix: this was 0, contradicting the "must be higher" rule and
        # requesting no completion tokens at all.)
        reasoning_max_tokens = 8000
        overall_max_tokens = reasoning_max_tokens + 4096  # > reasoning_max_tokens
        extra_body = {"reasoning": {"max_tokens": reasoning_max_tokens}}
    else:
        # Other models use effort
        config = ReasoningConfig(enabled=True, effort=effort)
        extra_body = config.build_extra_body(OPENROUTER_BASE_URL)
        # Bug fix: was 0, which requests an empty completion
        overall_max_tokens = 2048
    messages = [{"role": "user", "content": TEST_PROMPT}]
    # For Hermes, also add the system prompt
    if "hermes" in model.lower():
        messages.insert(0, {"role": "system", "content": HERMES_REASONING_PROMPT})
    # Build the full request for logging
    request_params = {
        "model": model,
        "messages": messages,
        "max_tokens": overall_max_tokens,
        "temperature": 0.7,
        "extra_body": extra_body,
    }
    print("Request params:")
    print(f" model: {model}")
    print(f" max_tokens: {overall_max_tokens}")
    print(f" extra_body: {json.dumps(extra_body, indent=2)}")
    # Log the full request to file
    log_to_file(f"\n{'='*70}")
    log_to_file(f"MODEL: {model}")
    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
    log_to_file(f"{'='*70}")
    log_to_file("\nREQUEST SENT:")
    log_to_file(json.dumps(request_params, indent=2, default=str))
    try:
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=overall_max_tokens,
            temperature=0.7,
            extra_body=extra_body,
        )
        # Log full ChatCompletion object to file for inspection
        log_to_file("\nRESPONSE RECEIVED:")
        log_to_file("\nFULL CHATCOMPLETION OBJECT:")
        log_to_file(str(completion))
        log_to_file(f"\n{'='*40}")
        # Also log the choice and message separately for clarity
        choice = completion.choices[0]
        log_to_file(f"\nChoice object: {choice}")
        log_to_file(f"\nMessage object: {choice.message}")
        # Log all attributes on the message
        log_to_file(f"\nMessage attributes: {dir(choice.message)}")
        # Try to get model_dump if available (pydantic)
        if hasattr(completion, "model_dump"):
            log_to_file("\nCompletion model_dump():")
            log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
        if hasattr(choice, "model_dump"):
            log_to_file("\nChoice model_dump():")
            log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
        if hasattr(choice.message, "model_dump"):
            log_to_file("\nMessage model_dump():")
            log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
        # Get the response content
        content = completion.choices[0].message.content
        log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
        log_to_file(content if content else "(empty)")
        # Try to extract reasoning
        reasoning, source = extract_reasoning_from_response(
            completion.choices[0], content=content
        )
        log_to_file("\nReasoning extraction result:")
        log_to_file(f" Source: {source}")
        if reasoning:
            log_to_file(f" Length: {len(reasoning)} chars")
            log_to_file(" Full reasoning content:")
            log_to_file(reasoning)
        else:
            log_to_file(" No separate reasoning found")
        # Check for <think> blocks in content
        # (Bug fix: probe was `"" in content.lower()`, which is always True;
        # the intended check is for the <think> tag.)
        has_think_block = "<think>" in content.lower() if content else False
        log_to_file(f" Has <think> block in content: {has_think_block}")
        # Check for reasoning_details in raw response
        has_reasoning_details = hasattr(completion.choices[0], "reasoning_details")
        has_reasoning_content = hasattr(
            completion.choices[0].message, "reasoning_content"
        )
        log_to_file(f" Has reasoning_details attr: {has_reasoning_details}")
        log_to_file(f" Has reasoning_content attr: {has_reasoning_content}")
        # Try to access reasoning fields directly if they exist
        if (
            hasattr(choice.message, "reasoning_content")
            and choice.message.reasoning_content
        ):
            log_to_file(
                f" message.reasoning_content: {choice.message.reasoning_content}"
            )
        if hasattr(choice.message, "reasoning") and choice.message.reasoning:
            log_to_file(f" message.reasoning: {choice.message.reasoning}")
        if hasattr(choice, "reasoning_details") and choice.reasoning_details:
            log_to_file(f" choice.reasoning_details: {choice.reasoning_details}")
        log_to_file(f"\n{'='*70}\n")
        # Also print summary to console
        print(f"\nResponse content ({len(content) if content else 0} chars)")
        print(
            f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
        )
        print(f"(Full details logged to {LOG_FILE})")
        return {
            "model": model,
            "success": True,
            "content_length": len(content) if content else 0,
            "reasoning_source": source,
            "reasoning_length": len(reasoning) if reasoning else 0,
            "has_think_block": has_think_block,
        }
    except Exception as e:
        # Report the failure but keep the overall test run going
        print(f"Error: {e}")
        return {
            "model": model,
            "success": False,
            "error": str(e),
        }
async def _run_openai_reasoning_test(effort: str = "medium"):
    """
    Run reasoning test with OpenAI official API (helper function).
    Note: This is a helper function, not a pytest test. It's called by
    run_all_integration_tests() when running the script directly.
    Args:
        effort: Reasoning effort level
    Returns:
        Dict with test results
    """
    print(f"\n{'='*60}")
    print(f"Testing OpenAI: {OPENAI_MODEL}")
    print(f"{'='*60}")
    client = openai.AsyncOpenAI(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL,
    )
    # Build extra_body using our ReasoningConfig
    config = ReasoningConfig(enabled=True, effort=effort)
    extra_body = config.build_extra_body(OPENAI_BASE_URL)
    messages = [{"role": "user", "content": TEST_PROMPT}]
    # Build the full request for logging
    request_params = {
        "model": OPENAI_MODEL,
        "messages": messages,
        "max_completion_tokens": 1024,
        "extra_body": extra_body,
    }
    print("Request params:")
    print(f" model: {OPENAI_MODEL}")
    print(" max_completion_tokens: 1024")
    print(f" extra_body: {json.dumps(extra_body, indent=2)}")
    # Log the full request to file
    log_to_file(f"\n{'='*70}")
    log_to_file(f"MODEL: {OPENAI_MODEL} (OpenAI)")
    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
    log_to_file(f"{'='*70}")
    log_to_file("\nREQUEST SENT:")
    log_to_file(json.dumps(request_params, indent=2, default=str))
    try:
        completion = await client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=messages,
            max_completion_tokens=1024,  # OpenAI reasoning models require this instead of max_tokens
            # Note: OpenAI reasoning models only support temperature=1 (default)
            extra_body=extra_body,
        )
        # Log full ChatCompletion object to file for inspection
        log_to_file("\nRESPONSE RECEIVED:")
        log_to_file("\nFULL CHATCOMPLETION OBJECT:")
        log_to_file(str(completion))
        log_to_file(f"\n{'='*40}")
        # Also log the choice and message separately for clarity
        choice = completion.choices[0]
        log_to_file(f"\nChoice object: {choice}")
        log_to_file(f"\nMessage object: {choice.message}")
        # Log all attributes on the message
        log_to_file(f"\nMessage attributes: {dir(choice.message)}")
        # Try to get model_dump if available (pydantic)
        if hasattr(completion, "model_dump"):
            log_to_file("\nCompletion model_dump():")
            log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
        if hasattr(choice, "model_dump"):
            log_to_file("\nChoice model_dump():")
            log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
        if hasattr(choice.message, "model_dump"):
            log_to_file("\nMessage model_dump():")
            log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
        # Get the response content
        content = completion.choices[0].message.content
        log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
        log_to_file(content if content else "(empty)")
        # Try to extract reasoning
        reasoning, source = extract_reasoning_from_response(
            completion.choices[0], content=content
        )
        log_to_file("\nReasoning extraction result:")
        log_to_file(f" Source: {source}")
        if reasoning:
            log_to_file(f" Length: {len(reasoning)} chars")
            log_to_file(" Full reasoning content:")
            log_to_file(reasoning)
        else:
            log_to_file(" No separate reasoning found")
        # Check for <think> blocks in content (unlikely for OpenAI)
        # (Bug fix: probe was `"" in content.lower()`, which is always True.)
        has_think_block = "<think>" in content.lower() if content else False
        log_to_file(f" Has <think> block in content: {has_think_block}")
        # Try to access reasoning fields directly if they exist
        if (
            hasattr(choice.message, "reasoning_content")
            and choice.message.reasoning_content
        ):
            log_to_file(
                f" message.reasoning_content: {choice.message.reasoning_content}"
            )
        if hasattr(choice.message, "reasoning") and choice.message.reasoning:
            log_to_file(f" message.reasoning: {choice.message.reasoning}")
        if hasattr(choice, "reasoning_details") and choice.reasoning_details:
            log_to_file(f" choice.reasoning_details: {choice.reasoning_details}")
        log_to_file(f"\n{'='*70}\n")
        # Also print summary to console
        print(f"\nResponse content ({len(content) if content else 0} chars)")
        print(
            f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
        )
        print(f"(Full details logged to {LOG_FILE})")
        return {
            "model": OPENAI_MODEL,
            "success": True,
            "content_length": len(content) if content else 0,
            "reasoning_source": source,
            "reasoning_length": len(reasoning) if reasoning else 0,
            "has_think_block": has_think_block,
        }
    except Exception as e:
        # Report the failure but keep the overall test run going
        print(f"Error: {e}")
        return {
            "model": OPENAI_MODEL,
            "success": False,
            "error": str(e),
        }
async def run_all_integration_tests():
    """Run all integration tests and summarize results.

    Exercises every model in OPENROUTER_MODELS plus the configured OpenAI
    model, logging full request/response detail to LOG_FILE.

    Returns:
        bool: True when every provider test succeeded; False when any
        failed or when required API keys are missing.
    """
    print("\n" + "=" * 70)
    print("REASONING MODEL INTEGRATION TESTS")
    print("=" * 70)
    # Check that API keys are set; bail out early rather than failing mid-run
    missing_keys = []
    if not OPENAI_API_KEY:
        missing_keys.append("OPENAI_API_KEY")
    if not OPENROUTER_API_KEY:
        missing_keys.append("OPENROUTER_API_KEY")
    if missing_keys:
        print(f"\n⚠ Missing required environment variables: {', '.join(missing_keys)}")
        print("Set them before running integration tests:")
        print(" export OPENAI_API_KEY='your-key-here'")
        print(" export OPENROUTER_API_KEY='your-key-here'")
        return False
    # Initialize log file ("w" truncates any previous run's output)
    with open(LOG_FILE, "w") as f:
        f.write("REASONING MODEL TEST RESULTS\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"{'='*70}\n\n")
    print(f"\nLogging full ChatCompletion objects to: {LOG_FILE}\n")
    results = []
    # Test OpenRouter models one at a time
    for model in OPENROUTER_MODELS:
        result = await _run_openrouter_reasoning_test(model)
        results.append(result)
    # Test OpenAI
    result = await _run_openai_reasoning_test()
    results.append(result)
    # Summary table: one line per model with pass/fail and reasoning info
    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)
    for result in results:
        status = "✓ PASS" if result.get("success") else "✗ FAIL"
        model = result.get("model", "unknown")
        if result.get("success"):
            reasoning_info = f"reasoning: {result.get('reasoning_source', 'none')}"
            if result.get("reasoning_length", 0) > 0:
                reasoning_info += f" ({result['reasoning_length']} chars)"
            print(f"{status} | {model:40} | {reasoning_info}")
        else:
            # Error text truncated to keep the table aligned
            print(
                f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}"
            )
    # Check if any failed
    failures = [r for r in results if not r.get("success")]
    if failures:
        print(f"\n{len(failures)} test(s) failed")
        return False
    else:
        print(f"\nAll {len(results)} tests passed!")
        return True
# =============================================================================
# MAIN
# =============================================================================
def run_unit_tests():
    """Run all unit tests (no API calls)."""
    print("\n" + "=" * 70)
    print("UNIT TESTS")
    print("=" * 70 + "\n")
    # Pure ReasoningConfig checks first, then the no-API ServerManager
    # flow checks; each test raises AssertionError on failure.
    for unit_test in (
        test_reasoning_config_default,
        test_reasoning_config_enabled_only,
        test_reasoning_config_with_effort,
        test_reasoning_config_with_max_tokens,
        test_reasoning_config_full,
        test_reasoning_config_effort_mapping,
        test_reasoning_config_invalid_effort,
        test_reasoning_config_max_tokens_no_validation,
        test_hermes_prompts_defined,
        test_reasoning_config_from_env_config,
        test_server_manager_builds_extra_body,
        test_full_env_config_to_server_flow,
    ):
        unit_test()
    print("\n" + "=" * 70)
    print("All unit tests passed!")
    print("=" * 70)
async def run_server_manager_integration_test():
    """Run ServerManager integration test with real API call.

    Returns:
        bool: True on success (or skip), False on failure.

    Bug fix: the wrapped pytest-style test asserts internally and returns
    None, so the old ``if result:`` check always took the failure branch
    even when the test passed. Success is now detected by the absence of an
    exception. pytest.skip() raises pytest's Skipped (a BaseException
    subclass), which is handled explicitly so a direct run without
    OPENROUTER_API_KEY doesn't crash.
    """
    print("\n" + "=" * 70)
    print("SERVER MANAGER INTEGRATION TEST")
    print("=" * 70)
    try:
        await test_server_manager_injects_extra_body()
    except pytest.skip.Exception as e:  # raised by pytest.skip() when key is unset
        print(f"\n⚠ ServerManager integration test skipped: {e}")
        return True
    except Exception as e:
        print(f"Error: {e}")
        print("\n✗ ServerManager integration test failed!")
        return False
    print("\n✓ ServerManager integration test passed!")
    return True
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Test reasoning model support")
    cli.add_argument(
        "--unit-only", action="store_true", help="Only run unit tests (no API calls)"
    )
    cli.add_argument(
        "--integration-only",
        action="store_true",
        help="Only run integration tests (API calls to all providers)",
    )
    cli.add_argument(
        "--server-manager-only",
        action="store_true",
        help="Only run ServerManager integration test (single API call)",
    )
    opts = cli.parse_args()

    if opts.integration_only:
        asyncio.run(run_all_integration_tests())
    elif opts.unit_only:
        run_unit_tests()
    elif opts.server_manager_only:
        # Unit tests first, then the single-call integration check
        run_unit_tests()
        asyncio.run(run_server_manager_integration_test())
    else:
        # No flags: run everything (unit, server-manager, all providers)
        run_unit_tests()
        asyncio.run(run_server_manager_integration_test())
        asyncio.run(run_all_integration_tests())