[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
pre-commit-ci[bot] 2025-12-30 00:26:29 +00:00
parent 62fa51240c
commit 97047eee7b
5 changed files with 320 additions and 280 deletions
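Most of the hunks below are mechanical restyling rather than logic changes. A small sketch of the rules at work, assuming the usual black formatting hook (the commit does not show the hook configuration itself):

import os

banner = "=" * 60  # black puts spaces around binary operators (the old code had "="*60)
# Calls that fit the 88-column limit are collapsed onto one line; longer ones are
# exploded one argument per line with a trailing comma:
LOG_FILE = os.path.join(os.path.dirname(__file__), "reasoning_test_results.log")
# The reshuffled imports are consistent with isort's default order_by_type setting:
# CONSTANT_NAMES sort ahead of CamelCase classes, which sort ahead of snake_case names.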


@@ -13,7 +13,7 @@ Providers tested:
Usage:
python -m pytest atroposlib/tests/test_reasoning_models.py -v
Or run directly:
python atroposlib/tests/test_reasoning_models.py
@@ -34,17 +34,16 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
import openai
from atroposlib.envs.server_handling.server_baseline import (
-ReasoningConfig,
VALID_REASONING_EFFORTS,
+ReasoningConfig,
)
from environments.eval_environments.eval_helpers import (
HERMES_REASONING_PROMPT,
HERMES_REASONING_PROMPT_WITH_ANSWER,
-extract_reasoning_from_response,
extract_reasoning_from_completion,
+extract_reasoning_from_response,
)
# =============================================================================
# API CONFIGURATION
# =============================================================================
@@ -56,7 +55,9 @@ OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
OPENROUTER_BASE_URL = os.environ.get(
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
)
# Models to test on OpenRouter
OPENROUTER_MODELS = [
@@ -69,10 +70,7 @@ OPENROUTER_MODELS = [
TEST_PROMPT = "What is 15 * 23? Think step by step before giving your answer."
# Log file for full ChatCompletion objects
-LOG_FILE = os.path.join(
-os.path.dirname(__file__),
-"reasoning_test_results.log"
-)
+LOG_FILE = os.path.join(os.path.dirname(__file__), "reasoning_test_results.log")
def log_to_file(message: str):
@@ -85,6 +83,7 @@ def log_to_file(message: str):
# UNIT TESTS FOR ReasoningConfig
# =============================================================================
def test_reasoning_config_default():
"""Test default ReasoningConfig is not active."""
config = ReasoningConfig()
@@ -101,11 +100,11 @@ def test_reasoning_config_enabled_only():
config = ReasoningConfig(enabled=True)
assert config.enabled
assert config.is_active()
# Test for non-OpenAI provider
extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
assert extra_body == {"reasoning": {"enabled": True}}
# Test for OpenAI provider
extra_body = config.build_extra_body("https://api.openai.com/v1")
assert extra_body == {"reasoning_effort": "medium"}
@@ -118,11 +117,11 @@ def test_reasoning_config_with_effort():
assert config.enabled # Should be auto-enabled
assert config.effort == "high"
assert config.is_active()
# Test for non-OpenAI provider
extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
assert extra_body == {"reasoning": {"enabled": True, "effort": "high"}}
# Test for OpenAI provider
extra_body = config.build_extra_body("https://api.openai.com/v1")
assert extra_body == {"reasoning_effort": "high"}
@@ -135,11 +134,11 @@ def test_reasoning_config_with_max_tokens():
assert config.enabled # Should be auto-enabled
assert config.max_tokens == 4096
assert config.is_active()
# Test for non-OpenAI provider
extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
assert extra_body == {"reasoning": {"enabled": True, "max_tokens": 4096}}
# Test for OpenAI provider (max_tokens not supported, falls back to medium)
extra_body = config.build_extra_body("https://api.openai.com/v1")
assert extra_body == {"reasoning_effort": "medium"}
@@ -152,7 +151,7 @@ def test_reasoning_config_full():
assert config.enabled
assert config.effort == "xhigh"
assert config.max_tokens == 8192
# Test for non-OpenAI provider
# Note: OpenRouter only allows ONE of effort or max_tokens
# When both are set, effort takes priority
@@ -164,7 +163,7 @@ def test_reasoning_config_full():
# max_tokens is NOT included when effort is specified (OpenRouter limitation)
}
}
# Test for OpenAI provider (xhigh maps to high)
extra_body = config.build_extra_body("https://api.openai.com/v1")
assert extra_body == {"reasoning_effort": "high"}
@@ -181,12 +180,13 @@ def test_reasoning_config_effort_mapping():
"high": "high",
"xhigh": "high",
}
for our_effort, expected_openai in mappings.items():
config = ReasoningConfig(effort=our_effort)
extra_body = config.build_extra_body("https://api.openai.com/v1")
assert extra_body["reasoning_effort"] == expected_openai, \
f"Expected {our_effort} to map to {expected_openai}, got {extra_body}"
assert (
extra_body["reasoning_effort"] == expected_openai
), f"Expected {our_effort} to map to {expected_openai}, got {extra_body}"
print("✓ Effort level mapping for OpenAI works correctly")
@@ -208,7 +208,7 @@ def test_reasoning_config_invalid_max_tokens():
assert False, "Should have raised ValueError for too low"
except ValueError as e:
assert "must be between 1024 and 32000" in str(e)
# Too high
try:
config = ReasoningConfig(max_tokens=50000)
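The error these two cases expect implies a plain range check inside ReasoningConfig; a minimal sketch of that validation (the function name and any wording beyond the asserted substring are assumptions):

def _check_max_reasoning_tokens(value: int) -> int:
    # The asserted substring above fixes the accepted range at 1024 to 32000.
    if not (1024 <= value <= 32000):
        raise ValueError(f"max_tokens must be between 1024 and 32000, got {value}")
    return value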
@@ -223,7 +223,7 @@ def test_hermes_prompts_defined():
assert HERMES_REASONING_PROMPT is not None
assert "<think>" in HERMES_REASONING_PROMPT
assert "</think>" in HERMES_REASONING_PROMPT
assert HERMES_REASONING_PROMPT_WITH_ANSWER is not None
assert "<answer>" in HERMES_REASONING_PROMPT_WITH_ANSWER
print("✓ Hermes prompts are properly defined")
@@ -233,10 +233,11 @@ def test_hermes_prompts_defined():
# SERVER MANAGER INTEGRATION TESTS
# =============================================================================
def test_reasoning_config_from_env_config():
"""Test ReasoningConfig.from_env_config() creates correct config."""
from atroposlib.envs.base import BaseEnvConfig
# Test with thinking_mode only
env_config = BaseEnvConfig(
tokenizer_name="gpt2",
@@ -249,7 +250,7 @@ def test_reasoning_config_from_env_config():
assert reasoning_config.effort is None
assert reasoning_config.max_tokens is None
print("✓ ReasoningConfig.from_env_config with thinking_mode=True works")
# Test with reasoning_effort
env_config = BaseEnvConfig(
tokenizer_name="gpt2",
@@ -261,7 +262,7 @@ def test_reasoning_config_from_env_config():
assert reasoning_config.enabled == True # Auto-enabled because effort is set
assert reasoning_config.effort == "high"
print("✓ ReasoningConfig.from_env_config with reasoning_effort works")
# Test with max_reasoning_tokens
env_config = BaseEnvConfig(
tokenizer_name="gpt2",
@@ -273,7 +274,7 @@ def test_reasoning_config_from_env_config():
assert reasoning_config.enabled == True # Auto-enabled because max_tokens is set
assert reasoning_config.max_tokens == 8000
print("✓ ReasoningConfig.from_env_config with max_reasoning_tokens works")
# Test with all disabled (default)
env_config = BaseEnvConfig(
tokenizer_name="gpt2",
@@ -289,9 +290,10 @@ def test_reasoning_config_from_env_config():
def test_server_manager_builds_extra_body():
"""Test ServerManager._build_extra_body() injects correct extra_body."""
from unittest.mock import MagicMock
-from atroposlib.envs.server_handling.server_manager import ServerManager
from atroposlib.envs.server_handling.server_baseline import APIServerConfig
+from atroposlib.envs.server_handling.server_manager import ServerManager
# Create a mock server with OpenRouter base_url
openrouter_config = APIServerConfig(
model_name="nousresearch/hermes-4-70b",
@@ -299,27 +301,27 @@ def test_server_manager_builds_extra_body():
api_key="test-key",
num_requests_for_eval=10,
)
# Create ServerManager with reasoning config
reasoning_config = ReasoningConfig(enabled=True, effort="high")
# We can't easily instantiate ServerManager without actual servers,
# so let's test the _build_extra_body logic directly
# Test OpenRouter format
extra_body = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
assert "reasoning" in extra_body
assert extra_body["reasoning"]["enabled"] == True
assert extra_body["reasoning"]["effort"] == "high"
print("✓ ServerManager builds correct extra_body for OpenRouter")
# Test OpenAI format
extra_body = reasoning_config.build_extra_body("https://api.openai.com/v1")
assert "reasoning_effort" in extra_body
assert extra_body["reasoning_effort"] == "high"
assert "reasoning" not in extra_body # Should NOT have nested reasoning
print("✓ ServerManager builds correct extra_body for OpenAI")
# Test Claude (anthropic) - should use max_tokens
claude_reasoning = ReasoningConfig(enabled=True, max_tokens=8000)
extra_body = claude_reasoning.build_extra_body("https://openrouter.ai/api/v1")
@@ -332,17 +334,17 @@ def test_server_manager_builds_extra_body():
async def test_server_manager_injects_extra_body():
"""
Integration test: Verify ServerManager actually injects extra_body in API calls.
This test creates a real ServerManager and makes an actual API call to verify
the full flow works.
"""
if not OPENROUTER_API_KEY:
print("⚠ Skipping ServerManager integration test - OPENROUTER_API_KEY not set")
return True
-from atroposlib.envs.server_handling.server_manager import ServerManager
from atroposlib.envs.server_handling.server_baseline import APIServerConfig
+from atroposlib.envs.server_handling.server_manager import ServerManager
# Create server config for OpenRouter
server_config = APIServerConfig(
model_name="nousresearch/hermes-4-70b",
@@ -350,14 +352,14 @@ async def test_server_manager_injects_extra_body():
api_key=OPENROUTER_API_KEY,
num_requests_for_eval=10,
)
# Create reasoning config
reasoning_config = ReasoningConfig(enabled=True, effort="high")
print("\n" + "="*60)
print("\n" + "=" * 60)
print("Testing ServerManager.chat_completion() with reasoning injection")
print("="*60)
print("=" * 60)
try:
# Create ServerManager with reasoning config (NOT in testing mode - we want real API call)
server_manager = ServerManager(
@@ -365,38 +367,49 @@ async def test_server_manager_injects_extra_body():
reasoning_config=reasoning_config,
testing=False, # Actually make the API call
)
# Make a chat completion call
messages = [
{"role": "system", "content": HERMES_REASONING_PROMPT},
{"role": "user", "content": "What is 2 + 2? Think carefully."}
{"role": "user", "content": "What is 2 + 2? Think carefully."},
]
print(f"Making API call with reasoning config: enabled={reasoning_config.enabled}, effort={reasoning_config.effort}")
print(
f"Making API call with reasoning config: enabled={reasoning_config.enabled}, effort={reasoning_config.effort}"
)
completion = await server_manager.chat_completion(
messages=messages,
max_tokens=512,
temperature=0.7,
)
# Verify response has reasoning
reasoning, source, content = extract_reasoning_from_completion(completion)
print(f"Response received!")
print(f"Content: {content[:100]}..." if content and len(content) > 100 else f"Content: {content}")
print(
f"Content: {content[:100]}..."
if content and len(content) > 100
else f"Content: {content}"
)
print(f"Reasoning source: {source}")
print(f"Reasoning length: {len(reasoning) if reasoning else 0} chars")
if reasoning:
print("✓ ServerManager.chat_completion() correctly injected reasoning extra_body")
print(
"✓ ServerManager.chat_completion() correctly injected reasoning extra_body"
)
return True
else:
print("⚠ Response received but no reasoning found (model may not support it)")
print(
"⚠ Response received but no reasoning found (model may not support it)"
)
return True # Still a pass - the injection worked, model just didn't return reasoning
except Exception as e:
import traceback
print(f"✗ ServerManager test failed: {e}")
traceback.print_exc()
return False
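Mechanically, the injection this test verifies amounts to ServerManager merging the provider-appropriate payload into each SDK call. A rough sketch of that flow (the internals are assumptions; only chat_completion, build_extra_body, and the config objects are named by the tests):

async def chat_completion(self, messages, **kwargs):
    # Sketch: pick a backing server, derive extra_body from its base_url,
    # and pass the merged payload through to the OpenAI client call.
    server = self.servers[0]  # real server selection elided
    extra = self.reasoning_config.build_extra_body(server.config.base_url)
    if extra:
        kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **extra}
    return await server.client.chat.completions.create(messages=messages, **kwargs)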
@@ -405,18 +418,18 @@ async def test_server_manager_injects_extra_body():
def test_full_env_config_to_server_flow():
"""
Test the complete flow from BaseEnvConfig to ServerManager reasoning injection.
This verifies that:
1. BaseEnvConfig with reasoning fields creates properly
2. ReasoningConfig.from_env_config() works
3. The resulting config would inject correct extra_body
"""
from atroposlib.envs.base import BaseEnvConfig
print("\n" + "="*60)
print("\n" + "=" * 60)
print("Testing full BaseEnvConfig → ServerManager flow")
print("="*60)
print("=" * 60)
# Create a config like a user would
env_config = BaseEnvConfig(
tokenizer_name="gpt2",
@@ -426,20 +439,20 @@ def test_full_env_config_to_server_flow():
reasoning_effort="high",
max_reasoning_tokens=8000,
)
print(f"Created BaseEnvConfig:")
print(f" thinking_mode: {env_config.thinking_mode}")
print(f" reasoning_effort: {env_config.reasoning_effort}")
print(f" max_reasoning_tokens: {env_config.max_reasoning_tokens}")
# Convert to ReasoningConfig (this happens in BaseEnv.__init__)
reasoning_config = ReasoningConfig.from_env_config(env_config)
print(f"\nReasoningConfig created:")
print(f" enabled: {reasoning_config.enabled}")
print(f" effort: {reasoning_config.effort}")
print(f" max_tokens: {reasoning_config.max_tokens}")
# Verify the config would generate correct extra_body
# For OpenRouter
openrouter_extra = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
@@ -447,12 +460,12 @@ def test_full_env_config_to_server_flow():
assert openrouter_extra["reasoning"]["enabled"] == True
assert openrouter_extra["reasoning"]["effort"] == "high"
# Note: max_tokens is NOT included when effort is set (OpenRouter limitation)
# For OpenAI
openai_extra = reasoning_config.build_extra_body("https://api.openai.com/v1")
print(f"\nOpenAI extra_body: {json.dumps(openai_extra, indent=2)}")
assert openai_extra["reasoning_effort"] == "high"
print("\n✓ Full BaseEnvConfig → ServerManager flow works correctly!")
return True
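Condensed, the user-facing path this test just walked (other required BaseEnvConfig fields elided):

env_config = BaseEnvConfig(
    tokenizer_name="gpt2",
    thinking_mode=True,
    reasoning_effort="high",
    max_reasoning_tokens=8000,
)
reasoning_config = ReasoningConfig.from_env_config(env_config)  # done in BaseEnv.__init__
extra_body = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
# -> {"reasoning": {"enabled": True, "effort": "high"}} (max_tokens dropped; effort wins)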
@@ -461,54 +474,49 @@ def test_full_env_config_to_server_flow():
# INTEGRATION TESTS WITH REAL API CALLS
# =============================================================================
async def test_openrouter_reasoning(model: str, effort: str = "high"):
"""
Test reasoning with an OpenRouter model.
Args:
model: Model name to test
effort: Reasoning effort level
Returns:
Dict with test results
"""
print(f"\n{'='*60}")
print(f"Testing OpenRouter: {model}")
print(f"{'='*60}")
client = openai.AsyncOpenAI(
api_key=OPENROUTER_API_KEY,
base_url=OPENROUTER_BASE_URL,
)
# Build extra_body based on model type
# Claude models need max_tokens in reasoning dict, not effort
# See: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
is_claude = "claude" in model.lower() or "anthropic" in model.lower()
if is_claude:
# Claude needs reasoning.max_tokens, and overall max_tokens must be higher
reasoning_max_tokens = 8000
overall_max_tokens = 10000  # Must be > reasoning_max_tokens (assumed budget; any value above 8000 works)
-extra_body = {
-"reasoning": {
-"max_tokens": reasoning_max_tokens
-}
-}
+extra_body = {"reasoning": {"max_tokens": reasoning_max_tokens}}
else:
# Other models use effort
config = ReasoningConfig(enabled=True, effort=effort)
extra_body = config.build_extra_body(OPENROUTER_BASE_URL)
overall_max_tokens = 1024  # assumed completion budget; a value of 0 would request no output
-messages = [
-{"role": "user", "content": TEST_PROMPT}
-]
+messages = [{"role": "user", "content": TEST_PROMPT}]
# For Hermes, also add the system prompt
if "hermes" in model.lower():
messages.insert(0, {"role": "system", "content": HERMES_REASONING_PROMPT})
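Concretely, the two request shapes this branch produces, with the values set above:

claude_extra_body = {"reasoning": {"max_tokens": 8000}}  # budget-based, Anthropic style
other_extra_body = {"reasoning": {"enabled": True, "effort": "high"}}  # effort-based default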
# Build the full request for logging
request_params = {
"model": model,
@@ -517,12 +525,12 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
"temperature": 0.7,
"extra_body": extra_body,
}
print(f"Request params:")
print(f" model: {model}")
print(f" max_tokens: {overall_max_tokens}")
print(f" extra_body: {json.dumps(extra_body, indent=2)}")
# Log the full request to file
log_to_file(f"\n{'='*70}")
log_to_file(f"MODEL: {model}")
@@ -530,7 +538,7 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
log_to_file(f"{'='*70}")
log_to_file(f"\nREQUEST SENT:")
log_to_file(json.dumps(request_params, indent=2, default=str))
try:
completion = await client.chat.completions.create(
model=model,
@@ -539,45 +547,44 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
temperature=0.7,
extra_body=extra_body,
)
# Log full ChatCompletion object to file for inspection
log_to_file(f"\nRESPONSE RECEIVED:")
log_to_file(f"\nFULL CHATCOMPLETION OBJECT:")
log_to_file(str(completion))
log_to_file(f"\n{'='*40}")
# Also log the choice and message separately for clarity
choice = completion.choices[0]
log_to_file(f"\nChoice object: {choice}")
log_to_file(f"\nMessage object: {choice.message}")
# Log all attributes on the message
log_to_file(f"\nMessage attributes: {dir(choice.message)}")
# Try to get model_dump if available (pydantic)
if hasattr(completion, "model_dump"):
log_to_file(f"\nCompletion model_dump():")
log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
if hasattr(choice, "model_dump"):
log_to_file(f"\nChoice model_dump():")
log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
if hasattr(choice.message, "model_dump"):
log_to_file(f"\nMessage model_dump():")
log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
# Get the response content
content = completion.choices[0].message.content
log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
log_to_file(content if content else "(empty)")
# Try to extract reasoning
reasoning, source = extract_reasoning_from_response(
-completion.choices[0],
-content=content
+completion.choices[0], content=content
)
log_to_file(f"\nReasoning extraction result:")
log_to_file(f" Source: {source}")
if reasoning:
@@ -586,32 +593,41 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
log_to_file(reasoning)
else:
log_to_file(" No separate reasoning found")
# Check for <think> blocks in content
has_think_block = "<think>" in content.lower() if content else False
log_to_file(f" Has <think> block in content: {has_think_block}")
# Check for reasoning_details in raw response
has_reasoning_details = hasattr(completion.choices[0], "reasoning_details")
has_reasoning_content = hasattr(completion.choices[0].message, "reasoning_content")
has_reasoning_content = hasattr(
completion.choices[0].message, "reasoning_content"
)
log_to_file(f" Has reasoning_details attr: {has_reasoning_details}")
log_to_file(f" Has reasoning_content attr: {has_reasoning_content}")
# Try to access reasoning fields directly if they exist
if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
log_to_file(f" message.reasoning_content: {choice.message.reasoning_content}")
if (
hasattr(choice.message, "reasoning_content")
and choice.message.reasoning_content
):
log_to_file(
f" message.reasoning_content: {choice.message.reasoning_content}"
)
if hasattr(choice.message, "reasoning") and choice.message.reasoning:
log_to_file(f" message.reasoning: {choice.message.reasoning}")
if hasattr(choice, "reasoning_details") and choice.reasoning_details:
log_to_file(f" choice.reasoning_details: {choice.reasoning_details}")
log_to_file(f"\n{'='*70}\n")
# Also print summary to console
print(f"\nResponse content ({len(content) if content else 0} chars)")
print(f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars")
print(
f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
)
print(f"(Full details logged to {LOG_FILE})")
return {
"model": model,
"success": True,
@@ -620,7 +636,7 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
"reasoning_length": len(reasoning) if reasoning else 0,
"has_think_block": has_think_block,
}
except Exception as e:
print(f"Error: {e}")
return {
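The attribute probes logged above suggest the precedence the extraction helper applies. A sketch of extract_reasoning_from_response's fallback chain (the ordering is inferred from this logging, and the source labels are assumptions):

def extract_reasoning_from_response(choice, content=None):
    # Prefer structured reasoning fields; fall back to a <think> block in content.
    message = choice.message
    if getattr(message, "reasoning_content", None):
        return message.reasoning_content, "reasoning_content"
    if getattr(message, "reasoning", None):
        return message.reasoning, "reasoning"
    if getattr(choice, "reasoning_details", None):
        return str(choice.reasoning_details), "reasoning_details"
    think = _think_block_fallback(content)  # see the sketch after the Hermes prompt test
    return think, ("think_block" if think else "none")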
@@ -633,30 +649,28 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
async def test_openai_reasoning(effort: str = "medium"):
"""
Test reasoning with OpenAI official API.
Args:
effort: Reasoning effort level
Returns:
Dict with test results
"""
print(f"\n{'='*60}")
print(f"Testing OpenAI: {OPENAI_MODEL}")
print(f"{'='*60}")
client = openai.AsyncOpenAI(
api_key=OPENAI_API_KEY,
base_url=OPENAI_BASE_URL,
)
# Build extra_body using our ReasoningConfig
config = ReasoningConfig(enabled=True, effort=effort)
extra_body = config.build_extra_body(OPENAI_BASE_URL)
-messages = [
-{"role": "user", "content": TEST_PROMPT}
-]
+messages = [{"role": "user", "content": TEST_PROMPT}]
# Build the full request for logging
request_params = {
"model": OPENAI_MODEL,
@@ -664,12 +678,12 @@ async def test_openai_reasoning(effort: str = "medium"):
"max_completion_tokens": 1024,
"extra_body": extra_body,
}
print(f"Request params:")
print(f" model: {OPENAI_MODEL}")
print(f" max_completion_tokens: 1024")
print(f" extra_body: {json.dumps(extra_body, indent=2)}")
# Log the full request to file
log_to_file(f"\n{'='*70}")
log_to_file(f"MODEL: {OPENAI_MODEL} (OpenAI)")
@@ -677,7 +691,7 @@ async def test_openai_reasoning(effort: str = "medium"):
log_to_file(f"{'='*70}")
log_to_file(f"\nREQUEST SENT:")
log_to_file(json.dumps(request_params, indent=2, default=str))
try:
completion = await client.chat.completions.create(
model=OPENAI_MODEL,
@@ -686,45 +700,44 @@ async def test_openai_reasoning(effort: str = "medium"):
# Note: OpenAI reasoning models only support temperature=1 (default)
extra_body=extra_body,
)
# Log full ChatCompletion object to file for inspection
log_to_file(f"\nRESPONSE RECEIVED:")
log_to_file(f"\nFULL CHATCOMPLETION OBJECT:")
log_to_file(str(completion))
log_to_file(f"\n{'='*40}")
# Also log the choice and message separately for clarity
choice = completion.choices[0]
log_to_file(f"\nChoice object: {choice}")
log_to_file(f"\nMessage object: {choice.message}")
# Log all attributes on the message
log_to_file(f"\nMessage attributes: {dir(choice.message)}")
# Try to get model_dump if available (pydantic)
if hasattr(completion, "model_dump"):
log_to_file(f"\nCompletion model_dump():")
log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
if hasattr(choice, "model_dump"):
log_to_file(f"\nChoice model_dump():")
log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
if hasattr(choice.message, "model_dump"):
log_to_file(f"\nMessage model_dump():")
log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
# Get the response content
content = completion.choices[0].message.content
log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
log_to_file(content if content else "(empty)")
# Try to extract reasoning
reasoning, source = extract_reasoning_from_response(
-completion.choices[0],
-content=content
+completion.choices[0], content=content
)
log_to_file(f"\nReasoning extraction result:")
log_to_file(f" Source: {source}")
if reasoning:
@@ -733,26 +746,33 @@ async def test_openai_reasoning(effort: str = "medium"):
log_to_file(reasoning)
else:
log_to_file(" No separate reasoning found")
# Check for <think> blocks in content (unlikely for OpenAI)
has_think_block = "<think>" in content.lower() if content else False
log_to_file(f" Has <think> block in content: {has_think_block}")
# Try to access reasoning fields directly if they exist
if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
log_to_file(f" message.reasoning_content: {choice.message.reasoning_content}")
if (
hasattr(choice.message, "reasoning_content")
and choice.message.reasoning_content
):
log_to_file(
f" message.reasoning_content: {choice.message.reasoning_content}"
)
if hasattr(choice.message, "reasoning") and choice.message.reasoning:
log_to_file(f" message.reasoning: {choice.message.reasoning}")
if hasattr(choice, "reasoning_details") and choice.reasoning_details:
log_to_file(f" choice.reasoning_details: {choice.reasoning_details}")
log_to_file(f"\n{'='*70}\n")
# Also print summary to console
print(f"\nResponse content ({len(content) if content else 0} chars)")
print(f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars")
print(
f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
)
print(f"(Full details logged to {LOG_FILE})")
return {
"model": OPENAI_MODEL,
"success": True,
@@ -761,7 +781,7 @@ async def test_openai_reasoning(effort: str = "medium"):
"reasoning_length": len(reasoning) if reasoning else 0,
"has_think_block": has_think_block,
}
except Exception as e:
print(f"Error: {e}")
return {
@@ -773,60 +793,62 @@ async def test_openai_reasoning(effort: str = "medium"):
async def run_all_integration_tests():
"""Run all integration tests and summarize results."""
print("\n" + "="*70)
print("\n" + "=" * 70)
print("REASONING MODEL INTEGRATION TESTS")
print("="*70)
print("=" * 70)
# Check that API keys are set
missing_keys = []
if not OPENAI_API_KEY:
missing_keys.append("OPENAI_API_KEY")
if not OPENROUTER_API_KEY:
missing_keys.append("OPENROUTER_API_KEY")
if missing_keys:
print(f"\n⚠ Missing required environment variables: {', '.join(missing_keys)}")
print("Set them before running integration tests:")
print(" export OPENAI_API_KEY='your-key-here'")
print(" export OPENROUTER_API_KEY='your-key-here'")
return False
# Initialize log file
with open(LOG_FILE, "w") as f:
f.write(f"REASONING MODEL TEST RESULTS\n")
f.write(f"Generated: {datetime.now().isoformat()}\n")
f.write(f"{'='*70}\n\n")
print(f"\nLogging full ChatCompletion objects to: {LOG_FILE}\n")
results = []
# Test OpenRouter models
for model in OPENROUTER_MODELS:
result = await test_openrouter_reasoning(model)
results.append(result)
# Test OpenAI
result = await test_openai_reasoning()
results.append(result)
# Summary
print("\n" + "="*70)
print("\n" + "=" * 70)
print("TEST SUMMARY")
print("="*70)
print("=" * 70)
for result in results:
status = "✓ PASS" if result.get("success") else "✗ FAIL"
model = result.get("model", "unknown")
if result.get("success"):
reasoning_info = f"reasoning: {result.get('reasoning_source', 'none')}"
if result.get("reasoning_length", 0) > 0:
reasoning_info += f" ({result['reasoning_length']} chars)"
print(f"{status} | {model:40} | {reasoning_info}")
else:
print(f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}")
print(
f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}"
)
# Check if any failed
failures = [r for r in results if not r.get("success")]
if failures:
@@ -841,12 +863,13 @@ async def run_all_integration_tests():
# MAIN
# =============================================================================
def run_unit_tests():
"""Run all unit tests (no API calls)."""
print("\n" + "="*70)
print("\n" + "=" * 70)
print("UNIT TESTS")
print("="*70 + "\n")
print("=" * 70 + "\n")
# ReasoningConfig unit tests
test_reasoning_config_default()
test_reasoning_config_enabled_only()
@@ -857,54 +880,52 @@ def run_unit_tests():
test_reasoning_config_invalid_effort()
test_reasoning_config_invalid_max_tokens()
test_hermes_prompts_defined()
# ServerManager integration tests (no API calls)
test_reasoning_config_from_env_config()
test_server_manager_builds_extra_body()
test_full_env_config_to_server_flow()
print("\n" + "="*70)
print("\n" + "=" * 70)
print("All unit tests passed!")
print("="*70)
print("=" * 70)
async def run_server_manager_integration_test():
"""Run ServerManager integration test with real API call."""
print("\n" + "="*70)
print("\n" + "=" * 70)
print("SERVER MANAGER INTEGRATION TEST")
print("="*70)
print("=" * 70)
result = await test_server_manager_injects_extra_body()
if result:
print("\n✓ ServerManager integration test passed!")
else:
print("\n✗ ServerManager integration test failed!")
return result
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Test reasoning model support")
parser.add_argument(
"--unit-only",
action="store_true",
help="Only run unit tests (no API calls)"
"--unit-only", action="store_true", help="Only run unit tests (no API calls)"
)
parser.add_argument(
"--integration-only",
action="store_true",
help="Only run integration tests (API calls to all providers)"
action="store_true",
help="Only run integration tests (API calls to all providers)",
)
parser.add_argument(
"--server-manager-only",
action="store_true",
help="Only run ServerManager integration test (single API call)"
help="Only run ServerManager integration test (single API call)",
)
args = parser.parse_args()
if args.integration_only:
asyncio.run(run_all_integration_tests())
elif args.unit_only:
@@ -917,4 +938,3 @@ if __name__ == "__main__":
run_unit_tests()
asyncio.run(run_server_manager_integration_test())
asyncio.run(run_all_integration_tests())