"""
|
|
Test file for reasoning model support across multiple providers.
|
|
|
|
This test validates:
|
|
1. ReasoningConfig builds correct extra_body for different providers
|
|
2. Reasoning can be enabled and responses contain reasoning content
|
|
3. Reasoning extraction works for various API response formats
|
|
|
|
Providers tested:
|
|
- OpenAI (gpt-5.2) - Uses reasoning_effort at top level
|
|
- OpenRouter (anthropic/claude-opus-4.5, nousresearch/hermes-4-70B, deepseek/deepseek-v3.2)
|
|
- Uses nested reasoning object with enabled/effort/max_tokens
|
|
|
|
Usage:
|
|
python -m pytest atroposlib/tests/test_reasoning_models.py -v
|
|
|
|
Or run directly:
|
|
python atroposlib/tests/test_reasoning_models.py
|
|
|
|
Note: This test requires valid API keys. Set them as environment variables or
|
|
modify the constants below for testing.
|
|
"""
|
|
|
|
import asyncio
import json
import os
import sys
from datetime import datetime

import pytest

# Add the project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

import openai  # noqa: E402

from atroposlib.envs.server_handling.server_baseline import (  # noqa: E402
    ReasoningConfig,
)
from environments.eval_environments.eval_helpers import (  # noqa: E402
    HERMES_REASONING_PROMPT,
    HERMES_REASONING_PROMPT_WITH_ANSWER,
    extract_reasoning_from_completion,
    extract_reasoning_from_response,
)

# =============================================================================
# API CONFIGURATION
# =============================================================================

# API keys must be set via environment variables
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.environ.get(
    "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
)

# Models to test on OpenRouter
OPENROUTER_MODELS = [
    "anthropic/claude-opus-4.5",
    "nousresearch/hermes-4-70b",
    "deepseek/deepseek-v3.2",
]

# Test prompt that should trigger reasoning
TEST_PROMPT = "What is 15 * 23? Think step by step before giving your answer."

# Log file for full ChatCompletion objects
LOG_FILE = os.path.join(os.path.dirname(__file__), "reasoning_test_results.log")


def log_to_file(message: str):
    """Append message to log file."""
    with open(LOG_FILE, "a") as f:
        f.write(message + "\n")


# =============================================================================
# UNIT TESTS FOR ReasoningConfig
# =============================================================================


def test_reasoning_config_default():
    """Test default ReasoningConfig is not active."""
    config = ReasoningConfig()
    assert not config.enabled
    assert config.effort is None
    assert config.max_tokens is None
    assert not config.is_reasoning_kwargs_active()
    assert config.build_extra_body() is None
    print("✓ Default ReasoningConfig is inactive")


def test_reasoning_config_enabled_only():
    """Test ReasoningConfig with only enabled=True."""
    config = ReasoningConfig(enabled=True)
    assert config.enabled
    assert config.is_reasoning_kwargs_active()

    # Test for non-OpenAI provider
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {"reasoning": {"enabled": True}}

    # Test for OpenAI provider
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "medium"}
    print("✓ ReasoningConfig with enabled=True works correctly")


def test_reasoning_config_with_effort():
    """Test ReasoningConfig with effort specified."""
    config = ReasoningConfig(effort="high")
    assert config.enabled  # Should be auto-enabled
    assert config.effort == "high"
    assert config.is_reasoning_kwargs_active()

    # Test for non-OpenAI provider
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {"reasoning": {"enabled": True, "effort": "high"}}

    # Test for OpenAI provider
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "high"}
    print("✓ ReasoningConfig with effort works correctly")


def test_reasoning_config_with_max_tokens():
    """Test ReasoningConfig with max_tokens specified."""
    config = ReasoningConfig(max_tokens=4096)
    assert config.enabled  # Should be auto-enabled
    assert config.max_tokens == 4096
    assert config.is_reasoning_kwargs_active()

    # Test for non-OpenAI provider
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {"reasoning": {"enabled": True, "max_tokens": 4096}}

    # Test for OpenAI provider (max_tokens not supported, falls back to medium)
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "medium"}
    print("✓ ReasoningConfig with max_tokens works correctly")


def test_reasoning_config_full():
    """Test ReasoningConfig with all options."""
    config = ReasoningConfig(enabled=True, effort="xhigh", max_tokens=8192)
    assert config.enabled
    assert config.effort == "xhigh"
    assert config.max_tokens == 8192

    # Test for non-OpenAI provider
    # Note: OpenRouter only allows ONE of effort or max_tokens
    # When both are set, effort takes priority
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {
        "reasoning": {
            "enabled": True,
            "effort": "xhigh",
            # max_tokens is NOT included when effort is specified (OpenRouter limitation)
        }
    }

    # Test for OpenAI provider (effort passed through directly)
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "xhigh"}
    print("✓ ReasoningConfig with full options works correctly")


def test_reasoning_config_effort_mapping():
    """Test that effort levels are passed through directly for OpenAI."""
    # All effort levels are now passed through 1:1
    effort_levels = ["none", "minimal", "low", "medium", "high", "xhigh"]

    for effort in effort_levels:
        config = ReasoningConfig(effort=effort)
        extra_body = config.build_extra_body("https://api.openai.com/v1")
        assert (
            extra_body["reasoning_effort"] == effort
        ), f"Expected {effort} to pass through, got {extra_body}"
    print("✓ Effort levels pass through correctly for OpenAI")


def test_reasoning_config_invalid_effort():
    """Test that invalid effort raises ValueError."""
    with pytest.raises(ValueError, match="Invalid reasoning_effort"):
        ReasoningConfig(effort="invalid")
    print("✓ Invalid effort raises ValueError")


def test_reasoning_config_max_tokens_no_validation():
    """Test that max_tokens accepts any value (no range validation).

    Provider limits vary and may change over time:
    - OpenRouter currently caps Anthropic at 1024-32000
    - Native Anthropic API supports up to 128k extended thinking
    We don't enforce limits here to allow flexibility.
    """
    # Low values should work
    config_low = ReasoningConfig(max_tokens=500)
    assert config_low.max_tokens == 500
    assert config_low.enabled  # Auto-enabled

    # High values should work (e.g., for native Anthropic 128k thinking)
    config_high = ReasoningConfig(max_tokens=128000)
    assert config_high.max_tokens == 128000
    assert config_high.enabled

    print("✓ max_tokens accepts any value (no range validation)")


def test_hermes_prompts_defined():
    """Test that Hermes prompts are properly defined."""
    assert HERMES_REASONING_PROMPT is not None
    assert "<think>" in HERMES_REASONING_PROMPT
    assert "</think>" in HERMES_REASONING_PROMPT

    assert HERMES_REASONING_PROMPT_WITH_ANSWER is not None
    assert "<answer>" in HERMES_REASONING_PROMPT_WITH_ANSWER
    print("✓ Hermes prompts are properly defined")


# =============================================================================
# SERVER MANAGER INTEGRATION TESTS
# =============================================================================


def test_reasoning_config_from_env_config():
    """Test ReasoningConfig.from_env_config() creates correct config."""
    from atroposlib.envs.base import BaseEnvConfig

    # Test with thinking_mode only
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
        thinking_mode=True,
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is True
    assert reasoning_config.effort is None
    assert reasoning_config.max_tokens is None
    print("✓ ReasoningConfig.from_env_config with thinking_mode=True works")

    # Test with reasoning_effort
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
        reasoning_effort="high",
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is True  # Auto-enabled because effort is set
    assert reasoning_config.effort == "high"
    print("✓ ReasoningConfig.from_env_config with reasoning_effort works")

    # Test with max_reasoning_tokens
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
        max_reasoning_tokens=8000,
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is True  # Auto-enabled because max_tokens is set
    assert reasoning_config.max_tokens == 8000
    print("✓ ReasoningConfig.from_env_config with max_reasoning_tokens works")

    # Test with all disabled (default)
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is False
    assert not reasoning_config.is_reasoning_kwargs_active()
    print("✓ ReasoningConfig.from_env_config with defaults (disabled) works")


def test_server_manager_builds_extra_body():
    """Test ReasoningConfig.build_extra_body() generates correct extra_body."""
    # Create reasoning config
    reasoning_config = ReasoningConfig(enabled=True, effort="high")

    # We can't easily instantiate ServerManager without actual servers,
    # so let's test the build_extra_body logic directly

    # Test OpenRouter format
    extra_body = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
    assert "reasoning" in extra_body
    assert extra_body["reasoning"]["enabled"] is True
    assert extra_body["reasoning"]["effort"] == "high"
    print("✓ ServerManager builds correct extra_body for OpenRouter")

    # Test OpenAI format
    extra_body = reasoning_config.build_extra_body("https://api.openai.com/v1")
    assert "reasoning_effort" in extra_body
    assert extra_body["reasoning_effort"] == "high"
    assert "reasoning" not in extra_body  # Should NOT have nested reasoning
    print("✓ ServerManager builds correct extra_body for OpenAI")

    # Test Claude (anthropic) - should use max_tokens
    claude_reasoning = ReasoningConfig(enabled=True, max_tokens=8000)
    extra_body = claude_reasoning.build_extra_body("https://openrouter.ai/api/v1")
    assert "reasoning" in extra_body
    assert extra_body["reasoning"]["enabled"] is True
    assert extra_body["reasoning"]["max_tokens"] == 8000
    print("✓ ServerManager builds correct extra_body for Claude (max_tokens)")


async def test_server_manager_injects_extra_body():
    """
    Integration test: Verify ServerManager actually injects extra_body in API calls.

    This test creates a real ServerManager and makes an actual API call to verify
    the full flow works.
    """
    if not OPENROUTER_API_KEY:
        pytest.skip("OPENROUTER_API_KEY not set - skipping integration test")

    from atroposlib.envs.server_handling.server_baseline import APIServerConfig
    from atroposlib.envs.server_handling.server_manager import ServerManager

    # Create server config for OpenRouter
    server_config = APIServerConfig(
        model_name="nousresearch/hermes-4-70b",
        base_url=OPENROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY,
        num_requests_for_eval=10,
    )

    # Create reasoning config
    reasoning_config = ReasoningConfig(enabled=True, effort="high")

    print("\n" + "=" * 60)
    print("Testing ServerManager.chat_completion() with reasoning injection")
    print("=" * 60)

    # Create ServerManager with reasoning config (NOT in testing mode - we want real API call)
    server_manager = ServerManager(
        configs=[server_config],
        reasoning_config=reasoning_config,
        testing=False,  # Actually make the API call
    )

    # Make a chat completion call
    messages = [
        {"role": "system", "content": HERMES_REASONING_PROMPT},
        {"role": "user", "content": "What is 2 + 2? Think carefully."},
    ]

    print(
        f"Making API call: enabled={reasoning_config.enabled}, "
        f"effort={reasoning_config.effort}"
    )

    completion = await server_manager.chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )

    # Verify response has reasoning
    reasoning, source, content = extract_reasoning_from_completion(completion)

    print("Response received!")
    print(
        f"Content: {content[:100]}..."
        if content and len(content) > 100
        else f"Content: {content}"
    )
    print(f"Reasoning source: {source}")
    print(f"Reasoning length: {len(reasoning) if reasoning else 0} chars")

    # Assert we got a response (reasoning is optional - model may not support it)
    assert content is not None, "Expected response content"
    print("✓ ServerManager.chat_completion() correctly injected reasoning extra_body")


def test_full_env_config_to_server_flow():
    """
    Test the complete flow from BaseEnvConfig to ServerManager reasoning injection.

    This verifies that:
    1. BaseEnvConfig with reasoning fields creates properly
    2. ReasoningConfig.from_env_config() works
    3. The resulting config would inject correct extra_body
    """
    from atroposlib.envs.base import BaseEnvConfig

    print("\n" + "=" * 60)
    print("Testing full BaseEnvConfig → ServerManager flow")
    print("=" * 60)

    # Create a config like a user would
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test-reasoning",
        run_name="test-run",
        thinking_mode=True,
        reasoning_effort="high",
        max_reasoning_tokens=8000,
    )

    print("Created BaseEnvConfig:")
    print(f"  thinking_mode: {env_config.thinking_mode}")
    print(f"  reasoning_effort: {env_config.reasoning_effort}")
    print(f"  max_reasoning_tokens: {env_config.max_reasoning_tokens}")

    # Convert to ReasoningConfig (this happens in BaseEnv.__init__)
    reasoning_config = ReasoningConfig.from_env_config(env_config)

    print("\nReasoningConfig created:")
    print(f"  enabled: {reasoning_config.enabled}")
    print(f"  effort: {reasoning_config.effort}")
    print(f"  max_tokens: {reasoning_config.max_tokens}")

    # Verify the config would generate correct extra_body
    # For OpenRouter
    openrouter_extra = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
    print(f"\nOpenRouter extra_body: {json.dumps(openrouter_extra, indent=2)}")
    assert openrouter_extra["reasoning"]["enabled"] is True
    assert openrouter_extra["reasoning"]["effort"] == "high"
    # Note: max_tokens is NOT included when effort is set (OpenRouter limitation)

    # For OpenAI
    openai_extra = reasoning_config.build_extra_body("https://api.openai.com/v1")
    print(f"\nOpenAI extra_body: {json.dumps(openai_extra, indent=2)}")
    assert openai_extra["reasoning_effort"] == "high"

    print("\n✓ Full BaseEnvConfig → ServerManager flow works correctly!")


# =============================================================================
# INTEGRATION TESTS WITH REAL API CALLS
# =============================================================================


async def _run_openrouter_reasoning_test(model: str, effort: str = "high"):
    """
    Run reasoning test with an OpenRouter model (helper function).

    Note: This is a helper function, not a pytest test. It's called by
    run_all_integration_tests() when running the script directly.

    Args:
        model: Model name to test
        effort: Reasoning effort level

    Returns:
        Dict with test results
    """
    print(f"\n{'='*60}")
    print(f"Testing OpenRouter: {model}")
    print(f"{'='*60}")

    client = openai.AsyncOpenAI(
        api_key=OPENROUTER_API_KEY,
        base_url=OPENROUTER_BASE_URL,
    )

    # Build extra_body based on model type
    # Claude models need max_tokens in reasoning dict, not effort
    # See: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
    is_claude = "claude" in model.lower() or "anthropic" in model.lower()

    if is_claude:
        # Claude needs reasoning.max_tokens, and overall max_tokens must be higher
        reasoning_max_tokens = 8000
        # Must exceed reasoning_max_tokens so budget remains for the final answer
        overall_max_tokens = reasoning_max_tokens + 4000
        extra_body = {"reasoning": {"max_tokens": reasoning_max_tokens}}
    else:
        # Other models use effort
        config = ReasoningConfig(enabled=True, effort=effort)
        extra_body = config.build_extra_body(OPENROUTER_BASE_URL)
        overall_max_tokens = 2048  # reasonable completion budget

    messages = [{"role": "user", "content": TEST_PROMPT}]

    # For Hermes, also add the system prompt
    if "hermes" in model.lower():
        messages.insert(0, {"role": "system", "content": HERMES_REASONING_PROMPT})

    # Build the full request for logging
    request_params = {
        "model": model,
        "messages": messages,
        "max_tokens": overall_max_tokens,
        "temperature": 0.7,
        "extra_body": extra_body,
    }

    print("Request params:")
    print(f"  model: {model}")
    print(f"  max_tokens: {overall_max_tokens}")
    print(f"  extra_body: {json.dumps(extra_body, indent=2)}")

    # Log the full request to file
    log_to_file(f"\n{'='*70}")
    log_to_file(f"MODEL: {model}")
    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
    log_to_file(f"{'='*70}")
    log_to_file("\nREQUEST SENT:")
    log_to_file(json.dumps(request_params, indent=2, default=str))

    try:
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=overall_max_tokens,
            temperature=0.7,
            extra_body=extra_body,
        )

        # Log full ChatCompletion object to file for inspection
        log_to_file("\nRESPONSE RECEIVED:")
        log_to_file("\nFULL CHATCOMPLETION OBJECT:")
        log_to_file(str(completion))
        log_to_file(f"\n{'='*40}")

        # Also log the choice and message separately for clarity
        choice = completion.choices[0]
        log_to_file(f"\nChoice object: {choice}")
        log_to_file(f"\nMessage object: {choice.message}")

        # Log all attributes on the message
        log_to_file(f"\nMessage attributes: {dir(choice.message)}")

        # Try to get model_dump if available (pydantic)
        if hasattr(completion, "model_dump"):
            log_to_file("\nCompletion model_dump():")
            log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))

        if hasattr(choice, "model_dump"):
            log_to_file("\nChoice model_dump():")
            log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))

        if hasattr(choice.message, "model_dump"):
            log_to_file("\nMessage model_dump():")
            log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))

        # Get the response content
        content = completion.choices[0].message.content
        log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
        log_to_file(content if content else "(empty)")

        # Try to extract reasoning
        reasoning, source = extract_reasoning_from_response(
            completion.choices[0], content=content
        )

        log_to_file("\nReasoning extraction result:")
        log_to_file(f"  Source: {source}")
        if reasoning:
            log_to_file(f"  Length: {len(reasoning)} chars")
            log_to_file("  Full reasoning content:")
            log_to_file(reasoning)
        else:
            log_to_file("  No separate reasoning found")

        # Check for <think> blocks in content
        has_think_block = "<think>" in content.lower() if content else False
        log_to_file(f"  Has <think> block in content: {has_think_block}")

        # Check for reasoning_details in raw response
        has_reasoning_details = hasattr(completion.choices[0], "reasoning_details")
        has_reasoning_content = hasattr(
            completion.choices[0].message, "reasoning_content"
        )
        log_to_file(f"  Has reasoning_details attr: {has_reasoning_details}")
        log_to_file(f"  Has reasoning_content attr: {has_reasoning_content}")

        # Try to access reasoning fields directly if they exist
        if (
            hasattr(choice.message, "reasoning_content")
            and choice.message.reasoning_content
        ):
            log_to_file(
                f"  message.reasoning_content: {choice.message.reasoning_content}"
            )
        if hasattr(choice.message, "reasoning") and choice.message.reasoning:
            log_to_file(f"  message.reasoning: {choice.message.reasoning}")
        if hasattr(choice, "reasoning_details") and choice.reasoning_details:
            log_to_file(f"  choice.reasoning_details: {choice.reasoning_details}")

        log_to_file(f"\n{'='*70}\n")

        # Also print summary to console
        print(f"\nResponse content ({len(content) if content else 0} chars)")
        print(
            f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
        )
        print(f"(Full details logged to {LOG_FILE})")

        return {
            "model": model,
            "success": True,
            "content_length": len(content) if content else 0,
            "reasoning_source": source,
            "reasoning_length": len(reasoning) if reasoning else 0,
            "has_think_block": has_think_block,
        }

    except Exception as e:
        print(f"Error: {e}")
        return {
            "model": model,
            "success": False,
            "error": str(e),
        }


async def _run_openai_reasoning_test(effort: str = "medium"):
    """
    Run reasoning test with OpenAI official API (helper function).

    Note: This is a helper function, not a pytest test. It's called by
    run_all_integration_tests() when running the script directly.

    Args:
        effort: Reasoning effort level

    Returns:
        Dict with test results
    """
    print(f"\n{'='*60}")
    print(f"Testing OpenAI: {OPENAI_MODEL}")
    print(f"{'='*60}")

    client = openai.AsyncOpenAI(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL,
    )

    # Build extra_body using our ReasoningConfig
    config = ReasoningConfig(enabled=True, effort=effort)
    extra_body = config.build_extra_body(OPENAI_BASE_URL)

    messages = [{"role": "user", "content": TEST_PROMPT}]

    # Build the full request for logging
    request_params = {
        "model": OPENAI_MODEL,
        "messages": messages,
        "max_completion_tokens": 1024,
        "extra_body": extra_body,
    }

    print("Request params:")
    print(f"  model: {OPENAI_MODEL}")
    print("  max_completion_tokens: 1024")
    print(f"  extra_body: {json.dumps(extra_body, indent=2)}")

    # Log the full request to file
    log_to_file(f"\n{'='*70}")
    log_to_file(f"MODEL: {OPENAI_MODEL} (OpenAI)")
    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
    log_to_file(f"{'='*70}")
    log_to_file("\nREQUEST SENT:")
    log_to_file(json.dumps(request_params, indent=2, default=str))

    try:
        completion = await client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=messages,
            # OpenAI reasoning models require max_completion_tokens instead of max_tokens
            max_completion_tokens=1024,
            # Note: OpenAI reasoning models only support temperature=1 (default)
            extra_body=extra_body,
        )

        # Log full ChatCompletion object to file for inspection
        log_to_file("\nRESPONSE RECEIVED:")
        log_to_file("\nFULL CHATCOMPLETION OBJECT:")
        log_to_file(str(completion))
        log_to_file(f"\n{'='*40}")

        # Also log the choice and message separately for clarity
        choice = completion.choices[0]
        log_to_file(f"\nChoice object: {choice}")
        log_to_file(f"\nMessage object: {choice.message}")

        # Log all attributes on the message
        log_to_file(f"\nMessage attributes: {dir(choice.message)}")

        # Try to get model_dump if available (pydantic)
        if hasattr(completion, "model_dump"):
            log_to_file("\nCompletion model_dump():")
            log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))

        if hasattr(choice, "model_dump"):
            log_to_file("\nChoice model_dump():")
            log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))

        if hasattr(choice.message, "model_dump"):
            log_to_file("\nMessage model_dump():")
            log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))

        # Get the response content
        content = completion.choices[0].message.content
        log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
        log_to_file(content if content else "(empty)")

        # Try to extract reasoning
        reasoning, source = extract_reasoning_from_response(
            completion.choices[0], content=content
        )

        log_to_file("\nReasoning extraction result:")
        log_to_file(f"  Source: {source}")
        if reasoning:
            log_to_file(f"  Length: {len(reasoning)} chars")
            log_to_file("  Full reasoning content:")
            log_to_file(reasoning)
        else:
            log_to_file("  No separate reasoning found")

        # Check for <think> blocks in content (unlikely for OpenAI)
        has_think_block = "<think>" in content.lower() if content else False
        log_to_file(f"  Has <think> block in content: {has_think_block}")

        # Try to access reasoning fields directly if they exist
        if (
            hasattr(choice.message, "reasoning_content")
            and choice.message.reasoning_content
        ):
            log_to_file(
                f"  message.reasoning_content: {choice.message.reasoning_content}"
            )
        if hasattr(choice.message, "reasoning") and choice.message.reasoning:
            log_to_file(f"  message.reasoning: {choice.message.reasoning}")
        if hasattr(choice, "reasoning_details") and choice.reasoning_details:
            log_to_file(f"  choice.reasoning_details: {choice.reasoning_details}")

        log_to_file(f"\n{'='*70}\n")

        # Also print summary to console
        print(f"\nResponse content ({len(content) if content else 0} chars)")
        print(
            f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
        )
        print(f"(Full details logged to {LOG_FILE})")

        return {
            "model": OPENAI_MODEL,
            "success": True,
            "content_length": len(content) if content else 0,
            "reasoning_source": source,
            "reasoning_length": len(reasoning) if reasoning else 0,
            "has_think_block": has_think_block,
        }

    except Exception as e:
        print(f"Error: {e}")
        return {
            "model": OPENAI_MODEL,
            "success": False,
            "error": str(e),
        }


async def run_all_integration_tests():
    """Run all integration tests and summarize results."""
    print("\n" + "=" * 70)
    print("REASONING MODEL INTEGRATION TESTS")
    print("=" * 70)

    # Check that API keys are set
    missing_keys = []
    if not OPENAI_API_KEY:
        missing_keys.append("OPENAI_API_KEY")
    if not OPENROUTER_API_KEY:
        missing_keys.append("OPENROUTER_API_KEY")

    if missing_keys:
        print(f"\n⚠ Missing required environment variables: {', '.join(missing_keys)}")
        print("Set them before running integration tests:")
        print("  export OPENAI_API_KEY='your-key-here'")
        print("  export OPENROUTER_API_KEY='your-key-here'")
        return False

    # Initialize log file
    with open(LOG_FILE, "w") as f:
        f.write("REASONING MODEL TEST RESULTS\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"{'='*70}\n\n")

    print(f"\nLogging full ChatCompletion objects to: {LOG_FILE}\n")

    results = []

    # Test OpenRouter models
    for model in OPENROUTER_MODELS:
        result = await _run_openrouter_reasoning_test(model)
        results.append(result)

    # Test OpenAI
    result = await _run_openai_reasoning_test()
    results.append(result)

    # Summary
    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)

    for result in results:
        status = "✓ PASS" if result.get("success") else "✗ FAIL"
        model = result.get("model", "unknown")

        if result.get("success"):
            reasoning_info = f"reasoning: {result.get('reasoning_source', 'none')}"
            if result.get("reasoning_length", 0) > 0:
                reasoning_info += f" ({result['reasoning_length']} chars)"
            print(f"{status} | {model:40} | {reasoning_info}")
        else:
            print(
                f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}"
            )

    # Check if any failed
    failures = [r for r in results if not r.get("success")]
    if failures:
        print(f"\n{len(failures)} test(s) failed")
        return False
    else:
        print(f"\nAll {len(results)} tests passed!")
        return True


# =============================================================================
# MAIN
# =============================================================================


def run_unit_tests():
    """Run all unit tests (no API calls)."""
    print("\n" + "=" * 70)
    print("UNIT TESTS")
    print("=" * 70 + "\n")

    # ReasoningConfig unit tests
    test_reasoning_config_default()
    test_reasoning_config_enabled_only()
    test_reasoning_config_with_effort()
    test_reasoning_config_with_max_tokens()
    test_reasoning_config_full()
    test_reasoning_config_effort_mapping()
    test_reasoning_config_invalid_effort()
    test_reasoning_config_max_tokens_no_validation()
    test_hermes_prompts_defined()

    # ServerManager integration tests (no API calls)
    test_reasoning_config_from_env_config()
    test_server_manager_builds_extra_body()
    test_full_env_config_to_server_flow()

    print("\n" + "=" * 70)
    print("All unit tests passed!")
    print("=" * 70)


async def run_server_manager_integration_test():
    """Run ServerManager integration test with real API call."""
    print("\n" + "=" * 70)
    print("SERVER MANAGER INTEGRATION TEST")
    print("=" * 70)

    # test_server_manager_injects_extra_body() asserts on failure and returns
    # None on success, so treat "no exception raised" as a pass.
    try:
        await test_server_manager_injects_extra_body()
    except Exception as e:
        print(f"\n✗ ServerManager integration test failed: {e}")
        return False

    print("\n✓ ServerManager integration test passed!")
    return True


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Test reasoning model support")
    parser.add_argument(
        "--unit-only", action="store_true", help="Only run unit tests (no API calls)"
    )
    parser.add_argument(
        "--integration-only",
        action="store_true",
        help="Only run integration tests (API calls to all providers)",
    )
    parser.add_argument(
        "--server-manager-only",
        action="store_true",
        help="Only run ServerManager integration test (single API call)",
    )
    args = parser.parse_args()

    if args.integration_only:
        asyncio.run(run_all_integration_tests())
    elif args.unit_only:
        run_unit_tests()
    elif args.server_manager_only:
        run_unit_tests()  # Run unit tests first
        asyncio.run(run_server_manager_integration_test())
    else:
        # Run all tests
        run_unit_tests()
        asyncio.run(run_server_manager_integration_test())
        asyncio.run(run_all_integration_tests())