mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
This commit is contained in:
parent
62fa51240c
commit
97047eee7b
5 changed files with 320 additions and 280 deletions
|
|
@ -13,7 +13,7 @@ Providers tested:
|
|||
|
||||
Usage:
|
||||
python -m pytest atroposlib/tests/test_reasoning_models.py -v
|
||||
|
||||
|
||||
Or run directly:
|
||||
python atroposlib/tests/test_reasoning_models.py
|
||||
|
||||
|
|
@ -34,17 +34,16 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
|||
import openai
|
||||
|
||||
from atroposlib.envs.server_handling.server_baseline import (
|
||||
ReasoningConfig,
|
||||
VALID_REASONING_EFFORTS,
|
||||
ReasoningConfig,
|
||||
)
|
||||
from environments.eval_environments.eval_helpers import (
|
||||
HERMES_REASONING_PROMPT,
|
||||
HERMES_REASONING_PROMPT_WITH_ANSWER,
|
||||
extract_reasoning_from_response,
|
||||
extract_reasoning_from_completion,
|
||||
extract_reasoning_from_response,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API CONFIGURATION
|
||||
# =============================================================================
|
||||
|
|
@ -56,7 +55,9 @@ OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
|
|||
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
|
||||
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
|
||||
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
|
||||
OPENROUTER_BASE_URL = os.environ.get(
|
||||
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
|
||||
)
|
||||
|
||||
# Models to test on OpenRouter
|
||||
OPENROUTER_MODELS = [
|
||||
|
|
@ -69,10 +70,7 @@ OPENROUTER_MODELS = [
|
|||
TEST_PROMPT = "What is 15 * 23? Think step by step before giving your answer."
|
||||
|
||||
# Log file for full ChatCompletion objects
|
||||
LOG_FILE = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"reasoning_test_results.log"
|
||||
)
|
||||
LOG_FILE = os.path.join(os.path.dirname(__file__), "reasoning_test_results.log")
|
||||
|
||||
|
||||
def log_to_file(message: str):
|
||||
|
|
@ -85,6 +83,7 @@ def log_to_file(message: str):
|
|||
# UNIT TESTS FOR ReasoningConfig
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def test_reasoning_config_default():
|
||||
"""Test default ReasoningConfig is not active."""
|
||||
config = ReasoningConfig()
|
||||
|
|
@ -101,11 +100,11 @@ def test_reasoning_config_enabled_only():
|
|||
config = ReasoningConfig(enabled=True)
|
||||
assert config.enabled
|
||||
assert config.is_active()
|
||||
|
||||
|
||||
# Test for non-OpenAI provider
|
||||
extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
|
||||
assert extra_body == {"reasoning": {"enabled": True}}
|
||||
|
||||
|
||||
# Test for OpenAI provider
|
||||
extra_body = config.build_extra_body("https://api.openai.com/v1")
|
||||
assert extra_body == {"reasoning_effort": "medium"}
|
||||
|
|
@ -118,11 +117,11 @@ def test_reasoning_config_with_effort():
|
|||
assert config.enabled # Should be auto-enabled
|
||||
assert config.effort == "high"
|
||||
assert config.is_active()
|
||||
|
||||
|
||||
# Test for non-OpenAI provider
|
||||
extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
|
||||
assert extra_body == {"reasoning": {"enabled": True, "effort": "high"}}
|
||||
|
||||
|
||||
# Test for OpenAI provider
|
||||
extra_body = config.build_extra_body("https://api.openai.com/v1")
|
||||
assert extra_body == {"reasoning_effort": "high"}
|
||||
|
|
@ -135,11 +134,11 @@ def test_reasoning_config_with_max_tokens():
|
|||
assert config.enabled # Should be auto-enabled
|
||||
assert config.max_tokens == 4096
|
||||
assert config.is_active()
|
||||
|
||||
|
||||
# Test for non-OpenAI provider
|
||||
extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
|
||||
assert extra_body == {"reasoning": {"enabled": True, "max_tokens": 4096}}
|
||||
|
||||
|
||||
# Test for OpenAI provider (max_tokens not supported, falls back to medium)
|
||||
extra_body = config.build_extra_body("https://api.openai.com/v1")
|
||||
assert extra_body == {"reasoning_effort": "medium"}
|
||||
|
|
@ -152,7 +151,7 @@ def test_reasoning_config_full():
|
|||
assert config.enabled
|
||||
assert config.effort == "xhigh"
|
||||
assert config.max_tokens == 8192
|
||||
|
||||
|
||||
# Test for non-OpenAI provider
|
||||
# Note: OpenRouter only allows ONE of effort or max_tokens
|
||||
# When both are set, effort takes priority
|
||||
|
|
@ -164,7 +163,7 @@ def test_reasoning_config_full():
|
|||
# max_tokens is NOT included when effort is specified (OpenRouter limitation)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Test for OpenAI provider (xhigh maps to high)
|
||||
extra_body = config.build_extra_body("https://api.openai.com/v1")
|
||||
assert extra_body == {"reasoning_effort": "high"}
|
||||
|
|
@ -181,12 +180,13 @@ def test_reasoning_config_effort_mapping():
|
|||
"high": "high",
|
||||
"xhigh": "high",
|
||||
}
|
||||
|
||||
|
||||
for our_effort, expected_openai in mappings.items():
|
||||
config = ReasoningConfig(effort=our_effort)
|
||||
extra_body = config.build_extra_body("https://api.openai.com/v1")
|
||||
assert extra_body["reasoning_effort"] == expected_openai, \
|
||||
f"Expected {our_effort} to map to {expected_openai}, got {extra_body}"
|
||||
assert (
|
||||
extra_body["reasoning_effort"] == expected_openai
|
||||
), f"Expected {our_effort} to map to {expected_openai}, got {extra_body}"
|
||||
print("✓ Effort level mapping for OpenAI works correctly")
|
||||
|
||||
|
||||
|
|
@ -208,7 +208,7 @@ def test_reasoning_config_invalid_max_tokens():
|
|||
assert False, "Should have raised ValueError for too low"
|
||||
except ValueError as e:
|
||||
assert "must be between 1024 and 32000" in str(e)
|
||||
|
||||
|
||||
# Too high
|
||||
try:
|
||||
config = ReasoningConfig(max_tokens=50000)
|
||||
|
|
@ -223,7 +223,7 @@ def test_hermes_prompts_defined():
|
|||
assert HERMES_REASONING_PROMPT is not None
|
||||
assert "<think>" in HERMES_REASONING_PROMPT
|
||||
assert "</think>" in HERMES_REASONING_PROMPT
|
||||
|
||||
|
||||
assert HERMES_REASONING_PROMPT_WITH_ANSWER is not None
|
||||
assert "<answer>" in HERMES_REASONING_PROMPT_WITH_ANSWER
|
||||
print("✓ Hermes prompts are properly defined")
|
||||
|
|
@ -233,10 +233,11 @@ def test_hermes_prompts_defined():
|
|||
# SERVER MANAGER INTEGRATION TESTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def test_reasoning_config_from_env_config():
|
||||
"""Test ReasoningConfig.from_env_config() creates correct config."""
|
||||
from atroposlib.envs.base import BaseEnvConfig
|
||||
|
||||
|
||||
# Test with thinking_mode only
|
||||
env_config = BaseEnvConfig(
|
||||
tokenizer_name="gpt2",
|
||||
|
|
@ -249,7 +250,7 @@ def test_reasoning_config_from_env_config():
|
|||
assert reasoning_config.effort is None
|
||||
assert reasoning_config.max_tokens is None
|
||||
print("✓ ReasoningConfig.from_env_config with thinking_mode=True works")
|
||||
|
||||
|
||||
# Test with reasoning_effort
|
||||
env_config = BaseEnvConfig(
|
||||
tokenizer_name="gpt2",
|
||||
|
|
@ -261,7 +262,7 @@ def test_reasoning_config_from_env_config():
|
|||
assert reasoning_config.enabled == True # Auto-enabled because effort is set
|
||||
assert reasoning_config.effort == "high"
|
||||
print("✓ ReasoningConfig.from_env_config with reasoning_effort works")
|
||||
|
||||
|
||||
# Test with max_reasoning_tokens
|
||||
env_config = BaseEnvConfig(
|
||||
tokenizer_name="gpt2",
|
||||
|
|
@ -273,7 +274,7 @@ def test_reasoning_config_from_env_config():
|
|||
assert reasoning_config.enabled == True # Auto-enabled because max_tokens is set
|
||||
assert reasoning_config.max_tokens == 8000
|
||||
print("✓ ReasoningConfig.from_env_config with max_reasoning_tokens works")
|
||||
|
||||
|
||||
# Test with all disabled (default)
|
||||
env_config = BaseEnvConfig(
|
||||
tokenizer_name="gpt2",
|
||||
|
|
@ -289,9 +290,10 @@ def test_reasoning_config_from_env_config():
|
|||
def test_server_manager_builds_extra_body():
|
||||
"""Test ServerManager._build_extra_body() injects correct extra_body."""
|
||||
from unittest.mock import MagicMock
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
|
||||
from atroposlib.envs.server_handling.server_baseline import APIServerConfig
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
|
||||
# Create a mock server with OpenRouter base_url
|
||||
openrouter_config = APIServerConfig(
|
||||
model_name="nousresearch/hermes-4-70b",
|
||||
|
|
@ -299,27 +301,27 @@ def test_server_manager_builds_extra_body():
|
|||
api_key="test-key",
|
||||
num_requests_for_eval=10,
|
||||
)
|
||||
|
||||
|
||||
# Create ServerManager with reasoning config
|
||||
reasoning_config = ReasoningConfig(enabled=True, effort="high")
|
||||
|
||||
|
||||
# We can't easily instantiate ServerManager without actual servers,
|
||||
# so let's test the _build_extra_body logic directly
|
||||
|
||||
|
||||
# Test OpenRouter format
|
||||
extra_body = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
|
||||
assert "reasoning" in extra_body
|
||||
assert extra_body["reasoning"]["enabled"] == True
|
||||
assert extra_body["reasoning"]["effort"] == "high"
|
||||
print("✓ ServerManager builds correct extra_body for OpenRouter")
|
||||
|
||||
|
||||
# Test OpenAI format
|
||||
extra_body = reasoning_config.build_extra_body("https://api.openai.com/v1")
|
||||
assert "reasoning_effort" in extra_body
|
||||
assert extra_body["reasoning_effort"] == "high"
|
||||
assert "reasoning" not in extra_body # Should NOT have nested reasoning
|
||||
print("✓ ServerManager builds correct extra_body for OpenAI")
|
||||
|
||||
|
||||
# Test Claude (anthropic) - should use max_tokens
|
||||
claude_reasoning = ReasoningConfig(enabled=True, max_tokens=8000)
|
||||
extra_body = claude_reasoning.build_extra_body("https://openrouter.ai/api/v1")
|
||||
|
|
@ -332,17 +334,17 @@ def test_server_manager_builds_extra_body():
|
|||
async def test_server_manager_injects_extra_body():
|
||||
"""
|
||||
Integration test: Verify ServerManager actually injects extra_body in API calls.
|
||||
|
||||
|
||||
This test creates a real ServerManager and makes an actual API call to verify
|
||||
the full flow works.
|
||||
"""
|
||||
if not OPENROUTER_API_KEY:
|
||||
print("⚠ Skipping ServerManager integration test - OPENROUTER_API_KEY not set")
|
||||
return True
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
|
||||
from atroposlib.envs.server_handling.server_baseline import APIServerConfig
|
||||
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
|
||||
# Create server config for OpenRouter
|
||||
server_config = APIServerConfig(
|
||||
model_name="nousresearch/hermes-4-70b",
|
||||
|
|
@ -350,14 +352,14 @@ async def test_server_manager_injects_extra_body():
|
|||
api_key=OPENROUTER_API_KEY,
|
||||
num_requests_for_eval=10,
|
||||
)
|
||||
|
||||
|
||||
# Create reasoning config
|
||||
reasoning_config = ReasoningConfig(enabled=True, effort="high")
|
||||
|
||||
print("\n" + "="*60)
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing ServerManager.chat_completion() with reasoning injection")
|
||||
print("="*60)
|
||||
|
||||
print("=" * 60)
|
||||
|
||||
try:
|
||||
# Create ServerManager with reasoning config (NOT in testing mode - we want real API call)
|
||||
server_manager = ServerManager(
|
||||
|
|
@ -365,38 +367,49 @@ async def test_server_manager_injects_extra_body():
|
|||
reasoning_config=reasoning_config,
|
||||
testing=False, # Actually make the API call
|
||||
)
|
||||
|
||||
|
||||
# Make a chat completion call
|
||||
messages = [
|
||||
{"role": "system", "content": HERMES_REASONING_PROMPT},
|
||||
{"role": "user", "content": "What is 2 + 2? Think carefully."}
|
||||
{"role": "user", "content": "What is 2 + 2? Think carefully."},
|
||||
]
|
||||
|
||||
print(f"Making API call with reasoning config: enabled={reasoning_config.enabled}, effort={reasoning_config.effort}")
|
||||
|
||||
|
||||
print(
|
||||
f"Making API call with reasoning config: enabled={reasoning_config.enabled}, effort={reasoning_config.effort}"
|
||||
)
|
||||
|
||||
completion = await server_manager.chat_completion(
|
||||
messages=messages,
|
||||
max_tokens=512,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
|
||||
# Verify response has reasoning
|
||||
reasoning, source, content = extract_reasoning_from_completion(completion)
|
||||
|
||||
|
||||
print(f"Response received!")
|
||||
print(f"Content: {content[:100]}..." if content and len(content) > 100 else f"Content: {content}")
|
||||
print(
|
||||
f"Content: {content[:100]}..."
|
||||
if content and len(content) > 100
|
||||
else f"Content: {content}"
|
||||
)
|
||||
print(f"Reasoning source: {source}")
|
||||
print(f"Reasoning length: {len(reasoning) if reasoning else 0} chars")
|
||||
|
||||
|
||||
if reasoning:
|
||||
print("✓ ServerManager.chat_completion() correctly injected reasoning extra_body")
|
||||
print(
|
||||
"✓ ServerManager.chat_completion() correctly injected reasoning extra_body"
|
||||
)
|
||||
return True
|
||||
else:
|
||||
print("⚠ Response received but no reasoning found (model may not support it)")
|
||||
print(
|
||||
"⚠ Response received but no reasoning found (model may not support it)"
|
||||
)
|
||||
return True # Still a pass - the injection worked, model just didn't return reasoning
|
||||
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
print(f"✗ ServerManager test failed: {e}")
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
|
@ -405,18 +418,18 @@ async def test_server_manager_injects_extra_body():
|
|||
def test_full_env_config_to_server_flow():
|
||||
"""
|
||||
Test the complete flow from BaseEnvConfig to ServerManager reasoning injection.
|
||||
|
||||
|
||||
This verifies that:
|
||||
1. BaseEnvConfig with reasoning fields creates properly
|
||||
2. ReasoningConfig.from_env_config() works
|
||||
3. The resulting config would inject correct extra_body
|
||||
"""
|
||||
from atroposlib.envs.base import BaseEnvConfig
|
||||
|
||||
print("\n" + "="*60)
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing full BaseEnvConfig → ServerManager flow")
|
||||
print("="*60)
|
||||
|
||||
print("=" * 60)
|
||||
|
||||
# Create a config like a user would
|
||||
env_config = BaseEnvConfig(
|
||||
tokenizer_name="gpt2",
|
||||
|
|
@ -426,20 +439,20 @@ def test_full_env_config_to_server_flow():
|
|||
reasoning_effort="high",
|
||||
max_reasoning_tokens=8000,
|
||||
)
|
||||
|
||||
|
||||
print(f"Created BaseEnvConfig:")
|
||||
print(f" thinking_mode: {env_config.thinking_mode}")
|
||||
print(f" reasoning_effort: {env_config.reasoning_effort}")
|
||||
print(f" max_reasoning_tokens: {env_config.max_reasoning_tokens}")
|
||||
|
||||
|
||||
# Convert to ReasoningConfig (this happens in BaseEnv.__init__)
|
||||
reasoning_config = ReasoningConfig.from_env_config(env_config)
|
||||
|
||||
|
||||
print(f"\nReasoningConfig created:")
|
||||
print(f" enabled: {reasoning_config.enabled}")
|
||||
print(f" effort: {reasoning_config.effort}")
|
||||
print(f" max_tokens: {reasoning_config.max_tokens}")
|
||||
|
||||
|
||||
# Verify the config would generate correct extra_body
|
||||
# For OpenRouter
|
||||
openrouter_extra = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
|
||||
|
|
@ -447,12 +460,12 @@ def test_full_env_config_to_server_flow():
|
|||
assert openrouter_extra["reasoning"]["enabled"] == True
|
||||
assert openrouter_extra["reasoning"]["effort"] == "high"
|
||||
# Note: max_tokens is NOT included when effort is set (OpenRouter limitation)
|
||||
|
||||
|
||||
# For OpenAI
|
||||
openai_extra = reasoning_config.build_extra_body("https://api.openai.com/v1")
|
||||
print(f"\nOpenAI extra_body: {json.dumps(openai_extra, indent=2)}")
|
||||
assert openai_extra["reasoning_effort"] == "high"
|
||||
|
||||
|
||||
print("\n✓ Full BaseEnvConfig → ServerManager flow works correctly!")
|
||||
return True
|
||||
|
||||
|
|
@ -461,54 +474,49 @@ def test_full_env_config_to_server_flow():
|
|||
# INTEGRATION TESTS WITH REAL API CALLS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
||||
"""
|
||||
Test reasoning with an OpenRouter model.
|
||||
|
||||
|
||||
Args:
|
||||
model: Model name to test
|
||||
effort: Reasoning effort level
|
||||
|
||||
|
||||
Returns:
|
||||
Dict with test results
|
||||
"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing OpenRouter: {model}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
client = openai.AsyncOpenAI(
|
||||
api_key=OPENROUTER_API_KEY,
|
||||
base_url=OPENROUTER_BASE_URL,
|
||||
)
|
||||
|
||||
|
||||
# Build extra_body based on model type
|
||||
# Claude models need max_tokens in reasoning dict, not effort
|
||||
# See: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
|
||||
is_claude = "claude" in model.lower() or "anthropic" in model.lower()
|
||||
|
||||
|
||||
if is_claude:
|
||||
# Claude needs reasoning.max_tokens, and overall max_tokens must be higher
|
||||
reasoning_max_tokens = 8000
|
||||
overall_max_tokens = 0 # Must be > reasoning_max_tokens
|
||||
extra_body = {
|
||||
"reasoning": {
|
||||
"max_tokens": reasoning_max_tokens
|
||||
}
|
||||
}
|
||||
extra_body = {"reasoning": {"max_tokens": reasoning_max_tokens}}
|
||||
else:
|
||||
# Other models use effort
|
||||
config = ReasoningConfig(enabled=True, effort=effort)
|
||||
extra_body = config.build_extra_body(OPENROUTER_BASE_URL)
|
||||
overall_max_tokens = 0
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": TEST_PROMPT}
|
||||
]
|
||||
|
||||
|
||||
messages = [{"role": "user", "content": TEST_PROMPT}]
|
||||
|
||||
# For Hermes, also add the system prompt
|
||||
if "hermes" in model.lower():
|
||||
messages.insert(0, {"role": "system", "content": HERMES_REASONING_PROMPT})
|
||||
|
||||
|
||||
# Build the full request for logging
|
||||
request_params = {
|
||||
"model": model,
|
||||
|
|
@ -517,12 +525,12 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
|||
"temperature": 0.7,
|
||||
"extra_body": extra_body,
|
||||
}
|
||||
|
||||
|
||||
print(f"Request params:")
|
||||
print(f" model: {model}")
|
||||
print(f" max_tokens: {overall_max_tokens}")
|
||||
print(f" extra_body: {json.dumps(extra_body, indent=2)}")
|
||||
|
||||
|
||||
# Log the full request to file
|
||||
log_to_file(f"\n{'='*70}")
|
||||
log_to_file(f"MODEL: {model}")
|
||||
|
|
@ -530,7 +538,7 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
|||
log_to_file(f"{'='*70}")
|
||||
log_to_file(f"\nREQUEST SENT:")
|
||||
log_to_file(json.dumps(request_params, indent=2, default=str))
|
||||
|
||||
|
||||
try:
|
||||
completion = await client.chat.completions.create(
|
||||
model=model,
|
||||
|
|
@ -539,45 +547,44 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
|||
temperature=0.7,
|
||||
extra_body=extra_body,
|
||||
)
|
||||
|
||||
|
||||
# Log full ChatCompletion object to file for inspection
|
||||
log_to_file(f"\nRESPONSE RECEIVED:")
|
||||
log_to_file(f"\nFULL CHATCOMPLETION OBJECT:")
|
||||
log_to_file(str(completion))
|
||||
log_to_file(f"\n{'='*40}")
|
||||
|
||||
|
||||
# Also log the choice and message separately for clarity
|
||||
choice = completion.choices[0]
|
||||
log_to_file(f"\nChoice object: {choice}")
|
||||
log_to_file(f"\nMessage object: {choice.message}")
|
||||
|
||||
|
||||
# Log all attributes on the message
|
||||
log_to_file(f"\nMessage attributes: {dir(choice.message)}")
|
||||
|
||||
|
||||
# Try to get model_dump if available (pydantic)
|
||||
if hasattr(completion, "model_dump"):
|
||||
log_to_file(f"\nCompletion model_dump():")
|
||||
log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
|
||||
|
||||
|
||||
if hasattr(choice, "model_dump"):
|
||||
log_to_file(f"\nChoice model_dump():")
|
||||
log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
|
||||
|
||||
|
||||
if hasattr(choice.message, "model_dump"):
|
||||
log_to_file(f"\nMessage model_dump():")
|
||||
log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
|
||||
|
||||
|
||||
# Get the response content
|
||||
content = completion.choices[0].message.content
|
||||
log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
|
||||
log_to_file(content if content else "(empty)")
|
||||
|
||||
|
||||
# Try to extract reasoning
|
||||
reasoning, source = extract_reasoning_from_response(
|
||||
completion.choices[0],
|
||||
content=content
|
||||
completion.choices[0], content=content
|
||||
)
|
||||
|
||||
|
||||
log_to_file(f"\nReasoning extraction result:")
|
||||
log_to_file(f" Source: {source}")
|
||||
if reasoning:
|
||||
|
|
@ -586,32 +593,41 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
|||
log_to_file(reasoning)
|
||||
else:
|
||||
log_to_file(" No separate reasoning found")
|
||||
|
||||
|
||||
# Check for <think> blocks in content
|
||||
has_think_block = "<think>" in content.lower() if content else False
|
||||
log_to_file(f" Has <think> block in content: {has_think_block}")
|
||||
|
||||
|
||||
# Check for reasoning_details in raw response
|
||||
has_reasoning_details = hasattr(completion.choices[0], "reasoning_details")
|
||||
has_reasoning_content = hasattr(completion.choices[0].message, "reasoning_content")
|
||||
has_reasoning_content = hasattr(
|
||||
completion.choices[0].message, "reasoning_content"
|
||||
)
|
||||
log_to_file(f" Has reasoning_details attr: {has_reasoning_details}")
|
||||
log_to_file(f" Has reasoning_content attr: {has_reasoning_content}")
|
||||
|
||||
|
||||
# Try to access reasoning fields directly if they exist
|
||||
if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
|
||||
log_to_file(f" message.reasoning_content: {choice.message.reasoning_content}")
|
||||
if (
|
||||
hasattr(choice.message, "reasoning_content")
|
||||
and choice.message.reasoning_content
|
||||
):
|
||||
log_to_file(
|
||||
f" message.reasoning_content: {choice.message.reasoning_content}"
|
||||
)
|
||||
if hasattr(choice.message, "reasoning") and choice.message.reasoning:
|
||||
log_to_file(f" message.reasoning: {choice.message.reasoning}")
|
||||
if hasattr(choice, "reasoning_details") and choice.reasoning_details:
|
||||
log_to_file(f" choice.reasoning_details: {choice.reasoning_details}")
|
||||
|
||||
|
||||
log_to_file(f"\n{'='*70}\n")
|
||||
|
||||
|
||||
# Also print summary to console
|
||||
print(f"\nResponse content ({len(content) if content else 0} chars)")
|
||||
print(f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars")
|
||||
print(
|
||||
f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
|
||||
)
|
||||
print(f"(Full details logged to {LOG_FILE})")
|
||||
|
||||
|
||||
return {
|
||||
"model": model,
|
||||
"success": True,
|
||||
|
|
@ -620,7 +636,7 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
|||
"reasoning_length": len(reasoning) if reasoning else 0,
|
||||
"has_think_block": has_think_block,
|
||||
}
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
return {
|
||||
|
|
@ -633,30 +649,28 @@ async def test_openrouter_reasoning(model: str, effort: str = "high"):
|
|||
async def test_openai_reasoning(effort: str = "medium"):
|
||||
"""
|
||||
Test reasoning with OpenAI official API.
|
||||
|
||||
|
||||
Args:
|
||||
effort: Reasoning effort level
|
||||
|
||||
|
||||
Returns:
|
||||
Dict with test results
|
||||
"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing OpenAI: {OPENAI_MODEL}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
client = openai.AsyncOpenAI(
|
||||
api_key=OPENAI_API_KEY,
|
||||
base_url=OPENAI_BASE_URL,
|
||||
)
|
||||
|
||||
|
||||
# Build extra_body using our ReasoningConfig
|
||||
config = ReasoningConfig(enabled=True, effort=effort)
|
||||
extra_body = config.build_extra_body(OPENAI_BASE_URL)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": TEST_PROMPT}
|
||||
]
|
||||
|
||||
|
||||
messages = [{"role": "user", "content": TEST_PROMPT}]
|
||||
|
||||
# Build the full request for logging
|
||||
request_params = {
|
||||
"model": OPENAI_MODEL,
|
||||
|
|
@ -664,12 +678,12 @@ async def test_openai_reasoning(effort: str = "medium"):
|
|||
"max_completion_tokens": 1024,
|
||||
"extra_body": extra_body,
|
||||
}
|
||||
|
||||
|
||||
print(f"Request params:")
|
||||
print(f" model: {OPENAI_MODEL}")
|
||||
print(f" max_completion_tokens: 1024")
|
||||
print(f" extra_body: {json.dumps(extra_body, indent=2)}")
|
||||
|
||||
|
||||
# Log the full request to file
|
||||
log_to_file(f"\n{'='*70}")
|
||||
log_to_file(f"MODEL: {OPENAI_MODEL} (OpenAI)")
|
||||
|
|
@ -677,7 +691,7 @@ async def test_openai_reasoning(effort: str = "medium"):
|
|||
log_to_file(f"{'='*70}")
|
||||
log_to_file(f"\nREQUEST SENT:")
|
||||
log_to_file(json.dumps(request_params, indent=2, default=str))
|
||||
|
||||
|
||||
try:
|
||||
completion = await client.chat.completions.create(
|
||||
model=OPENAI_MODEL,
|
||||
|
|
@ -686,45 +700,44 @@ async def test_openai_reasoning(effort: str = "medium"):
|
|||
# Note: OpenAI reasoning models only support temperature=1 (default)
|
||||
extra_body=extra_body,
|
||||
)
|
||||
|
||||
|
||||
# Log full ChatCompletion object to file for inspection
|
||||
log_to_file(f"\nRESPONSE RECEIVED:")
|
||||
log_to_file(f"\nFULL CHATCOMPLETION OBJECT:")
|
||||
log_to_file(str(completion))
|
||||
log_to_file(f"\n{'='*40}")
|
||||
|
||||
|
||||
# Also log the choice and message separately for clarity
|
||||
choice = completion.choices[0]
|
||||
log_to_file(f"\nChoice object: {choice}")
|
||||
log_to_file(f"\nMessage object: {choice.message}")
|
||||
|
||||
|
||||
# Log all attributes on the message
|
||||
log_to_file(f"\nMessage attributes: {dir(choice.message)}")
|
||||
|
||||
|
||||
# Try to get model_dump if available (pydantic)
|
||||
if hasattr(completion, "model_dump"):
|
||||
log_to_file(f"\nCompletion model_dump():")
|
||||
log_to_file(json.dumps(completion.model_dump(), indent=2, default=str))
|
||||
|
||||
|
||||
if hasattr(choice, "model_dump"):
|
||||
log_to_file(f"\nChoice model_dump():")
|
||||
log_to_file(json.dumps(choice.model_dump(), indent=2, default=str))
|
||||
|
||||
|
||||
if hasattr(choice.message, "model_dump"):
|
||||
log_to_file(f"\nMessage model_dump():")
|
||||
log_to_file(json.dumps(choice.message.model_dump(), indent=2, default=str))
|
||||
|
||||
|
||||
# Get the response content
|
||||
content = completion.choices[0].message.content
|
||||
log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
|
||||
log_to_file(content if content else "(empty)")
|
||||
|
||||
|
||||
# Try to extract reasoning
|
||||
reasoning, source = extract_reasoning_from_response(
|
||||
completion.choices[0],
|
||||
content=content
|
||||
completion.choices[0], content=content
|
||||
)
|
||||
|
||||
|
||||
log_to_file(f"\nReasoning extraction result:")
|
||||
log_to_file(f" Source: {source}")
|
||||
if reasoning:
|
||||
|
|
@ -733,26 +746,33 @@ async def test_openai_reasoning(effort: str = "medium"):
|
|||
log_to_file(reasoning)
|
||||
else:
|
||||
log_to_file(" No separate reasoning found")
|
||||
|
||||
|
||||
# Check for <think> blocks in content (unlikely for OpenAI)
|
||||
has_think_block = "<think>" in content.lower() if content else False
|
||||
log_to_file(f" Has <think> block in content: {has_think_block}")
|
||||
|
||||
|
||||
# Try to access reasoning fields directly if they exist
|
||||
if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
|
||||
log_to_file(f" message.reasoning_content: {choice.message.reasoning_content}")
|
||||
if (
|
||||
hasattr(choice.message, "reasoning_content")
|
||||
and choice.message.reasoning_content
|
||||
):
|
||||
log_to_file(
|
||||
f" message.reasoning_content: {choice.message.reasoning_content}"
|
||||
)
|
||||
if hasattr(choice.message, "reasoning") and choice.message.reasoning:
|
||||
log_to_file(f" message.reasoning: {choice.message.reasoning}")
|
||||
if hasattr(choice, "reasoning_details") and choice.reasoning_details:
|
||||
log_to_file(f" choice.reasoning_details: {choice.reasoning_details}")
|
||||
|
||||
|
||||
log_to_file(f"\n{'='*70}\n")
|
||||
|
||||
|
||||
# Also print summary to console
|
||||
print(f"\nResponse content ({len(content) if content else 0} chars)")
|
||||
print(f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars")
|
||||
print(
|
||||
f"Reasoning source: {source}, length: {len(reasoning) if reasoning else 0} chars"
|
||||
)
|
||||
print(f"(Full details logged to {LOG_FILE})")
|
||||
|
||||
|
||||
return {
|
||||
"model": OPENAI_MODEL,
|
||||
"success": True,
|
||||
|
|
@ -761,7 +781,7 @@ async def test_openai_reasoning(effort: str = "medium"):
|
|||
"reasoning_length": len(reasoning) if reasoning else 0,
|
||||
"has_think_block": has_think_block,
|
||||
}
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
return {
|
||||
|
|
@ -773,60 +793,62 @@ async def test_openai_reasoning(effort: str = "medium"):
|
|||
|
||||
async def run_all_integration_tests():
|
||||
"""Run all integration tests and summarize results."""
|
||||
print("\n" + "="*70)
|
||||
print("\n" + "=" * 70)
|
||||
print("REASONING MODEL INTEGRATION TESTS")
|
||||
print("="*70)
|
||||
|
||||
print("=" * 70)
|
||||
|
||||
# Check that API keys are set
|
||||
missing_keys = []
|
||||
if not OPENAI_API_KEY:
|
||||
missing_keys.append("OPENAI_API_KEY")
|
||||
if not OPENROUTER_API_KEY:
|
||||
missing_keys.append("OPENROUTER_API_KEY")
|
||||
|
||||
|
||||
if missing_keys:
|
||||
print(f"\n⚠ Missing required environment variables: {', '.join(missing_keys)}")
|
||||
print("Set them before running integration tests:")
|
||||
print(" export OPENAI_API_KEY='your-key-here'")
|
||||
print(" export OPENROUTER_API_KEY='your-key-here'")
|
||||
return False
|
||||
|
||||
|
||||
# Initialize log file
|
||||
with open(LOG_FILE, "w") as f:
|
||||
f.write(f"REASONING MODEL TEST RESULTS\n")
|
||||
f.write(f"Generated: {datetime.now().isoformat()}\n")
|
||||
f.write(f"{'='*70}\n\n")
|
||||
|
||||
|
||||
print(f"\nLogging full ChatCompletion objects to: {LOG_FILE}\n")
|
||||
|
||||
|
||||
results = []
|
||||
|
||||
|
||||
# Test OpenRouter models
|
||||
for model in OPENROUTER_MODELS:
|
||||
result = await test_openrouter_reasoning(model)
|
||||
results.append(result)
|
||||
|
||||
|
||||
# Test OpenAI
|
||||
result = await test_openai_reasoning()
|
||||
results.append(result)
|
||||
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*70)
|
||||
print("\n" + "=" * 70)
|
||||
print("TEST SUMMARY")
|
||||
print("="*70)
|
||||
|
||||
print("=" * 70)
|
||||
|
||||
for result in results:
|
||||
status = "✓ PASS" if result.get("success") else "✗ FAIL"
|
||||
model = result.get("model", "unknown")
|
||||
|
||||
|
||||
if result.get("success"):
|
||||
reasoning_info = f"reasoning: {result.get('reasoning_source', 'none')}"
|
||||
if result.get("reasoning_length", 0) > 0:
|
||||
reasoning_info += f" ({result['reasoning_length']} chars)"
|
||||
print(f"{status} | {model:40} | {reasoning_info}")
|
||||
else:
|
||||
print(f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}")
|
||||
|
||||
print(
|
||||
f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}"
|
||||
)
|
||||
|
||||
# Check if any failed
|
||||
failures = [r for r in results if not r.get("success")]
|
||||
if failures:
|
||||
|
|
@ -841,12 +863,13 @@ async def run_all_integration_tests():
|
|||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def run_unit_tests():
|
||||
"""Run all unit tests (no API calls)."""
|
||||
print("\n" + "="*70)
|
||||
print("\n" + "=" * 70)
|
||||
print("UNIT TESTS")
|
||||
print("="*70 + "\n")
|
||||
|
||||
print("=" * 70 + "\n")
|
||||
|
||||
# ReasoningConfig unit tests
|
||||
test_reasoning_config_default()
|
||||
test_reasoning_config_enabled_only()
|
||||
|
|
@ -857,54 +880,52 @@ def run_unit_tests():
|
|||
test_reasoning_config_invalid_effort()
|
||||
test_reasoning_config_invalid_max_tokens()
|
||||
test_hermes_prompts_defined()
|
||||
|
||||
|
||||
# ServerManager integration tests (no API calls)
|
||||
test_reasoning_config_from_env_config()
|
||||
test_server_manager_builds_extra_body()
|
||||
test_full_env_config_to_server_flow()
|
||||
|
||||
print("\n" + "="*70)
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("All unit tests passed!")
|
||||
print("="*70)
|
||||
print("=" * 70)
|
||||
|
||||
|
||||
async def run_server_manager_integration_test():
|
||||
"""Run ServerManager integration test with real API call."""
|
||||
print("\n" + "="*70)
|
||||
print("\n" + "=" * 70)
|
||||
print("SERVER MANAGER INTEGRATION TEST")
|
||||
print("="*70)
|
||||
|
||||
print("=" * 70)
|
||||
|
||||
result = await test_server_manager_injects_extra_body()
|
||||
|
||||
|
||||
if result:
|
||||
print("\n✓ ServerManager integration test passed!")
|
||||
else:
|
||||
print("\n✗ ServerManager integration test failed!")
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description="Test reasoning model support")
|
||||
parser.add_argument(
|
||||
"--unit-only",
|
||||
action="store_true",
|
||||
help="Only run unit tests (no API calls)"
|
||||
"--unit-only", action="store_true", help="Only run unit tests (no API calls)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--integration-only",
|
||||
action="store_true",
|
||||
help="Only run integration tests (API calls to all providers)"
|
||||
action="store_true",
|
||||
help="Only run integration tests (API calls to all providers)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--server-manager-only",
|
||||
action="store_true",
|
||||
help="Only run ServerManager integration test (single API call)"
|
||||
help="Only run ServerManager integration test (single API call)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
if args.integration_only:
|
||||
asyncio.run(run_all_integration_tests())
|
||||
elif args.unit_only:
|
||||
|
|
@ -917,4 +938,3 @@ if __name__ == "__main__":
|
|||
run_unit_tests()
|
||||
asyncio.run(run_server_manager_integration_test())
|
||||
asyncio.run(run_all_integration_tests())
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue