""" Test file for reasoning model support across multiple providers. This test validates: 1. ReasoningConfig builds correct extra_body for different providers 2. Reasoning can be enabled and responses contain reasoning content 3. Reasoning extraction works for various API response formats Providers tested: - OpenAI (gpt-5.2) - Uses reasoning_effort at top level - OpenRouter (anthropic/claude-opus-4.5, nousresearch/hermes-4-70B, deepseek/deepseek-v3.2) - Uses nested reasoning object with enabled/effort/max_tokens Usage: python -m pytest atroposlib/tests/test_reasoning_models.py -v Or run directly: python atroposlib/tests/test_reasoning_models.py Note: This test requires valid API keys. Set them as environment variables or modify the constants below for testing. """ import asyncio import json import os import sys from datetime import datetime import pytest # Add the project root to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) import openai # noqa: E402 from atroposlib.envs.server_handling.server_baseline import ( # noqa: E402 ReasoningConfig, ) from environments.eval_environments.eval_helpers import ( # noqa: E402 HERMES_REASONING_PROMPT, HERMES_REASONING_PROMPT_WITH_ANSWER, extract_reasoning_from_completion, extract_reasoning_from_response, ) # ============================================================================= # API CONFIGURATION # ============================================================================= # These are test credentials. For production, use environment variables. 
# API keys must be set via environment variables
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.environ.get(
    "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
)

# Models to test on OpenRouter
OPENROUTER_MODELS = [
    "anthropic/claude-opus-4.5",
    "nousresearch/hermes-4-70b",
    "deepseek/deepseek-v3.2",
]

# Test prompt that should trigger reasoning
TEST_PROMPT = "What is 15 * 23? Think step by step before giving your answer."

# Log file for full ChatCompletion objects
LOG_FILE = os.path.join(os.path.dirname(__file__), "reasoning_test_results.log")


def log_to_file(message: str) -> None:
    """Append a single message line to the shared log file."""
    with open(LOG_FILE, "a") as f:
        f.write(message + "\n")


# =============================================================================
# UNIT TESTS FOR ReasoningConfig
# =============================================================================


def test_reasoning_config_default():
    """Test default ReasoningConfig is not active."""
    config = ReasoningConfig()
    assert not config.enabled
    assert config.effort is None
    assert config.max_tokens is None
    assert not config.is_reasoning_kwargs_active()
    assert config.build_extra_body() is None
    print("✓ Default ReasoningConfig is inactive")


def test_reasoning_config_enabled_only():
    """Test ReasoningConfig with only enabled=True."""
    config = ReasoningConfig(enabled=True)
    assert config.enabled
    assert config.is_reasoning_kwargs_active()
    # Test for non-OpenAI provider
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {"reasoning": {"enabled": True}}
    # Test for OpenAI provider
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "medium"}
    print("✓ ReasoningConfig with enabled=True works correctly")


def test_reasoning_config_with_effort():
    """Test ReasoningConfig with effort specified."""
    config = ReasoningConfig(effort="high")
    assert config.enabled  # Should be auto-enabled
    assert config.effort == "high"
    assert config.is_reasoning_kwargs_active()
    # Test for non-OpenAI provider
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {"reasoning": {"enabled": True, "effort": "high"}}
    # Test for OpenAI provider
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "high"}
    print("✓ ReasoningConfig with effort works correctly")


def test_reasoning_config_with_max_tokens():
    """Test ReasoningConfig with max_tokens specified."""
    config = ReasoningConfig(max_tokens=4096)
    assert config.enabled  # Should be auto-enabled
    assert config.max_tokens == 4096
    assert config.is_reasoning_kwargs_active()
    # Test for non-OpenAI provider
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {"reasoning": {"enabled": True, "max_tokens": 4096}}
    # Test for OpenAI provider (max_tokens not supported, falls back to medium)
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "medium"}
    print("✓ ReasoningConfig with max_tokens works correctly")


def test_reasoning_config_full():
    """Test ReasoningConfig with all options."""
    config = ReasoningConfig(enabled=True, effort="xhigh", max_tokens=8192)
    assert config.enabled
    assert config.effort == "xhigh"
    assert config.max_tokens == 8192
    # Test for non-OpenAI provider
    # Note: OpenRouter only allows ONE of effort or max_tokens
    # When both are set, effort takes priority
    extra_body = config.build_extra_body("https://openrouter.ai/api/v1")
    assert extra_body == {
        "reasoning": {
            "enabled": True,
            "effort": "xhigh",
            # max_tokens is NOT included when effort is specified (OpenRouter limitation)
        }
    }
    # Test for OpenAI provider (effort passed through directly)
    extra_body = config.build_extra_body("https://api.openai.com/v1")
    assert extra_body == {"reasoning_effort": "xhigh"}
    print("✓ ReasoningConfig with full options works correctly")


def test_reasoning_config_effort_mapping():
    """Test that effort levels are passed through directly for OpenAI."""
    # All effort levels are now passed through 1:1
    effort_levels = ["none", "minimal", "low", "medium", "high", "xhigh"]
    for effort in effort_levels:
        config = ReasoningConfig(effort=effort)
        extra_body = config.build_extra_body("https://api.openai.com/v1")
        assert (
            extra_body["reasoning_effort"] == effort
        ), f"Expected {effort} to pass through, got {extra_body}"
    print("✓ Effort levels pass through correctly for OpenAI")


def test_reasoning_config_invalid_effort():
    """Test that invalid effort raises ValueError."""
    # pytest.raises is the idiomatic form of the try/assert-False pattern;
    # match= does a regex search on str(exception), equivalent to the old
    # `"Invalid reasoning_effort" in str(e)` check.
    with pytest.raises(ValueError, match="Invalid reasoning_effort"):
        ReasoningConfig(effort="invalid")
    print("✓ Invalid effort raises ValueError")


def test_reasoning_config_max_tokens_no_validation():
    """Test that max_tokens accepts any value (no range validation).

    Provider limits vary and may change over time:
    - OpenRouter currently caps Anthropic at 1024-32000
    - Native Anthropic API supports up to 128k extended thinking
    We don't enforce limits here to allow flexibility.
    """
    # Low values should work
    config_low = ReasoningConfig(max_tokens=500)
    assert config_low.max_tokens == 500
    assert config_low.enabled  # Auto-enabled
    # High values should work (e.g., for native Anthropic 128k thinking)
    config_high = ReasoningConfig(max_tokens=128000)
    assert config_high.max_tokens == 128000
    assert config_high.enabled
    print("✓ max_tokens accepts any value (no range validation)")


def test_hermes_prompts_defined():
    """Test that Hermes prompts are properly defined."""
    assert HERMES_REASONING_PROMPT is not None
    # BUGFIX: these membership checks previously tested for "" (vacuously
    # true); the intended markers are the Hermes <think>/</think> tags.
    assert "<think>" in HERMES_REASONING_PROMPT
    assert "</think>" in HERMES_REASONING_PROMPT
    assert HERMES_REASONING_PROMPT_WITH_ANSWER is not None
    assert "<think>" in HERMES_REASONING_PROMPT_WITH_ANSWER
    print("✓ Hermes prompts are properly defined")


# =============================================================================
# SERVER MANAGER INTEGRATION TESTS
# =============================================================================


def test_reasoning_config_from_env_config():
    """Test ReasoningConfig.from_env_config() creates correct config."""
    from atroposlib.envs.base import BaseEnvConfig

    # Test with thinking_mode only
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
        thinking_mode=True,
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is True
    assert reasoning_config.effort is None
    assert reasoning_config.max_tokens is None
    print("✓ ReasoningConfig.from_env_config with thinking_mode=True works")

    # Test with reasoning_effort
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
        reasoning_effort="high",
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is True  # Auto-enabled because effort is set
    assert reasoning_config.effort == "high"
    print("✓ ReasoningConfig.from_env_config with reasoning_effort works")

    # Test with max_reasoning_tokens
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
        max_reasoning_tokens=8000,
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is True  # Auto-enabled because max_tokens is set
    assert reasoning_config.max_tokens == 8000
    print("✓ ReasoningConfig.from_env_config with max_reasoning_tokens works")

    # Test with all disabled (default)
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test",
        run_name="test",
    )
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    assert reasoning_config.enabled is False
    assert not reasoning_config.is_reasoning_kwargs_active()
    print("✓ ReasoningConfig.from_env_config with defaults (disabled) works")


def test_server_manager_builds_extra_body():
    """Test ReasoningConfig.build_extra_body() generates correct extra_body."""
    # Create reasoning config
    reasoning_config = ReasoningConfig(enabled=True, effort="high")
    # We can't easily instantiate ServerManager without actual servers,
    # so let's test the build_extra_body logic directly

    # Test OpenRouter format
    extra_body = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
    assert "reasoning" in extra_body
    assert extra_body["reasoning"]["enabled"] is True
    assert extra_body["reasoning"]["effort"] == "high"
    print("✓ ServerManager builds correct extra_body for OpenRouter")

    # Test OpenAI format
    extra_body = reasoning_config.build_extra_body("https://api.openai.com/v1")
    assert "reasoning_effort" in extra_body
    assert extra_body["reasoning_effort"] == "high"
    assert "reasoning" not in extra_body  # Should NOT have nested reasoning
    print("✓ ServerManager builds correct extra_body for OpenAI")

    # Test Claude (anthropic) - should use max_tokens
    claude_reasoning = ReasoningConfig(enabled=True, max_tokens=8000)
    extra_body = claude_reasoning.build_extra_body("https://openrouter.ai/api/v1")
    assert "reasoning" in extra_body
    assert extra_body["reasoning"]["enabled"] is True
    assert extra_body["reasoning"]["max_tokens"] == 8000
    print("✓ ServerManager builds correct extra_body for Claude (max_tokens)")


async def test_server_manager_injects_extra_body():
    """
    Integration test: Verify ServerManager actually injects extra_body in API calls.

    This test creates a real ServerManager and makes an actual API call
    to verify the full flow works.

    NOTE(review): this is an async test; under plain pytest it needs an
    asyncio plugin (e.g. pytest-asyncio) to actually run — confirm the
    project's pytest configuration provides one.
    """
    if not OPENROUTER_API_KEY:
        pytest.skip("OPENROUTER_API_KEY not set - skipping integration test")
    from atroposlib.envs.server_handling.server_baseline import APIServerConfig
    from atroposlib.envs.server_handling.server_manager import ServerManager

    # Create server config for OpenRouter
    server_config = APIServerConfig(
        model_name="nousresearch/hermes-4-70b",
        base_url=OPENROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY,
        num_requests_for_eval=10,
    )
    # Create reasoning config
    reasoning_config = ReasoningConfig(enabled=True, effort="high")

    print("\n" + "=" * 60)
    print("Testing ServerManager.chat_completion() with reasoning injection")
    print("=" * 60)

    # Create ServerManager with reasoning config (NOT in testing mode - we want real API call)
    server_manager = ServerManager(
        configs=[server_config],
        reasoning_config=reasoning_config,
        testing=False,  # Actually make the API call
    )
    # Make a chat completion call
    messages = [
        {"role": "system", "content": HERMES_REASONING_PROMPT},
        {"role": "user", "content": "What is 2 + 2? Think carefully."},
    ]
    print(
        f"Making API call: enabled={reasoning_config.enabled}, "
        f"effort={reasoning_config.effort}"
    )
    completion = await server_manager.chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )
    # Verify response has reasoning
    reasoning, source, content = extract_reasoning_from_completion(completion)
    print("Response received!")
    print(
        f"Content: {content[:100]}..."
        if content and len(content) > 100
        else f"Content: {content}"
    )
    print(f"Reasoning source: {source}")
    print(f"Reasoning length: {len(reasoning) if reasoning else 0} chars")
    # Assert we got a response (reasoning is optional - model may not support it)
    assert content is not None, "Expected response content"
    print("✓ ServerManager.chat_completion() correctly injected reasoning extra_body")


def test_full_env_config_to_server_flow():
    """
    Test the complete flow from BaseEnvConfig to ServerManager reasoning injection.

    This verifies that:
    1. BaseEnvConfig with reasoning fields creates properly
    2. ReasoningConfig.from_env_config() works
    3. The resulting config would inject correct extra_body
    """
    from atroposlib.envs.base import BaseEnvConfig

    print("\n" + "=" * 60)
    print("Testing full BaseEnvConfig → ServerManager flow")
    print("=" * 60)

    # Create a config like a user would
    env_config = BaseEnvConfig(
        tokenizer_name="gpt2",
        group_name="test-reasoning",
        run_name="test-run",
        thinking_mode=True,
        reasoning_effort="high",
        max_reasoning_tokens=8000,
    )
    print("Created BaseEnvConfig:")
    print(f"  thinking_mode: {env_config.thinking_mode}")
    print(f"  reasoning_effort: {env_config.reasoning_effort}")
    print(f"  max_reasoning_tokens: {env_config.max_reasoning_tokens}")

    # Convert to ReasoningConfig (this happens in BaseEnv.__init__)
    reasoning_config = ReasoningConfig.from_env_config(env_config)
    print("\nReasoningConfig created:")
    print(f"  enabled: {reasoning_config.enabled}")
    print(f"  effort: {reasoning_config.effort}")
    print(f"  max_tokens: {reasoning_config.max_tokens}")

    # Verify the config would generate correct extra_body
    # For OpenRouter
    openrouter_extra = reasoning_config.build_extra_body("https://openrouter.ai/api/v1")
    print(f"\nOpenRouter extra_body: {json.dumps(openrouter_extra, indent=2)}")
    assert openrouter_extra["reasoning"]["enabled"] is True
    assert openrouter_extra["reasoning"]["effort"] == "high"
    # Note: max_tokens is NOT included when effort is set (OpenRouter limitation)

    # For OpenAI
    openai_extra = reasoning_config.build_extra_body("https://api.openai.com/v1")
    print(f"\nOpenAI extra_body: {json.dumps(openai_extra, indent=2)}")
    assert openai_extra["reasoning_effort"] == "high"
    print("\n✓ Full BaseEnvConfig → ServerManager flow works correctly!")


# =============================================================================
# INTEGRATION TESTS WITH REAL API CALLS
# =============================================================================


def _log_request(label: str, request_params: dict) -> None:
    """Log an outgoing request (header + JSON params) to LOG_FILE."""
    log_to_file(f"\n{'='*70}")
    log_to_file(f"MODEL: {label}")
    log_to_file(f"TIMESTAMP: {datetime.now().isoformat()}")
    log_to_file(f"{'='*70}")
    log_to_file("\nREQUEST SENT:")
    log_to_file(json.dumps(request_params, indent=2, default=str))


def _log_and_extract_completion(completion):
    """Log a ChatCompletion exhaustively and extract reasoning from it.

    Shared by the OpenRouter and OpenAI helpers (they previously duplicated
    this ~60-line logging block verbatim).

    Returns:
        Tuple of (content, reasoning, source, has_think_block).
    """
    # Log full ChatCompletion object to file for inspection
    log_to_file("\nRESPONSE RECEIVED:")
    log_to_file("\nFULL CHATCOMPLETION OBJECT:")
    log_to_file(str(completion))
    log_to_file(f"\n{'='*40}")

    # Also log the choice and message separately for clarity
    choice = completion.choices[0]
    log_to_file(f"\nChoice object: {choice}")
    log_to_file(f"\nMessage object: {choice.message}")
    # Log all attributes on the message
    log_to_file(f"\nMessage attributes: {dir(choice.message)}")

    # Try to get model_dump if available (pydantic)
    for label, obj in (
        ("Completion", completion),
        ("Choice", choice),
        ("Message", choice.message),
    ):
        if hasattr(obj, "model_dump"):
            log_to_file(f"\n{label} model_dump():")
            log_to_file(json.dumps(obj.model_dump(), indent=2, default=str))

    # Get the response content
    content = choice.message.content
    log_to_file(f"\nResponse content ({len(content) if content else 0} chars):")
    log_to_file(content if content else "(empty)")

    # Try to extract reasoning
    reasoning, source = extract_reasoning_from_response(choice, content=content)
    log_to_file("\nReasoning extraction result:")
    log_to_file(f"  Source: {source}")
    if reasoning:
        log_to_file(f"  Length: {len(reasoning)} chars")
        log_to_file("  Full reasoning content:")
        log_to_file(reasoning)
    else:
        log_to_file("  No separate reasoning found")

    # Check for <think> blocks in content.
    # BUGFIX: this previously tested `"" in content.lower()`, which is always
    # True for non-empty content; the intended marker is the <think> tag.
    has_think_block = "<think>" in content.lower() if content else False
    log_to_file(f"  Has <think> block in content: {has_think_block}")

    # Check for provider-specific reasoning fields on the raw response
    log_to_file(f"  Has reasoning_details attr: {hasattr(choice, 'reasoning_details')}")
    log_to_file(
        f"  Has reasoning_content attr: {hasattr(choice.message, 'reasoning_content')}"
    )
    # Try to access reasoning fields directly if they exist
    if getattr(choice.message, "reasoning_content", None):
        log_to_file(f"  message.reasoning_content: {choice.message.reasoning_content}")
    if getattr(choice.message, "reasoning", None):
        log_to_file(f"  message.reasoning: {choice.message.reasoning}")
    if getattr(choice, "reasoning_details", None):
        log_to_file(f"  choice.reasoning_details: {choice.reasoning_details}")
    log_to_file(f"\n{'='*70}\n")
    return content, reasoning, source, has_think_block


def _print_summary(content, reasoning, source) -> None:
    """Print a short console summary of an API response."""
    print(f"\nResponse content ({len(content) if content else 0} chars)")
    print(
        f"Reasoning source: {source}, "
        f"length: {len(reasoning) if reasoning else 0} chars"
    )
    print(f"(Full details logged to {LOG_FILE})")


async def _run_openrouter_reasoning_test(model: str, effort: str = "high"):
    """
    Run reasoning test with an OpenRouter model (helper function).

    Note: This is a helper function, not a pytest test.
    It's called by run_all_integration_tests() when running the script directly.

    Args:
        model: Model name to test
        effort: Reasoning effort level

    Returns:
        Dict with test results
    """
    print(f"\n{'='*60}")
    print(f"Testing OpenRouter: {model}")
    print(f"{'='*60}")
    client = openai.AsyncOpenAI(
        api_key=OPENROUTER_API_KEY,
        base_url=OPENROUTER_BASE_URL,
    )

    # Build extra_body based on model type
    # Claude models need max_tokens in reasoning dict, not effort
    # See: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens
    is_claude = "claude" in model.lower() or "anthropic" in model.lower()
    if is_claude:
        # Claude needs reasoning.max_tokens, and overall max_tokens must be higher
        reasoning_max_tokens = 8000
        # BUGFIX: this was 0, contradicting the constraint above and requesting
        # a zero-token completion; budget reasoning + answer instead.
        overall_max_tokens = 16000  # Must be > reasoning_max_tokens
        extra_body = {"reasoning": {"max_tokens": reasoning_max_tokens}}
    else:
        # Other models use effort
        config = ReasoningConfig(enabled=True, effort=effort)
        extra_body = config.build_extra_body(OPENROUTER_BASE_URL)
        # BUGFIX: was 0 (zero-token completion); match the OpenAI test budget.
        overall_max_tokens = 1024

    messages = [{"role": "user", "content": TEST_PROMPT}]
    # For Hermes, also add the system prompt
    if "hermes" in model.lower():
        messages.insert(0, {"role": "system", "content": HERMES_REASONING_PROMPT})

    # Build the full request for logging
    request_params = {
        "model": model,
        "messages": messages,
        "max_tokens": overall_max_tokens,
        "temperature": 0.7,
        "extra_body": extra_body,
    }
    print("Request params:")
    print(f"  model: {model}")
    print(f"  max_tokens: {overall_max_tokens}")
    print(f"  extra_body: {json.dumps(extra_body, indent=2)}")

    # Log the full request to file
    _log_request(model, request_params)

    try:
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=overall_max_tokens,
            temperature=0.7,
            extra_body=extra_body,
        )
        content, reasoning, source, has_think_block = _log_and_extract_completion(
            completion
        )
        # Also print summary to console
        _print_summary(content, reasoning, source)
        return {
            "model": model,
            "success": True,
            "content_length": len(content) if content else 0,
            "reasoning_source": source,
            "reasoning_length": len(reasoning) if reasoning else 0,
            "has_think_block": has_think_block,
        }
    except Exception as e:
        print(f"Error: {e}")
        return {
            "model": model,
            "success": False,
            "error": str(e),
        }


async def _run_openai_reasoning_test(effort: str = "medium"):
    """
    Run reasoning test with OpenAI official API (helper function).

    Note: This is a helper function, not a pytest test.
    It's called by run_all_integration_tests() when running the script directly.

    Args:
        effort: Reasoning effort level

    Returns:
        Dict with test results
    """
    print(f"\n{'='*60}")
    print(f"Testing OpenAI: {OPENAI_MODEL}")
    print(f"{'='*60}")
    client = openai.AsyncOpenAI(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL,
    )

    # Build extra_body using our ReasoningConfig
    config = ReasoningConfig(enabled=True, effort=effort)
    extra_body = config.build_extra_body(OPENAI_BASE_URL)
    messages = [{"role": "user", "content": TEST_PROMPT}]

    # Build the full request for logging
    request_params = {
        "model": OPENAI_MODEL,
        "messages": messages,
        "max_completion_tokens": 1024,
        "extra_body": extra_body,
    }
    print("Request params:")
    print(f"  model: {OPENAI_MODEL}")
    print("  max_completion_tokens: 1024")
    print(f"  extra_body: {json.dumps(extra_body, indent=2)}")

    # Log the full request to file
    _log_request(f"{OPENAI_MODEL} (OpenAI)", request_params)

    try:
        completion = await client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=messages,
            max_completion_tokens=1024,  # OpenAI reasoning models require this instead of max_tokens
            # Note: OpenAI reasoning models only support temperature=1 (default)
            extra_body=extra_body,
        )
        content, reasoning, source, has_think_block = _log_and_extract_completion(
            completion
        )
        # Also print summary to console
        _print_summary(content, reasoning, source)
        return {
            "model": OPENAI_MODEL,
            "success": True,
            "content_length": len(content) if content else 0,
            "reasoning_source": source,
            "reasoning_length": len(reasoning) if reasoning else 0,
            "has_think_block": has_think_block,
        }
    except Exception as e:
        print(f"Error: {e}")
        return {
            "model": OPENAI_MODEL,
            "success": False,
            "error": str(e),
        }


async def run_all_integration_tests():
    """Run all integration tests and summarize results.

    Returns:
        True if every provider test succeeded, False otherwise.
    """
    print("\n" + "=" * 70)
    print("REASONING MODEL INTEGRATION TESTS")
    print("=" * 70)

    # Check that API keys are set
    missing_keys = []
    if not OPENAI_API_KEY:
        missing_keys.append("OPENAI_API_KEY")
    if not OPENROUTER_API_KEY:
        missing_keys.append("OPENROUTER_API_KEY")
    if missing_keys:
        print(f"\n⚠ Missing required environment variables: {', '.join(missing_keys)}")
        print("Set them before running integration tests:")
        print("  export OPENAI_API_KEY='your-key-here'")
        print("  export OPENROUTER_API_KEY='your-key-here'")
        return False

    # Initialize log file (truncate; log_to_file appends afterwards)
    with open(LOG_FILE, "w") as f:
        f.write("REASONING MODEL TEST RESULTS\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"{'='*70}\n\n")
    print(f"\nLogging full ChatCompletion objects to: {LOG_FILE}\n")

    results = []
    # Test OpenRouter models
    for model in OPENROUTER_MODELS:
        result = await _run_openrouter_reasoning_test(model)
        results.append(result)
    # Test OpenAI
    result = await _run_openai_reasoning_test()
    results.append(result)

    # Summary
    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)
    for result in results:
        status = "✓ PASS" if result.get("success") else "✗ FAIL"
        model = result.get("model", "unknown")
        if result.get("success"):
            reasoning_info = f"reasoning: {result.get('reasoning_source', 'none')}"
            if result.get("reasoning_length", 0) > 0:
                reasoning_info += f" ({result['reasoning_length']} chars)"
            print(f"{status} | {model:40} | {reasoning_info}")
        else:
            print(
                f"{status} | {model:40} | error: {result.get('error', 'unknown')[:30]}"
            )

    # Check if any failed
    failures = [r for r in results if not r.get("success")]
    if failures:
        print(f"\n{len(failures)} test(s) failed")
        return False
    print(f"\nAll {len(results)} tests passed!")
    return True


# =============================================================================
# MAIN
# =============================================================================


def run_unit_tests():
    """Run all unit tests (no API calls)."""
    print("\n" + "=" * 70)
    print("UNIT TESTS")
    print("=" * 70 + "\n")
    # ReasoningConfig unit tests
    test_reasoning_config_default()
    test_reasoning_config_enabled_only()
    test_reasoning_config_with_effort()
    test_reasoning_config_with_max_tokens()
    test_reasoning_config_full()
    test_reasoning_config_effort_mapping()
    test_reasoning_config_invalid_effort()
    test_reasoning_config_max_tokens_no_validation()
    test_hermes_prompts_defined()
    # ServerManager integration tests (no API calls)
    test_reasoning_config_from_env_config()
    test_server_manager_builds_extra_body()
    test_full_env_config_to_server_flow()
    print("\n" + "=" * 70)
    print("All unit tests passed!")
    print("=" * 70)


async def run_server_manager_integration_test():
    """Run ServerManager integration test with real API call.

    Returns:
        True on success, False on failure or missing credentials.
    """
    print("\n" + "=" * 70)
    print("SERVER MANAGER INTEGRATION TEST")
    print("=" * 70)
    # BUGFIX: test_server_manager_injects_extra_body() asserts and returns
    # None, so checking its return value always reported failure. Treat
    # "no exception raised" as success instead, and pre-check the API key
    # so pytest.skip() is never triggered in script mode.
    if not OPENROUTER_API_KEY:
        print("\n⚠ OPENROUTER_API_KEY not set - skipping ServerManager test")
        return False
    try:
        await test_server_manager_injects_extra_body()
    except Exception as e:
        print(f"\n✗ ServerManager integration test failed! ({e})")
        return False
    print("\n✓ ServerManager integration test passed!")
    return True


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Test reasoning model support")
    parser.add_argument(
        "--unit-only", action="store_true", help="Only run unit tests (no API calls)"
    )
    parser.add_argument(
        "--integration-only",
        action="store_true",
        help="Only run integration tests (API calls to all providers)",
    )
    parser.add_argument(
        "--server-manager-only",
        action="store_true",
        help="Only run ServerManager integration test (single API call)",
    )
    args = parser.parse_args()
    if args.integration_only:
        asyncio.run(run_all_integration_tests())
    elif args.unit_only:
        run_unit_tests()
    elif args.server_manager_only:
        run_unit_tests()  # Run unit tests first
        asyncio.run(run_server_manager_integration_test())
    else:
        # Run all tests
        run_unit_tests()
        asyncio.run(run_server_manager_integration_test())
        asyncio.run(run_all_integration_tests())