Add support for reasoning models and their variety of providers/endpoints

This commit is contained in:
teknium 2025-12-30 00:23:00 +00:00
parent 1c306d3b17
commit 62fa51240c
6 changed files with 1551 additions and 16 deletions

View file

@@ -3,7 +3,8 @@ import collections
import time
from abc import ABC, abstractmethod
from asyncio import exceptions
from typing import Literal, Optional
from dataclasses import dataclass, field
from typing import Any, Dict, Literal, Optional
import numpy as np
from openai.types.chat.chat_completion import ChatCompletion
@@ -11,6 +12,128 @@ from openai.types.completion import Completion
from pydantic import BaseModel, Field
from tenacity import retry, stop_after_attempt, wait_random_exponential
# Valid reasoning effort levels
VALID_REASONING_EFFORTS = {"none", "minimal", "low", "medium", "high", "xhigh"}


@dataclass
class ReasoningConfig:
    """Configuration for reasoning/thinking model support.

    ServerManager consults this config to inject the provider-appropriate
    ``extra_body`` parameters into API requests: official OpenAI endpoints
    use a different wire format than OpenRouter-style providers.

    Attributes:
        enabled: Whether reasoning mode is on. Automatically flipped to
            True whenever ``effort`` or ``max_tokens`` is supplied.
        effort: Requested effort level; one of "none", "minimal", "low",
            "medium", "high", "xhigh", or None when unspecified.
        max_tokens: Token budget for reasoning (1024-32000), or None.
    """

    enabled: bool = False
    effort: Optional[str] = None
    max_tokens: Optional[int] = None

    def __post_init__(self):
        """Validate supplied settings and auto-enable when either is set."""
        bad_effort = (
            self.effort is not None
            and self.effort not in VALID_REASONING_EFFORTS
        )
        if bad_effort:
            raise ValueError(
                f"Invalid reasoning_effort: {self.effort}. "
                f"Must be one of: {VALID_REASONING_EFFORTS}"
            )
        if self.max_tokens is not None and not (1024 <= self.max_tokens <= 32000):
            raise ValueError(
                f"max_reasoning_tokens must be between 1024 and 32000, "
                f"got {self.max_tokens}"
            )
        # Supplying either knob implies the caller wants reasoning on.
        if not (self.effort is None and self.max_tokens is None):
            self.enabled = True

    def is_active(self) -> bool:
        """Return True when reasoning mode is enabled."""
        return self.enabled

    def build_extra_body(self, base_url: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """Assemble the provider-specific ``extra_body`` payload.

        Args:
            base_url: API base URL; the presence of ``api.openai.com``
                selects the official-OpenAI wire format.

        Returns:
            A dict to merge into ``extra_body``, or None when reasoning
            is not active.

        Note:
            OpenRouter accepts only ONE of effort / max_tokens; effort
            wins when both are present.
        """
        if not self.is_active():
            return None

        if base_url and "api.openai.com" in base_url:
            # The official endpoint takes a flat ``reasoning_effort`` field
            # (no nested reasoning object, no reasoning token budget) and
            # supports fewer levels than we do — collapse ours onto theirs.
            alias = {
                "none": "low",      # no "none" upstream; closest is low
                "minimal": "low",   # no "minimal" upstream; closest is low
                "low": "low",
                "medium": "medium",
                "high": "high",
                "xhigh": "high",    # no "xhigh" upstream; closest is high
            }
            chosen = self.effort if self.effort else "medium"
            return {"reasoning_effort": alias.get(chosen, "medium")}

        # Everyone else (OpenRouter, Nebius, Nous Portal, ...) takes a nested
        # ``reasoning`` object. Effort takes priority over max_tokens because
        # OpenRouter rejects requests carrying both.
        payload: Dict[str, Any] = {"enabled": True}
        if self.effort is not None:
            payload["effort"] = self.effort
        elif self.max_tokens is not None:
            payload["max_tokens"] = self.max_tokens
        return {"reasoning": payload}

    @classmethod
    def from_env_config(cls, env_config) -> "ReasoningConfig":
        """Build a ReasoningConfig from a BaseEnvConfig-like object.

        BaseEnv uses this to translate environment config fields into the
        reasoning configuration consumed by ServerManager.

        Args:
            env_config: Object that may carry ``thinking_mode``,
                ``reasoning_effort`` and ``max_reasoning_tokens`` attributes.

        Returns:
            A ReasoningConfig reflecting those settings.
        """
        requested_effort = getattr(env_config, "reasoning_effort", None)
        token_budget = getattr(env_config, "max_reasoning_tokens", None)
        explicitly_on = getattr(env_config, "thinking_mode", False)
        # Enabled when asked for explicitly, or implied by either knob.
        return cls(
            enabled=explicitly_on or requested_effort is not None or token_budget is not None,
            effort=requested_effort,
            max_tokens=token_budget,
        )
class AsyncSemWithAdaptiveWeight(asyncio.Semaphore):
def __init__(self, value: int):