Add support for reasoning models and their variety of providers/endpoints

2026-04-19 12:57:58 +00:00 · 2025-12-30 00:23:00 +00:00 · 2025-12-30 00:23:00 +00:00 · 62fa51240c
commit 62fa51240c
parent 1c306d3b17
6 changed files with 1551 additions and 16 deletions
--- a/environments/eval_environments/ifeval_eval.py
+++ b/environments/eval_environments/ifeval_eval.py
@@ -35,6 +35,7 @@ from datasets import load_dataset
 from eval_helpers import (
    create_system_content,
    get_default_thinking_prompt,
+    format_reasoning_debug_info,
 )
 from pydantic import Field
 from tqdm.asyncio import tqdm_asyncio
@@ -229,7 +230,11 @@ class IFEvalEnv(BaseEnv):
        print(f"  Evaluation split: {self.config.eval_split}")
        print(f"  Thinking mode: {self.config.thinking_mode}")
        if self.config.thinking_mode:
-            print(f"  Thinking prompt: {self._get_thinking_prompt()[:100]}...")
+            thinking_prompt = self._get_thinking_prompt()
+            if thinking_prompt:
+                print(f"  Thinking prompt: {thinking_prompt[:100]}...")
+            else:
+                print(f"  Thinking prompt: None (using API reasoning mode only)")

        # Load IFEval dataset
        try:
@@ -458,7 +463,24 @@ class IFEvalEnv(BaseEnv):
                    if self.config.eval_max_tokens > 0:
                        completion_kwargs["max_tokens"] = self.config.eval_max_tokens

+                    if self.config.full_debug:
+                        print(f"\n  [API Call] Sending request (attempt {attempt + 1})...")
+                        print(f"    Temperature: {completion_kwargs.get('temperature')}")
+                        print(f"    Max tokens: {completion_kwargs.get('max_tokens', 'not set (unlimited)')}")
+                        print(f"    Thinking mode: {self.config.thinking_mode}")
+                        print(f"    Reasoning effort: {self.config.reasoning_effort}")
+                        # Show extra_body that will be injected by ServerManager
+                        if self.config.thinking_mode or self.config.reasoning_effort:
+                            print(f"    (ServerManager will inject reasoning extra_body)")
+
+                    _api_start = time.time()
                    completion = await self.server.chat_completion(**completion_kwargs)
+                    _api_elapsed = time.time() - _api_start
+
+                    # Log reasoning token usage if full_debug is enabled
+                    if self.config.full_debug and completion:
+                        print(f"  [API Response] Received in {_api_elapsed:.2f}s")
+                        print(format_reasoning_debug_info(completion))

                    if completion.choices and completion.choices[0].message.content:
                        model_response = completion.choices[0].message.content