# atropos/environments/tool_use_interleaved_thinking.py
"""
InterleavedThinking SingleBlock Environment
============================================
This environment lets a model emit *multiple* <tool_call>/<tool_response> pairs
**inside one still-open <think> block**, then close </think> and write the
final answer all within a single assistant turn.
Unlike the first draft, this version is **standalone**: it does **NOT**
inherit from SingleToolCallingEnv. All required boilerplate from that
class is copied here so nothing breaks when you swap env names.
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import re
from typing import Dict, List, Optional, Tuple, Union
import aiohttp
import httpx
import wandb
from datasets import Dataset, load_dataset
from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
ScoredDataGroup,
)
from atroposlib.type_definitions import Message
from atroposlib.utils.io import parse_http_response
from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
logger = logging.getLogger(__name__)
# Set to True to always print debug information.
DEBUG = True
EXECUTION_FEEDBACK = True
TOOL_USAGE_BONUS = 0.2
# Hard caps for generation length
MAX_REPLY_TOKENS = 2048  # truncate any single assistant reply to ≤2048 tokens
MAX_GEN_PER_TURN = 512 # never request more than 512 new tokens from the model
# Maximum number of thinking/tool-use turns per rollout
MAX_ROLLOUT_TURNS = 3
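# How the caps interact: each generation turn requests at most MAX_GEN_PER_TURN
# new tokens, the accumulated assistant reply is truncated to MAX_REPLY_TOKENS
# before tokenization/scoring, and an execution-mode rollout runs for at most
# MAX_ROLLOUT_TURNS turns.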
system_prompt = (
"You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
"problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
"solution prior to answering. You should enclose your thoughts and internal monologue inside <think> "
"</think> tags, and then provide your solution or response to the problem."
)
TOOL_SYSTEM_PROMPT = (
"You are a function-calling & reasoning AI model. You are provided with "
"function signatures inside <reasoning_tools> … XML tags. After calling & "
"executing the functions, you will get results inside <tool_response> … "
"Here are the available tools:\n\n"
"<reasoning_tools>\n"
"[\n"
" {\n"
' "type": "function",\n'
' "function": {\n'
' "name": "calculator",\n'
' "description": '
' "Evaluate a numeric Python expression and return the result.",\n'
' "parameters": {\n'
' "type": "object",\n'
' "properties": {\n'
' "expr": {\n'
' "type": "string",\n'
' "description": '
    ' "A pure-Python arithmetic expression."\n'
" }\n"
" },\n"
' "required": ["expr"]\n'
" }\n"
" }\n"
" },\n"
" {\n"
' "type": "function",\n'
' "function": {\n'
' "name": "python_interpreter",\n'
' "description": '
' "Run a short Python snippet and return stdout plus the last '
'expression.",\n'
' "parameters": {\n'
' "type": "object",\n'
' "properties": {\n'
' "code": {\n'
' "type": "string",\n'
' "description": "Python source code to execute."\n'
" }\n"
" },\n"
' "required": ["code"]\n'
" }\n"
" }\n"
" }\n"
"]\n"
"</reasoning_tools>\n\n"
"You must use reasoning tools such as python_interpreter as a tool call when available "
"for hard problems such as math before providing your final answer.\n"
"Always provide your final numeric answer (or final result) in \\\\boxed{...} so it "
"can be automatically graded right after closing </think> tag.\n\n"
"For reasoning tools, return interleaved tool calls within <think> </think> tags.\n"
"<think>\n"
"<tool_call>{'name': <function-name>, 'arguments': <args-dict>}</tool_call>\n"
"<!-- system pauses runtime for execution -->\n"
"<tool_response>{'result': <result>}</tool_response>\n"
"<!-- assistant resumes within same think -->\n"
"</think>\n"
"<!-- plain text answer with \\\\boxed{...} -->\n"
)
SYSTEM_PROMPT = system_prompt + TOOL_SYSTEM_PROMPT
class InterleavedInlineEnv(BaseEnv):
"""
One episode = user prompt → single assistant message with inline tool
    calls inside a still-open <think> block.
"""
name = "interleaved_inline"
_re_last_call = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>\s*$", re.S)
def __init__(
self,
config: BaseEnvConfig,
server_configs: List[APIServerConfig],
slurm: bool = True,
testing: bool = False,
):
super().__init__(config, server_configs, slurm, testing)
self.percent_correct_buffer: List[float] = []
self.eval_metrics: List[Tuple[str, float]] = []
self.rollouts_for_wandb = []
self.iter = 0
import random
self.rng = random.Random()
        # Dynamic few-shot pool: list of (user_msg, assistant_msg) tuples
self.dynamic_pool: List[Tuple[Dict, Dict]] = []
        self.dynamic_pool_max = 0  # max harvested examples to keep (0 disables the pool)
self.max_token_len = 8192
async def get_server_info(self):
"""Override to prevent server from overwriting our max_token_len"""
async with aiohttp.ClientSession() as session:
async with session.get(f"{self.config.rollout_server_url}/info") as resp:
data = await parse_http_response(resp, logger)
if data["batch_size"] != -1:
self.config.batch_size = data["batch_size"]
# Log what the server tried to set max_token_len to
if data["max_token_len"] != -1:
logger.info(
f"Server tried to set max_token_len to {data['max_token_len']}\n"
f"keeping our value of {self.max_token_len}"
)
if self.config.batch_size == -1:
logging.warning("Batch size not set by config or server!")
if self.config.group_size > self.config.batch_size:
raise ValueError(
f"group_size ({self.config.group_size}) "
f"must be less than batch_size ({self.config.batch_size})"
)
@classmethod
def config_init(cls):
cfg = BaseEnvConfig(
tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
group_size=16,
use_wandb=True,
rollout_server_url="http://localhost:8000",
total_steps=2000,
batch_size=1024,
steps_per_eval=20,
max_token_length=16 * 8192,
inference_weight=1.0,
wandb_name="toolcall_interleaved",
eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
eval_limit_ratio=0.1,
max_gen_per_turn=MAX_GEN_PER_TURN,
)
servers = [
APIServerConfig(
model_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
base_url="http://localhost:9004/v1",
api_key="x",
num_max_requests_at_once=32,
num_requests_for_eval=256,
)
]
return cfg, servers
async def setup(self):
"""
Load a streamed subset of **nvidia/AceReason-Math**.
We keep only rows whose *answer* looks purely numeric so the
calculator / python_interpreter tools can verify them automatically.
        The env var SUBSET_ROWS (default 1000) controls how many rows we keep.
"""
N = int(os.getenv("SUBSET_ROWS", "1000"))
stream_ds = load_dataset( # ≈50k rows total → stream
# "NVIDIA/OpenMathReasoning",
# split="cot",
# "NVIDIA/OpenMathReasoning",
# split="cot",
# "open-r1/OpenR1-Math-220k",
"nvidia/AceReason-Math",
split="train",
streaming=True,
)
_numeric = re.compile(r"^[0-9+\-*/(). %\\\\sqrt{}]+$").fullmatch
subset = []
for ex in stream_ds:
if len(subset) >= N:
break
# some datasets use "answer", others "expected_answer"
ans_raw = ex.get("answer", ex.get("expected_answer"))
if ans_raw is None:
continue
ans = str(ans_raw).strip()
if _numeric(ans):
subset.append(
{
"problem": ex["problem"],
"expected_answer": ans,
}
)
full = Dataset.from_list(subset)
if DEBUG:
print(f"[DEBUG setup] kept {len(subset)} rows from Dataset")
split = full.train_test_split(test_size=0.02, seed=42)
self.train, self.test = split["train"], split["test"]
self.train = self.train.shuffle(seed=int.from_bytes(os.urandom(2), "big"))
async def _completion_until(
self, prompt: str, max_tokens: int, stop: Optional[Union[str, List[str]]] = None
) -> str:
comp = await self.server.completion(
prompt=prompt,
stop=stop,
max_tokens=max_tokens,
temperature=0.8,
)
return comp.choices[0].text
def _extract_last_call(self, chunk: str):
"""
Return the JSON dict for the *last* <tool_call> … </tool_call> block
in `chunk`, or **None** if no such block exists.
Also handles incomplete tool calls (missing </tool_call> tag).
"""
# First try to find complete tool calls
matches = self._re_last_call.findall(chunk)
if matches:
try:
return json.loads(matches[-1])
except Exception:
pass
# If no complete tool calls, look for incomplete ones (missing </tool_call>)
last_tool_call_pos = chunk.rfind("<tool_call>")
if last_tool_call_pos != -1:
json_start = last_tool_call_pos + len("<tool_call>")
json_text = chunk[json_start:].strip()
try:
return json.loads(json_text)
except json.JSONDecodeError:
# Try partial JSON extraction
brace_count = 0
json_end = 0
for i, char in enumerate(json_text):
if char == "{":
brace_count += 1
elif char == "}":
brace_count -= 1
if brace_count == 0:
json_end = i + 1
break
if json_end > 0:
try:
return json.loads(json_text[:json_end])
except json.JSONDecodeError:
pass
return None
def _is_new_tool_call(self, raw: str) -> bool:
"""
Return True if there's an unresponded <tool_call> in raw
(i.e., open call without matching </tool_response>).
"""
pos = raw.rfind("<tool_call>")
if pos == -1:
return False
return "</tool_response>" not in raw[pos:]
@staticmethod
def _canon_num(txt: str) -> str:
"""Return number string without commas / spaces; keep leading sign."""
return txt.strip().replace(",", "").replace(" ", "")
# boxed{answer} pattern for final numeric result
_re_box = re.compile(r"\\boxed\{([^}]*)\}")
def _boxed_after_think(self, text: str) -> Optional[str]:
"""
Return the first \\boxed{…} that appears *after* the closing </think>
tag. Returns None if </think> is missing or no boxed answer exists.
"""
think_pos = text.find("</think>")
if think_pos == -1:
return None
m = self._re_box.search(text, pos=think_pos)
return m.group(1).strip() if m else None
async def _exec_tool(self, call_json: Dict):
"""
        Execute reasoning-time tools.
        • python_interpreter → POST code to the local coding server running at
          localhost:5002/execute and return {"stdout": ..., "result": ...}.
        • calculator → eval(expr) in a math-only sandbox and return the number.
"""
name = call_json["name"]
args = call_json["arguments"]
if name == "python_interpreter":
try:
async with httpx.AsyncClient(timeout=10.0) as client:
payload = {"code": args["code"], "input": ""}
resp = await client.post(
"http://localhost:5002/execute", json=payload
)
data = resp.json()
except httpx.ConnectError:
print(
"❌ [CRITICAL] Python interpreter server not available at localhost:5002"
)
print("Please ensure the code_exec_server Docker container is running")
raise RuntimeError(
"Python interpreter server not available - cannot continue without verification"
)
if DEBUG:
print(f"[DEBUG _exec_tool] {name} result → {data}")
return {
"stdout": data.get("output", ""),
"result": data.get("output", "").strip(),
}
elif name == "calculator":
import math
expr = args["expr"]
val = eval(expr, {"__builtins__": {}}, {"math": math})
if DEBUG:
print(f"[DEBUG _exec_tool] {name} result → {val}")
return {"value": val}
else:
raise ValueError(f"Unknown tool name {name}")
async def _execute_turn_inference(
self,
turn_idx: int,
prompts: List[str],
ridx_map: List[int],
expected_calls_by_turn: List[List[str]],
) -> List[str]:
"""Execute inference for a turn using optimal batching strategy."""
print(f"\n\033[95m=== Expected Tool Calls for Turn {turn_idx+1} ===\033[0m")
print(f"\033[95m{expected_calls_by_turn[turn_idx]}\033[0m\n")
# Always use batched identical prompts for turn 0, heterogeneous for others
if turn_idx == 0:
choices = await self._batch_identical_prompts(
prompts[0], len(ridx_map), turn_idx
)
else:
choices = await self._batch_heterogeneous_prompts(prompts, turn_idx)
return choices
async def _batch_identical_prompts(
self, prompt: str, count: int, turn_idx: int
) -> List[str]:
"""Handle identical prompts efficiently using n parameter."""
print(
f" \033[93m→ TURN {turn_idx+1} prompt full:\033[0m "
f"\033[92m{prompt}\033[0m"
)
# Use the constant instead of config attribute
resp = await self.server.completion(
prompt=prompt,
n=count,
max_tokens=MAX_GEN_PER_TURN,
temperature=0.8,
stop="</tool_call>",
)
choices = [c.text for c in resp.choices]
# Debug: print each rollout
for i, raw in enumerate(choices):
print(
f" \033[93m· turn {turn_idx+1} rollout raw [{i}]:\033[0m \033[94m{raw}\033[0m"
)
if not raw.strip():
print(f" → (empty or error string returned for rollout {i})")
print(" → All turn 1 rollouts printed; moving on.\n" + "-" * 48)
return choices
async def _batch_heterogeneous_prompts(
self, prompts: List[str], turn_idx: int
) -> List[str]:
"""Handle heterogeneous prompts using parallel requests."""
if turn_idx == 1:
print("=== DEBUG: Now parallelizing Turn 2 prompts ===")
print(f" → Parallelizing {len(prompts)} prompts at turn {turn_idx+1}")
# Print each prompt
for idx_p, p_str in enumerate(prompts):
print(
f" \033[93m→ TURN-{turn_idx+1} prompt[{idx_p}] full:\033[0m \033[92m{p_str}\033[0m"
)
async def _call_single(prompt_str: str) -> str:
try:
# Use the constant instead of config attribute
comp = await self.server.completion(
prompt=prompt_str,
n=1,
max_tokens=MAX_GEN_PER_TURN,
temperature=0.8,
stop="</tool_call>",
)
return comp.choices[0].text
except Exception as e:
print(f" → Turn {turn_idx+1} _call_single exception: {e}")
return ""
tasks = [_call_single(p) for p in prompts]
results = await asyncio.gather(*tasks)
# Debug: print results for all turns
choices = []
for i, rtext in enumerate(results):
raw = rtext or ""
print(
f" \033[93m· rollout {i} (Turn {turn_idx+1}) full reply:\033[0m \033[94m{raw}\033[0m\n"
+ "-" * 48
)
if not raw:
print(f" → Rollout {i} returned empty or error string")
choices.append(raw)
return choices
def _json_objects_match(self, j1, j2):
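        """
        Recursive subset check: return True if every key in `j2` exists in `j1`
        with an equal (or, for nested dicts, recursively matching) value.
        """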
try:
for k in j2:
if k not in j1:
return False
if isinstance(j2[k], dict):
if not self._json_objects_match(j1[k], j2[k]):
return False
elif j1[k] != j2[k]:
return False
return True
except Exception:
return False
async def collect_trajectories(self, item) -> Tuple[ScoredDataGroup, List]:
"""
One prompt → `n = group_size` sampled assistant completions in
parallel (single OpenAI request with n completions). Mirrors the
logic in SingleToolCallingEnv.
"""
messages_tuple, expected_raw = item
expected = (
json.loads(expected_raw) if isinstance(expected_raw, str) else expected_raw
)
# Reinflate frozensets to normal dicts
prompt_msgs = [dict(r) for r in messages_tuple]
if EXECUTION_FEEDBACK:
# MODE: Real interleaved tool execution
return await self._collect_trajectories_with_execution(
prompt_msgs, expected
)
else:
# MODE: Static generation for data collection (current behavior)
return await self._collect_trajectories_static(prompt_msgs, expected)
async def _collect_trajectories_static(
self, prompt_msgs: List[Dict], expected
) -> Tuple[ScoredDataGroup, List]:
"""
Original static generation mode - no tool execution, just data collection.
"""
# Convert to text prompt
prompt_txt = self.tokenizer.apply_chat_template(
prompt_msgs, add_generation_prompt=True, tokenize=False
)
if DEBUG:
clean_prompt = prompt_txt.replace("<|eot_id|>", "")
print(
f"\n\033[93m▶ BATCH PROMPT (tokens {len(prompt_txt)}):\033[0m "
f"\033[92m{clean_prompt}\033[0m\n{'-'*60}"
)
# One API call → many completions
completions = await self.server.completion(
prompt=prompt_txt,
n=self.config.group_size,
max_tokens=MAX_GEN_PER_TURN,
temperature=0.8,
)
scored: ScoredDataGroup = {
"tokens": [],
"masks": [],
"scores": [],
"advantages": None,
"ref_logprobs": None,
"messages": None,
"group_overrides": {},
"overrides": None,
"images": None,
}
for idx, choice in enumerate(completions.choices):
raw = choice.text or ""
toks = self.tokenizer.encode(raw)
if len(toks) > MAX_REPLY_TOKENS:
toks = toks[:MAX_REPLY_TOKENS]
raw = self.tokenizer.decode(toks)
if DEBUG:
print(f"[DEBUG] truncated reply {idx} to {len(toks)} tokens")
assistant_msg = {"role": "assistant", "content": raw}
# Create the full context for tokenization - cast to Message type
full_ctx: List[Message] = prompt_msgs + [assistant_msg]
            # Outcome-based reward: compare boxed answer to expected expr
expr = (
expected["arguments"]["code"][6:-1]
if (
isinstance(expected, dict)
and "arguments" in expected
and "code" in expected["arguments"]
and expected["arguments"]["code"].startswith("print(")
and expected["arguments"]["code"].endswith(")")
)
else None
)
boxed = self._boxed_after_think(raw)
same = boxed == expr or (
boxed and expr and self._canon_num(boxed) == self._canon_num(expr)
)
reward = 1.0 if same else -1.0
if "</think>" not in raw:
                reward = -1.0  # invalid: did not close the think block
else:
# no tool_call tags are allowed *outside* the think block
end_pos = raw.lower().find("</think>")
if "<tool_call" in raw[end_pos + len("</think>") :].lower():
if DEBUG:
print(
"[DEBUG] tool_call found outside </think>; setting reward = -1"
)
reward = -1.0
if DEBUG:
print(
f"\033[95m--- COMPLETION {idx+1}/{self.config.group_size} ---\033[0m\n"
f"\033[94m{raw}\033[0m\nreward={reward}\n{'='*60}"
)
tok = tokenize_for_trainer(self.tokenizer, full_ctx)
scored["tokens"].append(tok["tokens"])
scored["masks"].append(tok["masks"])
scored["scores"].append(reward)
self.percent_correct_buffer.append(max(reward, 0))
        # --- harvest a success for dynamic few-shots --------------------
for idx, sc in enumerate(scored["scores"]):
reply_txt = completions.choices[idx].text
has_call = "<tool_call" in reply_txt.lower()
if sc >= 1.0 and has_call:
# Build (user, assistant) pair from this successful rollout
u = {"role": "user", "content": prompt_msgs[-1]["content"]}
a = {"role": "assistant", "content": reply_txt}
self.dynamic_pool.append((u, a))
if len(self.dynamic_pool) > self.dynamic_pool_max:
self.dynamic_pool.pop(0)
break
return scored, []
async def _collect_trajectories_with_execution(
self, prompt_msgs: List[Dict], expected
) -> Tuple[ScoredDataGroup, List]:
"""
Real interleaved tool execution mode - stops at tool calls, executes them, and continues.
Uses turn-based parallel execution for maximum efficiency.
"""
print(
f"\n🚀 [EXECUTION MODE] Running {self.config.group_size} rollouts with parallel turn-based execution"
)
scored: ScoredDataGroup = {
"tokens": [],
"masks": [],
"scores": [],
"advantages": None,
"ref_logprobs": None,
"messages": None,
"group_overrides": {},
"overrides": None,
"images": None,
}
# Initialize per-rollout state
num_rollouts = self.config.group_size
rollout_ctxs = [prompt_msgs.copy() for _ in range(num_rollouts)]
assistant_msgs = [
{"role": "assistant", "content": ""} for _ in range(num_rollouts)
]
done = [False] * num_rollouts
final_results = [None] * num_rollouts
executed_tools = [[] for _ in range(num_rollouts)]
# Track the most recent generation chunk for each rollout
last_turns: List[str] = [""] * num_rollouts
turn_idx = 0
max_turns = MAX_ROLLOUT_TURNS
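        # Turn protocol: each turn generates up to the next </tool_call> stop
        # token, executes the parsed call, splices a <tool_response> back into
        # the growing assistant message, and re-prompts until the model closes
        # </think> with a boxed answer or the turn budget is exhausted.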
while not all(done) and turn_idx < max_turns:
print(
f"\n[TURN {turn_idx + 1}] Processing {sum(1 for d in done if not d)} active rollouts"
)
# Build prompts for active rollouts only
active_prompts = []
active_indices = []
for i in range(num_rollouts):
if not done[i]:
prompt_txt = self.tokenizer.apply_chat_template(
rollout_ctxs[i],
add_generation_prompt=True,
tokenize=False,
)
prompt_txt += assistant_msgs[i]["content"]
active_prompts.append(prompt_txt)
active_indices.append(i)
if not active_prompts:
break
# Execute inference for this turn
if turn_idx == 0:
# First turn: all prompts are identical, use batched inference
print(
f"[TURN {turn_idx + 1}] Batching {len(active_prompts)} identical prompts"
)
replies = await self._batch_identical_prompts(
active_prompts[0], len(active_prompts), turn_idx
)
else:
# Subsequent turns: prompts may be heterogeneous, use parallel inference
print(
f"⚡ [TURN {turn_idx + 1}] Parallelizing {len(active_prompts)} heterogeneous prompts"
)
replies = await self._batch_heterogeneous_prompts(
active_prompts, turn_idx
)
# Process each active rollout's reply
for prompt_idx, rollout_idx in enumerate(active_indices):
if done[rollout_idx]:
continue
reply = replies[prompt_idx]
# Save this turn's delta for summary
last_turns[rollout_idx] = reply
assistant_msgs[rollout_idx]["content"] += reply
raw = assistant_msgs[rollout_idx]["content"]
if "</think>" in raw:
# Think block closed
boxed = self._boxed_after_think(raw)
if boxed:
# Boxed answer found after </think>
print(
f"🎯 [ROLLOUT {rollout_idx}] Found boxed answer after </think> - marking complete"
)
done[rollout_idx] = True
rollout_ctxs[rollout_idx].append(assistant_msgs[rollout_idx])
final_results[rollout_idx] = raw
continue
else:
# Think block closed but no boxed answer
print(
f"❌ [ROLLOUT {rollout_idx}] </think> closed but no boxed answer - marking failed"
)
done[rollout_idx] = True
final_results[rollout_idx] = raw
continue
else:
# Think block not closed
if self._is_new_tool_call(raw):
# Tool call present, continue to next turn after executing tool
print(
f"🔧 [ROLLOUT {rollout_idx}] Tool call detected - extracting and executing"
)
call_json = self._extract_last_call(raw)
if call_json is None:
print(
f"❌ [ROLLOUT {rollout_idx}] Failed to parse tool call JSON - marking inactive"
)
done[rollout_idx] = True
final_results[rollout_idx] = raw
continue
print(
f"🔧 [ROLLOUT {rollout_idx}] Executing {call_json['name']}\n"
f"with args: {call_json['arguments']}"
)
try:
result = await self._exec_tool(call_json)
executed_tools[rollout_idx].append(call_json)
print(f"✅ [ROLLOUT {rollout_idx}] Tool result: {result}")
# Clean up any malformed/partial closing tags before appending
content = assistant_msgs[rollout_idx]["content"]
content = re.sub(
r"</tool_call.*?$", "", content, flags=re.MULTILINE
)
assistant_msgs[rollout_idx]["content"] = content
# Append proper closing tag and response
assistant_msgs[rollout_idx]["content"] += "</tool_call>\n"
assistant_msgs[rollout_idx][
"content"
] += (
f"<tool_response>{json.dumps(result)}</tool_response>\n"
)
print(
f"📝 [ROLLOUT {rollout_idx}] Added tool response to context"
)
continue
except Exception as e:
print(
f"❌ [ROLLOUT {rollout_idx}] Tool execution failed: {e}"
)
done[rollout_idx] = True
final_results[rollout_idx] = raw
continue
else:
# No new tool call or boxed answer yet
if turn_idx + 1 < max_turns:
print(
f"🔄 [ROLLOUT {rollout_idx}] Still thinking—continuing to next turn"
)
continue
# max turns reached, fail
print(
f"⚠️ [ROLLOUT {rollout_idx}] Max turns reached without completion—marking failed"
)
done[rollout_idx] = True
final_results[rollout_idx] = raw
continue
turn_idx += 1
# Process final results and score
print(f"\n🏁 [EXECUTION COMPLETE] Processed {turn_idx} turns")
# -- Summary of all rollouts before scoring --
expr = None
if (
isinstance(expected, dict)
and "arguments" in expected
and "code" in expected["arguments"]
):
code_str = expected["arguments"]["code"]
if code_str.startswith("print(") and code_str.endswith(")"):
expr = code_str[6:-1]
print("\n\033[96m🔎 Final rollout results:\033[0m")
any_success = False
for i in range(num_rollouts):
# Get full text and boxed value
raw_full = (
final_results[i]
if final_results[i] is not None
else assistant_msgs[i]["content"]
)
boxed_val = self._boxed_after_think(raw_full)
# Determine correctness against expected
is_correct = False
if expr is not None and boxed_val is not None:
is_correct = boxed_val == expr or self._canon_num(
boxed_val
) == self._canon_num(expr)
# Choose color and label
if is_correct:
label = "CORRECT"
lbl_color = "\033[92m"
any_success = True
elif boxed_val is not None:
label = "WRONG"
lbl_color = "\033[93m"
else:
label = "NO_BOX"
lbl_color = "\033[91m"
reset = "\033[0m"
last = last_turns[i]
print()
print(f"\033[93m--- ROLLOUT {i} ---\033[0m")
print(f"Result: {lbl_color}{label}{reset}")
# Last turn content
print("Last turn output:")
print(f"\033[96m{last}\033[0m")
# Boxed vs expected
print(f"Boxed answer: {boxed_val!r}")
print(f"Expected answer: {expr!r}")
if not any_success:
print(
f"⚠️ All {num_rollouts} rollouts failed to produce a boxed answer. Invalidating group."
)
return None, []
# -- End summary --
for rollout_idx in range(num_rollouts):
try:
raw = (
final_results[rollout_idx]
if final_results[rollout_idx] is not None
else assistant_msgs[rollout_idx]["content"]
)
toks = self.tokenizer.encode(raw)
if len(toks) > MAX_REPLY_TOKENS:
toks = toks[:MAX_REPLY_TOKENS]
raw = self.tokenizer.decode(toks)
final_assistant_msg = {"role": "assistant", "content": raw}
full_ctx: List[Message] = prompt_msgs + [final_assistant_msg]
expr = (
expected["arguments"]["code"][6:-1]
if (
isinstance(expected, dict)
and "arguments" in expected
and "code" in expected["arguments"]
and expected["arguments"]["code"].startswith("print(")
and expected["arguments"]["code"].endswith(")")
)
else None
)
boxed = self._boxed_after_think(raw)
same = boxed == expr or (
boxed and expr and self._canon_num(boxed) == self._canon_num(expr)
)
reward = 1.0 if same else -1.0
if "</think>" not in raw:
reward = -1.0
else:
end_pos = raw.lower().find("</think>")
# Check for tool calls or responses after </think>
if (
"<tool_call" in raw[end_pos + len("</think>") :].lower()
or "<tool_response" in raw[end_pos + len("</think>") :].lower()
):
reward = -1.0
# Add bonus for tool usage if the completion was successful
elif reward > 0 and len(executed_tools[rollout_idx]) > 0:
print(
f"🌟 [ROLLOUT {rollout_idx}] Adding tool usage bonus (+{TOOL_USAGE_BONUS})"
)
reward += TOOL_USAGE_BONUS
tok = tokenize_for_trainer(self.tokenizer, full_ctx)
scored["tokens"].append(tok["tokens"])
scored["masks"].append(tok["masks"])
scored["scores"].append(reward)
self.percent_correct_buffer.append(max(reward, 0))
# Add successful completions to dynamic pool regardless of number of turns
if (
reward >= 1.0
): # This will now include both 1.0 and 1.0 + TOOL_USAGE_BONUS
u = {"role": "user", "content": prompt_msgs[-1]["content"]}
a = {"role": "assistant", "content": raw}
self.dynamic_pool.append((u, a))
if len(self.dynamic_pool) > self.dynamic_pool_max:
self.dynamic_pool.pop(0)
except Exception:
scored["tokens"].append([])
scored["masks"].append([])
scored["scores"].append(-1.0)
self.percent_correct_buffer.append(0.0)
print(
"\n🏁 [EXECUTION MODE] Completed all rollouts. Average reward: \n"
f"{sum(scored['scores'])/len(scored['scores']):.3f}"
)
# -- Per-rollout score summary --
print("\n\033[96m📊 Rollout score summary:\033[0m")
reset = "\033[0m"
for i, score in enumerate(scored["scores"]):
color = "\033[92m" if score > 0 else "\033[91m"
print(f" \033[93m[ROLLOUT {i}]\033[0m Score: {color}{score}{reset}")
# Add warning if all rollouts failed
if all(score < 0 for score in scored["scores"]):
print(
f"⚠️ [WARNING] All {len(scored['scores'])} rollouts failed with negative rewards!"
)
print(
" This may indicate a problem with the model, prompt, or token budget."
)
# Signal failure to the outer loop
return None, []
return scored, []
# --------------------- evaluation loop -------------------------------- #
async def evaluate(self, *_, **__):
"""
Simple eval: run one rollout per test item, compute binary correctness
based on the boxed answer. Adds a metric 'eval/percent_correct'.
"""
if not hasattr(self, "test"):
return
total, correct = 0, 0
for sample in self.test:
# Build prompt exactly like get_next_item but without mutating self.iter
prompt_text = sample["problem"]
expr = sample["expected_answer"].strip()
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt_text},
]
prompt = self.tokenizer.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False
)
comp = await self.server.completion(
prompt=prompt,
n=1,
max_tokens=1024,
temperature=0.0,
split="eval",
)
model_reply = comp.choices[0].text
boxed = self._boxed_after_think(model_reply)
if boxed and boxed == expr:
correct += 1
total += 1
accuracy = correct / max(total, 1)
self.eval_metrics.append(("eval/percent_correct", accuracy))
# --------------------- dataset iterator ------------------------------- #
async def get_next_item(self):
idx = self.rng.randint(0, len(self.train) - 1)
sample = self.train[idx]
prompt_text = sample["problem"]
expr = sample["expected_answer"].strip()
answer_call = {
"name": "python_interpreter",
"arguments": {"code": f"print({expr})"},
}
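        # The expected payload wraps the ground-truth answer in print(...);
        # scoring later slices off the leading "print(" and trailing ")" to
        # recover the expression for comparison with the boxed answer.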
        # ---------------- few-shot demonstration ---------------- #
fewshot_user = {
"role": "user",
"content": "Compute the integral of x^2 from 0 to 1.",
}
fewshot_assistant = {
"role": "assistant",
"content": (
"<think>\n"
"Let's evaluate the definite integral ∫₀¹ x² dx. This is a basic power rule integral.\n"
"We know:\n"
"∫ xⁿ dx from a to b = [xⁿ⁺¹ / (n+1)] from a to b.\n"
"So for x²:\n"
"= [x³ / 3] from 0 to 1\n"
"= (1³ / 3) - (0³ / 3) = 1/3 - 0 = 1/3\n"
"That checks out, but let's confirm with SymPy just to be sure.\n"
'<tool_call>{"name":"python_interpreter", '
'"arguments":{"code":'
'"import sympy as sp\\n'
"x=sp.symbols('x')\\n"
'print(sp.integrate(x**2,(x,0,1)))"}}\n'
"</tool_call>\n"
'<tool_response>{"result": 1/3}</tool_response>\n'
"The interpreter returns 1/3, so the value is 0.333̅.\n"
"</think>\n\n"
"The integral equals \\boxed{\\tfrac{1}{3}} \\approx 0.333."
),
}
# --- second tiny example: simple arithmetic with calculator ---- #
fewshot_user2 = {"role": "user", "content": "What is (2 + 3) * 4 ?"}
fewshot_assistant2 = {
"role": "assistant",
"content": (
"<think>\n"
"I need (2+3)*4. Quick mental math gives 5*4 = 20, "
"but I'll confirm with the calculator tool.\n"
'<tool_call>{"name":"calculator", '
'"arguments":{"expr":"(2+3)*4"}}</tool_call>\n'
'<tool_response>{"value": 20}</tool_response>\n'
"The tool also says 20, matching my headmath.\n"
"</think>\n\n"
"Therefore the answer is \\boxed{20}."
),
}
# --------------- build final prompt messages ------------ #
system_msg = {"role": "system", "content": SYSTEM_PROMPT}
real_user = {
"role": "user",
"content": (
f"{prompt_text} \n"
"This is a math problem, you must use the python_interpreter or calculator tool call to solve it."
# "Before you call the tools, try to solve it step-by-step and then use the tool to verify"
),
}
# Optionally insert one real demo from dynamic_pool
dyn = list(self.dynamic_pool[-1]) if self.dynamic_pool else []
messages = (
[
system_msg,
fewshot_user,
fewshot_assistant,
fewshot_user2,
fewshot_assistant2,
]
+ dyn
+ [real_user]
)
# Freeze for hashing
frozen = tuple(frozenset(m.items()) for m in messages)
return (frozen, answer_call)
# --------------------- wandb logging ---------------------------------- #
async def create_rollout_table(self, metrics):
if self.rollouts_for_wandb:
table = wandb.Table(columns=["text", "score"])
for grp in self.rollouts_for_wandb:
for txt, sc in grp:
table.add_data(txt, sc)
metrics["train/rollouts"] = table
self.rollouts_for_wandb = []
return metrics
async def wandb_log(self, metrics: Dict = None):
metrics = metrics or {}
if self.percent_correct_buffer:
metrics["train/percent_correct"] = sum(self.percent_correct_buffer) / len(
self.percent_correct_buffer
)
self.percent_correct_buffer = []
for k, v in self.eval_metrics:
metrics[k] = v
self.eval_metrics = []
await super().wandb_log(metrics)
if __name__ == "__main__":
InterleavedInlineEnv.cli()