diff --git a/LICENSE b/LICENSE
index 75410e73..5700b997 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2025 Nous Research
+Copyright (c) 2026 Nous Research
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/atroposlib/envs/eval.py b/atroposlib/envs/eval.py
new file mode 100644
index 00000000..21090b03
--- /dev/null
+++ b/atroposlib/envs/eval.py
@@ -0,0 +1,225 @@
+import json
+import logging
+import os
+import time
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Tuple
+
+import jsonlines
+import numpy as np
+from openai.types.chat import ChatCompletion
+from tqdm.asyncio import tqdm_asyncio
+
+from atroposlib.envs.server_handling.server_manager import ServerManager
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+
+def pass_at_k(m, c, k):
+ """
+ m: total samples
+ c: correct samples
+ k: k in pass@k
+ """
+ if m - c < k:
+ return 1.0
+ return 1.0 - np.prod(1.0 - k / np.arange(m - c + 1, m + 1))
+
+
+def evaluate_log(
+ metrics: Dict,
+ eval_dir: Optional[str] = None,
+ task_name: Optional[str] = None,
+ model_name: Optional[str] = None,
+ start_time: Optional[float] = None,
+ end_time: Optional[float] = None,
+ generation_parameters: Optional[Dict] = None,
+ samples: Optional[List[Dict]] = None,
+ verbose: bool = True,
+):
+ """
+ Log evaluation results to a JSON file in the format expected by nous-evals.
+
+ Args:
+ metrics: Dictionary of metrics to log (same format as wandb_log)
+ eval_dir: Directory to save evaluation results to
+ task_name: Name of the evaluation task (defaults to env name)
+ model_name: Name of the model being evaluated
+ start_time: Start time of evaluation (unix timestamp)
+ end_time: End time of evaluation (unix timestamp)
+ generation_parameters: Dictionary of generation parameters used
+ samples: List of sample dictionaries to save to samples.jsonl
+ verbose: If True, print a markdown table of the metrics
+ """
+ if eval_dir is None:
+ logger.warning("eval_dir is not set, skipping evaluation logging")
+ return
+ # Create directory if it doesn't exist
+ os.makedirs(eval_dir, exist_ok=True)
+
+ # Generate filename
+ filename = "metrics.json"
+ filepath = os.path.join(eval_dir, filename)
+
+ if start_time is None:
+ start_time = time.time()
+ if end_time is None:
+ end_time = time.time()
+ if generation_parameters is None:
+ generation_parameters = {}
+
+ # Print metrics table if verbose
+ if verbose:
+ from atroposlib.utils.display import display_metrics_table
+
+ display_metrics_table(task_name, metrics, start_time, end_time)
+
+    # Build evaluation result structure (modeled on lighteval's output format)
+ task_key = f"atropos|{task_name}|0"
+
+ eval_result = {
+ "config_general": {
+ "model_name": model_name,
+ "total_evaluation_time_seconds": str(end_time - start_time),
+ "generation_parameters": generation_parameters,
+ },
+ "results": {
+ task_key: metrics,
+ "all": metrics,
+ },
+ }
+
+ # Write main results to JSON file
+ with open(filepath, "w") as f:
+ json.dump(eval_result, f, indent=2)
+
+ print(f"Evaluation results saved to {filepath}")
+
+ # Write samples to JSONL file if provided
+ if samples:
+ samples_filepath = os.path.join(eval_dir, "samples.jsonl")
+ with jsonlines.open(samples_filepath, "w") as writer:
+ for sample in samples:
+ writer.write(sample)
+ print(f"Evaluation samples saved to {samples_filepath}")
+
+
+class EvalBase(ABC):
+ """ """
+
+ def __init__(self, pass_at_n=1, pass_at_n_samples=1, **kwargs):
+ self.pass_at_n = pass_at_n
+ self.pass_at_n_samples = pass_at_n_samples
+ for k, v in kwargs.items():
+ setattr(self, k, v)
+ self.data = self.setup_data()
+
+ def get_generation_params(self):
+ """
+ Generation params to be sent to an openai server
+ """
+ temp = getattr(self, "temperature", 0.0)
+ n = max(self.pass_at_n_samples, self.pass_at_n)
+ top_p = getattr(self, "top_p", 1.0)
+ max_tokens = getattr(self, "max_tokens", -1)
+ return {"temperature": temp, "n": n, "top_p": top_p, "max_tokens": max_tokens}
+
+ async def chat_completion(self, server, messages) -> ChatCompletion:
+ gen_params = self.get_generation_params()
+ return await server.chat_completion(messages=messages, **gen_params)
+
+ @abstractmethod
+ def setup_data(self) -> list:
+ raise NotImplementedError("Setup data method must be implemented in subclass")
+
+ @abstractmethod
+ async def run_item(
+ self, server: ServerManager, data_item: dict
+ ) -> Tuple[dict, list]:
+ """
+ An abstract method that must be overridden in a subclass to define how a
+ specific item should be processed. This method encapsulates the logic required
+ to run or process the given data item on the provided server instance.
+
+ Args:
+ server (ServerManager): An instance of ServerManager used to manage and
+ interact with server operations during the item processing.
+ data_item (dict): A dictionary representing the data item to be processed.
+ The structure and content of the dictionary would depend on the
+ specific application and use case.
+
+ Returns:
+ Tuple[dict, list]: A tuple where the first element is a dictionary
+ containing the processed results or output of the operation, and
+ the second element is a list containing any additional data generated
+ or collected during the item's processing.
+
+ Raises:
+ NotImplementedError: This error is raised when the method is not
+ implemented in subclasses.
+ """
+ raise NotImplementedError("Run item method must be implemented in subclass")
+
+ async def __call__(self, server_manager: ServerManager):
+ task_coros = list()
+ start_time = time.time()
+ for data_item in self.data:
+ task_coros.append(self.run_item(server_manager, data_item))
+ task_results = await tqdm_asyncio.gather(*task_coros)
+ end_time = time.time()
+ # grab metrics and generation params
+ metrics_list = [result[0] for result in task_results]
+ # aggregate metrics
+ keys = list(metrics_list[0].keys())
+ metrics = {
+ key: sum([result[key] for result in metrics_list]) / len(metrics_list)
+ for key in keys
+ }
+        # collect per-item samples for logging
+ samples = [result[1] for result in task_results]
+ task_name = self.__class__.__name__
+ task_name += f"@{self.pass_at_n}"
+ if self.pass_at_n != self.pass_at_n_samples:
+ task_name += f":{self.pass_at_n_samples}"
+ print(f"{task_name} metrics: {metrics}")
+ evaluate_log(
+ metrics,
+ eval_dir=getattr(self, "eval_dir", None),
+ task_name=task_name,
+ model_name=server_manager.servers[0].config.model_name,
+ start_time=start_time,
+ end_time=end_time,
+ generation_parameters=self.get_generation_params(),
+ samples=samples,
+ verbose=getattr(self, "verbose", False),
+ )
+
+ return metrics
+
+
+async def eval_runner(eval_env: EvalBase):
+ import argparse
+
+ from atroposlib.envs.server_handling.server_baseline import APIServerConfig
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--server-url",
+ type=str,
+ default="http://localhost:8000",
+ help="URL of the server to connect to.",
+ )
+ parser.add_argument("--model-name", type=str, default=None, help="Model name")
+ args = parser.parse_args()
+ server_manager = ServerManager(
+ configs=[
+ APIServerConfig(
+ api_key="dummy",
+ base_url=args.server_url,
+ model_name=args.model_name,
+ health_check=False,
+ ),
+ ]
+ )
+ return await eval_env(server_manager)
diff --git a/atroposlib/envs/server_handling/server_manager.py b/atroposlib/envs/server_handling/server_manager.py
index 2a3154f5..b9666093 100644
--- a/atroposlib/envs/server_handling/server_manager.py
+++ b/atroposlib/envs/server_handling/server_manager.py
@@ -1,6 +1,7 @@
import asyncio
import inspect
import os
+import warnings
from contextlib import asynccontextmanager
from typing import AsyncGenerator, List, Optional, Union
@@ -394,12 +395,18 @@ class ServerManager:
most_available_server_num_slots = server.sem._value
# Create ManagedServer wrapping the selected server
- managed = ManagedServer(
- server=self.servers[most_available_server], tokenizer=tokenizer
- )
+ if isinstance(self.servers[most_available_server], OpenAIServer):
+ warnings.warn(
+ "Using OpenAIServer with managed_server does not allow for state tracking"
+ )
+ yield self.servers[most_available_server]
+ else:
+ managed = ManagedServer(
+ server=self.servers[most_available_server], tokenizer=tokenizer
+ )
- try:
- yield managed
- finally:
- # Clean up: reset tracked sequences
- managed.reset()
+ try:
+ yield managed
+ finally:
+ # Clean up: reset tracked sequences
+ managed.reset()
diff --git a/environments/README.md b/environments/README.md
index c723b3a5..79201afb 100644
--- a/environments/README.md
+++ b/environments/README.md
@@ -554,3 +554,91 @@ Wrap each *SEARCH/REPLACE* edit in a code block as shown in the example above. I
- **Dataset Handling:** Loads training and test data from Hugging Face datasets, specifically tailored for SWE-bench like formats.
- **Patch Parsing:** Implements robust parsing for a specific SEARCH/REPLACE patch format.
- **Thinking Tag Processing:** Extracts content after `</think>`
+
+---
+
+### Text Reversal Environment (`text_reversal_environment.py`)
+
+Environment for training and evaluating exact string reversal with optional thinking and split train/eval context lengths.
+
+**Dataset:**
+- `PrimeIntellect/Reverse-Text-SFT`
+
+**Input Format:**
+- Each item contains two `prompt` messages and one `completion` message:
+ - `prompt`: list of messages with roles {`system`, `user`}
+ - `completion`: list with a single assistant message containing the reversed text, wrapped in `...`
+
+**Prompt Construction:**
+- The dataset's system text is NOT used as a system message to the model.
+- Instead, it is prepended to the user content with two newline separators and sent as the user turn:
+ - Effective user content: `"{dataset_system}\n\n{dataset_user}"`
+- Optional thinking system prompt is included only when `use_thinking=True`.
+
+**Reward Function:**
+- Extract the model output after the first closing `</think>` tag (if present) and trim whitespace.
+- Score is 1.0 if the remaining output EXACTLY matches the dataset assistant `completion` content; otherwise 0.0 (see the sketch below).
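+
+A minimal sketch of this rule (the function name and signature are illustrative, not the environment's actual API):
+
+```python
+def score_reversal(model_output: str, reference: str) -> float:
+    """Exact-match reward on the text after the first closing think tag."""
+    closing = "</think>"
+    if closing in model_output:
+        # keep only what follows the first closing tag
+        model_output = model_output.split(closing, 1)[1]
+    return 1.0 if model_output.strip() == reference else 0.0
+```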
+
+**Optional CoT Length Penalty (for correct rollouts only):**
+- Enabled by default (`length_penalty_enabled=True`).
+- Within each training group, compute CoT token lengths from the content inside the first `<think>...</think>` block of correct rollouts (the penalty computation is sketched after this list).
+- Let L̄ be the average of those lengths. A deadband δ (default 5 tokens) defines a threshold `L̄ + δ`.
+- Any correct rollout with length above this threshold is penalized: `score = 1 - α * ((excess / L̄)^p)`, clipped to `[penalty_min_score, 1]`.
+ - Defaults: `α=0.5`, `p=2`, `penalty_min_score=0.2`.
+- Incorrect rollouts remain at 0.0. If a rollout has no valid think block (or thinking is disabled), the penalty is skipped for that rollout.
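+
+A minimal sketch of the per-rollout penalty under the defaults above (the function and parameter names are illustrative, and measuring `excess` above the threshold `L̄ + δ` is an assumption):
+
+```python
+def length_penalized_score(cot_len: int, mean_len: float, deadband: int = 5,
+                           alpha: float = 0.5, power: float = 2.0,
+                           min_score: float = 0.2) -> float:
+    """Penalize a correct rollout (base score 1.0) whose CoT exceeds mean + deadband."""
+    threshold = mean_len + deadband
+    if cot_len <= threshold:
+        return 1.0  # inside the deadband: no penalty
+    excess = cot_len - threshold
+    score = 1.0 - alpha * (excess / mean_len) ** power
+    return max(min_score, min(1.0, score))
+```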
+
+**Curriculum: One-Epoch + Hard Retries (optional):**
+- Controlled by `curriculum_one_epoch_enabled` (default: True).
+- First pass (one epoch): each item is attempted once. If any rollout in the group is correct (≥1/N), the item is considered solved and never revisited. If the group has zero correct (0/N), the item is marked “hard” and placed into a retry pool.
+- Retry phase: begins only after the first pass over all training items completes. Items in the retry pool are revisited up to `hard_retry_max_attempts` times (default: 3); if still unsolved, they are dropped, and training completes naturally once the retry pool is exhausted (see the sketch below).
+- Tip: Use a large `total_steps`. The environment will stop serving items once the one-epoch + retries queues are exhausted (it raises completion in `get_next_item`).
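+
+A rough sketch of the queue logic described above (illustrative only; these names are not the environment's actual internals):
+
+```python
+from collections import deque
+
+NUM_ITEMS = 1_000            # illustrative dataset size
+HARD_RETRY_MAX_ATTEMPTS = 3  # mirrors hard_retry_max_attempts
+
+first_pass = deque(range(NUM_ITEMS))  # one epoch over every training item
+retry_pool = deque()                  # (item_idx, attempts) for unsolved items
+
+def next_item():
+    """Return (item_idx, attempts), or None once both queues are exhausted."""
+    if first_pass:
+        return first_pass.popleft(), 0
+    if retry_pool:
+        return retry_pool.popleft()
+    return None  # analogous to get_next_item signalling completion
+
+def report_group(item_idx: int, attempts: int, num_correct: int) -> None:
+    """Re-queue a 0/N group for another attempt, up to the retry cap."""
+    if num_correct == 0 and attempts < HARD_RETRY_MAX_ATTEMPTS:
+        retry_pool.append((item_idx, attempts + 1))
+```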
+
+**Configuration Options (`TextReversalEnvConfig`):**
+- `use_thinking` (bool, default: False): include thinking system prompt.
+- `dataset_name` (str, default: `PrimeIntellect/Reverse-Text-SFT`): training dataset.
+- `eval_dataset_name` (Optional[str], default: None): static eval dataset to use (full split). If `None`, the environment samples `test_set_size` examples from the training dataset for eval.
+- `test_set_size` (int, default: 100): number of samples for eval when `eval_dataset_name=None`.
+- `max_train_token_length` (int, default: 16384): max tokens for training generations.
+- `max_eval_token_length` (int, default: 32768): max tokens for eval generations.
+- `length_penalty_enabled` (bool, default: True): enable within-group CoT length penalty for correct rollouts.
+- `penalty_deadband_tokens` (int, default: 5): δ deadband added above average length before penalizing.
+- `penalty_alpha` (float, default: 0.5): penalty scale.
+- `penalty_power` (float, default: 2.0): penalty exponent (quadratic by default).
+- `penalty_min_score` (float, default: 0.2): lower bound for penalized correct rollouts.
+- `curriculum_one_epoch_enabled` (bool, default: True): enables one-pass training plus a late retry phase for hard items.
+- `hard_retry_max_attempts` (int, default: 3): maximum retry attempts per hard item in the retry phase.
+
+**Usage Examples:**
+```bash
+# Basic training with default 16k train context, 32k eval context, and sampled eval set (100 examples)
+python text_reversal_environment.py serve
+
+# Enable thinking system prompt
+python text_reversal_environment.py serve \
+ --env.use_thinking=True
+
+# Use a static eval dataset instead of sampling from train
+python text_reversal_environment.py serve \
+ --env.eval_dataset_name="someorg/Reverse-Text-EVAL"
+
+# Override max token lengths if needed
+python text_reversal_environment.py serve \
+ --env.max_train_token_length=12000 \
+ --env.max_eval_token_length=28000
+
+# Adjust/disable the CoT length penalty for correct rollouts
+python text_reversal_environment.py serve \
+ --env.length_penalty_enabled=False \
+ --env.penalty_deadband_tokens=8 \
+ --env.penalty_alpha=0.6 \
+ --env.penalty_power=2.0 \
+ --env.penalty_min_score=0.3
+
+# Enable one-epoch + retries curriculum and set max retries
+python text_reversal_environment.py serve \
+ --env.curriculum_one_epoch_enabled=True \
+ --env.hard_retry_max_attempts=3
+```
+
+**Evaluation Metric:**
+- `eval/percent_correct`: strict exact-match accuracy on the eval set.
diff --git a/environments/code_execution_server/README.md b/environments/code_execution_server/README.md
new file mode 100644
index 00000000..46e16592
--- /dev/null
+++ b/environments/code_execution_server/README.md
@@ -0,0 +1,239 @@
+# Code Execution Environment
+
+A comprehensive environment for training language models to solve coding problems through **code generation and execution**. It evaluates a model's ability to generate correct Python code that passes test cases, using a Modal endpoint to execute and validate the generated code.
+
+## 🎯 Overview
+
+The Code Execution Environment evaluates models on:
+- Generating correct Python code solutions for coding problems
+- Passing test cases through actual code execution
+- Handling both function-based and stdin/stdout problem types
+- Supporting multiple datasets (RLVR, DeepMind Code Contests, LCB Test)
+- Providing comprehensive evaluation metrics (pass@1, pass@k, difficulty breakdowns)
+
+**Key Philosophy**: This environment scores based on **code correctness** (1.0 for passing all tests, -1.0 for failing). Models must generate syntactically valid, executable Python code that produces the correct outputs for given test cases.
+
+## ✨ Key Features
+
+### 🔧 **Code Execution & Testing**
+- Real code execution via Modal endpoint (`lcb_modal_endpoint.py`)
+- Support for function-based problems (LeetCode-style)
+- Support for stdin/stdout problems (competitive programming style)
+- Automatic test case validation
+- Error handling (timeouts, runtime errors, wrong answers)
+
+### 📊 **Comprehensive Evaluation**
+- Pass@1 metric calculation using combinatorial estimation
+- Pass@group_size evaluation
+- Difficulty-level breakdowns (easy/medium/hard)
+- Completion length tracking (correct vs incorrect solutions)
+- Overlong ratio tracking (solutions exceeding token limits)
+
+### 📚 **Dataset Support**
+- **RLVR_Coding_Problems**: Training dataset for reinforcement learning
+- **DeepMind Code Contests**: Alternative training dataset
+- **LCB Test**: Evaluation benchmark from LiveCodeBench
+- Automatic dataset format handling and conversion
+
+### ⚙️ **Safety & Reliability**
+- Sandboxed code execution via Modal
+- Timeout protection
+- Resource limits (memory, CPU)
+- Reliability guards preventing destructive operations
+- Segmentation fault handling
+
+## 🚀 Quick Start
+
+### Basic Configuration
+
+```python
+from atropos.environments.code_execution_server import CodingEnv, CodeConfig
+
+# Configuration
+config = CodeConfig(
+ dataset_name="normal", # or "deepmind"
+ temperature=1.0,
+ eval_temperature=0.6,
+ top_p=1.0,
+ eval_top_p=0.95,
+ start_idx=0,
+ max_eval_token_length=40960,
+)
+
+# Initialize environment
+env = CodingEnv(config, server_configs)
+```
+
+### Problem Types
+
+#### **Function-Based Problems**
+Problems where code defines a function that is called with test inputs:
+```python
+# Problem specification includes function signature
+def solve(nums: List[int]) -> int:
+ # Model generates function body
+ return max(nums)
+
+# Tests call the function directly
+tests = {
+ "fn_name": "solve",
+ "input": [[1, 2, 3], [5, 4, 3]],
+ "output": [3, 5]
+}
+```
+
+#### **Standard Input/Output Problems**
+Problems where code reads from stdin and writes to stdout:
+```python
+# Problem: Read integers, output their sum
+# Model generates:
+a = int(input())
+b = int(input())
+print(a + b)
+
+# Tests provide stdin inputs and expected stdout outputs
+tests = {
+ "fn_name": "none",
+ "input": ["5\n3", "10\n20"],
+ "output": ["8", "30"]
+}
+```
+
+## 📊 Evaluation Metrics
+
+### **Pass@1 (Estimated)**
+Uses combinatorial estimation to calculate the probability of at least one correct solution in a single attempt:
+```
+pass@1 = mean(1 - C(n-c, 1) / C(n, 1)) across all problems
+where n = group_size, c = number of correct solutions
+```
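+
+A sketch of the estimator (this is the standard unbiased pass@k formula; the same computation appears as `estimator` inside `coding_server.py`):
+
+```python
+import numpy as np
+
+def pass_at_k(n: int, c: int, k: int) -> float:
+    """Unbiased pass@k: 1 - C(n - c, k) / C(n, k), computed without large binomials."""
+    if n - c < k:
+        return 1.0
+    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+# pass@1 for the benchmark is the mean of pass_at_k(n, c, 1) over all problems,
+# with n = group_size and c = number of correct completions per problem.
+```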
+
+### **Pass@group_size**
+Fraction of problems where at least one solution is correct:
+```
+pass@group_size = mean(num_correct > 0)
+```
+
+### **Difficulty Breakdowns**
+Separate metrics for easy, medium, and hard problems:
+- `eval/easy_pass_1`
+- `eval/medium_pass_1`
+- `eval/hard_pass_1`
+
+### **Completion Analysis**
+- `eval/completion_length`: Average completion length
+- `eval/correct_completion_length`: Average length of correct solutions
+- `eval/incorrect_completion_length`: Average length of incorrect solutions
+- `eval/overlong_ratio`: Fraction of solutions exceeding token limits
+
+### **Training Metrics**
+- `train_rewards/rewards`: Average reward (correctness rate)
+- `train_rewards/pass@group_size`: Training pass rate
+- `train_rewards/overlong_ratio`: Training overlong ratio
+- `train/completion_lengths`: Training completion statistics
+
+
+## 🔍 Code Extraction & Scoring
+
+### **Code Extraction**
+The environment extracts Python code from model responses using regex:
+- Looks for code blocks in markdown format: ` ```python ... ``` `
+- Takes the last code block if multiple are present
+- Returns `None` if no code block is found (scores -1.0)
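+
+A minimal sketch of the extraction step (the regex shown here is illustrative; the actual implementation is `extract_python_code_blocks` in `coding_server.py`):
+
+```python
+import re
+
+def extract_last_python_block(text: str):
+    """Return the body of the last fenced python code block, or None if absent."""
+    blocks = re.findall(r"```python\n(.*?)\n```", text, flags=re.DOTALL)
+    return blocks[-1] if blocks else None
+```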
+
+### **Scoring Logic**
+```python
+if code is None:
+ score = -1.0 # No code extracted
+elif all_tests_pass(code, test_cases):
+ score = 1.0 # All tests pass
+else:
+ score = -1.0 # Tests fail or error occurs
+```
+
+### **Test Execution**
+Code is executed via Modal endpoint with:
+- Timeout protection (default 15 seconds per test case)
+- Memory limits (5GB)
+- Reliability guards (disables file system, network, process operations)
+- Error categorization (Wrong Answer, Time Limit Exceeded, Runtime Error)
+
+## 🛠️ Advanced Features
+
+### **Offline Filtering**
+Utility to identify problems where the model achieves perfect scores:
+```python
+await env.offline_filter()
+# Saves perfect problem indices to perfect_indices.txt
+```
+
+### **Blacklist System**
+Problems in `perfect_indices.txt` are automatically blacklisted during training to focus on harder problems.
+
+### **Data Logging**
+Comprehensive logging to files:
+- **Short logs**: `qwen_data_dump_{timestamp}.txt` - Basic stats per problem
+- **Long logs**: `qwen_data_dump_long_{timestamp}.txt` - Includes code, errors, full outputs
+- Separate directories: `train_logs/` and `eval_logs/`
+
+### **Example Log Entry**
+```json
+{
+ "cur_id": 42,
+ "num_correct": 5,
+ "total": 8,
+ "scores": [1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0],
+ "lengths": [1200, 1180, 1220, 1190, 1210, 800, 750, 820],
+ "errors": [...],
+ "codes": [...],
+ "gen": "assistant message content"
+}
+```
+
+
+## 📝 Response Format Requirements
+
+### **Expected Format**
+Model responses should contain Python code in markdown code blocks:
+````
+Here's my solution:
+
+```python
+def solve(nums):
+ return max(nums)
+```
+````
+
+## 🔐 Code Execution Endpoint (`lcb_modal_endpoint.py`)
+
+The Modal endpoint handles safe execution of generated Python code with comprehensive sandboxing and test validation. Most of the code is adapted from the [LiveCodeBench](https://github.com/LiveCodeBench/LiveCodeBench) and [rllm](https://github.com/rllm-org/rllm) repositories.
+
+### **Problem Type Support**
+- **Call-Based**: Function calls with JSON-serialized inputs/outputs (LeetCode-style)
+- **Standard I/O**: stdin/stdout problems (competitive programming style)
+
+The code execution endpoint (`lcb_modal_endpoint.py`) implements extensive safety measures:
+
+### **Reliability Guards**
+- Disables file system operations (`os.remove`, `os.chdir`, etc.)
+- Blocks process operations (`os.fork`, `subprocess.Popen`)
+- Prevents network access
+- Limits memory usage (5GB default)
+- Sets the recursion limit to prevent stack overflow (a condensed guard sketch follows below)
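+
+A condensed sketch of such a guard, assuming a Linux container where the `resource` module is available (illustrative; the full guard in `lcb_modal_endpoint.py` covers many more operations):
+
+```python
+import builtins
+import os
+import resource
+import subprocess
+import sys
+
+def reliability_guard(maximum_memory_bytes: int = 5 * 1024**3) -> None:
+    """Neuter dangerous operations before executing untrusted code."""
+    resource.setrlimit(
+        resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
+    )
+    sys.setrecursionlimit(50000)
+    builtins.exit = None          # block interpreter shutdown helpers
+    builtins.quit = None
+    os.remove = None              # block file-system mutations
+    os.chdir = None
+    os.fork = None                # block process creation
+    subprocess.Popen = None       # block subprocesses
+```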
+
+### **Error Handling**
+- Timeout exceptions (SIGALRM; see the sketch below)
+- Segmentation fault detection (faulthandler)
+- Runtime error capture
+- Wrong answer detection with detailed comparison
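+
+A self-contained sketch of the SIGALRM plus faulthandler pattern (the wrapper below is illustrative, not the endpoint's API; the -3 error code matches the Time Limit Exceeded code used in the grading functions):
+
+```python
+import faulthandler
+import signal
+import time
+
+class TimeoutException(Exception):
+    pass
+
+def _timeout_handler(signum, frame):
+    raise TimeoutException
+
+signal.signal(signal.SIGALRM, _timeout_handler)
+
+def run_with_timeout(fn, timeout_s: int = 15):
+    """Run fn() under a SIGALRM timeout; faulthandler reports hard crashes."""
+    faulthandler.enable()
+    signal.alarm(timeout_s)
+    try:
+        return fn()
+    except TimeoutException:
+        return {"error_code": -3, "error_message": "Time Limit Exceeded"}
+    finally:
+        signal.alarm(0)
+        faulthandler.disable()
+
+# Example: a call that exceeds the one-second limit
+print(run_with_timeout(lambda: time.sleep(5), timeout_s=1))
+```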
+
+### **Execution Environment**
+- Isolated Modal containers
+- Signal-based timeouts
+- Resource limits (CPU, memory)
+- Sandboxed imports (whitelisted standard library modules)
+
+
+## 📄 License
+
+This environment is part of the Atropos training framework. See the main repository for license information.
diff --git a/environments/code_execution_server/coding_server.py b/environments/code_execution_server/coding_server.py
index c2088ff4..0a64acf3 100644
--- a/environments/code_execution_server/coding_server.py
+++ b/environments/code_execution_server/coding_server.py
@@ -1,117 +1,350 @@
import asyncio
+import json
+import math
import os
-import random
-from typing import List, Optional, Tuple
+import time
+from collections import deque
+from datetime import datetime
+from typing import Dict, List, Optional, Tuple
-import docker
-import httpx
+import modal
+import numpy as np
import regex as re
+import wandb
from datasets import load_dataset
+from pydantic import Field
+from rich import print as rprint
-from atroposlib.envs.base import BaseEnv, ScoredDataGroup
+from atroposlib.envs.base import (
+ APIServerConfig,
+ BaseEnv,
+ BaseEnvConfig,
+ EvalHandlingEnum,
+ ScoredDataGroup,
+)
from atroposlib.type_definitions import GameHistory, Item
from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
-system_prompt = (
- "You are a deep thinking AI, you may use extremely long chains of thought "
- "to deeply consider the problem and deliberate with yourself via systematic "
- "reasoning processes to help come to a correct solution prior to answering. "
- "You should enclose your thoughts and internal monologue inside "
- "tags, and then provide your solution or response to the problem.\n\n"
+system_prompt = ""
+
+system_prompt += (
+ "You are an expert Python programmer. You will be given a question "
+ "(problem specification) and will generate a correct Python program "
+ "that matches the specification and passes all tests."
)
+FORMATTING_MESSAGE_WITH_STARTER_CODE = (
+ "You will use the following starter code to write the solution to the "
+ "problem and enclose your code within delimiters."
+)
+FORMATTING_WITHOUT_STARTER_CODE = (
+ "Read the inputs from stdin, solve the problem, and write the answer to "
+ "stdout (do not directly test on the sample inputs). Enclose your code "
+ "within delimiters as follows. Ensure that when the python program runs "
+ "it reads the inputs runs the algorithm and writes output to STDOUT."
+)
-async def submit_code(client, code, test_input, language="python"):
- url = "http://localhost:5002/execute"
- payload = {"code": code, "input": test_input, "language": language}
- response = await client.post(url, json=payload)
- response_json = response.json()
- return response_json["output"]
+async_semaphore = asyncio.Semaphore(100)
-async def get_results(code, answer):
- async with httpx.AsyncClient() as client:
- tasks = []
- for i in range(len(answer)):
- tasks.append(submit_code(client, code, answer[i]))
-
- results = await asyncio.gather(*tasks)
- return [result for result in results]
+def get_prompt(question, problem_type, starter_code=None):
+ prompt = ""
+ prompt += f"Question: {question}\n\n"
+ if problem_type == "func" and starter_code:
+ prompt += f"{FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+ prompt += f"```python\n{starter_code}\n```\n\n"
+ elif problem_type == "func" and not starter_code:
+ pass
+ else:
+ prompt += f"{FORMATTING_WITHOUT_STARTER_CODE}\n"
+ prompt += "```python\n# YOUR CODE HERE\n```\n\n"
+ prompt += "### Answer: (use the provided format with backticks)\n\n"
+ return prompt
-def init_docker():
- client = docker.from_env()
+run_test = modal.Function.from_name("joeli-lcb", "run_test")
- def build_docker_image():
- try:
- # Build the Docker image
- print("Building Docker image...")
- current_dir = os.path.dirname(
- os.path.abspath(__file__)
- ) # Get the current directory of the script
- image, logs = client.images.build(path=current_dir, tag="code-executor")
- # Print the build logs
- for log in logs:
- print(log.get("stream", "").strip())
-
- print("Docker image built successfully.")
- return image
- except docker.errors.BuildError as e:
- print(f"Error during Docker image build: {e}")
-
- def run_docker_container():
- try:
- # Run the Docker container
- print("Running Docker container...")
- container = client.containers.run(
- "code-executor", ports={"5002/tcp": 5002}, detach=True
- ) # Runs in detached mode (in the background)
-
- print(f"Docker container is running with ID: {container.id}")
- return container
- except docker.errors.ContainerError as e:
- print(f"Error during Docker container run: {e}")
-
- build_docker_image()
- container = run_docker_container()
- return container
+class CodeConfig(BaseEnvConfig):
+ dataset_name: str = Field(
+ "normal", description="dataset name, should be normal or deepmind"
+ )
+ temperature: float = Field(0.6, description="model temperature")
+ eval_temperature: float = Field(
+ 0.6, description="model temperature during evaluation"
+ )
+ top_p: float = Field(0.95, description="top p")
+ eval_top_p: float = Field(0.95, description="eval top p")
+ start_idx: int = Field(0, description="start index in training set")
+ max_eval_token_length: int = Field(
+ 40960, description="max sequence length during evaluation"
+ )
class CodingEnv(BaseEnv):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self.id = 0
+ self.complete = []
+ self.total = []
+ self.complement = []
+ self.eval_metrics = []
+ self.train_metrics = {
+ "rewards": [],
+ "overlong_ratio": [],
+ "pass_gs": [],
+ "completion_lengths": [],
+ "correct_completion_lengths": [],
+ "incorrect_completion_lengths": [],
+ }
+ self.cur_time = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
+
+ self.temp_metrics = {
+ "indices": [],
+ "num_correct": [],
+ }
+
+ self.blacklist = set()
+ self.deq = deque()
+
+ @classmethod
+ def config_init(cls) -> Tuple[CodeConfig, List[APIServerConfig]]:
+ env_config = CodeConfig(
+ tokenizer_name="Qwen/Qwen3-14B",
+ group_size=8,
+ use_wandb=True,
+ rollout_server_url="http://localhost:8000",
+ total_steps=600,
+ batch_size=-1,
+ steps_per_eval=10,
+ max_batches_offpolicy=2,
+ max_eval_workers=128, # max workers per node * num gpus
+ eval_handling=EvalHandlingEnum.STOP_TRAIN,
+ eval_limit_ratio=0.25,
+ max_token_length=32768,
+ min_items_sent_before_logging=0,
+ wandb_name="qwen_gspo_32k_mod",
+ dataset_name="normal",
+ worker_timeout=7200,
+ temperature=1.0,
+ eval_temperature=0.6,
+ top_p=1.0,
+ eval_top_p=0.95,
+ start_idx=0,
+ max_eval_token_length=40960,
+ )
+ server_configs = [
+ APIServerConfig(
+ model_name="Qwen/Qwen3-14B",
+ base_url="http://localhost:9001/v1",
+ api_key="x",
+ num_requests_for_eval=256,
+ timeout=2400,
+ ),
+ ]
+
+ return env_config, server_configs
+
+ # item looks like {problem, tests, problem_type, idx}
async def collect_trajectories(
self, item: Item
) -> Tuple[GameHistory | None, List[Item]]:
- chat_completions = await self.server.chat_completion(
- messages=[
- {
- "role": "system",
- "content": "You must submit your answer with ```python\n{code}```",
- },
- dict(item[0][0]),
- ],
- n=self.config.group_size,
- max_tokens=1024 * 4,
+ rprint("COLLECTING TRAJECTORIES")
+ train_or_valid = item["split"]
+ system_msg = {"role": "system", "content": system_prompt}
+ user_msg = {
+ "role": "user",
+ "content": get_prompt(
+ item["problem"], item["problem_type"], item.get("starter_code", None)
+ ),
+ }
+
+ prompt_tokens = tokenize_for_trainer(
+ self.tokenizer, chat=[system_msg, user_msg]
)
- to_score = list()
- to_backlog = list()
- for i, chat_completion in enumerate(chat_completions.choices):
- messages = (
- dict(item[0][0]),
- {"role": "assistant", "content": chat_completion.message.content},
+ buffer = 32
+
+ async def generate_and_score(index: int) -> Tuple[List[int], List[int], float]:
+ # Step 1: Generate single completion
+
+ max_tokens = (
+ self.config.max_token_length - len(prompt_tokens["tokens"]) - buffer
)
- to_score.append(
- (
- messages,
- item[1],
+ temp = self.config.temperature
+ top_p = self.config.top_p
+ if train_or_valid == "test":
+ max_tokens = (
+ self.config.max_eval_token_length
+ - len(prompt_tokens["tokens"])
+ - buffer
)
+ top_p = self.config.eval_top_p
+ temp = self.config.eval_temperature
+
+ chat_completion = await self.server.chat_completion(
+ messages=[system_msg, user_msg],
+ n=1, # CHANGE
+ max_tokens=max_tokens,
+ temperature=temp,
+ top_p=top_p,
+ )
+ content = chat_completion.choices[0].message.content
+ assistant_msg = {"role": "assistant", "content": content}
+ messages = [system_msg, user_msg, assistant_msg]
+
+ if "fn_name" not in item:
+ fn_name = "none"
+ else:
+ fn_name = item["fn_name"]
+
+ tests = item["tests"]
+ if isinstance(tests, str):
+ tests = json.loads(tests)
+ tests["fn_name"] = fn_name
+
+ score_input = [
+ (messages, tests, item["idx"], chat_completion.choices[0].finish_reason)
+ ]
+ scored_group, tup = await self.score(score_input)
+ return (
+ scored_group["tokens"][0],
+ scored_group["masks"][0],
+ scored_group["scores"][0],
+ scored_group["overrides"][0],
+ tup[0],
+ tup[1],
+ tup[2],
+ assistant_msg,
)
- to_postprocess = await self.score(to_score)
- return to_postprocess, to_backlog
+ start_time = time.time()
+ if train_or_valid == "train":
+ self.total.append(item["idx"]) # LOCK
+ self.complement = sorted(list(set(self.total) - set(self.complete)))
+ print("TOTAL: ", self.complement)
+ if train_or_valid == "train":
+ tasks = [generate_and_score(i) for i in range(self.config.group_size)]
+ else:
+ tasks = [generate_and_score(i) for i in range(16)]
+ results = await asyncio.gather(*tasks)
+ end_time = time.time()
+
+ dt = end_time - start_time
+ scored_data = ScoredDataGroup()
+ scored_data["tokens"] = [r[0] for r in results]
+ scored_data["masks"] = [r[1] for r in results]
+ scored_data["scores"] = [r[2] for r in results]
+ scored_data["overrides"] = [r[3] for r in results]
+
+ codes = [r[4] for r in results]
+ errors = [r[5] for r in results]
+ lengths = [r[6] for r in results]
+ asst_messages = [r[7] for r in results]
+ cur_id = item["idx"]
+ if train_or_valid == "train":
+ self.complete.append(cur_id) # LOCK
+ self.complement = sorted(list(set(self.total) - set(self.complete)))
+
+ rprint(train_or_valid)
+ rprint("CURRENT ID: ", cur_id, item["problem_type"])
+ print("GEN LENGTHS: ", lengths)
+ print("SCORES ", scored_data["scores"])
+ print("MISSING ", self.complement)
+ print("CURRENT TIME", time.time())
+ # for error, code in zip(errors, codes):
+ # print("ERROR:", error)
+ # if 'output' in error and error['output'] == '':
+ # print(code)
+ print(f"Elapsed time: {dt:.2f} seconds")
+
+ async with self.lock:
+ heap_sum = 0
+ for x in scored_data["scores"]:
+ if math.isclose(1.0, x):
+ heap_sum += 1
+ num_override = sum(
+ [x["set_advantage_to_zero"] for x in scored_data["overrides"]]
+ )
+
+ if train_or_valid == "train":
+ self.train_metrics["rewards"].append(
+ 1.0 * heap_sum / self.config.group_size
+ )
+ self.train_metrics["overlong_ratio"].append(
+ 1.0 * num_override / self.config.group_size
+ )
+ self.train_metrics["pass_gs"].append(1.0 * (heap_sum > 0))
+ self.train_metrics["completion_lengths"].append(
+ sum([len(x) for x in scored_data["tokens"]])
+ / len(scored_data["tokens"])
+ )
+ self.train_metrics["correct_completion_lengths"].append(
+ sum(
+ [
+ len(x)
+ for x, y in zip(
+ scored_data["tokens"], scored_data["scores"]
+ )
+ if math.isclose(y, 1.0)
+ ]
+ )
+ )
+ self.train_metrics["incorrect_completion_lengths"].append(
+ sum(
+ [
+ len(x)
+ for x, y in zip(
+ scored_data["tokens"], scored_data["scores"]
+ )
+ if math.isclose(y, -1.0)
+ ]
+ )
+ )
+ self.temp_metrics["indices"].append(cur_id)
+ self.temp_metrics["num_correct"].append(heap_sum)
+
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ """
+ if heap_sum > self.config.group_size - 2 and train_or_valid == "train":
+ self.blacklist.add(cur_id)
+ file_path = os.path.join(script_dir, "blacklist_" + self.cur_time + ".txt")
+ with open(file_path, "a") as f:
+ f.write(json.dumps(cur_id) + "\n")
+ """
+ if train_or_valid == "train":
+ script_dir = os.path.join(script_dir, "train_logs")
+ else:
+ script_dir = os.path.join(script_dir, "eval_logs")
+ file_path = os.path.join(
+ script_dir, "qwen_data_dump_" + self.cur_time + ".txt"
+ )
+ file_path_long = os.path.join(
+ script_dir, "qwen_data_dump_long" + self.cur_time + ".txt"
+ )
+ with open(file_path, "a") as f:
+ mp = {
+ "cur_id": cur_id,
+ "num_correct": heap_sum,
+ "total": self.config.group_size,
+ "scores": scored_data["scores"],
+ "lengths": lengths,
+ }
+ f.write(json.dumps(mp) + "\n")
+ with open(file_path_long, "a") as f:
+ mp = {
+ "cur_id": cur_id,
+ "num_correct": heap_sum,
+ "total": self.config.group_size,
+ "scores": scored_data["scores"],
+ "lengths": lengths,
+ "errors": errors,
+ "codes": codes,
+ "gen": asst_messages[0],
+ }
+ f.write(json.dumps(mp) + "\n")
+
+ return scored_data, []
async def evaluate(self, *args, **kwargs):
"""
@@ -125,30 +358,391 @@ class CodingEnv(BaseEnv):
:param kwargs:
:return: None.
"""
+ rprint("EVALUATION")
+ test_data = self.test
+
+ sema = asyncio.Semaphore(self.config.max_eval_workers)
+
+ all_total, all_correct = [], []
+ easy_total, easy_correct = [], []
+ medium_total, medium_correct = [], []
+ hard_total, hard_correct = [], []
+
+ temp_completion_lengths = []
+
+ correct_completion_lengths = []
+ incorrect_completion_lengths = []
+
+ overlong_ratio = []
+ pass_gs = []
+
+ async def eval(curr_step):
+ async with sema:
+ item = test_data[curr_step]
+ scored_data, _ = await self.collect_trajectories(item)
+
+ scores = scored_data["scores"]
+ num_correct = sum(1 for x in scores if math.isclose(x, 1.0))
+ num_overlong = sum(
+ x["set_advantage_to_zero"] for x in scored_data["overrides"]
+ )
+
+ async with self.lock2:
+ overlong_ratio.append(num_overlong / len(scores))
+ pass_gs.append(num_correct > 0)
+ temp_completion_lengths.append(
+ sum([len(x) for x in scored_data["tokens"]])
+ / len(scored_data["tokens"])
+ )
+
+ correct_completion_lengths.extend(
+ [
+ len(x)
+ for x, y in zip(
+ scored_data["tokens"], scored_data["scores"]
+ )
+ if math.isclose(y, 1.0)
+ ]
+ )
+ incorrect_completion_lengths.extend(
+ [
+ len(x)
+ for x, y in zip(
+ scored_data["tokens"], scored_data["scores"]
+ )
+ if math.isclose(y, -1.0)
+ ]
+ )
+ all_total.append(len(scores))
+ all_correct.append(num_correct)
+ if item["difficulty"] == "easy":
+ easy_total.append(len(scores))
+ easy_correct.append(num_correct)
+ elif item["difficulty"] == "medium":
+ medium_total.append(len(scores))
+ medium_correct.append(num_correct)
+ elif item["difficulty"] == "hard":
+ hard_total.append(len(scores))
+ hard_correct.append(num_correct)
+
+ tasks = [asyncio.create_task(eval(i)) for i in range(len(test_data))]
+ await asyncio.gather(*tasks)
+
+ def estimator(n: int, c: int, k: int) -> float:
+ """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+ if n - c < k:
+ return 1.0
+ return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+ all_pass_1 = sum(
+ [estimator(n, c, 1) for n, c in zip(all_total, all_correct)]
+ ) / len(all_total)
+ easy_pass_1 = sum(
+ [estimator(n, c, 1) for n, c in zip(easy_total, easy_correct)]
+ ) / (len(easy_total) + 1e-6)
+ medium_pass_1 = sum(
+ [estimator(n, c, 1) for n, c in zip(medium_total, medium_correct)]
+ ) / (len(medium_total) + 1e-6)
+ hard_pass_1 = sum(
+ [estimator(n, c, 1) for n, c in zip(hard_total, hard_correct)]
+ ) / (len(hard_total) + 1e-6)
+
+ self.eval_metrics.append(
+ ("eval/overlong_ratio", 1.0 * sum(overlong_ratio) / len(overlong_ratio))
+ )
+ self.eval_metrics.append(
+ ("eval/pass@group_size", 1.0 * sum(pass_gs) / len(pass_gs))
+ )
+
+ avg_comp_len = sum(temp_completion_lengths) / len(temp_completion_lengths)
+
+ self.eval_metrics.append(("eval/pass_1", all_pass_1))
+ self.eval_metrics.append(("eval/easy_pass_1", easy_pass_1))
+ self.eval_metrics.append(("eval/medium_pass_1", medium_pass_1))
+ self.eval_metrics.append(("eval/hard_pass_1", hard_pass_1))
+ self.eval_metrics.append(("eval/completion_length", avg_comp_len))
+
+ self.eval_metrics.append(
+ (
+ "eval/correct_completion_length",
+ sum(correct_completion_lengths) / len(correct_completion_lengths),
+ )
+ )
+ self.eval_metrics.append(
+ (
+ "eval/incorrect_completion_length",
+ sum(incorrect_completion_lengths) / len(incorrect_completion_lengths),
+ )
+ )
+ print("STATS", self.eval_metrics)
+
return
+ async def offline_filter(self):
+ rprint("OFFLINE FILTERING")
+ train_data = self.train
+
+ sema = asyncio.Semaphore(256)
+
+ async def filter_temp(curr_step):
+ async with sema:
+ item = train_data[curr_step]
+ item["split"] = "train"
+ item["idx"] = curr_step
+ rprint("OFFLINE FILTERING INDEX", curr_step)
+ scored_data, _ = await self.collect_trajectories(item)
+
+ scores = scored_data["scores"]
+ num_correct = sum(1 for x in scores if math.isclose(x, 1.0))
+
+ async with self.lock2:
+ if num_correct == self.config.group_size:
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ file_path = os.path.join(script_dir, "perfect_indices.txt")
+ with open(file_path, "a") as f:
+ f.write(json.dumps(curr_step) + "\n")
+
+ tasks = [asyncio.create_task(filter_temp(i)) for i in range(len(train_data))]
+ await asyncio.gather(*tasks)
+ print("DONE")
+
+ async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+ if wandb_metrics is None:
+ wandb_metrics = {}
+
+ for item in self.eval_metrics:
+ wandb_metrics[item[0]] = item[1]
+
+ async with self.lock:
+ if self.wandb_step == 1 and self.curr_step > 5:
+ self.wandb_step = self.curr_step
+ cnt = sum(
+ (i != 0 and i != self.config.group_size)
+ for i in self.temp_metrics["num_correct"]
+ )
+ print(
+ "TEMP METRICS INDICES: ",
+ self.temp_metrics["indices"],
+ "\n",
+ self.temp_metrics["num_correct"],
+ cnt,
+ len(self.temp_metrics["num_correct"]),
+ )
+ for k, v in wandb_metrics.items():
+ print(k, v)
+
+ old_idx = 0
+ idx = 0
+ good_updates = 0
+
+ old_completion_idx = 0
+ new_completion_idx = 0
+
+ # assert len(self.temp_metrics["num_correct"]) == len(self.completion_lengths),
+ # f"TEMP METRICS {len(self.temp_metrics['num_correct'])} and
+ # COMPLETION LENGTHS {len(self.completion_lengths)} MUST BE SAME LENGTH"
+ print(
+ "TEMP METRICS LENGTHS",
+ len(self.temp_metrics["num_correct"]),
+ "COMPLETION LENGTHS",
+ len(self.completion_lengths),
+ )
+ print("BATCH SZ", self.config.batch_size)
+
+ while idx < len(self.temp_metrics["num_correct"]):
+ if (
+ self.temp_metrics["num_correct"][idx] != 0
+ and self.temp_metrics["num_correct"][idx] != self.config.group_size
+ ):
+ good_updates += 1
+ idx += 1
+ if good_updates == self.config.batch_size // self.config.group_size:
+ wandb_metrics["train_rewards/rewards"] = sum(
+ self.train_metrics["rewards"][old_idx:idx]
+ ) / len(self.train_metrics["rewards"][old_idx:idx])
+ wandb_metrics["train_rewards/overlong_ratio"] = sum(
+ self.train_metrics["overlong_ratio"][old_idx:idx]
+ ) / len(self.train_metrics["overlong_ratio"][old_idx:idx])
+ wandb_metrics["train_rewards/pass@group_size"] = sum(
+ self.train_metrics["pass_gs"][old_idx:idx]
+ ) / len(self.train_metrics["pass_gs"][old_idx:idx])
+ wandb_metrics["train_rewards/completion_lengths"] = sum(
+ self.train_metrics["completion_lengths"][old_idx:idx]
+ ) / len(self.train_metrics["completion_lengths"][old_idx:idx])
+ wandb_metrics["train_rewards/correct_completion_lengths"] = sum(
+ self.train_metrics["correct_completion_lengths"][old_idx:idx]
+ ) / sum(self.temp_metrics["num_correct"][old_idx:idx])
+ wandb_metrics["train_rewards/incorrect_completion_lengths"] = sum(
+ self.train_metrics["incorrect_completion_lengths"][old_idx:idx]
+ ) / (
+ self.config.group_size * (idx - old_idx)
+ - sum(self.temp_metrics["num_correct"][old_idx:idx])
+ )
+
+ new_completion_idx += self.config.batch_size
+
+ assert old_completion_idx <= len(self.completion_lengths), (
+ f"OLD COMPLETION IDX {old_completion_idx} and "
+ f"COMPLETION LENGTHS {len(self.completion_lengths)} MUST BE SMALLER"
+ )
+
+ wandb_metrics["train/completion_lengths"] = sum(
+ self.completion_lengths[old_completion_idx:new_completion_idx]
+ ) / len(
+ self.completion_lengths[old_completion_idx:new_completion_idx]
+ )
+ wandb_metrics["train/completion_lengths_std"] = np.std(
+ self.completion_lengths[old_completion_idx:new_completion_idx]
+ )
+ wandb_metrics["train/completion_lengths_max"] = np.max(
+ self.completion_lengths[old_completion_idx:new_completion_idx]
+ )
+ wandb_metrics["train/completion_lengths_min"] = np.min(
+ self.completion_lengths[old_completion_idx:new_completion_idx]
+ )
+ wandb_metrics["train/completion_lengths_p95"] = (
+ np.array(
+ self.completion_lengths[
+ old_completion_idx:new_completion_idx
+ ]
+ )
+ > (0.95 * self.max_token_len)
+ ).mean()
+
+ if self.wandb_prepend is not None:
+ wandb_metrics = {
+ f"{self.wandb_prepend}_{k}": v
+ for k, v in wandb_metrics.items()
+ }
+ print("WANDB LOG")
+ wandb.log(wandb_metrics, step=self.wandb_step, commit=True)
+ wandb_metrics = {}
+
+ good_updates = 0
+ self.wandb_step += 1
+ old_idx = idx
+ old_completion_idx = new_completion_idx
+
+ self.completion_lengths = self.completion_lengths[old_completion_idx:]
+ self.temp_metrics["indices"] = self.temp_metrics["indices"][old_idx:]
+ self.temp_metrics["num_correct"] = self.temp_metrics["num_correct"][
+ old_idx:
+ ]
+ self.train_metrics["rewards"] = self.train_metrics["rewards"][old_idx:]
+ self.train_metrics["overlong_ratio"] = self.train_metrics["overlong_ratio"][
+ old_idx:
+ ]
+ self.train_metrics["pass_gs"] = self.train_metrics["pass_gs"][old_idx:]
+ self.train_metrics["completion_lengths"] = self.train_metrics[
+ "completion_lengths"
+ ][old_idx:]
+ self.train_metrics["correct_completion_lengths"] = self.train_metrics[
+ "correct_completion_lengths"
+ ][old_idx:]
+ self.train_metrics["incorrect_completion_lengths"] = self.train_metrics[
+ "incorrect_completion_lengths"
+ ][old_idx:]
+
+ print("WANDB STEPS vs STATUS STEP", self.wandb_step, self.curr_step)
+
+ self.eval_metrics = list()
+
+ for i, server in enumerate(self.server.servers):
+ server_wandb_metrics = await server.wandb_metrics({}, f"server_{i}")
+
+ wandb_metrics = await self.create_rollout_table(wandb_metrics)
+ wandb_metrics = self.perf_stats(wandb_metrics)
+ self.rollouts_for_wandb = []
+
+ if self.config.use_wandb:
+ if self.wandb_prepend is not None:
+ wandb_metrics = {
+ f"{self.wandb_prepend}_{k}": v for k, v in wandb_metrics.items()
+ }
+ # add server metrics to wandb without prepend to collate them all
+ wandb_metrics.update(server_wandb_metrics)
+ wandb.log(wandb_metrics, step=self.curr_step, commit=True)
+
async def setup(self):
"""Setup the environment"""
- self.container = init_docker()
- self.train = load_dataset("deepmind/code_contests", split="train")
- self.iter = 0
+ if self.config.dataset_name == "deepmind":
+ self.train = load_dataset("deepmind/code_contests", split="train") # CHANGE
+ else:
+ self.train = load_dataset(
+ "NousResearch/RLVR_Coding_Problems", split="train"
+ )
+ test = load_dataset("NousResearch/lcb_test", split="test")
+ self.test = []
+ for problem in test:
+ self.test.append(problem)
+ self.test[-1]["idx"] = len(self.test) - 1
+ self.test[-1]["split"] = "test"
+
+ self.iter = 0 # CHANGE
+ self.lock = asyncio.Lock()
+ self.lock2 = asyncio.Lock()
+
+ self.wandb_step = 1
+
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ file_path = os.path.join(script_dir, "perfect_indices.txt")
+ with open(file_path, "r") as f:
+ for line in f:
+ a = int(line.strip())
+ self.blacklist.add(a)
+
+ self.good_indices = []
+ if self.config.dataset_name != "deepmind":
+ arr_short = []
+ for i in range(len(self.train)):
+ if i not in self.blacklist:
+ arr_short.append(i)
+ print("ARR SHORT LENGTH", len(arr_short))
+ temp_arr = arr_short * 100
+ self.deq = deque(temp_arr[self.config.start_idx :])
+ else:
+ self.good_indices = [i for i in range(10000)]
+ for i in range(10000):
+ self.deq.append(i)
+
+ rprint("NUM FILTERED EXAMPLES:", len(self.deq))
+ rprint(self.config.batch_size)
+
+ """
+ rprint("BEFORE SETUP, do blacklisting")
+ await self.offline_filter()
+ rprint("FINISH blacklisting")
+ """
+
+ """
+ st = time.time()
+ await self.evaluate() ### CHANGE
+ ed = time.time()
+ rprint("ELAPSED TIME FOR EVALUATION: ", ed-st)
+ """
async def get_next_item(self) -> Item:
"""
Get the next items to be rolled out
"""
- next_item = self.train[self.iter % len(self.train)]
- self.iter += 1
- prompt = tuple(
- [frozenset({"role": "user", "content": next_item["description"]}.items())]
- )
- answer = (
- tuple(next_item["private_tests"]["input"]),
- tuple(next_item["private_tests"]["output"]),
- tuple(next_item["generated_tests"]["input"]),
- tuple(next_item["generated_tests"]["output"]),
- )
- return (prompt, answer)
+ async with self.lock:
+ cur_idx = self.deq.popleft()
+ while cur_idx in self.blacklist:
+ print("IN BLACKLIST, SKIPPING", cur_idx)
+ cur_idx = self.deq.popleft()
+ next_item = self.train[cur_idx]
+ next_item["idx"] = cur_idx
+ next_item["split"] = "train"
+ if self.config.dataset_name == "deepmind":
+ next_item["problem"] = next_item["description"]
+ next_item["tests"] = {
+ "input": next_item["private_tests"]["input"]
+ + next_item["generated_tests"]["input"],
+ "output": next_item["private_tests"]["output"]
+ + next_item["generated_tests"]["output"],
+ }
+ next_item["problem_type"] = "stdin_stdout"
+ return next_item
def extract_python_code_blocks(self, text):
# Regex specifically looks for ```python\n...code...\n```
@@ -157,44 +751,87 @@ class CodingEnv(BaseEnv):
python_blocks = [r for r in result]
return python_blocks
- async def score(self, rollout_group_data) -> Optional[ScoredDataGroup]:
- # print("Rollout group data", rollout_group_data)
+ async def score(self, rollout_group_data):
+
+ assert len(rollout_group_data) == 1
+ cur_id = rollout_group_data[0][2]
+
scores = ScoredDataGroup()
scores["tokens"] = list()
scores["masks"] = list()
scores["scores"] = list()
- random.shuffle(rollout_group_data)
+ scores["overrides"] = list()
+
+ codes = []
+ lengths = []
+ errors = []
+
for item in rollout_group_data:
- out_dict = tokenize_for_trainer(self.tokenizer, item[0])
+ scores["overrides"].append(dict())
+ scores["overrides"][-1]["set_advantage_to_zero"] = False
+ if item[3] == "length":
+ scores["overrides"][-1]["set_advantage_to_zero"] = True
+ else:
+ if item[3] != "stop":
+ rprint("FINISH REASON", item[3])
+ # assert item[3] == "stop"
+
+ out_dict = tokenize_for_trainer(self.tokenizer, item[0], item[3])
tokens = out_dict["tokens"]
masks = out_dict["masks"]
- """
- CALCULATE REWARD NOW
- """
- code = self.extract_python_code_blocks(item[0][-1]["content"])[0]
- test_cases = list(item[1][0]) + list(item[1][2])
- x = await get_results(code, test_cases)
- output_cases = list(item[1][1]) + list(item[1][3])
- assert len(x) == len(output_cases)
- reward = True
- for k in range(len(x)):
- if x[k] != output_cases[k]:
- reward = False
- break
- # remove obviously bad examples
- if len([1 for i in masks if i != -100]) < 10:
- continue
scores["tokens"].append(tokens)
scores["masks"].append(masks)
- scores["scores"].append(1.0 if reward else -1.0)
- if len(scores["tokens"]) >= self.config.group_size:
- break
- # check if all the same
- # print(scores['scores'])
- # if all([scores["scores"][0] == score for score in scores["scores"]]):
- # return None # If all the same, we return None
- return scores
+ lengths.append(len(tokens))
+
+ code = self.extract_python_code_blocks(item[0][-1]["content"])
+ if len(code) > 0:
+ code = code[-1]
+ else:
+ code = None
+ test_cases = {"tests": item[1]}
+
+ codes.append(code)
+
+ if code is None:
+ scores["scores"].append(-1.0)
+ errors.append({"error_message": "No code"})
+ continue
+ else:
+ try:
+ async with async_semaphore:
+ res, metadata = await run_test.remote.aio(test_cases, code)
+ except modal.exception.RemoteError:
+ res = [False]
+ rprint("index", cur_id)
+ rprint(test_cases["tests"]["fn_name"])
+ metadata = {"error": "segmentation fault"}
+ rprint("FAULT")
+ rprint("code:\n", code)
+ rprint(metadata)
+ except Exception as f:
+ rprint("index", cur_id)
+ rprint(test_cases["tests"]["fn_name"])
+ rprint("except:", code)
+ res = [False]
+ metadata = {"error": "segmentation fault"}
+ rprint("NON SEGFAULT")
+ rprint("FAULT", f)
+ for x in res:
+ if not isinstance(x, (int, bool)):
+ rprint("WARNING")
+ rprint(res)
+ if set(res) == {True}:
+ scores["scores"].append(1.0)
+ else:
+ scores["scores"].append(-1.0)
+ rprint("index", cur_id)
+ rprint(metadata)
+
+ errors.append(metadata)
+
+ return scores, (codes[0], errors[0], lengths[0])
if __name__ == "__main__":
+ print(system_prompt)
CodingEnv.cli()
diff --git a/environments/code_execution_server/lcb_modal_endpoint.py b/environments/code_execution_server/lcb_modal_endpoint.py
new file mode 100644
index 00000000..a8809724
--- /dev/null
+++ b/environments/code_execution_server/lcb_modal_endpoint.py
@@ -0,0 +1,664 @@
+import ast
+import faulthandler
+import json
+import platform
+
+# to run the solution files we're using a timing based approach
+import signal
+import sys
+import time
+
+# used for debugging to time steps
+from datetime import datetime
+from decimal import Decimal
+from enum import Enum
+from io import StringIO
+
+# from pyext import RuntimeModule
+from types import ModuleType
+
+# used for testing the code that reads from input
+from unittest.mock import mock_open, patch
+
+import modal
+
+import_string = (
+ "from string import *\nfrom re import *\nfrom datetime import *\n"
+ "from collections import *\nfrom heapq import *\nfrom bisect import *\n"
+ "from copy import *\nfrom math import *\nfrom random import *\n"
+ "from statistics import *\nfrom itertools import *\nfrom functools import *\n"
+ "from operator import *\nfrom io import *\nfrom sys import *\n"
+ "from json import *\nfrom builtins import *\nfrom typing import *\n"
+ "import string\nimport re\nimport datetime\nimport collections\n"
+ "import heapq\nimport bisect\nimport copy\nimport math\nimport random\n"
+ "import statistics\nimport itertools\nimport functools\nimport operator\n"
+ "import io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n"
+)
+
+image = modal.Image.debian_slim(python_version="3.10").pip_install("numpy")
+app = modal.App("joeli-lcb", image=image)
+
+
+def truncatefn(s, length=300):
+ if isinstance(s, str):
+ pass
+ else:
+ s = str(s)
+ if len(s) <= length:
+ return s
+
+ return s[: length // 2] + "...(truncated) ..." + s[-length // 2 :]
+
+
+class CODE_TYPE(Enum):
+ call_based = 0
+ standard_input = 1
+
+
+# stuff for setting up signal timer
+class TimeoutException(Exception):
+ pass
+
+
+def timeout_handler(signum, frame):
+ print("timeout occured: alarm went off")
+ raise TimeoutException
+
+
+# used to capture stdout as a list
+# from https://stackoverflow.com/a/16571630/6416660
+# alternative use redirect_stdout() from contextlib
+class Capturing(list):
+ def __enter__(self):
+ self._stdout = sys.stdout
+ sys.stdout = self._stringio = StringIO()
+ # Make closing the StringIO a no-op
+ self._stringio.close = lambda x: 1
+ return self
+
+ def __exit__(self, *args):
+ self.append(self._stringio.getvalue())
+ del self._stringio # free up some memory
+ sys.stdout = self._stdout
+
+
+# Custom mock for sys.stdin that supports buffer attribute
+class MockStdinWithBuffer:
+ def __init__(self, inputs: str):
+ self.inputs = inputs
+ self._stringio = StringIO(inputs)
+ self.buffer = MockBuffer(inputs)
+
+ def read(self, *args):
+ return self.inputs
+
+ def __iter__(self): # <- required for "for line in sys.stdin"
+ return iter(self._stringio)
+
+ def readline(self, *args):
+ return self._stringio.readline(*args)
+
+ def readlines(self, *args):
+ return self.inputs.split("\n")
+
+ def __getattr__(self, name):
+ # Delegate other attributes to StringIO
+ return getattr(self._stringio, name)
+
+
+class MockBuffer:
+ def __init__(self, inputs: str):
+ self.inputs = inputs.encode("utf-8") # Convert to bytes
+
+ def read(self, *args):
+ # Return as byte strings that can be split
+ return self.inputs
+
+ def readline(self, *args):
+ return self.inputs.split(b"\n")[0] + b"\n"
+
+
+def clean_if_name(code: str) -> str:
+ try:
+ astree = ast.parse(code)
+ last_block = astree.body[-1]
+ if isinstance(last_block, ast.If):
+ condition = last_block.test
+ if (
+ ast.unparse(condition).strip() == "__name__ == '__main__'"
+ or ast.unparse(condition).strip() == '__name__ == "__main__"'
+ ):
+ code = (
+ ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body) # type: ignore
+ )
+ except Exception:
+ pass
+
+ return code
+
+
+def make_function(code: str) -> str:
+ try:
+ import_stmts = []
+ all_other_stmts = []
+ astree = ast.parse(code)
+ for stmt in astree.body:
+ if isinstance(stmt, (ast.Import, ast.ImportFrom)):
+ import_stmts.append(stmt)
+ else:
+ all_other_stmts.append(stmt)
+
+ function_ast = ast.FunctionDef(
+ name="wrapped_function",
+ args=ast.arguments(
+ posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]
+ ),
+ body=all_other_stmts,
+ decorator_list=[],
+ lineno=-1,
+ )
+ main_code = (
+ import_string
+ + "\n"
+ + ast.unparse(import_stmts) # type: ignore
+ + "\n"
+ + ast.unparse(function_ast) # type: ignore
+ )
+ return main_code
+ except Exception:
+ return code
+
+
+def call_method(method, inputs):
+
+ if isinstance(inputs, list):
+ inputs = "\n".join(inputs)
+
+ inputs_line_iterator = iter(inputs.split("\n"))
+
+ # Create custom stdin mock with buffer support
+ mock_stdin = MockStdinWithBuffer(inputs)
+
+ # CHANGE
+
+ # sys.setrecursionlimit(10000)
+
+ # @patch('builtins.input', side_effect=inputs.split("\n"))
+ @patch("builtins.open", mock_open(read_data=inputs))
+ @patch("sys.stdin", mock_stdin) # Use our custom mock instead of StringIO
+ @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
+ @patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
+ @patch("sys.stdin.read", lambda *args: inputs)
+ # @patch('sys.stdout.write', print)
+ def _inner_call_method(_method):
+ if "stdin" in _method.__globals__:
+ _method.__globals__["stdin"] = mock_stdin
+ if "stdout" in _method.__globals__:
+ _method.__globals__["stdout"] = sys.stdout
+ try:
+ return _method()
+ except SystemExit:
+ pass
+ finally:
+ pass
+
+ return _inner_call_method(method)
+
+
+def get_function(compiled_sol, fn_name: str): # type: ignore
+ try:
+ assert hasattr(compiled_sol, fn_name)
+ return getattr(compiled_sol, fn_name)
+ except Exception:
+ return
+
+
+def compile_code(code: str, timeout: int):
+ signal.alarm(timeout)
+ try:
+ tmp_sol = ModuleType("tmp_sol", "")
+
+ exec(code, tmp_sol.__dict__)
+
+ if "class Solution" in code:
+ # leetcode wraps solutions in `Solution`
+ # this is a hack to check if it is leetcode solution or not
+ # currently livecodebench only supports LeetCode but
+ # else condition allows future extensibility to other platforms
+ compiled_sol = tmp_sol.Solution()
+ else:
+ # do nothing in the other case since the function is accessible
+ compiled_sol = tmp_sol
+
+ assert compiled_sol is not None
+ finally:
+ signal.alarm(0)
+
+ return compiled_sol
+
+
+def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
+ try:
+ decimal_line = [Decimal(elem) for elem in line.split()]
+ except Exception:
+ return False, []
+ return True, decimal_line
+
+
+def get_stripped_lines(val: str):
+ # strip the whole string first so leading/trailing newlines don't add empty lines after splitting
+ val = val.strip()
+
+ return [val_line.strip() for val_line in val.split("\n")]
+
+
+def grade_call_based(
+ code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int
+):
+ # call-based clean up logic
+ # TODO: wrap this in try/except logic to surface the specific errors; for now this is fine.
+ code = import_string + "\n\n" + code
+ compiled_sol = compile_code(code, timeout)
+ print("code:", code)
+
+ if compiled_sol is None:
+ return
+
+ method = get_function(compiled_sol, fn_name)
+
+ if method is None:
+ return
+
+ all_inputs = [
+ (
+ [json.loads(line) for line in inputs.split("\n")]
+ if isinstance(inputs, str)
+ else inputs
+ )
+ for inputs in all_inputs
+ ]
+ all_outputs = [
+ json.loads(output) if isinstance(output, str) else output[0]
+ for output in all_outputs
+ ]
+
+ # all_inputs = [[json.loads(line) for line in inputs.split("\n")] for inputs in all_inputs]
+ # all_outputs = [json.loads(output) for output in all_outputs]
+
+ total_execution = 0
+ all_results = []
+ for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
+ signal.alarm(timeout)
+ faulthandler.enable()
+ print("INPUT", gt_inp)
+ print("Expected Out", gt_out)
+ try:
+ # can lock here so time is useful
+ start = time.time()
+ prediction = method(*gt_inp)
+ total_execution += time.time() - start
+ signal.alarm(0)
+
+ # don't penalize model if it produces tuples instead of lists
+ # ground truth sequences are not tuples
+ if isinstance(prediction, tuple):
+ prediction = list(prediction)
+
+ print("predicted out", prediction)
+
+ tmp_result = prediction == gt_out
+
+ # handle floating point comparisons
+
+ all_results.append(tmp_result)
+
+ if not tmp_result:
+ return all_results, {
+ "output": truncatefn(prediction),
+ "inputs": truncatefn(gt_inp),
+ "expected": truncatefn(gt_out),
+ "error_code": -2,
+ "error_message": "Wrong Answer",
+ }
+ except Exception as e:
+ signal.alarm(0)
+ if "timeoutexception" in repr(e).lower():
+ all_results.append(-3)
+ return all_results, {
+ "error": repr(e),
+ "error_code": -3,
+ "error_message": "Time Limit Exceeded",
+ "inputs": truncatefn(gt_inp),
+ "expected": truncatefn(gt_out),
+ }
+ else:
+ all_results.append(-4)
+ return all_results, {
+ "error": repr(e),
+ "error_code": -4,
+ "error_message": "Runtime Error",
+ "inputs": truncatefn(gt_inp),
+ "expected": truncatefn(gt_out),
+ }
+
+ finally:
+ signal.alarm(0)
+ faulthandler.disable()
+
+ return all_results, {"execution time": total_execution}
+
+
+def decimals_are_close(a: Decimal, b: Decimal, tol: Decimal = Decimal("1e-4")) -> bool:
+ return abs(a - b) <= tol
+
+
+def grade_stdio(
+ code: str,
+ all_inputs: list,
+ all_outputs: list,
+ timeout: int,
+):
+ print("ORIGINAL CODE\n", code)
+ # runtime doesn't interact well with __name__ == '__main__'
+ code = clean_if_name(code)
+
+ # we wrap the given code inside another function
+ code = make_function(code)
+
+ compiled_sol = compile_code(code, timeout)
+ if compiled_sol is None:
+ return
+
+ method = get_function(compiled_sol, "wrapped_function")
+
+ if method is None:
+ return
+
+ all_results = []
+ total_execution_time = 0
+ print("LCB CODE\n", code)
+ for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
+ print("INPUT\n", gt_inp)
+ print("EXPECTED OUTPUT\n", gt_out)
+ signal.alarm(timeout)
+ faulthandler.enable()
+
+ signal.alarm(timeout)
+ with Capturing() as captured_output:
+ try:
+ start = time.time()
+ call_method(method, gt_inp)
+ total_execution_time += time.time() - start
+ # reset the alarm
+ signal.alarm(0)
+ except Exception as e:
+ signal.alarm(0)
+ if "timeoutexception" in repr(e).lower():
+ all_results.append(-3)
+ return all_results, {
+ "error": repr(e),
+ "error_code": -3,
+ "error_message": "Time Limit Exceeded",
+ "inputs": truncatefn(gt_inp),
+ "expected": truncatefn(gt_out),
+ }
+ else:
+ all_results.append(-4)
+ return all_results, {
+ "error": repr(e),
+ "error_code": -4,
+ "error_message": "Runtime Error",
+ "inputs": truncatefn(gt_inp),
+ "expected": truncatefn(gt_out),
+ }
+
+ finally:
+ signal.alarm(0)
+ faulthandler.disable()
+
+ prediction = captured_output[0]
+ print("OUTPUT\n", prediction)
+
+ stripped_prediction_lines = get_stripped_lines(prediction)
+ if isinstance(gt_out, str):
+ stripped_gt_out_lines = get_stripped_lines(gt_out)
+ else:
+ stripped_gt_out_lines = gt_out
+
+ # WA happens in multiple circumstances
+ # so cache the return to make it clean!
+ WA_send_args = {
+ "output": truncatefn(prediction),
+ "inputs": truncatefn(gt_inp),
+ "expected": truncatefn(gt_out),
+ "error_code": -2,
+ }
+
+ if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
+ all_results.append(-2)
+ WA_send_args["error_message"] = "Wrong answer: mismatched output length"
+ return all_results, WA_send_args
+
+ for output_line_idx, (
+ stripped_prediction_line,
+ stripped_gt_out_line,
+ ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
+ WA_send_args["error_message"] = (
+ f"Wrong answer at {output_line_idx=}: "
+ f"{truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}"
+ )
+
+ # CASE 1: exact match
+ if stripped_prediction_line == stripped_gt_out_line:
+ continue
+
+ # CASE 2: element-wise comparison
+ # if there are floating-point elements
+ # use the `decimal` library for accurate floating point comparison
+ # otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
+ # note that we should always be able to convert to decimals
+
+ success, decimal_prediction_line = convert_line_to_decimals(
+ stripped_prediction_line
+ )
+ if not success:
+ all_results.append(-2)
+ return all_results, WA_send_args
+ success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
+ if not success:
+ all_results.append(-2)
+ return all_results, WA_send_args
+
+ if decimal_prediction_line == decimal_gtout_line:
+ continue
+
+ if len(decimal_prediction_line) == len(decimal_gtout_line) and all(
+ decimals_are_close(a, b)
+ for a, b in zip(decimal_prediction_line, decimal_gtout_line)
+ ):
+ continue
+
+ all_results.append(-2)
+ return all_results, WA_send_args
+ all_results.append(True)
+
+ return all_results, {"execution time": total_execution_time}
+
+
+@app.function(
+ max_containers=100,
+ cpu=1.0,
+ restrict_modal_access=False,
+ max_inputs=1,
+ timeout=600,
+ block_network=False,
+ retries=3,
+)
+def run_test(sample, test=None, debug=False, timeout=15):
+ """
+ If test (the generated code) is not None, the code is run against the sample's tests;
+ otherwise only the input and output pairs are returned.
+ """
+ print("VALID, went into function")
+ # test = json.loads(test)
+ signal.signal(signal.SIGALRM, timeout_handler)
+
+ # Disable functionalities that can make destructive changes to the test.
+ # max memory is set to 4GB
+ reliability_guard(5 * 1024 * 1024 * 1024)
+
+ if debug:
+ print(f"start = {datetime.now().time()}")
+
+ try:
+ # in_outs = json.loads(sample["input_output"])
+ in_outs = sample["tests"]
+ except ValueError as e:
+ raise e
+
+ if in_outs:
+ if in_outs.get("fn_name") == "none":
+ which_type = CODE_TYPE.standard_input # Standard input
+ method_name = None
+
+ else:
+ which_type = CODE_TYPE.call_based # Call-based
+ method_name = in_outs["fn_name"]
+ print("type, method name: ", which_type, method_name)
+
+ if debug:
+ print(f"loaded input_output = {datetime.now().time()}")
+
+ if test is None:
+ assert False, "should not happen: test code is none"
+ return in_outs, {"error": "no test code provided"}
+ elif test is not None:
+ results = []
+ if debug:
+ print(f"loading test code = {datetime.now().time()}")
+
+ if which_type == CODE_TYPE.call_based:
+ signal.alarm(timeout)
+ try:
+ results, metadata = grade_call_based(
+ code=test,
+ all_inputs=in_outs["input"],
+ all_outputs=in_outs["output"],
+ fn_name=method_name,
+ timeout=timeout,
+ )
+ return results, metadata
+ except Exception as e:
+ return [-4], {
+ "error_code": -4,
+ "error_message": f"Error during testing: {e}",
+ }
+ finally:
+ signal.alarm(0)
+ elif which_type == CODE_TYPE.standard_input:
+ # sol
+ # if code has if __name__ == "__main__": then remove it
+
+ signal.alarm(timeout)
+ try:
+ results, metadata = grade_stdio(
+ code=test,
+ all_inputs=in_outs["input"],
+ all_outputs=in_outs["output"],
+ timeout=timeout,
+ )
+ return results, metadata
+ except Exception as e:
+ return [-4], {
+ "error_code": -4,
+ "error_message": f"Error during testing: {e}",
+ }
+ finally:
+ signal.alarm(0)
+
+
+def reliability_guard(maximum_memory_bytes=None):
+ """
+ This disables various destructive functions and prevents the generated code
+ from interfering with the test (e.g. fork bomb, killing other processes,
+ removing filesystem files, etc.)
+ WARNING
+ This function is NOT a security sandbox. Untrusted code, including model-
+ generated code, should not be blindly executed outside of one. See the
+ Codex paper for more information about OpenAI's code sandbox, and proceed
+ with caution.
+ """
+
+ if maximum_memory_bytes is not None:
+ import resource
+
+ resource.setrlimit(
+ resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
+ )
+ resource.setrlimit(
+ resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
+ )
+ if not platform.uname().system == "Darwin":
+ resource.setrlimit(
+ resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
+ )
+
+ faulthandler.disable()
+
+ import builtins
+
+ # builtins.exit = None
+ builtins.quit = None
+
+ import os
+
+ os.environ["OMP_NUM_THREADS"] = "1"
+
+ os.kill = None
+ os.system = None
+ os.putenv = None
+ os.remove = None
+ os.removedirs = None
+ os.rmdir = None
+ os.fchdir = None
+ os.setuid = None
+ os.fork = None
+ os.forkpty = None
+ os.killpg = None
+ os.rename = None
+ os.renames = None
+ os.truncate = None
+ os.replace = None
+ os.unlink = None
+ os.fchmod = None
+ os.fchown = None
+ os.chmod = None
+ os.chown = None
+ os.chroot = None
+ os.fchdir = None
+ os.lchflags = None
+ os.lchmod = None
+ os.lchown = None
+ os.getcwd = None
+ os.chdir = None
+
+ import shutil
+
+ shutil.rmtree = None
+ shutil.move = None
+ shutil.chown = None
+
+ import subprocess
+
+ subprocess.Popen = None # type: ignore
+
+ __builtins__["help"] = None
+
+ import sys
+
+ sys.modules["ipdb"] = None
+ sys.modules["joblib"] = None
+ sys.modules["resource"] = None
+ sys.modules["psutil"] = None
+ sys.modules["tkinter"] = None
diff --git a/environments/community/sql_query_env/README.md b/environments/community/sql_query_env/README.md
new file mode 100644
index 00000000..5de843b5
--- /dev/null
+++ b/environments/community/sql_query_env/README.md
@@ -0,0 +1,115 @@
+# SQL Query Generation Environment
+
+Train LLMs to generate correct SQL queries from natural language questions.
+
+## Overview
+
+This environment uses the [Salesforce/WikiSQL](https://huggingface.co/datasets/Salesforce/wikisql) dataset to train language models on text-to-SQL tasks. Queries are verified by **executing** the generated SQL against in-memory SQLite databases and comparing results to ground truth.
+
+## Dataset
+
+- **Source**: [Salesforce/WikiSQL](https://huggingface.co/datasets/Salesforce/wikisql)
+- **Size**: 80,654 examples (train + validation + test)
+- **Format**: Natural language questions with table schemas and ground truth SQL
+
+## Usage
+
+### Training Mode (with API Server)
+
+```bash
+# Terminal 1: Start the Atropos API
+run-api
+
+# Terminal 2: Run the environment
+python sql_query_env.py serve --slurm False
+```
+
+### Local Testing (without API)
+
+```bash
+python sql_query_env.py process --env.data_path_to_save_groups sql_output.jsonl
+```
+
+This generates `sql_output.jsonl` and `sql_output.html` for inspection.
+
+### With Local vLLM Server
+
+```bash
+python sql_query_env.py process \
+ --env.data_path_to_save_groups sql_output.jsonl \
+ --openai.base_url http://localhost:9001/v1 \
+ --openai.model_name YOUR_MODEL_NAME
+```
+
+## Reward Function
+
+| Score | Condition |
+|-------|-----------|
+| **1.0** | Generated SQL executes and returns same result as gold SQL |
+| **-1.0** | SQL fails to execute or returns incorrect result |
+
+When all responses in a group are correct, a length penalty is applied to encourage concise solutions.
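+
+A rough sketch of the scoring flow, using the helpers defined in `sql_executor.py` (the table and queries below are made-up examples, not dataset items):
+
+```python
+from sql_executor import create_table_from_wikisql, execute_sql, results_match
+
+header = ["name", "age"]
+rows = [["Alice", "30"], ["Bob", "25"]]
+
+conn = create_table_from_wikisql(header, rows)  # in-memory SQLite table named "data"
+gold = execute_sql(conn, "SELECT name FROM data WHERE age = '30'")
+gen = execute_sql(conn, "select NAME from data where AGE='30'")
+conn.close()
+
+# 1.0 when both queries execute and their normalized results match, otherwise -1.0
+score = 1.0 if gen is not None and results_match(gen, gold) else -1.0
+```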
+
+## Prompt Format
+
+The model receives a table schema and question:
+
+```
+Table: data
+Columns: col1, col2, col3
+Sample data:
+ value1 | value2 | value3
+
+Question: What is the value of col1 where col2 equals X?
+```
+
+Output should be in boxed format:
+```
+<think>
+[Chain of thought reasoning]
+</think>
+
+\boxed{SELECT col1 FROM data WHERE col2 = 'X'}
+```
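+
+The SQL is recovered from the final answer with `extract_boxed_sql` from `sql_executor.py`; an illustrative call:
+
+```python
+from sql_executor import extract_boxed_sql
+
+reply = "<think>reasoning...</think>\n\n\\boxed{SELECT col1 FROM data WHERE col2 = 'X'}"
+extract_boxed_sql(reply)  # -> "SELECT col1 FROM data WHERE col2 = 'X'"
+```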
+
+## Unit Tests
+
+```bash
+# Run unit tests
+python -m pytest test_sql_executor.py -v
+```
+
+All 19 tests cover:
+- Table creation with special column names
+- SQL execution and error handling
+- `\boxed{}` extraction patterns
+- Result comparison and normalization
+- End-to-end scoring integration
+
+## LLM Integration Test
+
+The environment has been verified with Qwen3-8B on an NVIDIA H200:
+
+```bash
+# Run integration test with a local vLLM server
+python test_integration.py --base_url http://localhost:8000/v1 --model Qwen/Qwen3-8B
+```
+
+Test results:
+- **40% accuracy** on 10 random WikiSQL examples
+- SQL extraction from `\boxed{}` working correctly
+- Execution-based scoring producing correct reward signals
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `sql_query_env.py` | Main environment implementation |
+| `sql_executor.py` | SQLite execution and scoring utilities |
+| `wikisql_loader.py` | WikiSQL dataset loader (from GitHub) |
+| `test_sql_executor.py` | Unit tests (19 tests) |
+| `test_integration.py` | LLM integration test |
+
+## Author
+
+Community contribution to Atropos.
diff --git a/environments/community/sql_query_env/sql_executor.py b/environments/community/sql_query_env/sql_executor.py
new file mode 100644
index 00000000..27f4e017
--- /dev/null
+++ b/environments/community/sql_query_env/sql_executor.py
@@ -0,0 +1,166 @@
+"""
+SQL Executor Module
+
+Provides safe SQL execution against in-memory SQLite databases.
+Used by the SQL Query Environment for reward verification.
+"""
+
+import re
+import sqlite3
+from typing import Any, List, Optional, Tuple
+
+
+def create_table_from_wikisql(
+ header: List[str],
+ rows: List[List[Any]],
+ types: Optional[List[str]] = None,
+) -> sqlite3.Connection:
+ """
+ Create an in-memory SQLite table from WikiSQL format.
+
+ Args:
+ header: List of column names
+ rows: List of row data (each row is a list of values)
+ types: Optional list of column types from WikiSQL
+
+ Returns:
+ SQLite connection with the table created and populated
+ """
+ conn = sqlite3.connect(":memory:")
+ cursor = conn.cursor()
+
+ # Use original column names but quoted to handle spaces/special chars
+ # SQLite allows any string as identifier when quoted with double quotes
+ cols = ", ".join([f'"{h}" TEXT' for h in header])
+ cursor.execute(f"CREATE TABLE data ({cols})")
+
+ # Also create the "table" alias since some WikiSQL queries use it
+ # Create as a view pointing to data
+ try:
+ cursor.execute("CREATE VIEW 'table' AS SELECT * FROM data")
+ except sqlite3.OperationalError:
+ pass # View already exists or name conflict
+
+ # Insert rows
+ if rows:
+ placeholders = ", ".join(["?" for _ in header])
+ # Convert all values to strings for consistency
+ string_rows = [[str(v) if v is not None else "" for v in row] for row in rows]
+ cursor.executemany(f"INSERT INTO data VALUES ({placeholders})", string_rows)
+
+ conn.commit()
+ return conn
+
+
+def quote_identifiers_in_sql(sql: str, header: List[str]) -> str:
+ """
+ Quote column names in SQL that contain special characters.
+
+ WikiSQL's human_readable SQL often has unquoted columns like:
+ SELECT State/territory FROM table WHERE ...
+
+ This function adds quotes around column names that need them.
+ """
+ result = sql
+ # Sort by length (longest first) to avoid partial replacements
+ sorted_headers = sorted(header, key=len, reverse=True)
+
+ for col in sorted_headers:
+ # Skip simple identifiers that don't need quoting
+ if re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", col):
+ continue
+
+ # Escape quotes in column name for regex
+ col_escaped = re.escape(col)
+
+ # Match the column name when not already quoted
+ # Look for the column name not preceded by " and not followed by "
+ pattern = rf'(?<!"){col_escaped}(?!")'
+ result = re.sub(pattern, f'"{col}"', result)
+
+ return result
+
+
+def execute_sql(conn: sqlite3.Connection, sql: str) -> Optional[List[Tuple]]:
+ """
+ Execute SQL safely and return results.
+
+ Args:
+ conn: SQLite connection
+ sql: SQL query to execute
+
+ Returns:
+ List of result tuples, or None if execution failed
+ """
+ try:
+ cursor = conn.cursor()
+ cursor.execute(sql)
+ return cursor.fetchall()
+ except sqlite3.Error:
+ return None
+ except Exception:
+ return None
+
+
+def extract_boxed_sql(text: str) -> Optional[str]:
+ """
+ Extract SQL from \\boxed{} format in LLM response.
+
+ Args:
+ text: LLM response text
+
+ Returns:
+ Extracted SQL string, or None if not found
+ """
+ # Try to find \boxed{...} pattern
+ # Handle both \\boxed{} and \boxed{} formats
+ patterns = [
+ r"\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}", # Handles nested braces
+ r"\\boxed\{(.+?)\}", # Simple pattern
+ ]
+
+ for pattern in patterns:
+ match = re.search(pattern, text, re.DOTALL)
+ if match:
+ sql = match.group(1).strip()
+ if sql:
+ return sql
+
+ return None
+
+
+def normalize_result(result: List[Tuple]) -> List[Tuple]:
+ """
+ Normalize query results for comparison.
+
+ Converts all values to lowercase strings and sorts.
+ """
+ normalized = []
+ for row in result:
+ normalized_row = tuple(
+ str(v).lower().strip() if v is not None else "" for v in row
+ )
+ normalized.append(normalized_row)
+ return sorted(normalized)
+
+
+def results_match(result1: List[Tuple], result2: List[Tuple]) -> bool:
+ """
+ Compare two SQL query results for equality.
+
+ Args:
+ result1: First result set
+ result2: Second result set
+
+ Returns:
+ True if results match (order-independent, case-insensitive)
+ """
+ if result1 is None or result2 is None:
+ return False
+
+ norm1 = normalize_result(result1)
+ norm2 = normalize_result(result2)
+
+ return norm1 == norm2
diff --git a/environments/community/sql_query_env/sql_query_env.py b/environments/community/sql_query_env/sql_query_env.py
new file mode 100644
index 00000000..1e2a75f2
--- /dev/null
+++ b/environments/community/sql_query_env/sql_query_env.py
@@ -0,0 +1,424 @@
+"""
+SQL Query Generation Environment for Atropos
+
+Trains LLMs to generate correct SQL queries from natural language questions.
+Uses the Salesforce/WikiSQL dataset with execution-based scoring.
+"""
+
+import random
+from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union
+
+from sql_executor import (
+ create_table_from_wikisql,
+ execute_sql,
+ extract_boxed_sql,
+ quote_identifiers_in_sql,
+ results_match,
+)
+from tqdm.asyncio import tqdm_asyncio
+from wikisql_loader import load_wikisql_split
+
+from atroposlib.envs.base import (
+ APIServerConfig,
+ BaseEnv,
+ BaseEnvConfig,
+ ScoredDataGroup,
+)
+from atroposlib.type_definitions import Item
+
+# System prompt following the established Atropos pattern
+system_prompt = (
+ "You are a deep thinking AI, you may use extremely long chains of thought "
+ "to deeply consider the problem and deliberate with yourself via systematic "
+ "reasoning processes to help come to a correct solution prior to answering. "
+ "You should enclose your thoughts and internal monologue inside "
+ "tags, and then provide your solution or response to the problem.\n\n"
+)
+
+system_prompt += """You are a SQL expert. Given a table schema and a natural language question,
+generate a SQL query that answers the question.
+
+You are allocated a maximum of 1024 tokens, please strive to use less.
+
+Provide your SQL query inside \\boxed{} like this: \\boxed{SELECT column FROM table WHERE condition}
+
+Important:
+- Use only the columns provided in the table schema
+- The table is always named "data"
+- Ensure your SQL syntax is valid SQLite
+
+So please end your answer with \\boxed{your SQL query here}"""
+
+
+class WikiSQLRow(TypedDict):
+ """Type definition for a WikiSQL dataset row."""
+
+ question: str
+ header: List[str]
+ rows: List[List[Any]]
+ types: List[str]
+ gold_sql: str
+
+
+def format_table_schema(
+ header: List[str], rows: List[List[Any]], max_rows: int = 3
+) -> str:
+ """Format table schema for the prompt."""
+ schema = f"Table: data\nColumns: {', '.join(header)}\n"
+ if rows:
+ schema += "Sample data:\n"
+ for row in rows[:max_rows]:
+ row_str = " | ".join(str(v) for v in row)
+ schema += f" {row_str}\n"
+ return schema
+
+
+class SQLQueryEnv(BaseEnv):
+ """
+ Environment for training LLMs to generate SQL queries.
+
+ Uses the Salesforce/WikiSQL dataset and verifies correctness
+ by executing SQL against in-memory SQLite databases.
+ """
+
+ name = "sql_query"
+
+ def __init__(
+ self,
+ config: BaseEnvConfig,
+ server_configs: List[APIServerConfig],
+ slurm=True,
+ testing=False,
+ ):
+ super().__init__(config, server_configs, slurm, testing)
+ self.percent_correct_buffer = list()
+ self.eval_metrics = list()
+ self.execution_success_buffer = list()
+
+ @classmethod
+ def config_init(cls) -> Tuple[BaseEnvConfig, List[APIServerConfig]]:
+ """Initialize default configuration for the environment."""
+ env_config = BaseEnvConfig(
+ tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+ group_size=8,
+ use_wandb=True,
+ rollout_server_url="http://localhost:8000",
+ total_steps=1000,
+ batch_size=12,
+ steps_per_eval=100,
+ max_token_length=1024,
+ wandb_name="sql_query",
+ )
+ server_configs = [
+ APIServerConfig(
+ model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+ base_url="http://localhost:9001/v1",
+ api_key="x",
+ num_requests_for_eval=256,
+ ),
+ ]
+ return env_config, server_configs
+
+ async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+ """Log custom metrics to WandB."""
+ if wandb_metrics is None:
+ wandb_metrics = {}
+
+ # Log percent correct
+ try:
+ wandb_metrics["train/percent_correct"] = sum(
+ self.percent_correct_buffer
+ ) / len(self.percent_correct_buffer)
+ except ZeroDivisionError:
+ pass
+
+ # Log execution success rate
+ try:
+ wandb_metrics["train/execution_success"] = sum(
+ self.execution_success_buffer
+ ) / len(self.execution_success_buffer)
+ except ZeroDivisionError:
+ pass
+
+ self.percent_correct_buffer = list()
+ self.execution_success_buffer = list()
+
+ for item in self.eval_metrics:
+ wandb_metrics[item[0]] = item[1]
+ self.eval_metrics = list()
+
+ await super().wandb_log(wandb_metrics)
+
+ async def setup(self):
+ """Load the WikiSQL dataset and prepare train/test splits."""
+ # Load WikiSQL dataset directly from GitHub source
+ print("Loading WikiSQL training data...")
+ self.train = load_wikisql_split("train")
+ print(f"Loaded {len(self.train)} training examples")
+
+ print("Loading WikiSQL test data...")
+ self.test = load_wikisql_split("test")
+ print(f"Loaded {len(self.test)} test examples")
+
+ random.shuffle(self.train)
+ self.iter = 0
+
+ def save_checkpoint(self, step, data=None):
+ """Save checkpoint with iteration state."""
+ if data is None:
+ data = {}
+ data["iter"] = self.iter
+ super().save_checkpoint(step, data)
+
+ def _score_sql(
+ self,
+ generated_sql: str,
+ gold_sql: str,
+ header: List[str],
+ rows: List[List[Any]],
+ ) -> Tuple[float, bool]:
+ """
+ Score SQL by execution comparison.
+
+ Returns:
+ Tuple of (score, execution_success)
+ """
+ if not generated_sql:
+ return -1.0, False
+
+ # Create in-memory table
+ try:
+ conn = create_table_from_wikisql(header, rows)
+ except Exception:
+ return -1.0, False
+
+ # Quote identifiers in gold SQL that need quoting (e.g., "State/territory")
+ quoted_gold_sql = quote_identifiers_in_sql(gold_sql, header)
+
+ # Execute gold SQL
+ gold_result = execute_sql(conn, quoted_gold_sql)
+ if gold_result is None:
+ conn.close()
+ return -1.0, False
+
+ # Execute generated SQL
+ gen_result = execute_sql(conn, generated_sql)
+ conn.close()
+
+ if gen_result is None:
+ return -1.0, False
+
+ # Compare results
+ if results_match(gen_result, gold_result):
+ return 1.0, True
+ else:
+ return -1.0, True
+
+ async def rollout_and_score_eval(
+ self, question: str, gold_sql: str, header: List[str], rows: List[List[Any]]
+ ) -> dict:
+ """Rollout and score a single evaluation item."""
+ table_schema = format_table_schema(header, rows)
+ user_content = f"{table_schema}\nQuestion: {question}"
+
+ async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+ completion = await managed.chat_completion(
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_content},
+ ],
+ n=1,
+ max_tokens=self.config.max_token_length,
+ temperature=0.6,
+ )
+ response_content = completion.choices[0].message.content
+
+ # Extract and score generated SQL
+ generated_sql = extract_boxed_sql(response_content)
+ score, exec_success = self._score_sql(generated_sql, gold_sql, header, rows)
+ correct = score == 1.0
+
+ sample = {
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_content},
+ {"role": "assistant", "content": response_content},
+ ],
+ "question": question,
+ "gold_sql": gold_sql,
+ "generated_sql": generated_sql,
+ "score": 1 if correct else 0,
+ "correct": correct,
+ "execution_success": exec_success,
+ "finish_reason": completion.choices[0].finish_reason,
+ }
+
+ return {"score": 1 if correct else 0, "sample": sample}
+
+ async def evaluate(self, *args, **kwargs):
+ """Run evaluation on test set."""
+ import time
+
+ start_time = time.time()
+
+ eval_tasks = []
+ # Limit eval to first 500 items for efficiency
+ for item in self.test[:500]:
+ eval_tasks.append(
+ self.rollout_and_score_eval(
+ item["question"], item["gold_sql"], item["header"], item["rows"]
+ )
+ )
+ results = await tqdm_asyncio.gather(*eval_tasks)
+
+ scores = [result["score"] for result in results]
+ samples = [result["sample"] for result in results]
+
+ percent_correct = sum(scores) / len(scores) if scores else 0
+
+ end_time = time.time()
+
+ self.eval_metrics.append(("eval/percent_correct", percent_correct))
+
+ eval_metrics = {
+ "eval/percent_correct": percent_correct,
+ }
+
+ await self.evaluate_log(
+ metrics=eval_metrics,
+ samples=samples,
+ start_time=start_time,
+ end_time=end_time,
+ generation_parameters={
+ "temperature": 0.6,
+ "max_tokens": self.config.max_token_length,
+ },
+ )
+
+ async def collect_trajectories(
+ self, item: WikiSQLRow
+ ) -> Tuple[ScoredDataGroup, list[Item]]:
+ """Generate SQL queries for a given question."""
+ table_schema = format_table_schema(item["header"], item["rows"])
+ user_content = f"{table_schema}\nQuestion: {item['question']}"
+ user_message = {"role": "user", "content": user_content}
+
+ async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+ chat_completions = await managed.chat_completion(
+ messages=[{"role": "system", "content": system_prompt}, user_message],
+ n=self.config.group_size,
+ max_tokens=self.config.max_token_length,
+ temperature=1.0,
+ )
+
+ state = managed.get_state()
+ nodes = state["nodes"]
+
+ to_score = list()
+ to_backlog = list()
+
+ for i, chat_completion in enumerate(chat_completions.choices):
+ messages = (
+ {"role": "system", "content": system_prompt},
+ user_message,
+ {"role": "assistant", "content": chat_completion.message.content},
+ )
+ to_score.append(
+ {
+ "messages": messages,
+ "gold_sql": item["gold_sql"],
+ "header": item["header"],
+ "rows": item["rows"],
+ "finish_reason": chat_completion.finish_reason,
+ "tokens": nodes[i].tokens,
+ "masks": nodes[i].masked_tokens,
+ "logprobs": nodes[i].logprobs,
+ }
+ )
+
+ to_postprocess = await self.score(to_score)
+ return to_postprocess, to_backlog
+
+ async def score(
+ self, rollout_group_data
+ ) -> Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]]:
+ """Score generated SQL queries by execution comparison."""
+ scores = ScoredDataGroup()
+ scores["tokens"] = list()
+ scores["masks"] = list()
+ scores["scores"] = list()
+ scores["inference_logprobs"] = list()
+
+ # Get table info from first item
+ gold_sql = rollout_group_data[0]["gold_sql"]
+ header = rollout_group_data[0]["header"]
+ rows = rollout_group_data[0]["rows"]
+
+ random.shuffle(rollout_group_data)
+
+ for item in rollout_group_data:
+ response_content = item["messages"][-1]["content"]
+
+ # Extract SQL from response
+ generated_sql = extract_boxed_sql(response_content)
+
+ # Score by execution
+ reward, exec_success = self._score_sql(
+ generated_sql, gold_sql, header, rows
+ )
+ self.execution_success_buffer.append(1 if exec_success else 0)
+
+ tokens = item["tokens"]
+ masks = item["masks"]
+ logprobs = item["logprobs"]
+
+ # Remove obviously bad examples
+ if len([1 for i in masks if i != -100]) < 10:
+ continue
+
+ scores["tokens"].append(tokens)
+ scores["masks"].append(masks)
+ scores["inference_logprobs"].append(logprobs)
+ scores["scores"].append(reward)
+
+ if len(scores["tokens"]) >= self.config.group_size:
+ break
+
+ for score in scores["scores"]:
+ self.percent_correct_buffer.append(max(score, 0))
+
+ # Check if all scores are the same
+ if all([score == 1 for score in scores["scores"]]):
+ # Apply length penalty when all are correct
+ token_lengths = [len(token) for token in scores["tokens"]]
+ if max(token_lengths) == 0:
+ return None
+
+ max_allowed_length = self.config.max_token_length
+ length_threshold = max_allowed_length * 0.5
+
+ scores["scores"] = []
+ for length in token_lengths:
+ if length <= length_threshold:
+ scores["scores"].append(1.0)
+ else:
+ percentage_of_range = (length - length_threshold) / (
+ max_allowed_length - length_threshold
+ )
+ percentage_of_range = min(percentage_of_range, 1.0)
+ scores["scores"].append(1.0 - percentage_of_range)
+
+ if all([scores["scores"][0] == score for score in scores["scores"]]):
+ return None # If all the same, return None
+
+ return scores
+
+ async def get_next_item(self) -> WikiSQLRow:
+ """Get the next training item."""
+ next_item = self.train[self.iter % len(self.train)]
+ self.iter += 1
+ return next_item
+
+
+if __name__ == "__main__":
+ SQLQueryEnv.cli()
diff --git a/environments/community/sql_query_env/test_integration.py b/environments/community/sql_query_env/test_integration.py
new file mode 100644
index 00000000..755fb444
--- /dev/null
+++ b/environments/community/sql_query_env/test_integration.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""
+Integration test for SQL Query Environment that works with OpenAI-compatible APIs.
+
+This test verifies:
+1. WikiSQL dataset loading
+2. SQL generation from LLM
+3. SQL extraction from \boxed{}
+4. SQL execution and result comparison
+5. Scoring logic
+"""
+
+import asyncio
+import json
+from typing import Any, List
+
+import openai
+
+# Import local modules
+from sql_executor import (
+ create_table_from_wikisql,
+ execute_sql,
+ extract_boxed_sql,
+ quote_identifiers_in_sql,
+ results_match,
+)
+from wikisql_loader import load_wikisql_split
+
+# System prompt from the environment
+SYSTEM_PROMPT = (
+ "You are a deep thinking AI, you may use extremely long chains of thought "
+ "to deeply consider the problem and deliberate with yourself via systematic "
+ "reasoning processes to help come to a correct solution prior to answering. "
+ "You should enclose your thoughts and internal monologue inside "
+ "tags, and then provide your solution or response to the problem.\n\n"
+ "You are a SQL expert. Given a table schema and a natural language question, "
+ "generate a SQL query that answers the question.\n\n"
+ "You are allocated a maximum of 1024 tokens, please strive to use less.\n\n"
+ "Provide your SQL query inside \\boxed{} like this: "
+ "\\boxed{SELECT column FROM table WHERE condition}\n\n"
+ "Important:\n"
+ "- Use only the columns provided in the table schema\n"
+ '- The table is always named "data"\n'
+ "- Ensure your SQL syntax is valid SQLite\n\n"
+ "So please end your answer with \\boxed{your SQL query here}"
+)
+
+
+def format_table_schema(
+ header: List[str], rows: List[List[Any]], max_rows: int = 3
+) -> str:
+ """Format table schema for the prompt."""
+ schema = f"Table: data\nColumns: {', '.join(header)}\n"
+ if rows:
+ schema += "Sample data:\n"
+ for row in rows[:max_rows]:
+ row_str = " | ".join(str(v) for v in row)
+ schema += f" {row_str}\n"
+ return schema
+
+
+def score_sql(
+ generated_sql: str,
+ gold_sql: str,
+ header: List[str],
+ rows: List[List[Any]],
+) -> dict:
+ """Score SQL by execution comparison."""
+ result = {
+ "generated_sql": generated_sql,
+ "gold_sql": gold_sql,
+ "score": -1.0,
+ "execution_success": False,
+ "gen_result": None,
+ "gold_result": None,
+ "error": None,
+ }
+
+ if not generated_sql:
+ result["error"] = "No SQL extracted from response"
+ return result
+
+ # Create in-memory table
+ try:
+ conn = create_table_from_wikisql(header, rows)
+ except Exception as e:
+ result["error"] = f"Table creation failed: {e}"
+ return result
+
+ # Quote identifiers in gold SQL that need quoting
+ quoted_gold_sql = quote_identifiers_in_sql(gold_sql, header)
+ result["quoted_gold_sql"] = quoted_gold_sql
+
+ # Execute gold SQL
+ gold_result = execute_sql(conn, quoted_gold_sql)
+ if gold_result is None:
+ conn.close()
+ result["error"] = f"Gold SQL execution failed: {quoted_gold_sql}"
+ return result
+ result["gold_result"] = str(gold_result)
+
+ # Execute generated SQL
+ gen_result = execute_sql(conn, generated_sql)
+ conn.close()
+
+ if gen_result is None:
+ result["error"] = "Generated SQL execution failed"
+ return result
+
+ result["gen_result"] = str(gen_result)
+ result["execution_success"] = True
+
+ # Compare results
+ if results_match(gen_result, gold_result):
+ result["score"] = 1.0
+ else:
+ result["error"] = "Results do not match"
+
+ return result
+
+
+async def test_single_item(client, model_name: str, item: dict, item_idx: int) -> dict:
+ """Test a single WikiSQL item."""
+ table_schema = format_table_schema(item["header"], item["rows"])
+ user_content = f"{table_schema}\nQuestion: {item['question']}"
+
+ try:
+ response = await client.chat.completions.create(
+ model=model_name,
+ messages=[
+ {"role": "system", "content": SYSTEM_PROMPT},
+ {"role": "user", "content": user_content},
+ ],
+ max_tokens=1024,
+ temperature=0.6,
+ )
+
+ response_content = response.choices[0].message.content
+
+ # Extract SQL
+ generated_sql = extract_boxed_sql(response_content)
+
+ # Score
+ score_result = score_sql(
+ generated_sql, item["gold_sql"], item["header"], item["rows"]
+ )
+
+ return {
+ "item_idx": item_idx,
+ "question": item["question"],
+ "response": (
+ response_content[:500] + "..."
+ if len(response_content) > 500
+ else response_content
+ ),
+ **score_result,
+ }
+
+ except Exception as e:
+ return {
+ "item_idx": item_idx,
+ "question": item["question"],
+ "error": str(e),
+ "score": -1.0,
+ }
+
+
+async def run_integration_test(
+ base_url: str,
+ model_name: str,
+ api_key: str = "x",
+ num_samples: int = 10,
+):
+ """Run the integration test."""
+ print(f"\n{'='*60}")
+ print("SQL Query Environment Integration Test")
+ print(f"{'='*60}")
+ print(f"Server: {base_url}")
+ print(f"Model: {model_name}")
+ print(f"Samples: {num_samples}")
+ print()
+
+ # Load dataset
+ print("Loading WikiSQL training data...")
+ train_data = load_wikisql_split("train")
+ print(f"Loaded {len(train_data)} training examples")
+
+ # Initialize OpenAI client
+ client = openai.AsyncClient(
+ base_url=base_url,
+ api_key=api_key,
+ timeout=120.0,
+ )
+
+ # Run tests
+ print(f"\nTesting {num_samples} samples...\n")
+ results = []
+
+ for i in range(min(num_samples, len(train_data))):
+ item = train_data[i]
+ print(f"[{i+1}/{num_samples}] Testing: {item['question'][:60]}...")
+ result = await test_single_item(client, model_name, item, i)
+ results.append(result)
+
+ # Print result
+ if result["score"] == 1.0:
+ print(f" ✓ CORRECT - Generated: {result.get('generated_sql', 'N/A')[:80]}")
+ else:
+ print(f" ✗ INCORRECT - {result.get('error', 'Unknown error')}")
+ if result.get("generated_sql"):
+ print(f" Generated: {result['generated_sql'][:80]}")
+ print(f" Gold: {result.get('gold_sql', 'N/A')[:80]}")
+
+ # Summary
+ print(f"\n{'='*60}")
+ print("SUMMARY")
+ print(f"{'='*60}")
+
+ correct = sum(1 for r in results if r["score"] == 1.0)
+ executed = sum(1 for r in results if r.get("execution_success", False))
+
+ print(f"Correct: {correct}/{num_samples} ({100*correct/num_samples:.1f}%)")
+ print(
+ f"Execution Success: {executed}/{num_samples} ({100*executed/num_samples:.1f}%)"
+ )
+
+ # Save results
+ output_file = "integration_test_results.json"
+ with open(output_file, "w") as f:
+ json.dump(results, f, indent=2)
+ print(f"\nDetailed results saved to: {output_file}")
+
+ return results
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description="SQL Query Environment Integration Test"
+ )
+ parser.add_argument(
+ "--base_url",
+ type=str,
+ default="http://localhost:8000/v1",
+ help="Base URL for OpenAI-compatible API",
+ )
+ parser.add_argument(
+ "--model",
+ type=str,
+ default="Qwen/Qwen3-8B",
+ help="Model name",
+ )
+ parser.add_argument(
+ "--api_key",
+ type=str,
+ default="x",
+ help="API key",
+ )
+ parser.add_argument(
+ "--num_samples",
+ type=int,
+ default=10,
+ help="Number of samples to test",
+ )
+
+ args = parser.parse_args()
+
+ asyncio.run(
+ run_integration_test(
+ base_url=args.base_url,
+ model_name=args.model,
+ api_key=args.api_key,
+ num_samples=args.num_samples,
+ )
+ )
diff --git a/environments/community/sql_query_env/test_sql_executor.py b/environments/community/sql_query_env/test_sql_executor.py
new file mode 100644
index 00000000..9838d05a
--- /dev/null
+++ b/environments/community/sql_query_env/test_sql_executor.py
@@ -0,0 +1,224 @@
+"""
+Unit tests for SQL executor module.
+"""
+
+from sql_executor import (
+ create_table_from_wikisql,
+ execute_sql,
+ extract_boxed_sql,
+ normalize_result,
+ results_match,
+)
+
+
+class TestCreateTableFromWikiSQL:
+ """Tests for create_table_from_wikisql function."""
+
+ def test_basic_table_creation(self):
+ """Test creating a simple table."""
+ header = ["name", "age", "city"]
+ rows = [
+ ["Alice", "30", "New York"],
+ ["Bob", "25", "Los Angeles"],
+ ]
+
+ conn = create_table_from_wikisql(header, rows)
+ result = execute_sql(conn, "SELECT * FROM data")
+ conn.close()
+
+ assert result is not None
+ assert len(result) == 2
+
+ def test_table_with_special_column_names(self):
+ """Test columns with spaces and special characters."""
+ header = ["Player Name", "Goals-Scored", "Team (2024)"]
+ rows = [["Messi", "10", "Inter Miami"]]
+
+ conn = create_table_from_wikisql(header, rows)
+ # Column names are quoted rather than sanitized, so creation should succeed
+ result = execute_sql(conn, "SELECT * FROM data")
+ conn.close()
+
+ assert result is not None
+ assert len(result) == 1
+
+ def test_empty_table(self):
+ """Test creating a table with no rows."""
+ header = ["col1", "col2"]
+ rows = []
+
+ conn = create_table_from_wikisql(header, rows)
+ result = execute_sql(conn, "SELECT * FROM data")
+ conn.close()
+
+ assert result == []
+
+
+class TestExecuteSQL:
+ """Tests for execute_sql function."""
+
+ def test_valid_select(self):
+ """Test a valid SELECT query."""
+ header = ["name", "age"]
+ rows = [["Alice", "30"], ["Bob", "25"]]
+
+ conn = create_table_from_wikisql(header, rows)
+ result = execute_sql(conn, "SELECT name FROM data WHERE age = '30'")
+ conn.close()
+
+ assert result == [("Alice",)]
+
+ def test_invalid_sql_returns_none(self):
+ """Test that invalid SQL returns None."""
+ header = ["a"]
+ rows = [["1"]]
+
+ conn = create_table_from_wikisql(header, rows)
+ result = execute_sql(conn, "INVALID SQL SYNTAX")
+ conn.close()
+
+ assert result is None
+
+ def test_aggregation(self):
+ """Test COUNT aggregation."""
+ header = ["category"]
+ rows = [["A"], ["A"], ["B"]]
+
+ conn = create_table_from_wikisql(header, rows)
+ result = execute_sql(conn, "SELECT COUNT(*) FROM data WHERE category = 'A'")
+ conn.close()
+
+ assert result == [(2,)]
+
+
+class TestExtractBoxedSQL:
+ """Tests for extract_boxed_sql function."""
+
+ def test_simple_boxed(self):
+ """Test extracting from simple boxed format."""
+ text = "The answer is \\boxed{SELECT * FROM data}"
+ result = extract_boxed_sql(text)
+ assert result == "SELECT * FROM data"
+
+ def test_boxed_with_think_tags(self):
+ """Test extracting when thinking tags are present."""
+ text = """
+Let me think about this query...
+
+
+\\boxed{SELECT name FROM data WHERE age > 25}"""
+ result = extract_boxed_sql(text)
+ assert result == "SELECT name FROM data WHERE age > 25"
+
+ def test_no_boxed_returns_none(self):
+ """Test that missing boxed returns None."""
+ text = "SELECT * FROM data"
+ result = extract_boxed_sql(text)
+ assert result is None
+
+ def test_empty_boxed(self):
+ """Test empty boxed returns None."""
+ text = "\\boxed{}"
+ result = extract_boxed_sql(text)
+ assert result is None
+
+
+class TestResultsMatch:
+ """Tests for results_match function."""
+
+ def test_identical_results(self):
+ """Test matching identical results."""
+ result1 = [("Alice", "30"), ("Bob", "25")]
+ result2 = [("Alice", "30"), ("Bob", "25")]
+ assert results_match(result1, result2) is True
+
+ def test_different_order(self):
+ """Test matching results in different order."""
+ result1 = [("Alice",), ("Bob",)]
+ result2 = [("Bob",), ("Alice",)]
+ assert results_match(result1, result2) is True
+
+ def test_case_insensitive(self):
+ """Test case-insensitive matching."""
+ result1 = [("ALICE",)]
+ result2 = [("alice",)]
+ assert results_match(result1, result2) is True
+
+ def test_different_results(self):
+ """Test non-matching results."""
+ result1 = [("Alice",)]
+ result2 = [("Bob",)]
+ assert results_match(result1, result2) is False
+
+ def test_none_result(self):
+ """Test that None results don't match."""
+ assert results_match(None, [("a",)]) is False
+ assert results_match([("a",)], None) is False
+
+
+class TestNormalizeResult:
+ """Tests for normalize_result function."""
+
+ def test_normalization(self):
+ """Test result normalization."""
+ result = [("Alice", "30"), ("BOB", "25")]
+ normalized = normalize_result(result)
+
+ # Should be lowercase and sorted
+ assert normalized == [("alice", "30"), ("bob", "25")]
+
+
+class TestIntegration:
+ """Integration tests for the full scoring pipeline."""
+
+ def test_correct_sql_scores_1(self):
+ """Test that matching SQL results score correctly."""
+ header = ["name", "age"]
+ rows = [["Alice", "30"], ["Bob", "25"]]
+
+ conn = create_table_from_wikisql(header, rows)
+
+ gold_sql = "SELECT name FROM data WHERE age = '30'"
+ generated_sql = "SELECT name FROM data WHERE age = '30'"
+
+ gold_result = execute_sql(conn, gold_sql)
+ gen_result = execute_sql(conn, generated_sql)
+ conn.close()
+
+ assert results_match(gen_result, gold_result) is True
+
+ def test_semantically_equivalent_sql(self):
+ """Test that semantically equivalent but differently written SQL matches."""
+ header = ["name", "age"]
+ rows = [["Alice", "30"], ["Bob", "25"]]
+
+ conn = create_table_from_wikisql(header, rows)
+
+ gold_sql = "SELECT name FROM data WHERE age = '30'"
+ # Different whitespace/formatting but same result
+ generated_sql = "select NAME from data where AGE='30'"
+
+ gold_result = execute_sql(conn, gold_sql)
+ gen_result = execute_sql(conn, generated_sql)
+ conn.close()
+
+ assert results_match(gen_result, gold_result) is True
+
+ def test_wrong_sql_fails(self):
+ """Test that incorrect SQL doesn't match."""
+ header = ["name", "age"]
+ rows = [["Alice", "30"], ["Bob", "25"]]
+
+ conn = create_table_from_wikisql(header, rows)
+
+ gold_sql = "SELECT name FROM data WHERE age = '30'"
+ generated_sql = "SELECT name FROM data WHERE age = '25'"
+
+ gold_result = execute_sql(conn, gold_sql)
+ gen_result = execute_sql(conn, generated_sql)
+ conn.close()
+
+ assert results_match(gen_result, gold_result) is False
diff --git a/environments/community/sql_query_env/wikisql_loader.py b/environments/community/sql_query_env/wikisql_loader.py
new file mode 100644
index 00000000..880fd298
--- /dev/null
+++ b/environments/community/sql_query_env/wikisql_loader.py
@@ -0,0 +1,227 @@
+"""
+WikiSQL Data Loader
+
+Downloads and loads the WikiSQL dataset directly from the Salesforce GitHub repository.
+This module provides functionality to fetch the dataset, extract it, and load it
+in a format compatible with the SQL Query Environment.
+"""
+
+import json
+import os
+import tarfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.request import urlretrieve
+
+# WikiSQL data source
+WIKISQL_DATA_URL = "https://github.com/salesforce/WikiSQL/raw/master/data.tar.bz2"
+
+# SQL operators from WikiSQL lib/query.py
+AGG_OPS = ["", "MAX", "MIN", "COUNT", "SUM", "AVG"]
+COND_OPS = ["=", ">", "<", "OP"]
+
+
+def get_cache_dir() -> Path:
+ """Get the cache directory for WikiSQL data."""
+ cache_dir = Path(
+ os.environ.get("WIKISQL_CACHE_DIR", Path.home() / ".cache" / "wikisql")
+ )
+ cache_dir.mkdir(parents=True, exist_ok=True)
+ return cache_dir
+
+
+def download_wikisql(cache_dir: Optional[Path] = None, force: bool = False) -> Path:
+ """
+ Download the WikiSQL dataset from GitHub.
+
+ Args:
+ cache_dir: Directory to cache the downloaded data
+ force: Force re-download even if already cached
+
+ Returns:
+ Path to the extracted data directory
+ """
+ if cache_dir is None:
+ cache_dir = get_cache_dir()
+
+ data_dir = cache_dir / "data"
+ tar_file = cache_dir / "data.tar.bz2"
+
+ # Check if already extracted
+ if data_dir.exists() and not force:
+ expected_files = [
+ "train.jsonl",
+ "dev.jsonl",
+ "test.jsonl",
+ "train.db",
+ "dev.db",
+ "test.db",
+ ]
+ if all((data_dir / f).exists() for f in expected_files):
+ return data_dir
+
+ # Download if needed
+ if not tar_file.exists() or force:
+ print(f"Downloading WikiSQL dataset from {WIKISQL_DATA_URL}...")
+ urlretrieve(WIKISQL_DATA_URL, tar_file)
+ print(f"Downloaded to {tar_file}")
+
+ # Extract
+ print(f"Extracting to {cache_dir}...")
+ with tarfile.open(tar_file, "r:bz2") as tar:
+ tar.extractall(cache_dir)
+ print("Extraction complete.")
+
+ return data_dir
+
+
+def build_human_readable_sql(
+ table_header: List[str],
+ sel: int,
+ agg: int,
+ conds: List[Tuple[int, int, str]],
+) -> str:
+ """
+ Build a human-readable SQL query from WikiSQL format.
+
+ Args:
+ table_header: List of column names
+ sel: Index of selected column
+ agg: Index of aggregation operator
+ conds: List of (column_index, operator_index, condition) tuples
+
+ Returns:
+ Human-readable SQL query string
+ """
+ # Build SELECT clause
+ sel_col = table_header[sel] if sel < len(table_header) else f"col{sel}"
+ if agg > 0:
+ select_clause = f"{AGG_OPS[agg]}({sel_col})"
+ else:
+ select_clause = sel_col
+
+ sql = f"SELECT {select_clause} FROM data"
+
+ # Build WHERE clause
+ if conds:
+ where_parts = []
+ for col_idx, op_idx, condition in conds:
+ col_name = (
+ table_header[col_idx]
+ if col_idx < len(table_header)
+ else f"col{col_idx}"
+ )
+ op = COND_OPS[op_idx] if op_idx < len(COND_OPS) else "="
+ # Quote string conditions
+ if isinstance(condition, str):
+ where_parts.append(f"{col_name} {op} '{condition}'")
+ else:
+ where_parts.append(f"{col_name} {op} {condition}")
+ sql += " WHERE " + " AND ".join(where_parts)
+
+ return sql
+
+
+def load_wikisql_split(
+ split: str = "train",
+ cache_dir: Optional[Path] = None,
+) -> List[Dict[str, Any]]:
+ """
+ Load a WikiSQL dataset split.
+
+ Args:
+ split: One of 'train', 'dev', or 'test'
+ cache_dir: Directory containing the WikiSQL data
+
+ Returns:
+ List of dictionaries with question, table info, and gold SQL
+ """
+ if cache_dir is None:
+ data_dir = download_wikisql()
+ else:
+ data_dir = cache_dir
+
+ jsonl_file = data_dir / f"{split}.jsonl"
+ tables_file = data_dir / f"{split}.tables.jsonl"
+
+ if not jsonl_file.exists():
+ raise FileNotFoundError(f"WikiSQL {split} JSONL file not found: {jsonl_file}")
+ if not tables_file.exists():
+ raise FileNotFoundError(f"WikiSQL {split} tables file not found: {tables_file}")
+
+ # Load all tables from tables.jsonl (has real column headers)
+ tables_cache = {}
+ with open(tables_file, "r", encoding="utf-8") as f:
+ for line in f:
+ table = json.loads(line.strip())
+ tables_cache[table["id"]] = {
+ "header": table["header"],
+ "rows": table["rows"],
+ "types": table.get("types", ["text"] * len(table["header"])),
+ }
+
+ print(f"Loaded {len(tables_cache)} tables for {split} split")
+
+ # Load questions and build dataset
+ dataset = []
+ skipped = 0
+ with open(jsonl_file, "r", encoding="utf-8") as f:
+ for line in f:
+ item = json.loads(line.strip())
+
+ table_info = tables_cache.get(item["table_id"])
+ if not table_info or not table_info["header"]:
+ skipped += 1
+ continue
+
+ # Build human-readable SQL
+ conds = [(c[0], c[1], c[2]) for c in item["sql"]["conds"]]
+ gold_sql = build_human_readable_sql(
+ table_info["header"],
+ item["sql"]["sel"],
+ item["sql"]["agg"],
+ conds,
+ )
+
+ dataset.append(
+ {
+ "question": item["question"],
+ "header": table_info["header"],
+ "rows": table_info["rows"],
+ "types": table_info["types"],
+ "gold_sql": gold_sql,
+ "table_id": item["table_id"],
+ }
+ )
+
+ if skipped > 0:
+ print(f"Skipped {skipped} examples with missing tables")
+
+ return dataset
+
+
+def load_wikisql(cache_dir: Optional[Path] = None) -> Dict[str, List[Dict[str, Any]]]:
+ """
+ Load the full WikiSQL dataset.
+
+ Returns:
+ Dictionary with 'train', 'dev', and 'test' splits
+ """
+ return {
+ "train": load_wikisql_split("train", cache_dir),
+ "dev": load_wikisql_split("dev", cache_dir),
+ "test": load_wikisql_split("test", cache_dir),
+ }
+
+
+if __name__ == "__main__":
+ # Test the loader
+ print("Testing WikiSQL loader...")
+ train = load_wikisql_split("train")
+ print(f"Loaded {len(train)} training examples")
+ if train:
+ print("\nFirst example:")
+ print(f" Question: {train[0]['question']}")
+ print(f" Header: {train[0]['header']}")
+ print(f" Gold SQL: {train[0]['gold_sql']}")
+ print(f" Rows (first 2): {train[0]['rows'][:2]}")
diff --git a/environments/eval_environments/aime24_hermes_example.py b/environments/eval_environments/aime24_hermes_example.py
new file mode 100644
index 00000000..f0ccfbf1
--- /dev/null
+++ b/environments/eval_environments/aime24_hermes_example.py
@@ -0,0 +1,170 @@
+import asyncio
+from concurrent.futures import ProcessPoolExecutor
+from typing import Optional, Tuple
+
+from datasets import load_dataset
+from latex2sympy2_extended import NormalizationConfig
+from math_verify import LatexExtractionConfig, parse, verify
+from math_verify.errors import TimeoutException
+
+from atroposlib.envs.eval import EvalBase, eval_runner, pass_at_k
+from atroposlib.envs.server_handling.server_manager import ServerManager
+
+hermes_system_prompt = (
+ "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
+ "problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
+ "solution prior to answering. You should enclose your thoughts and internal monologue inside "
+ " tags, and then provide your solution or response to the problem."
+)
+
+
+def score_answer(gold, resp) -> Optional[bool]:
+ try:
+ gold_parsed = parse(
+ gold,
+ extraction_mode="first_match",
+ extraction_config=[LatexExtractionConfig()],
+ )
+ except (Exception, TimeoutException, KeyError, TypeError, NotImplementedError):
+ return None
+ if len(gold_parsed) != 0:
+ try:
+ answer_parsed = parse(
+ resp,
+ extraction_config=[
+ LatexExtractionConfig(
+ normalization_config=NormalizationConfig(
+ nits=False,
+ malformed_operators=False,
+ basic_latex=True,
+ boxed="all",
+ units=True,
+ ),
+ # Ensures that boxed is tried first
+ boxed_match_priority=0,
+ try_extract_without_anchor=False,
+ )
+ ],
+ extraction_mode="first_match",
+ )
+ except (
+ Exception,
+ TimeoutException,
+ KeyError,
+ TypeError,
+ NotImplementedError,
+ ):
+ # Can't parse, so we skip
+ return None
+ # Reward 1 if the content is the same as the ground truth, 0 otherwise
+ try:
+ return verify(answer_parsed, gold_parsed)
+ except (
+ Exception,
+ TimeoutException,
+ KeyError,
+ TypeError,
+ NotImplementedError,
+ ):
+ return None
+ return None
+
+
+class AIME24(EvalBase):
+ """
+ AIME24 Eval Environment
+
+ kwargs:
+ use_system_prompt (bool): Whether to use the system prompt in the evaluation.
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.mp_executor = ProcessPoolExecutor(8)
+
+ def setup_data(self):
+ aime_test_data = load_dataset("HuggingFaceH4/aime_2024", split="train")
+ data = list()
+ for item in aime_test_data:
+ data.append(
+ {
+ "problem": item["problem"],
+ "answer": item["answer"],
+ }
+ )
+ return data
+
+ async def run_item(
+ self, server: ServerManager, data_item: dict
+ ) -> Tuple[dict, list]:
+ """
+ An abstract method that must be overridden in a subclass to define how a
+ specific item should be processed. This method encapsulates the logic required
+ to run or process the given data item on the provided server instance.
+
+ Args:
+ server (ServerManager): An instance of ServerManager used to manage and
+ interact with server operations during the item processing.
+ data_item (dict): A dictionary representing the data item to be processed.
+ The structure and content of the dictionary would depend on the
+ specific application and use case.
+
+ Returns:
+ Tuple[dict, list]: A tuple where the first element is a dictionary
+ containing the processed results or output of the operation, and
+ the second element is a list containing any additional data generated
+ or collected during the item's processing.
+ """
+ answer = data_item["answer"]
+ question = data_item["problem"]
+ use_sys_prompt = getattr(self, "use_system_prompt", False)
+ async with server.managed_server() as managed:
+ messages = (
+ [{"role": "system", "content": hermes_system_prompt}]
+ if use_sys_prompt
+ else []
+ )
+ messages.append({"role": "user", "content": question})
+ completion = await self.chat_completion(managed, messages)
+ loop = asyncio.get_event_loop()
+ gold = "\\boxed{" + answer + "}" if "\\boxed" not in answer else answer
+ tasks = []
+ for choice in completion.choices:
+ resp = choice.message.content.split("</think>")[-1]
+ tasks.append(
+ loop.run_in_executor(self.mp_executor, score_answer, gold, resp)
+ )
+ rewards = await asyncio.gather(*tasks)
+ rewards = [1.0 if reward else 0.0 for reward in rewards]
+ passing = sum(rewards)
+ n = self.get_generation_params()["n"]
+ pass_at_k_val = getattr(self, "pass_at_k", 1)
+ acc_at_k = pass_at_k(n, passing, getattr(self, "pass_at_k", 1))
+ print(acc_at_k, n, passing, pass_at_k_val)
+ key = f"pass@{pass_at_k_val}"
+ if n != pass_at_k_val:
+ key += f":{n}"
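+ # e.g. with pass_at_k=1 and n=4 completions per problem the metric is reported as "pass@1:4"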
+ return {
+ key: acc_at_k,
+ }, [
+ {
+ "messages": messages,
+ "answer": choice.message.content,
+ "score": rewards[i],
+ }
+ for i, choice in enumerate(completion.choices)
+ ]
+
+
+if __name__ == "__main__":
+ asyncio.run(
+ eval_runner(
+ AIME24(
+ pass_at_n=1,
+ pass_at_n_samples=4,
+ temperature=1.0,
+ max_tokens=32768,
+ use_system_prompt=True,
+ )
+ )
+ )
diff --git a/environments/gsm8k_server.py b/environments/gsm8k_server.py
index 766e27ec..6ae5285b 100644
--- a/environments/gsm8k_server.py
+++ b/environments/gsm8k_server.py
@@ -272,6 +272,7 @@ class GSM8kEnv(BaseEnv):
scores["tokens"] = list()
scores["masks"] = list()
scores["scores"] = list()
+ scores["inference_logprobs"] = list()
gold_parsed = parse(
rollout_group_data[0]["gold_answer"],
extraction_mode="first_match",
diff --git a/environments/letter_counting_environment.py b/environments/letter_counting_environment.py
deleted file mode 100644
index 483851bb..00000000
--- a/environments/letter_counting_environment.py
+++ /dev/null
@@ -1,2187 +0,0 @@
-import json
-import logging
-import os
-import random
-import re
-import uuid
-from typing import Dict, List, Optional, Tuple, Union
-
-import wandb
-from pydantic import Field
-from tqdm.asyncio import tqdm_asyncio
-
-from atroposlib.envs.base import (
- APIServerConfig,
- BaseEnv,
- BaseEnvConfig,
- EvalHandlingEnum,
- Item,
- ScoredDataGroup,
-)
-
-# Import NLTK words corpus for large-scale word list
-try:
- import nltk
- from nltk.corpus import words
- from nltk.tokenize import sent_tokenize
-
- # Download required NLTK data
- try:
- words.words()
- except LookupError:
- nltk.download("words")
- try:
- sent_tokenize("Test sentence.")
- except LookupError:
- nltk.download("punkt")
-except ImportError:
- print("Warning: NLTK not available. Please install with: pip install nltk")
- words = None
- sent_tokenize = None
-
-# Import datasets for OpenWebText
-try:
- from datasets import load_dataset
-except ImportError:
- print(
- "Warning: datasets library not available. Please install with: pip install datasets"
- )
- load_dataset = None
-
-system_prompt = (
- "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
- "problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
- "solution prior to answering. You should enclose your thoughts and internal monologue inside "
- " tags, and then provide your solution or response to the problem."
-)
-
-
-class LetterCountingConfig(BaseEnvConfig):
- """Configuration class for Letter Counting Environment with custom parameters."""
-
- # Word dataset configuration
- min_word_length: int = Field(3, description="Minimum word length to include")
- max_word_length: int = Field(30, description="Maximum word length to include")
- train_test_split: float = Field(0.95, description="Ratio for train/test split")
-
- # Letter selection configuration
- use_all_letters: bool = Field(
- True, description="Whether to use all 26 letters or custom set"
- )
- custom_letters: str = Field(
- "aeiou", description="Custom letter set if use_all_letters=False"
- )
- present_letter_bias: float = Field(
- 0.5,
- description="Probability of choosing letters that are present in the text (0.0-1.0)",
- )
-
- # Generation configuration
- generation_temperature: float = Field(
- 1.0, description="Temperature for training generation"
- )
- eval_temperature: float = Field(
- 0.2, description="Temperature for evaluation generation"
- )
- max_generation_tokens: int = Field(
- 1024 * 15, description="Maximum tokens for model generation"
- )
-
- # Evaluation configuration
- eval_sample_size: int = Field(
- 1000, description="Number of test words to evaluate on"
- )
-
- # Reproducibility configuration
- random_seed: Optional[int] = Field(
- 42, description="Seed for reproducibility, None for random"
- )
-
- # Random string generation configuration
- random_string_percentage: float = Field(
- 0.03, description="Percentage of dataset to be random strings (0.0-1.0)"
- )
- random_string_min_length: int = Field(
- 3, description="Minimum length for random strings"
- )
- random_string_max_length: int = Field(
- 25, description="Maximum length for random strings"
- )
-
- # Word capitalization configuration
- uppercase_word_percentage: float = Field(
- 0.01, description="Percentage of real words to make uppercase (0.0-1.0)"
- )
- capitalized_word_percentage: float = Field(
- 0.01,
- description="Percentage of real words to capitalize first letter (0.0-1.0)",
- )
-
- # Text/passage configuration
- use_text_passages: bool = Field(
- False, description="Include text passages from OpenWebText in addition to words"
- )
- text_passage_percentage: float = Field(
- 0.5,
- description="Percentage of dataset to be text passages when use_text_passages=True (0.0-1.0)",
- )
- min_text_length: int = Field(
- 50, description="Minimum character length for text passages"
- )
- max_text_length: int = Field(
- 500, description="Maximum character length for text passages"
- )
- include_punctuation_in_count: bool = Field(
- True, description="Include punctuation in letter counting"
- )
- include_spaces_in_count: bool = Field(
- False, description="Include spaces in letter counting"
- )
-
- # Multi-letter counting configuration
- max_letters_to_count: int = Field(
- 1,
- description="Maximum number of different letters to count simultaneously (1 for single letter)",
- )
- multi_letter_probability: float = Field(
- 0.2, description="Probability of asking for multiple letters (0.0-1.0)"
- )
-
- # Difficulty and training thresholds
- max_group_average_for_training: float = Field(
- 1.0,
- description="Maximum group average to use for training (skip groups that are too easy)",
- )
-
- # Logging and data dumping configuration
- debug_logging: bool = Field(
- True, description="Enable debug-level logging for more verbose output"
- )
- suppress_base_env_logs: bool = Field(
- True, description="Suppress verbose base environment logs"
- )
- dump_rollouts: bool = Field(
- False, description="Whether to dump successful rollouts to JSONL files"
- )
- dump_batch_size: int = Field(
- 100,
- description="Number of groups to accumulate before saving to disk (1 = save immediately)",
- )
-
-
-class LetterCountingEnv(BaseEnv):
- """
- Letter Counting Environment for training models to count letters in words and sentences.
-
- This environment presents the model with questions like "How many 'a's are in the word 'banana'?"
- or "Count the occurrences of the letters 'e', 'o', and 't' in the following text: 'The quick brown fox jumps over the lazy dog'"
- and expects responses in the format 3 for single letters or {"e": 4, "o": 4, "t": 2}
- for multiple letters. The model should use tags for reasoning before providing the final answer.
-
- Features:
- - **Word Mode**: Uses NLTK's words corpus (236k+ English words)
- - **Mixed Mode**: Combines words and text passages from OpenWebText-10k dataset
- - **Text Passage Mode**: Uses OpenWebText-10k dataset with character-based text extraction
- - Optional random string generation (80% alphabetical) mixed with real words
- - Configurable word/string/passage length ranges and letter sets
- - Optional word capitalization (uppercase, title case)
- - **Multi-letter counting**: Configurable simultaneous counting of multiple letters with JSON responses
- - **Letter selection bias**: Configurable bias toward letters present in the text (reduces zero-count questions)
- - Training thresholds based on group average scores
- - Configurable punctuation and space handling for letter counting
- - Comprehensive logging and data dumping capabilities
- - Detailed metrics tracking (letter distribution, text lengths, error rates, group average scores)
- - Support for saving successful and failed rollouts for analysis
-
- Data Dumping:
- - Set dump_rollouts=True to save rollouts from groups with appropriate difficulty
- - Only saves groups where group average score ≤ max_group_average_for_training
- - Files saved to data_dumps/ directory with unique UUIDs
- - Rollouts include full conversations, scores, metadata
- - Configurable batch size (dump_batch_size) for efficient disk I/O
-
- Mixed Mode Configuration:
- - Set use_text_passages=True to enable mixed mode with both words and text passages
- - Configure text_passage_percentage to control the ratio (e.g., 0.3 = 30% passages, 70% words)
- - Configure min/max text passage character lengths (more reliable than word counts)
- - Set max_group_average_for_training to skip groups that are too easy
-
- Logging:
- - Set debug_logging=True for verbose per-item scoring details
- - Comprehensive WandB metrics including letter distribution entropy and group average scores
- - Progress tracking for data dumps and evaluation
- """ # noqa
-
- name = "letter_counting"
- env_config_cls = LetterCountingConfig
-
- def __init__(
- self,
- config: LetterCountingConfig,
- server_configs: List[APIServerConfig],
- slurm=True,
- testing=False,
- ):
- """
- Initialize the Letter Counting environment.
-
- Args:
- config: Configuration for the base environment
- server_configs: List of server configurations for OpenAI API
- slurm: Whether to use Slurm for distributed training
- testing: Whether in testing mode
- """ # noqa: E501
- super().__init__(config, server_configs, slurm, testing)
-
- # Initialize data dumping infrastructure first (needed for logging)
- self.run_uuid = str(uuid.uuid4())
- self.rollouts_to_save_buffer: List[
- Dict[str, Union[str, List[Dict[str, Union[List[Dict[str, str]], float]]]]]
- ] = []
- self.processed_item_count = 0
- self.datadumps_dir = os.path.join(
- os.path.dirname(os.path.abspath(__file__)), "data_dumps"
- )
- self.save_file_batch_num = 0
-
- # Additional metrics tracking
- self.letter_distribution_stats: Dict[str, int] = {}
- self.word_length_stats: Dict[int, int] = {}
- self.answer_format_errors = 0
- self.think_format_errors = 0
-
- # Initialize the logger
- self.logger = logging.getLogger(self.__class__.__name__)
- if not self.logger.handlers:
- # Add a basic stream handler if no handlers are configured
- _handler = logging.StreamHandler()
- _formatter = logging.Formatter(
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
- )
- _handler.setFormatter(_formatter)
- self.logger.addHandler(_handler)
- # Set logging level based on config
- log_level = logging.DEBUG if self.config.debug_logging else logging.INFO
- self.logger.setLevel(log_level)
- # Ensure the logger itself is enabled
- self.logger.disabled = False
-
- # Suppress base environment logs if requested
- if self.config.suppress_base_env_logs:
- # Set the base environment logger to WARNING level to suppress INFO logs
- base_logger = logging.getLogger("atroposlib.envs.base")
- base_logger.setLevel(logging.WARNING)
-
- # Log initialization completion
- self.logger.info(
- f"LetterCountingEnv initialized with run UUID: {self.run_uuid}"
- )
- self.logger.info(
- f"Debug logging: {'enabled' if self.config.debug_logging else 'disabled'}"
- )
- self.logger.info(
- f"Data dumping: rollouts={'enabled' if self.config.dump_rollouts else 'disabled'}"
- )
-
- self.percent_correct_buffer = list()
- self.eval_metrics = list()
- self.rollouts_for_wandb: List[List[Tuple[str, float, str, str, str]]] = []
-
- @classmethod
-    def config_init(cls) -> Tuple[LetterCountingConfig, List[APIServerConfig]]:
- env_config = LetterCountingConfig(
- tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
- group_size=32,
- use_wandb=True,
- max_num_workers=128,
- rollout_server_url="http://localhost:8000",
- total_steps=250,
- batch_size=1024,
- steps_per_eval=20,
- max_token_length=1024 * 15,
- inference_weight=1.0,
- wandb_name="letter_counting_deep_thinking",
- data_path_to_save_groups=None,
- eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
- eval_limit_ratio=0.1,
- # Letter counting specific configs
- min_word_length=3,
- max_word_length=30,
- train_test_split=0.95,
- eval_sample_size=1000,
- generation_temperature=1.0,
- eval_temperature=0.5,
- random_seed=42,
- use_all_letters=True,
- custom_letters="aeiou",
- present_letter_bias=0.5,
- max_generation_tokens=1024 * 15,
- # Random string generation
- random_string_percentage=0.0,
- random_string_min_length=3,
- random_string_max_length=15,
- # Word capitalization
- uppercase_word_percentage=0.01,
- capitalized_word_percentage=0.005,
- # Text passage configuration
- use_text_passages=True,
- text_passage_percentage=0.3,
- min_text_length=3,
- max_text_length=2000,
- include_punctuation_in_count=True,
- include_spaces_in_count=True,
- max_group_average_for_training=0.7,
- # Multi-letter counting
- max_letters_to_count=4,
- multi_letter_probability=0.2,
- debug_logging=True,
- dump_rollouts=True,
- dump_batch_size=100,
- )
- server_configs = [
- APIServerConfig(
- model_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
- base_url="http://localhost:9004/v1",
- api_key="x",
- num_max_requests_at_once=32,
- num_requests_for_eval=256,
- )
- ]
-
- return env_config, server_configs
-
- async def setup(self):
- """
- Set up the environment by loading and preparing the word/text dataset.
- """
- if self.config.use_text_passages:
- await self._setup_mixed_dataset()
- else:
- await self._setup_word_dataset()
-
- # Initialize iteration counter
- self.iter = 0
-
- async def _setup_mixed_dataset(self):
- """
- Set up the environment using both words and text passages from OpenWebText dataset.
- """
- if load_dataset is None:
- raise ImportError(
- "datasets library is required for text passage mode. Please install with: pip install datasets"
- )
- if words is None:
- raise ImportError(
- "NLTK is required for this environment. Please install with: pip install nltk"
- )
-
- # Set random seed for reproducibility if configured
- if self.config.random_seed is not None:
- random.seed(self.config.random_seed)
-
- # Validate configuration
- await self._validate_config()
-
- self.logger.info("Setting up mixed dataset with words and text passages...")
-
- # First, set up words (same as word-only mode)
- all_words = words.words()
- filtered_words = [
- word.lower()
- for word in all_words
- if word.isalpha()
- and self.config.min_word_length <= len(word) <= self.config.max_word_length
- ]
-
- # Apply capitalization to real words if configured
- filtered_words = self._apply_word_capitalization(filtered_words)
-
- # Generate random strings if configured
- if self.config.random_string_percentage > 0.0:
- total_filtered_words = len(filtered_words)
- if self.config.random_string_percentage >= 1.0:
- num_random_strings = (
- total_filtered_words if total_filtered_words > 0 else 1000
- )
- filtered_words = []
- else:
- num_random_strings = int(
- (self.config.random_string_percentage * total_filtered_words)
- / (1.0 - self.config.random_string_percentage)
- )
-
- random_strings = self._generate_random_strings(num_random_strings)
- all_word_strings = filtered_words + random_strings
- self.logger.info(
- f"Generated {num_random_strings} random strings ({self.config.random_string_percentage:.1%} of word dataset)" # noqa
- )
- else:
- all_word_strings = filtered_words
- random_strings = []
-
- self.logger.info(
- f"Prepared {len(all_word_strings)} word/string items ({len(filtered_words)} real words + {len(random_strings)} random strings)" # noqa
- )
-
- # Now load and process text passages
- self.logger.info("Loading OpenWebText-10k dataset for text passages...")
- dataset = load_dataset("stas/openwebtext-10k", split="train")
- self.logger.info(f"Loaded {len(dataset)} text samples from OpenWebText")
-
- # Extract and filter text passages
- all_passages = []
- processed_texts = 0
-
- for item in dataset:
- text = item["text"]
-
- # Skip texts that are too long initially
- if (
- len(text) > self.config.max_text_length * 3
- ): # Allow some overhead for chunking
- continue
-
- # Extract passages from this text
- passages = self._extract_text_passages(text)
- all_passages.extend(passages)
- processed_texts += 1
-
- # Log progress periodically
- if processed_texts % 1000 == 0:
- self.logger.info(
- f"Processed {processed_texts} texts, extracted {len(all_passages)} passages so far..."
- )
-
- self.logger.info(
- f"Extracted {len(all_passages)} text passages from {processed_texts} texts"
- )
-
- # Now mix words and passages according to text_passage_percentage
- if self.config.text_passage_percentage >= 1.0:
- # Special case: 100% text passages, no words
- all_mixed_items = all_passages
- final_word_count = 0
- final_passage_count = len(all_passages)
- elif self.config.text_passage_percentage <= 0.0:
- # Special case: 0% text passages, only words
- all_mixed_items = all_word_strings
- final_word_count = len(all_word_strings)
- final_passage_count = 0
- else:
- # Calculate how many passages to include
- # If we want X% passages, then passages / (words + passages) = X
- # Solving: passages_to_use = X * words / (1 - X)
- total_words = len(all_word_strings)
- num_passages_to_use = int(
- (self.config.text_passage_percentage * total_words)
- / (1.0 - self.config.text_passage_percentage)
- )
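-            # e.g. a 30% passage target over 10,000 words/strings gives int(0.3 * 10000 / 0.7) = 4285 passages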
-
- # Upsample passages if we don't have enough to reach the target percentage
- if num_passages_to_use > len(all_passages):
- self.logger.info(
- f"Upsampling passages: need {num_passages_to_use} but only have {len(all_passages)}. "
- f"Will repeat passages to reach target percentage."
- )
- # Calculate how many times we need to repeat the passage list
- repeat_factor = (num_passages_to_use // len(all_passages)) + 1
- upsampled_passages = all_passages * repeat_factor
-
- # Shuffle the upsampled passages to avoid patterns
- random.shuffle(upsampled_passages)
-
- # Take exactly the number we need
- passages_to_use = upsampled_passages[:num_passages_to_use]
-
- self.logger.info(
- f"Upsampled {len(all_passages)} unique passages {repeat_factor}x times, "
- f"then sampled {len(passages_to_use)} passages for the dataset."
- )
- else:
- # We have enough passages, just sample what we need
- passages_to_use = random.sample(all_passages, num_passages_to_use)
-
- # Combine words and passages
- all_mixed_items = all_word_strings + passages_to_use
- final_word_count = len(all_word_strings)
- final_passage_count = len(passages_to_use)
-
- # Shuffle the mixed dataset
- random.shuffle(all_mixed_items)
-
- # Create train/test split
- split_point = int(self.config.train_test_split * len(all_mixed_items))
- self.train_words = all_mixed_items[
- :split_point
- ] # Reusing train_words for mixed items
- self.test_words = all_mixed_items[split_point:]
-
- # Calculate actual percentages
- actual_passage_percentage = (
- final_passage_count / len(all_mixed_items)
- if len(all_mixed_items) > 0
- else 0.0
- )
-
- # Calculate unique passages for logging
- unique_passages_used = (
- len(set(passages_to_use)) if "passages_to_use" in locals() else 0
- )
-
- # Log dataset statistics
- self.logger.info("Mixed dataset created:")
- if unique_passages_used < final_passage_count:
- # Upsampling occurred
- self.logger.info(
- f" Total items: {len(all_mixed_items)} ({final_word_count} words/strings + {final_passage_count} passages)" # noqa
- )
- self.logger.info(
- f" Unique passages used: {unique_passages_used} (repeated to create {final_passage_count} total passage instances)" # noqa
- )
- else:
- # No upsampling
- self.logger.info(
- f" Total items: {len(all_mixed_items)} ({final_word_count} words/strings + {final_passage_count} passages)" # noqa
- )
- self.logger.info(
- f" Actual passage percentage: {actual_passage_percentage:.1%} (target: {self.config.text_passage_percentage:.1%})" # noqa
- )
- self.logger.info(f" Training items: {len(self.train_words)}")
- self.logger.info(f" Test items: {len(self.test_words)}")
-
- # Show examples of both types
- word_examples = [item for item in self.train_words[:10] if len(item) <= 50][:3]
- passage_examples = [
- item[:100] + "..." for item in self.train_words[:50] if len(item) > 50
- ][:3]
-
- if word_examples:
- self.logger.info(f" Example words: {word_examples}")
- if passage_examples:
- self.logger.info(f" Example passages: {passage_examples}")
-
- # If we have upsampled passages, show some statistics
- if unique_passages_used < final_passage_count and unique_passages_used > 0:
- avg_repetitions = final_passage_count / unique_passages_used
- self.logger.info(f" Average passage repetitions: {avg_repetitions:.1f}x")
-
- # Log configuration details
- self.logger.info(
- f"Word length range: {self.config.min_word_length}-{self.config.max_word_length}"
- )
- self.logger.info(
- f"Passage length range: {self.config.min_text_length}-{self.config.max_text_length} characters"
- )
- self.logger.info(
- f"Include punctuation: {self.config.include_punctuation_in_count}"
- )
- self.logger.info(f"Include spaces: {self.config.include_spaces_in_count}")
- self.logger.info(
- f"Training threshold: {self.config.max_group_average_for_training}"
- )
-
- self.logger.info("Mixed dataset setup complete")
-
- async def _setup_text_passage_dataset(self):
- """
- Set up the environment using text passages from OpenWebText dataset.
- """
- if load_dataset is None:
- raise ImportError(
- "datasets library is required for text passage mode. Please install with: pip install datasets"
- )
-
- # Set random seed for reproducibility if configured
- if self.config.random_seed is not None:
- random.seed(self.config.random_seed)
-
- # Validate configuration
- await self._validate_config()
-
- self.logger.info("Loading OpenWebText-10k dataset...")
-
- # Load the dataset
- dataset = load_dataset("stas/openwebtext-10k", split="train")
-
- self.logger.info(f"Loaded {len(dataset)} text samples from OpenWebText")
-
- # Extract and filter text passages
- all_passages = []
- processed_texts = 0
-
- for item in dataset:
- text = item["text"]
-
- # Skip texts that are too long initially
- if (
- len(text) > self.config.max_text_length * 3
- ): # Allow some overhead for chunking
- continue
-
- # Extract passages from this text
- passages = self._extract_text_passages(text)
- all_passages.extend(passages)
- processed_texts += 1
-
- # Log progress periodically
- if processed_texts % 1000 == 0:
- self.logger.info(
- f"Processed {processed_texts} texts, extracted {len(all_passages)} passages so far..."
- )
-
- self.logger.info(
- f"Extracted {len(all_passages)} text passages from {processed_texts} texts"
- )
-
- # Shuffle passages for randomness
- random.shuffle(all_passages)
-
- # Create train/test split
- split_point = int(self.config.train_test_split * len(all_passages))
- self.train_words = all_passages[
- :split_point
- ] # Reusing train_words for passages
- self.test_words = all_passages[split_point:]
-
- # Log dataset statistics
- self.logger.info(f"Training passages: {len(self.train_words)}")
- self.logger.info(f"Test passages: {len(self.test_words)}")
- self.logger.info(
- f"Example passages: {[p[:100] + '...' for p in self.train_words[:3]]}"
- )
-
- # Log configuration details
- self.logger.info(
- f"Passage length range: {self.config.min_text_length}-{self.config.max_text_length} characters"
- )
- self.logger.info(
- f"Include punctuation: {self.config.include_punctuation_in_count}"
- )
- self.logger.info(f"Include spaces: {self.config.include_spaces_in_count}")
- self.logger.info(
- f"Training threshold: {self.config.max_group_average_for_training}"
- )
-
- self.logger.info("Text passage dataset setup complete")
-
- async def _setup_word_dataset(self):
- """
- Set up the environment using single words (original functionality).
- """
- # Load the NLTK words corpus (contains 236,736 English words)
- if words is None:
- raise ImportError(
- "NLTK is required for this environment. Please install with: pip install nltk"
- )
-
- # Set random seed for reproducibility if configured
- if self.config.random_seed is not None:
- random.seed(self.config.random_seed)
-
- # Validate configuration
- await self._validate_config()
-
- # Get all English words from NLTK
- all_words = words.words()
-
- # Filter words to ensure they contain only alphabetic characters
- # and are within the configured length range for reasonable difficulty
- filtered_words = [
- word.lower()
- for word in all_words
- if word.isalpha()
- and self.config.min_word_length <= len(word) <= self.config.max_word_length
- ]
-
- # Apply capitalization to real words if configured
- filtered_words = self._apply_word_capitalization(filtered_words)
-
- # Generate random strings if configured
- if self.config.random_string_percentage > 0.0:
- # Calculate how many random strings to generate
- total_filtered_words = len(filtered_words)
- # If we want X% random strings, then random_strings / (words + random_strings) = X
- # Solving: random_strings = X * words / (1 - X)
- if self.config.random_string_percentage >= 1.0:
- # Special case: 100% random strings, no real words
- num_random_strings = (
- total_filtered_words if total_filtered_words > 0 else 1000
- )
- filtered_words = []
- else:
- num_random_strings = int(
- (self.config.random_string_percentage * total_filtered_words)
- / (1.0 - self.config.random_string_percentage)
- )
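-                # e.g. 5% random strings over 200,000 real words gives int(0.05 * 200000 / 0.95) = 10526 strings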
-
- random_strings = self._generate_random_strings(num_random_strings)
-
- # Combine real words and random strings
- all_strings = filtered_words + random_strings
- self.logger.info(
- f"Generated {num_random_strings} random strings ({self.config.random_string_percentage:.1%} of dataset)"
- )
- else:
- # No random strings, use only real words
- all_strings = filtered_words
- random_strings = []
-
- # Shuffle all strings for randomness
- random.shuffle(all_strings)
-
- # Create train/test split using configured ratio
- split_point = int(self.config.train_test_split * len(all_strings))
- self.train_words = all_strings[:split_point]
- self.test_words = all_strings[split_point:]
-
- # Log dataset statistics
- if self.config.random_string_percentage > 0.0:
- self.logger.info(
- f"Total dataset: {len(all_strings)} strings ({len(filtered_words)} real words + {len(random_strings)} random strings)" # noqa
- )
- else:
- self.logger.info(f"Loaded {len(all_strings)} words total")
- self.logger.info(f"Training strings: {len(self.train_words)}")
- self.logger.info(f"Test strings: {len(self.test_words)}")
- self.logger.info(f"Example strings: {self.train_words[:10]}")
-
- # Log configuration details
- self.logger.info(
- f"Word length range: {self.config.min_word_length}-{self.config.max_word_length}"
- )
- if self.config.random_string_percentage > 0.0:
- self.logger.info(
- f"Random string length range: {self.config.random_string_min_length}-{self.config.random_string_max_length}" # noqa
- )
- self.logger.info(f"Train/test split: {self.config.train_test_split:.2%}")
- self.logger.info(
- f"Letter set: {'all 26 letters' if self.config.use_all_letters else f'custom ({self.config.custom_letters})'}" # noqa
- )
- self.logger.info(
- f"Random strings: {self.config.random_string_percentage:.1%} of dataset"
- )
-
- # Log capitalization settings
- if (
- self.config.uppercase_word_percentage > 0.0
- or self.config.capitalized_word_percentage > 0.0
- ):
- self.logger.info(
- f"Word capitalization: {self.config.uppercase_word_percentage:.1%} uppercase, {self.config.capitalized_word_percentage:.1%} title case" # noqa
- )
-
- self.logger.info(f"Random seed: {self.config.random_seed}")
-
- # Log letter selection bias configuration
- self.logger.info(
- f"Letter selection bias: {self.config.present_letter_bias:.1%} toward letters present in text"
- )
-
- # Log data dumping configuration
- if self.config.dump_rollouts:
- self.logger.info(
- "Data dumping enabled - saving groups with appropriate difficulty for training"
- )
- self.logger.info(
- f"Group difficulty threshold: group average ≤ {self.config.max_group_average_for_training}"
- )
- self.logger.info(f"Data dumps directory: {self.datadumps_dir}")
- self.logger.info(
- f"Batch size: {self.config.dump_batch_size} groups per file"
- )
-
- self.logger.info("Letter counting environment setup complete")
-
- async def _save_rollouts_to_jsonl(self):
- """Saves the buffered rollouts to a JSONL file in the datadumps directory."""
- if not self.rollouts_to_save_buffer:
- self.logger.warning("_save_rollouts_to_jsonl called but buffer is empty!")
- return
-
- buffer_size = len(self.rollouts_to_save_buffer)
- self.logger.info(f"Starting save of {buffer_size} groups to JSONL file...")
-
- try:
- if not os.path.exists(self.datadumps_dir):
- os.makedirs(self.datadumps_dir)
- self.logger.info(f"Created directory: {self.datadumps_dir}")
- except OSError as e:
- self.logger.error(f"Error creating directory {self.datadumps_dir}: {e}")
- return
-
- file_path = os.path.join(
- self.datadumps_dir,
- f"letter_counting_environment_rollouts_{self.run_uuid}_{self.save_file_batch_num:04d}.jsonl",
- )
-
- try:
- with open(file_path, "w") as f:
- for rollout_dict in self.rollouts_to_save_buffer:
- json.dump(rollout_dict, f)
- f.write("\n")
- self.logger.info(
- f"Successfully saved {buffer_size} groups to {file_path} "
- f"(batch #{self.save_file_batch_num})"
- )
- self.rollouts_to_save_buffer.clear()
- self.save_file_batch_num += 1
- self.logger.info(
- f"Buffer cleared. Next batch will be #{self.save_file_batch_num}"
- )
- except IOError as e:
- self.logger.error(f"Error writing rollouts to {file_path}: {e}")
- except Exception as e:
- self.logger.error(
- f"An unexpected error occurred while saving rollouts to {file_path}: {e}"
- )
-
- def _get_letter_set(self):
- """
- Get the set of letters to choose from based on configuration.
-
- Returns:
- String containing letters to choose from (lowercase only for consistency)
- """
- if not self.config.use_all_letters:
- return self.config.custom_letters.lower()
- else:
- return "abcdefghijklmnopqrstuvwxyz"
-
- def _select_target_letters(self, text: str, num_letters: int) -> List[str]:
- """
- Select target letters with bias toward letters present in the text.
-
- Args:
- text: The text to analyze for present letters
- num_letters: Number of letters to select
-
- Returns:
- List of selected target letters
- """
- available_letters = list(self._get_letter_set())
-
- # Prepare text for counting to match the actual counting logic
- text_for_counting = self._prepare_text_for_counting(text)
-
- # Find letters that are present in the text (case-insensitive)
- present_letters = []
- absent_letters = []
-
- for letter in available_letters:
- if letter.lower() in text_for_counting.lower():
- present_letters.append(letter)
- else:
- absent_letters.append(letter)
-
- # If we need more letters than are present, we'll need some absent ones too
- if num_letters > len(present_letters):
- # Select all present letters and fill the rest randomly from absent letters
- selected_letters = present_letters.copy()
- remaining_needed = num_letters - len(present_letters)
- if remaining_needed > 0 and absent_letters:
- selected_letters.extend(
- random.sample(
- absent_letters, min(remaining_needed, len(absent_letters))
- )
- )
- return selected_letters[:num_letters]
-
- # We have enough present letters, so apply the bias
- selected_letters = []
-
- for _ in range(num_letters):
- # Decide whether to pick from present or absent letters based on bias
- if present_letters and (
- not absent_letters or random.random() < self.config.present_letter_bias
- ):
- # Choose from present letters
- chosen_letter = random.choice(present_letters)
- present_letters.remove(chosen_letter)
- elif absent_letters:
- # Choose from absent letters
- chosen_letter = random.choice(absent_letters)
- absent_letters.remove(chosen_letter)
- else:
- # Fallback: choose from any remaining available letters
- remaining_letters = [
- l for l in available_letters if l not in selected_letters # noqa
- ]
- if remaining_letters:
- chosen_letter = random.choice(remaining_letters)
- else:
- break # No more letters available
-
- selected_letters.append(chosen_letter)
-
- return selected_letters
-
- def _generate_random_string(self, length: int) -> str:
- """
- Generate a random string of specified length with at least 80% alphabetical characters.
-
- Args:
- length: Length of the string to generate
-
- Returns:
- Random string with mix of uppercase, lowercase, and some non-alphabetical chars
- """
- # Ensure at least 80% alphabetical characters
- min_alpha_chars = max(1, int(length * 0.8))
-
- # Character sets
- letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
- non_alpha = "0123456789!@#$%^&*()-_=+[]{}|;:,.<>?"
-
- result = []
-
- # First, add the required alphabetical characters
- for _ in range(min_alpha_chars):
- result.append(random.choice(letters))
-
- # Fill the rest with either alphabetical or non-alphabetical characters
- remaining_length = length - min_alpha_chars
- for _ in range(remaining_length):
- # 90% chance of alphabetical even for remaining chars (to exceed 80% minimum)
- if random.random() < 0.9:
- result.append(random.choice(letters))
- else:
- result.append(random.choice(non_alpha))
-
- # Shuffle to avoid alphabetical chars being clustered at the beginning
- random.shuffle(result)
- return "".join(result)
-
- def _generate_random_strings(self, count: int) -> List[str]:
- """
- Generate a list of random strings with lengths within the configured random string range.
-
- Args:
- count: Number of random strings to generate
-
- Returns:
- List of random strings
- """
- random_strings = []
- for _ in range(count):
- # Generate random length within configured random string range
- length = random.randint(
- self.config.random_string_min_length,
- self.config.random_string_max_length,
- )
- random_string = self._generate_random_string(length)
- random_strings.append(random_string)
- return random_strings
-
- def _apply_word_capitalization(self, words: List[str]) -> List[str]:
- """
- Apply capitalization transformations to real words based on configuration.
-
- Args:
- words: List of lowercase words
-
- Returns:
- List of words with applied capitalization
- """
- if (
- self.config.uppercase_word_percentage == 0.0
- and self.config.capitalized_word_percentage == 0.0
- ):
- return words
-
- result = []
- for word in words:
- rand_val = random.random()
-
- if rand_val < self.config.uppercase_word_percentage:
- # Make uppercase
- result.append(word.upper())
- elif (
- rand_val
- < self.config.uppercase_word_percentage
- + self.config.capitalized_word_percentage
- ):
- # Capitalize first letter
- result.append(word.capitalize())
- else:
- # Keep lowercase
- result.append(word)
-
- return result
-
- def _extract_text_passages(self, text: str) -> List[str]:
- """
- Extract text passages from raw text based on character length.
-
- Args:
- text: Raw text to extract passages from
-
- Returns:
- List of filtered text passages
- """
- # Clean the text - remove excessive whitespace and normalize
- text = re.sub(r"\s+", " ", text.strip())
-
- # If text is shorter than min length, skip it
- if len(text) < self.config.min_text_length:
- return []
-
- # If text is within range, use it as-is
- if len(text) <= self.config.max_text_length:
- return [text]
-
- # For longer texts, create overlapping chunks
- passages = []
- chunk_size = self.config.max_text_length
- overlap = min(50, chunk_size // 4) # 25% overlap, max 50 chars
-
- start = 0
- while start < len(text):
- end = start + chunk_size
-
- # If this would be the last chunk and it's too small, extend the previous chunk
- if end >= len(text):
- if len(text) - start >= self.config.min_text_length:
- passages.append(text[start:])
- break
-
- # Try to break at a natural boundary (space, punctuation)
- chunk = text[start:end]
-
- # Look for a good break point in the last 20% of the chunk
- break_start = int(len(chunk) * 0.8)
- break_candidates = []
-
- # Find sentence endings first
- for i in range(len(chunk) - 1, break_start - 1, -1):
- if chunk[i] in ".!?":
- break_candidates.append(i + 1)
- break
-
- # If no sentence ending, look for other punctuation
- if not break_candidates:
- for i in range(len(chunk) - 1, break_start - 1, -1):
- if chunk[i] in ",;:":
- break_candidates.append(i + 1)
- break
-
- # If no punctuation, look for spaces
- if not break_candidates:
- for i in range(len(chunk) - 1, break_start - 1, -1):
- if chunk[i] == " ":
- break_candidates.append(i)
- break
-
- # Use the break point if found, otherwise use the full chunk
- if break_candidates:
- actual_end = start + break_candidates[0]
- passage = text[start:actual_end].strip()
- else:
- passage = chunk.strip()
- actual_end = end
-
- # Only add if it meets minimum length
- if len(passage) >= self.config.min_text_length:
- passages.append(passage)
-
- # Move start position with overlap
- start = actual_end - overlap
-
- # Avoid infinite loops
- if start >= end - overlap:
- start = end
-
- return passages
-
- def _prepare_text_for_counting(self, text: str) -> str:
- """
- Prepare text for letter counting based on configuration.
-
- Args:
- text: Original text
-
- Returns:
- Processed text for counting
- """
- if not self.config.include_punctuation_in_count:
- # Remove punctuation but keep spaces and alphanumeric
- text = "".join(c for c in text if c.isalnum() or c.isspace())
-
- if not self.config.include_spaces_in_count:
- # Remove spaces
- text = text.replace(" ", "")
-
- return text
-
- async def _validate_config(self):
- """Validate configuration parameters."""
- if not (0.0 <= self.config.random_string_percentage <= 1.0):
- raise ValueError(
- f"random_string_percentage must be between 0.0 and 1.0, got {self.config.random_string_percentage}"
- ) # noqa
- if not (0.0 <= self.config.uppercase_word_percentage <= 1.0):
- raise ValueError(
- f"uppercase_word_percentage must be between 0.0 and 1.0, got {self.config.uppercase_word_percentage}"
- ) # noqa
- if not (0.0 <= self.config.capitalized_word_percentage <= 1.0):
- raise ValueError(
- f"capitalized_word_percentage must be between 0.0 and 1.0, got {self.config.capitalized_word_percentage}" # noqa
- ) # noqa
- if (
- self.config.uppercase_word_percentage
- + self.config.capitalized_word_percentage
- > 1.0
- ):
- raise ValueError(
- f"Sum of uppercase_word_percentage ({self.config.uppercase_word_percentage}) and capitalized_word_percentage ({self.config.capitalized_word_percentage}) cannot exceed 1.0" # noqa
- ) # noqa
- if self.config.random_string_min_length < 1:
- raise ValueError(
- f"random_string_min_length must be at least 1, got {self.config.random_string_min_length}"
- ) # noqa
- if self.config.random_string_max_length < self.config.random_string_min_length:
- raise ValueError(
- f"random_string_max_length ({self.config.random_string_max_length}) must be >= random_string_min_length ({self.config.random_string_min_length})" # noqa
- ) # noqa
- if self.config.use_text_passages:
- if not (0.0 <= self.config.text_passage_percentage <= 1.0):
- raise ValueError(
- f"text_passage_percentage must be between 0.0 and 1.0, got {self.config.text_passage_percentage}" # noqa
- )
- if self.config.min_text_length < 1:
- raise ValueError(
- f"min_text_length must be at least 1, got {self.config.min_text_length}" # noqa
- )
- if self.config.max_text_length < self.config.min_text_length:
- raise ValueError(
- f"max_text_length ({self.config.max_text_length}) must be >= min_text_length ({self.config.min_text_length})" # noqa
- )
- if self.config.max_text_length < 10:
- raise ValueError(
- f"max_text_length must be at least 10 characters, got {self.config.max_text_length}" # noqa
- )
- if self.config.max_letters_to_count < 1:
- raise ValueError(
- f"max_letters_to_count must be at least 1, got {self.config.max_letters_to_count}"
- )
- if not (0.0 <= self.config.multi_letter_probability <= 1.0):
- raise ValueError(
- f"multi_letter_probability must be between 0.0 and 1.0, got {self.config.multi_letter_probability}"
- )
- if self.config.max_letters_to_count > 26:
- raise ValueError(
- f"max_letters_to_count cannot exceed 26 (total letters), got {self.config.max_letters_to_count}"
- )
- if not (0.0 <= self.config.present_letter_bias <= 1.0):
- raise ValueError(
- f"present_letter_bias must be between 0.0 and 1.0, got {self.config.present_letter_bias}"
- )
- if self.config.dump_batch_size < 1:
- raise ValueError(
- f"dump_batch_size must be at least 1, got {self.config.dump_batch_size}"
- )
-
- def save_checkpoint(self, step, data=None):
- """Save checkpoint including current iteration number, statistics, and data dumping state."""
- if data is None:
- data = {}
- data["iter"] = self.iter
- data["processed_item_count"] = self.processed_item_count
- data["save_file_batch_num"] = self.save_file_batch_num
- data["letter_distribution_stats"] = self.letter_distribution_stats
- data["word_length_stats"] = self.word_length_stats
- data["answer_format_errors"] = self.answer_format_errors
- data["think_format_errors"] = self.think_format_errors
- super().save_checkpoint(step, data)
-
- def load_checkpoint(self):
- """Load checkpoint including iteration number, statistics, and data dumping state."""
- # Call the base class method first to load the data
- super().load_checkpoint()
-
- # The base class loads data into attributes, so we can access them directly
- # if they were saved in save_checkpoint
- if hasattr(self, "iter"):
- # Restore statistics if available
- if (
- hasattr(self, "letter_distribution_stats")
- and self.letter_distribution_stats
- ):
- total_letters = sum(self.letter_distribution_stats.values())
- self.logger.info(
- f"Restored letter distribution stats with {total_letters} total letters"
- )
- if hasattr(self, "word_length_stats") and self.word_length_stats:
- total_words = sum(self.word_length_stats.values())
- self.logger.info(
- f"Restored word length stats with {total_words} total words"
- )
- if hasattr(self, "answer_format_errors"):
- self.logger.info(
- f"Restored error counts: {self.answer_format_errors} answer format errors, {self.think_format_errors} think format errors" # noqa
- ) # noqa
-
- async def close(self):
- """Clean up and save any remaining rollouts before exiting."""
- self.logger.info(
- "Closing LetterCountingEnv. Attempting to save any remaining rollouts..."
- )
- if self.config.dump_rollouts and self.rollouts_to_save_buffer:
- self.logger.info(
- f"FINAL SAVE: Found {len(self.rollouts_to_save_buffer)} groups in buffer. "
- f"Saving final batch to disk (batch #{self.save_file_batch_num})..."
- )
- await self._save_rollouts_to_jsonl()
- self.logger.info("Final save completed successfully.")
- elif self.config.dump_rollouts:
- self.logger.info("No rollouts in buffer to save upon closing.")
- else:
- self.logger.info("Data dumping is disabled - no rollouts to save.")
-
- # Call the superclass's close method if it exists
- if hasattr(super(), "close"):
- await super().close()
- self.logger.info("LetterCountingEnv closed.")
-
- async def get_next_item(self):
- """
- Get the next training item from the dataset.
-
- Returns:
- A tuple containing prompt and expected answer
- """
- # Get the next text from training set (could be a word, sentence, or random string)
- text = self.train_words[self.iter % len(self.train_words)]
-
- # Decide whether to use multiple letters
- use_multiple = (
- self.config.max_letters_to_count > 1
- and random.random() < self.config.multi_letter_probability
- )
-
- if use_multiple:
- # Choose 2 to max_letters_to_count different letters
- num_letters = random.randint(2, self.config.max_letters_to_count)
- target_letters = self._select_target_letters(text, num_letters)
- else:
- # Single letter counting
- target_letters = self._select_target_letters(text, 1)
-
- # Prepare text for counting (handle punctuation/spaces based on config)
- text_for_counting = self._prepare_text_for_counting(text)
-
- # Count occurrences for each target letter (case-insensitive)
- correct_counts = {}
- for letter in target_letters:
- correct_counts[letter] = text_for_counting.lower().count(letter.lower())
-
- # Determine if this is a text passage or word/string based on length and content
- is_text_passage = (
- len(text) > 50 or " " in text or any(c in text for c in ".,!?;:")
- )
-
- # Log item selection details for every item
- text_type = "passage" if is_text_passage else "word/string"
- text_preview = text[:50] + "..." if len(text) > 50 else text
- letters_str = ", ".join(target_letters)
- counts_str = ", ".join(
- f"{letter}:{correct_counts[letter]}" for letter in target_letters
- )
-
- # Add information about present vs absent letters for bias analysis
- present_count = sum(
- 1 for letter in target_letters if correct_counts[letter] > 0
- )
- absent_count = len(target_letters) - present_count
- bias_info = f"Present: {present_count}, Absent: {absent_count}"
-
- self.logger.info(
- f"Selected {text_type}: '{text_preview}' | Letters: [{letters_str}] | Counts: [{counts_str}] | {bias_info} (iteration {self.iter})" # noqa
- )
-
- self.iter += 1
-
- # Create the question based on whether this item is a text passage or word/string and single/multiple letters
- if len(target_letters) == 1:
- # Single letter question
- target_letter = target_letters[0]
- if is_text_passage:
- question_text = (
- f'How many {target_letter}s are in the following text: "{text}"?'
- )
- else:
- question_text = f"How many {target_letter}s are in the string {text}?"
-
- # Add instruction for single letter answer format
-            question_with_instruction = f"{question_text}\n\nProvide your answer in the format: <answer>{{number}}</answer>"  # noqa
- else:
- # Multiple letters question
- letters_str = (
- ", ".join(f"'{letter}'" for letter in target_letters[:-1])
- + f", and '{target_letters[-1]}'"
- )
- if is_text_passage:
- question_text = f'Count the occurrences of the letters {letters_str} in the following text: "{text}"'
- else:
- question_text = f"Count the occurrences of the letters {letters_str} in the string {text}"
-
- # Add instruction for multiple letter JSON answer format
- example_json = (
- "{" + ", ".join(f'"{letter}": 0' for letter in target_letters) + "}"
- )
-            question_with_instruction = f"{question_text}\n\nProvide your answer as JSON in the format: <answer>{example_json}</answer>"  # noqa
-
- # Create prompt tuple using frozensets as required
- prompt = []
-
- # Add system prompt
- prompt.append(frozenset({"role": "system", "content": system_prompt}.items()))
-
- # Add user message with the question
- prompt.append(
- frozenset({"role": "user", "content": question_with_instruction}.items())
- )
-
- # Return the prompt, correct counts, text, and target letters
- return (tuple(prompt), correct_counts, text, target_letters)
-
- async def collect_trajectories(self, item) -> Tuple[ScoredDataGroup, List]:
- """
- Generate and collect model responses for scoring.
-
- Args:
- item: Input item containing prompt and expected answer
-
- Returns:
- Tuple of lists containing scored data groups and backlog
- """
- # Extract messages from the item
- messages = []
- for role_dict in item[0]:
- messages.append(dict(role_dict))
-
- # Apply chat template to convert messages to a single string
- prompt = self.tokenizer.apply_chat_template(
- messages, add_generation_prompt=True, tokenize=False
- )
-
- async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
- # Get completions from the model
- completions = await managed.completion(
- prompt=prompt,
- n=self.config.group_size,
- max_tokens=self.config.max_generation_tokens,
- temperature=self.config.generation_temperature,
- )
-
- state = managed.get_state()
- nodes = state["nodes"]
-
- to_score = list()
-
- for i, completion_choice in enumerate(completions.choices):
- # Create a copy of the prompt messages
- trajectory_messages = []
- for role_dict in item[0]:
- trajectory_messages.append(dict(role_dict))
-
- # Add the model's response
- trajectory_messages.append(
- {"role": "assistant", "content": completion_choice.text}
- )
-
- # Add to scoring queue with expected answer and metadata
- to_score.append(
- {
- "messages": tuple(trajectory_messages),
- "correct_counts": item[1], # correct_counts (dict)
- "text": item[2], # text (word or sentence)
- "target_letters": item[3], # target_letters (list)
- "finish_reason": completion_choice.finish_reason, # stop reason
- "tokens": nodes[i].tokens,
- "masks": nodes[i].masked_tokens,
- "logprobs": nodes[i].logprobs,
- }
- )
-
- # Call score to get the scored data
- scored_data = await self.score(to_score)
-
- # Data dumping logic - save groups that meet the accuracy threshold BEFORE training filtering
- # This ensures we save high-quality data even if it's too easy for training
- if self.config.dump_rollouts:
- # First, we need to score the data to get the scores, but we'll do our own scoring for data dumping
- temp_scores = []
- for score_item in to_score:
- # Extract the model's response and score it
- model_response = score_item["messages"][-1]["content"]
- stop_reason = score_item["finish_reason"]
- expected_counts = score_item["correct_counts"]
- target_letters = score_item["target_letters"]
-
- # Handle legacy format
- if isinstance(target_letters, str):
- target_letters = [target_letters]
- expected_counts = {target_letters[0]: expected_counts}
- elif isinstance(expected_counts, int):
- expected_counts = {target_letters[0]: expected_counts}
-
- # Score this individual response
- if stop_reason == "length":
- temp_scores.append(0.0)
- else:
- expected_format = "single" if len(target_letters) == 1 else "multi"
- model_answer = self._extract_answer(model_response, expected_format)
-
- if model_answer is None:
- temp_scores.append(0.0)
- else:
- if expected_format == "single":
- expected_single_count = expected_counts[target_letters[0]]
- temp_scores.append(
- 1.0 if model_answer == expected_single_count else 0.0
- )
- else:
- if set(model_answer.keys()) == set(target_letters) and all(
- model_answer.get(letter, -1) == expected_counts[letter]
- for letter in target_letters
- ):
- temp_scores.append(1.0)
- else:
- temp_scores.append(0.0)
-
- # Check if group has appropriate difficulty for training (group average within training range)
- group_average_score = (
- sum(temp_scores) / len(temp_scores) if temp_scores else 0.0
- )
-
- # Skip groups where all scores are identical (no learning signal, including all 0.0)
- if temp_scores and all(temp_scores[0] == score for score in temp_scores):
- self.logger.debug(
- f"Skipping group save - all scores identical ({temp_scores[0]:.3f}) - no learning signal"
- )
- elif group_average_score <= self.config.max_group_average_for_training:
- self.logger.debug(
- f"Saving group with scores: {[f'{s:.3f}' for s in temp_scores]} (group_avg: {group_average_score:.3f} <= {self.config.max_group_average_for_training})" # noqa
- )
- rollouts_with_scores_to_save = []
-
- for i, score_for_rollout in enumerate(temp_scores):
- conversation_messages = to_score[i]["messages"]
- correct_counts = to_score[i]["correct_counts"]
- text = to_score[i]["text"]
- target_letters = to_score[i]["target_letters"]
- stop_reason = to_score[i]["finish_reason"]
-
- rollouts_with_scores_to_save.append(
- {
- "conversation": conversation_messages,
- "score": score_for_rollout,
- "expected_counts": correct_counts,
- "text": text,
- "target_letters": target_letters,
- "stop_reason": stop_reason,
- "group_average_score": group_average_score,
- }
- )
-
- if rollouts_with_scores_to_save:
- # Extract item info for logging
- correct_counts = item[1]
- text = item[2]
- target_letters = item[3]
- text_preview = (
- text[:30].replace(" ", "_")
- if len(text) > 30
- else text.replace(" ", "_")
- )
- letters_str = "_".join(target_letters)
- counts_str = "_".join(
- str(correct_counts.get(letter, 0)) for letter in target_letters
- )
- item_id = f"{text_preview}_{letters_str}_{counts_str}"
-
- item_data_to_save = {
- "item_id": item_id,
- "rollouts": rollouts_with_scores_to_save,
- }
- self.rollouts_to_save_buffer.append(item_data_to_save)
- self.processed_item_count += 1
-
- # Log every single sample added to buffer
- self.logger.info(
- f"BUFFER ADD: Added item '{item_id}' to buffer. "
- f"Buffer now contains {len(self.rollouts_to_save_buffer)} groups "
- f"(target batch size: {self.config.dump_batch_size})"
- )
-
- # Log progress every 10 items
- if self.processed_item_count % 10 == 0:
- self.logger.info(
- f"Data dump progress: {self.processed_item_count} items processed "
- f"(Buffer size: {len(self.rollouts_to_save_buffer)})"
- )
-
- # Save in batches when buffer reaches the configured size
- if (
- self.config.dump_rollouts
- and len(self.rollouts_to_save_buffer) >= self.config.dump_batch_size
- ):
- self.logger.info(
- f"Buffer reached batch size ({len(self.rollouts_to_save_buffer)}/{self.config.dump_batch_size}). " # noqa
- f"Saving batch to disk..."
- )
- await self._save_rollouts_to_jsonl()
-
- # Safety mechanism: save every 50 items processed to prevent data loss
- elif (
- self.config.dump_rollouts
- and self.processed_item_count % 50 == 0
- and len(self.rollouts_to_save_buffer) > 0
- ):
- self.logger.info(
- f"Safety save: {self.processed_item_count} items processed. "
- f"Saving {len(self.rollouts_to_save_buffer)} groups to prevent data loss..."
- )
- await self._save_rollouts_to_jsonl()
- else:
- self.logger.debug(
- f"Skipping group save - group too easy for training (group_avg: {group_average_score:.3f} > {self.config.max_group_average_for_training})" # noqa
- )
-
- to_backlog = []
- return scored_data, to_backlog
-
- def _extract_answer(self, text, expected_format="single"):
- """
- Extract the answer from model response (single number or JSON).
- Only allows one valid answer format - multiple answer formats result in a score of 0.
-
- Args:
- text: Text containing the model's response
- expected_format: "single" for number, "multi" for JSON
-
- Returns:
- Extracted answer (int for single, dict for multi) or None if invalid
- """
-        # Check for multiple <think> tags - score as 0 if found
-        think_tags = re.findall(r"<think>", text, re.IGNORECASE)
- if len(think_tags) > 1:
- return None
-
- # Check if the think tag is properly opened - we need exactly one opening tag
- if len(think_tags) != 1:
- return None
-
-        # Check for closing </think> tags
-        think_close_tags = re.findall(r"</think>", text, re.IGNORECASE)
- if len(think_close_tags) != 1:
- return None # Must have exactly one closing tag
-
- # Split the text into thinking and answer sections
-        parts = re.split(r"</think>", text, flags=re.IGNORECASE, maxsplit=1)
-
-        # If there's no </think> tag or multiple sections, return None
- if len(parts) != 2:
- return None
-
- thinking_section, answer_section = parts
-
- # Validate thinking section
-        # Make sure thinking section actually contains the opening <think> tag
-        if "<think>" not in thinking_section.lower():
- return None # Malformed thinking section
-
-        # Check if there are any <think> tags in the answer section (after the first </think>)
-        if "<think>" in answer_section.lower():
- return None
-
- # Look for answer tags in the answer section
- if expected_format == "single":
- # Single number format
- answer_pattern = r"\s*(\d+)\s*"
- answer_matches = re.findall(answer_pattern, answer_section, re.IGNORECASE)
-
- # If no answers found or multiple answers found, return None
- if len(answer_matches) != 1:
- return None
-
- # Return the single found answer as an integer
- try:
- return int(answer_matches[0])
- except ValueError:
- return None
- else:
- # Multi-letter JSON format
- answer_pattern = r"\s*(\{[^}]+\})\s*"
- answer_matches = re.findall(answer_pattern, answer_section, re.IGNORECASE)
-
- # If no answers found or multiple answers found, return None
- if len(answer_matches) != 1:
- return None
-
- # Try to parse the JSON
- try:
- import json
-
- answer_dict = json.loads(answer_matches[0])
-
- # Validate that all values are integers
- if not isinstance(answer_dict, dict):
- return None
-
- for key, value in answer_dict.items():
- if not isinstance(key, str) or not isinstance(value, int):
- return None
-
- return answer_dict
- except (json.JSONDecodeError, ValueError):
- return None
-
- async def score(self, rollout_group_data: List) -> Optional[ScoredDataGroup]:
- """
- Score the generated model responses against expected letter counts.
-
- Args:
- rollout_group_data: List of generated responses with expected answers
-
- Returns:
- ScoredDataGroup with tokenized inputs and scores, or None if no valid scores
- """
- scores = ScoredDataGroup()
- scores["tokens"] = list()
- scores["masks"] = list()
- scores["scores"] = list()
- scores["inference_logprobs"] = list()
-
- if not rollout_group_data:
- return None
-
- # Get the expected answer from first item
- expected_counts = rollout_group_data[0][
- "correct_counts"
- ] # correct counts (dict for multi, int for single - legacy)
- text = rollout_group_data[0]["text"] # text (word or sentence)
- target_letters = rollout_group_data[0][
- "target_letters"
- ] # target letters (list)
-
- # Handle legacy format (single letter as string, single count as int)
- if isinstance(target_letters, str):
- target_letters = [target_letters]
- expected_counts = {target_letters[0]: expected_counts}
- elif isinstance(expected_counts, int):
- # Legacy format with single letter
- expected_counts = {target_letters[0]: expected_counts}
-
- # Track statistics for all target letters
- for target_letter in target_letters:
- if target_letter not in self.letter_distribution_stats:
- self.letter_distribution_stats[target_letter] = 0
- self.letter_distribution_stats[target_letter] += 1
-
- text_len = len(text)
- if text_len not in self.word_length_stats:
- self.word_length_stats[text_len] = 0
- self.word_length_stats[text_len] += 1
-
- # Shuffle to avoid bias in selection
- random.shuffle(rollout_group_data)
-
- format_errors_in_group = 0
- think_errors_in_group = 0
-
- for item in rollout_group_data:
- # Extract the model's response
- model_response = item["messages"][-1]["content"]
- stop_reason = item["finish_reason"] # Get the stop reason
-
- # If the response was cut off due to length, give it a score of 0
- if stop_reason == "length":
- reward = 0
- if self.config.debug_logging:
- letters_str = ", ".join(target_letters)
- self.logger.debug(
- f"Text '{text[:50]}...' letters '{letters_str}': Length cutoff, score=0"
- )
- else:
- # Determine expected format and extract the answer
- expected_format = "single" if len(target_letters) == 1 else "multi"
- model_answer = self._extract_answer(model_response, expected_format)
-
- # Track metrics based on result
- if model_answer is None:
- reward = 0 # Invalid format gets 0 reward
- format_errors_in_group += 1
- # Check if it's a think format error
- if (
-                        "<think>" not in model_response.lower()
-                        or "</think>" not in model_response.lower()
- ):
- think_errors_in_group += 1
- if self.config.debug_logging:
- letters_str = ", ".join(target_letters)
- self.logger.debug(
- f"Text '{text[:50]}...' letters '{letters_str}': Format error, score=0"
- )
- else:
- # Check if answer matches expected counts
- if expected_format == "single":
- # Single letter: compare integer
- expected_single_count = expected_counts[target_letters[0]]
- if model_answer == expected_single_count:
- reward = 1
- if self.config.debug_logging:
- self.logger.debug(
- f"Text '{text[:50]}...' letter '{target_letters[0]}': Correct answer {model_answer}, score=1" # noqa
- )
- else:
- reward = 0
- if self.config.debug_logging:
- self.logger.debug(
- f"Text '{text[:50]}...' letter '{target_letters[0]}': Wrong answer {model_answer} (expected {expected_single_count}), score=0" # noqa
- )
- else:
- # Multiple letters: compare dictionaries
- # Check if all expected letters are present and counts match
- if set(model_answer.keys()) == set(target_letters) and all(
- model_answer.get(letter, -1) == expected_counts[letter]
- for letter in target_letters
- ):
- reward = 1
- if self.config.debug_logging:
- self.logger.debug(
- f"Text '{text[:50]}...' letters {target_letters}: Correct answer {model_answer}, score=1" # noqa
- )
- else:
- reward = 0
- if self.config.debug_logging:
- self.logger.debug(
- f"Text '{text[:50]}...' letters {target_letters}: Wrong answer {model_answer} (expected {expected_counts}), score=0" # noqa
- )
-
- tokens = item["tokens"]
- masks = item["masks"]
- logprobs = item["logprobs"]
-
- # Remove examples with insufficient context
- if len([1 for i in masks if i != -100]) < 10:
- continue
-
- scores["tokens"].append(tokens)
- scores["masks"].append(masks)
- scores["inference_logprobs"].append(logprobs)
- scores["scores"].append(1.0 if reward else 0.0)
-
- # Break once we have enough examples
- if len(scores["tokens"]) >= self.config.group_size:
- break
-
- if not scores["tokens"]:
- letters_str = ", ".join(target_letters)
- self.logger.warning(
- f"No valid items were scored for text '{text[:50]}...' letters '{letters_str}' - all items had insufficient context" # noqa
- )
- return None
-
- # Update global error counters
- self.answer_format_errors += format_errors_in_group
- self.think_format_errors += think_errors_in_group
-
- # Record success rate metrics for wandb logging
- for score in scores["scores"]:
- self.percent_correct_buffer.append(score)
-
- # Calculate and log average score for the current group
- current_scores = scores.get("scores", [])
- if current_scores:
- average_score = sum(current_scores) / len(current_scores)
-
- # Create log message with appropriate text preview
- text_preview = text[:50] + "..." if len(text) > 50 else text
- letters_str = ", ".join(target_letters)
- expected_str = (
- str(expected_counts)
- if len(target_letters) > 1
- else str(expected_counts[target_letters[0]])
- )
- log_message_main = (
- f"Text: '{text_preview}' | Letters: '{letters_str}' | Expected: {expected_str} | "
- f"Group average score: {average_score:.4f}"
- )
-
- if all(s == 1.0 for s in current_scores):
- self.logger.info(f"{log_message_main} (All correct in this group!)")
- elif all(s == 0.0 for s in current_scores):
- self.logger.info(
- f"{log_message_main} (All failed - format/answer errors!)"
- )
- else:
- self.logger.info(log_message_main)
-
- # Check training threshold - if group is too easy, skip it for training
- if average_score > self.config.max_group_average_for_training:
- self.logger.debug(
- f"Skipping group for training - too easy (avg: {average_score:.3f} > threshold: {self.config.max_group_average_for_training})" # noqa
- )
- return None
-
- # Check if all scores are the same (no learning signal)
- if all(scores["scores"][0] == score for score in scores["scores"]):
- self.logger.debug(
- f"All scores in group are identical ({scores['scores'][0]:.4f}) - no learning signal, skipping group"
- )
- return None
-
- return scores
-
- async def rollout_and_score_eval(self, test_text):
- """
- Generate and score model responses for a single test text.
-
- Args:
- test_text: Test text from dataset (could be word, sentence, or random string)
-
- Returns:
- Score (1 for correct, 0 for incorrect)
- """
- # Decide whether to use multiple letters (same logic as get_next_item)
- use_multiple = (
- self.config.max_letters_to_count > 1
- and random.random() < self.config.multi_letter_probability
- )
-
- if use_multiple:
- # Choose 2 to max_letters_to_count different letters
- num_letters = random.randint(2, self.config.max_letters_to_count)
- target_letters = self._select_target_letters(test_text, num_letters)
- else:
- # Single letter counting
- target_letters = self._select_target_letters(test_text, 1)
-
- # Prepare text for counting (handle punctuation/spaces based on config)
- text_for_counting = self._prepare_text_for_counting(test_text)
-
- # Count occurrences for each target letter (case-insensitive)
- expected_counts = {}
- for letter in target_letters:
- expected_counts[letter] = text_for_counting.lower().count(letter.lower())
-
- # Determine if this is a text passage or word/string based on length and content
- is_text_passage = (
- len(test_text) > 50
- or " " in test_text
- or any(c in test_text for c in ".,!?;:")
- )
-
- # Create the question based on whether this item is a text passage or word/string and single/multiple letters
- if len(target_letters) == 1:
- # Single letter question
- target_letter = target_letters[0]
- if is_text_passage:
- question_text = f'How many {target_letter}s are in the following text: "{test_text}"?'
- else:
- question_text = (
- f"How many {target_letter}s are in the string {test_text}?"
- )
-
- # Add instruction for single letter answer format
- question_with_instruction = f"{question_text}\n\nProvide your answer in the format: {{number}}" # noqa
- else:
- # Multiple letters question
- letters_str = (
- ", ".join(f"'{letter}'" for letter in target_letters[:-1])
- + f", and '{target_letters[-1]}'" # noqa
- )
- if is_text_passage:
- question_text = f'Count the occurrences of the letters {letters_str} in the following text: "{test_text}"' # noqa
- else:
- question_text = f"Count the occurrences of the letters {letters_str} in the string {test_text}" # noqa
-
- # Add instruction for multiple letter JSON answer format
- example_json = (
- "{" + ", ".join(f'"{letter}": 0' for letter in target_letters) + "}"
- )
- question_with_instruction = f"{question_text}\n\nProvide your answer as JSON in the format: {example_json}" # noqa
-
- # Create messages for model
- messages = [
- {"role": "system", "content": system_prompt},
- {"role": "user", "content": question_with_instruction},
- ]
-
- # Apply chat template to convert messages to a single string
- prompt = self.tokenizer.apply_chat_template(
- messages, add_generation_prompt=True, tokenize=False
- )
-
- async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
- # Get model completion
- completion = await managed.completion(
- prompt=prompt,
- n=1,
- max_tokens=self.config.max_generation_tokens,
- temperature=self.config.eval_temperature,
- split="eval",
- )
-
- # Extract the model's response from the completion
- model_response = completion.choices[0].text
-
- # Determine expected format and extract the answer
- expected_format = "single" if len(target_letters) == 1 else "multi"
- model_answer = self._extract_answer(model_response, expected_format)
-
- # Score 1 if the answers match, 0 otherwise
- if model_answer is None:
- score = 0
- elif expected_format == "single":
- # Single letter: compare integer
- expected_single_count = expected_counts[target_letters[0]]
- score = 1 if model_answer == expected_single_count else 0
- else:
- # Multiple letters: compare dictionaries
- score = (
- 1
- if (
- set(model_answer.keys()) == set(target_letters)
- and all(
- model_answer.get(letter, -1) == expected_counts[letter]
- for letter in target_letters
- )
- )
- else 0
- )
-
- return score
-
- async def evaluate(self, *args, **kwargs):
- """
- Evaluate the model on test data.
- """
- self.logger.info("Starting evaluation...")
- if not self.test_words:
- self.logger.warning("No test texts available for evaluation. Skipping.")
- self.eval_metrics.append(("eval/percent_correct", 0.0))
- return
-
- eval_tasks = []
- # Sample a subset of test texts for evaluation to keep it manageable
- eval_texts = random.sample(
- self.test_words, min(len(self.test_words), self.config.eval_sample_size)
- )
-
- text_type = (
- "mixed items (words and passages)"
- if self.config.use_text_passages
- else "strings"
- )
- self.logger.info(
- f"Starting evaluation on {len(eval_texts)} test {text_type}..."
- )
-
- for test_text in eval_texts:
- eval_tasks.append(self.rollout_and_score_eval(test_text))
-
- # Run evaluation
- scores = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating")
-
- if not scores:
- percent_correct = 0.0
- else:
- percent_correct = sum(scores) / len(scores)
-
- self.eval_metrics.append(("eval/percent_correct", percent_correct))
- self.logger.info(f"Evaluation finished. Percent correct: {percent_correct:.4f}")
-
- async def add_rollouts_for_wandb(
- self,
- scored_data: Union[ScoredDataGroup, List[ScoredDataGroup]],
- item: Item = None,
- ):
- if item is None or scored_data is None or not scored_data.get("tokens"):
- return
-
- expected_counts = item[1] # correct counts (dict)
- text = item[2] # text (word or sentence)
- target_letters = item[3] # target letters (list)
-
- # Handle legacy format
- if isinstance(target_letters, str):
- target_letters = [target_letters]
- expected_counts = {target_letters[0]: expected_counts}
- elif isinstance(expected_counts, int):
- expected_counts = {target_letters[0]: expected_counts}
-
- # save rollout to trajectory
- num_keep = self.config.num_rollouts_per_group_for_logging
- if num_keep == -1:
- num_keep = self.config.group_size
-
- # Make sure there's data to log
- num_keep = min(num_keep, len(scored_data["tokens"]))
- if num_keep == 0:
- return
-
- # Calculate group average score
- group_scores = scored_data.get("scores", [])
- group_average_score = (
- sum(group_scores) / len(group_scores) if group_scores else 0.0
- )
-
- current_rollouts = []
- for i in range(num_keep):
- # Ensure tokens and scores have the same length
- if i < len(scored_data["tokens"]) and i < len(scored_data["scores"]):
- # Decode the full trajectory including prompt and model response
- full_text = self.tokenizer.decode(
- scored_data["tokens"][i], skip_special_tokens=True
- )
- score_val = scored_data["scores"][i]
- expected_str = (
- str(expected_counts)
- if len(target_letters) > 1
- else str(expected_counts[target_letters[0]])
- )
- letters_str = ", ".join(target_letters)
- current_rollouts.append(
- (
- full_text,
- score_val,
- expected_str,
- text[:100],
- letters_str,
- group_average_score,
- )
- )
- else:
- self.logger.warning(
- f"Mismatch in lengths of tokens/scores for wandb logging at index {i}."
- )
-
- self.rollouts_for_wandb.append(current_rollouts)
-
- if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
- self.rollouts_for_wandb.pop(0)
-
- async def create_rollout_table(self, wandb_metrics):
- if len(self.rollouts_for_wandb) > 0:
- table = wandb.Table(
- columns=[
- "full_text",
- "score",
- "expected_counts",
- "text",
- "target_letters",
- "group_average_score",
- ]
- )
- for group in self.rollouts_for_wandb:
- for item in group:
- # Handle both old format (5 items) and new format (6 items)
- if len(item) >= 6:
- table.add_data(
- item[0], item[1], item[2], item[3], item[4], item[5]
- )
- else:
- table.add_data(item[0], item[1], item[2], item[3], item[4], 0.0)
- wandb_metrics["train/rollouts"] = table
- self.rollouts_for_wandb = []
- return wandb_metrics
-
- async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
- if wandb_metrics is None:
- wandb_metrics = {}
-
- # Try to calculate percent_correct, pass if there's a division by zero
- try:
- wandb_metrics["train/percent_correct"] = sum(
- self.percent_correct_buffer
- ) / len(self.percent_correct_buffer)
- except ZeroDivisionError:
- # Skip if buffer is empty
- pass
-
- self.percent_correct_buffer = list()
-
- # Add eval metrics
- for item in self.eval_metrics:
- wandb_metrics[item[0]] = item[1]
- self.eval_metrics = list()
-
- # Add comprehensive letter counting specific metrics
- if self.letter_distribution_stats:
- # Log letter distribution statistics
- total_letters_asked = sum(self.letter_distribution_stats.values())
- wandb_metrics["stats/total_letters_asked"] = total_letters_asked
-
- # Log most and least common letters asked
- most_common_letter = max(
- self.letter_distribution_stats, key=self.letter_distribution_stats.get
- )
- least_common_letter = min(
- self.letter_distribution_stats, key=self.letter_distribution_stats.get
- )
- wandb_metrics["stats/most_common_letter_count"] = (
- self.letter_distribution_stats[most_common_letter]
- )
- wandb_metrics["stats/least_common_letter_count"] = (
- self.letter_distribution_stats[least_common_letter]
- )
-
- # Log distribution entropy (measure of how evenly distributed the letters are)
- import math
-
- entropy = -sum(
- (count / total_letters_asked) * math.log2(count / total_letters_asked)
- for count in self.letter_distribution_stats.values()
- if count > 0
- )
- wandb_metrics["stats/letter_distribution_entropy"] = entropy
-
- if self.word_length_stats:
- # Log word length statistics
- total_words_asked = sum(self.word_length_stats.values())
- wandb_metrics["stats/total_words_asked"] = total_words_asked
-
- # Calculate average word length
- avg_word_length = (
- sum(length * count for length, count in self.word_length_stats.items())
- / total_words_asked
- )
- wandb_metrics["stats/avg_word_length"] = avg_word_length
-
- # Log min and max word lengths seen
- wandb_metrics["stats/min_word_length"] = min(self.word_length_stats.keys())
- wandb_metrics["stats/max_word_length"] = max(self.word_length_stats.keys())
-
- # Log error rates
- if self.processed_item_count > 0:
- wandb_metrics["errors/answer_format_error_rate"] = (
- self.answer_format_errors
- / (self.processed_item_count * self.config.group_size)
- )
- wandb_metrics["errors/think_format_error_rate"] = (
- self.think_format_errors
- / (self.processed_item_count * self.config.group_size)
- )
- wandb_metrics["errors/total_format_errors"] = (
- self.answer_format_errors + self.think_format_errors
- )
-
- # Log data dumping progress
- if self.config.dump_rollouts:
- wandb_metrics["data_dumps/processed_item_count"] = self.processed_item_count
- wandb_metrics["data_dumps/rollouts_buffer_size"] = len(
- self.rollouts_to_save_buffer
- )
- wandb_metrics["data_dumps/save_file_batch_num"] = self.save_file_batch_num
- wandb_metrics["data_dumps/batch_size"] = self.config.dump_batch_size
-
- # Add rollout table
- wandb_metrics = await self.create_rollout_table(wandb_metrics)
-
- await super().wandb_log(wandb_metrics)
-
-
-if __name__ == "__main__":
- LetterCountingEnv.cli()
diff --git a/environments/letter_counting_environment/config.yaml b/environments/letter_counting_environment/config.yaml
new file mode 100644
index 00000000..43da610b
--- /dev/null
+++ b/environments/letter_counting_environment/config.yaml
@@ -0,0 +1,72 @@
+# Tinker-Atropos Configuration - Letter Counting Environment
+# This environment uses adaptive difficulty (curriculum learning) with 10 tiers
+# Evaluation uses static dataset from HuggingFace: NousResearch/Letter-Counting-Eval
+
+# Environment configuration
+env:
+ # Base environment config
+ group_size: 8
+ batch_size: 256
+ max_batches_offpolicy: 3
+ tokenizer_name: "Qwen/Qwen3-8B"
+ use_wandb: true
+ rollout_server_url: "http://localhost:8000"
+ wandb_name: "letter-counting-env"
+ ensure_scores_are_not_same: true
+ max_token_length: 8192
+ max_num_workers: 24
+ worker_timeout: 3600 # 1 hour - needed for high difficulty levels
+ total_steps: 5000
+ steps_per_eval: 5
+ inference_weight: 1.0
+ data_path_to_save_groups: null
+ eval_limit_ratio: 0.1
+
+ # Generation configuration
+ generation_temperature: 1.0
+ eval_temperature: 0.6
+ max_generation_tokens: 15360
+
+ # Training filtering (CRITICAL for stable training):
+ # - Groups with >80% success rate are SKIPPED (too easy, no learning signal)
+ # - Groups with <20% success rate are SKIPPED (too hard, no learning signal)
+ # - Groups with all identical scores are SKIPPED (no variance)
+ difficulty_window_size: 150 # Number of recent groups to track (larger = more stable)
+ difficulty_increase_threshold: 0.8 # Increase difficulty if success rate > this (also skip group)
+ difficulty_decrease_threshold: 0.2 # Decrease difficulty if success rate < this (also skip group)
+ min_difficulty_level: 1 # Minimum difficulty (1 = easiest)
+ max_difficulty_level: 10 # Maximum difficulty (10 = 500 chars, 50 letters)
+ starting_difficulty_level: 4 # Start at medium difficulty
+
+ # Logging configuration
+ debug_logging: true
+ suppress_base_env_logs: true
+
+ # Data dumping configuration (for creating offline training datasets)
+ dump_rollouts: false
+ dump_batch_size: 100
+
+# OpenAI-compatible server configuration
+openai:
+ - model_name: "Qwen/Qwen3-8B"
+ base_url: "http://localhost:8001/v1"
+ api_key: "x"
+ weight: 1.0
+ num_requests_for_eval: 256
+
+# Tinker-specific example configuration
+tinker:
+ lora_rank: 32
+ learning_rate: 0.00004
+ max_token_trainer_length: 16864
+ checkpoint_dir: "./temp/"
+ save_checkpoint_interval: 0
+
+ # Wandb configuration for trainer
+ wandb_project: "tinker-letter-counting"
+ wandb_group: null
+ wandb_run_name: "tinker-letter-counting-run"
+
+# Standard Atropos flags
+slurm: false
+testing: false
diff --git a/environments/letter_counting_environment/letter_counting_environment.py b/environments/letter_counting_environment/letter_counting_environment.py
new file mode 100644
index 00000000..2398373b
--- /dev/null
+++ b/environments/letter_counting_environment/letter_counting_environment.py
@@ -0,0 +1,1553 @@
+"""
+Letter Counting Environment with Adaptive Difficulty (Curriculum Learning)
+
+This environment trains models to count letter occurrences in words and strings
+using a 10-tier difficulty system that automatically adjusts based on performance.
+
+Evaluation uses a static external dataset from HuggingFace: NousResearch/Letter-Counting-Eval
+"""
+
+import json
+import logging
+import os
+import random
+import re
+import sys
+import threading
+import uuid
+from typing import Dict, List, Optional, Tuple, Union
+
+import wandb
+import yaml
+from pydantic import Field
+from tqdm.asyncio import tqdm_asyncio
+
+from atroposlib.envs.base import (
+ APIServerConfig,
+ BaseEnv,
+ BaseEnvConfig,
+ Item,
+ ScoredDataGroup,
+)
+
+# Default path to the YAML configuration file
+DEFAULT_CONFIG_PATH = "config.yaml"
+
+
+def get_config_path() -> str:
+ """
+ Get the config path from CLI --config argument or use default.
+
+ Usage: python letter_counting_environment.py --config /path/to/config.yaml
+
+ Returns:
+ Config file path (from --config arg or DEFAULT_CONFIG_PATH)
+ """
+ for i, arg in enumerate(sys.argv):
+ if arg == "--config" and i + 1 < len(sys.argv):
+ return sys.argv[i + 1]
+ if arg.startswith("--config="):
+ return arg.split("=", 1)[1]
+ return DEFAULT_CONFIG_PATH
+
+
+# Import NLTK words corpus for training word list
+try:
+ import nltk
+ from nltk.corpus import words
+
+ try:
+ words.words()
+ except LookupError:
+ nltk.download("words")
+except ImportError:
+ print("Warning: NLTK not available. Please install with: pip install nltk")
+ words = None
+
+# Import datasets for loading external eval dataset
+try:
+ from datasets import load_dataset
+except ImportError:
+ print("Warning: datasets library not available. Install with: pip install datasets")
+ load_dataset = None
+
+
+# =============================================================================
+# DIFFICULTY TIERS CONFIGURATION
+# =============================================================================
+# This dict can be modified to adjust difficulty levels.
+# Each tier defines text length ranges, letter count ranges, and whether to use random strings.
+# Levels 1-5 use real English words, levels 6-10 use random letter strings.
+
+DIFFICULTY_TIERS = {
+ # Tier 1: Very Easy - Short words, always single letter
+ 1: {
+ "min_word_length": 3,
+ "max_word_length": 8,
+ "multi_letter_probability": 0.0,
+ "min_letters_to_count": 1,
+ "max_letters_to_count": 1,
+ "use_random_string": False,
+ },
+ # Tier 2: Easy - Short-medium words, 50% multi (1-2 letters)
+ 2: {
+ "min_word_length": 5,
+ "max_word_length": 12,
+ "multi_letter_probability": 0.5,
+ "min_letters_to_count": 1,
+ "max_letters_to_count": 2,
+ "use_random_string": False,
+ },
+ # Tier 3: Medium-Easy - Medium words, 60% multi (2-3 letters)
+ 3: {
+ "min_word_length": 8,
+ "max_word_length": 18,
+ "multi_letter_probability": 0.6,
+ "min_letters_to_count": 2,
+ "max_letters_to_count": 3,
+ "use_random_string": False,
+ },
+ # Tier 4: Medium - Longer words, 80% multi (3-5 letters)
+ 4: {
+ "min_word_length": 10,
+ "max_word_length": 25,
+ "multi_letter_probability": 0.8,
+ "min_letters_to_count": 3,
+ "max_letters_to_count": 5,
+ "use_random_string": False,
+ },
+ # Tier 5: Medium-Hard - Long words, 90% multi (5-8 letters)
+ 5: {
+ "min_word_length": 15,
+ "max_word_length": 35,
+ "multi_letter_probability": 0.9,
+ "min_letters_to_count": 5,
+ "max_letters_to_count": 8,
+ "use_random_string": False,
+ },
+ # Tier 6: Hard - Random strings, 100% multi (8-12 letters)
+ 6: {
+ "min_word_length": 40,
+ "max_word_length": 80,
+ "multi_letter_probability": 1.0,
+ "min_letters_to_count": 8,
+ "max_letters_to_count": 12,
+ "use_random_string": True,
+ },
+ # Tier 7: Very Hard - Long random strings, 100% multi (12-20 letters)
+ 7: {
+ "min_word_length": 80,
+ "max_word_length": 150,
+ "multi_letter_probability": 1.0,
+ "min_letters_to_count": 12,
+ "max_letters_to_count": 20,
+ "use_random_string": True,
+ },
+ # Tier 8: Expert - Very long strings, 100% multi (18-30 letters)
+ 8: {
+ "min_word_length": 150,
+ "max_word_length": 250,
+ "multi_letter_probability": 1.0,
+ "min_letters_to_count": 18,
+ "max_letters_to_count": 30,
+ "use_random_string": True,
+ },
+ # Tier 9: Master - Extreme strings, 100% multi (25-40 letters)
+ 9: {
+ "min_word_length": 250,
+ "max_word_length": 400,
+ "multi_letter_probability": 1.0,
+ "min_letters_to_count": 25,
+ "max_letters_to_count": 40,
+ "use_random_string": True,
+ },
+ # Tier 10: Maximum - 500 chars, 100% multi (35-50 letters)
+ 10: {
+ "min_word_length": 400,
+ "max_word_length": 500,
+ "multi_letter_probability": 1.0,
+ "min_letters_to_count": 35,
+ "max_letters_to_count": 50,
+ "use_random_string": True,
+ },
+}
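+
+
+# Illustrative sketch only (a hypothetical helper, not referenced by the environment):
+# roughly how one tier's parameters translate into a sampled item's text length and
+# number of target letters, mirroring the selection logic in get_next_item below.
+def _example_sample_from_tier(tier: Dict) -> Tuple[int, int]:
+    length = random.randint(tier["min_word_length"], tier["max_word_length"])
+    if random.random() < tier["multi_letter_probability"]:
+        num_letters = random.randint(
+            max(2, tier["min_letters_to_count"]), tier["max_letters_to_count"]
+        )
+    else:
+        num_letters = 1
+    return length, num_letters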
+
+
+class LetterCountingConfig(BaseEnvConfig):
+ """
+ Configuration class for Letter Counting Environment.
+
+ This environment uses adaptive difficulty (curriculum learning) with 10 difficulty tiers.
+ Difficulty automatically adjusts based on model performance.
+ """
+
+ # Generation configuration
+ generation_temperature: float = Field(
+ 1.0, description="Temperature for training generation"
+ )
+ eval_temperature: float = Field(
+ 0.6, description="Temperature for evaluation generation"
+ )
+ max_generation_tokens: int = Field(
+ 1024 * 15, description="Maximum tokens for model generation"
+ )
+
+ # Adaptive difficulty / curriculum learning configuration (10 tiers)
+ difficulty_window_size: int = Field(
+ 150, description="Number of recent groups to track for difficulty adjustment"
+ )
+ difficulty_increase_threshold: float = Field(
+ 0.8, description="If success rate > this, increase difficulty (skip group)"
+ )
+ difficulty_decrease_threshold: float = Field(
+ 0.2, description="If success rate < this, decrease difficulty (skip group)"
+ )
+ min_difficulty_level: int = Field(
+ 1, description="Minimum difficulty level (1 = easiest)"
+ )
+ max_difficulty_level: int = Field(
+ 10, description="Maximum difficulty level (10 = hardest)"
+ )
+ starting_difficulty_level: int = Field(
+ 3, description="Starting difficulty level (1-10)"
+ )
+
+ # Logging configuration
+ debug_logging: bool = Field(
+ True, description="Enable debug-level logging for more verbose output"
+ )
+ suppress_base_env_logs: bool = Field(
+ True, description="Suppress verbose base environment logs"
+ )
+
+ # Data dumping configuration (for creating offline training datasets)
+ dump_rollouts: bool = Field(
+ False, description="Whether to dump successful rollouts to JSONL files"
+ )
+ dump_batch_size: int = Field(
+ 100, description="Number of groups to accumulate before saving to disk"
+ )
+
+
+class LetterCountingEnv(BaseEnv):
+ """
+ Letter Counting Environment for training models to count letters in words and strings.
+
+ This environment uses adaptive difficulty (curriculum learning) with 10 difficulty tiers:
+ - Levels 1-5: Real English words with increasing length and multi-letter counting
+ - Levels 6-10: Random letter strings with extreme length and many letters
+
+ The model is presented with questions like:
+ - "How many 'a's are in the string banana?" (single letter)
+ - "Count the occurrences of the letters 'e', 'o', 't' in the string..." (multi-letter)
+
+ Expected response format:
+    - <think>reasoning</think>3 (single letter)
+    - <think>reasoning</think>{"e": 4, "o": 4, "t": 2} (multi-letter)
+
+ Training filters:
+ - Groups with >80% success rate are skipped (too easy)
+ - Groups with <20% success rate are skipped (too hard)
+ - Difficulty automatically adjusts based on rolling window performance
+
+ Evaluation uses static dataset from HuggingFace: NousResearch/Letter-Counting-Eval
+ """
+
+ name = "letter_counting"
+ env_config_cls = LetterCountingConfig
+
+ # Fixed parameters (not configurable)
+ TRAIN_TEST_SPLIT = 0.95
+ RANDOM_SEED = 42
+ MIN_WORD_LENGTH = 3
+ MAX_WORD_LENGTH = 30
+ EVAL_DATASET_HF = "NousResearch/Letter-Counting-Eval"
+
+ def __init__(
+ self,
+ config: LetterCountingConfig,
+ server_configs: List[APIServerConfig],
+ slurm=True,
+ testing=False,
+ ):
+ """
+ Initialize the Letter Counting environment.
+
+ Args:
+ config: Configuration for the environment
+ server_configs: List of server configurations for OpenAI API
+ slurm: Whether to use Slurm for distributed training
+ testing: Whether in testing mode
+ """
+ super().__init__(config, server_configs, slurm, testing)
+
+ # Run UUID for tracking
+ self.run_uuid = str(uuid.uuid4())
+
+ # Data dumping infrastructure
+ self.rollouts_to_save_buffer: List[Dict] = []
+ self.processed_item_count = 0
+ self.datadumps_dir = os.path.join(
+ os.path.dirname(os.path.abspath(__file__)), "data_dumps"
+ )
+ self.save_file_batch_num = 0
+
+ # Metrics tracking
+ self.letter_distribution_stats: Dict[str, int] = {}
+ self.word_length_stats: Dict[int, int] = {}
+ self.answer_format_errors = 0
+ self.think_format_errors = 0
+
+ # Adaptive difficulty / curriculum learning state
+ # Thread-safe lock for concurrent workers
+ self._difficulty_lock = threading.Lock()
+ self.current_difficulty_level = self.config.starting_difficulty_level
+ self.recent_scores: List[float] = [] # Rolling window of recent group scores
+ self.difficulty_history: List[Tuple[int, int, float]] = []
+
+ # Fixed evaluation dataset (loaded from HuggingFace during setup)
+ self.eval_dataset: List[Dict] = []
+
+ # Initialize the logger
+ self.logger = logging.getLogger(self.__class__.__name__)
+ if not self.logger.handlers:
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter(
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ handler.setFormatter(formatter)
+ self.logger.addHandler(handler)
+ log_level = logging.DEBUG if self.config.debug_logging else logging.INFO
+ self.logger.setLevel(log_level)
+ self.logger.disabled = False
+
+ # Suppress base environment logs if requested
+ if self.config.suppress_base_env_logs:
+ base_logger = logging.getLogger("atroposlib.envs.base")
+ base_logger.setLevel(logging.WARNING)
+
+ # Log initialization
+ self.logger.info(
+ f"LetterCountingEnv initialized with run UUID: {self.run_uuid}"
+ )
+ self.logger.info(
+ f"Adaptive difficulty: starting at L{self.current_difficulty_level}/10 "
+ f"(range: {self.config.min_difficulty_level}-{self.config.max_difficulty_level}, "
+ f"window: {self.config.difficulty_window_size}, "
+ f"thresholds: >{self.config.difficulty_increase_threshold:.0%} to increase, "
+ f"<{self.config.difficulty_decrease_threshold:.0%} to decrease)"
+ )
+ if self.config.dump_rollouts:
+ self.logger.info(
+ f"Data dumping ENABLED: saving to {self.datadumps_dir}, "
+ f"batch size: {self.config.dump_batch_size}"
+ )
+
+ self.percent_correct_buffer = []
+ self.eval_metrics = []
+ self.rollouts_for_wandb: List[List[Tuple]] = []
+
+ @classmethod
+ def config_init(cls) -> Tuple[LetterCountingConfig, List[APIServerConfig]]:
+ """
+ Load configuration from YAML file, with fallback to defaults if not found.
+
+ Config path priority:
+ 1. CLI argument: --config /path/to/config.yaml
+ 2. Default: config.yaml (DEFAULT_CONFIG_PATH)
+
+ If config file doesn't exist or can't be read, uses sensible defaults.
+ """
+ config_path = get_config_path()
+
+ # Try to load config from file, fall back to empty dicts if not found
+ try:
+ with open(config_path, "r") as f:
+ config = yaml.safe_load(f) or {}
+ print(f"Loaded config from {config_path}")
+ except FileNotFoundError:
+ print(f"Config file not found at {config_path}, using defaults")
+ config = {}
+ except Exception as e:
+ print(f"Error loading config from {config_path}: {e}, using defaults")
+ config = {}
+
+ env = config.get("env", {})
+ openai_configs = config.get("openai", [])
+
+ # Build LetterCountingConfig from YAML
+ env_config = LetterCountingConfig(
+ # Base environment config
+ tokenizer_name=env.get(
+ "tokenizer_name", "meta-llama/Llama-3.1-8B-Instruct"
+ ),
+ group_size=env.get("group_size", 16),
+ batch_size=env.get("batch_size", 256),
+ max_batches_offpolicy=env.get("max_batches_offpolicy", 3),
+ use_wandb=env.get("use_wandb", True),
+ rollout_server_url=env.get("rollout_server_url", "http://localhost:8000"),
+ wandb_name=env.get("wandb_name", "tinker-letter-counting-env"),
+ ensure_scores_are_not_same=env.get("ensure_scores_are_not_same", True),
+ max_token_length=env.get("max_token_length", 16864),
+ max_num_workers=env.get("max_num_workers", 24),
+ total_steps=env.get("total_steps", 5000),
+ steps_per_eval=env.get("steps_per_eval", 100),
+ inference_weight=env.get("inference_weight", 1.0),
+ data_path_to_save_groups=env.get("data_path_to_save_groups", None),
+ eval_limit_ratio=env.get("eval_limit_ratio", 0.1),
+ # Generation config
+ generation_temperature=env.get("generation_temperature", 1.0),
+ eval_temperature=env.get("eval_temperature", 0.6),
+ max_generation_tokens=env.get("max_generation_tokens", 15360),
+ # Adaptive difficulty config
+ difficulty_window_size=env.get("difficulty_window_size", 150),
+ difficulty_increase_threshold=env.get("difficulty_increase_threshold", 0.8),
+ difficulty_decrease_threshold=env.get("difficulty_decrease_threshold", 0.2),
+ min_difficulty_level=env.get("min_difficulty_level", 1),
+ max_difficulty_level=env.get("max_difficulty_level", 10),
+ starting_difficulty_level=env.get("starting_difficulty_level", 3),
+ # Logging config
+ debug_logging=env.get("debug_logging", True),
+ suppress_base_env_logs=env.get("suppress_base_env_logs", True),
+ # Data dumping config
+ dump_rollouts=env.get("dump_rollouts", False),
+ dump_batch_size=env.get("dump_batch_size", 100),
+ )
+
+ # Build server configs from YAML
+ server_configs = []
+ for openai_cfg in openai_configs:
+ server_configs.append(
+ APIServerConfig(
+ model_name=openai_cfg.get(
+ "model_name", "meta-llama/Llama-3.1-8B-Instruct"
+ ),
+ base_url=openai_cfg.get("base_url", "http://localhost:8001/v1"),
+ api_key=openai_cfg.get("api_key", "x"),
+ num_requests_for_eval=openai_cfg.get("num_requests_for_eval", 256),
+ server_type=openai_cfg.get("server_type"),
+ )
+ )
+
+ # Default server config if none provided
+ if not server_configs:
+ server_configs = [
+ APIServerConfig(
+ model_name="meta-llama/Llama-3.1-8B-Instruct",
+ base_url="http://localhost:8001/v1",
+ api_key="x",
+ num_requests_for_eval=256,
+ )
+ ]
+
+ return env_config, server_configs
+
+ async def setup(self):
+ """Set up the environment by loading word dataset and eval dataset."""
+ if words is None:
+ raise ImportError(
+ "NLTK is required for this environment. Please install with: pip install nltk"
+ )
+
+ # Set random seed for reproducibility
+ random.seed(self.RANDOM_SEED)
+
+ self.logger.info("Setting up word dataset from NLTK...")
+
+ # Get all English words from NLTK and filter
+ all_words = words.words()
+ filtered_words = [
+ word.lower()
+ for word in all_words
+ if word.isalpha()
+ and self.MIN_WORD_LENGTH <= len(word) <= self.MAX_WORD_LENGTH
+ ]
+
+ # Shuffle for randomness
+ random.shuffle(filtered_words)
+
+ # Create train/test split (test words used as fallback, eval uses HF dataset)
+ split_point = int(self.TRAIN_TEST_SPLIT * len(filtered_words))
+ self.train_words = filtered_words[:split_point]
+ self.test_words = filtered_words[split_point:]
+
+ self.logger.info(f"Loaded {len(filtered_words)} words total")
+ self.logger.info(f"Training words: {len(self.train_words)}")
+ self.logger.info(f"Test words: {len(self.test_words)}")
+ self.logger.info(f"Example words: {self.train_words[:10]}")
+
+ # Initialize iteration counter
+ self.iter = 0
+
+ # Load evaluation dataset from HuggingFace
+ self.eval_dataset = self._load_eval_dataset()
+
+ self.logger.info("Letter counting environment setup complete")
+
+ def _load_eval_dataset(self) -> List[Dict]:
+ """
+ Load the static evaluation dataset from HuggingFace.
+
+ Dataset: NousResearch/Letter-Counting-Eval
+ Schema: Prompt, Answer, DifficultyLevel, _text, _target_letters, _expected_counts
+ """
+ if load_dataset is None:
+ self.logger.warning(
+ "datasets library not available. Evaluation will be skipped. "
+ "Install with: pip install datasets"
+ )
+ return []
+
+ self.logger.info(
+ f"Loading eval dataset from HuggingFace: {self.EVAL_DATASET_HF}"
+ )
+
+ try:
+ dataset = load_dataset(self.EVAL_DATASET_HF, split="train")
+ except Exception as e:
+ self.logger.error(f"Failed to load eval dataset: {e}")
+ return []
+
+ eval_items = []
+ for item in dataset:
+ # Handle both formats from the schema
+ eval_item = {
+ "prompt": item.get("Prompt"),
+ "answer": item.get("Answer"),
+ "difficulty_level": item.get("DifficultyLevel", 0),
+ }
+
+ # Include raw data if available for detailed analysis
+ if "_text" in item:
+ eval_item["text"] = item["_text"]
+ if "_target_letters" in item:
+ eval_item["target_letters"] = item["_target_letters"]
+ if "_expected_counts" in item:
+ eval_item["expected_counts"] = item["_expected_counts"]
+
+ eval_items.append(eval_item)
+
+ # Log distribution by difficulty level
+ level_counts = {}
+ for item in eval_items:
+ level = item.get("difficulty_level", 0)
+ level_counts[level] = level_counts.get(level, 0) + 1
+
+ level_str = ", ".join(f"L{k}={v}" for k, v in sorted(level_counts.items()))
+ self.logger.info(f"Loaded eval dataset: {len(eval_items)} items ({level_str})")
+
+ return eval_items
+
+ def _get_difficulty_params(self, level: Optional[int] = None) -> Dict:
+ """
+ Get difficulty parameters for the specified or current difficulty level.
+
+ Args:
+ level: Optional difficulty level. If None, uses current_difficulty_level.
+
+ Returns:
+ Dict with difficulty parameters from DIFFICULTY_TIERS
+ """
+ if level is None:
+ level = self.current_difficulty_level
+
+ # Clamp to valid range
+ level = max(
+ self.config.min_difficulty_level,
+ min(self.config.max_difficulty_level, level),
+ )
+ return DIFFICULTY_TIERS.get(level, DIFFICULTY_TIERS[3])
+
+ def _generate_random_letter_string(self, min_length: int, max_length: int) -> str:
+ """
+ Generate a random string of lowercase letters for high-difficulty tiers.
+
+ Args:
+ min_length: Minimum string length
+ max_length: Maximum string length
+
+ Returns:
+ Random string of lowercase letters
+ """
+ length = random.randint(min_length, max_length)
+ return "".join(
+ random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(length)
+ )
+
+ def _select_target_letters(self, text: str, num_letters: int) -> List[str]:
+ """
+ Select target letters with randomized bias toward letters present in the text.
+
+ The bias is randomized each time (between 30% and 90%) to prevent the model
+ from learning that a fixed percentage of letters are always absent.
+ For long strings with many letters requested, the bias automatically adjusts
+ to what's actually available.
+
+ Args:
+ text: The text to analyze for present letters
+ num_letters: Number of letters to select
+
+ Returns:
+ List of selected target letters
+ """
+ all_letters = list("abcdefghijklmnopqrstuvwxyz")
+
+ # Find letters present/absent in text
+ text_lower = text.lower()
+ present_letters = [ch for ch in all_letters if ch in text_lower]
+ absent_letters = [ch for ch in all_letters if ch not in text_lower]
+
+ # Randomize the present letter bias between 30% and 90%
+ # This prevents the model from learning a fixed pattern
+ present_bias = random.uniform(0.3, 0.9)
+
+ selected_letters = []
+ present_pool = present_letters.copy()
+ absent_pool = absent_letters.copy()
+
+ for _ in range(num_letters):
+ # Calculate effective bias based on what's available
+ # If we've exhausted one pool, use the other
+ if not present_pool and not absent_pool:
+ break
+ elif not present_pool:
+ effective_bias = 0.0 # Must use absent
+ elif not absent_pool:
+ effective_bias = 1.0 # Must use present
+ else:
+ effective_bias = present_bias
+
+ # Select based on bias
+ if random.random() < effective_bias and present_pool:
+ chosen = random.choice(present_pool)
+ present_pool.remove(chosen)
+ elif absent_pool:
+ chosen = random.choice(absent_pool)
+ absent_pool.remove(chosen)
+ elif present_pool:
+ # Fallback if absent is empty
+ chosen = random.choice(present_pool)
+ present_pool.remove(chosen)
+ else:
+ break
+
+ selected_letters.append(chosen)
+
+ return selected_letters
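+
+    # For example, for the text "banana" the present pool is ["a", "b", "n"] and the
+    # remaining 23 letters form the absent pool; with num_letters=2 this may return
+    # something like ["a", "x"], mixing present and absent letters according to the
+    # randomized bias (illustrative values only).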
+
+ def _update_difficulty(self, group_score: float, item_difficulty_level: int):
+ """
+ Update difficulty level based on recent performance.
+
+ Key behaviors:
+ 1. Only counts scores from items at the CURRENT difficulty level
+ 2. Resets the rolling window when difficulty changes
+ 3. Requires min_samples before making any adjustment decision
+
+ Thread-safe: uses a lock to prevent race conditions.
+
+ Args:
+ group_score: Average score for the most recent group (0.0 to 1.0)
+ item_difficulty_level: The difficulty level when the item was generated
+ """
+ with self._difficulty_lock:
+ # Only count scores from items at the CURRENT difficulty level
+ if item_difficulty_level != self.current_difficulty_level:
+ self.logger.debug(
+ f"Ignoring score from L{item_difficulty_level} item "
+ f"(current: L{self.current_difficulty_level})"
+ )
+ return
+
+ # Add to rolling window
+ self.recent_scores.append(group_score)
+
+ # Keep only the most recent scores
+ if len(self.recent_scores) > self.config.difficulty_window_size:
+ self.recent_scores = self.recent_scores[
+ -self.config.difficulty_window_size :
+ ]
+
+ # Need at least half the window size before adjusting
+ min_samples = max(10, self.config.difficulty_window_size // 2)
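+            # e.g. with the default difficulty_window_size of 150, at least 75 group
+            # scores at the current level are needed before any adjustment is made.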
+ if len(self.recent_scores) < min_samples:
+ self.logger.debug(
+ f"Not enough samples at L{self.current_difficulty_level}: "
+ f"{len(self.recent_scores)}/{min_samples} required"
+ )
+ return
+
+ # Calculate recent success rate
+ recent_success_rate = sum(self.recent_scores) / len(self.recent_scores)
+
+ old_level = self.current_difficulty_level
+ level_changed = False
+
+ # Adjust difficulty
+ if recent_success_rate > self.config.difficulty_increase_threshold:
+ if self.current_difficulty_level < self.config.max_difficulty_level:
+ self.current_difficulty_level += 1
+ level_changed = True
+ self.logger.info(
+ f"DIFFICULTY INCREASED: L{old_level} -> L{self.current_difficulty_level} "
+ f"(success: {recent_success_rate:.1%} > {self.config.difficulty_increase_threshold:.1%})"
+ )
+ elif recent_success_rate < self.config.difficulty_decrease_threshold:
+ if self.current_difficulty_level > self.config.min_difficulty_level:
+ self.current_difficulty_level -= 1
+ level_changed = True
+ self.logger.info(
+ f"DIFFICULTY DECREASED: L{old_level} -> L{self.current_difficulty_level} "
+ f"(success: {recent_success_rate:.1%} < {self.config.difficulty_decrease_threshold:.1%})"
+ )
+
+ # Reset window when difficulty changes
+ if level_changed:
+ self.recent_scores = []
+ self.logger.info(
+ f"Window reset - collecting fresh samples at L{self.current_difficulty_level}"
+ )
+
+ # Track difficulty history
+ self.difficulty_history.append(
+ (self.iter, self.current_difficulty_level, recent_success_rate)
+ )
+
+ def _reconstruct_message_with_thinking(self, completion_choice) -> str:
+ """
+ Reconstruct a message by combining reasoning_content and content.
+
+ Modern reasoning models return reasoning in a separate `reasoning_content` field.
+        This method wraps that reasoning in <think> tags for consistent processing.
+ """
+ # Handle text completion API
+ if hasattr(completion_choice, "text"):
+ raw_content = completion_choice.text or ""
+ reasoning_content = None
+
+ if hasattr(completion_choice, "reasoning_content"):
+ reasoning_content = completion_choice.reasoning_content
+
+ if reasoning_content is None and hasattr(completion_choice, "message"):
+ message = completion_choice.message
+ if message and hasattr(message, "reasoning_content"):
+ reasoning_content = message.reasoning_content
+
+ if reasoning_content:
+                return f"<think>\n{reasoning_content}\n</think>\n{raw_content}"
+ return raw_content
+
+ # Handle chat completion API
+ elif hasattr(completion_choice, "message"):
+ message = completion_choice.message
+ content = getattr(message, "content", "") or ""
+ reasoning_content = getattr(message, "reasoning_content", None)
+
+ if reasoning_content:
+                return f"<think>\n{reasoning_content}\n</think>\n{content}"
+ return content
+
+ return ""
+
+ async def _save_rollouts_to_jsonl(self):
+ """Save buffered rollouts to a JSONL file in the data_dumps directory."""
+ if not self.rollouts_to_save_buffer:
+ self.logger.warning("_save_rollouts_to_jsonl called but buffer is empty!")
+ return
+
+ buffer_size = len(self.rollouts_to_save_buffer)
+ self.logger.info(f"Starting save of {buffer_size} groups to JSONL file...")
+
+ try:
+ if not os.path.exists(self.datadumps_dir):
+ os.makedirs(self.datadumps_dir)
+ self.logger.info(f"Created directory: {self.datadumps_dir}")
+ except OSError as e:
+ self.logger.error(f"Error creating directory {self.datadumps_dir}: {e}")
+ return
+
+ file_path = os.path.join(
+ self.datadumps_dir,
+ f"letter_counting_rollouts_{self.run_uuid}_{self.save_file_batch_num:04d}.jsonl",
+ )
+
+ try:
+ with open(file_path, "w") as f:
+ for rollout_dict in self.rollouts_to_save_buffer:
+ json.dump(rollout_dict, f)
+ f.write("\n")
+ self.logger.info(
+ f"Successfully saved {buffer_size} groups to {file_path} "
+ f"(batch #{self.save_file_batch_num})"
+ )
+ self.rollouts_to_save_buffer.clear()
+ self.save_file_batch_num += 1
+ except IOError as e:
+ self.logger.error(f"Error writing rollouts to {file_path}: {e}")
+ except Exception as e:
+ self.logger.error(f"Unexpected error saving rollouts: {e}")
+
+ def save_checkpoint(self, step, data=None):
+ """Save checkpoint including iteration and difficulty state."""
+ if data is None:
+ data = {}
+ data["iter"] = self.iter
+ data["current_difficulty_level"] = self.current_difficulty_level
+ data["recent_scores"] = self.recent_scores
+ data["processed_item_count"] = self.processed_item_count
+ data["save_file_batch_num"] = self.save_file_batch_num
+ data["letter_distribution_stats"] = self.letter_distribution_stats
+ data["word_length_stats"] = self.word_length_stats
+ data["answer_format_errors"] = self.answer_format_errors
+ data["think_format_errors"] = self.think_format_errors
+ super().save_checkpoint(step, data)
+
+ def load_checkpoint(self):
+ """Load checkpoint including iteration and difficulty state."""
+ super().load_checkpoint()
+
+ if hasattr(self, "current_difficulty_level"):
+ self.logger.info(
+ f"Restored difficulty level: L{self.current_difficulty_level}"
+ )
+ if hasattr(self, "recent_scores") and self.recent_scores:
+ self.logger.info(f"Restored {len(self.recent_scores)} recent scores")
+ if hasattr(self, "save_file_batch_num"):
+ self.logger.info(
+ f"Restored save_file_batch_num: {self.save_file_batch_num}"
+ )
+
+ async def close(self):
+ """Clean up and save any remaining rollouts before exiting."""
+ self.logger.info("Closing LetterCountingEnv...")
+
+ if self.config.dump_rollouts and self.rollouts_to_save_buffer:
+ self.logger.info(
+ f"FINAL SAVE: {len(self.rollouts_to_save_buffer)} groups in buffer. "
+ f"Saving to disk..."
+ )
+ await self._save_rollouts_to_jsonl()
+
+ if hasattr(super(), "close"):
+ await super().close()
+
+ self.logger.info("LetterCountingEnv closed.")
+
+ async def get_next_item(self):
+ """
+ Get the next training item from the dataset.
+
+ Uses adaptive difficulty to determine text length and letter count.
+ """
+ # Get difficulty parameters for current level
+ diff_params = self._get_difficulty_params()
+ min_len = diff_params["min_word_length"]
+ max_len = diff_params["max_word_length"]
+ multi_prob = diff_params["multi_letter_probability"]
+ min_letters = diff_params.get("min_letters_to_count", 1)
+ max_letters = diff_params["max_letters_to_count"]
+ use_random_string = diff_params.get("use_random_string", False)
+
+ # Get text based on difficulty level
+ if use_random_string:
+ # Higher difficulty tiers use generated random strings
+ text = self._generate_random_letter_string(min_len, max_len)
+ else:
+ # Find a word that matches the difficulty-appropriate length
+ attempts = 0
+ max_attempts = 100
+ while attempts < max_attempts:
+ candidate = self.train_words[
+ (self.iter + attempts) % len(self.train_words)
+ ]
+ if min_len <= len(candidate) <= max_len:
+ text = candidate
+ break
+ attempts += 1
+ else:
+ # Fallback: generate random string if no matching word found
+ text = self._generate_random_letter_string(min_len, max_len)
+
+ # Decide number of letters to count
+ if random.random() < multi_prob:
+ num_letters = random.randint(max(2, min_letters), max_letters)
+ else:
+ num_letters = 1
+
+ target_letters = self._select_target_letters(text, num_letters)
+
+ # Count occurrences for each target letter
+ correct_counts = {
+ letter: text.lower().count(letter) for letter in target_letters
+ }
+
+ # Log item selection
+ text_preview = text[:50] + "..." if len(text) > 50 else text
+ letters_str = ", ".join(target_letters)
+ counts_str = ", ".join(
+ f"{letter}:{correct_counts[letter]}" for letter in target_letters
+ )
+ present_count = sum(
+ 1 for letter in target_letters if correct_counts[letter] > 0
+ )
+
+ self.logger.info(
+ f"[L{self.current_difficulty_level}/10] '{text_preview}' | "
+ f"Letters: [{letters_str}] | Counts: [{counts_str}] | "
+ f"Present: {present_count}/{len(target_letters)} (iter {self.iter})"
+ )
+
+ self.iter += 1
+
+ # Create the question based on single/multiple letters
+ if len(target_letters) == 1:
+ target_letter = target_letters[0]
+ question_text = f"How many {target_letter}s are in the string {text}?"
+ question_with_instruction = (
+ f"{question_text}\n\n"
+ f"Provide your answer in the format: {{number}}"
+ )
+ else:
+ letters_str = (
+ ", ".join(f"'{letter}'" for letter in target_letters[:-1])
+ + f", and '{target_letters[-1]}'"
+ )
+ question_text = f"Count the occurrences of the letters {letters_str} in the string {text}"
+ example_json = (
+ "{" + ", ".join(f'"{letter}": 0' for letter in target_letters) + "}"
+ )
+ question_with_instruction = (
+ f"{question_text}\n\n"
+ f"Provide your answer as JSON in the format: {example_json}"
+ )
+
+ # Create prompt (user message only, no system prompt)
+ prompt = [
+ frozenset({"role": "user", "content": question_with_instruction}.items())
+ ]
+
+ # Return prompt, correct counts, text, target letters, and difficulty level
+ return (
+ tuple(prompt),
+ correct_counts,
+ text,
+ target_letters,
+ self.current_difficulty_level,
+ )
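+
+    # A returned item therefore looks roughly like (illustrative values only):
+    #   ((frozenset(user_message.items()),), {"a": 3, "n": 2}, "banana", ["a", "n"], 4)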
+
+ async def collect_trajectories(self, item) -> Tuple[ScoredDataGroup, List]:
+ """Generate and collect model responses for scoring."""
+ # Extract messages from the item
+ messages = [dict(role_dict) for role_dict in item[0]]
+
+ try:
+ async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+ completions = await managed.chat_completion(
+ messages=messages,
+ n=self.config.group_size,
+ max_tokens=self.config.max_generation_tokens,
+ temperature=self.config.generation_temperature,
+ stop=[self.tokenizer.eos_token_id],
+ )
+
+ state = managed.get_state()
+ nodes = state["nodes"]
+ except Exception as e:
+ import traceback
+
+ text_preview = item[2][:50] if len(item[2]) > 50 else item[2]
+ self.logger.error(
+ f"API call failed for '{text_preview}' (iter {self.iter}): {e}"
+ )
+ traceback.print_exc()
+ raise
+
+ to_score = []
+
+ for i, completion_choice in enumerate(completions.choices):
+ trajectory_messages = [dict(role_dict) for role_dict in item[0]]
+ reconstructed_response = self._reconstruct_message_with_thinking(
+ completion_choice
+ )
+ trajectory_messages.append(
+ {"role": "assistant", "content": reconstructed_response}
+ )
+
+ to_score.append(
+ {
+ "messages": tuple(trajectory_messages),
+ "correct_counts": item[1],
+ "text": item[2],
+ "target_letters": item[3],
+ "difficulty_level": item[4],
+ "finish_reason": completion_choice.finish_reason,
+ "tokens": nodes[i].tokens,
+ "masks": nodes[i].masked_tokens,
+ "logprobs": nodes[i].logprobs,
+ }
+ )
+
+ scored_data = await self.score(to_score)
+
+ # Data dumping logic
+ if self.config.dump_rollouts and scored_data is not None:
+ await self._handle_data_dumping(to_score, scored_data, item)
+
+ return scored_data, []
+
+ async def _handle_data_dumping(
+ self, to_score: List, scored_data: ScoredDataGroup, item
+ ):
+ """Handle data dumping for creating offline training datasets."""
+ if not scored_data.get("scores"):
+ return
+
+ scores = scored_data["scores"]
+ group_average = sum(scores) / len(scores)
+
+ # Only dump groups that have learning signal (not all same scores)
+ if all(scores[0] == s for s in scores):
+ return
+
+ # Extract successful rollouts for dumping
+ rollouts_to_dump = []
+ for i, score_val in enumerate(scores):
+ if score_val > 0: # Only save successful rollouts
+ rollouts_to_dump.append(
+ {
+ "conversation": to_score[i]["messages"],
+ "score": score_val,
+ "expected_counts": to_score[i]["correct_counts"],
+ "text": to_score[i]["text"],
+ "target_letters": to_score[i]["target_letters"],
+ "difficulty_level": to_score[i]["difficulty_level"],
+ "group_average_score": group_average,
+ }
+ )
+
+ if rollouts_to_dump:
+ text = item[2]
+ target_letters = item[3]
+ text_preview = (
+ text[:30].replace(" ", "_")
+ if len(text) > 30
+ else text.replace(" ", "_")
+ )
+ letters_str = "_".join(target_letters)
+ item_id = f"{text_preview}_{letters_str}"
+
+ self.rollouts_to_save_buffer.append(
+ {
+ "item_id": item_id,
+ "rollouts": rollouts_to_dump,
+ }
+ )
+ self.processed_item_count += 1
+
+ self.logger.debug(
+ f"BUFFER ADD: Added '{item_id}' to buffer. "
+ f"Buffer: {len(self.rollouts_to_save_buffer)}/{self.config.dump_batch_size}"
+ )
+
+ # Save when buffer reaches batch size
+ if len(self.rollouts_to_save_buffer) >= self.config.dump_batch_size:
+ await self._save_rollouts_to_jsonl()
+
+ def _extract_answer(self, text: str, expected_format: str = "single"):
+ """
+ Extract the answer from model response.
+
+        Only one well-formed answer is accepted; multiple or malformed answers return None.
+
+ Args:
+ text: The model's response text
+ expected_format: "single" for number, "multi" for JSON
+
+ Returns:
+ Extracted answer (int for single, dict for multi) or None if invalid
+ """
+ # Check for exactly one opening and closing tag
+        think_tags = re.findall(r"<think>", text, re.IGNORECASE)
+ if len(think_tags) != 1:
+ return None
+
+        think_close_tags = re.findall(r"</think>", text, re.IGNORECASE)
+ if len(think_close_tags) != 1:
+ return None
+
+ # Split into thinking and answer sections
+        parts = re.split(r"</think>", text, flags=re.IGNORECASE, maxsplit=1)
+ if len(parts) != 2:
+ return None
+
+ thinking_section, answer_section = parts
+
+ # Validate thinking section contains opening tag
+ if "" not in thinking_section.lower():
+ return None
+
+ # No additional think tags in answer section
+ if "" in answer_section.lower():
+ return None
+
+ # Extract answer based on format
+ if expected_format == "single":
+ answer_pattern = r"\s*(\d+)\s*"
+ matches = re.findall(answer_pattern, answer_section, re.IGNORECASE)
+ if len(matches) != 1:
+ return None
+ try:
+ return int(matches[0])
+ except ValueError:
+ return None
+ else:
+ answer_pattern = r"\s*(\{[^}]+\})\s*"
+ matches = re.findall(answer_pattern, answer_section, re.IGNORECASE)
+ if len(matches) != 1:
+ return None
+ try:
+ answer_dict = json.loads(matches[0])
+ if not isinstance(answer_dict, dict):
+ return None
+ for key, value in answer_dict.items():
+ if not isinstance(key, str) or not isinstance(value, int):
+ return None
+ return answer_dict
+ except (json.JSONDecodeError, ValueError):
+ return None
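+
+    # Illustration (assuming the <think> handling above): a response must contain
+    # exactly one <think>...</think> block, and exactly one answer match after the
+    # closing tag; a reply missing </think>, or with two candidate answers in the
+    # answer section, yields None and is scored 0.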
+
+ async def score(self, rollout_group_data: List) -> Optional[ScoredDataGroup]:
+ """Score the generated model responses against expected letter counts."""
+ scores = ScoredDataGroup()
+ scores["tokens"] = []
+ scores["masks"] = []
+ scores["scores"] = []
+ scores["inference_logprobs"] = []
+
+ if not rollout_group_data:
+ return None
+
+ expected_counts = rollout_group_data[0]["correct_counts"]
+ text = rollout_group_data[0]["text"]
+ target_letters = rollout_group_data[0]["target_letters"]
+ item_difficulty_level = rollout_group_data[0].get("difficulty_level", 0)
+
+ # Track statistics
+ for target_letter in target_letters:
+ if target_letter not in self.letter_distribution_stats:
+ self.letter_distribution_stats[target_letter] = 0
+ self.letter_distribution_stats[target_letter] += 1
+
+ text_len = len(text)
+ if text_len not in self.word_length_stats:
+ self.word_length_stats[text_len] = 0
+ self.word_length_stats[text_len] += 1
+
+ # Shuffle to avoid selection bias
+ random.shuffle(rollout_group_data)
+
+ format_errors_in_group = 0
+ think_errors_in_group = 0
+
+ for item in rollout_group_data:
+ model_response = item["messages"][-1]["content"]
+ stop_reason = item["finish_reason"]
+
+ if stop_reason == "length":
+ reward = 0
+ else:
+ expected_format = "single" if len(target_letters) == 1 else "multi"
+ model_answer = self._extract_answer(model_response, expected_format)
+
+ if model_answer is None:
+ reward = 0
+ format_errors_in_group += 1
+ if (
+                        "<think>" not in model_response.lower()
+                        or "</think>" not in model_response.lower()
+ ):
+ think_errors_in_group += 1
+ else:
+ if expected_format == "single":
+ expected_single_count = expected_counts[target_letters[0]]
+ reward = 1 if model_answer == expected_single_count else 0
+ else:
+ if set(model_answer.keys()) == set(target_letters) and all(
+ model_answer.get(letter, -1) == expected_counts[letter]
+ for letter in target_letters
+ ):
+ reward = 1
+ else:
+ reward = 0
+
+ tokens = item["tokens"]
+ masks = item["masks"]
+ logprobs = item["logprobs"]
+
+ # Remove examples with insufficient context
+ if len([1 for i in masks if i != -100]) < 10:
+ continue
+
+ scores["tokens"].append(tokens)
+ scores["masks"].append(masks)
+ scores["inference_logprobs"].append(logprobs)
+ scores["scores"].append(1.0 if reward else 0.0)
+
+ if len(scores["tokens"]) >= self.config.group_size:
+ break
+
+ if not scores["tokens"]:
+ self.logger.warning(f"No valid items scored for '{text[:50]}...'")
+ return None
+
+ # Update global error counters
+ self.answer_format_errors += format_errors_in_group
+ self.think_format_errors += think_errors_in_group
+
+ # Record success rate
+ for score in scores["scores"]:
+ self.percent_correct_buffer.append(score)
+
+ # Calculate group average and check training thresholds
+ current_scores = scores.get("scores", [])
+ if not current_scores:
+ return None
+
+ average_score = sum(current_scores) / len(current_scores)
+
+ # Update adaptive difficulty
+ self._update_difficulty(average_score, item_difficulty_level)
+
+ # Log group results
+ text_preview = text[:50] + "..." if len(text) > 50 else text
+ letters_str = ", ".join(target_letters)
+ expected_str = (
+ str(expected_counts)
+ if len(target_letters) > 1
+ else str(expected_counts[target_letters[0]])
+ )
+
+ self.logger.info(
+ f"'{text_preview}' | Letters: '{letters_str}' | Expected: {expected_str} | "
+ f"Score: {average_score:.2f} | L{item_difficulty_level} (current: L{self.current_difficulty_level})"
+ )
+
+ # CRITICAL: Training difficulty filtering
+ # Skip groups that are too easy (>80%) or too hard (<20%)
+ if average_score > self.config.difficulty_increase_threshold:
+ self.logger.debug(
+ f"Skipping group - too easy (avg: {average_score:.2f} > "
+ f"{self.config.difficulty_increase_threshold})"
+ )
+ return None
+
+ if average_score < self.config.difficulty_decrease_threshold:
+ self.logger.debug(
+ f"Skipping group - too hard (avg: {average_score:.2f} < "
+ f"{self.config.difficulty_decrease_threshold})"
+ )
+ return None
+
+ # Skip if all scores are identical (no learning signal)
+ if all(scores["scores"][0] == score for score in scores["scores"]):
+ self.logger.debug(
+ f"All scores identical ({scores['scores'][0]:.2f}) - skipping group"
+ )
+ return None
+
+ return scores
+
+ async def rollout_and_score_eval(self, eval_item: Dict) -> Tuple[int, int]:
+ """
+ Generate and score model response for a single eval item.
+
+ Uses the pre-formatted prompt from the HuggingFace dataset.
+
+ Args:
+ eval_item: Item from the eval dataset with 'prompt' and 'answer' fields
+
+ Returns:
+ Tuple of (score, difficulty_level)
+ """
+ prompt = eval_item["prompt"]
+ expected_answer = eval_item["answer"]
+ difficulty_level = eval_item.get("difficulty_level", 0)
+
+ # Determine expected format from answer
+ try:
+ # Try to parse as JSON (multi-letter)
+ parsed_answer = json.loads(expected_answer)
+ if isinstance(parsed_answer, dict):
+ expected_format = "multi"
+ expected_counts = parsed_answer
+ else:
+ expected_format = "single"
+ expected_counts = int(expected_answer)
+ except (json.JSONDecodeError, ValueError, TypeError):
+ # Single number
+ expected_format = "single"
+ try:
+ expected_counts = int(expected_answer)
+ except (ValueError, TypeError):
+ # Can't parse answer, skip this item
+ self.logger.warning(f"Could not parse eval answer: {expected_answer}")
+ return (0, difficulty_level)
+
+ # Create messages and get completion
+ messages = [{"role": "user", "content": prompt}]
+
+ async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+ completion = await managed.chat_completion(
+ messages=messages,
+ n=1,
+ max_tokens=self.config.max_generation_tokens,
+ temperature=self.config.eval_temperature,
+ split="eval",
+ stop=[self.tokenizer.eos_token_id],
+ )
+
+ model_response = self._reconstruct_message_with_thinking(completion.choices[0])
+ model_answer = self._extract_answer(model_response, expected_format)
+
+ # Score based on format
+ if model_answer is None:
+ score = 0
+ elif expected_format == "single":
+ score = 1 if model_answer == expected_counts else 0
+ else:
+ # Multi-letter: compare dicts
+ score = 1 if model_answer == expected_counts else 0
+
+ return (score, difficulty_level)
+
+ async def evaluate(self, *args, **kwargs):
+ """
+ Evaluate the model on the static HuggingFace eval dataset.
+
+ Tracks and logs accuracy for each difficulty level separately.
+ """
+ self.logger.info("Starting evaluation...")
+
+ if not self.eval_dataset:
+ self.logger.warning("No eval dataset available. Skipping evaluation.")
+ self.eval_metrics.append(("eval/percent_correct", 0.0))
+ return
+
+ self.logger.info(
+ f"Evaluating on {len(self.eval_dataset)} items from {self.EVAL_DATASET_HF}..."
+ )
+
+ eval_tasks = [
+ self.rollout_and_score_eval(eval_item) for eval_item in self.eval_dataset
+ ]
+ results = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating")
+
+ if not results:
+ self.eval_metrics.append(("eval/percent_correct", 0.0))
+ self.logger.warning("No evaluation results returned.")
+ return
+
+ # Calculate overall and per-difficulty scores
+ all_scores = [r[0] for r in results]
+ level_scores = {}
+
+ for score, difficulty_level in results:
+ if difficulty_level not in level_scores:
+ level_scores[difficulty_level] = []
+ level_scores[difficulty_level].append(score)
+
+ # Overall accuracy
+ percent_correct = sum(all_scores) / len(all_scores) if all_scores else 0.0
+ self.eval_metrics.append(("eval/percent_correct", percent_correct))
+
+ # Per-difficulty accuracy
+ level_accuracies = {}
+ for level in sorted(level_scores.keys()):
+ if level_scores[level]:
+ level_acc = sum(level_scores[level]) / len(level_scores[level])
+ level_accuracies[level] = level_acc
+ self.eval_metrics.append((f"eval/percent_correct_L{level}", level_acc))
+
+ level_summary = " | ".join(
+ f"L{lvl}: {level_accuracies.get(lvl, 0):.0%}"
+ for lvl in sorted(level_accuracies.keys())
+ )
+ self.logger.info(
+ f"Evaluation finished. Overall: {percent_correct:.1%} | {level_summary}"
+ )
+
+ async def add_rollouts_for_wandb(
+ self,
+ scored_data: Union[ScoredDataGroup, List[ScoredDataGroup]],
+ item: Item = None,
+ ):
+ """Add rollouts for WandB logging."""
+ if item is None or scored_data is None or not scored_data.get("tokens"):
+ return
+
+ expected_counts = item[1]
+ text = item[2]
+ target_letters = item[3]
+
+ num_keep = self.config.num_rollouts_per_group_for_logging
+ if num_keep == -1:
+ num_keep = self.config.group_size
+
+ num_keep = min(num_keep, len(scored_data["tokens"]))
+ if num_keep == 0:
+ return
+
+ group_scores = scored_data.get("scores", [])
+ group_average_score = (
+ sum(group_scores) / len(group_scores) if group_scores else 0.0
+ )
+
+ current_rollouts = []
+ for i in range(num_keep):
+ if i < len(scored_data["tokens"]) and i < len(scored_data["scores"]):
+ full_text = self.tokenizer.decode(
+ scored_data["tokens"][i], skip_special_tokens=True
+ )
+ score_val = scored_data["scores"][i]
+ expected_str = (
+ str(expected_counts)
+ if len(target_letters) > 1
+ else str(expected_counts[target_letters[0]])
+ )
+ letters_str = ", ".join(target_letters)
+ current_rollouts.append(
+ (
+ full_text,
+ score_val,
+ expected_str,
+ text[:100],
+ letters_str,
+ group_average_score,
+ )
+ )
+
+ self.rollouts_for_wandb.append(current_rollouts)
+
+ if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
+ self.rollouts_for_wandb.pop(0)
+
+ async def create_rollout_table(self, wandb_metrics):
+ """Create a WandB table with rollout examples."""
+ if len(self.rollouts_for_wandb) > 0:
+ table = wandb.Table(
+ columns=[
+ "full_text",
+ "score",
+ "expected_counts",
+ "text",
+ "target_letters",
+ "group_average_score",
+ ]
+ )
+ for group in self.rollouts_for_wandb:
+ for item in group:
+ if len(item) >= 6:
+ table.add_data(
+ item[0], item[1], item[2], item[3], item[4], item[5]
+ )
+ else:
+ table.add_data(item[0], item[1], item[2], item[3], item[4], 0.0)
+ wandb_metrics["train/rollouts"] = table
+ self.rollouts_for_wandb = []
+ return wandb_metrics
+
+ async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+ """Log metrics to WandB."""
+ if wandb_metrics is None:
+ wandb_metrics = {}
+
+ # Training accuracy
+ try:
+ wandb_metrics["train/percent_correct"] = sum(
+ self.percent_correct_buffer
+ ) / len(self.percent_correct_buffer)
+ except ZeroDivisionError:
+ pass
+
+ self.percent_correct_buffer = []
+
+ # Eval metrics
+ for item in self.eval_metrics:
+ wandb_metrics[item[0]] = item[1]
+ self.eval_metrics = []
+
+ # Letter distribution stats
+ if self.letter_distribution_stats:
+ total_letters_asked = sum(self.letter_distribution_stats.values())
+ wandb_metrics["stats/total_letters_asked"] = total_letters_asked
+
+ most_common = max(
+ self.letter_distribution_stats, key=self.letter_distribution_stats.get
+ )
+ least_common = min(
+ self.letter_distribution_stats, key=self.letter_distribution_stats.get
+ )
+ wandb_metrics["stats/most_common_letter_count"] = (
+ self.letter_distribution_stats[most_common]
+ )
+ wandb_metrics["stats/least_common_letter_count"] = (
+ self.letter_distribution_stats[least_common]
+ )
+
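+            # Shannon entropy (in bits) of how often each target letter has been
+            # asked about; it peaks at log2(num_letters) when sampling is uniform.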
+ import math
+
+ entropy = -sum(
+ (count / total_letters_asked) * math.log2(count / total_letters_asked)
+ for count in self.letter_distribution_stats.values()
+ if count > 0
+ )
+ wandb_metrics["stats/letter_distribution_entropy"] = entropy
+
+ # Word length stats
+ if self.word_length_stats:
+ total_words = sum(self.word_length_stats.values())
+ wandb_metrics["stats/total_words_asked"] = total_words
+
+ avg_length = (
+ sum(length * count for length, count in self.word_length_stats.items())
+ / total_words
+ )
+ wandb_metrics["stats/avg_word_length"] = avg_length
+ wandb_metrics["stats/min_word_length"] = min(self.word_length_stats.keys())
+ wandb_metrics["stats/max_word_length"] = max(self.word_length_stats.keys())
+
+ # Error rates
+ if self.processed_item_count > 0:
+ wandb_metrics["errors/answer_format_error_rate"] = (
+ self.answer_format_errors
+ / (self.processed_item_count * self.config.group_size)
+ )
+ wandb_metrics["errors/think_format_error_rate"] = (
+ self.think_format_errors
+ / (self.processed_item_count * self.config.group_size)
+ )
+ wandb_metrics["errors/total_format_errors"] = (
+ self.answer_format_errors + self.think_format_errors
+ )
+
+ # Adaptive difficulty metrics
+ wandb_metrics["curriculum/difficulty_level"] = self.current_difficulty_level
+ if self.recent_scores:
+ wandb_metrics["curriculum/recent_success_rate"] = sum(
+ self.recent_scores
+ ) / len(self.recent_scores)
+ wandb_metrics["curriculum/samples_in_window"] = len(self.recent_scores)
+
+ # Data dumping metrics
+ if self.config.dump_rollouts:
+ wandb_metrics["data_dumps/buffer_size"] = len(self.rollouts_to_save_buffer)
+ wandb_metrics["data_dumps/batches_saved"] = self.save_file_batch_num
+
+ # Rollout table
+ wandb_metrics = await self.create_rollout_table(wandb_metrics)
+
+ await super().wandb_log(wandb_metrics)
+
+
+if __name__ == "__main__":
+ LetterCountingEnv.cli()
diff --git a/environments/text_reversal_environment.py b/environments/text_reversal_environment.py
new file mode 100644
index 00000000..0b41e2ec
--- /dev/null
+++ b/environments/text_reversal_environment.py
@@ -0,0 +1,1260 @@
+import asyncio
+import random
+import re
+import time
+from typing import Dict, List, Optional, Tuple, Union
+
+import wandb
+from datasets import load_dataset
+from pydantic import Field
+from tqdm.asyncio import tqdm_asyncio
+
+from atroposlib.envs.base import (
+ APIServerConfig,
+ BaseEnv,
+ BaseEnvConfig,
+ EvalHandlingEnum,
+ Item,
+ ScoredDataGroup,
+)
+from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
+
+
+class TextReversalConfig(BaseEnvConfig):
+ """Configuration for TextReversalEnv with thinking mode and configurable options."""
+
+ thinking_mode: bool = Field(
+ default=False,
+ description="Whether to enable thinking mode with tags.",
+ )
+
+ custom_base_system_prompt: Optional[str] = Field(
+ default=None,
+ description="Custom thinking prompt. If None, uses the default thinking prompt.",
+ )
+
+ custom_thinking_prompt: Optional[str] = Field(
+ default=None,
+ description="Custom thinking prompt. If None, uses the default thinking prompt.",
+ )
+
+ eval_temperature: float = Field(
+ default=0.6,
+ description="Temperature for evaluation completions.",
+ )
+
+ rollout_temperature: float = Field(
+ default=1.0,
+ description="Temperature for training rollout completions.",
+ )
+
+ eval_max_tokens: int = Field(
+ default=1024 * 16,
+ description="Maximum tokens for evaluation completions.",
+ )
+
+ train_max_tokens: int = Field(
+ default=1024 * 7,
+ description="Maximum tokens for training completions.",
+ )
+
+ # Retry configuration
+ max_retries: int = Field(
+ default=3,
+ ge=1,
+ description="Maximum number of retries for failed API calls.",
+ )
+
+ retry_delay: float = Field(
+ default=1.0,
+ ge=0.0,
+ description="Delay in seconds between retry attempts.",
+ )
+
+ min_response_length: int = Field(
+ default=5,
+ ge=1,
+ description="Minimum response length to consider valid (filters out EOS-only responses).",
+ )
+
+ # Dataset configuration
+ train_dataset: str = Field(
+ default="NousResearch/TextReversalDataset",
+ description="Training dataset name (HuggingFace) or path to local JSONL file.",
+ )
+
+ eval_dataset: str = Field(
+ default="NousResearch/TextReversalDataset",
+ description="Evaluation dataset name (HuggingFace) or path to local JSONL file.",
+ )
+
+ train_split: str = Field(
+ default="train",
+ description="Split to use for training dataset (only for HuggingFace datasets).",
+ )
+
+ eval_split: str = Field(
+ default="test",
+ description="Split to use for evaluation dataset (only for HuggingFace datasets).",
+ )
+
+ # Debug configuration
+ full_debug: bool = Field(
+ default=False,
+ description="Enable full debug mode - logs every API request and response with truncated content.",
+ )
+
+
+class TextReversalEnv(BaseEnv):
+ name = "text_reversal"
+ env_config_cls = TextReversalConfig
+
+ def __init__(
+ self,
+ config: TextReversalConfig,
+ server_configs: List[APIServerConfig],
+ slurm=True,
+ testing=False,
+ ):
+ super().__init__(config, server_configs, slurm, testing)
+ self.config: TextReversalConfig = config
+ self.percent_correct_buffer = []
+ self.eval_metrics = []
+
+ # Tracking for training metrics
+ self.successful_reversals = 0
+ self.failed_reversals = 0
+ self.format_errors = 0 # Failed to follow format
+ self.total_attempts = 0
+ self.rollouts_for_wandb = []
+
+ # Pre-compile regex patterns for performance
+ self._think_pattern = re.compile(r"")
+ self._think_close_pattern = re.compile(r"")
+ self._think_content_pattern = re.compile(r"\s*(.*)", re.DOTALL)
+ self._thinking_extract_pattern = re.compile(r"(.*?)", re.DOTALL)
+ self._reversed_text_pattern = re.compile(
+ r"\s*(.*?)\s*", re.DOTALL
+ )
+
+ # System prompts (use custom thinking prompt if provided, otherwise default)
+ self.thinking_system_prompt = self._get_thinking_prompt()
+ self.base_system_prompt = self.config.custom_base_system_prompt or ""
+
+ def _get_thinking_prompt(self) -> str:
+ """Get thinking system prompt."""
+ return (
+ self.config.custom_thinking_prompt
+ if self.config.custom_thinking_prompt
+ else "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
+ "problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
+ "solution prior to answering. You should enclose your thoughts and internal monologue inside "
+ " tags, and then provide your solution or response to the problem."
+ )
+
+ def _format_debug_text(self, text: str, label: str) -> str:
+ """Format text for debug output (first 100 + last 100 chars)."""
+ if not text:
+ return f"{label}: "
+
+ text_clean = text.strip()
+ if len(text_clean) <= 200:
+ return f"{label}: '{text_clean}'"
+
+ first_100 = text_clean[:100]
+ last_100 = text_clean[-100:]
+ return f"{label}: '{first_100}...{last_100}' (total {len(text_clean)} chars)"
+
+ def _log_full_debug_request(
+ self,
+ messages: List[Dict],
+ params: Dict,
+ context: str = "",
+ item_id: str = "unknown",
+ ):
+ """Log full debug information for API requests."""
+ if not self.config.full_debug:
+ return
+
+ print(f"\n🔍 FULL DEBUG - API REQUEST [{context}]")
+ print(f" Item ID: {item_id}")
+ print(f" Parameters: {params}")
+
+ for i, message in enumerate(messages):
+ role = message.get("role", "unknown")
+ content = message.get("content", "")
+ print(
+ f" Message {i+1} ({role}): {self._format_debug_text(content, 'Content')}"
+ )
+
+ def _log_full_debug_response(self, completion, context: str = ""):
+ """Log full debug information for API responses."""
+ if not self.config.full_debug:
+ return
+
+ print(f"\n🔍 FULL DEBUG - API RESPONSE [{context}]")
+
+ if hasattr(completion, "usage"):
+ print(f" Usage: {completion.usage}")
+
+ if hasattr(completion, "choices") and completion.choices:
+ for i, choice in enumerate(completion.choices):
+ content = choice.message.content if hasattr(choice, "message") else ""
+ finish_reason = (
+ choice.finish_reason
+ if hasattr(choice, "finish_reason")
+ else "unknown"
+ )
+ print(
+ f" Choice {i+1}: {self._format_debug_text(content, 'Response')}"
+ )
+ print(f" Finish reason: {finish_reason}")
+ else:
+ print(" No choices in response")
+ print(f" Completion object: {completion}")
+
+ def _reset_metrics(self) -> None:
+ """Reset training metrics."""
+ self.percent_correct_buffer = []
+ self.successful_reversals = 0
+ self.failed_reversals = 0
+ self.format_errors = 0
+ self.total_attempts = 0
+
+ def _create_system_content(self) -> str:
+ """Create system message content based on thinking mode."""
+ if self.config.thinking_mode:
+ return f"{self.thinking_system_prompt}\n\n{self.base_system_prompt}"
+ return self.base_system_prompt
+
+ @classmethod
+ def config_init(cls) -> Tuple[TextReversalConfig, List[APIServerConfig]]:
+ env_config = TextReversalConfig(
+ tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
+ group_size=8,
+ use_wandb=True,
+ max_num_workers_per_node=8,
+ rollout_server_url="http://localhost:8000",
+ total_steps=2000,
+ batch_size=1024,
+ steps_per_eval=25,
+ train_max_tokens=1024 * 7,
+ eval_max_tokens=1024 * 8,
+ inference_weight=1.0,
+ wandb_name="text_reversal",
+ eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
+ eval_limit_ratio=0.1,
+ min_batch_allocation=0.1,
+ thinking_mode=True,
+ custom_base_system_prompt=None,
+ # Debug and retry configuration
+ full_debug=False, # Set to True to enable detailed API request/response logging
+ max_retries=3,
+ retry_delay=1.0,
+ min_response_length=5,
+ )
+ server_configs = [
+ APIServerConfig(
+ model_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
+ base_url="http://localhost:9004/v1",
+ api_key="x",
+ ),
+ ]
+ return env_config, server_configs
+
+ async def setup(self) -> None:
+ """Set up the environment by loading datasets."""
+ # Load training dataset
+ try:
+ self.train = self._load_dataset(
+ self.config.train_dataset, self.config.train_split
+ )
+            # Shuffle the training dataset with a fixed seed so runs are reproducible
+ if hasattr(self.train, "shuffle"):
+ self.train = self.train.shuffle(seed=42)
+ else:
+ # For list-like objects, convert to list and shuffle
+ train_list = list(self.train)
+ random.seed(42)
+ random.shuffle(train_list)
+ self.train = train_list
+ except Exception as e:
+ print(f"Error loading training dataset '{self.config.train_dataset}': {e}")
+ # Create minimal fallback data in expected format
+ self.train = [
+ {"text": "Hello world"},
+ {"text": "Python programming"},
+ {"text": "Machine learning"},
+ {"text": "Artificial intelligence"},
+ {"text": "Deep learning models"},
+ ] * 20 # 100 examples
+ print(f"Using fallback training data with {len(self.train)} examples")
+
+ # Load evaluation dataset
+ try:
+ self.test = self._load_dataset(
+ self.config.eval_dataset, self.config.eval_split
+ )
+ except Exception as e:
+ print(f"Error loading evaluation dataset '{self.config.eval_dataset}': {e}")
+ raise # Evaluation dataset must work
+
+ # Analyze training dataset composition
+ if hasattr(self.train, "__iter__"):
+ total_train_items = len(self.train)
+ print(f"\nTraining dataset analysis ({total_train_items} total items):")
+
+ # Show some sample text lengths
+ text_lengths = []
+ for item in list(self.train)[:100]: # Sample first 100 items
+ text = item.get("text", "")
+ text_lengths.append(len(text))
+
+ if text_lengths:
+ avg_length = sum(text_lengths) / len(text_lengths)
+ min_length = min(text_lengths)
+ max_length = max(text_lengths)
+ print(
+ f" - Sample text lengths: avg={avg_length:.1f}, min={min_length}, max={max_length}"
+ )
+
+ # Analyze evaluation dataset composition
+ if hasattr(self.test, "__iter__"):
+ total_eval_items = len(self.test)
+ print(f"\nEvaluation dataset analysis ({total_eval_items} total items):")
+
+ # Show some sample text lengths
+ text_lengths = []
+ for item in list(self.test)[:100]: # Sample first 100 items
+ text = item.get("text", "")
+ text_lengths.append(len(text))
+
+ if text_lengths:
+ avg_length = sum(text_lengths) / len(text_lengths)
+ min_length = min(text_lengths)
+ max_length = max(text_lengths)
+ print(
+ f" - Sample text lengths: avg={avg_length:.1f}, min={min_length}, max={max_length}"
+ )
+
+ # Show configuration info
+ print("\nText Reversal Configuration:")
+ print(
+ f" - Training dataset: {self.config.train_dataset} (split: {self.config.train_split})"
+ )
+ print(
+ f" - Evaluation dataset: {self.config.eval_dataset} (split: {self.config.eval_split})"
+ )
+ print(f" - Thinking mode: {self.config.thinking_mode}")
+ print(f" - Eval temperature: {self.config.eval_temperature}")
+
+ # Show sample training item structure
+ if len(self.train) > 0:
+ try:
+ sample_train_item = self.train[0]
+ print("\nSample training item structure:")
+ print(f"- Available keys: {list(sample_train_item.keys())}")
+ if "text" in sample_train_item:
+ print(f"- Text: {sample_train_item['text'][:100]}...")
+ except Exception as e:
+ print(f"Warning: Could not display sample training item structure: {e}")
+
+ # Show debug mode status
+ if self.config.full_debug:
+ print(
+ "\n🔍 FULL DEBUG MODE ENABLED - Will log all API requests and responses"
+ )
+ print(" 📊 Will show: first/last 100 chars of prompts and responses")
+ print(
+ f" ⚙️ Retry settings: max_retries={self.config.max_retries}, retry_delay={self.config.retry_delay}s"
+ )
+ print(f" 📏 Min response length: {self.config.min_response_length} chars")
+ else:
+ print(
+ "\n🔍 Full debug mode disabled - Use full_debug=True to enable detailed logging"
+ )
+
+ self.iter = 0
+
+ def _load_dataset(self, dataset_path: str, split: str = None) -> List[Dict]:
+ """
+ Load dataset using HuggingFace load_dataset (supports both HF datasets and local files).
+
+ Args:
+ dataset_path: Either HuggingFace dataset name or path to local file
+ split: Split to use
+
+ Returns:
+ List of dataset items
+ """
+ import os
+
+ try:
+ # Check if it's a local file
+ if os.path.exists(dataset_path):
+ # Local file - use appropriate loader based on extension
+ if dataset_path.endswith(".jsonl") or dataset_path.endswith(".json"):
+ dataset = load_dataset(
+ "json", data_files=dataset_path, split=split or "train"
+ )
+ elif dataset_path.endswith(".csv"):
+ dataset = load_dataset(
+ "csv", data_files=dataset_path, split=split or "train"
+ )
+ elif dataset_path.endswith(".parquet"):
+ dataset = load_dataset(
+ "parquet", data_files=dataset_path, split=split or "train"
+ )
+ else:
+ # Try JSON as default
+ dataset = load_dataset(
+ "json", data_files=dataset_path, split=split or "train"
+ )
+
+ print(
+ f"Loaded local dataset from {dataset_path} with {len(dataset)} examples"
+ )
+
+ else:
+ # HuggingFace dataset
+ if split:
+ dataset = load_dataset(dataset_path, split=split)
+ else:
+ dataset_dict = load_dataset(dataset_path)
+ # If no split specified, try to get the first available split
+ if hasattr(dataset_dict, "keys"):
+ available_splits = list(dataset_dict.keys())
+ if available_splits:
+ dataset = dataset_dict[available_splits[0]]
+ print(
+ f"No split specified, using '{available_splits[0]}' split"
+ )
+ else:
+ dataset = dataset_dict
+ else:
+ dataset = dataset_dict
+
+ print(
+ f"Loaded HuggingFace dataset {dataset_path} with {len(dataset)} examples"
+ )
+
+ return dataset
+
+ except Exception as e:
+ print(f"Error loading dataset {dataset_path}: {e}")
+ raise
+
+ def save_checkpoint(self, step: int, data: Optional[Dict] = None) -> None:
+ """Save checkpoint including iteration state."""
+ if data is None:
+ data = {}
+ data["iter"] = self.iter
+ super().save_checkpoint(step, data)
+
+ def _extract_reversed_text(self, response: str) -> Optional[str]:
+ """
+        Extract text from within <reversed_text> tags.
+
+ Args:
+ response: Model response text
+
+ Returns:
+            Extracted text or None if not found or multiple <reversed_text> blocks found
+ """
+ if self.config.thinking_mode:
+ # Check for exactly one pair of think tags
+ think_open_count = len(self._think_pattern.findall(response))
+ think_close_count = len(self._think_close_pattern.findall(response))
+
+ if think_open_count != 1 or think_close_count != 1:
+ return None
+
+            # Parse only content after the closing </think> tag
+ match = self._think_content_pattern.search(response)
+ if match:
+ response = match.group(1)
+ else:
+ return None
+
+        # Find all content between <reversed_text> tags
+ matches = self._reversed_text_pattern.findall(response)
+
+ # Must have exactly one reversed_text block
+ if len(matches) != 1:
+ return None
+
+ return matches[0].strip()
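+        # Example (in thinking mode): a response like
+        #   "<think>reverse char by char</think><reversed_text>dlrow olleH</reversed_text>"
+        # yields "dlrow olleH"; missing or duplicated tags return None instead.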
+
+ def _create_reversal_prompt(self, text: str) -> str:
+ """Create the user prompt for text reversal task."""
+ return (
+ "Please reverse the characters of the following text and wrap your reversed text in "
+ " tags.\n\n"
+ f"The text to reverse:\n{text}"
+ )
+
+ async def get_next_item(self) -> Item:
+ """Generate next training item with text reversal task."""
+ self.iter += 1
+
+ # Get next training example sequentially
+ example = self.train[self.iter % len(self.train)]
+
+ # Extract text from training data
+ original_text = example.get("text", "")
+ if not original_text:
+ # Fallback if text field is missing
+ original_text = "Hello"
+
+ # Create the expected reversed text (ground truth)
+ expected_reversed = original_text[::-1]
+
+ # Create system message
+ system_content = self._create_system_content()
+
+ # Create user prompt
+ user_content = self._create_reversal_prompt(original_text)
+
+ prompt = tuple(
+ [
+ frozenset({"role": "system", "content": system_content}.items()),
+ frozenset({"role": "user", "content": user_content}.items()),
+ ]
+ )
+
+ return (prompt, expected_reversed)
+
+ def prepare_eval_item(self, item: dict) -> Tuple[Optional[Tuple], Optional[str]]:
+ """
+ Prepare an evaluation item from the text reversal dataset.
+ """
+ try:
+ original_text = item.get("text", "")
+
+ # Validate required fields
+ if not original_text:
+ return None, None
+
+ # Create the expected reversed text (ground truth)
+ expected_reversed = original_text[::-1]
+
+ # Create system message
+ system_content = self._create_system_content()
+
+ # Create user prompt
+ user_content = self._create_reversal_prompt(original_text)
+
+ prompt = tuple(
+ [
+ frozenset({"role": "system", "content": system_content}.items()),
+ frozenset({"role": "user", "content": user_content}.items()),
+ ]
+ )
+
+ return prompt, expected_reversed
+
+ except Exception as e:
+ print(f"Error preparing evaluation item: {e}")
+ print(f"DEBUG: Exception type: {type(e)}")
+ print(f"DEBUG: item keys: {list(item.keys()) if item else 'item is None'}")
+ return None, None
+
+ async def collect_trajectories(self, item: Item) -> Tuple[ScoredDataGroup, List]:
+ """Collect and score model trajectories."""
+ messages = self._convert_messages_to_list(item[0])
+ completion_params = self._get_train_completion_params()
+
+ # Retry logic for training trajectories
+ max_retries = self.config.max_retries
+ retry_delay = self.config.retry_delay
+
+ # Get item info for debug logging
+ item_id = f"train_{self.iter if hasattr(self, 'iter') else 'unknown'}"
+
+ for attempt in range(max_retries):
+ try:
+ # Log full debug request
+ self._log_full_debug_request(
+ messages,
+ completion_params,
+ f"TRAINING attempt {attempt + 1}/{max_retries}",
+ item_id,
+ )
+
+ completions = await self.server.chat_completion(
+ messages=messages, **completion_params
+ )
+
+ # Log full debug response
+ self._log_full_debug_response(
+ completions, f"TRAINING attempt {attempt + 1}/{max_retries}"
+ )
+
+ # Check if we got valid completions
+ if not completions.choices:
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: No choices in collect_trajectories (attempt {attempt + 1}/{max_retries})"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: No choices in collect_trajectories after {max_retries} attempts"
+ )
+ return None, []
+
+ # Check if any completion has None content
+ valid_completions = []
+ for completion_choice in completions.choices:
+ if (
+ completion_choice.message.content is not None
+ and isinstance(completion_choice.message.content, str)
+ and len(completion_choice.message.content.strip())
+ >= self.config.min_response_length
+ ):
+ valid_completions.append(completion_choice)
+
+ # If we don't have enough valid completions, retry
+ if (
+ len(valid_completions) < len(completions.choices) // 2
+ ): # If less than half are valid
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: Only {len(valid_completions)}/{len(completions.choices)} "
+ f"valid completions (attempt {attempt + 1}/{max_retries})"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: Only {len(valid_completions)}/{len(completions.choices)} "
+ f"valid completions after {max_retries} attempts"
+ )
+ # Continue with what we have
+
+ # Build trajectories using valid completions
+ to_score = []
+ for completion_choice in valid_completions:
+ # Add assistant response to existing messages
+ trajectory_messages = messages + [
+ {
+ "role": "assistant",
+ "content": completion_choice.message.content,
+ }
+ ]
+ to_score.append((tuple(trajectory_messages), item[1]))
+
+ # Success - we got at least some valid trajectories
+ break
+
+ except Exception as e:
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: collect_trajectories API call failed (attempt {attempt + 1}/{max_retries}): {e}"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: collect_trajectories API call failed after {max_retries} attempts: {e}"
+ )
+ return None, []
+
+ scored_data = await self.score(to_score)
+
+ # Add rollouts for wandb visualization
+ if scored_data is not None:
+ await self.add_rollouts_for_wandb(scored_data, item)
+
+ return scored_data, []
+
+ def _convert_messages_to_list(self, prompt_tuple: Tuple) -> List[Dict]:
+ """Convert frozenset message format to list format."""
+ messages = []
+ for role_dict in prompt_tuple:
+ messages.append(dict(role_dict))
+ return messages
+
+ def _get_train_completion_params(self) -> Dict:
+ """Get completion parameters for training rollouts."""
+ return {
+ "n": self.config.group_size,
+ "max_tokens": self.config.train_max_tokens,
+ "temperature": self.config.rollout_temperature,
+ }
+
+ def _get_eval_completion_params(self) -> Dict:
+ """Get completion parameters for evaluation."""
+ return {
+ "n": 1,
+ "max_tokens": self.config.eval_max_tokens,
+ "temperature": self.config.eval_temperature,
+ "split": "eval",
+ }
+
+ async def score(self, rollout_group_data: List[Tuple]) -> Optional[ScoredDataGroup]:
+ """Score a group of rollout data."""
+ if not rollout_group_data:
+ return None
+
+ try:
+ scores = ScoredDataGroup()
+ scores["tokens"] = []
+ scores["masks"] = []
+ scores["scores"] = []
+
+ random.shuffle(rollout_group_data)
+
+ for item in rollout_group_data:
+ # Simplified validation
+ if not item or len(item) < 2 or not item[0]:
+ continue
+
+ model_response = item[0][-1]["content"]
+ expected_reversed = item[1]
+
+ # Extract reversed text from model response
+ extracted_reversed = self._extract_reversed_text(model_response)
+
+ # Score 1.0 if exact match, 0.0 otherwise
+ reward = 1.0 if extracted_reversed == expected_reversed else 0.0
+
+ # Track metrics
+ self.total_attempts += 1
+ if reward == 1.0:
+ self.successful_reversals += 1
+ else:
+ self.failed_reversals += 1
+ if extracted_reversed is None:
+ self.format_errors += 1
+
+ # Tokenize the conversation for learning
+ out_dict = tokenize_for_trainer(self.tokenizer, item[0])
+ tokens = out_dict["tokens"]
+ masks = out_dict["masks"]
+
+ # Skip obviously bad examples
+ if len([1 for mask in masks if mask != -100]) < 10:
+ continue
+
+ scores["tokens"].append(tokens)
+ scores["masks"].append(masks)
+ scores["scores"].append(reward)
+
+ if len(scores["tokens"]) >= self.config.group_size:
+ break
+
+ if not scores["tokens"]:
+ return None
+
+ # Group-level logging after scoring is complete
+ group_successes = sum(1 for score in scores["scores"] if score == 1.0)
+ group_size = len(scores["scores"])
+ any_success = group_successes > 0
+ success_indicator = "✅" if any_success else "❌"
+
+ # Calculate running totals
+ total_success_rate = (
+ (self.successful_reversals / self.total_attempts * 100)
+ if self.total_attempts > 0
+ else 0.0
+ )
+
+ print(
+ f"{success_indicator} Group scored: {group_successes}/{group_size} successful reversals | "
+ f"Total success rate: {self.successful_reversals}/{self.total_attempts} ({total_success_rate:.1f}%)"
+ )
+
+ # Update percent correct buffer
+ for score in scores["scores"]:
+ self.percent_correct_buffer.append(max(score, 0))
+
+ # Return None if all scores are the same (no learning signal)
+ if len(set(scores["scores"])) == 1:
+ return None
+
+ return scores
+
+ except Exception as e:
+ print(f"Error in score method: {e}")
+ print(f"DEBUG: Exception type: {type(e)}")
+ print(
+ f"DEBUG: rollout_group_data length: {len(rollout_group_data) if rollout_group_data else 'None'}"
+ )
+ if rollout_group_data:
+ print(f"DEBUG: first item type: {type(rollout_group_data[0])}")
+ print(
+ f"DEBUG: first item length: {len(rollout_group_data[0]) if rollout_group_data[0] else 'None'}"
+ )
+ return None
+
+ async def rollout_and_score_eval(self, test_item: dict) -> dict:
+ """Rollout and score evaluation."""
+ try:
+ prompt, expected_reversed = self.prepare_eval_item(test_item)
+ if prompt is None:
+ return {"score": 0.0, "sample": None}
+
+ messages = self._convert_messages_to_list(prompt)
+ completion_params = self._get_eval_completion_params()
+
+ # Retry logic for failed API calls
+ max_retries = self.config.max_retries
+ retry_delay = self.config.retry_delay
+
+ # Get item info for debug logging
+ item_id = test_item.get("id", f"eval_{hash(test_item.get('text', ''))}")
+
+ for attempt in range(max_retries):
+ try:
+ # Log full debug request
+ self._log_full_debug_request(
+ messages,
+ completion_params,
+ f"EVAL attempt {attempt + 1}/{max_retries}",
+ item_id,
+ )
+
+ completion = await self.server.chat_completion(
+ messages=messages, **completion_params
+ )
+
+ # Log full debug response
+ self._log_full_debug_response(
+ completion, f"EVAL attempt {attempt + 1}/{max_retries}"
+ )
+
+ if not completion.choices:
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: No choices in completion (attempt {attempt + 1}/{max_retries})"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(f"DEBUG: No choices after {max_retries} attempts")
+ return {"score": 0.0, "sample": None}
+
+ model_response = completion.choices[0].message.content
+
+ # Check for None content or very short responses
+ if model_response is None:
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: model_response is None (attempt {attempt + 1}/{max_retries})"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: model_response is None after {max_retries} attempts"
+ )
+ return {"score": 0.0, "sample": None}
+
+ if not isinstance(model_response, str):
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: model_response is not a string. "
+ f"Type: {type(model_response)} "
+ f"(attempt {attempt + 1}/{max_retries})"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: model_response is not a string after "
+ f"{max_retries} attempts. Type: {type(model_response)}"
+ )
+ return {"score": 0.0, "sample": None}
+
+ # Check for very short responses (likely just EOS token)
+ if len(model_response.strip()) < self.config.min_response_length:
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: Very short response (likely EOS token only): "
+ f"'{model_response}' (attempt {attempt + 1}/{max_retries})"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: Very short response after {max_retries} "
+ f"attempts: '{model_response}'"
+ )
+ return {"score": 0.0, "sample": None}
+
+ # Success - we got a valid response
+ break
+
+ except Exception as e:
+ if attempt < max_retries - 1:
+ print(
+ f"DEBUG: API call failed (attempt {attempt + 1}/{max_retries}): {e}"
+ )
+ await asyncio.sleep(retry_delay)
+ continue
+ else:
+ print(
+ f"DEBUG: API call failed after {max_retries} attempts: {e}"
+ )
+ raise
+
+ # Extract reversed text from model response
+ extracted_reversed = self._extract_reversed_text(model_response)
+
+ # Score 1.0 if exact match, 0.0 otherwise
+ score = 1.0 if extracted_reversed == expected_reversed else 0.0
+
+ # Get original text from the user message
+ original_text = test_item.get("text", "")
+
+ # Add full conversation including model response
+ full_messages = messages + [
+ {"role": "assistant", "content": model_response}
+ ]
+
+ sample = {
+ "messages": full_messages,
+ "original_text": original_text,
+ "expected_reversed": expected_reversed,
+ "extracted_reversed": extracted_reversed,
+ "score": int(score),
+ "correct": bool(score),
+ "finish_reason": completion.choices[0].finish_reason,
+ "thinking_mode": self.config.thinking_mode,
+ "format_compliant": extracted_reversed is not None,
+ "dataset_item_id": item_id,
+ }
+
+ # Add thinking-specific parsing info
+ if self.config.thinking_mode:
+ if "" in model_response:
+ sample["response_after_think"] = model_response.split("")[
+ -1
+ ].strip()
+ sample["thinking_content"] = self._thinking_extract_pattern.search(
+ model_response
+ )
+ if sample["thinking_content"]:
+ sample["thinking_content"] = (
+ sample["thinking_content"].group(1).strip()
+ )
+ else:
+ sample["response_after_think"] = model_response
+ sample["thinking_content"] = None
+
+ return {"score": score, "sample": sample}
+
+ except Exception as e:
+ print(f"Error in evaluation: {e}")
+ print(f"DEBUG: Exception type: {type(e)}")
+ print(
+ f"DEBUG: test_item keys: {list(test_item.keys()) if test_item else 'test_item is None'}"
+ )
+ return {"score": 0.0, "sample": None}
+
+ async def evaluate(self, *args, **kwargs) -> None:
+ """Evaluate the model on the test dataset."""
+ start_time = time.time()
+
+ try:
+ eval_tasks = [
+ self.rollout_and_score_eval(test_item) for test_item in self.test
+ ]
+ results = await tqdm_asyncio.gather(*eval_tasks)
+
+ # Filter valid results
+ valid_results = [
+ result
+ for result in results
+ if not isinstance(result, Exception)
+ and result
+ and result.get("sample") is not None
+ ]
+
+ if not valid_results:
+ print("Warning: No valid evaluation results obtained")
+ return
+
+ except Exception as e:
+ print(f"Error during evaluation: {e}")
+ return
+
+ # Extract scores and samples from valid results
+ scores = [result["score"] for result in valid_results]
+ samples = [result["sample"] for result in valid_results]
+ valid_scores = [s for s in scores if s is not None]
+
+ if not valid_scores:
+ print("Warning: No valid scores found during evaluation")
+ return
+
+ percent_correct = sum(valid_scores) / len(valid_scores)
+ self.eval_metrics.append(("eval/percent_correct", percent_correct))
+
+ # Calculate additional metrics
+ format_compliant = sum(
+ 1 for sample in samples if sample.get("format_compliant", False)
+ )
+
+ thinking_mode_used = self.config.thinking_mode
+
+ # Get response metrics
+ response_lengths = []
+ thinking_utilization = 0
+
+ for sample in samples:
+ if not sample:
+ continue
+
+ # Track response length
+ messages = sample.get("messages", [])
+ if messages:
+ assistant_msg = messages[-1].get("content", "")
+ response_lengths.append(len(assistant_msg))
+
+ # Track thinking utilization in thinking mode
+ if thinking_mode_used:
+ thinking_content = sample.get("thinking_content")
+ if thinking_content:
+ thinking_utilization += 1
+
+ # Response length metrics
+ if response_lengths:
+ avg_response_length = sum(response_lengths) / len(response_lengths)
+ response_length_std = (
+ sum((x - avg_response_length) ** 2 for x in response_lengths)
+ / len(response_lengths)
+ ) ** 0.5
+ self.eval_metrics.append(("eval/avg_response_length", avg_response_length))
+ self.eval_metrics.append(("eval/response_length_std", response_length_std))
+
+ # Thinking utilization rate
+ if thinking_mode_used and samples:
+ thinking_utilization_rate = thinking_utilization / len(samples)
+ self.eval_metrics.append(
+ ("eval/thinking_utilization_rate", thinking_utilization_rate)
+ )
+
+ # Add overall dataset statistics
+ total_dataset_items = len(self.test) if hasattr(self, "test") else 0
+ evaluated_items = len(samples)
+ self.eval_metrics.append(("eval/total_dataset_items", total_dataset_items))
+ self.eval_metrics.append(("eval/evaluated_items", evaluated_items))
+ self.eval_metrics.append(("eval/valid_scores", len(valid_scores)))
+ self.eval_metrics.append(
+ (
+ "eval/format_compliance_rate",
+ format_compliant / len(samples) if samples else 0.0,
+ )
+ )
+
+ end_time = time.time()
+
+ # Build evaluation metrics dict
+ eval_metrics = {
+ "eval/percent_correct": percent_correct,
+ "eval/total_samples": len(samples),
+ "eval/correct_samples": sum(valid_scores),
+ "eval/format_compliance_rate": (
+ format_compliant / len(samples) if samples else 0.0
+ ),
+ }
+
+ # Add response length metrics
+ if response_lengths:
+ eval_metrics["eval/avg_response_length"] = avg_response_length
+ eval_metrics["eval/response_length_std"] = response_length_std
+
+ # Add thinking utilization
+ if thinking_mode_used and samples:
+ eval_metrics["eval/thinking_utilization_rate"] = thinking_utilization_rate
+
+ try:
+ await self.evaluate_log(
+ metrics=eval_metrics,
+ samples=samples,
+ start_time=start_time,
+ end_time=end_time,
+ generation_parameters={
+ "temperature": self.config.eval_temperature,
+ "max_tokens": self.config.eval_max_tokens,
+ "thinking_mode": thinking_mode_used,
+ },
+ )
+ except Exception as e:
+ print(f"Error logging evaluation results: {e}")
+
+ async def add_rollouts_for_wandb(
+ self,
+ scored_data: Union[ScoredDataGroup, List[ScoredDataGroup]],
+ item: Item = None,
+ ) -> None:
+ """Add rollouts to wandb for visualization."""
+ if item is None or scored_data is None or not scored_data.get("tokens"):
+ return
+
+ # Extract ground truth
+ expected_reversed = item[1]
+
+ # Extract original text from the item prompt
+ original_text = "unknown_text"
+ try:
+ # The item[0] contains the prompt tuple with system and user messages
+ for role_dict in item[0]:
+ role_dict_converted = dict(role_dict)
+ if role_dict_converted.get("role") == "user":
+ user_content = role_dict_converted.get("content", "")
+ # Extract original text from the user message
+ lines = user_content.split("\n")
+ for line in lines:
+ if (
+ line.strip()
+ and not line.startswith("Please reverse")
+ and not line.startswith("The text to reverse:")
+ ):
+ original_text = line.strip()
+ break
+ break
+ except Exception as e:
+ print(f"DEBUG: Exception in add_rollouts_for_wandb text extraction: {e}")
+ original_text = "extraction_failed"
+
+ # Keep a reasonable number of rollouts
+ num_keep = self.config.num_rollouts_per_group_for_logging
+ if num_keep == -1:
+ num_keep = self.config.group_size
+
+ num_keep = min(num_keep, len(scored_data["tokens"]))
+
+ current_rollouts = []
+ mode = "thinking" if self.config.thinking_mode else "direct"
+
+ for i in range(num_keep):
+ # Decode the full trajectory
+ full_text = self.tokenizer.decode(
+ scored_data["tokens"][i], skip_special_tokens=True
+ )
+ score_val = scored_data["scores"][i]
+
+ # Extract the model's reversed text
+ extracted_reversed = "unknown"
+ try:
+ # Try to get model response from messages or decode from tokens
+ messages = scored_data.get("messages", [])
+ if i < len(messages) and isinstance(messages[i], list) and messages[i]:
+ model_response = messages[i][-1].get("content", "")
+ else:
+ # Fallback to decoding tokens
+ model_response = full_text
+
+ extracted_reversed = (
+ self._extract_reversed_text(model_response) or "format_error"
+ )
+ except Exception as e:
+ print(
+ f"DEBUG: Exception in add_rollouts_for_wandb reversal parsing: {e}"
+ )
+ extracted_reversed = "parse_error"
+
+ current_rollouts.append(
+ (
+ full_text,
+ score_val,
+ expected_reversed,
+ extracted_reversed,
+ original_text,
+ mode,
+ )
+ )
+
+ self.rollouts_for_wandb.append(current_rollouts)
+
+ # Keep only recent rollouts
+ if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
+ self.rollouts_for_wandb.pop(0)
+
+ async def create_rollout_table(self, wandb_metrics: Dict) -> Dict:
+ """Create wandb table for rollout visualization."""
+ if not self.rollouts_for_wandb:
+ return wandb_metrics
+
+ table = wandb.Table(
+ columns=[
+ "full_text",
+ "score",
+ "expected_reversed",
+ "extracted_reversed",
+ "original_text",
+ "mode",
+ ]
+ )
+
+ for group_rollouts in self.rollouts_for_wandb:
+ for rollout_tuple in group_rollouts:
+ if len(rollout_tuple) == 6:
+ table.add_data(*rollout_tuple)
+
+ wandb_metrics["train/rollouts"] = table
+ self.rollouts_for_wandb = []
+ return wandb_metrics
+
+ async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+ """Log metrics to wandb."""
+ if wandb_metrics is None:
+ wandb_metrics = {}
+
+ # Basic accuracy metrics
+ if self.percent_correct_buffer:
+ wandb_metrics["train/percent_correct"] = sum(
+ self.percent_correct_buffer
+ ) / len(self.percent_correct_buffer)
+
+ # Reversal-specific metrics
+ if self.total_attempts > 0:
+ wandb_metrics["train/success_rate"] = (
+ self.successful_reversals / self.total_attempts
+ )
+ wandb_metrics["train/failure_rate"] = (
+ self.failed_reversals / self.total_attempts
+ )
+ wandb_metrics["train/format_error_rate"] = (
+ self.format_errors / self.total_attempts
+ )
+ wandb_metrics["train/format_compliance_rate"] = 1.0 - (
+ self.format_errors / self.total_attempts
+ )
+
+ # Configuration metrics
+ wandb_metrics.update(
+ {
+ "train/thinking_mode_enabled": (
+ 1.0 if self.config.thinking_mode else 0.0
+ ),
+ "train/total_attempts": self.total_attempts,
+ "train/successful_reversals": self.successful_reversals,
+ "train/failed_reversals": self.failed_reversals,
+ "train/format_errors": self.format_errors,
+ "config/group_size": self.config.group_size,
+ "config/train_max_tokens": self.config.train_max_tokens,
+ "config/eval_max_tokens": self.config.eval_max_tokens,
+ }
+ )
+
+ # Reset training metrics
+ self._reset_metrics()
+
+ # Add evaluation metrics
+ for metric_name, metric_value in self.eval_metrics:
+ wandb_metrics[metric_name] = metric_value
+ self.eval_metrics = []
+
+ # Add rollout table
+ wandb_metrics = await self.create_rollout_table(wandb_metrics)
+
+ await super().wandb_log(wandb_metrics)
+
+
+if __name__ == "__main__":
+ TextReversalEnv.cli()
diff --git a/environments/tool_use_turnlevel_advantage_server.py b/environments/tool_use_turnlevel_advantage_server.py
new file mode 100644
index 00000000..7ade9146
--- /dev/null
+++ b/environments/tool_use_turnlevel_advantage_server.py
@@ -0,0 +1,1875 @@
+"""
+Multi-Turn Tool-Calling Environment with Turn-Level Advantages (MT-GRPO)
+=========================================================================
+
+Implements turn-level credit assignment following the MT-GRPO approach from:
+"Reinforcing Multi-Turn Reasoning in LLM Agents via Turn-Level Credit Assignment"
+(Zeng et al., 2025) - https://arxiv.org/abs/2505.11821
+
+This environment provides a flexible base for multi-turn tool-calling tasks with
+fine-grained advantage estimation. Users can customize by overriding:
+
+ • `compute_turn_reward()` - Define turn-level reward signals (R^T_τ)
+ • `compute_outcome_reward()` - Define outcome-level reward (R^O)
+ • `validate_tool_call_turn()` - Custom tool call validation
+ • `validate_summary_turn()` - Custom summary/narration validation
+ • `get_next_item()` - Custom data loading
+
+Key MT-GRPO formula (Equation 7 from paper):
+ Â_τ = Â^T_τ + λ · Â^O for all turns τ = 1, 2, ..., T
+
+Where:
+ - Â^T_τ = standardized turn-level advantage
+ - Â^O = standardized outcome-level advantage
+ - λ = turn_level_advantage_lambda (default 1.0)
+
+Dataset columns expected
+------------------------
+* `conversations` – list[dict] with keys `from` and `value`
+"""
+
+import ast
+import asyncio
+import json
+import random
+import re
+from collections import Counter
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import wandb
+from datasets import load_dataset
+from pydantic import Field
+from tqdm.asyncio import tqdm_asyncio
+
+from atroposlib.envs.base import (
+ APIServerConfig,
+ BaseEnv,
+ BaseEnvConfig,
+ EvalHandlingEnum,
+ Item,
+ ScoredDataGroup,
+)
+from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+
+
+class MTGRPOEnvConfig(BaseEnvConfig):
+ """
+ Configuration for Multi-Turn Tool Calling with Turn-Level Advantages.
+
+ Key MT-GRPO parameters:
+ - turn_level_advantage_lambda: λ coefficient for combining turn and outcome advantages
+ - wrong_call_penalty: Penalty for incorrect tool calls (negative reward)
+ - tool_execution_reward: Reward for valid tool call structure
+ - tool_match_reward: Reward for correct tool call content
+ - summary_reward: Reward for valid summary/narration turns
+ """
+
+ # MT-GRPO specific parameters
+ turn_level_advantage_lambda: float = Field(
+ default=1.0,
+ description="λ coefficient for MT-GRPO advantage: Â_τ = Â^T_τ + λ·Â^O. Paper uses 1.0.",
+ )
+ wrong_call_penalty: float = Field(
+ default=-0.2,
+ description="Penalty applied when tool call validation fails (R^T component).",
+ )
+ tool_execution_reward: float = Field(
+ default=0.5,
+ description="Reward for valid tool call structure ( + ).",
+ )
+ tool_match_reward: float = Field(
+ default=0.5,
+ description="Reward for tool call content matching expected.",
+ )
+ summary_reward: float = Field(
+ default=1.0,
+ description="Reward for valid summary/narration turn.",
+ )
+
+ # Environment structure parameters
+ max_tool_call_turns_cap: Optional[int] = Field(
+ default=2,
+ description="Upper cap on tool-calling turns per episode (None → no cap).",
+ )
+ validate_think_blocks: bool = Field(
+ default=True,
+ description="Whether to require blocks in assistant messages.",
+ )
+ generate_all_gpt_turns: bool = Field(
+ default=True,
+ description="Generate GPT turns after tool responses, including summaries.",
+ )
+ max_gen_per_turn: int = Field(
+ default=2048,
+ description="Max tokens the model may generate in a single turn.",
+ )
+
+ # Data handling parameters
+ skip_completed: bool = Field(
+ default=True,
+ description="Skip conversations whose first user prompt appears in completed tasks.",
+ )
+ completed_dataset_id: Optional[str] = Field(
+ default=None,
+ description="Dataset id for completed tasks (used when skip_completed=True).",
+ )
+ scenario_category: str = Field(
+ default="all",
+ description='Scenario type: "single", "multistep", "multiturn", or "all" (accepts everything).',
+ )
+ min_tool_call_turns: int = Field(
+ default=1,
+ description="Minimum number of tool-calling turns required (set to 2 for MT-GRPO focus).",
+ )
+ min_successful_turns: int = Field(
+ default=1,
+ description="Minimum successful turns to keep a rollout (0 = keep all, even failures).",
+ )
+ add_dynamic_few_shot: bool = Field(
+ default=True,
+ description="Insert most-recent successful example into system prompt.",
+ )
+ use_parallel_requests: bool = Field(
+ default=True,
+ description="Use parallel requests instead of n parameter for batching.",
+ )
+
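+
+# Illustrative sketch only (not part of the environment's control flow): the
+# per-turn advantage combination from Equation 7 in the module docstring, once
+# the turn-level and outcome-level advantages have already been standardised
+# over the rollout group. The helper name and its arguments are hypothetical
+# and exist purely to restate the formula in code.
+def _mtgrpo_combined_advantage_sketch(
+    turn_advantages: List[float], outcome_advantage: float, lam: float = 1.0
+) -> List[float]:
+    """Return Â_τ = Â^T_τ + λ·Â^O for every turn τ of a single rollout."""
+    return [a_turn + lam * outcome_advantage for a_turn in turn_advantages]
+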
+
+# ==============================================================================
+# Default Prompts (Override in subclass if needed)
+# ==============================================================================
+
+DEFAULT_SYSTEM_PROMPT = (
+ "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
+ "problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
+ "solution prior to answering. You should enclose your thoughts and internal monologue inside "
+ " tags, and then provide your solution or response to the problem."
+)
+
+DEFAULT_FEW_SHOT_EXAMPLE = (
+ "Example Reasoning format:\n"
+ "\n"
+ "Okay, the user asked for a calculation, I need to use python_interpreter tool to compute it.\n"
+ "\n"
+ "\n"
+ '{"name": "python_interpreter", "arguments": {"code": "print(2+2)"}}\n'
+ "\n"
+)
+
+DEFAULT_SEQ_TOOL_HELPER = (
+ "You are in sequential tool calling mode. "
+ "Call exactly **one** tool, wait for its , "
+ "then decide whether to call another. "
+ "Never bundle multiple blocks in the same message. "
+ "Do not generate blocks by yourself since system will provide. "
+ "When you get the and if you believe the task is complete, "
+ "return your reasoning in blocks followed by plain text summary."
+)
+
+DEFAULT_NARRATION_HELPER = (
+ "Reply with a block followed by the user-visible summary for the tool result above. "
+ "Do **not** include any blocks in that tool result summary message. "
+ "Do **not** generate blocks yourself since those are provided by the system.\n"
+ "Example Tool Result Summary:\n"
+ "\n"
+ "The tool call was successful, the user asked for the weather in SF and the tool returned 70 degrees and sunny.\n"
+ "\n"
+ "The weather in SF is 70 degrees and sunny."
+)
+
+DEFAULT_CONTINUATION_HELPER = (
+ "Continue with your reasoning in blocks. "
+ "If you need to call another tool, output ... followed by .... "
+ "Do **not** generate blocks yourself since those are provided by the system.\n"
+ "Example continuation:\n"
+ "\n"
+ "The first tool returned the data I needed. Now I need to call the next tool to complete the task.\n"
+ "\n"
+ "\n"
+ '{"name": "next_tool", "arguments": {"param": "value"}}\n'
+ ""
+)
+
+
+def fix_system_prompt_json_examples(system_prompt: str) -> str:
+ """
+ Fix system prompts that use Python dict syntax in examples to use proper JSON.
+
+ The dataset often contains examples like:
+    <tool_call>
+    {'name': <function-name>,'arguments': <args-dict>}
+    </tool_call>
+
+ This converts them to proper JSON format.
+ """
+ # Replace the common Python dict example pattern with proper JSON
+    # Pattern 1: <tool_call> {'name': <function-name>,'arguments': <args-dict>} </tool_call>
+    system_prompt = re.sub(
+        r"<tool_call>\s*\{'name':\s*<function-name>,\s*'arguments':\s*<args-dict>\}\s*</tool_call>",
+        '<tool_call>\n{"name": "<function-name>", "arguments": {"<arg-name>": "<arg-value>"}}\n</tool_call>',
+ system_prompt,
+ flags=re.IGNORECASE | re.DOTALL,
+ )
+
+ # Pattern 2: Example with single quotes in pydantic schema description
+ # Replace single-quoted JSON schema examples
+ system_prompt = re.sub(r"'title':\s*'([^']+)'", r'"title": "\1"', system_prompt)
+ system_prompt = re.sub(r"'type':\s*'([^']+)'", r'"type": "\1"', system_prompt)
+ system_prompt = re.sub(r"'properties':\s*\{", r'"properties": {', system_prompt)
+ system_prompt = re.sub(r"'required':\s*\[", r'"required": [', system_prompt)
+
+ # Fix the pydantic model json schema section - replace Python dict with JSON
+ # This handles: {'title': 'FunctionCall', 'type': 'object', ...}
+ pydantic_pattern = r"\{'title':\s*'FunctionCall'[^}]+\}"
+ pydantic_replacement = (
+ '{"title": "FunctionCall", "type": "object", "properties": '
+ '{"name": {"title": "Name", "type": "string"}, '
+ '"arguments": {"title": "Arguments", "type": "object"}}, '
+ '"required": ["name", "arguments"]}'
+ )
+ system_prompt = re.sub(pydantic_pattern, pydantic_replacement, system_prompt)
+
+ return system_prompt
+
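+# For example, single-quoted schema fragments such as 'title': 'FunctionCall' or
+# 'type': 'object' in a dataset system prompt come back as "title": "FunctionCall"
+# and "type": "object" after this function runs.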
+
+# ==============================================================================
+# Utility Functions (Can be used by subclasses)
+# ==============================================================================
+
+
+def strip_tool_response_tags(content: str) -> str:
+ """Strip outer tags from content if present."""
+ # Match ... and extract inner content
+ match = re.match(
+ r"^\s*\s*([\s\S]*?)\s*\s*$",
+ content,
+ flags=re.IGNORECASE,
+ )
+ if match:
+ return match.group(1).strip()
+ return content
+
+
+def normalize_tool_call_json(txt: str) -> str:
+ """
+ Normalize assistant replies to canonical JSON format.
+    Preserves the leading <think> block and converts Python-style dicts to JSON.
+    """
+    m = re.match(r"^\s*(<think>[\s\S]*?</think>)\s*", txt, flags=re.IGNORECASE)
+ if not m:
+ return txt
+ think_part = m.group(1)
+
+ def _convert(match: re.Match) -> str:
+ raw = match.group(1).strip()
+ try:
+ obj = ast.literal_eval(raw)
+ return f"{json.dumps(obj, separators=(',', ':'))}"
+ except Exception:
+ pass
+ try:
+ json_like = re.sub(r"'([^']*)':", r'"\1":', raw)
+ json_like = re.sub(r":\s*'([^']*)'", r':"\1"', json_like)
+ json.loads(json_like)
+ return f"{json_like}"
+ except Exception:
+ return match.group(0)
+
+ tail = txt[len(m.group(0)) :]
+ tail = re.sub(
+ r"\s*([\s\S]*?)\s*",
+ _convert,
+ tail,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+ out = think_part + tail
+ out = re.sub(r"\s*\s*", "\n\n", out)
+ out = re.sub(r"\s*\s*", "\n\n", out)
+ return out
+
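+# Rough usage note: a reply whose <tool_call> body is a Python-style dict, e.g.
+# {'name': 'get_weather', 'arguments': {'city': 'SF'}}, keeps its <think> block
+# but comes back with a compact JSON body inside the <tool_call> tags.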
+
+def canonicalise_tool_json(raw: str) -> Optional[str]:
+ """Parse raw as JSON or Python literal, return canonical JSON string."""
+ raw = raw.strip()
+
+ # Reject if it contains XML tags - malformed data
+ if re.search(r"?(?:think|tool_call|tool_response)>", raw, re.IGNORECASE):
+ return None
+
+ # First try direct JSON parsing
+ try:
+ obj = json.loads(raw)
+ if isinstance(obj, dict) and "name" in obj:
+ return json.dumps(obj, separators=(",", ":"))
+ except Exception:
+ pass
+
+ # Try Python literal (handles single quotes)
+ try:
+ obj = ast.literal_eval(raw)
+ if isinstance(obj, dict) and "name" in obj:
+ return json.dumps(obj, separators=(",", ":"))
+ except Exception:
+ pass
+
+ return None
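+
+ # Hedged examples of canonicalise_tool_json (tool name and arguments are hypothetical):
+ #   canonicalise_tool_json('{"name": "get_weather", "arguments": {"city": "Paris"}}')
+ #   -> '{"name":"get_weather","arguments":{"city":"Paris"}}'
+ #   canonicalise_tool_json("{'name': 'get_weather', 'arguments': {'city': 'Paris'}}")
+ #   -> '{"name":"get_weather","arguments":{"city":"Paris"}}'   (Python literal accepted)
+ #   canonicalise_tool_json('<tool_call>{...}</tool_call>') -> None   (tags rejected)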
+
+
+def extract_tool_calls(
+ txt: str, require_think_block: bool = True
+) -> Optional[List[dict]]:
+ """
+ Extract tool calls from a response.
+
+ Args:
+ txt: The response text to parse
+ require_think_block: If True, requires <think> block before tool calls.
+ If False, extracts tool calls regardless of think blocks.
+
+ Returns list of parsed tool call dicts, or None if validation fails.
+ """
+ txt = normalize_tool_call_json(txt)
+
+ if re.search(r"", txt, flags=re.IGNORECASE):
+ print("\033[91m[DEBUG extract] Rejected: contains \033[0m")
+ return None
+
+ if require_think_block:
+ # Strict mode: require <think> block at start
+ m = re.match(r"\s*(<think>[\s\S]*?</think>)", txt, flags=re.IGNORECASE)
+ if not m:
+ print(
+ f"\033[91m[DEBUG extract] Rejected: no block "
+ f"(require_think={require_think_block})\033[0m"
+ )
+ print(f"\033[91m[DEBUG extract] Text starts with: '{txt[:150]}...'\033[0m")
+ return None
+ rest = txt[m.end() :]
+ else:
+ # Flexible mode: skip any leading content before first <tool_call>
+ # Find the first <tool_call> tag
+ first_tc = re.search(r"<tool_call>", txt, flags=re.IGNORECASE)
+ if not first_tc:
+ print("\033[91m[DEBUG extract] Rejected: no <tool_call> found\033[0m")
+ return None
+ rest = txt[first_tc.start() :]
+
+ tc_pattern = r"\s*<tool_call>\s*([\s\S]*?)\s*</tool_call>\s*"
+
+ tool_calls = []
+ while True:
+ m_tc = re.match(tc_pattern, rest, flags=re.IGNORECASE)
+ if not m_tc:
+ break
+ raw_json = m_tc.group(1)
+ print(f"\033[93m[DEBUG extract] Raw tool call content: '{raw_json}'\033[0m")
+ canon = canonicalise_tool_json(raw_json)
+ if canon is None:
+ print("\033[91m[DEBUG extract] Failed to canonicalise tool call\033[0m")
+ return None
+ print(f"\033[92m[DEBUG extract] Canonicalised: '{canon}'\033[0m")
+ tool_calls.append(json.loads(canon))
+ rest = rest[m_tc.end() :]
+
+ if not tool_calls:
+ print(
+ f"\033[91m[DEBUG extract] No tool calls extracted from rest: '{rest[:150]}...'\033[0m"
+ )
+ return None
+
+ # In strict mode, nothing should remain after tool calls
+ # In flexible mode, allow trailing content (like explanations)
+ if require_think_block and rest.strip():
+ print(
+ f"\033[91m[DEBUG extract] Rejected: trailing content after tool calls: '{rest[:100]}'\033[0m"
+ )
+ return None
+
+ return tool_calls
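+
+ # Hedged usage sketch (assumes the <think>/<tool_call> tag layout handled above;
+ # the tool call itself is hypothetical):
+ #   extract_tool_calls(
+ #       '<think>choose a tool</think>'
+ #       '<tool_call>{"name": "f", "arguments": {"x": 1}}</tool_call>',
+ #       require_think_block=True,
+ #   )
+ #   -> [{"name": "f", "arguments": {"x": 1}}]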
+
+
+def validate_think_only(txt: str) -> bool:
+ """Validate a narration/summary turn (think block only, no tool calls)."""
+ txt = normalize_tool_call_json(txt)
+ if not isinstance(txt, str):
+ return False
+
+ think_blocks = re.findall(r"<think>[\s\S]*?</think>", txt, flags=re.IGNORECASE)
+ if len(think_blocks) != 1:
+ return False
+ if not re.match(r"^\s*<think>", txt, flags=re.IGNORECASE):
+ return False
+ if re.search(r"<tool_call>", txt, flags=re.IGNORECASE):
+ return False
+ if re.search(r"<tool_response>", txt, flags=re.IGNORECASE):
+ return False
+ return True
+
+
+def coerce_jsonlike(val):
+ """Best-effort coercion of JSON-like values (handles double-encoding)."""
+ if not isinstance(val, str):
+ return val
+ s = val.strip()
+ if s.lower() == "true":
+ return True
+ if s.lower() == "false":
+ return False
+ if s.lower() in ("null", "none"):
+ return None
+ if (s.startswith("{") and s.endswith("}")) or (
+ s.startswith("[") and s.endswith("]")
+ ):
+ try:
+ return json.loads(s)
+ except Exception:
+ pass
+ try:
+ return ast.literal_eval(s)
+ except Exception:
+ return val
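+
+ # Hedged examples of coerce_jsonlike (values are hypothetical):
+ #   coerce_jsonlike('{"a": 1}')   -> {"a": 1}        (JSON string decoded)
+ #   coerce_jsonlike("{'a': 1}")   -> {"a": 1}        (Python literal accepted)
+ #   coerce_jsonlike("true")       -> True
+ #   coerce_jsonlike("plain text") -> "plain text"    (left unchanged)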
+
+
+def parse_expected_call(raw_call) -> dict:
+ """Parse an expected tool call (handles double-encoding)."""
+ obj = raw_call
+ if isinstance(raw_call, str):
+ try:
+ obj = json.loads(raw_call)
+ except Exception:
+ obj = coerce_jsonlike(raw_call)
+ if not isinstance(obj, dict):
+ return {}
+ if "arguments" in obj:
+ obj["arguments"] = coerce_jsonlike(obj["arguments"])
+ return obj
+
+
+def json_objects_match(model_json, expected_json) -> bool:
+ """Check if model output matches expected (expected is subset of model)."""
+ model_json = coerce_jsonlike(model_json)
+ expected_json = coerce_jsonlike(expected_json)
+
+ if isinstance(expected_json, dict):
+ if not isinstance(model_json, dict):
+ return False
+ for k, v in expected_json.items():
+ if k not in model_json:
+ return False
+ if not json_objects_match(model_json[k], v):
+ return False
+ return True
+ return model_json == expected_json
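+
+ # Hedged illustration of the subset semantics above (values are hypothetical):
+ #   json_objects_match({"name": "f", "arguments": {"a": 1, "b": 2}},
+ #                      {"name": "f", "arguments": {"a": 1}})   -> True
+ #   json_objects_match({"name": "f", "arguments": {"a": 1}},
+ #                      {"name": "f", "arguments": {"a": 2}})   -> False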
+
+
+# ==============================================================================
+# Base MT-GRPO Environment (Abstract methods for customization)
+# ==============================================================================
+
+
+class BaseMTGRPOEnv(BaseEnv):
+ """
+ Base class for Multi-Turn GRPO environments with turn-level credit assignment.
+
+ Override these methods to customize for your use case:
+
+ # Required overrides:
+ - get_next_item(): Return training items
+ - setup(): Initialize dataset and items
+
+ # Reward customization (optional):
+ - compute_turn_reward(): Custom turn-level reward (R^T_τ)
+ - compute_outcome_reward(): Custom outcome reward (R^O)
+
+ # Validation customization (optional):
+ - validate_tool_call_turn(): Custom tool call validation
+ - validate_summary_turn(): Custom summary validation
+
+ # Prompt customization (optional):
+ - get_system_prompt(): Custom system prompt
+ - get_few_shot_example(): Custom few-shot example
+
+ The MT-GRPO advantage computation is handled automatically:
+ Â_τ = Â^T_τ + λ · Â^O
+ """
+
+ name = "base_mtgrpo"
+
+ def __init__(
+ self,
+ config: MTGRPOEnvConfig,
+ server_configs: List[APIServerConfig],
+ slurm: bool = True,
+ testing: bool = False,
+ ):
+ super().__init__(config, server_configs, slurm, testing)
+
+ self.percent_correct_buffer: List[float] = []
+ self.raw_score_buffer: List[float] = []
+ self.eval_metrics: List[Tuple[str, float]] = []
+ self.rollouts_for_wandb: List = []
+ self.dynamic_example: Optional[str] = None
+ self.completed_tasks: set[str] = set()
+ self.train_items: List = []
+ self.test_items: List = []
+ self.iter = 0
+ # Set max_token_len from config (used by base class for length filtering)
+ self.max_token_len = config.max_token_length
+
+ # ==========================================================================
+ # CUSTOMIZATION HOOKS - Override these in your subclass
+ # ==========================================================================
+
+ def get_system_prompt(self) -> str:
+ """
+ Return the system prompt for the environment.
+ Override to customize the base instructions.
+ """
+ return DEFAULT_SYSTEM_PROMPT
+
+ def get_few_shot_example(self) -> str:
+ """
+ Return a few-shot example for the prompt.
+ Override to provide domain-specific examples.
+ """
+ if self.config.add_dynamic_few_shot and self.dynamic_example:
+ return "Example Reasoning format:\n" + self.dynamic_example
+ return DEFAULT_FEW_SHOT_EXAMPLE
+
+ def get_sequential_helper(self) -> str:
+ """Return helper text for sequential tool calling mode."""
+ return DEFAULT_SEQ_TOOL_HELPER
+
+ def get_narration_helper(self) -> str:
+ """Return helper text for narration/summary turns."""
+ return DEFAULT_NARRATION_HELPER
+
+ def get_continuation_helper(self) -> str:
+ """Return helper text for continuation after tool response."""
+ return DEFAULT_CONTINUATION_HELPER
+
+ def validate_tool_call_turn(
+ self,
+ response: str,
+ expected_calls: List[str],
+ pred_calls: List[Any],
+ ) -> Tuple[bool, List[Any]]:
+ """
+ Validate a tool-calling turn and extract predictions.
+
+ Args:
+ response: The model's response text
+ expected_calls: List of expected tool call JSON strings
+ pred_calls: List to append predictions to
+
+ Returns:
+ (is_valid, updated_pred_calls)
+
+ Override to implement custom tool call validation logic.
+ """
+ # Use config to determine if think blocks are required
+ require_think = self.config.validate_think_blocks
+ calls = extract_tool_calls(response, require_think_block=require_think)
+
+ if calls is None:
+ print(
+ f"\033[91m[DEBUG] extract_tool_calls returned None (require_think={require_think})\033[0m"
+ )
+ return False, pred_calls + ["__MISMATCH__"]
+
+ if len(calls) != len(expected_calls):
+ print(
+ f"\033[91m[DEBUG] Call count mismatch: got {len(calls)}, expected {len(expected_calls)}\033[0m"
+ )
+ return False, pred_calls + ["__MISMATCH__"]
+
+ for mdl, exp_raw in zip(calls, expected_calls):
+ exp_obj = parse_expected_call(exp_raw)
+ if not json_objects_match(mdl, exp_obj):
+ print(
+ f"\033[91m[DEBUG] JSON mismatch: model={mdl}, expected={exp_obj}\033[0m"
+ )
+ return False, pred_calls + ["__MISMATCH__"]
+
+ return True, pred_calls + calls
+
+ def validate_summary_turn(self, response: str) -> bool:
+ """
+ Validate a summary/narration turn (no tool calls expected).
+
+ Args:
+ response: The model's response text
+
+ Returns:
+ True if valid summary turn
+
+ Override to implement custom summary validation.
+ """
+ # If think blocks not required, just check there are no tool calls
+ if not self.config.validate_think_blocks:
+ if re.search(r"", response, flags=re.IGNORECASE):
+ return False
+ if re.search(r"", response, flags=re.IGNORECASE):
+ return False
+ return True
+
+ # Strict mode: require think block
+ return validate_think_only(response)
+
+ def compute_turn_reward(
+ self,
+ turn_idx: int,
+ response: str,
+ expected_calls: List[str],
+ pred_calls: List[Any],
+ is_valid: bool,
+ ) -> float:
+ """
+ Compute the turn-level reward R^T_τ for a single turn.
+
+ This is called for each turn to compute the intermediate reward signal.
+ The paper uses rewards like:
+ - Tool Execution Reward (0.5): Valid tool call structure
+ - Tool Match Reward (0.5): Correct tool call content
+
+ Args:
+ turn_idx: Index of the current turn (0-based)
+ response: The model's response text
+ expected_calls: List of expected tool call JSON strings (empty for summary turns)
+ pred_calls: List of predicted tool calls for this turn
+ is_valid: Whether the turn passed validation
+
+ Returns:
+ Turn reward value (R^T_τ)
+
+ Override to implement custom turn-level rewards.
+ """
+ reward = 0.0
+
+ if expected_calls: # Tool-calling turn
+ # Check if structure is valid (has tool_call, optionally with think block)
+ require_think = self.config.validate_think_blocks
+ has_valid_structure = (
+ extract_tool_calls(response, require_think_block=require_think)
+ is not None
+ )
+ if has_valid_structure:
+ reward += self.config.tool_execution_reward
+
+ # Check if content matches
+ if is_valid:
+ reward += self.config.tool_match_reward
+
+ else: # Summary/narration turn
+ if is_valid:
+ reward += self.config.summary_reward
+
+ # Apply penalty for mismatches
+ if "__MISMATCH__" in pred_calls:
+ reward += self.config.wrong_call_penalty
+
+ return reward
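+
+ # Hedged example with the defaults used in config_init below (tool_execution_reward=0.5,
+ # tool_match_reward=0.5, wrong_call_penalty=-0.2): a tool-calling turn that both parses
+ # and matches scores 1.0; one that parses but mismatches (pred_calls contains
+ # "__MISMATCH__") scores 0.5 - 0.2 = 0.3; a valid summary turn scores summary_reward.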
+
+ def compute_outcome_reward(
+ self,
+ turn_rewards: List[float],
+ responses: List[str],
+ expected_calls_by_turn: List[List[str]],
+ completed_turns: int,
+ ) -> float:
+ """
+ Compute the outcome-level reward R^O for the entire trajectory.
+
+ This is the final success/failure signal for the episode.
+ The paper uses binary outcome rewards:
+ - 1.0 if all turns completed successfully
+ - 0.0 otherwise
+
+ Args:
+ turn_rewards: List of turn rewards computed so far
+ responses: List of model responses for each turn
+ expected_calls_by_turn: Expected calls for each turn
+ completed_turns: Number of turns that passed validation
+
+ Returns:
+ Outcome reward value (R^O)
+
+ Override to implement custom outcome rewards (e.g., answer correctness).
+ """
+ total_turns = len(expected_calls_by_turn)
+ all_successful = completed_turns == total_turns
+ return 1.0 if all_successful else 0.0
+
+ # ==========================================================================
+ # MT-GRPO CORE LOGIC (Usually don't need to override)
+ # ==========================================================================
+
+ def _compute_turn_and_outcome_rewards(
+ self,
+ responses_by_turn: List[str],
+ pred_calls_by_turn: List[List],
+ expected_calls_by_turn: List[List[str]],
+ ) -> Tuple[List[float], float, int]:
+ """
+ Compute all turn-level rewards and the outcome reward.
+
+ Returns:
+ (turn_rewards, outcome_reward, num_successful_turns)
+
+ Calls the customizable compute_turn_reward() and compute_outcome_reward()
+ hooks for each turn.
+ """
+ turn_rewards = []
+ completed_turns = 0
+ total_turns = len(expected_calls_by_turn)
+
+ for turn_idx in range(len(responses_by_turn)):
+ if turn_idx >= total_turns:
+ break
+
+ response = responses_by_turn[turn_idx]
+ expected_calls = expected_calls_by_turn[turn_idx]
+ pred_calls = (
+ pred_calls_by_turn[turn_idx]
+ if turn_idx < len(pred_calls_by_turn)
+ else []
+ )
+
+ # Determine if turn is valid
+ if expected_calls:
+ is_valid = "__MISMATCH__" not in pred_calls and len(pred_calls) == len(
+ expected_calls
+ )
+ else:
+ is_valid = self.validate_summary_turn(response)
+
+ if is_valid:
+ completed_turns += 1
+
+ # Compute turn reward using customizable hook
+ turn_reward = self.compute_turn_reward(
+ turn_idx, response, expected_calls, pred_calls, is_valid
+ )
+ turn_rewards.append(turn_reward)
+
+ # Fill missing turns with zero reward
+ while len(turn_rewards) < total_turns:
+ turn_rewards.append(0.0)
+
+ # Compute outcome reward using customizable hook
+ outcome_reward = self.compute_outcome_reward(
+ turn_rewards, responses_by_turn, expected_calls_by_turn, completed_turns
+ )
+
+ return turn_rewards, outcome_reward, completed_turns
+
+ def _compute_mt_grpo_advantages(
+ self,
+ turn_rewards_batch: List[List[float]],
+ outcome_rewards_batch: List[float],
+ ) -> List[List[float]]:
+ """
+ Compute MT-GRPO advantages following the paper (Equation 7):
+
+ Â_τ = Â^T_τ + λ · Â^O for ALL turns τ = 1, 2, ..., T
+
+ Where:
+ - Â^T_τ = (R^T_τ - mean) / std (standardized turn advantage)
+ - Â^O = (R^O - mean) / std (standardized outcome advantage)
+ - λ = turn_level_advantage_lambda
+
+ This method should NOT be overridden - it implements the core MT-GRPO algorithm.
+ """
+ if not turn_rewards_batch:
+ return []
+
+ num_rollouts = len(turn_rewards_batch)
+ max_turns = max(len(rewards) for rewards in turn_rewards_batch)
+ lam = self.config.turn_level_advantage_lambda
+
+ # Compute standardized turn advantages Â^T_τ for each turn position
+ turn_advantages_by_turn = []
+ for turn_idx in range(max_turns):
+ turn_rewards = [
+ rewards[turn_idx] if turn_idx < len(rewards) else 0.0
+ for rewards in turn_rewards_batch
+ ]
+ mean_turn = np.mean(turn_rewards)
+ std_turn = np.std(turn_rewards) or 1.0
+ standardized = (np.array(turn_rewards) - mean_turn) / std_turn
+ turn_advantages_by_turn.append(standardized)
+
+ # Compute standardized outcome advantage Â^O
+ mean_outcome = np.mean(outcome_rewards_batch)
+ std_outcome = np.std(outcome_rewards_batch) or 1.0
+ outcome_advantages = (
+ np.array(outcome_rewards_batch) - mean_outcome
+ ) / std_outcome
+
+ # Combine: Â_τ = Â^T_τ + λ · Â^O
+ mt_grpo_advantages = []
+ for rollout_idx in range(num_rollouts):
+ rollout_advantages = []
+ actual_num_turns = len(turn_rewards_batch[rollout_idx])
+
+ for turn_idx in range(actual_num_turns):
+ A_T = turn_advantages_by_turn[turn_idx][rollout_idx]
+ A_O = outcome_advantages[rollout_idx]
+ adv = A_T + lam * A_O
+ rollout_advantages.append(float(adv))
+
+ mt_grpo_advantages.append(rollout_advantages)
+
+ return mt_grpo_advantages
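+
+ # Hedged numerical sketch of the combination above: with two rollouts, a single turn,
+ # turn rewards [1.0, 0.0], outcome rewards [1.0, 0.0] and λ = 1.0:
+ #   Â^T = ([1.0, 0.0] - 0.5) / 0.5 = [ 1.0, -1.0]
+ #   Â^O = ([1.0, 0.0] - 0.5) / 0.5 = [ 1.0, -1.0]
+ #   Â   = Â^T + λ · Â^O           = [ 2.0, -2.0]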
+
+ def _assign_advantages_to_tokens(
+ self,
+ tokens: List[int],
+ masks: List[int],
+ turn_advantages: List[float],
+ ) -> List[float]:
+ """
+ Assign turn-level advantages to tokens based on mask transitions.
+ Tokens with mask != -100 are trainable (assistant responses).
+ """
+ token_advantages = [0.0] * len(tokens)
+
+ if not turn_advantages:
+ return token_advantages
+
+ in_trainable_region = False
+ current_turn_idx = 0
+ current_advantage = turn_advantages[0] if turn_advantages else 0.0
+
+ for i, mask in enumerate(masks):
+ if mask != -100:
+ if not in_trainable_region:
+ in_trainable_region = True
+ if current_turn_idx < len(turn_advantages):
+ current_advantage = turn_advantages[current_turn_idx]
+ token_advantages[i] = current_advantage
+ else:
+ if in_trainable_region:
+ in_trainable_region = False
+ current_turn_idx += 1
+
+ return token_advantages
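+
+ # Hedged sketch of the mask-to-turn mapping above: with masks
+ #   [-100, -100, 1, 1, -100, 1, 1]   (two trainable spans)
+ # and turn_advantages [0.3, -0.2], the returned token advantages are
+ #   [0.0, 0.0, 0.3, 0.3, 0.0, -0.2, -0.2]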
+
+ # ==========================================================================
+ # TRAJECTORY COLLECTION (Core implementation)
+ # ==========================================================================
+
+ async def _build_turn_contexts(
+ self,
+ turn_idx: int,
+ contexts: List[List[Dict[str, str]]],
+ inter_turns: List[List[Dict[str, str]]],
+ active: List[bool],
+ ) -> Tuple[List[str], List[int]]:
+ """Build prompts for the current turn from active rollout contexts."""
+ if turn_idx > 0 and turn_idx - 1 < len(inter_turns):
+ filler = inter_turns[turn_idx - 1]
+ print(
+ f" \033[96m[DEBUG _build_turn_contexts] Adding inter_turn {turn_idx-1} to contexts:\033[0m"
+ )
+ for msg in filler:
+ print(
+ f" role={msg.get('role')}, content_preview={msg.get('content', '')[:200]}..."
+ )
+ for r in range(len(contexts)):
+ if active[r]:
+ contexts[r].extend(filler)
+
+ prompts, ridx_map = [], []
+ for r in range(len(contexts)):
+ if not active[r]:
+ continue
+ ptxt = self.tokenizer.apply_chat_template(
+ contexts[r], add_generation_prompt=True, tokenize=False
+ )
+ prompts.append(ptxt)
+ ridx_map.append(r)
+
+ return prompts, ridx_map
+
+ async def _execute_turn_inference(
+ self,
+ turn_idx: int,
+ prompts: List[str],
+ ridx_map: List[int],
+ ) -> List[str]:
+ """Execute inference for a turn."""
+ if turn_idx == 0 and not self.config.use_parallel_requests:
+ return await self._batch_identical_prompts(prompts[0], len(ridx_map))
+ else:
+ return await self._batch_heterogeneous_prompts(prompts)
+
+ async def _batch_identical_prompts(self, prompt: str, count: int) -> List[str]:
+ """Handle identical prompts using n parameter."""
+ print(f" \033[93m→ TURN 1 prompt full:\033[0m \033[92m{prompt}\033[0m")
+
+ resp = await self.server.completion(
+ prompt=prompt,
+ n=count,
+ max_tokens=self.config.max_gen_per_turn,
+ temperature=0.8,
+ )
+ choices = [c.text for c in resp.choices]
+
+ # Debug: print each rollout
+ for i, raw in enumerate(choices):
+ preview = raw[:1000] + ("..." if len(raw) > 1000 else "")
+ print(
+ f" \033[93m· turn 1 rollout raw [{i}] (len={len(raw)}):\033[0m "
+ f"\033[94m{preview}\033[0m"
+ )
+ if not raw.strip():
+ print(f" → (empty or error string returned for rollout {i})")
+ print(" → All turn 1 rollouts printed; moving on.\n" + "-" * 48)
+
+ return choices
+
+ async def _batch_heterogeneous_prompts(self, prompts: List[str]) -> List[str]:
+ """Handle heterogeneous prompts using parallel requests."""
+ print(f" → Parallelizing {len(prompts)} prompts")
+
+ # Print each prompt - show more for Turn 2+ debugging
+ for idx_p, p_str in enumerate(prompts):
+ # Check if continuation helper is in the prompt
+ has_continuation = "Continue with your reasoning" in p_str
+ print(
+ f" \033[93m→ prompt[{idx_p}] (len={len(p_str)}, has_continuation_helper={has_continuation}):\033[0m"
+ )
+ # Print last 1500 chars to see the tool response and helper
+ print(f" \033[92m...{p_str[-1500:]}\033[0m")
+
+ async def _call_single(prompt_str: str) -> str:
+ try:
+ comp = await self.server.completion(
+ prompt=prompt_str,
+ n=1,
+ max_tokens=self.config.max_gen_per_turn,
+ temperature=0.8,
+ )
+ return comp.choices[0].text
+ except Exception as e:
+ print(f" → _call_single exception: {e}")
+ return ""
+
+ tasks = [_call_single(p) for p in prompts]
+ results = await asyncio.gather(*tasks)
+
+ # Debug: print results
+ choices = []
+ for i, rtext in enumerate(results):
+ raw = rtext or ""
+ print(
+ f" \033[93m· rollout {i} reply:\033[0m \033[94m{raw[:500]}{'...' if len(raw) > 500 else ''}\033[0m"
+ )
+ if not raw:
+ print(f" → Rollout {i} returned empty or error string")
+ choices.append(raw)
+ print("-" * 48)
+
+ return choices
+
+ async def _process_turn_responses(
+ self,
+ turn_idx: int,
+ choices: List[str],
+ ridx_map: List[int],
+ contexts: List[List[Dict[str, str]]],
+ preds_by_turn: List[List[List]],
+ responses_by_turn: List[List[str]],
+ active: List[bool],
+ expected_calls_by_turn: List[List[str]],
+ ) -> None:
+ """Process and validate responses for a single turn."""
+ for txt, r in zip(choices, ridx_map):
+ raw_txt = txt or ""
+ norm_txt = normalize_tool_call_json(raw_txt)
+
+ print(f"\n\033[93m=== TURN {turn_idx+1} · ROLLOUT {r} ===\033[0m")
+ preview = raw_txt[:1500] + ("..." if len(raw_txt) > 1500 else "")
+ print(
+ f"\033[95mRaw assistant reply (len={len(raw_txt)}):\033[0m\n"
+ f"\033[94m{preview}\033[0m"
+ )
+
+ responses_by_turn[r].append(norm_txt)
+ expected_calls = expected_calls_by_turn[turn_idx]
+
+ if expected_calls: # Tool-calling turn
+ print(f"\033[95mExpected tool calls:\033[0m {expected_calls}")
+
+ is_valid, updated_preds = self.validate_tool_call_turn(
+ norm_txt, expected_calls, preds_by_turn[r][turn_idx]
+ )
+ preds_by_turn[r][turn_idx] = updated_preds
+
+ print(
+ f"\033[95mExtracted/validated:\033[0m {updated_preds}, valid={is_valid}"
+ )
+
+ if not is_valid:
+ print(
+ f"\033[91m[DEBUG] Tool call validation failed for rollout {r}\033[0m"
+ )
+ active[r] = False
+ else:
+ # Update dynamic example on success
+ if self.config.add_dynamic_few_shot:
+ self.dynamic_example = norm_txt.strip()
+
+ else: # Summary turn
+ is_valid = self.validate_summary_turn(norm_txt)
+ print(f"\033[95mSummary turn validation:\033[0m valid={is_valid}")
+ if not is_valid:
+ print(
+ f"\033[91m[DEBUG] Invalid summary turn for rollout {r}\033[0m"
+ )
+ active[r] = False
+ preds_by_turn[r][turn_idx] = []
+
+ contexts[r].append({"role": "assistant", "content": norm_txt})
+
+ async def collect_trajectories(
+ self,
+ item: Tuple[Tuple[frozenset, ...], List[List[str]], List[List[Dict[str, str]]]],
+ ) -> Tuple[Optional[ScoredDataGroup], List[Item]]:
+ """
+ Roll-out multi-turn tool-calling with MT-GRPO turn-level advantage computation.
+ """
+ messages_tuple, expected_calls_by_turn, inter_turns = item
+ base_ctx = [dict(m) for m in messages_tuple]
+
+ num_rollouts = self.config.group_size
+ contexts: List[List[Dict[str, str]]] = [
+ list(base_ctx) for _ in range(num_rollouts)
+ ]
+ preds_by_turn: List[List[List]] = [
+ [[] for _ in range(len(expected_calls_by_turn))]
+ for _ in range(num_rollouts)
+ ]
+ responses_by_turn: List[List[str]] = [[] for _ in range(num_rollouts)]
+ active = [True] * num_rollouts
+
+ max_turns = len(expected_calls_by_turn)
+ print(f"[DEBUG] max_turns={max_turns}, len(inter_turns)={len(inter_turns)}")
+ print(
+ f"[DEBUG] expected_calls_by_turn count per turn: {[len(t) for t in expected_calls_by_turn]}"
+ )
+ if inter_turns:
+ print(
+ f"[DEBUG] inter_turns[0] roles: {[m.get('role') for m in inter_turns[0]] if inter_turns else 'empty'}"
+ )
+
+ for turn_idx in range(max_turns):
+ print(
+ f"\n[collect_trajectories] Beginning turn {turn_idx+1}/{max_turns} for this group"
+ )
+
+ prompts, ridx_map = await self._build_turn_contexts(
+ turn_idx, contexts, inter_turns, active
+ )
+
+ if not prompts:
+ print(" → No active prompts, stopping.")
+ break
+
+ choices = await self._execute_turn_inference(turn_idx, prompts, ridx_map)
+
+ await self._process_turn_responses(
+ turn_idx,
+ choices,
+ ridx_map,
+ contexts,
+ preds_by_turn,
+ responses_by_turn,
+ active,
+ expected_calls_by_turn,
+ )
+
+ survivors = sum(1 for a in active if a)
+ print(
+ f" → Finished turn {turn_idx+1}; {survivors}/{num_rollouts} rollouts still active"
+ )
+
+ if not any(active):
+ print(" → All rollouts terminated; stopping further turns.")
+ break
+
+ # Compute rewards and advantages
+ turn_rewards_batch = []
+ outcome_rewards_batch = []
+ successful_turns_batch = []
+
+ for r in range(num_rollouts):
+ turn_rewards, outcome_reward, num_successful = (
+ self._compute_turn_and_outcome_rewards(
+ responses_by_turn[r], preds_by_turn[r], expected_calls_by_turn
+ )
+ )
+ turn_rewards_batch.append(turn_rewards)
+ outcome_rewards_batch.append(outcome_reward)
+ successful_turns_batch.append(num_successful)
+
+ # Compute MT-GRPO advantages
+ mt_grpo_advantages = self._compute_mt_grpo_advantages(
+ turn_rewards_batch, outcome_rewards_batch
+ )
+
+ # Build scored data group
+ scored = ScoredDataGroup(tokens=[], masks=[], scores=[], advantages=[])
+
+ for r in range(num_rollouts):
+ out = tokenize_for_trainer(
+ tokenizer=self.tokenizer,
+ chat=contexts[r],
+ include_messages=self.config.include_messages,
+ )
+
+ token_advantages = self._assign_advantages_to_tokens(
+ out["tokens"],
+ out["masks"],
+ mt_grpo_advantages[r] if r < len(mt_grpo_advantages) else [],
+ )
+
+ # Compute final score for this rollout
+ # Use dense reward (sum of turn rewards / total) + outcome bonus
+ # This preserves partial successes for dataset generation
+ total_turns = len(expected_calls_by_turn)
+ dense_reward = sum(turn_rewards_batch[r]) / max(1, total_turns)
+ final_score = dense_reward + outcome_rewards_batch[r]
+
+ # Apply minimum successful turns threshold
+ # Rollouts below threshold get negative score (will be filtered out)
+ if successful_turns_batch[r] < self.config.min_successful_turns:
+ final_score = -1.0
+
+ scored["tokens"].append(out["tokens"])
+ scored["masks"].append(out["masks"])
+ scored["scores"].append(final_score)
+ scored["advantages"].append(token_advantages)
+
+ # Apply length penalty
+ if scored["scores"]:
+ cutoff = self.config.max_token_length * 0.5
+ for i, ln in enumerate([len(t) for t in scored["tokens"]]):
+ if ln > cutoff and scored["scores"][i] > 0.99:
+ frac = min(
+ (ln - cutoff) / (self.config.max_token_length - cutoff), 1.0
+ )
+ scored["scores"][i] = max(0.0, scored["scores"][i] - frac)
+
+ # Update metrics
+ for s in scored["scores"]:
+ self.raw_score_buffer.append(s)
+ self.percent_correct_buffer.append(1.0 if s >= 1.0 else 0.0)
+
+ # Validate group
+ if len(scored["tokens"]) < self.config.group_size:
+ return None, []
+
+ # For dataset generation: keep group if at least one rollout has positive score
+ # For RL training: require score diversity (ensure_scores_are_not_same)
+ has_positive_score = any(s > 0 for s in scored["scores"])
+ if not has_positive_score:
+ return None, [] # Drop groups where all rollouts failed
+
+ if self.config.ensure_scores_are_not_same and len(set(scored["scores"])) == 1:
+ return None, []
+
+ # Final rollout summary
+ print("\n\033[92m=== FINAL ROLLOUT SUMMARY ===\033[0m")
+ for r_i, (ctx, score, num_success) in enumerate(
+ zip(contexts, scored["scores"], successful_turns_batch)
+ ):
+ last_assistant = next(
+ (m["content"] for m in reversed(ctx) if m["role"] == "assistant"),
+ "(no assistant message)",
+ )
+ print(
+ f"\n\033[96mRollout {r_i} · score={score:.3f} · successful_turns={num_success}\033[0m"
+ )
+ print(f"{last_assistant[:300]}{'...' if len(last_assistant) > 300 else ''}")
+ print("-" * 60)
+ print("=== END SUMMARY ===\n")
+
+ await self.add_rollouts_for_wandb(scored, item)
+ return scored, []
+
+ # ==========================================================================
+ # EVALUATION AND LOGGING
+ # ==========================================================================
+
+ def _score_episode(
+ self,
+ pred_calls: list,
+ exp_calls: list,
+ lam: float = 0.5,
+ ) -> Tuple[float, int]:
+ """Score an episode for evaluation."""
+ exp_jsons = [parse_expected_call(r) for r in exp_calls]
+
+ mismatch_penalty = 0.0
+ if pred_calls and "__MISMATCH__" in pred_calls:
+ pred_calls = [c for c in pred_calls if c != "__MISMATCH__"]
+ mismatch_penalty = self.config.wrong_call_penalty
+
+ pred_calls += [{}] * (len(exp_jsons) - len(pred_calls))
+
+ correct = sum(
+ 1 for p, e in zip(pred_calls, exp_jsons) if json_objects_match(p, e)
+ )
+ dense = correct / max(1, len(exp_jsons))
+ bonus = lam if correct == len(exp_jsons) else 0.0
+
+ return dense + bonus + mismatch_penalty, correct
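+
+ # Hedged example: with 3 expected calls and 2 matching predictions (and no
+ # "__MISMATCH__" marker), this returns (2/3, 2); with all 3 matching it returns
+ # (1.0 + lam, 3), i.e. (1.5, 3) at the default lam=0.5.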
+
+ async def rollout_and_score_eval(self, item) -> float:
+ """Single evaluation rollout."""
+ messages_tuple, expected_calls_by_turn, inter_turns = item
+ base_ctx = [dict(m) for m in messages_tuple]
+ ctx = list(base_ctx)
+ preds = []
+
+ for turn_idx, expected_turn_calls in enumerate(expected_calls_by_turn):
+ if turn_idx > 0 and turn_idx - 1 < len(inter_turns):
+ ctx.extend(inter_turns[turn_idx - 1])
+
+ prompt = self.tokenizer.apply_chat_template(
+ ctx, add_generation_prompt=True, tokenize=False
+ )
+ max_new = min(
+ self.config.max_gen_per_turn,
+ max(1, self.config.max_token_length - len(prompt)),
+ )
+
+ comp = await self.server.completion(
+ prompt=prompt, n=1, max_tokens=max_new, temperature=0.0, split="eval"
+ )
+ reply = comp.choices[0].text
+ ctx.append({"role": "assistant", "content": reply})
+
+ tool_jsons = (
+ extract_tool_calls(
+ reply, require_think_block=self.config.validate_think_blocks
+ )
+ if expected_turn_calls
+ else []
+ )
+ if tool_jsons is None:
+ break
+ preds.extend(tool_jsons)
+
+ if turn_idx >= len(expected_calls_by_turn) - 1:
+ break
+
+ expected_calls_flat = [
+ call for turn_calls in expected_calls_by_turn for call in turn_calls
+ ]
+ score, _ = self._score_episode(preds, expected_calls_flat)
+ return score
+
+ async def evaluate(self, *_, **__):
+ subset = self.test_items[: min(128, len(self.test_items))]
+ scores = await tqdm_asyncio.gather(
+ *[self.rollout_and_score_eval(it) for it in subset]
+ )
+ avg_reward = sum(scores) / len(scores)
+ pct_exact = sum(1 for s in scores if s >= 1.0) / len(scores)
+ self.eval_metrics.append(("eval/avg_reward", avg_reward))
+ self.eval_metrics.append(("eval/percent_correct", pct_exact))
+
+ async def create_rollout_table(self, wdict):
+ if self.rollouts_for_wandb:
+ table = wandb.Table(columns=["generation", "score", "expected_tool_call"])
+ for grp in self.rollouts_for_wandb:
+ for g, sc, exp in grp:
+ exp_str = json.dumps(exp, separators=(",", ":"))
+ table.add_data(g, sc, exp_str)
+ wdict["train/rollouts"] = table
+ self.rollouts_for_wandb = []
+ return wdict
+
+ async def add_rollouts_for_wandb(
+ self, scored: ScoredDataGroup, item: Item, *, num_keep: int = 1
+ ):
+ num_keep = min(num_keep, len(scored["tokens"]))
+ expected_calls_flat = [call for turn_calls in item[1] for call in turn_calls]
+ self.rollouts_for_wandb.append(
+ [
+ (
+ self.tokenizer.decode(scored["tokens"][i]),
+ scored["scores"][i],
+ expected_calls_flat,
+ )
+ for i in range(num_keep)
+ ]
+ )
+ if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
+ self.rollouts_for_wandb.pop(0)
+
+ async def wandb_log(self, metrics: Optional[Dict] = None):
+ metrics = metrics or {}
+ metrics = await self.create_rollout_table(metrics)
+ if self.raw_score_buffer:
+ avg_reward = sum(self.raw_score_buffer) / len(self.raw_score_buffer)
+ pct_correct = sum(self.percent_correct_buffer) / len(
+ self.percent_correct_buffer
+ )
+ metrics["train/avg_reward"] = avg_reward
+ metrics["train/percent_correct"] = pct_correct
+ self.raw_score_buffer.clear()
+ self.percent_correct_buffer.clear()
+ for k, v in self.eval_metrics:
+ metrics[k] = v
+ self.eval_metrics.clear()
+ await super().wandb_log(metrics)
+
+
+# ==============================================================================
+# Concrete Implementation: Tool-Calling Environment
+# ==============================================================================
+
+
+class MultiTurnToolUseTurnLevelAdvantageEnv(BaseMTGRPOEnv):
+ """
+ Concrete implementation of MT-GRPO for multi-turn tool-calling tasks.
+
+ This environment:
+ - Loads tool-calling conversation datasets
+ - Validates tool calls against expected JSON
+ - Computes turn-level rewards for tool execution and correctness
+ - Uses MT-GRPO advantage estimation for fine-grained credit assignment
+
+ To create your own environment, subclass BaseMTGRPOEnv and override:
+ - setup(): Load your dataset
+ - get_next_item(): Return training items
+ - compute_turn_reward(): Custom turn rewards (optional)
+ - compute_outcome_reward(): Custom outcome rewards (optional)
+ """
+
+ name = "multiturn_tool_use_turnlevel_advantage"
+
+ def __init__(
+ self,
+ config: MTGRPOEnvConfig,
+ server_configs: List[APIServerConfig],
+ slurm: bool = True,
+ testing: bool = False,
+ ):
+ super().__init__(config, server_configs, slurm, testing)
+
+ # Try local fixed dataset first, fall back to HuggingFace
+ local_path = "data/hermes_reasoning_tool_use_fixed.jsonl"
+ import os
+
+ if os.path.exists(local_path):
+ print(f"[dataset] Loading from local file: {local_path}")
+ self.ds = load_dataset("json", data_files=local_path, split="train")
+ else:
+ print(
+ "[dataset] Loading from HuggingFace: interstellarninja/hermes_reasoning_tool_use"
+ )
+ self.ds = load_dataset(
+ "interstellarninja/hermes_reasoning_tool_use", split="train"
+ )
+
+ @classmethod
+ def config_init(cls) -> Tuple[MTGRPOEnvConfig, List[APIServerConfig]]:
+ env_cfg = MTGRPOEnvConfig(
+ tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
+ group_size=16,
+ use_wandb=True,
+ rollout_server_url="http://localhost:8000",
+ total_steps=2000,
+ batch_size=1024,
+ steps_per_eval=20,
+ max_token_length=1024 * 64,
+ inference_weight=1.0,
+ wandb_name="multiturn_tool_use_turnlevel_advantage",
+ eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
+ eval_limit_ratio=0.1,
+ # MT-GRPO parameters
+ turn_level_advantage_lambda=1.0,
+ wrong_call_penalty=-0.2,
+ tool_execution_reward=0.5,
+ tool_match_reward=0.5,
+ summary_reward=1.0,
+ # Environment parameters
+ max_tool_call_turns_cap=3,
+ validate_think_blocks=True, # Dataset has <think> blocks
+ generate_all_gpt_turns=True,
+ add_dynamic_few_shot=True,
+ scenario_category="all", # Accept all scenario types
+ min_tool_call_turns=2, # Require at least 2 tool-calling turns for MT-GRPO
+ min_successful_turns=1, # Keep rollouts with at least 1 successful turn
+ use_parallel_requests=False,
+ skip_completed=True,
+ completed_dataset_id=None,
+ )
+ server_cfgs = [
+ APIServerConfig(
+ model_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
+ base_url="http://localhost:9004/v1",
+ api_key="x",
+ num_max_requests_at_once=32,
+ num_requests_for_eval=256,
+ )
+ ]
+ return env_cfg, server_cfgs
+
+ async def setup(self):
+ """Initialize dataset and prepare training/test items."""
+ ds = self.ds.shuffle()
+
+ # Load completed tasks filter
+ if self.config.skip_completed and self.config.completed_dataset_id:
+ try:
+ _done_ds = load_dataset(self.config.completed_dataset_id, split="train")
+ self.completed_tasks = set(_done_ds["task"])
+ print(f"[filter] Loaded {len(self.completed_tasks):,} completed tasks")
+ except Exception as exc:
+ self.completed_tasks = set()
+ print(f"[filter] Could not load completed-task dataset: {exc}")
+
+ # Statistics
+ counts = Counter()
+ for row in ds:
+ conv = row["conversations"]
+ num_turns = sum(
+ 1
+ for msg in conv
+ if msg["from"] in ("gpt", "assistant")
+ and re.search(r"", msg["value"], re.IGNORECASE)
+ )
+ counts[num_turns] += 1
+
+ print("Tool-call distribution:")
+ for k in sorted(counts):
+ print(f" {k:2d} turns → {counts[k]} examples")
+
+ split = ds.train_test_split(0.02)
+ split["train"] = split["train"].shuffle()
+ split["test"] = split["test"].shuffle()
+ self._prep_items(split["train"], is_train=True)
+ self._prep_items(split["test"], is_train=False)
+
+ random.shuffle(self.train_items)
+ random.shuffle(self.test_items)
+
+ if not self.train_items:
+ raise ValueError("No training items prepared: check dataset formatting.")
+
+ def _check_sequential_tools(self, conv: List[Dict[str, str]]) -> bool:
+ """Check if tool calls follow sequential pattern."""
+ tool_indices = [
+ i
+ for i, m in enumerate(conv)
+ if m["from"] in ("gpt", "assistant") and "" in m["value"].lower()
+ ]
+ if not tool_indices:
+ return False
+
+ for i in range(len(tool_indices) - 1):
+ start, end = tool_indices[i], tool_indices[i + 1]
+ in_between = conv[start + 1 : end]
+ if any(m["from"] != "tool" for m in in_between):
+ return False
+
+ last_tool_idx = tool_indices[-1]
+ next_responses = [
+ i
+ for i, m in enumerate(conv[last_tool_idx + 1 :], start=last_tool_idx + 1)
+ if m["from"] == "tool"
+ ]
+ if not next_responses:
+ return False
+
+ return True
+
+ def _prep_items(self, dataset, *, is_train: bool):
+ """Process dataset items for training/testing."""
+ target = self.train_items if is_train else self.test_items
+ before_len = len(target)
+
+ for row in dataset:
+ conv = row["conversations"]
+
+ if (
+ len(conv) < 3
+ or conv[0]["from"] != "system"
+ or conv[1]["from"] != "human"
+ ):
+ continue
+
+ if self.config.skip_completed and self.completed_tasks:
+ if conv[1]["value"].strip() in self.completed_tasks:
+ continue
+
+ tool_indices = [
+ i
+ for i, m in enumerate(conv)
+ if m["from"] in ("gpt", "assistant")
+ and "" in m["value"].lower()
+ ]
+
+ if not tool_indices:
+ continue
+
+ # Filter by minimum tool-calling turns
+ if len(tool_indices) < self.config.min_tool_call_turns:
+ continue
+
+ # Validate scenario
+ if self.config.scenario_category == "multistep":
+ if len(tool_indices) < 2:
+ continue
+ human_after_first_tool = any(
+ i > tool_indices[0] and m["from"] == "human"
+ for i, m in enumerate(conv)
+ )
+ if human_after_first_tool:
+ continue
+ if not self._check_sequential_tools(conv):
+ continue
+ elif self.config.scenario_category == "single":
+ if len(tool_indices) != 1:
+ continue
+ elif self.config.scenario_category == "multiturn":
+ # ─── STRICT MULTI-TURN PATTERN ───
+ # User → Assistant(tool_call) → Tool → Assistant(summary) → User → ...
+ # Each tool call is followed by a summary, then a user turn before next tool call
+
+ human_after_first_tool = any(
+ i > tool_indices[0] and m["from"] == "human"
+ for i, m in enumerate(conv)
+ )
+
+ # Must have at least one human after the first tool-call
+ if not human_after_first_tool:
+ continue
+
+ # Must have at least TWO tool-calling turns for true multiturn
+ if len(tool_indices) < 2:
+ continue
+
+ # First assistant turn must be the first tool-calling message
+ first_asst_idx = next(
+ (
+ i
+ for i, m in enumerate(conv[2:], start=2)
+ if m["from"] in ("gpt", "assistant")
+ ),
+ None,
+ )
+ if first_asst_idx != tool_indices[0]:
+ continue
+
+ # Build multiturn structure
+ expected_calls_by_turn = []
+ inter_turns = []
+ ok = True
+
+ for idx_t, tool_idx in enumerate(tool_indices):
+ # 1. Tool response directly after the tool-call
+ try:
+ tool_resp_idx = next(
+ j
+ for j in range(tool_idx + 1, len(conv))
+ if conv[j]["from"] == "tool"
+ )
+ except StopIteration:
+ ok = False
+ break
+
+ # 2. Assistant summary (no <tool_call>) after tool response
+ try:
+ summ_idx = next(
+ j
+ for j in range(tool_resp_idx + 1, len(conv))
+ if conv[j]["from"] in ("gpt", "assistant")
+ )
+ except StopIteration:
+ ok = False
+ break
+ if "", tool_call_msg, flags=re.IGNORECASE
+ ):
+ ok = False
+ break
+ tc_raws = re.findall(
+ r"\s*(.*?)\s*",
+ tool_call_msg,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+ if not tc_raws:
+ ok = False
+ break
+ turn_calls = []
+ for raw in tc_raws:
+ canon = canonicalise_tool_json(raw)
+ if canon is None:
+ ok = False
+ break
+ turn_calls.append(canon)
+ if not ok:
+ break
+ expected_calls_by_turn.append(turn_calls)
+
+ # Inter-turn after tool-call → tool response + narration helper
+ tool_content = strip_tool_response_tags(
+ conv[tool_resp_idx]["value"]
+ )
+ inter_turns.append(
+ [
+ {
+ "role": "tool",
+ "content": tool_content
+ + "\n\n"
+ + self.get_narration_helper(),
+ }
+ ]
+ )
+
+ # ─── Build turn B: assistant narration (no calls) ───
+ expected_calls_by_turn.append([])
+
+ # Inter-turn after narration → up to next tool-call (user messages, etc.)
+ slice_end = nxt_tool_idx if nxt_tool_idx is not None else len(conv)
+ next_ctx_slice = [
+ {
+ "role": "user" if m["from"] == "human" else "assistant",
+ "content": m["value"],
+ }
+ for m in conv[summ_idx + 1 : slice_end]
+ ]
+ inter_turns.append(next_ctx_slice)
+
+ if not ok:
+ continue
+
+ # Remove trailing inter-turn (nothing after final narration)
+ if inter_turns:
+ inter_turns.pop()
+
+ # Apply turn cap
+ cap = self.config.max_tool_call_turns_cap
+ if cap is not None:
+ keep_turns = 0
+ calls_seen = 0
+ for idx, calls in enumerate(expected_calls_by_turn):
+ keep_turns += 1
+ if calls:
+ calls_seen += 1
+ if calls_seen == cap:
+ if idx + 1 < len(expected_calls_by_turn):
+ keep_turns += 1
+ break
+ expected_calls_by_turn = expected_calls_by_turn[:keep_turns]
+ inter_turns = inter_turns[: keep_turns - 1]
+
+ # Build system prompt for multiturn
+ running_msgs = []
+ dataset_system = conv[0]["value"]
+ combined_system = dataset_system + "\n\n" + self.get_few_shot_example()
+ running_msgs.append(
+ frozenset({"role": "system", "content": combined_system}.items())
+ )
+ running_msgs.append(
+ frozenset({"role": "user", "content": conv[1]["value"]}.items())
+ )
+
+ if len(expected_calls_by_turn) >= 2:
+ target.append(
+ (tuple(running_msgs), expected_calls_by_turn, inter_turns)
+ )
+ continue # multiturn handled
+
+ elif self.config.scenario_category == "all":
+ # Accept all scenarios with at least one tool call
+ if len(tool_indices) < 1:
+ continue
+ else:
+ continue
+
+ # Build system prompt
+ running_msgs = []
+ # Use dataset's system prompt directly (already fixed with proper JSON syntax)
+ # Append our few-shot example for reinforcement
+ dataset_system = conv[0]["value"]
+ combined_system = dataset_system + "\n\n" + self.get_few_shot_example()
+ running_msgs.append(
+ frozenset({"role": "system", "content": combined_system}.items())
+ )
+
+ user_content = conv[1]["value"]
+ if self.config.scenario_category == "multistep":
+ user_content = f"{user_content}\n\n{self.get_sequential_helper()}"
+ running_msgs.append(
+ frozenset({"role": "user", "content": user_content}.items())
+ )
+
+ # Extract expected calls and inter-turns
+ expected_calls_by_turn = []
+ inter_turns = []
+ skip_conversation = False
+
+ for i, tool_idx in enumerate(tool_indices):
+ tool_call_msg = conv[tool_idx]["value"]
+
+ # Validate think blocks if required (skip entire conversation if missing)
+ if self.config.validate_think_blocks and not re.match(
+ r"^\s*", tool_call_msg, flags=re.IGNORECASE
+ ):
+ skip_conversation = True
+ break
+
+ matches = re.findall(
+ r"\s*(.*?)\s*",
+ tool_call_msg,
+ re.DOTALL | re.IGNORECASE,
+ )
+ if not matches:
+ skip_conversation = True
+ break
+
+ turn_calls = []
+ for raw in matches:
+ canon = canonicalise_tool_json(raw)
+ if canon:
+ turn_calls.append(canon)
+
+ if not turn_calls:
+ skip_conversation = True
+ break
+
+ expected_calls_by_turn.append(turn_calls)
+
+ # Build inter-turn messages (everything between this tool call and the next)
+ if i < len(tool_indices) - 1:
+ next_tool_idx = tool_indices[i + 1]
+ inter_msgs = []
+ for msg_idx in range(tool_idx + 1, next_tool_idx):
+ msg = conv[msg_idx]
+ role = (
+ "tool"
+ if msg["from"] == "tool"
+ else ("user" if msg["from"] == "human" else "assistant")
+ )
+ content = msg["value"]
+ # Strip outer <tool_response> tags if present (dataset may have them)
+ if role == "tool":
+ content = strip_tool_response_tags(content)
+ # Add continuation helper to tool responses to remind model about format
+ if role == "tool" and msg_idx == tool_idx + 1:
+ content = content + "\n\n" + self.get_continuation_helper()
+ inter_msgs.append({"role": role, "content": content})
+ if inter_msgs:
+ inter_turns.append(inter_msgs)
+
+ if skip_conversation:
+ continue
+
+ # Handle final summary turn
+ if self.config.generate_all_gpt_turns and tool_indices:
+ last_tool_response_idx = tool_indices[-1] + 1
+ has_final_narration = (
+ last_tool_response_idx + 1 < len(conv)
+ and conv[last_tool_response_idx + 1]["from"] in ("gpt", "assistant")
+ and "" not in conv[last_tool_response_idx + 1]["value"]
+ )
+ if has_final_narration:
+ expected_calls_by_turn.append([])
+ # Strip outer <tool_response> tags if present
+ tool_content = strip_tool_response_tags(
+ conv[last_tool_response_idx]["value"]
+ )
+ inter_turns.append(
+ [
+ {
+ "role": "tool",
+ "content": tool_content
+ + "\n\n"
+ + self.get_narration_helper(),
+ }
+ ]
+ )
+
+ # Apply turn cap
+ cap = self.config.max_tool_call_turns_cap
+ if cap is not None:
+ keep_turns = 0
+ calls_seen = 0
+ for idx, calls in enumerate(expected_calls_by_turn):
+ keep_turns += 1
+ if calls:
+ calls_seen += 1
+ if calls_seen == cap:
+ if self.config.generate_all_gpt_turns and idx + 1 < len(
+ expected_calls_by_turn
+ ):
+ keep_turns += 1
+ break
+ expected_calls_by_turn = expected_calls_by_turn[:keep_turns]
+ inter_turns = inter_turns[: keep_turns - 1]
+
+ # Add item - require at least one turn with expected calls
+ if len(expected_calls_by_turn) >= 1:
+ if (
+ self.config.scenario_category == "multistep"
+ and len(expected_calls_by_turn) >= 2
+ ):
+ target.append(
+ (tuple(running_msgs), expected_calls_by_turn, inter_turns)
+ )
+ elif self.config.scenario_category in ("single", "multiturn", "all"):
+ target.append(
+ (tuple(running_msgs), expected_calls_by_turn, inter_turns)
+ )
+
+ print(
+ f"[prep_items] {'train' if is_train else 'test'}: added {len(target) - before_len} items."
+ )
+
+ async def get_next_item(self):
+ """Return the next training item."""
+ if not self.train_items:
+ raise ValueError("train_items is empty – dataset preparation failed.")
+
+ if self.iter >= len(self.train_items):
+ random.shuffle(self.train_items)
+ self.iter = 0
+
+ itm = self.train_items[self.iter]
+ self.iter += 1
+ return itm
+
+
+if __name__ == "__main__":
+ MultiTurnToolUseTurnLevelAdvantageEnv.cli()