atropos/environments/eval_environments/verifiers_eval.py
2026-01-15 11:34:40 +05:30

346 lines
11 KiB
Python

"""
Verifiers Evaluation Environment for Atropos
This environment evaluates models using Prime Intellect's Verifiers library.
It supports any environment registered with the Verifiers ecosystem.
Uses verifiers' native rollout and scoring machinery - just pass an OpenAI-compatible
client and verifiers handles generation, parsing, and scoring.
To install a Verifiers/Prime environment:
1. uv tool install prime
2. prime login
3. prime env install will/wordle (or any owner/environment)
Docs: https://docs.primeintellect.ai/tutorials-environments/install
Usage:
python verifiers_eval.py \
--server-url http://localhost:8000/v1 \
--model-name Qwen/Qwen2.5-7B-Instruct \
--vf-env-name primeintellect/gsm8k \
--max-eval-items 100
Works with any OpenAI-compatible API (OpenAI, vLLM, SGLang, Ollama, etc.)
"""
import argparse
import asyncio
import functools
import json
import time
from typing import Tuple

import verifiers as vf
from openai import AsyncOpenAI

from atroposlib.envs.eval import EvalBase, evaluate_log
from atroposlib.envs.server_handling.server_manager import ServerManager
# Patch math_verify timeout to work in async context
# The signal-based timeout doesn't work in non-main threads (asyncio event loop)
def _no_signal_timeout(_timeout_seconds: int):
"""Replacement timeout decorator that doesn't use signals."""
def decorator(func):
def wrapper(*args, **kwargs):
# Just call the function without timeout - safe in async context
return func(*args, **kwargs)
return wrapper
return decorator
try:
    import math_verify.grader
    import math_verify.parser
    import math_verify.utils

    # Patch all modules that use the timeout decorator.
    # Each module holds its own reference to `timeout`, so every one must be
    # replaced for the patch to apply consistently.
    # NOTE(review): this only affects uses of `timeout` resolved after import
    # (e.g. applied lazily or looked up via the module namespace); functions
    # already decorated at math_verify import time would keep the original —
    # confirm against the installed math_verify version.
    math_verify.utils.timeout = _no_signal_timeout
    math_verify.parser.timeout = _no_signal_timeout
    math_verify.grader.timeout = _no_signal_timeout
except ImportError:
    pass  # math_verify not installed; the patch is only needed when it is
class VerifiersEval(EvalBase):
    """Evaluate a model against a Prime Intellect Verifiers environment.

    Uses verifiers' native batch evaluation for efficiency (generation,
    parsing, and scoring all happen inside ``vf_env.evaluate``), with
    EvalBase's standardized logging via ``evaluate_log()``.

    Works with any OpenAI-compatible API (OpenAI, vLLM, SGLang, Ollama, etc.)
    """

    def __init__(
        self,
        vf_env_name: str = "primeintellect/gsm8k",
        env_args: dict = None,
        temperature: float = 0.0,
        max_tokens: int = 2048,
        max_eval_items: int = -1,
        max_concurrent: int = 64,
        eval_dir: str = None,
        verbose: bool = True,
        **kwargs,
    ) -> None:
        """Load the verifiers environment and initialize EvalBase.

        Args:
            vf_env_name: Registered verifiers environment id ("owner/env").
            env_args: Extra kwargs forwarded to ``vf.load_environment``.
            temperature: Sampling temperature used for generation.
            max_tokens: Maximum completion tokens per request.
            max_eval_items: Cap on evaluated items (-1 evaluates everything).
            max_concurrent: Maximum in-flight requests during evaluation.
            eval_dir: Directory forwarded to EvalBase / evaluate_log output.
            verbose: Forwarded to EvalBase for logging verbosity.
            **kwargs: Passed through to ``EvalBase.__init__``.
        """
        self.vf_env_name = vf_env_name
        self.env_args = env_args or {}
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.max_eval_items = max_eval_items
        self.max_concurrent = max_concurrent
        # Load verifiers environment
        self.vf_env = vf.load_environment(vf_env_name, **self.env_args)
        # NOTE(review): uses the private Rubric._get_reward_func_names helper —
        # confirm it still exists in the pinned verifiers version on upgrade.
        self.reward_func_names = self.vf_env.rubric._get_reward_func_names()
        # Initialize EvalBase (calls setup_data)
        super().__init__(
            eval_dir=eval_dir,
            verbose=verbose,
            **kwargs,
        )

    def get_generation_params(self) -> dict:
        """Generation params recorded via evaluate_log (mirrors sampling_args)."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "n": 1,  # one completion per prompt
        }

    def setup_data(self) -> list:
        """Return evaluation dataset from verifiers environment.

        Truncates to the first ``max_eval_items`` rows when a positive cap is
        set; the resulting list is stored by EvalBase as ``self.data``.
        """
        dataset = self.vf_env.get_eval_dataset()
        if self.max_eval_items > 0:
            # Deterministic truncation: always take the leading N items.
            n = min(len(dataset), self.max_eval_items)
            dataset = dataset.select(range(n))
        return dataset.to_list()

    async def run_item(
        self, server: ServerManager, data_item: dict  # noqa: ARG002
    ) -> Tuple[dict, list]:
        """Not used - we override __call__ for batch evaluation."""
        raise NotImplementedError(
            "VerifiersEval uses batch evaluation via __call__, not per-item run_item"
        )

    async def __call__(self, server_manager: ServerManager) -> dict:
        """Run evaluation using verifiers' native batch machinery.

        Args:
            server_manager: Supplies the first server's config, from which an
                ``AsyncOpenAI`` client is constructed.

        Returns:
            Dict with "accuracy" and "avg_score" summary metrics.
        """
        start_time = time.time()
        # Create OpenAI client from server config (only the first server is
        # used; verifiers drives all requests through this one client).
        server = server_manager.servers[0]
        client = AsyncOpenAI(
            api_key=server.config.api_key or "x",  # placeholder for keyless servers
            base_url=server.config.base_url,
            timeout=getattr(server.config, "timeout", 600),
        )
        model = server.config.model_name
        print(f"\n{'=' * 60}")
        print(f"Verifiers Evaluation: {self.vf_env_name}")
        print(f"{'=' * 60}")
        print(f" Model: {model}")
        print(f" Items: {len(self.data)}")
        print(f" Reward functions: {self.reward_func_names}")
        print(f" Temperature: {self.temperature}")
        print(f" Max concurrent: {self.max_concurrent}")
        print(f"{'=' * 60}\n")
        # -1 tells verifiers to evaluate the full dataset (matches the CLI
        # convention for --max-eval-items).
        num_examples = self.max_eval_items if self.max_eval_items > 0 else -1
        # Use verifiers' batch evaluation
        results = await self.vf_env.evaluate(
            client=client,
            model=model,
            sampling_args={
                "temperature": self.temperature,
                "max_tokens": self.max_tokens,
            },
            num_examples=num_examples,
            max_concurrent=self.max_concurrent,
            save_results=False,
        )
        end_time = time.time()
        # Extract from verifiers output.
        # NOTE(review): assumes results supports mapping access with keys
        # "reward"/"metrics"/"prompt"/"completion"/"answer", each a parallel
        # per-item sequence — confirm against the verifiers GenerateOutputs
        # shape for the pinned version.
        rewards = results["reward"]
        per_func_metrics = results["metrics"]
        prompts = results["prompt"]
        completions = results["completion"]
        answers = results["answer"]
        total = len(rewards)
        # "Correct" here means strictly positive total reward.
        correct = sum(1 for r in rewards if r > 0)
        avg_score = sum(rewards) / total if total > 0 else 0.0
        accuracy = correct / total if total > 0 else 0.0
        # Per-reward function breakdown
        reward_breakdown = {}
        for func_name, values in per_func_metrics.items():
            if values:
                reward_breakdown[func_name] = {
                    "avg": sum(values) / len(values),
                    "correct": sum(1 for v in values if v > 0),
                }
        metrics = {
            "accuracy": accuracy,
            "avg_score": avg_score,
        }
        # Print results summary
        print(f"\n{'=' * 60}")
        print("Verifiers Evaluation Results")
        print(f"{'=' * 60}")
        print(f" Average Score: {avg_score:.4f}")
        print(f" Accuracy: {accuracy:.2%} ({correct}/{total})")
        print(f" Time: {end_time - start_time:.1f}s")
        print("\n Per-Reward Function:")
        for name, data in reward_breakdown.items():
            print(
                f" {name}: avg={data['avg']:.4f}, correct={data['correct']}/{total}"
            )
        print(f"{'=' * 60}\n")
        # Build samples for logging
        system_prompt = self.vf_env.system_prompt or ""
        samples = []
        for i in range(min(total, 100)):  # Limit samples for logging
            # Prompts/completions may be chat message lists or raw strings;
            # only list-shaped entries are merged into the message transcript.
            prompt_msgs = prompts[i] if isinstance(prompts[i], list) else []
            completion_msgs = completions[i] if completions[i] else []
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.extend(prompt_msgs)
            if isinstance(completion_msgs, list):
                messages.extend(completion_msgs)
            samples.append(
                {
                    "messages": messages,
                    "gold_answer": answers[i] if i < len(answers) else "",
                    "score": rewards[i],
                    "correct": rewards[i] > 0,
                    # Per-function metric values for this item; guarded in case
                    # a metric list is shorter than the reward list.
                    "metrics": {
                        k: v[i] for k, v in per_func_metrics.items() if i < len(v)
                    },
                }
            )
        # Use EvalBase's evaluate_log
        task_name = f"VerifiersEval@{self.vf_env_name.replace('/', '_')}"
        evaluate_log(
            metrics=metrics,
            eval_dir=getattr(self, "eval_dir", None),
            task_name=task_name,
            model_name=model,
            start_time=start_time,
            end_time=end_time,
            generation_parameters=self.get_generation_params(),
            samples=samples,
            verbose=getattr(self, "verbose", False),
        )
        return metrics
async def main():
    """CLI entry point for verifiers evaluation.

    Parses command-line arguments, builds a single-server ``ServerManager``,
    and runs ``VerifiersEval`` against it.

    Returns:
        The metrics dict produced by the evaluation.

    Raises:
        json.JSONDecodeError: If --env-args is not valid JSON.
    """
    import os

    from atroposlib.envs.server_handling.server_baseline import APIServerConfig

    parser = argparse.ArgumentParser(
        description="Evaluate models using Verifiers environments"
    )
    # Server args (same as eval_runner)
    parser.add_argument(
        "--server-url",
        type=str,
        default="http://localhost:8000/v1",
        help="URL of the inference server",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        required=True,
        help="Model name to evaluate",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=os.getenv("OPENAI_API_KEY", "x"),
        help="API key (defaults to OPENAI_API_KEY env var)",
    )
    # Verifiers-specific args
    parser.add_argument(
        "--vf-env-name",
        type=str,
        default="primeintellect/gsm8k",
        help="Verifiers environment name (e.g., primeintellect/gsm8k)",
    )
    # Expose VerifiersEval's env_args parameter on the CLI (previously it
    # could only be set programmatically). Default None keeps old behavior.
    parser.add_argument(
        "--env-args",
        type=str,
        default=None,
        help="JSON dict of kwargs passed to vf.load_environment",
    )
    parser.add_argument(
        "--max-eval-items",
        type=int,
        default=-1,
        help="Maximum items to evaluate (-1 for all)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Generation temperature",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=2048,
        help="Maximum tokens per completion",
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=64,
        help="Maximum concurrent requests",
    )
    parser.add_argument(
        "--eval-dir",
        type=str,
        default=None,
        help="Directory to save evaluation results",
    )
    args = parser.parse_args()
    # Parse env overrides up front so a malformed JSON string fails fast,
    # before any environment loading or server setup.
    env_args = json.loads(args.env_args) if args.env_args else {}
    # Create server manager
    server_manager = ServerManager(
        configs=[
            APIServerConfig(
                api_key=args.api_key,
                base_url=args.server_url,
                model_name=args.model_name,
                health_check=False,
            ),
        ]
    )
    # Create and run evaluation
    eval_instance = VerifiersEval(
        vf_env_name=args.vf_env_name,
        env_args=env_args,
        max_eval_items=args.max_eval_items,
        temperature=args.temperature,
        max_tokens=args.max_tokens,
        max_concurrent=args.max_concurrent,
        eval_dir=args.eval_dir,
        verbose=True,
    )
    return await eval_instance(server_manager)
if __name__ == "__main__":
    # Script entry point: run the async CLI under a fresh event loop.
    asyncio.run(main())