mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
109 lines
3 KiB
Python
109 lines
3 KiB
Python
from typing import Any, Dict, List, Literal, Optional
|
|
|
|
from openai.types.chat import ChatCompletionContentPartParam
|
|
from typing_extensions import TypedDict
|
|
|
|
# Message content: either a plain text string or a list of structured
# content parts (text / image / etc.) as defined by the OpenAI chat API.
Content = str | list[ChatCompletionContentPartParam]

# Opaque item payload; intentionally untyped.
Item = Any

# A numeric value that may be an int or a float.
number = int | float

# UUIDs are passed around as plain strings, not uuid.UUID objects.
UUID = str
|
|
|
|
|
|
class Message(TypedDict):
    """A single chat message exchanged with an LLM.

    Attributes:
        role: The chat role of the message sender.
        content: The message body — plain text or structured content parts.
        reward: Per-message reward signal; None when no reward applies.
    """

    # Restricted to the four standard chat roles.
    role: Literal["system", "user", "assistant", "tool"]
    content: Content
    # NOTE(review): Optional means the *value* may be None, but because this
    # TypedDict is total, the key itself is required — confirm callers always
    # supply it.
    reward: Optional[float]
|
|
|
|
|
|
class AgentStep(TypedDict, total=False):
    """Represents a single step in an agent's history.

    Declared with ``total=False``, so every key below is optional and may be
    absent from a given step dict.

    Attributes:
        step: The step number.
        messages: A list of messages exchanged during the step.
        reward: The reward received at this step.
    """

    step: int
    messages: List[Message]
    reward: float
|
|
|
|
|
|
# AgentHistory maps agent ids (e.g. "Player 1", "Player 2") to their
# respective list of steps, in chronological order.
AgentHistory = Dict[str, List[AgentStep]]
|
|
|
|
|
|
class Observation(TypedDict):
    """Represents an observation in a game history.

    Attributes:
        raw: The raw observation data (as a dictionary).
        rendered: The rendered form of the observation suitable for input
            into an LLM — a plain string or structured content parts.
    """

    raw: Dict[str, Any]
    rendered: Content
|
|
|
|
|
|
class GameStep(TypedDict):
    """Represents a single step in a game history. Essentially an (s,a,r) triple with metadata.

    Attributes:
        step: The step number.
        agent_id: Identifier of the agent who took the action.
        observation: The observation at this step.
        action: The action taken by the agent.
        reward: The reward received; can be a float or a dictionary mapping
            agent names to rewards (for multi-agent games).
        done: A flag indicating whether the game has ended after this step.
        info: Additional information related to the step.

    NOTE(review): this TypedDict is total, so every key is required — the
    previous docstring's "optional for final steps" / "if any" wording did
    not match the declaration; confirm producers always populate all keys.
    """

    step: int
    agent_id: str
    observation: Observation
    action: str
    # Single-agent games use a plain float; multi-agent games map
    # agent id -> reward.
    reward: float | Dict[str, float]
    done: bool
    info: Dict[str, Any]
|
|
|
|
|
|
# GameHistory is represented as a list of game steps, in chronological order.
GameHistory = List[GameStep]
|
|
|
|
|
|
class EvaluationConfigGeneral(TypedDict):
    """Configuration section of evaluation results.

    Attributes:
        total_evaluation_time_seconds: Wall-clock evaluation time.
        model_name: Name of the evaluated model, if known.
        generation_parameters: Sampling/generation parameters used.
    """

    # NOTE(review): declared as str (not float) — presumably pre-formatted
    # for serialization; confirm against the writer of this structure.
    total_evaluation_time_seconds: str
    model_name: Optional[str]
    generation_parameters: Dict[str, Any]
|
|
|
|
|
|
class EvaluationResults(TypedDict):
    """Results section containing metrics for tasks and aggregated results.

    Attributes:
        all: Aggregated metrics across all tasks, keyed by metric name.
    """

    all: Dict[str, float]  # Aggregated metrics across all tasks
|
|
|
|
|
|
class EvaluationMetrics(TypedDict):
    """Complete evaluation metrics JSON structure.

    Attributes:
        config_general: Run-level configuration metadata.
        results: Aggregated metric values.
    """

    config_general: EvaluationConfigGeneral
    results: EvaluationResults
|
|
|
|
|
|
class EvaluationSample(TypedDict, total=False):
    """Individual sample data written to JSONL files.

    Declared with ``total=False`` and every value Optional, so all fields
    are optional to accommodate different evaluation scenarios: a key may be
    absent entirely, or present with a None value.

    Attributes:
        messages: The chat transcript for this sample.
        question: The question posed to the model.
        gold_answer: The reference answer.
        gold_parsed: The parsed form of the reference answer.
        model_parsed: The parsed form of the model's answer.
        score: Numeric score assigned to the sample.
        correct: Whether the model's answer was judged correct.
        finish_reason: Generation finish reason reported by the API.
        response_after_think: Model response with any reasoning/"think"
            section stripped.
    """

    messages: Optional[List[Dict[str, str]]]
    question: Optional[str]
    gold_answer: Optional[str]
    gold_parsed: Optional[str]
    model_parsed: Optional[str]
    score: Optional[int]
    correct: Optional[bool]
    finish_reason: Optional[str]
    response_after_think: Optional[str]
|