mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
109 lines
3 KiB
Python
109 lines
3 KiB
Python
from typing import Any, Dict, List, Literal, Optional
|
|
|
|
from openai.types.chat import ChatCompletionContentPartParam
|
|
from typing_extensions import TypedDict
|
|
|
|
# Message content: either a plain text string or a list of structured
# content parts (text / image / etc.) as defined by the OpenAI chat API.
Content = str | list[ChatCompletionContentPartParam]

# Opaque item payload; intentionally untyped.
Item = Any

# A numeric value that may be an int or a float.
number = int | float

# UUIDs are passed around as plain strings, not uuid.UUID objects.
UUID = str
|
|
|
|
|
|
class Message(TypedDict):
    """A single chat message exchanged with an LLM.

    Attributes:
        role: The chat role of the message sender.
        content: The message body — plain text or structured content parts.
        reward: Per-message reward signal; None when no reward applies.
    """

    # Restricted to the four standard chat roles.
    role: Literal["system", "user", "assistant", "tool"]
    content: Content
    # NOTE(review): Optional means the *value* may be None, but because this
    # TypedDict is total, the key itself is required — confirm callers always
    # supply it.
    reward: Optional[float]
|
|
|
|
|
|
class AgentStep(TypedDict, total=False):
    """Represents a single step in an agent's history.

    Declared with ``total=False``, so every key below is optional and may be
    absent from a given step dict.

    Attributes:
        step: The step number.
        messages: A list of messages exchanged during the step.
        reward: The reward received at this step.
    """

    step: int
    messages: List[Message]
    reward: float
|
|
|
|
|
|
# AgentHistory maps agent ids (e.g. "Player 1", "Player 2") to their
# respective list of steps, in chronological order.
AgentHistory = Dict[str, List[AgentStep]]
|
|
|
|
|
|
class Observation(TypedDict):
    """Represents an observation in a game history.

    Attributes:
        raw: The raw observation data (as a dictionary).
        rendered: The rendered form of the observation suitable for input
            into an LLM — a plain string or structured content parts.
    """

    raw: Dict[str, Any]
    rendered: Content
|
|
|
|
|
|
class GameStep(TypedDict):
    """Represents a single step in a game history. Essentially an (s,a,r) triple with metadata.

    Attributes:
        step: The step number.
        agent_id: Identifier of the agent who took the action.
        observation: The observation at this step.
        action: The action taken by the agent.
        reward: The reward received; can be a float or a dictionary mapping
            agent names to rewards (for multi-agent games).
        done: A flag indicating whether the game has ended after this step.
        info: Additional information related to the step.

    NOTE(review): this TypedDict is total, so every key is required — the
    previous docstring's "optional for final steps" / "if any" wording did
    not match the declaration; confirm producers always populate all keys.
    """

    step: int
    agent_id: str
    observation: Observation
    action: str
    # Single-agent games use a plain float; multi-agent games map
    # agent id -> reward.
    reward: float | Dict[str, float]
    done: bool
    info: Dict[str, Any]
|
|
|
|
|
|
# GameHistory is represented as a list of game steps, in chronological order.
GameHistory = List[GameStep]
|
|
|
|
|
|
class EvaluationConfigGeneral(TypedDict):
    """Configuration section of evaluation results.

    Attributes:
        total_evaluation_time_seconds: Wall-clock evaluation time.
        model_name: Name of the evaluated model, if known.
        generation_parameters: Sampling/generation parameters used.
    """

    # NOTE(review): declared as str (not float) — presumably pre-formatted
    # for serialization; confirm against the writer of this structure.
    total_evaluation_time_seconds: str
    model_name: Optional[str]
    generation_parameters: Dict[str, Any]
|
|
|
|
|
|
class EvaluationResults(TypedDict):
    """Results section containing metrics for tasks and aggregated results.

    Attributes:
        all: Aggregated metrics across all tasks, keyed by metric name.
    """

    all: Dict[str, float]  # Aggregated metrics across all tasks
|
|
|
|
|
|
class EvaluationMetrics(TypedDict):
    """Complete evaluation metrics JSON structure.

    Attributes:
        config_general: Run-level configuration metadata.
        results: Aggregated metric values.
    """

    config_general: EvaluationConfigGeneral
    results: EvaluationResults
|
|
|
|
|
|
class EvaluationSample(TypedDict, total=False):
    """Individual sample data written to JSONL files.

    Declared with ``total=False`` and every value Optional, so all fields
    are optional to accommodate different evaluation scenarios: a key may be
    absent entirely, or present with a None value.

    Attributes:
        messages: The chat transcript for this sample.
        question: The question posed to the model.
        gold_answer: The reference answer.
        gold_parsed: The parsed form of the reference answer.
        model_parsed: The parsed form of the model's answer.
        score: Numeric score assigned to the sample.
        correct: Whether the model's answer was judged correct.
        finish_reason: Generation finish reason reported by the API.
        response_after_think: Model response with any reasoning/"think"
            section stripped.
    """

    messages: Optional[List[Dict[str, str]]]
    question: Optional[str]
    gold_answer: Optional[str]
    gold_parsed: Optional[str]
    model_parsed: Optional[str]
    score: Optional[int]
    correct: Optional[bool]
    finish_reason: Optional[str]
    response_after_think: Optional[str]
|