Merge pull request #207 from NousResearch/2025-07-07-evaluate

Add evaluate subcommand to cli
This commit is contained in:
hjc-puro 2025-07-08 06:53:37 +08:00 committed by GitHub
commit 72e75c2b13
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 846 additions and 376 deletions

View file

@ -297,6 +297,17 @@ Always refer to the specific environment script's help for all available options
python environments/your_environment_script.py process --help
```
### Environment Evaluation with `evaluate`
For running evaluation on environments, Atropos provides an `evaluate` subcommand that calls the environment's `evaluate` method:
```sh
python gsm8k_server.py evaluate \
--openai.base_url https://openrouter.ai/api/v1 \
--openai.api_key $OPENROUTER_API_KEY \
--openai.model_name qwen/qwen3-14b
```
### Offline Data Generation Quick Start
Run the below in separate terminals:

View file

@ -1142,17 +1142,25 @@ class BaseEnv(ABC):
generate_html(self.config.data_path_to_save_groups)
async def _run_evaluate(self):
    """Run the environment's evaluation after performing setup.

    Awaits ``setup()`` first so the environment is initialized, then
    delegates to the environment's ``evaluate()`` implementation.
    """
    await self.setup()
    await self.evaluate()
@classmethod
def cli(cls):
"""
Command-line interface entry point for the environment.
This method handles the CLI commands for serve and process.
This method handles the CLI commands for serve, process, and evaluate.
"""
# Create subcommands dictionary
subcommands = {
"serve": cls.get_cli_serve_config_cls(),
"process": cls.get_cli_process_config_cls(),
"evaluate": cls.get_cli_evaluate_config_cls(),
}
# Custom exception handler for cleaner error output
@ -1603,3 +1611,251 @@ class BaseEnv(ABC):
asyncio.run(env.process_manager())
return CliProcessConfig
@classmethod
def get_cli_evaluate_config_cls(cls) -> type:
    """
    Returns the CLI configuration class for evaluate commands.

    The returned Pydantic model combines three namespaces into one CLI
    surface: the environment's own config fields (prefixed with the env
    namespace), a single OpenAI-compatible server config (prefixed with the
    openai namespace), and ServerManagerConfig flags. Its ``run()`` method
    performs the CLI > YAML > ``config_init`` merge and launches evaluation.

    Returns:
        type: The CliEvaluateConfig class for evaluate commands.
    """
    # Get the default configurations from the specific environment class via config_init
    default_env_config_from_init, default_server_configs_from_init = (
        cls.config_init()
    )

    # Define namespace prefixes used both for CLI flag names and for
    # extracting namespaced overrides later in run().
    env_full_prefix = f"{ENV_NAMESPACE}{NAMESPACE_SEP}"
    openai_full_prefix = f"{OPENAI_NAMESPACE}{NAMESPACE_SEP}"

    # Create Pydantic model classes based on the types from config_init.
    # The defaults from config_init will be the primary source of defaults.
    env_config_cls_from_init = type(default_env_config_from_init)

    # Handle server_configs_from_init appropriately for creating a default CLI model:
    # - If it's a list (multiple servers), we'll take the first one as a template
    #   for CLI args, or use APIServerConfig if the list is empty or contains
    #   ServerBaseline.
    # - If it's a single APIServerConfig, we use its type.
    # - If it's ServerBaseline, we use APIServerConfig type for CLI args to allow overrides.
    if isinstance(default_server_configs_from_init, list):
        if default_server_configs_from_init and isinstance(
            default_server_configs_from_init[0], APIServerConfig
        ):
            openai_config_cls_for_cli = type(default_server_configs_from_init[0])
            # Use the actual instance for default values later if it's a single config;
            # with multiple servers, fall back to a fresh default instance since no
            # single server's values can represent the list.
            default_openai_config_instance_for_cli = (
                default_server_configs_from_init[0]
                if len(default_server_configs_from_init) == 1
                else openai_config_cls_for_cli()
            )
        else:
            openai_config_cls_for_cli = (
                APIServerConfig  # Default to APIServerConfig for CLI definition
            )
            default_openai_config_instance_for_cli = APIServerConfig()
    elif isinstance(default_server_configs_from_init, APIServerConfig):
        openai_config_cls_for_cli = type(default_server_configs_from_init)
        default_openai_config_instance_for_cli = default_server_configs_from_init
    else:  # ServerBaseline or other
        openai_config_cls_for_cli = APIServerConfig
        default_openai_config_instance_for_cli = APIServerConfig()

    class CliEvaluateConfig(
        get_prefixed_pydantic_model(env_config_cls_from_init, env_full_prefix),
        get_prefixed_pydantic_model(openai_config_cls_for_cli, openai_full_prefix),
        ServerManagerConfig,  # ServerManagerConfig defaults are fine as is.
        Cmd,
    ):
        """
        Configuration for the evaluate command.

        Supports overrides via YAML config file and CLI arguments.
        Order of precedence: CLI > YAML > `config_init` defaults.
        """

        # Optional path to a YAML file supplying defaults; CLI flags win.
        config: str | None = Field(
            default=None,
            description="Path to .yaml config file. CLI args override this.",
        )

        def run(self) -> None:
            """The logic to execute for the 'evaluate' command."""
            # Set default wandb name if not provided and class has a name
            wandb_name_attr = f"{ENV_NAMESPACE}{NAMESPACE_SEP}wandb_name"
            if (
                getattr(self, wandb_name_attr, None) is None
                and cls.name is not None
            ):
                setattr(self, wandb_name_attr, cls.name)

            # Load configuration from YAML file if specified
            if self.config is not None:
                with open(self.config, "r") as f:
                    yaml_config = yaml.safe_load(f)
                print(f"Loaded config from {self.config}")
            else:
                yaml_config = {}

            # Get CLI flags passed with double dashes
            cli_passed_flags = get_double_dash_flags()

            # --- Configuration Merging ---
            # Priority: CLI > YAML > `config_init` defaults

            # 1. Environment Configuration
            # Start with defaults from config_init
            env_config_dict_base = default_env_config_from_init.model_dump()
            # Apply specific overrides for evaluate mode that are generally useful
            env_config_dict_base["use_wandb"] = True

            env_config_dict = merge_dicts(
                env_config_dict_base,  # `config_init` defaults with evaluate adjustments
                yaml_config.get(ENV_NAMESPACE, {}),  # YAML config
                extract_namespace(cli_passed_flags, env_full_prefix),  # CLI args
            )

            # 2. OpenAI Configuration
            oai_cli_passed_args = extract_namespace(
                cli_passed_flags, openai_full_prefix
            )  # CLI args
            yaml_oai_config = yaml_config.get(OPENAI_NAMESPACE, {})

            # Determine the base OpenAI config from config_init for merging.
            # This uses the instance we determined earlier for CLI definition defaults.
            openai_config_dict_base = (
                default_openai_config_instance_for_cli.model_dump()
            )

            if isinstance(default_server_configs_from_init, ServerBaseline) and (
                oai_cli_passed_args or yaml_oai_config
            ):
                # If config_init provided ServerBaseline, but CLI/YAML provides OpenAI specifics,
                # it implies an override intent for a single server.
                # We use the default_openai_config_instance_for_cli (which would be a default APIServerConfig)
                # as the base for merging, allowing it to be fully specified by YAML/CLI.
                pass  # Base is already set correctly for this case

            # Normalize the YAML openai section to a single-server dict; a
            # one-element list is unwrapped, anything else falls back to {}.
            if isinstance(yaml_oai_config, list) and len(yaml_oai_config) == 1:
                # If YAML specifies a single server config for OpenAI namespace
                yaml_oai_single_server_config = yaml_oai_config[0]
            elif isinstance(yaml_oai_config, dict):
                yaml_oai_single_server_config = yaml_oai_config
            else:
                yaml_oai_single_server_config = {}

            openai_config_dict = merge_dicts(
                openai_config_dict_base,  # Default from config_init (or default APIServerConfig)
                yaml_oai_single_server_config,  # YAML config for a single server
                oai_cli_passed_args,  # CLI args
            )

            # 3. Server Manager Configuration
            # slurm/testing are top-level (un-namespaced) flags, so they are
            # picked out of the flat CLI/YAML mappings by key.
            server_manager_cli_passed_flags = {}
            if "slurm" in cli_passed_flags:
                server_manager_cli_passed_flags["slurm"] = cli_passed_flags["slurm"]
            if "testing" in cli_passed_flags:
                server_manager_cli_passed_flags["testing"] = cli_passed_flags[
                    "testing"
                ]

            server_manager_yaml_dict = {}
            if "slurm" in yaml_config:
                server_manager_yaml_dict["slurm"] = yaml_config["slurm"]
            if "testing" in yaml_config:
                server_manager_yaml_dict["testing"] = yaml_config["testing"]

            # Start with ServerManagerConfig defaults, then apply YAML, then CLI.
            # For evaluate mode, slurm and testing are typically False unless specified.
            server_manager_config_dict_base = ServerManagerConfig(
                slurm=False, testing=False
            ).model_dump()

            server_manager_config_dict = merge_dicts(
                server_manager_config_dict_base,
                server_manager_yaml_dict,
                server_manager_cli_passed_flags,
            )

            # --- Instantiate Final Config Objects ---
            # Use the original class types from config_init (or APIServerConfig for OpenAI CLI)
            env_config = env_config_cls_from_init(**env_config_dict)
            server_manager_config = ServerManagerConfig(
                **server_manager_config_dict
            )

            # Determine the final server_configs.
            # For 'evaluate', we typically expect a single server configuration for the OAI part.
            # The resolve_openai_configs will handle complex cases, but for 'evaluate',
            # the openai_config_dict we built should represent the single intended server.
            # If default_server_configs_from_init was ServerBaseline, and we have openai_config_dict,
            # it means we are overriding to use a specific APIServerConfig.
            # If default_server_configs_from_init was a list or single APIServerConfig,
            # resolve_openai_configs will merge appropriately.
            final_openai_configs = resolve_openai_configs(
                default_server_configs=default_server_configs_from_init,  # Pass the original structure
                openai_config_dict=openai_config_dict,  # This is the merged single server config for CLI/YAML
                yaml_config=yaml_config,  # Pass full YAML for resolve_openai_configs logic
                cli_passed_flags=cli_passed_flags,  # Pass full CLI for resolve_openai_configs
                logger=logger,
            )

            # Add warning for localhost or 0.0.0.0 — evaluation against a
            # local URL silently produces nothing if no server is listening.
            if isinstance(final_openai_configs, list):
                for cfg in final_openai_configs:
                    if (
                        isinstance(cfg, APIServerConfig)
                        and cfg.base_url
                        and (
                            "localhost" in cfg.base_url
                            or "0.0.0.0" in cfg.base_url
                            or "127.0.0.1" in cfg.base_url
                        )
                    ):
                        warnings.warn(
                            "You are using a local Base URL for an OpenAI compatible server in 'evaluate' mode. "
                            "Ensure you have a server running at this address or results may not be generated.",
                            UserWarning,
                        )
                        break  # Warn once
            elif (
                isinstance(final_openai_configs, APIServerConfig)
                and final_openai_configs.base_url
                and (
                    "localhost" in final_openai_configs.base_url
                    or "0.0.0.0" in final_openai_configs.base_url
                    or "127.0.0.1" in final_openai_configs.base_url
                )
            ):
                warnings.warn(
                    "You are using a local Base URL for an OpenAI compatible server in 'evaluate' mode. "
                    "Ensure you have a server running at this address or results may not be generated.",
                    UserWarning,
                )

            # Echo the resolved configs so the user can see what will run.
            rprint(env_config)
            rprint(final_openai_configs)

            # --- Create and Run Environment ---
            # Create the environment instance
            env = cls(
                config=env_config,
                server_configs=final_openai_configs,
                slurm=server_manager_config.slurm,
                testing=server_manager_config.testing,
            )

            print("Running evaluation...")
            # Handle the case where we might already be in an event loop.
            # NOTE(review): if a loop IS already running, loop.run_until_complete()
            # itself raises RuntimeError ("This event loop is already running"),
            # which this except swallows — and asyncio.run() then also raises
            # inside a running loop. So the create_task branch can never complete
            # and only the asyncio.run() path (no running loop, the normal CLI
            # case) is effective in practice; confirm whether the first branch
            # is reachable/intended.
            try:
                loop = asyncio.get_running_loop()
                task = loop.create_task(env._run_evaluate())
                loop.run_until_complete(task)
            except RuntimeError:
                asyncio.run(env._run_evaluate())
            print("Evaluation completed.")

    return CliEvaluateConfig

View file

@ -144,15 +144,15 @@ env = PayToPlayEnv(config, server_configs, testing=True)
```python
async def training_loop():
await env.setup()
for step in range(config.total_steps):
# Get next question
question = await env.get_next_item()
# Agent selects agent cards and makes payments
# Evaluates response and gets training signal
scored_data, _ = await env.collect_trajectories(question)
# Log metrics
await env.wandb_log()
```
@ -162,7 +162,7 @@ async def training_loop():
### Pricing Strategy
- **Technical Expert ($0.03)**: Premium pricing reflects high accuracy and specialized knowledge
- **Communication Specialist ($0.02)**: Mid-tier pricing for clarity and accessibility focus
- **Communication Specialist ($0.02)**: Mid-tier pricing for clarity and accessibility focus
- **Creative Thinker ($0.01)**: Budget option encouraging creativity and innovation
### Budget Scenarios
@ -190,7 +190,7 @@ async def _agent_select_judges(self, question: str) -> JudgeSelection:
# 1. Agent analyzes question directly (no pre-categorization)
# 2. Get agent card performance stats
judge_stats = self._get_judge_performance_stats()
# 3. AI agent makes strategic decision with full context
selection_response = await self.server.chat_completion(
messages=[
@ -198,7 +198,7 @@ async def _agent_select_judges(self, question: str) -> JudgeSelection:
{"role": "user", "content": selection_prompt}
]
)
# 4. Validate and execute selection
return validated_selection
```
@ -285,19 +285,19 @@ Set `testing_mode=False` for real USDC payments on Base blockchain:
## 🎖️ Key Features
**Multiple Specialized Agent Cards**: Different expertise areas and pricing tiers
**Intelligent Agent Selection**: AI-driven agent card selection with dynamic question analysis
**Budget Awareness**: Real economic constraints drive efficient learning
**Performance Tracking**: Historical data informs future decisions
**Blockchain Integration**: Real USDC payments on Base network
**Comprehensive Monitoring**: Detailed metrics and decision analysis
**Fallback Mechanisms**: Robust handling of budget constraints
**Testing Framework**: Simulation mode for development and testing
**Multiple Specialized Agent Cards**: Different expertise areas and pricing tiers
**Intelligent Agent Selection**: AI-driven agent card selection with dynamic question analysis
**Budget Awareness**: Real economic constraints drive efficient learning
**Performance Tracking**: Historical data informs future decisions
**Blockchain Integration**: Real USDC payments on Base network
**Comprehensive Monitoring**: Detailed metrics and decision analysis
**Fallback Mechanisms**: Robust handling of budget constraints
**Testing Framework**: Simulation mode for development and testing
## 🚧 Future Enhancements
- **Dynamic Pricing**: Agent card prices adjust based on demand and performance
- **Agent Card Reputation System**: Community-driven agent card quality ratings
- **Agent Card Reputation System**: Community-driven agent card quality ratings
- **Multi-Round Evaluation**: Iterative feedback and improvement cycles
- **Agent Card Specialization**: More granular specialty categories
- **Economic Incentives**: Reward mechanisms for high-performing agent cards
@ -327,4 +327,3 @@ This environment builds upon recent advances in reinforcement learning from AI f
- **RLAIF vs. RLHF**: Lee, H., et al. (2023). "RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback." *arXiv preprint arXiv:2309.00267*. [https://arxiv.org/abs/2309.00267](https://arxiv.org/abs/2309.00267)
- **Mixture of Judges**: Xu, T., et al. (2024). "The Perfect Blend: Redefining RLHF with Mixture of Judges." *arXiv preprint arXiv:2409.20370*. [https://arxiv.org/abs/2409.20370](https://arxiv.org/abs/2409.20370)

View file

@ -9,12 +9,13 @@ Author: OpenBlock Labs
License: MIT
"""
import yaml
from decimal import Decimal
from typing import Dict, List, Tuple
from dataclasses import dataclass
from decimal import Decimal
from enum import Enum
from pathlib import Path
from typing import Dict, List, Tuple
import yaml
def _load_config():
@ -22,15 +23,11 @@ def _load_config():
# Deterministic path: from pay_to_play directory to modal/configs
config_file = Path(__file__).parent.parent / "configs" / "pay_to_play_modal.yaml"
if config_file.exists():
with open(config_file, 'r') as f:
with open(config_file, "r") as f:
return yaml.safe_load(f)
# Default config if file doesn't exist
return {
"model": {
"name": "microsoft/DialoGPT-small"
}
}
return {"model": {"name": "microsoft/DialoGPT-small"}}
# Load config once at module level
@ -40,10 +37,11 @@ _CONFIG = _load_config()
class AgentCardSpecialty(Enum):
"""
Agent card specialties for different types of evaluation.
Each specialty represents a domain of expertise that an agent card can provide.
Agent cards can have multiple specialties to handle diverse evaluation needs.
"""
TECHNICAL_ACCURACY = "technical_accuracy"
CLARITY_COMMUNICATION = "clarity_communication"
CREATIVE_THINKING = "creative_thinking"
@ -55,10 +53,10 @@ class AgentCardSpecialty(Enum):
class AgentCardConfig:
"""
Configuration for an agent card (without wallet credentials).
This class contains all the metadata needed to define an agent card's capabilities,
pricing, and evaluation approach. Wallet credentials are kept separate for security.
Attributes:
name: Human-readable name for the agent card
price_usd: Cost in USD to use this agent card for one evaluation
@ -67,6 +65,7 @@ class AgentCardConfig:
system_prompt: The prompt used to guide this agent card's evaluations
model_name: The specific model this agent uses for evaluation
"""
name: str
price_usd: Decimal
specialties: List[AgentCardSpecialty]
@ -100,7 +99,10 @@ AGENT_CARDS_CONFIG: Dict[str, AgentCardConfig] = {
"technical_expert": AgentCardConfig(
name="Technical Expert",
price_usd=Decimal("0.03"), # Premium pricing for specialized expertise
specialties=[AgentCardSpecialty.TECHNICAL_ACCURACY, AgentCardSpecialty.REASONING_LOGIC],
specialties=[
AgentCardSpecialty.TECHNICAL_ACCURACY,
AgentCardSpecialty.REASONING_LOGIC,
],
description=(
"Specialized in technical accuracy, complex reasoning, and factual correctness. "
"Excellent for STEM questions, programming challenges, and analytical tasks."
@ -121,9 +123,8 @@ AGENT_CARDS_CONFIG: Dict[str, AgentCardConfig] = {
"tags, then provide your evaluation.\n\n"
"End with \\boxed{score} where score is between 0.0 and 1.0."
),
model_name=_teacher_model # Use model from config
model_name=_teacher_model, # Use model from config
),
"communication_specialist": AgentCardConfig(
name="Communication Specialist",
price_usd=Decimal("0.02"), # Mid-tier pricing for communication focus
@ -146,9 +147,8 @@ AGENT_CARDS_CONFIG: Dict[str, AgentCardConfig] = {
"then provide your evaluation.\n\n"
"End with \\boxed{score} where score is between 0.0 and 1.0."
),
model_name=_teacher_model # Use model from config
model_name=_teacher_model, # Use model from config
),
"creative_thinker": AgentCardConfig(
name="Creative Thinker",
price_usd=Decimal("0.01"), # Budget pricing to encourage creative exploration
@ -172,7 +172,7 @@ AGENT_CARDS_CONFIG: Dict[str, AgentCardConfig] = {
"<think> </think> tags, then provide your evaluation.\n\n"
"End with \\boxed{score} where score is between 0.0 and 1.0."
),
model_name=_teacher_model # Use model from config
model_name=_teacher_model, # Use model from config
),
}
@ -180,13 +180,13 @@ AGENT_CARDS_CONFIG: Dict[str, AgentCardConfig] = {
def get_agent_card_config(agent_card_id: str) -> AgentCardConfig:
"""
Get configuration for a specific agent card.
Args:
agent_card_id: The unique identifier for the agent card
Returns:
AgentCardConfig object containing the agent card's configuration
Raises:
ValueError: If the agent_card_id is not found
"""
@ -202,26 +202,28 @@ def get_agent_card_config(agent_card_id: str) -> AgentCardConfig:
def get_all_agent_card_configs() -> Dict[str, AgentCardConfig]:
"""
Get all agent card configurations.
Returns:
Dictionary mapping agent card IDs to their configurations
"""
return AGENT_CARDS_CONFIG.copy()
def get_agent_cards_by_specialty(specialty: AgentCardSpecialty) -> Dict[str, AgentCardConfig]:
def get_agent_cards_by_specialty(
specialty: AgentCardSpecialty,
) -> Dict[str, AgentCardConfig]:
"""
Get all agent cards that have a specific specialty.
Args:
specialty: The specialty to filter by
Returns:
Dictionary of agent card IDs and configs that have the specified specialty
"""
return {
agent_card_id: config
for agent_card_id, config in AGENT_CARDS_CONFIG.items()
agent_card_id: config
for agent_card_id, config in AGENT_CARDS_CONFIG.items()
if specialty in config.specialties
}
@ -229,7 +231,7 @@ def get_agent_cards_by_specialty(specialty: AgentCardSpecialty) -> Dict[str, Age
def get_cheapest_agent_card() -> Tuple[str, AgentCardConfig]:
"""
Get the cheapest available agent card.
Returns:
Tuple of (agent_card_id, agent_card_config) for the lowest priced agent card
"""
@ -239,7 +241,7 @@ def get_cheapest_agent_card() -> Tuple[str, AgentCardConfig]:
def get_most_expensive_agent_card() -> Tuple[str, AgentCardConfig]:
"""
Get the most expensive available agent card.
Returns:
Tuple of (agent_card_id, agent_card_config) for the highest priced agent card
"""
@ -249,7 +251,7 @@ def get_most_expensive_agent_card() -> Tuple[str, AgentCardConfig]:
def get_price_range() -> Tuple[Decimal, Decimal]:
"""
Get the price range of all agent cards.
Returns:
Tuple of (min_price, max_price) across all agent cards
"""
@ -260,16 +262,16 @@ def get_price_range() -> Tuple[Decimal, Decimal]:
def validate_agent_card_configs() -> None:
"""
Validate that all agent card configurations are properly formatted.
This function is called automatically on module import to ensure
all agent card configurations are valid.
Raises:
ValueError: If any agent card configuration is invalid
"""
if not AGENT_CARDS_CONFIG:
raise ValueError("No agent card configurations defined")
# Validate each agent card configuration
for agent_card_id, config in AGENT_CARDS_CONFIG.items():
try:
@ -277,8 +279,10 @@ def validate_agent_card_configs() -> None:
# We just need to access it to trigger validation
_ = config.name
except Exception as e:
raise ValueError(f"Invalid configuration for agent card '{agent_card_id}': {e}")
raise ValueError(
f"Invalid configuration for agent card '{agent_card_id}': {e}"
)
# Ensure we have agent cards across different price points
min_price, max_price = get_price_range()
if min_price == max_price:
@ -286,4 +290,4 @@ def validate_agent_card_configs() -> None:
# Validate configurations on import
validate_agent_card_configs()
validate_agent_card_configs()

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,3 @@
web3>=6.0.0
eth-account>=0.8.0
base58>=2.1.0
base58>=2.1.0

View file

@ -20,4 +20,4 @@
"network": "base",
"chain_id": 8453,
"usdc_contract": "0x833589fCD6eDb6E08f4c7C32D4f71b54bdA02913"
}
}