#!/usr/bin/env python3
"""
EVM Environment for Atropos: Ethereum Virtual Machine Transaction Agent Training

This environment trains language models to generate and execute correct
Ethereum transactions using Anvil (Foundry's local blockchain simulation).
"""

import json
import logging
import os
import random
import re
import sys
import traceback
from typing import Any, Dict, List, Optional, Tuple

from anvil import AnvilBackend, AnvilConfig
from evm_config import EVMEnvConfig
from openai import OpenAI
from utils import cleanup_blockchain, cleanup_manager, setup_evm_error_message

from atroposlib.envs.base import BaseEnv, ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item
from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer

# Module-level logger
logger = logging.getLogger(__name__)

# System prompt for the EVM transaction agent
system_prompt = (
    "You are a deep thinking AI, you may use extremely long chains of thought "
    "to deeply consider the problem and deliberate with yourself via systematic "
    "reasoning processes to help come to a correct solution prior to answering. "
    "You should enclose your thoughts and internal monologue inside <think> "
    "</think> tags, and then provide your solution or response to the problem.\n\n"
)
system_prompt += """You are allowed to use a maximum of 2048 tokens. Please strive to use fewer.

You are here to assist a user in executing transfers of both ETH and ERC-20 tokens as requested.
Your job is to generate correct Ethereum transaction data for the requested action.

IMPORTANT: After your thinking, your response must include a valid JSON transaction object:
{"to": "0x...", "value": "amount_in_wei", "data": "0x..."}

- 'to': The recipient address (contract or EOA)
- 'value': Amount of ETH to send in wei (string)
- 'data': Transaction calldata as a hex string ("0x" for plain ETH transfers)

If you do not provide a valid JSON transaction object, your submission will be ignored and you \
will receive a score of -1.0.

Example 1 (ETH transfer of 0.01 ETH, expressed in wei):
{
    "to": "0xe688b84b23f322a994A53dbF8E15FA82CDB71127",
    "value": "10000000000000000",
    "data": "0x"
}

Example 2 (ERC-20 transfer; no ETH is sent, so value is "0"):
{
    "to": "0xEA29e9da69317d80075fBfc836E843C6d65971F5",
    "value": "0",
    "data": "0xa9059cbb000000000000000000000000ea29e9da69317d80075fbfc836e843c6d65971f50000000000000000000000000000000000000000000000000000000005f5e100"
}
"""
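
# Worked example (illustrative only, not used by the environment): how the
# ERC-20 "data" field in Example 2 above is assembled. "a9059cbb" is the
# 4-byte selector for transfer(address,uint256) (the first 4 bytes of
# keccak256("transfer(address,uint256)")); the recipient address and the
# token amount are each left-padded to 32 bytes. 100 USDT at 6 decimals is
# 100 * 10**6 = 100_000_000 = 0x5f5e100, matching the final word above.
def _example_encode_erc20_transfer(recipient: str, amount_base_units: int) -> str:
    """Sketch of hand-encoding transfer(address,uint256) calldata."""
    selector = "a9059cbb"
    padded_address = recipient.lower().removeprefix("0x").rjust(64, "0")
    padded_amount = format(amount_base_units, "x").rjust(64, "0")
    return "0x" + selector + padded_address + padded_amount
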
class EVMEnv(BaseEnv):
    """EVM Transaction Environment for training agents to interact with Ethereum"""

    name = "evm_agent"
    env_config_cls = EVMEnvConfig

    def __init__(
        self,
        config: EVMEnvConfig,
        server_configs: List[APIServerConfig],
        slurm=True,
        testing=False,
    ):
        """Initialize the EVM environment"""
        super().__init__(config, server_configs, slurm, testing)

        # Set up minimal logging - only for essential operations
        self.logger = logging.getLogger(f"{self.__class__.__name__}")
        self.logger.setLevel(logging.WARNING)  # Only warnings and errors
        self.logger.propagate = False
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter("%(message)s")  # Clean format
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        # Suppress base environment logs
        if config.suppress_base_env_logs:
            base_logger = logging.getLogger("atroposlib.envs.base")
            base_logger.setLevel(logging.WARNING)

        # Load Anvil configuration
        self.anvil_config = AnvilConfig(config.anvil_config_path)

        # Initialize blockchain handler
        self.blockchain = AnvilBackend(self.anvil_config)

        # Performance tracking for adaptive question selection
        self.question_performance = {qtype: [] for qtype in config.question_types}
        self.current_question_type = None

        # Store current prompt data for scoring
        self.current_prompt_data = None

        # Register cleanup with the global cleanup manager
        cleanup_manager.register_cleanup(cleanup_blockchain, self.blockchain)

    async def setup(self):
        """Set up the EVM environment and start Anvil"""
        try:
            print("Starting Anvil blockchain simulation...")
            self.blockchain.start()
            self.blockchain.setup_wallet()
            print("EVM environment setup completed successfully.")
        except Exception as e:
            error_message = setup_evm_error_message(self.anvil_config, e)
            print(error_message)
            # Clean up and exit
            cleanup_blockchain(self.blockchain)
            sys.exit(1)

    async def get_next_item(self) -> Optional[Item]:
        """Generate the next transaction challenge for the agent"""
        try:
            # Select question type based on performance (exploration vs exploitation)
            question_type = self._select_question_type()
            self.current_question_type = question_type

            # Generate the question prompt and its structured ground-truth data
            prompt_text, prompt_data = await self._generate_question_prompt(
                question_type
            )

            # Store the prompt data for scoring
            self.current_prompt_data = prompt_data

            self.logger.debug("\n=== Generated Input ===")
            self.logger.debug(prompt_text)
            self.logger.debug("=" * 50)

            prompt = tuple(
                [frozenset({"role": "user", "content": prompt_text}.items())]
            )
            return (prompt, None, None)
        except Exception as e:
            print(f"Error in get_next_item: {e}")
            traceback.print_exc()
            return None

    def _select_question_type(self) -> str:
        """Select a question type using a configurable weakness-targeting strategy"""
        # If no performance data yet, select randomly
        if not any(self.question_performance.values()):
            return random.choice(self.config.question_types)

        # Calculate average scores for each question type; untested types
        # default to 0.0 so they are prioritized as weak areas
        avg_scores = {}
        for qtype, scores in self.question_performance.items():
            if scores:
                avg_scores[qtype] = sum(scores) / len(scores)
            else:
                avg_scores[qtype] = 0.0

        # Split into weak and strong areas based on the configurable threshold
        weak_threshold = self.config.weak_performance_threshold
        weak_qtypes = [
            qtype for qtype, score in avg_scores.items() if score < weak_threshold
        ]
        strong_qtypes = [
            qtype for qtype, score in avg_scores.items() if score >= weak_threshold
        ]

        # Configurable focus on weak areas, with occasional strong-area
        # revisits for mastery maintenance
        if random.random() < self.config.weak_area_focus_ratio and weak_qtypes:
            selected_type = random.choice(weak_qtypes)
        elif strong_qtypes:
            selected_type = random.choice(strong_qtypes)
        else:
            selected_type = random.choice(list(avg_scores.keys()))

        return selected_type
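
    # Illustrative walk-through of the selection strategy above (hypothetical
    # numbers): with weak_performance_threshold = 0.5 and
    # weak_area_focus_ratio = 0.8,
    #   avg_scores = {"ETH transfer": 0.9, "ERC-20 transfer": 0.3}
    # places "ERC-20 transfer" in weak_qtypes, so roughly 80% of draws target
    # it while the remaining draws revisit the already-mastered "ETH transfer".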
    async def _generate_question_prompt(
        self, question_type: str
    ) -> Tuple[str, Optional[Dict[str, Any]]]:
        """Generate a dynamic question prompt using an LLM, based on the question type"""
        # Initialize OpenAI client
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # Create the prompt asking the LLM to generate a user request
        llm_prompt = f"""You are generating a natural language transaction request for an Ethereum blockchain agent.

TRANSACTION TYPE: "{question_type}"

CONTEXT:
- Wallet Address: {self.anvil_config.funding.custom_wallet}
- Current Balances: {json.dumps(self.blockchain.get_wallet_balances(), indent=2)}

TASK: Generate a realistic, conversational transaction request that:
1. Matches the specified transaction type exactly
2. Does not use more than the current wallet balances; transfers should typically be small, or occasionally larger, like how a real person would use their assets
3. Includes all necessary details (token, amount, destination address)
4. Sounds like how a real user would naturally request a transaction
5. Varies in tone and style (casual, formal, urgent, etc.)

REQUIREMENTS:
- Use realistic destination addresses (not placeholder text like "0x1234567890123456789012345678901234567890")
- Do not specify amounts larger than 50% of the current balance
- Make the request executable

Generate ONE natural language request that matches the transaction type "{question_type}".

Respond with a JSON object with the following fields:
- question_type: The type of transaction to generate
- request: The natural language request text
- destination_address: The destination address
- transfer_token: The token to transfer
- transfer_amount: The amount to transfer

Examples:

1. ETH transfer
{{
    "question_type": "ETH transfer",
    "request": "yo, can i send 0.01 ETH to my buddy jasper? His address is jasper.eth",
    "destination_address": "jasper.eth",
    "transfer_token": "ETH",
    "transfer_amount": "0.01"
}}

2. ERC-20 transfer using 18 decimal token
{{
    "question_type": "ERC-20 transfer using 18 decimal token",
    "request": "Send 100 CRV to 0xe688b84b23f322a994A53dbF8E15FA82CDB71127",
    "destination_address": "0xe688b84b23f322a994A53dbF8E15FA82CDB71127",
    "transfer_token": "CRV",
    "transfer_amount": "100"
}}

3. ERC-20 transfer using a non-18 decimal token (e.g. USDT)
{{
    "question_type": "ERC-20 transfer using a non-18 decimal token",
    "request": "give 100 tether to 0xea29e9da69317d80075fbfc836e843c6d65971f5",
    "destination_address": "0xea29e9da69317d80075fbfc836e843c6d65971f5",
    "transfer_token": "USDT",
    "transfer_amount": "100"
}}
"""

        # Safe default used whenever generation or validation fails
        fallback_data = {
            "question_type": question_type,
            "request": "Transfer 0.01 ETH to 0x0000000000000000000000000000000000000000",
            "destination_address": "0x0000000000000000000000000000000000000000",
            "transfer_token": "ETH",
            "transfer_amount": "0.01",
        }

        try:
            # Generate multiple candidate requests in a single call for efficiency
            response = client.chat.completions.create(
                model=self.config.question_generation_model,
                messages=[{"role": "user", "content": llm_prompt}],
                temperature=self.config.question_generation_temperature,
                max_tokens=self.config.question_generation_max_tokens,
                n=self.config.question_generation_n,
            )

            # Try each choice until we find a valid one
            for choice in response.choices:
                generated_content = choice.message.content.strip()

                # Extract JSON from the response using the generic helper
                prompt_data = self._extract_json_from_response(
                    generated_content, ["question_type", "request"], "prompt"
                )

                # Validate required fields
                if prompt_data and self._validate_prompt_data(
                    prompt_data, question_type
                ):
                    return prompt_data["request"], prompt_data

            # All choices failed; fall back to the default request
            return fallback_data["request"], fallback_data
        except Exception:
            return fallback_data["request"], fallback_data

    def _extract_json_from_response(
        self, response: str, required_keys: List[str], json_type: str = "JSON"
    ) -> Optional[Dict[str, Any]]:
        """Generic JSON extraction from an LLM response, handling <think> tags"""
        if not isinstance(response, str):
            return None

        # First, skip past any thinking block and keep only the content after it
        content_after_think = response
        think_end_match = re.search(r"</think>", response, re.IGNORECASE)
        if think_end_match:
            content_after_think = response[think_end_match.end() :].strip()

        # Build regex patterns based on the required keys
        if len(required_keys) >= 2:
            key1, key2 = required_keys[0], required_keys[1]
            json_patterns = [
                # Simple flat object containing the first two keys
                rf'\{{[^{{}}]*"{key1}"[^{{}}]*"{key2}"[^{{}}]*\}}',
                # More flexible pattern containing the first two keys
                rf'\{{.*?"{key1}".*?"{key2}".*?\}}',
                # Any JSON object as a last resort
                r"\{.*?\}",
            ]
        else:
            json_patterns = [r"\{.*?\}"]  # Fallback to any JSON object

        for pattern in json_patterns:
            matches = re.findall(pattern, content_after_think, re.DOTALL)
            for match in matches:
                try:
                    json_str = match.strip()
                    obj = json.loads(json_str)
                    # Verify the object has the expected structure
                    if isinstance(obj, dict) and required_keys[0] in obj:
                        return obj
                except json.JSONDecodeError:
                    continue

        return None

    def _extract_transaction_json(self, response: str) -> Optional[Dict[str, Any]]:
        """Extract a transaction JSON object from an LLM response"""
        return self._extract_json_from_response(
            response, ["to", "value"], "transaction"
        )
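
    # Illustrative behaviour of the extraction helpers above (hypothetical
    # input):
    #   _extract_transaction_json(
    #       '<think>plan the transfer...</think>'
    #       '{"to": "0xe688...1127", "value": "0", "data": "0x"}'
    #   )
    # skips everything up to and including </think>, then returns the parsed
    # dict {"to": "0xe688...1127", "value": "0", "data": "0x"}.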
    def _validate_prompt_data(
        self, prompt_data: Dict[str, Any], expected_question_type: str
    ) -> bool:
        """Validate that prompt data has all required fields and the correct question type"""
        required_fields = [
            "question_type",
            "request",
            "destination_address",
            "transfer_token",
            "transfer_amount",
        ]

        # Check that all required fields are present
        if not all(field in prompt_data for field in required_fields):
            return False

        # Check that the question type matches what we requested
        if prompt_data["question_type"] != expected_question_type:
            return False

        # Check that no field is empty
        for field in required_fields:
            if not prompt_data[field] or str(prompt_data[field]).strip() == "":
                return False

        return True

    async def collect_trajectories(
        self, item: Item
    ) -> Tuple[Optional[ScoredDataGroup], List[Item]]:
        """Collect trajectories by having the agent generate transactions"""
        to_score = []
        to_backlog = []

        system_msg = {
            "role": "system",
            "content": system_prompt,
        }
        user_msg = {"role": "user", "content": dict(item[0][0])["content"]}
        messages = [system_msg, user_msg]

        try:
            # Use the standard Atropos server pattern to sample a group of completions
            chat_completions = await self.server.chat_completion(
                messages=messages,
                n=self.config.group_size,
                max_tokens=2048,
            )

            # Store completions for output saving
            self.last_completions = []

            for i, choice in enumerate(chat_completions.choices):
                self.last_completions.append(choice.message.content)

                self.logger.debug(f"\n=== Generated Output {i + 1} ===")
                self.logger.debug(choice.message.content)
                self.logger.debug("=" * 50)

                history = [
                    {"role": "system", "content": system_msg["content"]},
                    {"role": "user", "content": user_msg["content"]},
                    {"role": "assistant", "content": choice.message.content},
                ]
                to_score.append((history, item[1], None))
        except Exception as e:
            print(f"Error in collect_trajectories: {e}")
            traceback.print_exc()
            to_backlog.append(item)

        if not to_score:
            return None, to_backlog

        scored_data = await self.score(to_score)
        return scored_data, to_backlog

    async def score(self, rollout_group_data) -> Optional[ScoredDataGroup]:
        """Score the generated transactions by executing them on Anvil"""
        if not rollout_group_data:
            return None

        scores = ScoredDataGroup()
        scores["tokens"] = []
        scores["masks"] = []
        scores["scores"] = []
        scores["advantages"] = None
        scores["ref_logprobs"] = None
        scores["messages"] = None
        scores["group_overrides"] = {"group_size": self.config.group_size}
        scores["overrides"] = None
        scores["ground_truths"] = []

        for i, item in enumerate(rollout_group_data):
            out = tokenize_for_trainer(self.tokenizer, item[0])
            tokens = out["tokens"]
            masks = out["masks"]

            try:
                # Extract the agent's response (transaction JSON)
                agent_response = item[0][-1]["content"].strip()
                ground_truth = item[1] if isinstance(item[1], str) else ""

                # Score the transaction
                score = await self._score_transaction(agent_response)

                self.logger.debug(f"\n=== Score {i + 1} ===")
                self.logger.debug(f"{score}")
                self.logger.debug("=" * 50)

                # Track performance for this question type
                if self.current_question_type:
                    self.question_performance[self.current_question_type].append(score)
                    # Keep only the last 10 scores per question type
                    if len(self.question_performance[self.current_question_type]) > 10:
                        self.question_performance[self.current_question_type].pop(0)
            except Exception as e:
                score = -1.0
                ground_truth = item[1] if isinstance(item[1], str) else ""
                print(f"\n=== Score {i + 1} ===")
                print(f"{score} (Error: {e})")
                print("=" * 50)

            # Skip rollouts with too few unmasked tokens
            if len([m for m in masks if m != -100]) < 10:
                continue

            scores["tokens"].append(tokens)
            scores["masks"].append(masks)
            scores["scores"].append(score)
            scores["ground_truths"].append(ground_truth)

            if len(scores["tokens"]) >= self.config.group_size:
                break

        if not scores["tokens"]:
            return None

        return scores
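
    # Score composition applied by the two methods below (maximum 1.0):
    #   +0.30         transaction executed successfully (status == "0x1")
    #   +0.50         exact expected balance changes observed on-chain
    #   -0.20..+0.10  <think> block quality (penalty when missing)
    #   +0.05         "to" field matches the expected destination
    #   +0.05         "data" field matches the transfer type
    # Malformed or failing submissions short-circuit to -1.0.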
    async def _score_transaction(self, agent_response: str) -> float:
        """Score a transaction based on multiple criteria"""
        try:
            # First, extract JSON from the response (handling thinking tags)
            tx_obj = self._extract_transaction_json(agent_response)

            if tx_obj is None:
                return -1.0  # Could not extract valid JSON

            # Validate required fields
            if not all(field in tx_obj for field in ["to", "value", "data"]):
                return -1.0  # Missing required fields

            # Get expected transfer details from the stored prompt data
            if not hasattr(self, "current_prompt_data") or not self.current_prompt_data:
                return -1.0

            expected_token = self.current_prompt_data.get("transfer_token", "ETH")
            expected_amount = self.current_prompt_data.get("transfer_amount", "0")
            expected_destination = self.current_prompt_data.get(
                "destination_address", ""
            )

            # Get sender and destination addresses
            sender_address = self.anvil_config.funding.custom_wallet
            destination_address = tx_obj.get("to", "")

            # Determine which token balances need to be checked
            relevant_tokens = ["ETH"]
            if expected_token != "ETH":
                relevant_tokens.append(expected_token)

            # Take a snapshot before execution
            snapshot_id = self.blockchain.snapshot()

            # Get pre-execution balances for both addresses
            pre_balances = {
                "sender": self.blockchain.get_wallet_balances(
                    sender_address, relevant_tokens
                ),
                "destination": self.blockchain.get_wallet_balances(
                    destination_address, relevant_tokens
                ),
            }

            try:
                # Execute the transaction
                result = self.blockchain.execute_transaction(tx_obj)

                # Get post-execution balances
                post_balances = {
                    "sender": self.blockchain.get_wallet_balances(
                        sender_address, relevant_tokens
                    ),
                    "destination": self.blockchain.get_wallet_balances(
                        destination_address, relevant_tokens
                    ),
                }

                # Calculate the score from the execution result and balance changes
                score = self._calculate_transaction_score(
                    tx_obj,
                    result,
                    agent_response,
                    pre_balances,
                    post_balances,
                    expected_token,
                    expected_amount,
                    expected_destination,
                )

                # Revert to the snapshot to maintain a clean state
                self.blockchain.revert(snapshot_id)
                return score
            except Exception:
                # Revert on any error
                self.blockchain.revert(snapshot_id)
                return -1.0  # Execution error
        except Exception:
            return -1.0  # General error

    def _calculate_transaction_score(
        self,
        tx_obj: Dict[str, Any],
        result: Dict[str, Any],
        agent_response: str,
        pre_balances: Dict[str, Dict[str, Any]],
        post_balances: Dict[str, Dict[str, Any]],
        transfer_token: str,
        transfer_amount: str,
        destination_address: str,
    ) -> float:
        """Calculate a score from transaction execution results and balance changes"""
        base_score = 0.0

        # 1. Successful execution (0.3 points)
        if result.get("status") == "0x1":
            base_score += 0.3  # Transaction succeeded

        # 2. Correct transaction - exact balance verification (0.5 points)
        balance_score = self._verify_expected_transfers(
            pre_balances,
            post_balances,
            transfer_token,
            transfer_amount,
            destination_address,
        )
        base_score += balance_score

        # 3. Thinking quality (max 0.1 points, negative for missing thinking)
        thinking_score = self._analyze_thinking_quality(agent_response)
        base_score += thinking_score  # Range: -0.2 to +0.1

        # 4. "to" field verification (0.05 points)
        tx_to = tx_obj.get("to", "").lower()
        expected_to = destination_address.lower()
        if tx_to == expected_to:
            base_score += 0.05

        # 5. "data" field verification (0.05 points)
        data_field = tx_obj.get("data", "0x")
        if transfer_token == "ETH":
            # An ETH transfer should have an empty data field
            if data_field == "0x":
                base_score += 0.05
        else:
            # An ERC-20 transfer should call transfer(address,uint256)
            if data_field.startswith("0xa9059cbb"):  # transfer function selector
                base_score += 0.05

        return base_score
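
    # Worked example of the exact-match check below (hypothetical balances):
    # for a requested transfer of 100 CRV,
    #   sender:      1000.0 -> 900.0   (change = -100.0)
    #   destination:   50.0 -> 150.0   (change = +100.0)
    # satisfies |change + 100| < 1e-6 on the sender side and
    # |change - 100| < 1e-6 on the destination side, earning the 0.5 points.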
    def _verify_expected_transfers(
        self,
        pre_balances: Dict[str, Dict[str, Any]],
        post_balances: Dict[str, Dict[str, Any]],
        expected_token: str,
        expected_amount: str,
        expected_destination: str,
    ) -> float:
        """Verify that the expected transfer amounts actually occurred"""
        try:
            expected_amount_float = float(expected_amount)

            # For ETH transfers, only check the destination address
            if expected_token == "ETH":
                # Extract balance values from the nested dictionary structure
                dest_pre_balance = (
                    pre_balances["destination"].get("ETH", {}).get("balance", 0)
                )
                dest_post_balance = (
                    post_balances["destination"].get("ETH", {}).get("balance", 0)
                )
                dest_eth_change = float(dest_post_balance) - float(dest_pre_balance)

                # Check whether the destination gained exactly the expected amount
                if abs(dest_eth_change - expected_amount_float) < 1e-10:
                    return 0.5

            # For ERC-20 transfers, check both sender and destination
            else:
                # Extract balance values from the nested dictionary structure
                sender_pre_balance = (
                    pre_balances["sender"].get(expected_token, {}).get("balance", 0)
                )
                sender_post_balance = (
                    post_balances["sender"].get(expected_token, {}).get("balance", 0)
                )
                dest_pre_balance = (
                    pre_balances["destination"]
                    .get(expected_token, {})
                    .get("balance", 0)
                )
                dest_post_balance = (
                    post_balances["destination"]
                    .get(expected_token, {})
                    .get("balance", 0)
                )

                sender_token_change = float(sender_post_balance) - float(
                    sender_pre_balance
                )
                dest_token_change = float(dest_post_balance) - float(dest_pre_balance)

                # For ERC-20, expect exact amounts (gas is not paid in the token)
                if (
                    abs(sender_token_change + expected_amount_float) < 1e-6
                    and abs(dest_token_change - expected_amount_float) < 1e-6
                ):
                    return 0.5

            return 0.0  # Transfer amounts don't match expectations
        except (ValueError, TypeError):
            return 0.0

    def _analyze_thinking_quality(self, response: str) -> float:
        """Evaluate <think> tag quality: max 0.1 points, negative when thinking is missing"""
        thinking_score = 0.0

        # Check for thinking tags
        if "<think>" not in response or "</think>" not in response:
            return -0.2  # Penalty for missing thinking tags

        # Extract the thinking content
        try:
            thinking_match = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
            if not thinking_match:
                return -0.2  # No thinking content found

            thinking_content = thinking_match.group(1).strip()
            if not thinking_content:
                return -0.1  # Empty thinking tags

            # Basic quality assessment for the positive score (max 0.1)
            word_count = len(thinking_content.split())

            # Award points based on thinking depth
            if word_count >= 50:  # Substantial thinking
                thinking_score += 0.1
            elif word_count >= 20:  # Moderate thinking
                thinking_score += 0.05
            elif word_count >= 5:  # Minimal thinking
                thinking_score += 0.02

            return thinking_score
        except Exception:
            return -0.1  # Error while processing thinking
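
    # Illustrative outcomes of _analyze_thinking_quality (hypothetical
    # responses):
    #   no <think>...</think> block at all   -> -0.2
    #   "<think></think>{...}"               -> -0.1 (empty thinking)
    #   1-4 words of reasoning               ->  0.0
    #   5-19 words of reasoning              -> +0.02
    #   20-49 words of reasoning             -> +0.05
    #   50+ words of reasoning               -> +0.10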
    async def evaluate(self, *args, **kwargs):
        """Evaluation method - could implement portfolio performance tracking"""
        return

    def close(self):
        """Clean up resources"""
        cleanup_blockchain(self.blockchain)

    @classmethod
    def config_init(cls) -> Tuple[EVMEnvConfig, List[APIServerConfig]]:
        """Initialize configuration for the EVM environment"""
        # pydantic-settings automatically loads values from the YAML file
        env_config = EVMEnvConfig(
            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
            group_size=4,
            use_wandb=True,
            rollout_server_url="http://localhost:8000",
            total_steps=500,
            batch_size=16,
            steps_per_eval=50,
            max_token_length=2048,
            wandb_name="evm-agent",
            anvil_config_path="configs/token_transfers.yaml",
        )

        # API server configuration
        server_configs = [
            APIServerConfig(
                model_name="gpt-4o-mini",
                base_url=None,  # Use OpenAI directly
                api_key=os.environ.get("OPENAI_API_KEY"),
                num_requests_for_eval=64,
            ),
        ]

        return env_config, server_configs


if __name__ == "__main__":
    EVMEnv.cli()
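
# Minimal usage sketch (assumed entry points: atroposlib's BaseEnv.cli
# typically exposes subcommands such as `serve` and `process`; check your
# installed version before relying on these):
#
#   OPENAI_API_KEY=... python evm_env.py process
#
# or, constructing the environment programmatically with this file's API:
#
#   env_config, server_configs = EVMEnv.config_init()
#   env = EVMEnv(env_config, server_configs, slurm=False, testing=True)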