diff --git a/environments/smolagents_integration/README.md b/environments/smolagents_integration/README.md index e8f4d33c..0c935438 100644 --- a/environments/smolagents_integration/README.md +++ b/environments/smolagents_integration/README.md @@ -78,7 +78,8 @@ python -m environments.smolagents_integration.smolagents_env process \ --env.max_concurrent_processes 8 \ --env.use_chat_completion true \ --openai.model_name "gpt-4o" \ - --openai.base_url "https://api.openai.com/v1" + --openai.base_url "https://api.openai.com/v1" \ + --openai.api_key x ``` ```bash @@ -91,7 +92,8 @@ python -m environments.smolagents_integration.smolagents_env process \ --env.max_concurrent_processes 8 \ --env.use_chat_completion true \ --openai.model_name "gpt-4o" \ - --openai.base_url "https://api.openai.com/v1" + --openai.base_url "https://api.openai.com/v1" \ + --openai.api_key x ``` Note: The command syntax uses dots (`.`) to separate namespaces. Also, the OpenAI API key should be set in your environment variables as `OPENAI_API_KEY` or in a `.env` file in the project root. @@ -118,8 +120,8 @@ python -m environments.smolagents_integration.smolagents_env serve \ --env.use_chat_completion true \ --env.max_concurrent_processes 5 \ --env.group_size 8 \ - --openai.model_name "gpt-4o" \ - --openai.base_url "https://api.openai.com/v1" + --openai.model_name "your-model-name" \ + --openai.base_url "http://localhost:8000/v1" ``` ## How It Works @@ -204,4 +206,4 @@ Each task includes: - **Web tool errors**: If Tavily tools aren't working, make sure you have set the `TAVILY_API_KEY` environment variable and have installed the `tavily-python` package. - **Tool import errors**: If you see errors about missing tool modules, ensure your working directory allows proper imports of the tools folder. - **Permission errors with file tools**: Ensure your process has the correct permissions to read/write files in the directories being accessed. -- **Memory issues**: If you encounter memory usage problems, try lowering the `max_concurrent_processes` parameter. \ No newline at end of file +- **Memory issues**: If you encounter memory usage problems, try lowering the `max_concurrent_processes` parameter. diff --git a/environments/smolagents_integration/agent_process_runner.py b/environments/smolagents_integration/agent_process_runner.py index 116defa3..933d01c1 100644 --- a/environments/smolagents_integration/agent_process_runner.py +++ b/environments/smolagents_integration/agent_process_runner.py @@ -12,22 +12,19 @@ from typing import Any, Dict from smolagents import CodeAgent # Import tools directly -from environments.smolagents_integration.server_proxy import ServerProxy -from environments.smolagents_integration.smolagents_model import ProcessSafeAtroposServerModel -from environments.smolagents_integration.tools.file_tools import ( - append_to_file, read_file, write_file -) +from .server_proxy import ServerProxy +from .smolagents_model import ProcessSafeAtroposServerModel +from .tools.file_tools import append_to_file, read_file, write_file # Conditionally import Tavily tools if API key is available tavily_tools = [] if os.environ.get("TAVILY_API_KEY"): try: - from environments.smolagents_integration.tools.tavily_tools import ( - TavilyExtractTool, TavilySearchTool - ) + from .tools.tavily_tools import TavilyExtractTool, TavilySearchTool + tavily_tools = [ TavilySearchTool(api_key=os.environ.get("TAVILY_API_KEY")), - TavilyExtractTool(api_key=os.environ.get("TAVILY_API_KEY")) + TavilyExtractTool(api_key=os.environ.get("TAVILY_API_KEY")), ] except ImportError: pass @@ -64,7 +61,9 @@ def run_agent_process( try: start_time = time.time() process_id = os.getpid() - logger.info(f"Process {process_id} starting for task {task_metadata.get('task_id', 'unknown')}") + logger.info( + f"Process {process_id} starting for task {task_metadata.get('task_id', 'unknown')}" + ) # Create a model using the server proxy model = ProcessSafeAtroposServerModel( @@ -85,14 +84,18 @@ def run_agent_process( verbosity_level=agent_config.get("verbosity", 2), ) - logger.info(f"Process {process_id}: Running agent on prompt with {len(prompt)} chars") + logger.info( + f"Process {process_id}: Running agent on prompt with {len(prompt)} chars" + ) # Run the agent and get response agent_response = agent.run(prompt) - + # Ensure the response is properly formatted (convert sets, etc. to strings) if not isinstance(agent_response, str): - logger.info(f"Converting non-string response of type {type(agent_response)} to string") + logger.info( + f"Converting non-string response of type {type(agent_response)} to string" + ) try: if isinstance(agent_response, set): # Convert sets to comma-separated strings @@ -103,7 +106,7 @@ def run_agent_process( except Exception as e: logger.error(f"Failed to convert agent_response to string: {e}") agent_response = str(agent_response) - + # Extract agent memory agent_memory = getattr(agent, "memory", None) if hasattr(agent, "write_memory_to_messages"): @@ -113,14 +116,16 @@ def run_agent_process( execution_time = time.time() - start_time # Prepare and send result - result_queue.put({ - "status": "success", - "response": agent_response, - "task_id": task_metadata.get("task_id"), - "execution_time": execution_time, - "agent_memory": agent_memory, - "task_metadata": task_metadata, - }) + result_queue.put( + { + "status": "success", + "response": agent_response, + "task_id": task_metadata.get("task_id"), + "execution_time": execution_time, + "agent_memory": agent_memory, + "task_metadata": task_metadata, + } + ) logger.info(f"Process {process_id}: Agent completed in {execution_time:.2f}s") @@ -128,14 +133,16 @@ def run_agent_process( # Log the exception and put error result in queue logger.error(f"Process {os.getpid()}: Error in agent execution: {e}") logger.error(traceback.format_exc()) - - result_queue.put({ - "status": "error", - "error_message": str(e), - "error_traceback": traceback.format_exc(), - "task_id": task_metadata.get("task_id"), - "task_metadata": task_metadata, - }) + + result_queue.put( + { + "status": "error", + "error_message": str(e), + "error_traceback": traceback.format_exc(), + "task_id": task_metadata.get("task_id"), + "task_metadata": task_metadata, + } + ) finally: logger.info(f"Process {os.getpid()}: Cleanup complete") diff --git a/environments/smolagents_integration/smolagents_env.py b/environments/smolagents_integration/smolagents_env.py index 3f443c2f..5bbf9988 100644 --- a/environments/smolagents_integration/smolagents_env.py +++ b/environments/smolagents_integration/smolagents_env.py @@ -3,35 +3,21 @@ SmolagentsEnv - Environment for creating high-quality agent trajectories for training language models using the SmolaGents agent framework. """ -import asyncio import json import logging import multiprocessing -import os -import time from dataclasses import dataclass from datetime import datetime -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np -from pydantic import BaseModel, Field -from smolagents import CodeAgent, Tool -from smolagents.tools import tool # Import the tool decorator +from pydantic import Field -import wandb from atroposlib.envs.base import BaseEnv, BaseEnvConfig, ScoredDataGroup -from atroposlib.envs.server_handling.openai_server import OpenAIServer from atroposlib.envs.server_handling.server_baseline import APIServerConfig -from atroposlib.envs.server_handling.server_manager import ServerManager from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer -from environments.smolagents_integration.agent_process_runner import run_agent_process -from environments.smolagents_integration.server_proxy import ServerProxyManager -from environments.smolagents_integration.tools.file_tools import ( - append_to_file, - read_file, - write_file, -) + +from .agent_process_runner import run_agent_process +from .server_proxy import ServerProxyManager @dataclass @@ -73,7 +59,6 @@ class SmolagentsEnvConfig(BaseEnvConfig): default="combined", description="Scoring strategy: basic, correctness, or combined", ) - # Removed max_concurrent_agents as we only use process-based execution length_penalty_weight: float = Field( default=0.1, description="Weight for length penalty in scoring (0.0 to disable)" ) @@ -240,8 +225,6 @@ class SmolagentsEnv(BaseEnv): logger.info("SmolagentsEnv setup complete") - # _create_tools method removed - tools are created directly in agent_process_runner.py - async def get_next_item(self) -> Item: """Get the next item from the GAIA dataset.""" if not self.examples: @@ -414,9 +397,9 @@ class SmolagentsEnv(BaseEnv): ) else: # Just log the error and omit this example from training - error_message = result.get('error_message', 'Unknown error') - task_id = result.get('task_id', 'Unknown task') - + error_message = result.get("error_message", "Unknown error") + task_id = result.get("task_id", "Unknown task") + logger.warning( f"Omitting failed task {task_id} from training batch: {error_message}" ) @@ -493,7 +476,7 @@ class SmolagentsEnv(BaseEnv): """ Score the agent trajectory based on multiple criteria: - Answer correctness using GAIA scoring - - Message format adherence + - Message format adherence - Final answer tool usage - Execution success (detection of errors) - Efficiency (steps only) @@ -504,30 +487,35 @@ class SmolagentsEnv(BaseEnv): true_answer: The ground truth answer agent_memory: The memory trace of the agent's steps execution_time: Time taken for execution (not used in scoring) - + Returns: float: A score between 0.0 and 1.0 """ # Import all scoring functions upfront - from environments.smolagents_integration.evaluations.smolagent_integrations.rubrics.gaia_scorer import ( - question_scorer, check_close_call + from .evaluations.smolagent_integrations.rubrics.gaia_scorer import ( + check_close_call, + question_scorer, ) - from environments.smolagents_integration.evaluations.smolagent_integrations.smolagents_scorer import ( - check_format_adherence, check_final_answer_usage, - calculate_execution_score, calculate_efficiency_score + from .evaluations.smolagent_integrations.smolagents_scorer import ( + calculate_efficiency_score, + calculate_execution_score, + check_final_answer_usage, + check_format_adherence, ) - + # Initialize component scores correctness_score = 0.0 format_score = 0.0 final_answer_score = 0.0 execution_score = 0.0 efficiency_score = 0.0 - + # 1. Calculate correctness score using GAIA scorer # Ensure agent_response is a string before scoring if not isinstance(agent_response, str): - logger.warning(f"agent_response is not a string, it's a {type(agent_response)}: {agent_response}") + logger.warning( + f"agent_response is not a string, it's a {type(agent_response)}: {agent_response}" + ) try: if isinstance(agent_response, set): # Convert sets to comma-separated strings @@ -538,15 +526,15 @@ class SmolagentsEnv(BaseEnv): except Exception as e: logger.error(f"Failed to convert agent_response to string: {e}") agent_response = "" - + is_correct = question_scorer(agent_response, true_answer) is_near_correct = check_close_call(agent_response, true_answer, is_correct) - + if is_correct: correctness_score = 1.0 elif is_near_correct: correctness_score = 0.5 - + # 2. Check format adherence if agent_memory: format_scores = [] @@ -555,10 +543,12 @@ class SmolagentsEnv(BaseEnv): format_scores.append(check_format_adherence(step["content"])) elif "model_output" in step and isinstance(step["model_output"], str): format_scores.append(check_format_adherence(step["model_output"])) - + # Average the format scores across all steps - format_score = sum(format_scores) / len(format_scores) if format_scores else 0.0 - + format_score = ( + sum(format_scores) / len(format_scores) if format_scores else 0.0 + ) + # 3. Check for final_answer tool usage final_answer_used = False for step in agent_memory: @@ -570,41 +560,44 @@ class SmolagentsEnv(BaseEnv): if check_final_answer_usage(step["model_output"]): final_answer_used = True break - + final_answer_score = 1.0 if final_answer_used else 0.0 - + # 4. Check for execution errors and calculate execution score execution_score = calculate_execution_score(agent_memory) - + # 5. Calculate efficiency score steps_count = len(agent_memory) efficiency_score = calculate_efficiency_score( - steps_count=steps_count, - max_steps=self.max_steps + steps_count=steps_count, max_steps=self.max_steps ) - + # Component weights - can be adjusted to emphasize different aspects correctness_weight = 0.50 # 50% of score - correctness matters most - format_weight = 0.20 # 20% - format adherence is important + format_weight = 0.20 # 20% - format adherence is important final_answer_weight = 0.10 # 10% - using final_answer tool properly - execution_weight = 0.10 # 10% - avoiding errors - efficiency_weight = 0.10 # 10% - being efficient - + execution_weight = 0.10 # 10% - avoiding errors + efficiency_weight = 0.10 # 10% - being efficient + # Calculate combined score with weights combined_score = ( - correctness_score * correctness_weight + - format_score * format_weight + - final_answer_score * final_answer_weight + - execution_score * execution_weight + - efficiency_score * efficiency_weight + correctness_score * correctness_weight + + format_score * format_weight + + final_answer_score * final_answer_weight + + execution_score * execution_weight + + efficiency_score * efficiency_weight ) - + # Apply length penalty if configured length_penalty = 0.0 if self.config.length_penalty_weight > 0: # agent_response should already be a string from the previous conversion # but double-check just to be sure - response_to_measure = agent_response if isinstance(agent_response, str) else str(agent_response) + response_to_measure = ( + agent_response + if isinstance(agent_response, str) + else str(agent_response) + ) response_length = len(response_to_measure) # Penalize very long responses if response_length > 2000: @@ -613,57 +606,75 @@ class SmolagentsEnv(BaseEnv): self.config.length_penalty_weight * (response_length - 2000) / 1000, ) combined_score = max(0.0, combined_score - length_penalty) - + # Debug logging for score calculation if self.debug_scoring: logger.info("=== SCORE CALCULATION (detailed) ===") - logger.info(f"1. Correctness component:") + logger.info("1. Correctness component:") logger.info(f" - True answer: '{true_answer}'") logger.info(f" - Agent answer: '{agent_response}'") logger.info(f" - Is correct: {is_correct}") logger.info(f" - Is near correct: {is_near_correct}") logger.info(f" - Raw score: {correctness_score:.3f}") logger.info(f" - Weight: {correctness_weight}") - logger.info(f" - Weighted score: {correctness_score * correctness_weight:.3f}") - - logger.info(f"2. Format adherence component:") - logger.info(f" - Format scores: {format_scores if 'format_scores' in locals() else []}") + logger.info( + f" - Weighted score: {correctness_score * correctness_weight:.3f}" + ) + + logger.info("2. Format adherence component:") + logger.info( + f" - Format scores: {format_scores if 'format_scores' in locals() else []}" + ) logger.info(f" - Raw score: {format_score:.3f}") logger.info(f" - Weight: {format_weight}") logger.info(f" - Weighted score: {format_score * format_weight:.3f}") - - logger.info(f"3. Final answer tool usage:") - logger.info(f" - Final answer tool used: {final_answer_used if 'final_answer_used' in locals() else False}") + + logger.info("3. Final answer tool usage:") + logger.info( + f" - Final answer tool used: {final_answer_used if 'final_answer_used' in locals() else False}" + ) logger.info(f" - Raw score: {final_answer_score:.3f}") logger.info(f" - Weight: {final_answer_weight}") - logger.info(f" - Weighted score: {final_answer_score * final_answer_weight:.3f}") - - logger.info(f"4. Execution component:") + logger.info( + f" - Weighted score: {final_answer_score * final_answer_weight:.3f}" + ) + + logger.info("4. Execution component:") logger.info(f" - Steps count: {len(agent_memory) if agent_memory else 0}") logger.info(f" - Raw score: {execution_score:.3f}") logger.info(f" - Weight: {execution_weight}") - logger.info(f" - Weighted score: {execution_score * execution_weight:.3f}") - - logger.info(f"5. Efficiency component:") + logger.info( + f" - Weighted score: {execution_score * execution_weight:.3f}" + ) + + logger.info("5. Efficiency component:") logger.info(f" - Steps count: {len(agent_memory) if agent_memory else 0}") logger.info(f" - Max steps: {self.max_steps}") logger.info(f" - Raw score: {efficiency_score:.3f}") logger.info(f" - Weight: {efficiency_weight}") - logger.info(f" - Weighted score: {efficiency_score * efficiency_weight:.3f}") - - logger.info(f"6. Length penalty:") - logger.info(f" - Response length: {len(agent_response) if isinstance(agent_response, str) else 'N/A'}") + logger.info( + f" - Weighted score: {efficiency_score * efficiency_weight:.3f}" + ) + + logger.info("6. Length penalty:") + logger.info( + f" - Response length: {len(agent_response) if isinstance(agent_response, str) else 'N/A'}" + ) logger.info(f" - Penalty: {length_penalty:.3f}") - - logger.info(f"7. Final score calculation:") - logger.info(f" - Correctness: {correctness_score * correctness_weight:.3f}") + + logger.info("7. Final score calculation:") + logger.info( + f" - Correctness: {correctness_score * correctness_weight:.3f}" + ) logger.info(f" - Format adherence: {format_score * format_weight:.3f}") - logger.info(f" - Final answer tool: {final_answer_score * final_answer_weight:.3f}") + logger.info( + f" - Final answer tool: {final_answer_score * final_answer_weight:.3f}" + ) logger.info(f" - Execution: {execution_score * execution_weight:.3f}") logger.info(f" - Efficiency: {efficiency_score * efficiency_weight:.3f}") logger.info(f" - Length penalty: -{length_penalty:.3f}") logger.info(f" - FINAL SCORE: {combined_score:.3f}") - + return combined_score def _create_scored_data_group( @@ -704,76 +715,82 @@ class SmolagentsEnv(BaseEnv): # Create a comprehensive markdown document for HTML compatibility # Format similar to the Wikipedia environment complete_conversation = [] - + # Add task information at the top task_type = item.metadata.get("task", "Unknown task") task_id = item.metadata.get("task_id", "Unknown ID") complete_conversation.append(f"# GAIA Task: {task_type} (ID: {task_id})") - + # Process each message in the conversation for i, msg in enumerate(messages): role = msg.get("role", "unknown") content = msg.get("content", "") - + # Skip empty messages if not content: continue - + # Add a header for each role complete_conversation.append(f"## {role.upper()}") - + # Handle different content formats if isinstance(content, list): content_text = [] for item in content: - if isinstance(item, dict) and 'text' in item: - content_text.append(item['text']) + if isinstance(item, dict) and "text" in item: + content_text.append(item["text"]) else: content_text.append(str(item)) - content = '\n'.join(content_text) + content = "\n".join(content_text) elif not isinstance(content, str): # Handle non-string content content = str(content) - + # Add the message content complete_conversation.append(content) - + # If this message has agent memory, show it if role == "assistant" and i > 1 and self.config.save_full_traces: # Agent memory with thinking is often present in assistant messages if "" in content: complete_conversation.append("### Thinking Process") # The thinking is already in the content - + # Add tool usage information if present if "tool_usage" in msg: - tool_name = msg.get("tool_usage", {}).get("name", "unknown tool") + tool_name = msg.get("tool_usage", {}).get( + "name", "unknown tool" + ) complete_conversation.append(f"### 🛠️ Tool Used: {tool_name}") tool_args = msg.get("tool_usage", {}).get("args", {}) if tool_args: complete_conversation.append("```json") - complete_conversation.append(json.dumps(tool_args, indent=2)) + complete_conversation.append( + json.dumps(tool_args, indent=2) + ) complete_conversation.append("```") - + tool_result = msg.get("tool_usage", {}).get("result", None) if tool_result: complete_conversation.append("### Tool Result") complete_conversation.append("```") complete_conversation.append(str(tool_result)) complete_conversation.append("```") - + # Add score information at the end complete_conversation.append(f"\n## Score: {scored_data['score']:.4f}") - + # Join everything into a single string with double newlines between sections full_conversation_markdown = "\n\n".join(complete_conversation) - + # Create the ScoredDataGroup with a single comprehensive markdown document scored_group = ScoredDataGroup( tokens=[self.tokenizer.encode(json.dumps(messages))], masks=[[1] * len(self.tokenizer.encode(json.dumps(messages)))], scores=[scored_data["score"]], - messages=[full_conversation_markdown], # Use single markdown document for HTML + messages=[ + full_conversation_markdown + ], # Use single markdown document for HTML _original_messages=[messages], # Keep original for trainer API ) @@ -787,16 +804,16 @@ class SmolagentsEnv(BaseEnv): {"role": "user", "content": item.prompt}, {"role": "assistant", "content": scored_data["response"]}, ] - + # Use the standard tokenize_for_trainer utility - + # Tokenize using the standard utility (only trains on assistant messages) tokenized = tokenize_for_trainer( - self.tokenizer, + self.tokenizer, messages, - train_on_all_assistant_turns=True # Train on all assistant turns if present + train_on_all_assistant_turns=True, # Train on all assistant turns if present ) - + # Create the ScoredDataGroup scored_group = ScoredDataGroup( tokens=[tokenized["tokens"]], @@ -807,7 +824,6 @@ class SmolagentsEnv(BaseEnv): return scored_group - async def evaluate(self, **kwargs): """ Evaluate the agent on a subset of the GAIA benchmark. diff --git a/environments/smolagents_integration/smolagents_model.py b/environments/smolagents_integration/smolagents_model.py index b1407485..5cf7815c 100644 --- a/environments/smolagents_integration/smolagents_model.py +++ b/environments/smolagents_integration/smolagents_model.py @@ -4,11 +4,10 @@ Process-safe implementation of the AtroposServerModel for SmolaGents. import logging import traceback -from typing import Any, Dict, List, Optional from smolagents.models import ChatMessage, MessageRole, Model -from environments.smolagents_integration.server_proxy import ServerProxy +from .server_proxy import ServerProxy # Configure logger for the model class logger = logging.getLogger(__name__) @@ -45,7 +44,8 @@ class ProcessSafeAtroposServerModel(Model): # Log the configuration logger.info( - f"Initializing ProcessSafeAtroposServerModel with model_id={model_id}, use_chat_completion={self.use_chat_completion}" + f"Initializing ProcessSafeAtroposServerModel with model_id={model_id}, " + f"use_chat_completion={self.use_chat_completion}" ) super().__init__(model_id=model_id, **kwargs) @@ -117,7 +117,12 @@ class ProcessSafeAtroposServerModel(Model): openai_role = "user" elif role_str == "assistant": openai_role = "assistant" - elif role_str in ("tool_call", "tool_response", "function_call", "function_response"): + elif role_str in ( + "tool_call", + "tool_response", + "function_call", + "function_response", + ): # Silently map tool and function calls/responses to user roles openai_role = "user" else: @@ -172,8 +177,8 @@ class ProcessSafeAtroposServerModel(Model): messages=messages, stop_sequences=stop_sequences, **kwargs ) - # Extract timeout from kwargs or use default - timeout = kwargs.pop("timeout", 120) # Default 2 minutes + # Extract timeout from kwargs or use default (but not used in this method) + kwargs.pop("timeout", 120) # Default 2 minutes try: # Use chat_completion if configured