diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py b/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py index 9c37f23a..54941eca 100644 --- a/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py +++ b/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py @@ -1,3 +1,3 @@ """ SmolaGents evaluation utilities for Atropos integrations. -""" \ No newline at end of file +""" diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py index fa5093f2..91b053ea 100644 --- a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py +++ b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py @@ -1,3 +1,3 @@ """ Scoring rubrics for SmolaGents integrations. -""" \ No newline at end of file +""" diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py index 97b8dc69..06a29c69 100644 --- a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py +++ b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py @@ -56,7 +56,10 @@ def question_scorer( try: model_answer = str(model_answer) except Exception as e: - warnings.warn(f"Failed to convert model_answer to string: {e}. Type: {type(model_answer)}", UserWarning) + warnings.warn( + f"Failed to convert model_answer to string: {e}. Type: {type(model_answer)}", + UserWarning, + ) return False # if gt is a number @@ -73,7 +76,9 @@ def question_scorer( # check length is the same if len(gt_elems) != len(ma_elems): - warnings.warn("Answer lists have different lengths, returning False.", UserWarning) + warnings.warn( + "Answer lists have different lengths, returning False.", UserWarning + ) return False # compare each element as float or str @@ -85,7 +90,8 @@ def question_scorer( else: # we do not remove punct since comparisons can include punct comparisons.append( - normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False) + normalize_str(ma_elem, remove_punct=False) + == normalize_str(gt_elem, remove_punct=False) ) return all(comparisons) @@ -116,8 +122,12 @@ def check_close_call(prediction, true_answer, is_correct): return is_correct else: if ( - check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer)) - and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2 + check_prediction_contains_answer_letters_in_order( + str(prediction), str(true_answer) + ) + and len(str(true_answer)) * 0.5 + <= len(str(prediction)) + <= len(str(true_answer)) * 2 ): # Remove print statement that causes duplicated output return True @@ -142,9 +152,12 @@ def normalize_str(input_str, remove_punct=True) -> str: try: input_str = str(input_str) except Exception as e: - warnings.warn(f"Failed to convert input to string: {e}. Type: {type(input_str)}", UserWarning) + warnings.warn( + f"Failed to convert input to string: {e}. Type: {type(input_str)}", + UserWarning, + ) return "" - + # Remove all white spaces. Required e.g for seagull vs. sea gull no_spaces = re.sub(r"\s", "", input_str) diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py b/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py index 43bed3fc..3438d5b0 100644 --- a/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py +++ b/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py @@ -7,32 +7,33 @@ execution error detection, and efficiency metrics. """ import re -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional + import numpy as np def check_format_adherence(memory_content: str) -> float: """ Check if memory content follows the required CodeAgent format. - + The expected format includes: - "Thought:" section with reasoning - "Code:" section with a Python code block - Code blocks with triple backticks and "" marker - + Args: memory_content: The content of a memory step to check - + Returns: float: Score between 0.0 and 1.0 indicating format compliance """ thought_pattern = r"Thought: .+" code_pattern = r"Code:\s*```py\s*[\s\S]*?```" - + # Check if both patterns exist in the content has_thought = bool(re.search(thought_pattern, memory_content)) has_code = bool(re.search(code_pattern, memory_content)) - + if has_thought and has_code: return 1.0 elif has_thought or has_code: @@ -44,10 +45,10 @@ def check_format_adherence(memory_content: str) -> float: def check_final_answer_usage(memory_content: str) -> bool: """ Check if the final_answer tool was used appropriately. - + Args: memory_content: The content of a memory step to check - + Returns: bool: True if final_answer tool was used, False otherwise """ @@ -58,101 +59,108 @@ def check_final_answer_usage(memory_content: str) -> bool: def extract_execution_errors(agent_memory: List[Dict]) -> List[Dict]: """ Extract execution errors from agent memory. - + Looks for error patterns in the observations field of each memory step. - + Args: agent_memory: List of memory steps from agent execution - + Returns: List[Dict]: List of errors with step number and error message """ execution_errors = [] - + if not agent_memory: return execution_errors - + for step in agent_memory: # In SmolaGents ActionStep, observations field contains execution output - if isinstance(step, dict) and "observations" in step and isinstance(step["observations"], str): + if ( + isinstance(step, dict) + and "observations" in step + and isinstance(step["observations"], str) + ): observation = step["observations"] - + # Look for error patterns error_patterns = [ - r"Error: .*", + r"Error: .*", r"Exception: .*", r"Traceback \(most recent call last\).*", r".*Error: .*", - r".*Exception: .*" + r".*Exception: .*", ] - + for pattern in error_patterns: matches = re.findall(pattern, observation, re.DOTALL) if matches: # Record step number and error - execution_errors.append({ - "step": step.get("step_number", 0), - "error": matches[0] - }) - + execution_errors.append( + {"step": step.get("step_number", 0), "error": matches[0]} + ) + return execution_errors def calculate_efficiency_score( - steps_count: int, + steps_count: int, max_steps: int, execution_time: float = None, # Parameter kept for backward compatibility but not used - execution_times_history: Optional[List[float]] = None # Parameter kept for backward compatibility but not used + execution_times_history: Optional[ + List[float] + ] = None, # Parameter kept for backward compatibility but not used ) -> float: """ Calculate efficiency score based on steps used only. Execution time is no longer considered in the score calculation. - + Args: steps_count: Number of steps taken by the agent max_steps: Maximum allowed steps execution_time: Not used, kept for backward compatibility execution_times_history: Not used, kept for backward compatibility - + Returns: float: Efficiency score between 0.0 and 1.0 """ # Start with full efficiency score efficiency_score = 1.0 - + # Penalty for excessive steps (above 75% of max) step_penalty = 1.0 if steps_count > (max_steps * 0.75): - step_penalty = max(0.5, 1.0 - ((steps_count - max_steps * 0.75) / (max_steps * 0.25))) + step_penalty = max( + 0.5, 1.0 - ((steps_count - max_steps * 0.75) / (max_steps * 0.25)) + ) efficiency_score *= step_penalty - + # Note: Execution time penalty has been removed - + return efficiency_score def calculate_execution_score(agent_memory: List[Dict]) -> float: """ Calculate execution success score by detecting errors in agent memory. - + Args: agent_memory: List of memory steps from agent execution - + Returns: float: Execution score between 0.0 and 1.0 """ execution_errors = extract_execution_errors(agent_memory) - + if not agent_memory: return 0.0 - + total_steps = len(agent_memory) error_steps = len(execution_errors) - + if total_steps > 0: # Penalize proportionally to the number of steps with errors execution_score = max(0, 1.0 - (error_steps / total_steps)) else: execution_score = 0.0 - - return execution_score \ No newline at end of file + + return execution_score diff --git a/environments/smolagents_integration/smolagents_env.py b/environments/smolagents_integration/smolagents_env.py index 6243e815..3851b1ff 100644 --- a/environments/smolagents_integration/smolagents_env.py +++ b/environments/smolagents_integration/smolagents_env.py @@ -777,7 +777,11 @@ class SmolagentsEnv(BaseEnv): # Handle both dict and ChatMessage objects if hasattr(message, "role") and hasattr(message, "content"): # Convert ChatMessage to dict - role = message.role.value if hasattr(message.role, "value") else str(message.role) + role = ( + message.role.value + if hasattr(message.role, "value") + else str(message.role) + ) messages.append({"role": role, "content": message.content}) elif isinstance(message, dict): messages.append(message) diff --git a/environments/smolagents_integration/tools/file_tools.py b/environments/smolagents_integration/tools/file_tools.py index 602b391b..5cc790c9 100644 --- a/environments/smolagents_integration/tools/file_tools.py +++ b/environments/smolagents_integration/tools/file_tools.py @@ -1,8 +1,10 @@ #!/usr/bin/env python import os + from smolagents import tool + @tool def read_file(file_path: str) -> str: """ @@ -19,6 +21,7 @@ def read_file(file_path: str) -> str: print(content) return content + @tool def write_file(file_path: str, content: str) -> str: """ @@ -36,6 +39,7 @@ def write_file(file_path: str, content: str) -> str: f.write(content) return f"Content written to {file_path}" + @tool def append_to_file(file_path: str, content: str) -> str: """ diff --git a/test_run.sh b/test_run.sh index b78c47b1..4abacef6 100755 --- a/test_run.sh +++ b/test_run.sh @@ -84,4 +84,4 @@ else echo "" echo "ERROR: Test failed!" exit 1 -fi \ No newline at end of file +fi