diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py b/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py
index 9c37f23a..54941eca 100644
--- a/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py
+++ b/environments/smolagents_integration/evaluations/smolagent_integrations/__init__.py
@@ -1,3 +1,3 @@
 """
 SmolaGents evaluation utilities for Atropos integrations.
-"""
\ No newline at end of file
+"""
diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py
index fa5093f2..91b053ea 100644
--- a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py
+++ b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/__init__.py
@@ -1,3 +1,3 @@
 """
 Scoring rubrics for SmolaGents integrations.
-"""
\ No newline at end of file
+"""
diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py
index 97b8dc69..06a29c69 100644
--- a/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py
+++ b/environments/smolagents_integration/evaluations/smolagent_integrations/rubrics/gaia_scorer.py
@@ -56,7 +56,10 @@ def question_scorer(
         try:
             model_answer = str(model_answer)
         except Exception as e:
-            warnings.warn(f"Failed to convert model_answer to string: {e}. Type: {type(model_answer)}", UserWarning)
+            warnings.warn(
+                f"Failed to convert model_answer to string: {e}. Type: {type(model_answer)}",
+                UserWarning,
+            )
             return False
 
     # if gt is a number
@@ -73,7 +76,9 @@ def question_scorer(
 
         # check length is the same
         if len(gt_elems) != len(ma_elems):
-            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
+            warnings.warn(
+                "Answer lists have different lengths, returning False.", UserWarning
+            )
             return False
 
         # compare each element as float or str
@@ -85,7 +90,8 @@ def question_scorer(
             else:
                 # we do not remove punct since comparisons can include punct
                 comparisons.append(
-                    normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
+                    normalize_str(ma_elem, remove_punct=False)
+                    == normalize_str(gt_elem, remove_punct=False)
                 )
         return all(comparisons)
 
@@ -116,8 +122,12 @@ def check_close_call(prediction, true_answer, is_correct):
             return is_correct
         else:
             if (
-                check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
-                and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
+                check_prediction_contains_answer_letters_in_order(
+                    str(prediction), str(true_answer)
+                )
+                and len(str(true_answer)) * 0.5
+                <= len(str(prediction))
+                <= len(str(true_answer)) * 2
             ):
                 # Remove print statement that causes duplicated output
                 return True
@@ -142,9 +152,12 @@ def normalize_str(input_str, remove_punct=True) -> str:
         try:
             input_str = str(input_str)
         except Exception as e:
-            warnings.warn(f"Failed to convert input to string: {e}. Type: {type(input_str)}", UserWarning)
+            warnings.warn(
+                f"Failed to convert input to string: {e}. Type: {type(input_str)}",
+                UserWarning,
+            )
             return ""
-            
+
     # Remove all white spaces. Required e.g for seagull vs. sea gull
     no_spaces = re.sub(r"\s", "", input_str)
 
diff --git a/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py b/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py
index 43bed3fc..3438d5b0 100644
--- a/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py
+++ b/environments/smolagents_integration/evaluations/smolagent_integrations/smolagents_scorer.py
@@ -7,32 +7,33 @@ execution error detection, and efficiency metrics.
 """
 
 import re
-from typing import Dict, List, Optional, Any
+from typing import Any, Dict, List, Optional
+
 import numpy as np
 
 
 def check_format_adherence(memory_content: str) -> float:
     """
     Check if memory content follows the required CodeAgent format.
-    
+
     The expected format includes:
     - "Thought:" section with reasoning
     - "Code:" section with a Python code block
     - Code blocks with triple backticks and "<end_code>" marker
-    
+
     Args:
         memory_content: The content of a memory step to check
-        
+
     Returns:
         float: Score between 0.0 and 1.0 indicating format compliance
     """
     thought_pattern = r"Thought: .+"
     code_pattern = r"Code:\s*```py\s*[\s\S]*?```<end_code>"
-    
+
     # Check if both patterns exist in the content
     has_thought = bool(re.search(thought_pattern, memory_content))
     has_code = bool(re.search(code_pattern, memory_content))
-    
+
     if has_thought and has_code:
         return 1.0
     elif has_thought or has_code:
@@ -44,10 +45,10 @@ def check_format_adherence(memory_content: str) -> float:
 def check_final_answer_usage(memory_content: str) -> bool:
     """
     Check if the final_answer tool was used appropriately.
-    
+
     Args:
         memory_content: The content of a memory step to check
-        
+
     Returns:
         bool: True if final_answer tool was used, False otherwise
     """
@@ -58,101 +59,108 @@ def check_final_answer_usage(memory_content: str) -> bool:
 def extract_execution_errors(agent_memory: List[Dict]) -> List[Dict]:
     """
     Extract execution errors from agent memory.
-    
+
     Looks for error patterns in the observations field of each memory step.
-    
+
     Args:
         agent_memory: List of memory steps from agent execution
-        
+
     Returns:
         List[Dict]: List of errors with step number and error message
     """
     execution_errors = []
-    
+
     if not agent_memory:
         return execution_errors
-    
+
     for step in agent_memory:
         # In SmolaGents ActionStep, observations field contains execution output
-        if isinstance(step, dict) and "observations" in step and isinstance(step["observations"], str):
+        if (
+            isinstance(step, dict)
+            and "observations" in step
+            and isinstance(step["observations"], str)
+        ):
             observation = step["observations"]
-            
+
             # Look for error patterns
             error_patterns = [
-                r"Error: .*", 
+                r"Error: .*",
                 r"Exception: .*",
                 r"Traceback \(most recent call last\).*",
                 r".*Error: .*",
-                r".*Exception: .*"
+                r".*Exception: .*",
             ]
-            
+
             for pattern in error_patterns:
                 matches = re.findall(pattern, observation, re.DOTALL)
                 if matches:
                     # Record step number and error
-                    execution_errors.append({
-                        "step": step.get("step_number", 0),
-                        "error": matches[0]
-                    })
-    
+                    execution_errors.append(
+                        {"step": step.get("step_number", 0), "error": matches[0]}
+                    )
+
     return execution_errors
 
 
 def calculate_efficiency_score(
-    steps_count: int, 
+    steps_count: int,
     max_steps: int,
     execution_time: float = None,  # Parameter kept for backward compatibility but not used
-    execution_times_history: Optional[List[float]] = None  # Parameter kept for backward compatibility but not used
+    execution_times_history: Optional[
+        List[float]
+    ] = None,  # Parameter kept for backward compatibility but not used
 ) -> float:
     """
     Calculate efficiency score based on steps used only.
     Execution time is no longer considered in the score calculation.
-    
+
     Args:
         steps_count: Number of steps taken by the agent
         max_steps: Maximum allowed steps
         execution_time: Not used, kept for backward compatibility
         execution_times_history: Not used, kept for backward compatibility
-        
+
     Returns:
         float: Efficiency score between 0.0 and 1.0
     """
     # Start with full efficiency score
     efficiency_score = 1.0
-    
+
     # Penalty for excessive steps (above 75% of max)
     step_penalty = 1.0
     if steps_count > (max_steps * 0.75):
-        step_penalty = max(0.5, 1.0 - ((steps_count - max_steps * 0.75) / (max_steps * 0.25)))
+        step_penalty = max(
+            0.5, 1.0 - ((steps_count - max_steps * 0.75) / (max_steps * 0.25))
+        )
         efficiency_score *= step_penalty
-    
+
     # Note: Execution time penalty has been removed
-    
+
     return efficiency_score
 
 
 def calculate_execution_score(agent_memory: List[Dict]) -> float:
     """
     Calculate execution success score by detecting errors in agent memory.
-    
+
     Args:
         agent_memory: List of memory steps from agent execution
-        
+
     Returns:
         float: Execution score between 0.0 and 1.0
     """
     execution_errors = extract_execution_errors(agent_memory)
-    
+
     if not agent_memory:
         return 0.0
-    
+
     total_steps = len(agent_memory)
     error_steps = len(execution_errors)
-    
+
     if total_steps > 0:
         # Penalize proportionally to the number of steps with errors
         execution_score = max(0, 1.0 - (error_steps / total_steps))
     else:
         execution_score = 0.0
-    
-    return execution_score
\ No newline at end of file
+
+    return execution_score
diff --git a/environments/smolagents_integration/smolagents_env.py b/environments/smolagents_integration/smolagents_env.py
index 6243e815..3851b1ff 100644
--- a/environments/smolagents_integration/smolagents_env.py
+++ b/environments/smolagents_integration/smolagents_env.py
@@ -777,7 +777,11 @@ class SmolagentsEnv(BaseEnv):
                     # Handle both dict and ChatMessage objects
                     if hasattr(message, "role") and hasattr(message, "content"):
                         # Convert ChatMessage to dict
-                        role = message.role.value if hasattr(message.role, "value") else str(message.role)
+                        role = (
+                            message.role.value
+                            if hasattr(message.role, "value")
+                            else str(message.role)
+                        )
                         messages.append({"role": role, "content": message.content})
                     elif isinstance(message, dict):
                         messages.append(message)
diff --git a/environments/smolagents_integration/tools/file_tools.py b/environments/smolagents_integration/tools/file_tools.py
index 602b391b..5cc790c9 100644
--- a/environments/smolagents_integration/tools/file_tools.py
+++ b/environments/smolagents_integration/tools/file_tools.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python
 
 import os
+
 from smolagents import tool
 
+
 @tool
 def read_file(file_path: str) -> str:
     """
@@ -19,6 +21,7 @@ def read_file(file_path: str) -> str:
         print(content)
         return content
 
+
 @tool
 def write_file(file_path: str, content: str) -> str:
     """
@@ -36,6 +39,7 @@ def write_file(file_path: str, content: str) -> str:
         f.write(content)
     return f"Content written to {file_path}"
 
+
 @tool
 def append_to_file(file_path: str, content: str) -> str:
     """
diff --git a/test_run.sh b/test_run.sh
index b78c47b1..4abacef6 100755
--- a/test_run.sh
+++ b/test_run.sh
@@ -84,4 +84,4 @@ else
     echo ""
     echo "ERROR: Test failed!"
     exit 1
-fi
\ No newline at end of file
+fi