diff --git a/environments/eval_environments/agieval_eval.py b/environments/eval_environments/agieval_eval.py
index fac22d54..75638175 100644
--- a/environments/eval_environments/agieval_eval.py
+++ b/environments/eval_environments/agieval_eval.py
@@ -36,16 +36,11 @@ import time
 from string import ascii_uppercase
 from typing import Dict, List, Optional, Tuple
 
-import wandb
 from datasets import load_dataset
 from eval_helpers import (
-    build_mcqa_fallback_patterns,
     create_system_content,
     extract_letter_from_answer_tag,
-    extract_thinking_content,
     get_default_thinking_prompt,
-    save_eval_results,
-    validate_thinking_format,
 )
 from pydantic import Field
 from tqdm.asyncio import tqdm_asyncio
@@ -333,12 +328,13 @@ class AGIEvalEnv(BaseEnv):
 
     async def setup(self) -> None:
         """Load the AGIEval dataset and prepare for evaluation."""
-        print(f"\nAGIEval Evaluation Setup (Generative Mode):")
+        print("\nAGIEval Evaluation Setup (Generative Mode):")
         print(f"  Max tokens for reasoning: {self.config.eval_max_tokens}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
         if self.config.thinking_mode:
-            print(f"  Thinking prompt: {self._get_thinking_prompt()[:100]}...")
+            prompt_preview = self._get_thinking_prompt()[:100]
+            print(f"  Thinking prompt: {prompt_preview}...")
 
         # Determine which subsets to use
         if self.config.subsets:
@@ -379,7 +375,7 @@ class AGIEvalEnv(BaseEnv):
         print(f"\n  Total evaluation items: {len(self.eval_data)}")
 
         # Print subset distribution
-        print(f"\n  Subset distribution:")
+        print("\n  Subset distribution:")
         for subset, count in sorted(subset_counts.items()):
             print(f"    {subset}: {count} questions")
 
@@ -584,7 +580,7 @@ class AGIEvalEnv(BaseEnv):
                             break
                         elif attempt < self.config.max_retries - 1:
                             if self.config.full_debug:
-                                print(f"  Response too short, retrying...")
+                                print("  Response too short, retrying...")
                             await asyncio.sleep(self.config.retry_delay)
 
                 except Exception as e:
@@ -594,15 +590,15 @@ class AGIEvalEnv(BaseEnv):
                     )
                     if hasattr(e, "response"):
                         try:
-                            print(
-                                f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
-                            )
-                        except:
+                            resp_text = e.response.text[:500] if hasattr(e.response, "text") else str(e.response)
+                            print(f"    Response: {resp_text}")
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
                     else:
-                        print(f"  Failed after {self.config.max_retries} attempts")
+                        retries = self.config.max_retries
+                        print(f"  Failed after {retries} attempts")
                         return {"is_correct": None, "sample": None}
 
             if not model_response:
@@ -669,9 +665,9 @@ class AGIEvalEnv(BaseEnv):
         """Run AGIEval evaluation."""
         start_time = time.time()
 
-        print(f"\n{'='*60}")
-        print(f"Starting AGIEval Evaluation (Generative/Reasoning Mode)")
-        print(f"{'='*60}")
+        print("\n" + "=" * 60)
+        print("Starting AGIEval Evaluation (Generative/Reasoning Mode)")
+        print("=" * 60)
         print(f"  Total questions: {len(self.all_eval_items)}")
         print(f"  Max tokens (for reasoning): {self.config.eval_max_tokens}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -782,9 +778,9 @@ class AGIEvalEnv(BaseEnv):
         self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]
 
         # Print summary
-        print(f"\n{'='*60}")
-        print(f"AGIEval Evaluation Results")
-        print(f"{'='*60}")
+        print("\n" + "=" * 60)
+        print("AGIEval Evaluation Results")
+        print("=" * 60)
         print(
             f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
         )
@@ -794,7 +790,7 @@ class AGIEvalEnv(BaseEnv):
             print(f"Format Compliance: {format_compliance_rate:.4f}")
             print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
 
-        print(f"\nSubset Breakdown:")
+        print("\nSubset Breakdown:")
         for subset, stats in sorted(subset_results.items()):
             if stats["total"] > 0:
                 subset_acc = stats["correct"] / stats["total"]
@@ -802,7 +798,7 @@ class AGIEvalEnv(BaseEnv):
                     f"  {subset}: {subset_acc:.4f} ({stats['correct']}/{stats['total']})"
                 )
 
-        print(f"\nExtraction Method Statistics:")
+        print("\nExtraction Method Statistics:")
         for method, stats in sorted(
             extraction_methods.items(), key=lambda x: -x[1]["count"]
         ):
@@ -810,7 +806,7 @@ class AGIEvalEnv(BaseEnv):
                 method_acc = stats["correct"] / stats["count"]
                 print(f"  {method}: {stats['count']} uses, {method_acc:.4f} accuracy")
 
-        print(f"{'='*60}\n")
+        print("=" * 60 + "\n")
 
         # Log evaluation results
         try:
diff --git a/environments/eval_environments/aime_eval.py b/environments/eval_environments/aime_eval.py
index abb08a2e..53026499 100644
--- a/environments/eval_environments/aime_eval.py
+++ b/environments/eval_environments/aime_eval.py
@@ -23,12 +23,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
 """
 
 import asyncio
-import os
 import random
-import re
-import time
-from concurrent.futures import ProcessPoolExecutor
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
 
 import wandb
 from datasets import load_dataset
@@ -50,7 +46,6 @@ from atroposlib.envs.base import (
     APIServerConfig,
     BaseEnv,
     BaseEnvConfig,
-    EvalHandlingEnum,
 )
 
 # Available AIME years
@@ -62,7 +57,13 @@ AIME_DATASETS = {
 
 # Prompt template following lighteval's AIME structure
 # Important: Uses the "I hope it is correct" format for math-verify
-AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+# The trailing backslashes are line continuations inside the string literal, so the
+# prompt text stays byte-identical to the original single-line prompt (no added newlines).
+AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly. \
+The last line of your response should be of the following format: \
+'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) \
+where ANSWER is just the final number or expression that solves the problem. \
+Think step by step before answering.
 
 Note: AIME answers are always integers from 0 to 999.
 
@@ -172,7 +173,7 @@ class AIMEEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nAIME Evaluation Setup (Generative Mode):")
+        print("\nAIME Evaluation Setup (Generative Mode):")
         print(f"  Years: {self.config.years}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -501,12 +502,12 @@ class AIMEEvalEnv(BaseEnv):
         print(f"  Format Compliance: {format_valid / total:.2%}")
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
-        print(f"\n  Per-Year Breakdown:")
+        print("\n  Per-Year Breakdown:")
         for year, data in sorted(year_metrics.items()):
             print(
                 f"    AIME {year}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
             )
-        print(f"\n  Verification Methods:")
+        print("\n  Verification Methods:")
         for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
             print(f"    {method}: {count} ({count/total:.1%})")
         print(f"{'='*60}\n")
diff --git a/environments/eval_environments/aimo_eval.py b/environments/eval_environments/aimo_eval.py
index 10385c82..9ba38eef 100644
--- a/environments/eval_environments/aimo_eval.py
+++ b/environments/eval_environments/aimo_eval.py
@@ -17,12 +17,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
 """
 
 import asyncio
-import os
 import random
-import re
-import time
-from concurrent.futures import ProcessPoolExecutor
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
 
 import wandb
 from datasets import load_dataset
@@ -45,7 +41,6 @@ from atroposlib.envs.base import (
     APIServerConfig,
     BaseEnv,
     BaseEnvConfig,
-    EvalHandlingEnum,
 )
 
 # Prompt template - AIMO doesn't have a specific template in lighteval
@@ -161,7 +156,7 @@ class AIMOEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nAIMO Evaluation Setup (Generative Mode):")
+        print("\nAIMO Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -401,7 +396,7 @@ class AIMOEvalEnv(BaseEnv):
         print(f"  Format Compliance: {format_valid / total:.2%}")
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
-        print(f"\n  Verification Methods:")
+        print("\n  Verification Methods:")
         for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
             print(f"    {method}: {count} ({count/total:.1%})")
         print(f"{'='*60}\n")
diff --git a/environments/eval_environments/arc_agi_eval.py b/environments/eval_environments/arc_agi_eval.py
index 3da7bf90..bd522a3f 100644
--- a/environments/eval_environments/arc_agi_eval.py
+++ b/environments/eval_environments/arc_agi_eval.py
@@ -27,17 +27,14 @@ Answer must be provided in <answer></answer> tags as a JSON 2D array.
 import ast
 import asyncio
 import json
-import os
 import re
-import time
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import wandb
 from datasets import load_dataset
 from eval_helpers import (
     ANSWER_TAG_PATTERN,
     create_system_content,
-    extract_thinking_content,
     get_default_thinking_prompt,
     save_eval_results,
     validate_thinking_format,
@@ -49,7 +46,6 @@ from atroposlib.envs.base import (
     APIServerConfig,
     BaseEnv,
     BaseEnvConfig,
-    EvalHandlingEnum,
 )
 
 
@@ -168,7 +164,7 @@ class ARCAGIEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the ARC-AGI 2 dataset."""
-        print(f"\nARC-AGI 2 Evaluation Setup (Generative Mode):")
+        print("\nARC-AGI 2 Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -216,7 +212,8 @@ class ARCAGIEvalEnv(BaseEnv):
         gold_output = item["question"][0]["output"]
 
         # Build the prompt
-        query = """You are solving an ARC-AGI puzzle. You will be shown training examples where an input grid is transformed into an output grid following a specific pattern or rule.
+        query = """You are solving an ARC-AGI puzzle. You will be shown training examples \
+where an input grid is transformed into an output grid following a specific pattern or rule.
 
 Your task is to:
 1. Analyze the training examples to understand the transformation pattern
@@ -315,7 +312,7 @@ Example format:
                 grid = ast.literal_eval(match)
                 if self._is_valid_grid(grid):
                     return grid
-            except:
+            except Exception:
                 continue
 
         # Strategy 4: Extract rows one per line
@@ -328,7 +325,7 @@ Example format:
                 grid = [json.loads(row) for row in rows]
                 if self._is_valid_grid(grid):
                     return grid
-            except:
+            except Exception:
                 pass
 
         return None
diff --git a/environments/eval_environments/arc_eval.py b/environments/eval_environments/arc_eval.py
index 4dbeddee..43f02308 100644
--- a/environments/eval_environments/arc_eval.py
+++ b/environments/eval_environments/arc_eval.py
@@ -21,11 +21,8 @@ Supports optional thinking mode with <think></think> tags.
 """
 
 import asyncio
-import os
-import re
-import time
 from string import ascii_uppercase
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import wandb
 from datasets import load_dataset
@@ -33,7 +30,6 @@ from eval_helpers import (
     build_mcqa_fallback_patterns,
     create_system_content,
     extract_letter_from_answer_tag,
-    extract_thinking_content,
     get_default_thinking_prompt,
     save_eval_results,
     validate_thinking_format,
@@ -45,7 +41,6 @@ from atroposlib.envs.base import (
     APIServerConfig,
     BaseEnv,
     BaseEnvConfig,
-    EvalHandlingEnum,
 )
 
 
@@ -173,7 +168,7 @@ class ARCEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the ARC dataset."""
-        print(f"\nARC Evaluation Setup (Generative Mode):")
+        print("\nARC Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Evaluation split: {self.config.eval_split}")
diff --git a/environments/eval_environments/arena_hard_environment.py b/environments/eval_environments/arena_hard_environment.py
index 0c14ce1d..742288e4 100644
--- a/environments/eval_environments/arena_hard_environment.py
+++ b/environments/eval_environments/arena_hard_environment.py
@@ -9,7 +9,6 @@ from datasets import load_dataset
 from eval_helpers import (
     create_system_content,
     get_default_thinking_prompt,
-    save_eval_results,
 )
 from pydantic import Field
 from tenacity import retry, stop_after_attempt, wait_random_exponential
diff --git a/environments/eval_environments/bbh_eval.py b/environments/eval_environments/bbh_eval.py
index 66b3b235..81aba681 100644
--- a/environments/eval_environments/bbh_eval.py
+++ b/environments/eval_environments/bbh_eval.py
@@ -20,12 +20,9 @@ snarks, sports_understanding, temporal_sequences, tracking_shuffled_objects (3/5
 """
 
 import asyncio
-import os
 import random
-import re
-import time
 from string import ascii_uppercase
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import wandb
 from datasets import load_dataset
@@ -33,7 +30,6 @@ from eval_helpers import (
     build_mcqa_fallback_patterns,
     create_system_content,
     extract_letter_from_answer_tag,
-    extract_thinking_content,
     get_default_thinking_prompt,
     save_eval_results,
     validate_thinking_format,
@@ -45,7 +41,6 @@ from atroposlib.envs.base import (
     APIServerConfig,
     BaseEnv,
     BaseEnvConfig,
-    EvalHandlingEnum,
 )
 
 # All available BBH subsets
@@ -86,8 +81,7 @@ def format_bbh_prompt(item: Dict) -> Tuple[str, List[str], int]:
     input_prefix = item.get("example_input_prefix", "\nQuestion: ")
     input_text = item.get("input", "")
     choice_prefix = item.get("choice_prefix", "\n  Choices: ")
-    output_prefix = item.get("example_output_prefix", "\nAnswer: ")
-
+    # Note: output_prefix from item.get("example_output_prefix") is not used in generative mode
     choices = item.get("choices", [])
     target_idx = item.get("target_idx", 0)
 
@@ -222,7 +216,7 @@ class BBHEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nBBH Evaluation Setup (Generative Mode):")
+        print("\nBBH Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -553,7 +547,7 @@ class BBHEvalEnv(BaseEnv):
         print(f"  Format Compliance: {format_valid / total:.2%}")
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
-        print(f"\n  Per-Subset Breakdown:")
+        print("\n  Per-Subset Breakdown:")
         for subset, data in sorted(
             subset_metrics.items(), key=lambda x: -x[1]["accuracy"]
         ):
diff --git a/environments/eval_environments/boolq_eval.py b/environments/eval_environments/boolq_eval.py
index 4f9868ef..e0276841 100644
--- a/environments/eval_environments/boolq_eval.py
+++ b/environments/eval_environments/boolq_eval.py
@@ -20,11 +20,7 @@ Supports optional thinking mode with <think></think> tags.
 """
 
 import asyncio
-import os
-import re
-import time
-from string import ascii_uppercase
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import wandb
 from datasets import load_dataset
@@ -33,7 +29,6 @@ from eval_helpers import (
     build_mcqa_fallback_patterns,
     create_system_content,
     extract_letter_from_answer_tag,
-    extract_thinking_content,
     get_default_thinking_prompt,
     save_eval_results,
     validate_thinking_format,
@@ -45,7 +40,6 @@ from atroposlib.envs.base import (
     APIServerConfig,
     BaseEnv,
     BaseEnvConfig,
-    EvalHandlingEnum,
 )
 
 
@@ -170,7 +164,7 @@ class BoolQEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the BoolQ dataset."""
-        print(f"\nBoolQ Evaluation Setup (Generative Mode):")
+        print("\nBoolQ Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
diff --git a/environments/eval_environments/drop_eval.py b/environments/eval_environments/drop_eval.py
index 348e8711..994471d0 100644
--- a/environments/eval_environments/drop_eval.py
+++ b/environments/eval_environments/drop_eval.py
@@ -23,17 +23,13 @@ import asyncio
 import os
 import re
 import time
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
-import wandb
 from datasets import load_dataset
 from eval_helpers import (
     create_system_content,
     extract_freeform_from_answer_tag,
-    extract_thinking_content,
     get_default_thinking_prompt,
-    save_eval_results,
-    validate_thinking_format,
 )
 from pydantic import Field
 from tqdm.asyncio import tqdm_asyncio
@@ -317,7 +313,7 @@ Question: {question}"""
 
     async def setup(self) -> None:
         """Load the DROP dataset and prepare for evaluation."""
-        print(f"\nDROP Evaluation Setup:")
+        print("\nDROP Evaluation Setup:")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Max tokens: {self.config.eval_max_tokens}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -539,7 +535,7 @@ Question: {question}"""
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -615,7 +611,7 @@ Question: {question}"""
         start_time = time.time()
 
         print(f"\n{'='*60}")
-        print(f"Starting DROP Evaluation")
+        print("Starting DROP Evaluation")
         print(f"{'='*60}")
         print(f"  Total questions: {len(self.all_eval_items)}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -694,7 +690,7 @@ Question: {question}"""
 
         # Print summary
         print(f"\n{'='*60}")
-        print(f"DROP Evaluation Results")
+        print("DROP Evaluation Results")
         print(f"{'='*60}")
         print(f"Exact Match Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")
         print(f"Average F1 Score: {avg_f1:.4f}")
diff --git a/environments/eval_environments/eval_helpers.py b/environments/eval_environments/eval_helpers.py
index 8f11800d..cb1318a0 100644
--- a/environments/eval_environments/eval_helpers.py
+++ b/environments/eval_environments/eval_helpers.py
@@ -19,7 +19,7 @@ import os
 import re
 from concurrent.futures import ProcessPoolExecutor
 from string import ascii_uppercase
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 # Try to import math_verify libraries (optional dependency for math evals)
 try:
diff --git a/environments/eval_environments/gpqa_eval.py b/environments/eval_environments/gpqa_eval.py
index b69e956b..fabc2d3f 100644
--- a/environments/eval_environments/gpqa_eval.py
+++ b/environments/eval_environments/gpqa_eval.py
@@ -29,16 +29,11 @@ import time
 from string import ascii_uppercase
 from typing import Dict, List, Optional, Tuple
 
-import wandb
 from datasets import load_dataset
 from eval_helpers import (
-    build_mcqa_fallback_patterns,
     create_system_content,
     extract_letter_from_answer_tag,
-    extract_thinking_content,
     get_default_thinking_prompt,
-    save_eval_results,
-    validate_thinking_format,
 )
 from pydantic import Field
 from tqdm.asyncio import tqdm_asyncio
@@ -289,7 +284,7 @@ class GPQAEvalEnv(BaseEnv):
 
     async def setup(self) -> None:
         """Load the GPQA dataset and prepare for evaluation."""
-        print(f"\nGPQA Evaluation Setup (Generative Mode):")
+        print("\nGPQA Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Max tokens for reasoning: {self.config.eval_max_tokens}")
@@ -507,7 +502,7 @@ class GPQAEvalEnv(BaseEnv):
                             break
                         elif attempt < self.config.max_retries - 1:
                             if self.config.full_debug:
-                                print(f"  Response too short, retrying...")
+                                print("  Response too short, retrying...")
                             await asyncio.sleep(self.config.retry_delay)
 
                 except Exception as e:
@@ -520,7 +515,7 @@ class GPQAEvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -592,7 +587,7 @@ class GPQAEvalEnv(BaseEnv):
         start_time = time.time()
 
         print(f"\n{'='*60}")
-        print(f"Starting GPQA Evaluation (Generative/Reasoning Mode)")
+        print("Starting GPQA Evaluation (Generative/Reasoning Mode)")
         print(f"{'='*60}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Total questions: {len(self.all_eval_items)}")
@@ -708,7 +703,7 @@ class GPQAEvalEnv(BaseEnv):
             print(f"Format Compliance: {format_compliance_rate:.4f}")
             print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
 
-        print(f"\nSubdomain Breakdown:")
+        print("\nSubdomain Breakdown:")
         for subdomain, stats in sorted(subdomain_results.items()):
             if stats["total"] > 0:
                 subdom_acc = stats["correct"] / stats["total"]
@@ -716,7 +711,7 @@ class GPQAEvalEnv(BaseEnv):
                     f"  {subdomain}: {subdom_acc:.4f} ({stats['correct']}/{stats['total']})"
                 )
 
-        print(f"\nExtraction Method Statistics:")
+        print("\nExtraction Method Statistics:")
         for method, stats in sorted(
             extraction_methods.items(), key=lambda x: -x[1]["count"]
         ):
diff --git a/environments/eval_environments/gsm8k_eval.py b/environments/eval_environments/gsm8k_eval.py
index d0d15800..59dde1b2 100644
--- a/environments/eval_environments/gsm8k_eval.py
+++ b/environments/eval_environments/gsm8k_eval.py
@@ -161,7 +161,7 @@ class GSM8KEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nGSM8K Evaluation Setup (Generative Mode):")
+        print("\nGSM8K Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -416,7 +416,7 @@ class GSM8KEvalEnv(BaseEnv):
         print(f"  Format Compliance: {format_valid / total:.2%}")
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
-        print(f"\n  Verification Methods:")
+        print("\n  Verification Methods:")
         for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
             print(f"    {method}: {count} ({count/total:.1%})")
         print(f"{'='*60}\n")
diff --git a/environments/eval_environments/hellaswag_eval.py b/environments/eval_environments/hellaswag_eval.py
index bfc3e2c6..1c6fae79 100644
--- a/environments/eval_environments/hellaswag_eval.py
+++ b/environments/eval_environments/hellaswag_eval.py
@@ -167,7 +167,7 @@ class HellaSwagEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the HellaSwag dataset."""
-        print(f"\nHellaSwag Evaluation Setup (Generative Mode):")
+        print("\nHellaSwag Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
diff --git a/environments/eval_environments/hle_eval.py b/environments/eval_environments/hle_eval.py
index 963c4767..dc6becdb 100644
--- a/environments/eval_environments/hle_eval.py
+++ b/environments/eval_environments/hle_eval.py
@@ -152,7 +152,7 @@ class HLEEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nHLE Evaluation Setup (Generative Mode):")
+        print("\nHLE Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -525,7 +525,7 @@ class HLEEvalEnv(BaseEnv):
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
         if category_metrics:
-            print(f"\n  Per-Category Breakdown:")
+            print("\n  Per-Category Breakdown:")
             for cat, data in sorted(
                 category_metrics.items(), key=lambda x: -x[1]["accuracy"]
             ):
diff --git a/environments/eval_environments/ifeval_eval.py b/environments/eval_environments/ifeval_eval.py
index 2061d914..86c703ca 100644
--- a/environments/eval_environments/ifeval_eval.py
+++ b/environments/eval_environments/ifeval_eval.py
@@ -228,7 +228,7 @@ class IFEvalEnv(BaseEnv):
 
     async def setup(self) -> None:
         """Load the IFEval dataset and prepare for evaluation."""
-        print(f"\nIFEval Evaluation Setup:")
+        print("\nIFEval Evaluation Setup:")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Max tokens: {self.config.eval_max_tokens}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -478,7 +478,7 @@ class IFEvalEnv(BaseEnv):
                             break
                         elif attempt < self.config.max_retries - 1:
                             if self.config.full_debug:
-                                print(f"  Response too short, retrying...")
+                                print("  Response too short, retrying...")
                             await asyncio.sleep(self.config.retry_delay)
 
                 except Exception as e:
@@ -490,7 +490,7 @@ class IFEvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -569,7 +569,7 @@ class IFEvalEnv(BaseEnv):
         start_time = time.time()
 
         print(f"\n{'='*60}")
-        print(f"Starting IFEval Evaluation (Instruction Following)")
+        print("Starting IFEval Evaluation (Instruction Following)")
         print(f"{'='*60}")
         print(f"  Total prompts: {len(self.all_eval_items)}")
         print(f"  Max tokens: {self.config.eval_max_tokens}")
@@ -682,7 +682,7 @@ class IFEvalEnv(BaseEnv):
 
         # Print summary
         print(f"\n{'='*60}")
-        print(f"IFEval Evaluation Results")
+        print("IFEval Evaluation Results")
         print(f"{'='*60}")
         print(
             f"Prompt-Level Strict Accuracy: {prompt_strict_acc:.4f} ({prompt_strict_count}/{total_count})"
diff --git a/environments/eval_environments/judgemark_eval.py b/environments/eval_environments/judgemark_eval.py
index 7b70ce4b..37728912 100644
--- a/environments/eval_environments/judgemark_eval.py
+++ b/environments/eval_environments/judgemark_eval.py
@@ -351,7 +351,7 @@ class JudgeMarkEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load JudgeMark data files."""
-        print(f"\nLoading JudgeMark v2 data...")
+        print("\nLoading JudgeMark v2 data...")
 
         # Determine data directory
         data_dir = JUDGEMARK_DATA_DIR
@@ -701,7 +701,7 @@ class JudgeMarkEvalEnv(BaseEnv):
             f"  Models with reference: {calibrated_cross_stats['num_models_with_reference']}"
         )
 
-        print(f"\n  Per-model averages (calibrated):")
+        print("\n  Per-model averages (calibrated):")
         sorted_models = sorted(
             model_stats.items(), key=lambda x: x[1]["mean_calibrated"], reverse=True
         )
diff --git a/environments/eval_environments/math500_eval.py b/environments/eval_environments/math500_eval.py
index 1d8ad7a1..c59b58ab 100644
--- a/environments/eval_environments/math500_eval.py
+++ b/environments/eval_environments/math500_eval.py
@@ -172,7 +172,7 @@ class MATH500EvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nMATH-500 Evaluation Setup (Generative Mode):")
+        print("\nMATH-500 Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -470,7 +470,7 @@ class MATH500EvalEnv(BaseEnv):
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
         if subject_metrics and len(subject_metrics) > 1:
-            print(f"\n  Per-Subject Breakdown:")
+            print("\n  Per-Subject Breakdown:")
             for subject, data in sorted(
                 subject_metrics.items(), key=lambda x: -x[1]["accuracy"]
             ):
@@ -478,7 +478,7 @@ class MATH500EvalEnv(BaseEnv):
                     f"    {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
                 )
         if level_metrics and len(level_metrics) > 1:
-            print(f"\n  Per-Level Breakdown:")
+            print("\n  Per-Level Breakdown:")
             for level, data in sorted(level_metrics.items()):
                 print(
                     f"    Level {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
diff --git a/environments/eval_environments/math_eval.py b/environments/eval_environments/math_eval.py
index d010822b..e516e779 100644
--- a/environments/eval_environments/math_eval.py
+++ b/environments/eval_environments/math_eval.py
@@ -178,7 +178,7 @@ class MATHEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nMATH Evaluation Setup (Generative Mode):")
+        print("\nMATH Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subsets: {self.config.subsets}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -484,7 +484,7 @@ class MATHEvalEnv(BaseEnv):
         print(f"  Format Compliance: {format_valid / total:.2%}")
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
-        print(f"\n  Per-Subset Breakdown:")
+        print("\n  Per-Subset Breakdown:")
         for subset, data in sorted(
             subset_metrics.items(), key=lambda x: -x[1]["accuracy"]
         ):
@@ -492,7 +492,7 @@ class MATHEvalEnv(BaseEnv):
                 f"    {subset}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
             )
         if level_metrics and len(level_metrics) > 1:
-            print(f"\n  Per-Level Breakdown:")
+            print("\n  Per-Level Breakdown:")
             for level, data in sorted(level_metrics.items()):
                 print(
                     f"    {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
diff --git a/environments/eval_environments/mixeval_eval.py b/environments/eval_environments/mixeval_eval.py
index 0a7be95f..a75d746f 100644
--- a/environments/eval_environments/mixeval_eval.py
+++ b/environments/eval_environments/mixeval_eval.py
@@ -343,7 +343,7 @@ class MixEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nMixEval Evaluation Setup (with LLM Judge):")
+        print("\nMixEval Evaluation Setup (with LLM Judge):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Difficulty: {self.config.difficulty}")
         print(f"  Question types: {self.config.question_types}")
@@ -737,7 +737,7 @@ class MixEvalEnv(BaseEnv):
             print(f"  Format Compliance: {format_valid / total:.2%}")
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
         print(f"  Judge Error Rate: {self.judge_error_count / total:.2%}")
-        print(f"\n  Per-Benchmark Breakdown:")
+        print("\n  Per-Benchmark Breakdown:")
         for bench, data in sorted(
             benchmark_metrics.items(), key=lambda x: -x[1]["avg_score"]
         ):
diff --git a/environments/eval_environments/mmlu_eval.py b/environments/eval_environments/mmlu_eval.py
index dcf9f338..92b05a78 100644
--- a/environments/eval_environments/mmlu_eval.py
+++ b/environments/eval_environments/mmlu_eval.py
@@ -464,7 +464,7 @@ class MMLUEvalEnv(BaseEnv):
         if not self.subjects:
             raise ValueError("No valid MMLU subjects specified for evaluation.")
 
-        print(f"\nMMLU Evaluation Setup (Generative Mode):")
+        print("\nMMLU Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subjects: {len(self.subjects)} subjects")
         print(f"  Few-shot examples: {self.config.num_few_shot}")
@@ -821,7 +821,7 @@ class MMLUEvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -907,7 +907,7 @@ class MMLUEvalEnv(BaseEnv):
         start_time = time.time()
 
         print(f"\n{'='*60}")
-        print(f"Starting MMLU Evaluation (Generative/Reasoning Mode)")
+        print("Starting MMLU Evaluation (Generative/Reasoning Mode)")
         print(f"{'='*60}")
         print(f"  Subjects: {len(self.subjects)}")
         print(f"  Total questions: {len(self.all_eval_items)}")
@@ -1046,7 +1046,7 @@ class MMLUEvalEnv(BaseEnv):
 
         # Print summary
         print(f"\n{'='*60}")
-        print(f"MMLU Evaluation Results")
+        print("MMLU Evaluation Results")
         print(f"{'='*60}")
         print(
             f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
@@ -1057,7 +1057,7 @@ class MMLUEvalEnv(BaseEnv):
             print(f"Format Compliance: {format_compliance_rate:.4f}")
             print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
 
-        print(f"\nCategory Breakdown:")
+        print("\nCategory Breakdown:")
         for category, stats in category_results.items():
             if stats["total"] > 0:
                 cat_acc = stats["correct"] / stats["total"]
@@ -1065,7 +1065,7 @@ class MMLUEvalEnv(BaseEnv):
                     f"  {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})"
                 )
 
-        print(f"\nExtraction Method Statistics:")
+        print("\nExtraction Method Statistics:")
         for method, stats in sorted(
             extraction_methods.items(), key=lambda x: -x[1]["count"]
         ):
diff --git a/environments/eval_environments/mmlu_pro_eval.py b/environments/eval_environments/mmlu_pro_eval.py
index 1eae91a8..e4315ab2 100644
--- a/environments/eval_environments/mmlu_pro_eval.py
+++ b/environments/eval_environments/mmlu_pro_eval.py
@@ -307,7 +307,7 @@ class MMLUProEvalEnv(BaseEnv):
 
     async def setup(self) -> None:
         """Load the MMLU-Pro dataset and prepare for evaluation."""
-        print(f"\nMMLU-Pro Evaluation Setup (Generative Mode):")
+        print("\nMMLU-Pro Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Few-shot examples: {self.config.num_few_shot}")
         print(f"  Max tokens for reasoning: {self.config.eval_max_tokens}")
@@ -358,7 +358,7 @@ class MMLUProEvalEnv(BaseEnv):
             cat = item.get("category", "unknown")
             category_counts[cat] = category_counts.get(cat, 0) + 1
 
-        print(f"\n  Category distribution:")
+        print("\n  Category distribution:")
         for cat, count in sorted(category_counts.items()):
             print(f"    {cat}: {count} questions")
 
@@ -586,7 +586,7 @@ class MMLUProEvalEnv(BaseEnv):
                             break
                         elif attempt < self.config.max_retries - 1:
                             if self.config.full_debug:
-                                print(f"  Response too short, retrying...")
+                                print("  Response too short, retrying...")
                             await asyncio.sleep(self.config.retry_delay)
 
                 except Exception as e:
@@ -599,7 +599,7 @@ class MMLUProEvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -673,7 +673,7 @@ class MMLUProEvalEnv(BaseEnv):
         start_time = time.time()
 
         print(f"\n{'='*60}")
-        print(f"Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)")
+        print("Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)")
         print(f"{'='*60}")
         print(f"  Total questions: {len(self.all_eval_items)}")
         print(f"  Few-shot examples: {self.config.num_few_shot}")
@@ -788,7 +788,7 @@ class MMLUProEvalEnv(BaseEnv):
 
         # Print summary
         print(f"\n{'='*60}")
-        print(f"MMLU-Pro Evaluation Results")
+        print("MMLU-Pro Evaluation Results")
         print(f"{'='*60}")
         print(
             f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
@@ -799,7 +799,7 @@ class MMLUProEvalEnv(BaseEnv):
             print(f"Format Compliance: {format_compliance_rate:.4f}")
             print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
 
-        print(f"\nCategory Breakdown:")
+        print("\nCategory Breakdown:")
         for category, stats in sorted(category_results.items()):
             if stats["total"] > 0:
                 cat_acc = stats["correct"] / stats["total"]
@@ -807,7 +807,7 @@ class MMLUProEvalEnv(BaseEnv):
                     f"  {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})"
                 )
 
-        print(f"\nExtraction Method Statistics:")
+        print("\nExtraction Method Statistics:")
         for method, stats in sorted(
             extraction_methods.items(), key=lambda x: -x[1]["count"]
         ):
diff --git a/environments/eval_environments/musr_eval.py b/environments/eval_environments/musr_eval.py
index 02aa0656..13251fb8 100644
--- a/environments/eval_environments/musr_eval.py
+++ b/environments/eval_environments/musr_eval.py
@@ -282,7 +282,7 @@ class MuSREvalEnv(BaseEnv):
         if isinstance(choices_raw, str):
             try:
                 choices = ast.literal_eval(choices_raw)
-            except:
+            except Exception:
                 choices = []
         else:
             choices = choices_raw
@@ -301,7 +301,7 @@ class MuSREvalEnv(BaseEnv):
 
     async def setup(self) -> None:
         """Load the MuSR dataset and prepare for evaluation."""
-        print(f"\nMuSR Evaluation Setup:")
+        print("\nMuSR Evaluation Setup:")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Max tokens: {self.config.eval_max_tokens}")
@@ -495,7 +495,7 @@ class MuSREvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
diff --git a/environments/eval_environments/obqa_eval.py b/environments/eval_environments/obqa_eval.py
index 84f89bcc..57e3b684 100644
--- a/environments/eval_environments/obqa_eval.py
+++ b/environments/eval_environments/obqa_eval.py
@@ -291,7 +291,7 @@ class OBQAEvalEnv(BaseEnv):
 
     async def setup(self) -> None:
         """Load the OpenBookQA dataset and prepare for evaluation."""
-        print(f"\nOpenBookQA Evaluation Setup:")
+        print("\nOpenBookQA Evaluation Setup:")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Eval split: {self.config.eval_split}")
@@ -481,7 +481,7 @@ class OBQAEvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -560,7 +560,7 @@ class OBQAEvalEnv(BaseEnv):
         start_time = time.time()
 
         print(f"\n{'='*60}")
-        print(f"Starting OpenBookQA Evaluation")
+        print("Starting OpenBookQA Evaluation")
         print(f"{'='*60}")
         print(f"  Total questions: {len(self.all_eval_items)}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
@@ -641,7 +641,7 @@ class OBQAEvalEnv(BaseEnv):
 
         # Print summary
         print(f"\n{'='*60}")
-        print(f"OpenBookQA Evaluation Results")
+        print("OpenBookQA Evaluation Results")
         print(f"{'='*60}")
         print(f"Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")
         print(f"Answer Extraction Rate: {extraction_rate:.4f}")
diff --git a/environments/eval_environments/olympiadbench_eval.py b/environments/eval_environments/olympiadbench_eval.py
index 4b3cde69..1417001c 100644
--- a/environments/eval_environments/olympiadbench_eval.py
+++ b/environments/eval_environments/olympiadbench_eval.py
@@ -199,7 +199,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nOlympiadBench Evaluation Setup (Generative Mode):")
+        print("\nOlympiadBench Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -648,7 +648,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
         if subject_metrics:
-            print(f"\n  Per-Subject Breakdown:")
+            print("\n  Per-Subject Breakdown:")
             for subject, data in subject_metrics.items():
                 print(
                     f"    {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
diff --git a/environments/eval_environments/piqa_eval.py b/environments/eval_environments/piqa_eval.py
index 75d69f02..2fce1a6e 100644
--- a/environments/eval_environments/piqa_eval.py
+++ b/environments/eval_environments/piqa_eval.py
@@ -167,7 +167,7 @@ class PIQAEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the PIQA dataset."""
-        print(f"\nPIQA Evaluation Setup (Generative Mode):")
+        print("\nPIQA Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
diff --git a/environments/eval_environments/pubmedqa_eval.py b/environments/eval_environments/pubmedqa_eval.py
index ca9f2e37..6ce688e7 100644
--- a/environments/eval_environments/pubmedqa_eval.py
+++ b/environments/eval_environments/pubmedqa_eval.py
@@ -154,7 +154,7 @@ class PubMedQAEvalEnv(BaseEnv):
         if not self._dataset_loaded:
             await self._load_dataset()
 
-        print(f"\nPubMedQA Evaluation Setup (Generative Mode):")
+        print("\nPubMedQA Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Evaluation split: {self.config.eval_split}")
@@ -481,7 +481,7 @@ class PubMedQAEvalEnv(BaseEnv):
         print(f"  Format Compliance: {format_valid / total:.2%}")
         if self.config.thinking_mode:
             print(f"  Thinking Utilization: {has_thinking / total:.2%}")
-        print(f"\n  Per-Answer Breakdown:")
+        print("\n  Per-Answer Breakdown:")
         for answer, data in answer_metrics.items():
             print(
                 f"    {answer}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
diff --git a/environments/eval_environments/simpleqa_eval.py b/environments/eval_environments/simpleqa_eval.py
index eb1121ae..f15f28ef 100644
--- a/environments/eval_environments/simpleqa_eval.py
+++ b/environments/eval_environments/simpleqa_eval.py
@@ -455,7 +455,7 @@ class SimpleQAEvalEnv(BaseEnv):
             else "String Matching (Nous)"
         )
 
-        print(f"\nSimpleQA Evaluation Setup:")
+        print("\nSimpleQA Evaluation Setup:")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Scoring mode: {scoring_mode}")
         print(f"  Max tokens for answer: {self.config.eval_max_tokens}")
@@ -627,7 +627,7 @@ class SimpleQAEvalEnv(BaseEnv):
                             break
                         elif attempt < self.config.max_retries - 1:
                             if self.config.full_debug:
-                                print(f"  Response too short, retrying...")
+                                print("  Response too short, retrying...")
                             await asyncio.sleep(self.config.retry_delay)
 
                 except Exception as e:
@@ -639,7 +639,7 @@ class SimpleQAEvalEnv(BaseEnv):
                             print(
                                 f"    Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
                             )
-                        except:
+                        except Exception:
                             pass
                     if attempt < self.config.max_retries - 1:
                         await asyncio.sleep(self.config.retry_delay)
@@ -808,7 +808,7 @@ class SimpleQAEvalEnv(BaseEnv):
         )
 
         print(f"\n{'='*60}")
-        print(f"Starting SimpleQA Evaluation")
+        print("Starting SimpleQA Evaluation")
         print(f"{'='*60}")
         print(f"  Total questions: {len(self.all_eval_items)}")
         print(f"  Scoring mode: {scoring_mode}")
@@ -983,7 +983,7 @@ class SimpleQAEvalEnv(BaseEnv):
                 f"Accuracy (if attempted): {eval_metrics['eval/accuracy_if_attempted']:.4f}"
             )
             print(f"Not Attempted Rate: {eval_metrics['eval/not_attempted_rate']:.4f}")
-            print(f"\nGrade Distribution:")
+            print("\nGrade Distribution:")
             print(f"  CORRECT: {correct_count} ({100*correct_count/total_count:.1f}%)")
             print(
                 f"  INCORRECT: {incorrect_count} ({100*incorrect_count/total_count:.1f}%)"
@@ -1012,7 +1012,7 @@ class SimpleQAEvalEnv(BaseEnv):
             print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
 
         if len(sorted_topics) > 0:
-            print(f"\nTop Topics (by count):")
+            print("\nTop Topics (by count):")
             for topic, stats in sorted_topics[:10]:
                 if stats["total"] > 0:
                     topic_acc = stats["correct"] / stats["total"]
diff --git a/environments/eval_environments/siqa_eval.py b/environments/eval_environments/siqa_eval.py
index 8a56fcb2..6c027ff6 100644
--- a/environments/eval_environments/siqa_eval.py
+++ b/environments/eval_environments/siqa_eval.py
@@ -167,7 +167,7 @@ class SIQAEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the SIQA dataset."""
-        print(f"\nSIQA Evaluation Setup (Generative Mode):")
+        print("\nSIQA Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Evaluation split: {self.config.eval_split}")
         print(f"  Thinking mode: {self.config.thinking_mode}")
diff --git a/environments/eval_environments/winogrande_eval.py b/environments/eval_environments/winogrande_eval.py
index 81823f6d..87bcc3fd 100644
--- a/environments/eval_environments/winogrande_eval.py
+++ b/environments/eval_environments/winogrande_eval.py
@@ -172,7 +172,7 @@ class WinoGrandeEvalEnv(BaseEnv):
 
     async def setup(self):
         """Load the WinoGrande dataset."""
-        print(f"\nWinoGrande Evaluation Setup (Generative Mode):")
+        print("\nWinoGrande Evaluation Setup (Generative Mode):")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Subset: {self.config.subset}")
         print(f"  Evaluation split: {self.config.eval_split}")