diff --git a/environments/eval_environments/agieval_eval.py b/environments/eval_environments/agieval_eval.py index fac22d54..75638175 100644 --- a/environments/eval_environments/agieval_eval.py +++ b/environments/eval_environments/agieval_eval.py @@ -36,16 +36,11 @@ import time from string import ascii_uppercase from typing import Dict, List, Optional, Tuple -import wandb from datasets import load_dataset from eval_helpers import ( - build_mcqa_fallback_patterns, create_system_content, extract_letter_from_answer_tag, - extract_thinking_content, get_default_thinking_prompt, - save_eval_results, - validate_thinking_format, ) from pydantic import Field from tqdm.asyncio import tqdm_asyncio @@ -333,12 +328,13 @@ class AGIEvalEnv(BaseEnv): async def setup(self) -> None: """Load the AGIEval dataset and prepare for evaluation.""" - print(f"\nAGIEval Evaluation Setup (Generative Mode):") + print("\nAGIEval Evaluation Setup (Generative Mode):") print(f" Max tokens for reasoning: {self.config.eval_max_tokens}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") if self.config.thinking_mode: - print(f" Thinking prompt: {self._get_thinking_prompt()[:100]}...") + prompt_preview = self._get_thinking_prompt()[:100] + print(f" Thinking prompt: {prompt_preview}...") # Determine which subsets to use if self.config.subsets: @@ -379,7 +375,7 @@ class AGIEvalEnv(BaseEnv): print(f"\n Total evaluation items: {len(self.eval_data)}") # Print subset distribution - print(f"\n Subset distribution:") + print("\n Subset distribution:") for subset, count in sorted(subset_counts.items()): print(f" {subset}: {count} questions") @@ -584,7 +580,7 @@ class AGIEvalEnv(BaseEnv): break elif attempt < self.config.max_retries - 1: if self.config.full_debug: - print(f" Response too short, retrying...") + print(" Response too short, retrying...") await asyncio.sleep(self.config.retry_delay) except Exception as e: @@ -594,15 +590,15 @@ class AGIEvalEnv(BaseEnv): ) if hasattr(e, "response"): try: - print( - f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" - ) - except: + resp_text = e.response.text[:500] if hasattr(e.response, "text") else str(e.response) + print(f" Response: {resp_text}") + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) else: - print(f" Failed after {self.config.max_retries} attempts") + retries = self.config.max_retries + print(f" Failed after {retries} attempts") return {"is_correct": None, "sample": None} if not model_response: @@ -669,9 +665,9 @@ class AGIEvalEnv(BaseEnv): """Run AGIEval evaluation.""" start_time = time.time() - print(f"\n{'='*60}") - print(f"Starting AGIEval Evaluation (Generative/Reasoning Mode)") - print(f"{'='*60}") + print("\n" + "=" * 60) + print("Starting AGIEval Evaluation (Generative/Reasoning Mode)") + print("=" * 60) print(f" Total questions: {len(self.all_eval_items)}") print(f" Max tokens (for reasoning): {self.config.eval_max_tokens}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -782,9 +778,9 @@ class AGIEvalEnv(BaseEnv): self.eval_metrics = [(k, v) for k, v in eval_metrics.items()] # Print summary - print(f"\n{'='*60}") - print(f"AGIEval Evaluation Results") - print(f"{'='*60}") + print("\n" + "=" * 60) + print("AGIEval Evaluation Results") + print("=" * 60) print( f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})" ) @@ -794,7 +790,7 @@ class AGIEvalEnv(BaseEnv): print(f"Format Compliance: {format_compliance_rate:.4f}") print(f"Thinking Utilization: {thinking_utilization}/{total_count}") - print(f"\nSubset Breakdown:") + print("\nSubset Breakdown:") for subset, stats in sorted(subset_results.items()): if stats["total"] > 0: subset_acc = stats["correct"] / stats["total"] @@ -802,7 +798,7 @@ class AGIEvalEnv(BaseEnv): f" {subset}: {subset_acc:.4f} ({stats['correct']}/{stats['total']})" ) - print(f"\nExtraction Method Statistics:") + print("\nExtraction Method Statistics:") for method, stats in sorted( extraction_methods.items(), key=lambda x: -x[1]["count"] ): @@ -810,7 +806,7 @@ class AGIEvalEnv(BaseEnv): method_acc = stats["correct"] / stats["count"] print(f" {method}: {stats['count']} uses, {method_acc:.4f} accuracy") - print(f"{'='*60}\n") + print("=" * 60 + "\n") # Log evaluation results try: diff --git a/environments/eval_environments/aime_eval.py b/environments/eval_environments/aime_eval.py index abb08a2e..53026499 100644 --- a/environments/eval_environments/aime_eval.py +++ b/environments/eval_environments/aime_eval.py @@ -23,12 +23,8 @@ Supports thinking mode with tags for extended reasoning. """ import asyncio -import os import random -import re -import time -from concurrent.futures import ProcessPoolExecutor -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional import wandb from datasets import load_dataset @@ -50,7 +46,6 @@ from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, - EvalHandlingEnum, ) # Available AIME years @@ -62,7 +57,13 @@ AIME_DATASETS = { # Prompt template following lighteval's AIME structure # Important: Uses the "I hope it is correct" format for math-verify -AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering. +AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly. + +The last line of your response should be of the following format: +'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) +where ANSWER is just the final number or expression that solves the problem. + +Think step by step before answering. Note: AIME answers are always integers from 0 to 999. @@ -172,7 +173,7 @@ class AIMEEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nAIME Evaluation Setup (Generative Mode):") + print("\nAIME Evaluation Setup (Generative Mode):") print(f" Years: {self.config.years}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -501,12 +502,12 @@ class AIMEEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") - print(f"\n Per-Year Breakdown:") + print("\n Per-Year Breakdown:") for year, data in sorted(year_metrics.items()): print( f" AIME {year}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" ) - print(f"\n Verification Methods:") + print("\n Verification Methods:") for method, count in sorted(method_counts.items(), key=lambda x: -x[1]): print(f" {method}: {count} ({count/total:.1%})") print(f"{'='*60}\n") diff --git a/environments/eval_environments/aimo_eval.py b/environments/eval_environments/aimo_eval.py index 10385c82..9ba38eef 100644 --- a/environments/eval_environments/aimo_eval.py +++ b/environments/eval_environments/aimo_eval.py @@ -17,12 +17,8 @@ Supports thinking mode with tags for extended reasoning. """ import asyncio -import os import random -import re -import time -from concurrent.futures import ProcessPoolExecutor -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional import wandb from datasets import load_dataset @@ -45,7 +41,6 @@ from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, - EvalHandlingEnum, ) # Prompt template - AIMO doesn't have a specific template in lighteval @@ -161,7 +156,7 @@ class AIMOEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nAIMO Evaluation Setup (Generative Mode):") + print("\nAIMO Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -401,7 +396,7 @@ class AIMOEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") - print(f"\n Verification Methods:") + print("\n Verification Methods:") for method, count in sorted(method_counts.items(), key=lambda x: -x[1]): print(f" {method}: {count} ({count/total:.1%})") print(f"{'='*60}\n") diff --git a/environments/eval_environments/arc_agi_eval.py b/environments/eval_environments/arc_agi_eval.py index 3da7bf90..bd522a3f 100644 --- a/environments/eval_environments/arc_agi_eval.py +++ b/environments/eval_environments/arc_agi_eval.py @@ -27,17 +27,14 @@ Answer must be provided in tags as a JSON 2D array. import ast import asyncio import json -import os import re -import time -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import wandb from datasets import load_dataset from eval_helpers import ( ANSWER_TAG_PATTERN, create_system_content, - extract_thinking_content, get_default_thinking_prompt, save_eval_results, validate_thinking_format, @@ -49,7 +46,6 @@ from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, - EvalHandlingEnum, ) @@ -168,7 +164,7 @@ class ARCAGIEvalEnv(BaseEnv): async def setup(self): """Load the ARC-AGI 2 dataset.""" - print(f"\nARC-AGI 2 Evaluation Setup (Generative Mode):") + print("\nARC-AGI 2 Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -216,7 +212,8 @@ class ARCAGIEvalEnv(BaseEnv): gold_output = item["question"][0]["output"] # Build the prompt - query = """You are solving an ARC-AGI puzzle. You will be shown training examples where an input grid is transformed into an output grid following a specific pattern or rule. + query = """You are solving an ARC-AGI puzzle. You will be shown training examples +where an input grid is transformed into an output grid following a specific pattern or rule. Your task is to: 1. Analyze the training examples to understand the transformation pattern @@ -315,7 +312,7 @@ Example format: grid = ast.literal_eval(match) if self._is_valid_grid(grid): return grid - except: + except Exception: continue # Strategy 4: Extract rows one per line @@ -328,7 +325,7 @@ Example format: grid = [json.loads(row) for row in rows] if self._is_valid_grid(grid): return grid - except: + except Exception: pass return None diff --git a/environments/eval_environments/arc_eval.py b/environments/eval_environments/arc_eval.py index 4dbeddee..43f02308 100644 --- a/environments/eval_environments/arc_eval.py +++ b/environments/eval_environments/arc_eval.py @@ -21,11 +21,8 @@ Supports optional thinking mode with tags. """ import asyncio -import os -import re -import time from string import ascii_uppercase -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import wandb from datasets import load_dataset @@ -33,7 +30,6 @@ from eval_helpers import ( build_mcqa_fallback_patterns, create_system_content, extract_letter_from_answer_tag, - extract_thinking_content, get_default_thinking_prompt, save_eval_results, validate_thinking_format, @@ -45,7 +41,6 @@ from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, - EvalHandlingEnum, ) @@ -173,7 +168,7 @@ class ARCEvalEnv(BaseEnv): async def setup(self): """Load the ARC dataset.""" - print(f"\nARC Evaluation Setup (Generative Mode):") + print("\nARC Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Evaluation split: {self.config.eval_split}") diff --git a/environments/eval_environments/arena_hard_environment.py b/environments/eval_environments/arena_hard_environment.py index 0c14ce1d..742288e4 100644 --- a/environments/eval_environments/arena_hard_environment.py +++ b/environments/eval_environments/arena_hard_environment.py @@ -9,7 +9,6 @@ from datasets import load_dataset from eval_helpers import ( create_system_content, get_default_thinking_prompt, - save_eval_results, ) from pydantic import Field from tenacity import retry, stop_after_attempt, wait_random_exponential diff --git a/environments/eval_environments/bbh_eval.py b/environments/eval_environments/bbh_eval.py index 66b3b235..81aba681 100644 --- a/environments/eval_environments/bbh_eval.py +++ b/environments/eval_environments/bbh_eval.py @@ -20,12 +20,9 @@ snarks, sports_understanding, temporal_sequences, tracking_shuffled_objects (3/5 """ import asyncio -import os import random -import re -import time from string import ascii_uppercase -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Tuple import wandb from datasets import load_dataset @@ -33,7 +30,6 @@ from eval_helpers import ( build_mcqa_fallback_patterns, create_system_content, extract_letter_from_answer_tag, - extract_thinking_content, get_default_thinking_prompt, save_eval_results, validate_thinking_format, @@ -45,7 +41,6 @@ from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, - EvalHandlingEnum, ) # All available BBH subsets @@ -86,8 +81,7 @@ def format_bbh_prompt(item: Dict) -> Tuple[str, List[str], int]: input_prefix = item.get("example_input_prefix", "\nQuestion: ") input_text = item.get("input", "") choice_prefix = item.get("choice_prefix", "\n Choices: ") - output_prefix = item.get("example_output_prefix", "\nAnswer: ") - + # Note: output_prefix from item.get("example_output_prefix") is not used in generative mode choices = item.get("choices", []) target_idx = item.get("target_idx", 0) @@ -222,7 +216,7 @@ class BBHEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nBBH Evaluation Setup (Generative Mode):") + print("\nBBH Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Evaluation split: {self.config.eval_split}") @@ -553,7 +547,7 @@ class BBHEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") - print(f"\n Per-Subset Breakdown:") + print("\n Per-Subset Breakdown:") for subset, data in sorted( subset_metrics.items(), key=lambda x: -x[1]["accuracy"] ): diff --git a/environments/eval_environments/boolq_eval.py b/environments/eval_environments/boolq_eval.py index 4f9868ef..e0276841 100644 --- a/environments/eval_environments/boolq_eval.py +++ b/environments/eval_environments/boolq_eval.py @@ -20,11 +20,7 @@ Supports optional thinking mode with tags. """ import asyncio -import os -import re -import time -from string import ascii_uppercase -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import wandb from datasets import load_dataset @@ -33,7 +29,6 @@ from eval_helpers import ( build_mcqa_fallback_patterns, create_system_content, extract_letter_from_answer_tag, - extract_thinking_content, get_default_thinking_prompt, save_eval_results, validate_thinking_format, @@ -45,7 +40,6 @@ from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, - EvalHandlingEnum, ) @@ -170,7 +164,7 @@ class BoolQEvalEnv(BaseEnv): async def setup(self): """Load the BoolQ dataset.""" - print(f"\nBoolQ Evaluation Setup (Generative Mode):") + print("\nBoolQ Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") diff --git a/environments/eval_environments/drop_eval.py b/environments/eval_environments/drop_eval.py index 348e8711..994471d0 100644 --- a/environments/eval_environments/drop_eval.py +++ b/environments/eval_environments/drop_eval.py @@ -23,17 +23,13 @@ import asyncio import os import re import time -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple -import wandb from datasets import load_dataset from eval_helpers import ( create_system_content, extract_freeform_from_answer_tag, - extract_thinking_content, get_default_thinking_prompt, - save_eval_results, - validate_thinking_format, ) from pydantic import Field from tqdm.asyncio import tqdm_asyncio @@ -317,7 +313,7 @@ Question: {question}""" async def setup(self) -> None: """Load the DROP dataset and prepare for evaluation.""" - print(f"\nDROP Evaluation Setup:") + print("\nDROP Evaluation Setup:") print(f" Dataset: {self.config.dataset_name}") print(f" Max tokens: {self.config.eval_max_tokens}") print(f" Evaluation split: {self.config.eval_split}") @@ -539,7 +535,7 @@ Question: {question}""" print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -615,7 +611,7 @@ Question: {question}""" start_time = time.time() print(f"\n{'='*60}") - print(f"Starting DROP Evaluation") + print("Starting DROP Evaluation") print(f"{'='*60}") print(f" Total questions: {len(self.all_eval_items)}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -694,7 +690,7 @@ Question: {question}""" # Print summary print(f"\n{'='*60}") - print(f"DROP Evaluation Results") + print("DROP Evaluation Results") print(f"{'='*60}") print(f"Exact Match Accuracy: {accuracy:.4f} ({correct_count}/{total_count})") print(f"Average F1 Score: {avg_f1:.4f}") diff --git a/environments/eval_environments/eval_helpers.py b/environments/eval_environments/eval_helpers.py index 8f11800d..cb1318a0 100644 --- a/environments/eval_environments/eval_helpers.py +++ b/environments/eval_environments/eval_helpers.py @@ -19,7 +19,7 @@ import os import re from concurrent.futures import ProcessPoolExecutor from string import ascii_uppercase -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple # Try to import math_verify libraries (optional dependency for math evals) try: diff --git a/environments/eval_environments/gpqa_eval.py b/environments/eval_environments/gpqa_eval.py index b69e956b..fabc2d3f 100644 --- a/environments/eval_environments/gpqa_eval.py +++ b/environments/eval_environments/gpqa_eval.py @@ -29,16 +29,11 @@ import time from string import ascii_uppercase from typing import Dict, List, Optional, Tuple -import wandb from datasets import load_dataset from eval_helpers import ( - build_mcqa_fallback_patterns, create_system_content, extract_letter_from_answer_tag, - extract_thinking_content, get_default_thinking_prompt, - save_eval_results, - validate_thinking_format, ) from pydantic import Field from tqdm.asyncio import tqdm_asyncio @@ -289,7 +284,7 @@ class GPQAEvalEnv(BaseEnv): async def setup(self) -> None: """Load the GPQA dataset and prepare for evaluation.""" - print(f"\nGPQA Evaluation Setup (Generative Mode):") + print("\nGPQA Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Max tokens for reasoning: {self.config.eval_max_tokens}") @@ -507,7 +502,7 @@ class GPQAEvalEnv(BaseEnv): break elif attempt < self.config.max_retries - 1: if self.config.full_debug: - print(f" Response too short, retrying...") + print(" Response too short, retrying...") await asyncio.sleep(self.config.retry_delay) except Exception as e: @@ -520,7 +515,7 @@ class GPQAEvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -592,7 +587,7 @@ class GPQAEvalEnv(BaseEnv): start_time = time.time() print(f"\n{'='*60}") - print(f"Starting GPQA Evaluation (Generative/Reasoning Mode)") + print("Starting GPQA Evaluation (Generative/Reasoning Mode)") print(f"{'='*60}") print(f" Subset: {self.config.subset}") print(f" Total questions: {len(self.all_eval_items)}") @@ -708,7 +703,7 @@ class GPQAEvalEnv(BaseEnv): print(f"Format Compliance: {format_compliance_rate:.4f}") print(f"Thinking Utilization: {thinking_utilization}/{total_count}") - print(f"\nSubdomain Breakdown:") + print("\nSubdomain Breakdown:") for subdomain, stats in sorted(subdomain_results.items()): if stats["total"] > 0: subdom_acc = stats["correct"] / stats["total"] @@ -716,7 +711,7 @@ class GPQAEvalEnv(BaseEnv): f" {subdomain}: {subdom_acc:.4f} ({stats['correct']}/{stats['total']})" ) - print(f"\nExtraction Method Statistics:") + print("\nExtraction Method Statistics:") for method, stats in sorted( extraction_methods.items(), key=lambda x: -x[1]["count"] ): diff --git a/environments/eval_environments/gsm8k_eval.py b/environments/eval_environments/gsm8k_eval.py index d0d15800..59dde1b2 100644 --- a/environments/eval_environments/gsm8k_eval.py +++ b/environments/eval_environments/gsm8k_eval.py @@ -161,7 +161,7 @@ class GSM8KEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nGSM8K Evaluation Setup (Generative Mode):") + print("\nGSM8K Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Evaluation split: {self.config.eval_split}") @@ -416,7 +416,7 @@ class GSM8KEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") - print(f"\n Verification Methods:") + print("\n Verification Methods:") for method, count in sorted(method_counts.items(), key=lambda x: -x[1]): print(f" {method}: {count} ({count/total:.1%})") print(f"{'='*60}\n") diff --git a/environments/eval_environments/hellaswag_eval.py b/environments/eval_environments/hellaswag_eval.py index bfc3e2c6..1c6fae79 100644 --- a/environments/eval_environments/hellaswag_eval.py +++ b/environments/eval_environments/hellaswag_eval.py @@ -167,7 +167,7 @@ class HellaSwagEvalEnv(BaseEnv): async def setup(self): """Load the HellaSwag dataset.""" - print(f"\nHellaSwag Evaluation Setup (Generative Mode):") + print("\nHellaSwag Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") diff --git a/environments/eval_environments/hle_eval.py b/environments/eval_environments/hle_eval.py index 963c4767..dc6becdb 100644 --- a/environments/eval_environments/hle_eval.py +++ b/environments/eval_environments/hle_eval.py @@ -152,7 +152,7 @@ class HLEEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nHLE Evaluation Setup (Generative Mode):") + print("\nHLE Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -525,7 +525,7 @@ class HLEEvalEnv(BaseEnv): if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") if category_metrics: - print(f"\n Per-Category Breakdown:") + print("\n Per-Category Breakdown:") for cat, data in sorted( category_metrics.items(), key=lambda x: -x[1]["accuracy"] ): diff --git a/environments/eval_environments/ifeval_eval.py b/environments/eval_environments/ifeval_eval.py index 2061d914..86c703ca 100644 --- a/environments/eval_environments/ifeval_eval.py +++ b/environments/eval_environments/ifeval_eval.py @@ -228,7 +228,7 @@ class IFEvalEnv(BaseEnv): async def setup(self) -> None: """Load the IFEval dataset and prepare for evaluation.""" - print(f"\nIFEval Evaluation Setup:") + print("\nIFEval Evaluation Setup:") print(f" Dataset: {self.config.dataset_name}") print(f" Max tokens: {self.config.eval_max_tokens}") print(f" Evaluation split: {self.config.eval_split}") @@ -478,7 +478,7 @@ class IFEvalEnv(BaseEnv): break elif attempt < self.config.max_retries - 1: if self.config.full_debug: - print(f" Response too short, retrying...") + print(" Response too short, retrying...") await asyncio.sleep(self.config.retry_delay) except Exception as e: @@ -490,7 +490,7 @@ class IFEvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -569,7 +569,7 @@ class IFEvalEnv(BaseEnv): start_time = time.time() print(f"\n{'='*60}") - print(f"Starting IFEval Evaluation (Instruction Following)") + print("Starting IFEval Evaluation (Instruction Following)") print(f"{'='*60}") print(f" Total prompts: {len(self.all_eval_items)}") print(f" Max tokens: {self.config.eval_max_tokens}") @@ -682,7 +682,7 @@ class IFEvalEnv(BaseEnv): # Print summary print(f"\n{'='*60}") - print(f"IFEval Evaluation Results") + print("IFEval Evaluation Results") print(f"{'='*60}") print( f"Prompt-Level Strict Accuracy: {prompt_strict_acc:.4f} ({prompt_strict_count}/{total_count})" diff --git a/environments/eval_environments/judgemark_eval.py b/environments/eval_environments/judgemark_eval.py index 7b70ce4b..37728912 100644 --- a/environments/eval_environments/judgemark_eval.py +++ b/environments/eval_environments/judgemark_eval.py @@ -351,7 +351,7 @@ class JudgeMarkEvalEnv(BaseEnv): async def setup(self): """Load JudgeMark data files.""" - print(f"\nLoading JudgeMark v2 data...") + print("\nLoading JudgeMark v2 data...") # Determine data directory data_dir = JUDGEMARK_DATA_DIR @@ -701,7 +701,7 @@ class JudgeMarkEvalEnv(BaseEnv): f" Models with reference: {calibrated_cross_stats['num_models_with_reference']}" ) - print(f"\n Per-model averages (calibrated):") + print("\n Per-model averages (calibrated):") sorted_models = sorted( model_stats.items(), key=lambda x: x[1]["mean_calibrated"], reverse=True ) diff --git a/environments/eval_environments/math500_eval.py b/environments/eval_environments/math500_eval.py index 1d8ad7a1..c59b58ab 100644 --- a/environments/eval_environments/math500_eval.py +++ b/environments/eval_environments/math500_eval.py @@ -172,7 +172,7 @@ class MATH500EvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nMATH-500 Evaluation Setup (Generative Mode):") + print("\nMATH-500 Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -470,7 +470,7 @@ class MATH500EvalEnv(BaseEnv): if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") if subject_metrics and len(subject_metrics) > 1: - print(f"\n Per-Subject Breakdown:") + print("\n Per-Subject Breakdown:") for subject, data in sorted( subject_metrics.items(), key=lambda x: -x[1]["accuracy"] ): @@ -478,7 +478,7 @@ class MATH500EvalEnv(BaseEnv): f" {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" ) if level_metrics and len(level_metrics) > 1: - print(f"\n Per-Level Breakdown:") + print("\n Per-Level Breakdown:") for level, data in sorted(level_metrics.items()): print( f" Level {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" diff --git a/environments/eval_environments/math_eval.py b/environments/eval_environments/math_eval.py index d010822b..e516e779 100644 --- a/environments/eval_environments/math_eval.py +++ b/environments/eval_environments/math_eval.py @@ -178,7 +178,7 @@ class MATHEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nMATH Evaluation Setup (Generative Mode):") + print("\nMATH Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subsets: {self.config.subsets}") print(f" Evaluation split: {self.config.eval_split}") @@ -484,7 +484,7 @@ class MATHEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") - print(f"\n Per-Subset Breakdown:") + print("\n Per-Subset Breakdown:") for subset, data in sorted( subset_metrics.items(), key=lambda x: -x[1]["accuracy"] ): @@ -492,7 +492,7 @@ class MATHEvalEnv(BaseEnv): f" {subset}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" ) if level_metrics and len(level_metrics) > 1: - print(f"\n Per-Level Breakdown:") + print("\n Per-Level Breakdown:") for level, data in sorted(level_metrics.items()): print( f" {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" diff --git a/environments/eval_environments/mixeval_eval.py b/environments/eval_environments/mixeval_eval.py index 0a7be95f..a75d746f 100644 --- a/environments/eval_environments/mixeval_eval.py +++ b/environments/eval_environments/mixeval_eval.py @@ -343,7 +343,7 @@ class MixEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nMixEval Evaluation Setup (with LLM Judge):") + print("\nMixEval Evaluation Setup (with LLM Judge):") print(f" Dataset: {self.config.dataset_name}") print(f" Difficulty: {self.config.difficulty}") print(f" Question types: {self.config.question_types}") @@ -737,7 +737,7 @@ class MixEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") print(f" Thinking Utilization: {has_thinking / total:.2%}") print(f" Judge Error Rate: {self.judge_error_count / total:.2%}") - print(f"\n Per-Benchmark Breakdown:") + print("\n Per-Benchmark Breakdown:") for bench, data in sorted( benchmark_metrics.items(), key=lambda x: -x[1]["avg_score"] ): diff --git a/environments/eval_environments/mmlu_eval.py b/environments/eval_environments/mmlu_eval.py index dcf9f338..92b05a78 100644 --- a/environments/eval_environments/mmlu_eval.py +++ b/environments/eval_environments/mmlu_eval.py @@ -464,7 +464,7 @@ class MMLUEvalEnv(BaseEnv): if not self.subjects: raise ValueError("No valid MMLU subjects specified for evaluation.") - print(f"\nMMLU Evaluation Setup (Generative Mode):") + print("\nMMLU Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subjects: {len(self.subjects)} subjects") print(f" Few-shot examples: {self.config.num_few_shot}") @@ -821,7 +821,7 @@ class MMLUEvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -907,7 +907,7 @@ class MMLUEvalEnv(BaseEnv): start_time = time.time() print(f"\n{'='*60}") - print(f"Starting MMLU Evaluation (Generative/Reasoning Mode)") + print("Starting MMLU Evaluation (Generative/Reasoning Mode)") print(f"{'='*60}") print(f" Subjects: {len(self.subjects)}") print(f" Total questions: {len(self.all_eval_items)}") @@ -1046,7 +1046,7 @@ class MMLUEvalEnv(BaseEnv): # Print summary print(f"\n{'='*60}") - print(f"MMLU Evaluation Results") + print("MMLU Evaluation Results") print(f"{'='*60}") print( f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})" @@ -1057,7 +1057,7 @@ class MMLUEvalEnv(BaseEnv): print(f"Format Compliance: {format_compliance_rate:.4f}") print(f"Thinking Utilization: {thinking_utilization}/{total_count}") - print(f"\nCategory Breakdown:") + print("\nCategory Breakdown:") for category, stats in category_results.items(): if stats["total"] > 0: cat_acc = stats["correct"] / stats["total"] @@ -1065,7 +1065,7 @@ class MMLUEvalEnv(BaseEnv): f" {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})" ) - print(f"\nExtraction Method Statistics:") + print("\nExtraction Method Statistics:") for method, stats in sorted( extraction_methods.items(), key=lambda x: -x[1]["count"] ): diff --git a/environments/eval_environments/mmlu_pro_eval.py b/environments/eval_environments/mmlu_pro_eval.py index 1eae91a8..e4315ab2 100644 --- a/environments/eval_environments/mmlu_pro_eval.py +++ b/environments/eval_environments/mmlu_pro_eval.py @@ -307,7 +307,7 @@ class MMLUProEvalEnv(BaseEnv): async def setup(self) -> None: """Load the MMLU-Pro dataset and prepare for evaluation.""" - print(f"\nMMLU-Pro Evaluation Setup (Generative Mode):") + print("\nMMLU-Pro Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Few-shot examples: {self.config.num_few_shot}") print(f" Max tokens for reasoning: {self.config.eval_max_tokens}") @@ -358,7 +358,7 @@ class MMLUProEvalEnv(BaseEnv): cat = item.get("category", "unknown") category_counts[cat] = category_counts.get(cat, 0) + 1 - print(f"\n Category distribution:") + print("\n Category distribution:") for cat, count in sorted(category_counts.items()): print(f" {cat}: {count} questions") @@ -586,7 +586,7 @@ class MMLUProEvalEnv(BaseEnv): break elif attempt < self.config.max_retries - 1: if self.config.full_debug: - print(f" Response too short, retrying...") + print(" Response too short, retrying...") await asyncio.sleep(self.config.retry_delay) except Exception as e: @@ -599,7 +599,7 @@ class MMLUProEvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -673,7 +673,7 @@ class MMLUProEvalEnv(BaseEnv): start_time = time.time() print(f"\n{'='*60}") - print(f"Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)") + print("Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)") print(f"{'='*60}") print(f" Total questions: {len(self.all_eval_items)}") print(f" Few-shot examples: {self.config.num_few_shot}") @@ -788,7 +788,7 @@ class MMLUProEvalEnv(BaseEnv): # Print summary print(f"\n{'='*60}") - print(f"MMLU-Pro Evaluation Results") + print("MMLU-Pro Evaluation Results") print(f"{'='*60}") print( f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})" @@ -799,7 +799,7 @@ class MMLUProEvalEnv(BaseEnv): print(f"Format Compliance: {format_compliance_rate:.4f}") print(f"Thinking Utilization: {thinking_utilization}/{total_count}") - print(f"\nCategory Breakdown:") + print("\nCategory Breakdown:") for category, stats in sorted(category_results.items()): if stats["total"] > 0: cat_acc = stats["correct"] / stats["total"] @@ -807,7 +807,7 @@ class MMLUProEvalEnv(BaseEnv): f" {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})" ) - print(f"\nExtraction Method Statistics:") + print("\nExtraction Method Statistics:") for method, stats in sorted( extraction_methods.items(), key=lambda x: -x[1]["count"] ): diff --git a/environments/eval_environments/musr_eval.py b/environments/eval_environments/musr_eval.py index 02aa0656..13251fb8 100644 --- a/environments/eval_environments/musr_eval.py +++ b/environments/eval_environments/musr_eval.py @@ -282,7 +282,7 @@ class MuSREvalEnv(BaseEnv): if isinstance(choices_raw, str): try: choices = ast.literal_eval(choices_raw) - except: + except Exception: choices = [] else: choices = choices_raw @@ -301,7 +301,7 @@ class MuSREvalEnv(BaseEnv): async def setup(self) -> None: """Load the MuSR dataset and prepare for evaluation.""" - print(f"\nMuSR Evaluation Setup:") + print("\nMuSR Evaluation Setup:") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Max tokens: {self.config.eval_max_tokens}") @@ -495,7 +495,7 @@ class MuSREvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) diff --git a/environments/eval_environments/obqa_eval.py b/environments/eval_environments/obqa_eval.py index 84f89bcc..57e3b684 100644 --- a/environments/eval_environments/obqa_eval.py +++ b/environments/eval_environments/obqa_eval.py @@ -291,7 +291,7 @@ class OBQAEvalEnv(BaseEnv): async def setup(self) -> None: """Load the OpenBookQA dataset and prepare for evaluation.""" - print(f"\nOpenBookQA Evaluation Setup:") + print("\nOpenBookQA Evaluation Setup:") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Eval split: {self.config.eval_split}") @@ -481,7 +481,7 @@ class OBQAEvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -560,7 +560,7 @@ class OBQAEvalEnv(BaseEnv): start_time = time.time() print(f"\n{'='*60}") - print(f"Starting OpenBookQA Evaluation") + print("Starting OpenBookQA Evaluation") print(f"{'='*60}") print(f" Total questions: {len(self.all_eval_items)}") print(f" Thinking mode: {self.config.thinking_mode}") @@ -641,7 +641,7 @@ class OBQAEvalEnv(BaseEnv): # Print summary print(f"\n{'='*60}") - print(f"OpenBookQA Evaluation Results") + print("OpenBookQA Evaluation Results") print(f"{'='*60}") print(f"Accuracy: {accuracy:.4f} ({correct_count}/{total_count})") print(f"Answer Extraction Rate: {extraction_rate:.4f}") diff --git a/environments/eval_environments/olympiadbench_eval.py b/environments/eval_environments/olympiadbench_eval.py index 4b3cde69..1417001c 100644 --- a/environments/eval_environments/olympiadbench_eval.py +++ b/environments/eval_environments/olympiadbench_eval.py @@ -199,7 +199,7 @@ class OlympiadBenchEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nOlympiadBench Evaluation Setup (Generative Mode):") + print("\nOlympiadBench Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Evaluation split: {self.config.eval_split}") @@ -648,7 +648,7 @@ class OlympiadBenchEvalEnv(BaseEnv): if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") if subject_metrics: - print(f"\n Per-Subject Breakdown:") + print("\n Per-Subject Breakdown:") for subject, data in subject_metrics.items(): print( f" {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" diff --git a/environments/eval_environments/piqa_eval.py b/environments/eval_environments/piqa_eval.py index 75d69f02..2fce1a6e 100644 --- a/environments/eval_environments/piqa_eval.py +++ b/environments/eval_environments/piqa_eval.py @@ -167,7 +167,7 @@ class PIQAEvalEnv(BaseEnv): async def setup(self): """Load the PIQA dataset.""" - print(f"\nPIQA Evaluation Setup (Generative Mode):") + print("\nPIQA Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") diff --git a/environments/eval_environments/pubmedqa_eval.py b/environments/eval_environments/pubmedqa_eval.py index ca9f2e37..6ce688e7 100644 --- a/environments/eval_environments/pubmedqa_eval.py +++ b/environments/eval_environments/pubmedqa_eval.py @@ -154,7 +154,7 @@ class PubMedQAEvalEnv(BaseEnv): if not self._dataset_loaded: await self._load_dataset() - print(f"\nPubMedQA Evaluation Setup (Generative Mode):") + print("\nPubMedQA Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Evaluation split: {self.config.eval_split}") @@ -481,7 +481,7 @@ class PubMedQAEvalEnv(BaseEnv): print(f" Format Compliance: {format_valid / total:.2%}") if self.config.thinking_mode: print(f" Thinking Utilization: {has_thinking / total:.2%}") - print(f"\n Per-Answer Breakdown:") + print("\n Per-Answer Breakdown:") for answer, data in answer_metrics.items(): print( f" {answer}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})" diff --git a/environments/eval_environments/simpleqa_eval.py b/environments/eval_environments/simpleqa_eval.py index eb1121ae..f15f28ef 100644 --- a/environments/eval_environments/simpleqa_eval.py +++ b/environments/eval_environments/simpleqa_eval.py @@ -455,7 +455,7 @@ class SimpleQAEvalEnv(BaseEnv): else "String Matching (Nous)" ) - print(f"\nSimpleQA Evaluation Setup:") + print("\nSimpleQA Evaluation Setup:") print(f" Dataset: {self.config.dataset_name}") print(f" Scoring mode: {scoring_mode}") print(f" Max tokens for answer: {self.config.eval_max_tokens}") @@ -627,7 +627,7 @@ class SimpleQAEvalEnv(BaseEnv): break elif attempt < self.config.max_retries - 1: if self.config.full_debug: - print(f" Response too short, retrying...") + print(" Response too short, retrying...") await asyncio.sleep(self.config.retry_delay) except Exception as e: @@ -639,7 +639,7 @@ class SimpleQAEvalEnv(BaseEnv): print( f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}" ) - except: + except Exception: pass if attempt < self.config.max_retries - 1: await asyncio.sleep(self.config.retry_delay) @@ -808,7 +808,7 @@ class SimpleQAEvalEnv(BaseEnv): ) print(f"\n{'='*60}") - print(f"Starting SimpleQA Evaluation") + print("Starting SimpleQA Evaluation") print(f"{'='*60}") print(f" Total questions: {len(self.all_eval_items)}") print(f" Scoring mode: {scoring_mode}") @@ -983,7 +983,7 @@ class SimpleQAEvalEnv(BaseEnv): f"Accuracy (if attempted): {eval_metrics['eval/accuracy_if_attempted']:.4f}" ) print(f"Not Attempted Rate: {eval_metrics['eval/not_attempted_rate']:.4f}") - print(f"\nGrade Distribution:") + print("\nGrade Distribution:") print(f" CORRECT: {correct_count} ({100*correct_count/total_count:.1f}%)") print( f" INCORRECT: {incorrect_count} ({100*incorrect_count/total_count:.1f}%)" @@ -1012,7 +1012,7 @@ class SimpleQAEvalEnv(BaseEnv): print(f"Thinking Utilization: {thinking_utilization}/{total_count}") if len(sorted_topics) > 0: - print(f"\nTop Topics (by count):") + print("\nTop Topics (by count):") for topic, stats in sorted_topics[:10]: if stats["total"] > 0: topic_acc = stats["correct"] / stats["total"] diff --git a/environments/eval_environments/siqa_eval.py b/environments/eval_environments/siqa_eval.py index 8a56fcb2..6c027ff6 100644 --- a/environments/eval_environments/siqa_eval.py +++ b/environments/eval_environments/siqa_eval.py @@ -167,7 +167,7 @@ class SIQAEvalEnv(BaseEnv): async def setup(self): """Load the SIQA dataset.""" - print(f"\nSIQA Evaluation Setup (Generative Mode):") + print("\nSIQA Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Evaluation split: {self.config.eval_split}") print(f" Thinking mode: {self.config.thinking_mode}") diff --git a/environments/eval_environments/winogrande_eval.py b/environments/eval_environments/winogrande_eval.py index 81823f6d..87bcc3fd 100644 --- a/environments/eval_environments/winogrande_eval.py +++ b/environments/eval_environments/winogrande_eval.py @@ -172,7 +172,7 @@ class WinoGrandeEvalEnv(BaseEnv): async def setup(self): """Load the WinoGrande dataset.""" - print(f"\nWinoGrande Evaluation Setup (Generative Mode):") + print("\nWinoGrande Evaluation Setup (Generative Mode):") print(f" Dataset: {self.config.dataset_name}") print(f" Subset: {self.config.subset}") print(f" Evaluation split: {self.config.eval_split}")