more linter nonsense

This commit is contained in:
teknium 2025-12-24 11:04:33 +00:00
parent f18d46549d
commit abdda3978a
29 changed files with 113 additions and 151 deletions

View file

@ -36,16 +36,11 @@ import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@ -333,12 +328,13 @@ class AGIEvalEnv(BaseEnv):
async def setup(self) -> None:
"""Load the AGIEval dataset and prepare for evaluation."""
print(f"\nAGIEval Evaluation Setup (Generative Mode):")
print("\nAGIEval Evaluation Setup (Generative Mode):")
print(f" Max tokens for reasoning: {self.config.eval_max_tokens}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
if self.config.thinking_mode:
print(f" Thinking prompt: {self._get_thinking_prompt()[:100]}...")
prompt_preview = self._get_thinking_prompt()[:100]
print(f" Thinking prompt: {prompt_preview}...")
# Determine which subsets to use
if self.config.subsets:
@ -379,7 +375,7 @@ class AGIEvalEnv(BaseEnv):
print(f"\n Total evaluation items: {len(self.eval_data)}")
# Print subset distribution
print(f"\n Subset distribution:")
print("\n Subset distribution:")
for subset, count in sorted(subset_counts.items()):
print(f" {subset}: {count} questions")
@ -584,7 +580,7 @@ class AGIEvalEnv(BaseEnv):
break
elif attempt < self.config.max_retries - 1:
if self.config.full_debug:
print(f" Response too short, retrying...")
print(" Response too short, retrying...")
await asyncio.sleep(self.config.retry_delay)
except Exception as e:
@ -594,15 +590,15 @@ class AGIEvalEnv(BaseEnv):
)
if hasattr(e, "response"):
try:
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
resp_text = e.response.text[:500] if hasattr(e.response, "text") else str(e.response)
print(f" Response: {resp_text}")
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
else:
print(f" Failed after {self.config.max_retries} attempts")
retries = self.config.max_retries
print(f" Failed after {retries} attempts")
return {"is_correct": None, "sample": None}
if not model_response:
@ -669,9 +665,9 @@ class AGIEvalEnv(BaseEnv):
"""Run AGIEval evaluation."""
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting AGIEval Evaluation (Generative/Reasoning Mode)")
print(f"{'='*60}")
print("\n" + "=" * 60)
print("Starting AGIEval Evaluation (Generative/Reasoning Mode)")
print("=" * 60)
print(f" Total questions: {len(self.all_eval_items)}")
print(f" Max tokens (for reasoning): {self.config.eval_max_tokens}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -782,9 +778,9 @@ class AGIEvalEnv(BaseEnv):
self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]
# Print summary
print(f"\n{'='*60}")
print(f"AGIEval Evaluation Results")
print(f"{'='*60}")
print("\n" + "=" * 60)
print("AGIEval Evaluation Results")
print("=" * 60)
print(
f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
)
@ -794,7 +790,7 @@ class AGIEvalEnv(BaseEnv):
print(f"Format Compliance: {format_compliance_rate:.4f}")
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
print(f"\nSubset Breakdown:")
print("\nSubset Breakdown:")
for subset, stats in sorted(subset_results.items()):
if stats["total"] > 0:
subset_acc = stats["correct"] / stats["total"]
@ -802,7 +798,7 @@ class AGIEvalEnv(BaseEnv):
f" {subset}: {subset_acc:.4f} ({stats['correct']}/{stats['total']})"
)
print(f"\nExtraction Method Statistics:")
print("\nExtraction Method Statistics:")
for method, stats in sorted(
extraction_methods.items(), key=lambda x: -x[1]["count"]
):
@ -810,7 +806,7 @@ class AGIEvalEnv(BaseEnv):
method_acc = stats["correct"] / stats["count"]
print(f" {method}: {stats['count']} uses, {method_acc:.4f} accuracy")
print(f"{'='*60}\n")
print("=" * 60 + "\n")
# Log evaluation results
try:

View file

@ -23,12 +23,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
@ -50,7 +46,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Available AIME years
@ -62,7 +57,13 @@ AIME_DATASETS = {
# Prompt template following lighteval's AIME structure
# Important: Uses the "I hope it is correct" format for math-verify
AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly.
The last line of your response should be of the following format:
'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes)
where ANSWER is just the final number or expression that solves the problem.
Think step by step before answering.
Note: AIME answers are always integers from 0 to 999.
@ -172,7 +173,7 @@ class AIMEEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nAIME Evaluation Setup (Generative Mode):")
print("\nAIME Evaluation Setup (Generative Mode):")
print(f" Years: {self.config.years}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -501,12 +502,12 @@ class AIMEEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f"\n Per-Year Breakdown:")
print("\n Per-Year Breakdown:")
for year, data in sorted(year_metrics.items()):
print(
f" AIME {year}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
)
print(f"\n Verification Methods:")
print("\n Verification Methods:")
for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
print(f" {method}: {count} ({count/total:.1%})")
print(f"{'='*60}\n")

View file

@ -17,12 +17,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
@ -45,7 +41,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template - AIMO doesn't have a specific template in lighteval
@ -161,7 +156,7 @@ class AIMOEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nAIMO Evaluation Setup (Generative Mode):")
print("\nAIMO Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -401,7 +396,7 @@ class AIMOEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f"\n Verification Methods:")
print("\n Verification Methods:")
for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
print(f" {method}: {count} ({count/total:.1%})")
print(f"{'='*60}\n")

View file

@ -27,17 +27,14 @@ Answer must be provided in <answer></answer> tags as a JSON 2D array.
import ast
import asyncio
import json
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
ANSWER_TAG_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@ -49,7 +46,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
@ -168,7 +164,7 @@ class ARCAGIEvalEnv(BaseEnv):
async def setup(self):
"""Load the ARC-AGI 2 dataset."""
print(f"\nARC-AGI 2 Evaluation Setup (Generative Mode):")
print("\nARC-AGI 2 Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -216,7 +212,8 @@ class ARCAGIEvalEnv(BaseEnv):
gold_output = item["question"][0]["output"]
# Build the prompt
query = """You are solving an ARC-AGI puzzle. You will be shown training examples where an input grid is transformed into an output grid following a specific pattern or rule.
query = """You are solving an ARC-AGI puzzle. You will be shown training examples
where an input grid is transformed into an output grid following a specific pattern or rule.
Your task is to:
1. Analyze the training examples to understand the transformation pattern
@ -315,7 +312,7 @@ Example format:
grid = ast.literal_eval(match)
if self._is_valid_grid(grid):
return grid
except:
except Exception:
continue
# Strategy 4: Extract rows one per line
@ -328,7 +325,7 @@ Example format:
grid = [json.loads(row) for row in rows]
if self._is_valid_grid(grid):
return grid
except:
except Exception:
pass
return None

View file

@ -21,11 +21,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@ -33,7 +30,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@ -45,7 +41,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
@ -173,7 +168,7 @@ class ARCEvalEnv(BaseEnv):
async def setup(self):
"""Load the ARC dataset."""
print(f"\nARC Evaluation Setup (Generative Mode):")
print("\nARC Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Evaluation split: {self.config.eval_split}")

View file

@ -9,7 +9,6 @@ from datasets import load_dataset
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
save_eval_results,
)
from pydantic import Field
from tenacity import retry, stop_after_attempt, wait_random_exponential

View file

@ -20,12 +20,9 @@ snarks, sports_understanding, temporal_sequences, tracking_shuffled_objects (3/5
"""
import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Set, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@ -33,7 +30,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@ -45,7 +41,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# All available BBH subsets
@ -86,8 +81,7 @@ def format_bbh_prompt(item: Dict) -> Tuple[str, List[str], int]:
input_prefix = item.get("example_input_prefix", "\nQuestion: ")
input_text = item.get("input", "")
choice_prefix = item.get("choice_prefix", "\n Choices: ")
output_prefix = item.get("example_output_prefix", "\nAnswer: ")
# Note: output_prefix from item.get("example_output_prefix") is not used in generative mode
choices = item.get("choices", [])
target_idx = item.get("target_idx", 0)
@ -222,7 +216,7 @@ class BBHEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nBBH Evaluation Setup (Generative Mode):")
print("\nBBH Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Evaluation split: {self.config.eval_split}")
@ -553,7 +547,7 @@ class BBHEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f"\n Per-Subset Breakdown:")
print("\n Per-Subset Breakdown:")
for subset, data in sorted(
subset_metrics.items(), key=lambda x: -x[1]["accuracy"]
):

View file

@ -20,11 +20,7 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@ -33,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@ -45,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
@ -170,7 +164,7 @@ class BoolQEvalEnv(BaseEnv):
async def setup(self):
"""Load the BoolQ dataset."""
print(f"\nBoolQ Evaluation Setup (Generative Mode):")
print("\nBoolQ Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")

View file

@ -23,17 +23,13 @@ import asyncio
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
extract_freeform_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@ -317,7 +313,7 @@ Question: {question}"""
async def setup(self) -> None:
"""Load the DROP dataset and prepare for evaluation."""
print(f"\nDROP Evaluation Setup:")
print("\nDROP Evaluation Setup:")
print(f" Dataset: {self.config.dataset_name}")
print(f" Max tokens: {self.config.eval_max_tokens}")
print(f" Evaluation split: {self.config.eval_split}")
@ -539,7 +535,7 @@ Question: {question}"""
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -615,7 +611,7 @@ Question: {question}"""
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting DROP Evaluation")
print("Starting DROP Evaluation")
print(f"{'='*60}")
print(f" Total questions: {len(self.all_eval_items)}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -694,7 +690,7 @@ Question: {question}"""
# Print summary
print(f"\n{'='*60}")
print(f"DROP Evaluation Results")
print("DROP Evaluation Results")
print(f"{'='*60}")
print(f"Exact Match Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")
print(f"Average F1 Score: {avg_f1:.4f}")

View file

@ -19,7 +19,7 @@ import os
import re
from concurrent.futures import ProcessPoolExecutor
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Dict, List, Optional, Set, Tuple
# Try to import math_verify libraries (optional dependency for math evals)
try:

View file

@ -29,16 +29,11 @@ import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@ -289,7 +284,7 @@ class GPQAEvalEnv(BaseEnv):
async def setup(self) -> None:
"""Load the GPQA dataset and prepare for evaluation."""
print(f"\nGPQA Evaluation Setup (Generative Mode):")
print("\nGPQA Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Max tokens for reasoning: {self.config.eval_max_tokens}")
@ -507,7 +502,7 @@ class GPQAEvalEnv(BaseEnv):
break
elif attempt < self.config.max_retries - 1:
if self.config.full_debug:
print(f" Response too short, retrying...")
print(" Response too short, retrying...")
await asyncio.sleep(self.config.retry_delay)
except Exception as e:
@ -520,7 +515,7 @@ class GPQAEvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -592,7 +587,7 @@ class GPQAEvalEnv(BaseEnv):
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting GPQA Evaluation (Generative/Reasoning Mode)")
print("Starting GPQA Evaluation (Generative/Reasoning Mode)")
print(f"{'='*60}")
print(f" Subset: {self.config.subset}")
print(f" Total questions: {len(self.all_eval_items)}")
@ -708,7 +703,7 @@ class GPQAEvalEnv(BaseEnv):
print(f"Format Compliance: {format_compliance_rate:.4f}")
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
print(f"\nSubdomain Breakdown:")
print("\nSubdomain Breakdown:")
for subdomain, stats in sorted(subdomain_results.items()):
if stats["total"] > 0:
subdom_acc = stats["correct"] / stats["total"]
@ -716,7 +711,7 @@ class GPQAEvalEnv(BaseEnv):
f" {subdomain}: {subdom_acc:.4f} ({stats['correct']}/{stats['total']})"
)
print(f"\nExtraction Method Statistics:")
print("\nExtraction Method Statistics:")
for method, stats in sorted(
extraction_methods.items(), key=lambda x: -x[1]["count"]
):

View file

@ -161,7 +161,7 @@ class GSM8KEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nGSM8K Evaluation Setup (Generative Mode):")
print("\nGSM8K Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Evaluation split: {self.config.eval_split}")
@ -416,7 +416,7 @@ class GSM8KEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f"\n Verification Methods:")
print("\n Verification Methods:")
for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
print(f" {method}: {count} ({count/total:.1%})")
print(f"{'='*60}\n")

View file

@ -167,7 +167,7 @@ class HellaSwagEvalEnv(BaseEnv):
async def setup(self):
"""Load the HellaSwag dataset."""
print(f"\nHellaSwag Evaluation Setup (Generative Mode):")
print("\nHellaSwag Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")

View file

@ -152,7 +152,7 @@ class HLEEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nHLE Evaluation Setup (Generative Mode):")
print("\nHLE Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -525,7 +525,7 @@ class HLEEvalEnv(BaseEnv):
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
if category_metrics:
print(f"\n Per-Category Breakdown:")
print("\n Per-Category Breakdown:")
for cat, data in sorted(
category_metrics.items(), key=lambda x: -x[1]["accuracy"]
):

View file

@ -228,7 +228,7 @@ class IFEvalEnv(BaseEnv):
async def setup(self) -> None:
"""Load the IFEval dataset and prepare for evaluation."""
print(f"\nIFEval Evaluation Setup:")
print("\nIFEval Evaluation Setup:")
print(f" Dataset: {self.config.dataset_name}")
print(f" Max tokens: {self.config.eval_max_tokens}")
print(f" Evaluation split: {self.config.eval_split}")
@ -478,7 +478,7 @@ class IFEvalEnv(BaseEnv):
break
elif attempt < self.config.max_retries - 1:
if self.config.full_debug:
print(f" Response too short, retrying...")
print(" Response too short, retrying...")
await asyncio.sleep(self.config.retry_delay)
except Exception as e:
@ -490,7 +490,7 @@ class IFEvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -569,7 +569,7 @@ class IFEvalEnv(BaseEnv):
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting IFEval Evaluation (Instruction Following)")
print("Starting IFEval Evaluation (Instruction Following)")
print(f"{'='*60}")
print(f" Total prompts: {len(self.all_eval_items)}")
print(f" Max tokens: {self.config.eval_max_tokens}")
@ -682,7 +682,7 @@ class IFEvalEnv(BaseEnv):
# Print summary
print(f"\n{'='*60}")
print(f"IFEval Evaluation Results")
print("IFEval Evaluation Results")
print(f"{'='*60}")
print(
f"Prompt-Level Strict Accuracy: {prompt_strict_acc:.4f} ({prompt_strict_count}/{total_count})"

View file

@ -351,7 +351,7 @@ class JudgeMarkEvalEnv(BaseEnv):
async def setup(self):
"""Load JudgeMark data files."""
print(f"\nLoading JudgeMark v2 data...")
print("\nLoading JudgeMark v2 data...")
# Determine data directory
data_dir = JUDGEMARK_DATA_DIR
@ -701,7 +701,7 @@ class JudgeMarkEvalEnv(BaseEnv):
f" Models with reference: {calibrated_cross_stats['num_models_with_reference']}"
)
print(f"\n Per-model averages (calibrated):")
print("\n Per-model averages (calibrated):")
sorted_models = sorted(
model_stats.items(), key=lambda x: x[1]["mean_calibrated"], reverse=True
)

View file

@ -172,7 +172,7 @@ class MATH500EvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nMATH-500 Evaluation Setup (Generative Mode):")
print("\nMATH-500 Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -470,7 +470,7 @@ class MATH500EvalEnv(BaseEnv):
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
if subject_metrics and len(subject_metrics) > 1:
print(f"\n Per-Subject Breakdown:")
print("\n Per-Subject Breakdown:")
for subject, data in sorted(
subject_metrics.items(), key=lambda x: -x[1]["accuracy"]
):
@ -478,7 +478,7 @@ class MATH500EvalEnv(BaseEnv):
f" {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
)
if level_metrics and len(level_metrics) > 1:
print(f"\n Per-Level Breakdown:")
print("\n Per-Level Breakdown:")
for level, data in sorted(level_metrics.items()):
print(
f" Level {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"

View file

@ -178,7 +178,7 @@ class MATHEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nMATH Evaluation Setup (Generative Mode):")
print("\nMATH Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subsets: {self.config.subsets}")
print(f" Evaluation split: {self.config.eval_split}")
@ -484,7 +484,7 @@ class MATHEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f"\n Per-Subset Breakdown:")
print("\n Per-Subset Breakdown:")
for subset, data in sorted(
subset_metrics.items(), key=lambda x: -x[1]["accuracy"]
):
@ -492,7 +492,7 @@ class MATHEvalEnv(BaseEnv):
f" {subset}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
)
if level_metrics and len(level_metrics) > 1:
print(f"\n Per-Level Breakdown:")
print("\n Per-Level Breakdown:")
for level, data in sorted(level_metrics.items()):
print(
f" {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"

View file

@ -343,7 +343,7 @@ class MixEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nMixEval Evaluation Setup (with LLM Judge):")
print("\nMixEval Evaluation Setup (with LLM Judge):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Difficulty: {self.config.difficulty}")
print(f" Question types: {self.config.question_types}")
@ -737,7 +737,7 @@ class MixEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f" Judge Error Rate: {self.judge_error_count / total:.2%}")
print(f"\n Per-Benchmark Breakdown:")
print("\n Per-Benchmark Breakdown:")
for bench, data in sorted(
benchmark_metrics.items(), key=lambda x: -x[1]["avg_score"]
):

View file

@ -464,7 +464,7 @@ class MMLUEvalEnv(BaseEnv):
if not self.subjects:
raise ValueError("No valid MMLU subjects specified for evaluation.")
print(f"\nMMLU Evaluation Setup (Generative Mode):")
print("\nMMLU Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subjects: {len(self.subjects)} subjects")
print(f" Few-shot examples: {self.config.num_few_shot}")
@ -821,7 +821,7 @@ class MMLUEvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -907,7 +907,7 @@ class MMLUEvalEnv(BaseEnv):
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting MMLU Evaluation (Generative/Reasoning Mode)")
print("Starting MMLU Evaluation (Generative/Reasoning Mode)")
print(f"{'='*60}")
print(f" Subjects: {len(self.subjects)}")
print(f" Total questions: {len(self.all_eval_items)}")
@ -1046,7 +1046,7 @@ class MMLUEvalEnv(BaseEnv):
# Print summary
print(f"\n{'='*60}")
print(f"MMLU Evaluation Results")
print("MMLU Evaluation Results")
print(f"{'='*60}")
print(
f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
@ -1057,7 +1057,7 @@ class MMLUEvalEnv(BaseEnv):
print(f"Format Compliance: {format_compliance_rate:.4f}")
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
print(f"\nCategory Breakdown:")
print("\nCategory Breakdown:")
for category, stats in category_results.items():
if stats["total"] > 0:
cat_acc = stats["correct"] / stats["total"]
@ -1065,7 +1065,7 @@ class MMLUEvalEnv(BaseEnv):
f" {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})"
)
print(f"\nExtraction Method Statistics:")
print("\nExtraction Method Statistics:")
for method, stats in sorted(
extraction_methods.items(), key=lambda x: -x[1]["count"]
):

View file

@ -307,7 +307,7 @@ class MMLUProEvalEnv(BaseEnv):
async def setup(self) -> None:
"""Load the MMLU-Pro dataset and prepare for evaluation."""
print(f"\nMMLU-Pro Evaluation Setup (Generative Mode):")
print("\nMMLU-Pro Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Few-shot examples: {self.config.num_few_shot}")
print(f" Max tokens for reasoning: {self.config.eval_max_tokens}")
@ -358,7 +358,7 @@ class MMLUProEvalEnv(BaseEnv):
cat = item.get("category", "unknown")
category_counts[cat] = category_counts.get(cat, 0) + 1
print(f"\n Category distribution:")
print("\n Category distribution:")
for cat, count in sorted(category_counts.items()):
print(f" {cat}: {count} questions")
@ -586,7 +586,7 @@ class MMLUProEvalEnv(BaseEnv):
break
elif attempt < self.config.max_retries - 1:
if self.config.full_debug:
print(f" Response too short, retrying...")
print(" Response too short, retrying...")
await asyncio.sleep(self.config.retry_delay)
except Exception as e:
@ -599,7 +599,7 @@ class MMLUProEvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -673,7 +673,7 @@ class MMLUProEvalEnv(BaseEnv):
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)")
print("Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)")
print(f"{'='*60}")
print(f" Total questions: {len(self.all_eval_items)}")
print(f" Few-shot examples: {self.config.num_few_shot}")
@ -788,7 +788,7 @@ class MMLUProEvalEnv(BaseEnv):
# Print summary
print(f"\n{'='*60}")
print(f"MMLU-Pro Evaluation Results")
print("MMLU-Pro Evaluation Results")
print(f"{'='*60}")
print(
f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
@ -799,7 +799,7 @@ class MMLUProEvalEnv(BaseEnv):
print(f"Format Compliance: {format_compliance_rate:.4f}")
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
print(f"\nCategory Breakdown:")
print("\nCategory Breakdown:")
for category, stats in sorted(category_results.items()):
if stats["total"] > 0:
cat_acc = stats["correct"] / stats["total"]
@ -807,7 +807,7 @@ class MMLUProEvalEnv(BaseEnv):
f" {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})"
)
print(f"\nExtraction Method Statistics:")
print("\nExtraction Method Statistics:")
for method, stats in sorted(
extraction_methods.items(), key=lambda x: -x[1]["count"]
):

View file

@ -282,7 +282,7 @@ class MuSREvalEnv(BaseEnv):
if isinstance(choices_raw, str):
try:
choices = ast.literal_eval(choices_raw)
except:
except Exception:
choices = []
else:
choices = choices_raw
@ -301,7 +301,7 @@ class MuSREvalEnv(BaseEnv):
async def setup(self) -> None:
"""Load the MuSR dataset and prepare for evaluation."""
print(f"\nMuSR Evaluation Setup:")
print("\nMuSR Evaluation Setup:")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Max tokens: {self.config.eval_max_tokens}")
@ -495,7 +495,7 @@ class MuSREvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)

View file

@ -291,7 +291,7 @@ class OBQAEvalEnv(BaseEnv):
async def setup(self) -> None:
"""Load the OpenBookQA dataset and prepare for evaluation."""
print(f"\nOpenBookQA Evaluation Setup:")
print("\nOpenBookQA Evaluation Setup:")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Eval split: {self.config.eval_split}")
@ -481,7 +481,7 @@ class OBQAEvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -560,7 +560,7 @@ class OBQAEvalEnv(BaseEnv):
start_time = time.time()
print(f"\n{'='*60}")
print(f"Starting OpenBookQA Evaluation")
print("Starting OpenBookQA Evaluation")
print(f"{'='*60}")
print(f" Total questions: {len(self.all_eval_items)}")
print(f" Thinking mode: {self.config.thinking_mode}")
@ -641,7 +641,7 @@ class OBQAEvalEnv(BaseEnv):
# Print summary
print(f"\n{'='*60}")
print(f"OpenBookQA Evaluation Results")
print("OpenBookQA Evaluation Results")
print(f"{'='*60}")
print(f"Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")
print(f"Answer Extraction Rate: {extraction_rate:.4f}")

View file

@ -199,7 +199,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nOlympiadBench Evaluation Setup (Generative Mode):")
print("\nOlympiadBench Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Evaluation split: {self.config.eval_split}")
@ -648,7 +648,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
if subject_metrics:
print(f"\n Per-Subject Breakdown:")
print("\n Per-Subject Breakdown:")
for subject, data in subject_metrics.items():
print(
f" {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"

View file

@ -167,7 +167,7 @@ class PIQAEvalEnv(BaseEnv):
async def setup(self):
"""Load the PIQA dataset."""
print(f"\nPIQA Evaluation Setup (Generative Mode):")
print("\nPIQA Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")

View file

@ -154,7 +154,7 @@ class PubMedQAEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nPubMedQA Evaluation Setup (Generative Mode):")
print("\nPubMedQA Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Evaluation split: {self.config.eval_split}")
@ -481,7 +481,7 @@ class PubMedQAEvalEnv(BaseEnv):
print(f" Format Compliance: {format_valid / total:.2%}")
if self.config.thinking_mode:
print(f" Thinking Utilization: {has_thinking / total:.2%}")
print(f"\n Per-Answer Breakdown:")
print("\n Per-Answer Breakdown:")
for answer, data in answer_metrics.items():
print(
f" {answer}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"

View file

@ -455,7 +455,7 @@ class SimpleQAEvalEnv(BaseEnv):
else "String Matching (Nous)"
)
print(f"\nSimpleQA Evaluation Setup:")
print("\nSimpleQA Evaluation Setup:")
print(f" Dataset: {self.config.dataset_name}")
print(f" Scoring mode: {scoring_mode}")
print(f" Max tokens for answer: {self.config.eval_max_tokens}")
@ -627,7 +627,7 @@ class SimpleQAEvalEnv(BaseEnv):
break
elif attempt < self.config.max_retries - 1:
if self.config.full_debug:
print(f" Response too short, retrying...")
print(" Response too short, retrying...")
await asyncio.sleep(self.config.retry_delay)
except Exception as e:
@ -639,7 +639,7 @@ class SimpleQAEvalEnv(BaseEnv):
print(
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
)
except:
except Exception:
pass
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay)
@ -808,7 +808,7 @@ class SimpleQAEvalEnv(BaseEnv):
)
print(f"\n{'='*60}")
print(f"Starting SimpleQA Evaluation")
print("Starting SimpleQA Evaluation")
print(f"{'='*60}")
print(f" Total questions: {len(self.all_eval_items)}")
print(f" Scoring mode: {scoring_mode}")
@ -983,7 +983,7 @@ class SimpleQAEvalEnv(BaseEnv):
f"Accuracy (if attempted): {eval_metrics['eval/accuracy_if_attempted']:.4f}"
)
print(f"Not Attempted Rate: {eval_metrics['eval/not_attempted_rate']:.4f}")
print(f"\nGrade Distribution:")
print("\nGrade Distribution:")
print(f" CORRECT: {correct_count} ({100*correct_count/total_count:.1f}%)")
print(
f" INCORRECT: {incorrect_count} ({100*incorrect_count/total_count:.1f}%)"
@ -1012,7 +1012,7 @@ class SimpleQAEvalEnv(BaseEnv):
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
if len(sorted_topics) > 0:
print(f"\nTop Topics (by count):")
print("\nTop Topics (by count):")
for topic, stats in sorted_topics[:10]:
if stats["total"] > 0:
topic_acc = stats["correct"] / stats["total"]

View file

@ -167,7 +167,7 @@ class SIQAEvalEnv(BaseEnv):
async def setup(self):
"""Load the SIQA dataset."""
print(f"\nSIQA Evaluation Setup (Generative Mode):")
print("\nSIQA Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Evaluation split: {self.config.eval_split}")
print(f" Thinking mode: {self.config.thinking_mode}")

View file

@ -172,7 +172,7 @@ class WinoGrandeEvalEnv(BaseEnv):
async def setup(self):
"""Load the WinoGrande dataset."""
print(f"\nWinoGrande Evaluation Setup (Generative Mode):")
print("\nWinoGrande Evaluation Setup (Generative Mode):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Subset: {self.config.subset}")
print(f" Evaluation split: {self.config.eval_split}")