mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
more linter nonsense
This commit is contained in:
parent
f18d46549d
commit
abdda3978a
29 changed files with 113 additions and 151 deletions
|
|
@ -36,16 +36,11 @@ import time
|
|||
from string import ascii_uppercase
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
from eval_helpers import (
|
||||
build_mcqa_fallback_patterns,
|
||||
create_system_content,
|
||||
extract_letter_from_answer_tag,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
)
|
||||
from pydantic import Field
|
||||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
|
@ -333,12 +328,13 @@ class AGIEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the AGIEval dataset and prepare for evaluation."""
|
||||
print(f"\nAGIEval Evaluation Setup (Generative Mode):")
|
||||
print("\nAGIEval Evaluation Setup (Generative Mode):")
|
||||
print(f" Max tokens for reasoning: {self.config.eval_max_tokens}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking prompt: {self._get_thinking_prompt()[:100]}...")
|
||||
prompt_preview = self._get_thinking_prompt()[:100]
|
||||
print(f" Thinking prompt: {prompt_preview}...")
|
||||
|
||||
# Determine which subsets to use
|
||||
if self.config.subsets:
|
||||
|
|
@ -379,7 +375,7 @@ class AGIEvalEnv(BaseEnv):
|
|||
print(f"\n Total evaluation items: {len(self.eval_data)}")
|
||||
|
||||
# Print subset distribution
|
||||
print(f"\n Subset distribution:")
|
||||
print("\n Subset distribution:")
|
||||
for subset, count in sorted(subset_counts.items()):
|
||||
print(f" {subset}: {count} questions")
|
||||
|
||||
|
|
@ -584,7 +580,7 @@ class AGIEvalEnv(BaseEnv):
|
|||
break
|
||||
elif attempt < self.config.max_retries - 1:
|
||||
if self.config.full_debug:
|
||||
print(f" Response too short, retrying...")
|
||||
print(" Response too short, retrying...")
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -594,15 +590,15 @@ class AGIEvalEnv(BaseEnv):
|
|||
)
|
||||
if hasattr(e, "response"):
|
||||
try:
|
||||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
resp_text = e.response.text[:500] if hasattr(e.response, "text") else str(e.response)
|
||||
print(f" Response: {resp_text}")
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
else:
|
||||
print(f" Failed after {self.config.max_retries} attempts")
|
||||
retries = self.config.max_retries
|
||||
print(f" Failed after {retries} attempts")
|
||||
return {"is_correct": None, "sample": None}
|
||||
|
||||
if not model_response:
|
||||
|
|
@ -669,9 +665,9 @@ class AGIEvalEnv(BaseEnv):
|
|||
"""Run AGIEval evaluation."""
|
||||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting AGIEval Evaluation (Generative/Reasoning Mode)")
|
||||
print(f"{'='*60}")
|
||||
print("\n" + "=" * 60)
|
||||
print("Starting AGIEval Evaluation (Generative/Reasoning Mode)")
|
||||
print("=" * 60)
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
print(f" Max tokens (for reasoning): {self.config.eval_max_tokens}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -782,9 +778,9 @@ class AGIEvalEnv(BaseEnv):
|
|||
self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"AGIEval Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print("\n" + "=" * 60)
|
||||
print("AGIEval Evaluation Results")
|
||||
print("=" * 60)
|
||||
print(
|
||||
f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
|
||||
)
|
||||
|
|
@ -794,7 +790,7 @@ class AGIEvalEnv(BaseEnv):
|
|||
print(f"Format Compliance: {format_compliance_rate:.4f}")
|
||||
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
|
||||
|
||||
print(f"\nSubset Breakdown:")
|
||||
print("\nSubset Breakdown:")
|
||||
for subset, stats in sorted(subset_results.items()):
|
||||
if stats["total"] > 0:
|
||||
subset_acc = stats["correct"] / stats["total"]
|
||||
|
|
@ -802,7 +798,7 @@ class AGIEvalEnv(BaseEnv):
|
|||
f" {subset}: {subset_acc:.4f} ({stats['correct']}/{stats['total']})"
|
||||
)
|
||||
|
||||
print(f"\nExtraction Method Statistics:")
|
||||
print("\nExtraction Method Statistics:")
|
||||
for method, stats in sorted(
|
||||
extraction_methods.items(), key=lambda x: -x[1]["count"]
|
||||
):
|
||||
|
|
@ -810,7 +806,7 @@ class AGIEvalEnv(BaseEnv):
|
|||
method_acc = stats["correct"] / stats["count"]
|
||||
print(f" {method}: {stats['count']} uses, {method_acc:.4f} accuracy")
|
||||
|
||||
print(f"{'='*60}\n")
|
||||
print("=" * 60 + "\n")
|
||||
|
||||
# Log evaluation results
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -23,12 +23,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
|
|
@ -50,7 +46,6 @@ from atroposlib.envs.base import (
|
|||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
EvalHandlingEnum,
|
||||
)
|
||||
|
||||
# Available AIME years
|
||||
|
|
@ -62,7 +57,13 @@ AIME_DATASETS = {
|
|||
|
||||
# Prompt template following lighteval's AIME structure
|
||||
# Important: Uses the "I hope it is correct" format for math-verify
|
||||
AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
|
||||
AIME_PROMPT_TEMPLATE = """Solve the following math problem efficiently and clearly.
|
||||
|
||||
The last line of your response should be of the following format:
|
||||
'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes)
|
||||
where ANSWER is just the final number or expression that solves the problem.
|
||||
|
||||
Think step by step before answering.
|
||||
|
||||
Note: AIME answers are always integers from 0 to 999.
|
||||
|
||||
|
|
@ -172,7 +173,7 @@ class AIMEEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nAIME Evaluation Setup (Generative Mode):")
|
||||
print("\nAIME Evaluation Setup (Generative Mode):")
|
||||
print(f" Years: {self.config.years}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -501,12 +502,12 @@ class AIMEEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f"\n Per-Year Breakdown:")
|
||||
print("\n Per-Year Breakdown:")
|
||||
for year, data in sorted(year_metrics.items()):
|
||||
print(
|
||||
f" AIME {year}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
)
|
||||
print(f"\n Verification Methods:")
|
||||
print("\n Verification Methods:")
|
||||
for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
|
||||
print(f" {method}: {count} ({count/total:.1%})")
|
||||
print(f"{'='*60}\n")
|
||||
|
|
|
|||
|
|
@ -17,12 +17,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
|
|
@ -45,7 +41,6 @@ from atroposlib.envs.base import (
|
|||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
EvalHandlingEnum,
|
||||
)
|
||||
|
||||
# Prompt template - AIMO doesn't have a specific template in lighteval
|
||||
|
|
@ -161,7 +156,7 @@ class AIMOEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nAIMO Evaluation Setup (Generative Mode):")
|
||||
print("\nAIMO Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -401,7 +396,7 @@ class AIMOEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f"\n Verification Methods:")
|
||||
print("\n Verification Methods:")
|
||||
for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
|
||||
print(f" {method}: {count} ({count/total:.1%})")
|
||||
print(f"{'='*60}\n")
|
||||
|
|
|
|||
|
|
@ -27,17 +27,14 @@ Answer must be provided in <answer></answer> tags as a JSON 2D array.
|
|||
import ast
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
from eval_helpers import (
|
||||
ANSWER_TAG_PATTERN,
|
||||
create_system_content,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
|
|
@ -49,7 +46,6 @@ from atroposlib.envs.base import (
|
|||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
EvalHandlingEnum,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -168,7 +164,7 @@ class ARCAGIEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the ARC-AGI 2 dataset."""
|
||||
print(f"\nARC-AGI 2 Evaluation Setup (Generative Mode):")
|
||||
print("\nARC-AGI 2 Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -216,7 +212,8 @@ class ARCAGIEvalEnv(BaseEnv):
|
|||
gold_output = item["question"][0]["output"]
|
||||
|
||||
# Build the prompt
|
||||
query = """You are solving an ARC-AGI puzzle. You will be shown training examples where an input grid is transformed into an output grid following a specific pattern or rule.
|
||||
query = """You are solving an ARC-AGI puzzle. You will be shown training examples
|
||||
where an input grid is transformed into an output grid following a specific pattern or rule.
|
||||
|
||||
Your task is to:
|
||||
1. Analyze the training examples to understand the transformation pattern
|
||||
|
|
@ -315,7 +312,7 @@ Example format:
|
|||
grid = ast.literal_eval(match)
|
||||
if self._is_valid_grid(grid):
|
||||
return grid
|
||||
except:
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Strategy 4: Extract rows one per line
|
||||
|
|
@ -328,7 +325,7 @@ Example format:
|
|||
grid = [json.loads(row) for row in rows]
|
||||
if self._is_valid_grid(grid):
|
||||
return grid
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -21,11 +21,8 @@ Supports optional thinking mode with <think></think> tags.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from string import ascii_uppercase
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
|
|
@ -33,7 +30,6 @@ from eval_helpers import (
|
|||
build_mcqa_fallback_patterns,
|
||||
create_system_content,
|
||||
extract_letter_from_answer_tag,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
|
|
@ -45,7 +41,6 @@ from atroposlib.envs.base import (
|
|||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
EvalHandlingEnum,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -173,7 +168,7 @@ class ARCEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the ARC dataset."""
|
||||
print(f"\nARC Evaluation Setup (Generative Mode):")
|
||||
print("\nARC Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
|
|||
|
|
@ -9,7 +9,6 @@ from datasets import load_dataset
|
|||
from eval_helpers import (
|
||||
create_system_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
)
|
||||
from pydantic import Field
|
||||
from tenacity import retry, stop_after_attempt, wait_random_exponential
|
||||
|
|
|
|||
|
|
@ -20,12 +20,9 @@ snarks, sports_understanding, temporal_sequences, tracking_shuffled_objects (3/5
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from string import ascii_uppercase
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
|
|
@ -33,7 +30,6 @@ from eval_helpers import (
|
|||
build_mcqa_fallback_patterns,
|
||||
create_system_content,
|
||||
extract_letter_from_answer_tag,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
|
|
@ -45,7 +41,6 @@ from atroposlib.envs.base import (
|
|||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
EvalHandlingEnum,
|
||||
)
|
||||
|
||||
# All available BBH subsets
|
||||
|
|
@ -86,8 +81,7 @@ def format_bbh_prompt(item: Dict) -> Tuple[str, List[str], int]:
|
|||
input_prefix = item.get("example_input_prefix", "\nQuestion: ")
|
||||
input_text = item.get("input", "")
|
||||
choice_prefix = item.get("choice_prefix", "\n Choices: ")
|
||||
output_prefix = item.get("example_output_prefix", "\nAnswer: ")
|
||||
|
||||
# Note: output_prefix from item.get("example_output_prefix") is not used in generative mode
|
||||
choices = item.get("choices", [])
|
||||
target_idx = item.get("target_idx", 0)
|
||||
|
||||
|
|
@ -222,7 +216,7 @@ class BBHEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nBBH Evaluation Setup (Generative Mode):")
|
||||
print("\nBBH Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -553,7 +547,7 @@ class BBHEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f"\n Per-Subset Breakdown:")
|
||||
print("\n Per-Subset Breakdown:")
|
||||
for subset, data in sorted(
|
||||
subset_metrics.items(), key=lambda x: -x[1]["accuracy"]
|
||||
):
|
||||
|
|
|
|||
|
|
@ -20,11 +20,7 @@ Supports optional thinking mode with <think></think> tags.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from string import ascii_uppercase
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
|
|
@ -33,7 +29,6 @@ from eval_helpers import (
|
|||
build_mcqa_fallback_patterns,
|
||||
create_system_content,
|
||||
extract_letter_from_answer_tag,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
|
|
@ -45,7 +40,6 @@ from atroposlib.envs.base import (
|
|||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
EvalHandlingEnum,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -170,7 +164,7 @@ class BoolQEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the BoolQ dataset."""
|
||||
print(f"\nBoolQ Evaluation Setup (Generative Mode):")
|
||||
print("\nBoolQ Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
|
|||
|
|
@ -23,17 +23,13 @@ import asyncio
|
|||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
from eval_helpers import (
|
||||
create_system_content,
|
||||
extract_freeform_from_answer_tag,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
)
|
||||
from pydantic import Field
|
||||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
|
@ -317,7 +313,7 @@ Question: {question}"""
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the DROP dataset and prepare for evaluation."""
|
||||
print(f"\nDROP Evaluation Setup:")
|
||||
print("\nDROP Evaluation Setup:")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Max tokens: {self.config.eval_max_tokens}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -539,7 +535,7 @@ Question: {question}"""
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -615,7 +611,7 @@ Question: {question}"""
|
|||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting DROP Evaluation")
|
||||
print("Starting DROP Evaluation")
|
||||
print(f"{'='*60}")
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -694,7 +690,7 @@ Question: {question}"""
|
|||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"DROP Evaluation Results")
|
||||
print("DROP Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(f"Exact Match Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")
|
||||
print(f"Average F1 Score: {avg_f1:.4f}")
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import os
|
|||
import re
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from string import ascii_uppercase
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
|
||||
# Try to import math_verify libraries (optional dependency for math evals)
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -29,16 +29,11 @@ import time
|
|||
from string import ascii_uppercase
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
from eval_helpers import (
|
||||
build_mcqa_fallback_patterns,
|
||||
create_system_content,
|
||||
extract_letter_from_answer_tag,
|
||||
extract_thinking_content,
|
||||
get_default_thinking_prompt,
|
||||
save_eval_results,
|
||||
validate_thinking_format,
|
||||
)
|
||||
from pydantic import Field
|
||||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
|
@ -289,7 +284,7 @@ class GPQAEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the GPQA dataset and prepare for evaluation."""
|
||||
print(f"\nGPQA Evaluation Setup (Generative Mode):")
|
||||
print("\nGPQA Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Max tokens for reasoning: {self.config.eval_max_tokens}")
|
||||
|
|
@ -507,7 +502,7 @@ class GPQAEvalEnv(BaseEnv):
|
|||
break
|
||||
elif attempt < self.config.max_retries - 1:
|
||||
if self.config.full_debug:
|
||||
print(f" Response too short, retrying...")
|
||||
print(" Response too short, retrying...")
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -520,7 +515,7 @@ class GPQAEvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -592,7 +587,7 @@ class GPQAEvalEnv(BaseEnv):
|
|||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting GPQA Evaluation (Generative/Reasoning Mode)")
|
||||
print("Starting GPQA Evaluation (Generative/Reasoning Mode)")
|
||||
print(f"{'='*60}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
|
|
@ -708,7 +703,7 @@ class GPQAEvalEnv(BaseEnv):
|
|||
print(f"Format Compliance: {format_compliance_rate:.4f}")
|
||||
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
|
||||
|
||||
print(f"\nSubdomain Breakdown:")
|
||||
print("\nSubdomain Breakdown:")
|
||||
for subdomain, stats in sorted(subdomain_results.items()):
|
||||
if stats["total"] > 0:
|
||||
subdom_acc = stats["correct"] / stats["total"]
|
||||
|
|
@ -716,7 +711,7 @@ class GPQAEvalEnv(BaseEnv):
|
|||
f" {subdomain}: {subdom_acc:.4f} ({stats['correct']}/{stats['total']})"
|
||||
)
|
||||
|
||||
print(f"\nExtraction Method Statistics:")
|
||||
print("\nExtraction Method Statistics:")
|
||||
for method, stats in sorted(
|
||||
extraction_methods.items(), key=lambda x: -x[1]["count"]
|
||||
):
|
||||
|
|
|
|||
|
|
@ -161,7 +161,7 @@ class GSM8KEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nGSM8K Evaluation Setup (Generative Mode):")
|
||||
print("\nGSM8K Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -416,7 +416,7 @@ class GSM8KEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f"\n Verification Methods:")
|
||||
print("\n Verification Methods:")
|
||||
for method, count in sorted(method_counts.items(), key=lambda x: -x[1]):
|
||||
print(f" {method}: {count} ({count/total:.1%})")
|
||||
print(f"{'='*60}\n")
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ class HellaSwagEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the HellaSwag dataset."""
|
||||
print(f"\nHellaSwag Evaluation Setup (Generative Mode):")
|
||||
print("\nHellaSwag Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
|
|||
|
|
@ -152,7 +152,7 @@ class HLEEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nHLE Evaluation Setup (Generative Mode):")
|
||||
print("\nHLE Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -525,7 +525,7 @@ class HLEEvalEnv(BaseEnv):
|
|||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
if category_metrics:
|
||||
print(f"\n Per-Category Breakdown:")
|
||||
print("\n Per-Category Breakdown:")
|
||||
for cat, data in sorted(
|
||||
category_metrics.items(), key=lambda x: -x[1]["accuracy"]
|
||||
):
|
||||
|
|
|
|||
|
|
@ -228,7 +228,7 @@ class IFEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the IFEval dataset and prepare for evaluation."""
|
||||
print(f"\nIFEval Evaluation Setup:")
|
||||
print("\nIFEval Evaluation Setup:")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Max tokens: {self.config.eval_max_tokens}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -478,7 +478,7 @@ class IFEvalEnv(BaseEnv):
|
|||
break
|
||||
elif attempt < self.config.max_retries - 1:
|
||||
if self.config.full_debug:
|
||||
print(f" Response too short, retrying...")
|
||||
print(" Response too short, retrying...")
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -490,7 +490,7 @@ class IFEvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -569,7 +569,7 @@ class IFEvalEnv(BaseEnv):
|
|||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting IFEval Evaluation (Instruction Following)")
|
||||
print("Starting IFEval Evaluation (Instruction Following)")
|
||||
print(f"{'='*60}")
|
||||
print(f" Total prompts: {len(self.all_eval_items)}")
|
||||
print(f" Max tokens: {self.config.eval_max_tokens}")
|
||||
|
|
@ -682,7 +682,7 @@ class IFEvalEnv(BaseEnv):
|
|||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"IFEval Evaluation Results")
|
||||
print("IFEval Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(
|
||||
f"Prompt-Level Strict Accuracy: {prompt_strict_acc:.4f} ({prompt_strict_count}/{total_count})"
|
||||
|
|
|
|||
|
|
@ -351,7 +351,7 @@ class JudgeMarkEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load JudgeMark data files."""
|
||||
print(f"\nLoading JudgeMark v2 data...")
|
||||
print("\nLoading JudgeMark v2 data...")
|
||||
|
||||
# Determine data directory
|
||||
data_dir = JUDGEMARK_DATA_DIR
|
||||
|
|
@ -701,7 +701,7 @@ class JudgeMarkEvalEnv(BaseEnv):
|
|||
f" Models with reference: {calibrated_cross_stats['num_models_with_reference']}"
|
||||
)
|
||||
|
||||
print(f"\n Per-model averages (calibrated):")
|
||||
print("\n Per-model averages (calibrated):")
|
||||
sorted_models = sorted(
|
||||
model_stats.items(), key=lambda x: x[1]["mean_calibrated"], reverse=True
|
||||
)
|
||||
|
|
|
|||
|
|
@ -172,7 +172,7 @@ class MATH500EvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nMATH-500 Evaluation Setup (Generative Mode):")
|
||||
print("\nMATH-500 Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -470,7 +470,7 @@ class MATH500EvalEnv(BaseEnv):
|
|||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
if subject_metrics and len(subject_metrics) > 1:
|
||||
print(f"\n Per-Subject Breakdown:")
|
||||
print("\n Per-Subject Breakdown:")
|
||||
for subject, data in sorted(
|
||||
subject_metrics.items(), key=lambda x: -x[1]["accuracy"]
|
||||
):
|
||||
|
|
@ -478,7 +478,7 @@ class MATH500EvalEnv(BaseEnv):
|
|||
f" {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
)
|
||||
if level_metrics and len(level_metrics) > 1:
|
||||
print(f"\n Per-Level Breakdown:")
|
||||
print("\n Per-Level Breakdown:")
|
||||
for level, data in sorted(level_metrics.items()):
|
||||
print(
|
||||
f" Level {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
|
|
|
|||
|
|
@ -178,7 +178,7 @@ class MATHEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nMATH Evaluation Setup (Generative Mode):")
|
||||
print("\nMATH Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subsets: {self.config.subsets}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -484,7 +484,7 @@ class MATHEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f"\n Per-Subset Breakdown:")
|
||||
print("\n Per-Subset Breakdown:")
|
||||
for subset, data in sorted(
|
||||
subset_metrics.items(), key=lambda x: -x[1]["accuracy"]
|
||||
):
|
||||
|
|
@ -492,7 +492,7 @@ class MATHEvalEnv(BaseEnv):
|
|||
f" {subset}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
)
|
||||
if level_metrics and len(level_metrics) > 1:
|
||||
print(f"\n Per-Level Breakdown:")
|
||||
print("\n Per-Level Breakdown:")
|
||||
for level, data in sorted(level_metrics.items()):
|
||||
print(
|
||||
f" {level}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
|
|
|
|||
|
|
@ -343,7 +343,7 @@ class MixEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nMixEval Evaluation Setup (with LLM Judge):")
|
||||
print("\nMixEval Evaluation Setup (with LLM Judge):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Difficulty: {self.config.difficulty}")
|
||||
print(f" Question types: {self.config.question_types}")
|
||||
|
|
@ -737,7 +737,7 @@ class MixEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f" Judge Error Rate: {self.judge_error_count / total:.2%}")
|
||||
print(f"\n Per-Benchmark Breakdown:")
|
||||
print("\n Per-Benchmark Breakdown:")
|
||||
for bench, data in sorted(
|
||||
benchmark_metrics.items(), key=lambda x: -x[1]["avg_score"]
|
||||
):
|
||||
|
|
|
|||
|
|
@ -464,7 +464,7 @@ class MMLUEvalEnv(BaseEnv):
|
|||
if not self.subjects:
|
||||
raise ValueError("No valid MMLU subjects specified for evaluation.")
|
||||
|
||||
print(f"\nMMLU Evaluation Setup (Generative Mode):")
|
||||
print("\nMMLU Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subjects: {len(self.subjects)} subjects")
|
||||
print(f" Few-shot examples: {self.config.num_few_shot}")
|
||||
|
|
@ -821,7 +821,7 @@ class MMLUEvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -907,7 +907,7 @@ class MMLUEvalEnv(BaseEnv):
|
|||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting MMLU Evaluation (Generative/Reasoning Mode)")
|
||||
print("Starting MMLU Evaluation (Generative/Reasoning Mode)")
|
||||
print(f"{'='*60}")
|
||||
print(f" Subjects: {len(self.subjects)}")
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
|
|
@ -1046,7 +1046,7 @@ class MMLUEvalEnv(BaseEnv):
|
|||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"MMLU Evaluation Results")
|
||||
print("MMLU Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(
|
||||
f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
|
||||
|
|
@ -1057,7 +1057,7 @@ class MMLUEvalEnv(BaseEnv):
|
|||
print(f"Format Compliance: {format_compliance_rate:.4f}")
|
||||
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
|
||||
|
||||
print(f"\nCategory Breakdown:")
|
||||
print("\nCategory Breakdown:")
|
||||
for category, stats in category_results.items():
|
||||
if stats["total"] > 0:
|
||||
cat_acc = stats["correct"] / stats["total"]
|
||||
|
|
@ -1065,7 +1065,7 @@ class MMLUEvalEnv(BaseEnv):
|
|||
f" {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})"
|
||||
)
|
||||
|
||||
print(f"\nExtraction Method Statistics:")
|
||||
print("\nExtraction Method Statistics:")
|
||||
for method, stats in sorted(
|
||||
extraction_methods.items(), key=lambda x: -x[1]["count"]
|
||||
):
|
||||
|
|
|
|||
|
|
@ -307,7 +307,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the MMLU-Pro dataset and prepare for evaluation."""
|
||||
print(f"\nMMLU-Pro Evaluation Setup (Generative Mode):")
|
||||
print("\nMMLU-Pro Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Few-shot examples: {self.config.num_few_shot}")
|
||||
print(f" Max tokens for reasoning: {self.config.eval_max_tokens}")
|
||||
|
|
@ -358,7 +358,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
cat = item.get("category", "unknown")
|
||||
category_counts[cat] = category_counts.get(cat, 0) + 1
|
||||
|
||||
print(f"\n Category distribution:")
|
||||
print("\n Category distribution:")
|
||||
for cat, count in sorted(category_counts.items()):
|
||||
print(f" {cat}: {count} questions")
|
||||
|
||||
|
|
@ -586,7 +586,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
break
|
||||
elif attempt < self.config.max_retries - 1:
|
||||
if self.config.full_debug:
|
||||
print(f" Response too short, retrying...")
|
||||
print(" Response too short, retrying...")
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -599,7 +599,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -673,7 +673,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)")
|
||||
print("Starting MMLU-Pro Evaluation (Generative/Reasoning Mode)")
|
||||
print(f"{'='*60}")
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
print(f" Few-shot examples: {self.config.num_few_shot}")
|
||||
|
|
@ -788,7 +788,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"MMLU-Pro Evaluation Results")
|
||||
print("MMLU-Pro Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(
|
||||
f"Overall Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_count})"
|
||||
|
|
@ -799,7 +799,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
print(f"Format Compliance: {format_compliance_rate:.4f}")
|
||||
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
|
||||
|
||||
print(f"\nCategory Breakdown:")
|
||||
print("\nCategory Breakdown:")
|
||||
for category, stats in sorted(category_results.items()):
|
||||
if stats["total"] > 0:
|
||||
cat_acc = stats["correct"] / stats["total"]
|
||||
|
|
@ -807,7 +807,7 @@ class MMLUProEvalEnv(BaseEnv):
|
|||
f" {category}: {cat_acc:.4f} ({stats['correct']}/{stats['total']})"
|
||||
)
|
||||
|
||||
print(f"\nExtraction Method Statistics:")
|
||||
print("\nExtraction Method Statistics:")
|
||||
for method, stats in sorted(
|
||||
extraction_methods.items(), key=lambda x: -x[1]["count"]
|
||||
):
|
||||
|
|
|
|||
|
|
@ -282,7 +282,7 @@ class MuSREvalEnv(BaseEnv):
|
|||
if isinstance(choices_raw, str):
|
||||
try:
|
||||
choices = ast.literal_eval(choices_raw)
|
||||
except:
|
||||
except Exception:
|
||||
choices = []
|
||||
else:
|
||||
choices = choices_raw
|
||||
|
|
@ -301,7 +301,7 @@ class MuSREvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the MuSR dataset and prepare for evaluation."""
|
||||
print(f"\nMuSR Evaluation Setup:")
|
||||
print("\nMuSR Evaluation Setup:")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Max tokens: {self.config.eval_max_tokens}")
|
||||
|
|
@ -495,7 +495,7 @@ class MuSREvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
|
|||
|
|
@ -291,7 +291,7 @@ class OBQAEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self) -> None:
|
||||
"""Load the OpenBookQA dataset and prepare for evaluation."""
|
||||
print(f"\nOpenBookQA Evaluation Setup:")
|
||||
print("\nOpenBookQA Evaluation Setup:")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Eval split: {self.config.eval_split}")
|
||||
|
|
@ -481,7 +481,7 @@ class OBQAEvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -560,7 +560,7 @@ class OBQAEvalEnv(BaseEnv):
|
|||
start_time = time.time()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting OpenBookQA Evaluation")
|
||||
print("Starting OpenBookQA Evaluation")
|
||||
print(f"{'='*60}")
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
@ -641,7 +641,7 @@ class OBQAEvalEnv(BaseEnv):
|
|||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"OpenBookQA Evaluation Results")
|
||||
print("OpenBookQA Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(f"Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")
|
||||
print(f"Answer Extraction Rate: {extraction_rate:.4f}")
|
||||
|
|
|
|||
|
|
@ -199,7 +199,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nOlympiadBench Evaluation Setup (Generative Mode):")
|
||||
print("\nOlympiadBench Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -648,7 +648,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
|
|||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
if subject_metrics:
|
||||
print(f"\n Per-Subject Breakdown:")
|
||||
print("\n Per-Subject Breakdown:")
|
||||
for subject, data in subject_metrics.items():
|
||||
print(
|
||||
f" {subject}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ class PIQAEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the PIQA dataset."""
|
||||
print(f"\nPIQA Evaluation Setup (Generative Mode):")
|
||||
print("\nPIQA Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
|
|||
|
|
@ -154,7 +154,7 @@ class PubMedQAEvalEnv(BaseEnv):
|
|||
if not self._dataset_loaded:
|
||||
await self._load_dataset()
|
||||
|
||||
print(f"\nPubMedQA Evaluation Setup (Generative Mode):")
|
||||
print("\nPubMedQA Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
@ -481,7 +481,7 @@ class PubMedQAEvalEnv(BaseEnv):
|
|||
print(f" Format Compliance: {format_valid / total:.2%}")
|
||||
if self.config.thinking_mode:
|
||||
print(f" Thinking Utilization: {has_thinking / total:.2%}")
|
||||
print(f"\n Per-Answer Breakdown:")
|
||||
print("\n Per-Answer Breakdown:")
|
||||
for answer, data in answer_metrics.items():
|
||||
print(
|
||||
f" {answer}: {data['accuracy']:.2%} ({data['correct']}/{data['total']})"
|
||||
|
|
|
|||
|
|
@ -455,7 +455,7 @@ class SimpleQAEvalEnv(BaseEnv):
|
|||
else "String Matching (Nous)"
|
||||
)
|
||||
|
||||
print(f"\nSimpleQA Evaluation Setup:")
|
||||
print("\nSimpleQA Evaluation Setup:")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Scoring mode: {scoring_mode}")
|
||||
print(f" Max tokens for answer: {self.config.eval_max_tokens}")
|
||||
|
|
@ -627,7 +627,7 @@ class SimpleQAEvalEnv(BaseEnv):
|
|||
break
|
||||
elif attempt < self.config.max_retries - 1:
|
||||
if self.config.full_debug:
|
||||
print(f" Response too short, retrying...")
|
||||
print(" Response too short, retrying...")
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -639,7 +639,7 @@ class SimpleQAEvalEnv(BaseEnv):
|
|||
print(
|
||||
f" Response: {e.response.text[:500] if hasattr(e.response, 'text') else e.response}"
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
if attempt < self.config.max_retries - 1:
|
||||
await asyncio.sleep(self.config.retry_delay)
|
||||
|
|
@ -808,7 +808,7 @@ class SimpleQAEvalEnv(BaseEnv):
|
|||
)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Starting SimpleQA Evaluation")
|
||||
print("Starting SimpleQA Evaluation")
|
||||
print(f"{'='*60}")
|
||||
print(f" Total questions: {len(self.all_eval_items)}")
|
||||
print(f" Scoring mode: {scoring_mode}")
|
||||
|
|
@ -983,7 +983,7 @@ class SimpleQAEvalEnv(BaseEnv):
|
|||
f"Accuracy (if attempted): {eval_metrics['eval/accuracy_if_attempted']:.4f}"
|
||||
)
|
||||
print(f"Not Attempted Rate: {eval_metrics['eval/not_attempted_rate']:.4f}")
|
||||
print(f"\nGrade Distribution:")
|
||||
print("\nGrade Distribution:")
|
||||
print(f" CORRECT: {correct_count} ({100*correct_count/total_count:.1f}%)")
|
||||
print(
|
||||
f" INCORRECT: {incorrect_count} ({100*incorrect_count/total_count:.1f}%)"
|
||||
|
|
@ -1012,7 +1012,7 @@ class SimpleQAEvalEnv(BaseEnv):
|
|||
print(f"Thinking Utilization: {thinking_utilization}/{total_count}")
|
||||
|
||||
if len(sorted_topics) > 0:
|
||||
print(f"\nTop Topics (by count):")
|
||||
print("\nTop Topics (by count):")
|
||||
for topic, stats in sorted_topics[:10]:
|
||||
if stats["total"] > 0:
|
||||
topic_acc = stats["correct"] / stats["total"]
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ class SIQAEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the SIQA dataset."""
|
||||
print(f"\nSIQA Evaluation Setup (Generative Mode):")
|
||||
print("\nSIQA Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
print(f" Thinking mode: {self.config.thinking_mode}")
|
||||
|
|
|
|||
|
|
@ -172,7 +172,7 @@ class WinoGrandeEvalEnv(BaseEnv):
|
|||
|
||||
async def setup(self):
|
||||
"""Load the WinoGrande dataset."""
|
||||
print(f"\nWinoGrande Evaluation Setup (Generative Mode):")
|
||||
print("\nWinoGrande Evaluation Setup (Generative Mode):")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Subset: {self.config.subset}")
|
||||
print(f" Evaluation split: {self.config.eval_split}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue