hopefully final linter fixes lol

teknium 2025-12-24 23:36:36 +00:00
parent 67869c3a79
commit 85296c519e
29 changed files with 76 additions and 155 deletions

View file

@@ -24,6 +24,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
import asyncio
import random
import re
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional
import wandb
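
These first two hunks add ProcessPoolExecutor (plus a typing import) at module scope; the call site isn't shown in the diff, but the usual reason an async eval loop needs it is to push CPU-bound scoring off the event loop. A minimal, self-contained sketch of that pattern (function and values are illustrative, not from this repo):

import asyncio
from concurrent.futures import ProcessPoolExecutor

def score(x: int) -> int:
    # stand-in for CPU-bound answer checking
    return x * x

async def main() -> None:
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as pool:
        # run_in_executor keeps the event loop responsive while a
        # worker process does the heavy lifting
        result = await loop.run_in_executor(pool, score, 7)
        print(result)  # -> 49

if __name__ == "__main__":
    asyncio.run(main())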

View file

@@ -18,6 +18,7 @@ Supports thinking mode with <think></think> tags for extended reasoning.
import asyncio
import random
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional
import wandb

View file

@@ -28,7 +28,7 @@ import ast
import asyncio
import json
import re
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset

View file

@@ -30,6 +30,7 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,

View file

@@ -20,6 +20,7 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import re
from typing import Dict, List, Optional, Tuple
import wandb

View file

@@ -125,7 +125,7 @@ def extract_letter_from_answer_tag(
content_to_check = answer_content
prefix_match = ANSWER_PREFIX_PATTERN.match(content_to_check)
if prefix_match:
content_to_check = content_to_check[prefix_match.end() :].strip()
content_to_check = content_to_check[prefix_match.end() :].strip() # noqa: E203
if debug:
print(f" Stripped prefix, remaining content: '{content_to_check}'")
@@ -304,7 +304,7 @@ def extract_number_from_answer_tag(
content_to_check = answer_content
prefix_match = ANSWER_PREFIX_PATTERN.match(content_to_check)
if prefix_match:
content_to_check = content_to_check[prefix_match.end() :].strip()
content_to_check = content_to_check[prefix_match.end() :].strip() # noqa: E203
if debug:
print(f" Stripped prefix, remaining content: '{content_to_check}'")
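
Both hunks above add the same suppression: Black formats a slice whose lower bound is a call as x[expr :], and flake8's E203 (whitespace before ':') flags exactly that, so each line gets a targeted # noqa: E203. A runnable sketch of the conflict (the regex body here is made up; only the slice style mirrors the code above):

import re

ANSWER_PREFIX_PATTERN = re.compile(r"(?:Answer|ANSWER)\s*[:\-]\s*")

content_to_check = "Answer: B"
prefix_match = ANSWER_PREFIX_PATTERN.match(content_to_check)
if prefix_match:
    # Black inserts the space before ':' because the bound is a call;
    # the trailing comment tells flake8 to skip E203 on this line only.
    content_to_check = content_to_check[prefix_match.end() :].strip()  # noqa: E203
print(content_to_check)  # -> "B"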

View file

@@ -18,18 +18,14 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
compare_math_strings,
create_system_content,
extract_boxed_answers,
extract_thinking_content,
@@ -47,8 +43,7 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template following lighteval's structure
# Added boxed instruction for consistency with our math verification
@@ -329,7 +324,7 @@ class GSM8KEvalEnv(BaseEnv):
print(f"Extracted: {extracted_answer}")
print(f"Correct: {is_correct} (method: {method})")
if has_multiple_boxed:
print(f"WARNING: Multiple \\boxed{{}} found - marked incorrect")
print("WARNING: Multiple \\boxed{} found - marked incorrect")
return {
"item_id": item["id"],

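The print pair above is flake8's F541 (f-string without any placeholders): once you notice that {{}} is just an escaped brace pair, the literal has no replacement fields left, so the f prefix is dropped. A short sketch of the rule:

count = 2
# F541: no real placeholder, so a plain literal is correct
print("WARNING: Multiple \\boxed{} found - marked incorrect")
# an f-string is still right once an expression appears in braces
print(f"Found {count} \\boxed{{}} expressions")  # -> Found 2 \boxed{} expressions
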
View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
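
This hunk, like the matching ones in most files below, deletes names flake8 reports as F401 (imported but unused) — here EvalHandlingEnum, elsewhere random, re, time, and narrowed typing imports. A standalone sketch of the rule:

# F401 flags imports that are never referenced in the module
import re          # referenced below, so it stays
# import time      # never used -> deleted, as in the hunks above

def has_boxed(s: str) -> bool:
    return re.search(r"\\boxed\{", s) is not None

print(has_boxed("\\boxed{42}"))  # -> True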

View file

@@ -17,10 +17,8 @@ Note: This implementation uses the text-only questions (filters out image questi
"""
import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
@@ -40,12 +38,11 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template for HLE with answer tag instruction
HLE_PROMPT_TEMPLATE = """Answer the following challenging question. Think step by step and reason carefully before providing your answer.
# noqa: E501
Provide your final answer within <answer></answer> tags.
Example format:
@@ -339,13 +336,13 @@ class HLEEvalEnv(BaseEnv):
return answer, "fallback_pattern"
# Last resort: take the last line/sentence
lines = [l.strip() for l in response.strip().split("\n") if l.strip()]
lines = [line.strip() for line in response.strip().split("\n") if line.strip()]
if lines:
last_line = lines[-1]
# Clean up common prefixes
for prefix in ["Therefore,", "Thus,", "So,", "Hence,"]:
if last_line.startswith(prefix):
last_line = last_line[len(prefix) :].strip()
last_line = last_line[len(prefix) :].strip() # noqa: E203
if debug:
preview = last_line[:50] + "..." if len(last_line) > 50 else last_line
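
The list-comprehension pair above fixes E741 (ambiguous variable name): l is too easy to misread as 1 or I, so it becomes line. With made-up input, the behavior is unchanged:

response = "First line\n\nTherefore, 42"
# same comprehension as the new diff line, with the E741-safe name
lines = [line.strip() for line in response.strip().split("\n") if line.strip()]
print(lines[-1])  # -> "Therefore, 42"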

View file

@@ -31,15 +31,10 @@ import re
import time
from typing import Any, Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio

View file

@@ -918,7 +918,7 @@ class JsonFormat(Instruction):
"""Check the Json format."""
def build_description(self):
self._description_pattern = "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."
self._description_pattern = "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```." # noqa: E501
return self._description_pattern
def get_instruction_args(self):
@@ -1302,7 +1302,7 @@ class EndChecker(Instruction):
)
if self._end_phrase is None:
self._end_phrase = random.choice(_ENDING_OPTIONS)
self._description_pattern = "Finish your response with this exact phrase {ender}. No other words should follow this phrase."
self._description_pattern = "Finish your response with this exact phrase {ender}. No other words should follow this phrase." # noqa: E501
return self._description_pattern.format(ender=self._end_phrase)
def get_instruction_args(self):
@@ -1324,7 +1324,7 @@ class TitleChecker(Instruction):
def build_description(self):
"""Build the instruction description."""
self._description_pattern = "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>."
self._description_pattern = "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>." # noqa: E501
return self._description_pattern
def get_instruction_args(self):
@@ -1390,7 +1390,7 @@ class LetterFrequencyChecker(Instruction):
else:
self._comparison_relation = let_relation
self._description_pattern = "In your response, the letter {letter} should appear {let_relation} {let_frequency} times."
self._description_pattern = "In your response, the letter {letter} should appear {let_relation} {let_frequency} times." # noqa: E501
return self._description_pattern.format(
letter=self._letter,
@@ -1459,7 +1459,7 @@ class LowercaseLettersEnglishChecker(Instruction):
def build_description(self):
"""Build the instruction description."""
self._description_pattern = "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed."
self._description_pattern = "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed." # noqa: E501
return self._description_pattern
def get_instruction_args(self):
@@ -1539,7 +1539,7 @@ class CapitalWordFrequencyChecker(Instruction):
f"{_COMPARISON_RELATION}, but {capital_relation} is given."
)
self._description_pattern = "In your response, words with all capital letters should appear {relation} {frequency} times."
self._description_pattern = "In your response, words with all capital letters should appear {relation} {frequency} times." # noqa: E501
return self._description_pattern.format(
frequency=self._frequency, relation=self._comparison_relation
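
Every pair in this file is the same E501 (line too long) suppression on a description string. The alternative the commit avoids, presumably to keep the diff mechanical, is implicit string concatenation, which stays under the limit with no comment at all; a sketch:

# the commit's approach: suppress the length check per line
DESCRIPTION = "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."  # noqa: E501

# suppression-free alternative: adjacent literals concatenate
DESCRIPTION_WRAPPED = (
    "Entire output should be wrapped in JSON format. "
    "You can use markdown ticks such as ```."
)
assert DESCRIPTION == DESCRIPTION_WRAPPED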

View file

@@ -1604,7 +1604,7 @@ LANGUAGE_CODES = {
_ALPHABETS = "([A-Za-z])"
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" # noqa: E501
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
_DIGITS = "([0-9])"

View file

@@ -34,16 +34,14 @@ import re
import statistics
import traceback
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import numpy as np
import openai
import scipy.stats
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
save_eval_results,
)
from pydantic import Field

View file

@@ -18,12 +18,9 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
@@ -46,7 +43,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template following lighteval's MATH-500 structure
@@ -54,7 +50,7 @@ MATH500_PROMPT_TEMPLATE = """Solve the following problem. The final line of your
"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
However, for reliable parsing, also put your final answer in \\boxed{{}} format.
# noqa: E501
{problem}"""
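
Here (and in the HLE, judge-prompt, PubMedQA, and SimpleQA templates elsewhere in this commit) the # noqa: E501 lands inside a triple-quoted string. Flake8 matches noqa on physical lines, so the suppression does take effect, but the marker also becomes literal prompt text. A minimal sketch of that side effect, with a hypothetical template:

TEMPLATE = """Solve the following problem, thinking step by step before you answer.  # noqa: E501
{problem}"""

# the check is silenced, but the marker is now part of the prompt
print("# noqa" in TEMPLATE)  # -> True
# a config-level ignore (e.g. extend-ignore or per-file-ignores in
# setup.cfg) would suppress E501 without mutating the string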

View file

@@ -21,12 +21,9 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
@@ -49,7 +46,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# All available MATH subsets

View file

@@ -31,14 +31,12 @@ import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
@@ -58,7 +56,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt construction helpers
@@ -125,9 +122,9 @@ def judge_freeform_prompt(question: str, answer: str, gold: str) -> List[Dict]:
"content": f"""You will be provided with a question, its golden answer(s), and the model's answer, while the context of the question is not given here. Your task is to judge how correct the model's answer is based on the golden answer(s), without seeing the context of the question, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Your should first briefly give your reasoning process regarding how the model's answer conforms to or contradicts the golden answer(s), and then give the correctness score. The correctness score must strictly follow this format: "[[score]]", e.g., "The correctness score: [[0.5]]".
Note that each one of the golden answers is considered correct. Thus if the model's answer matches any one of the golden answers, it should be considered correct. Judge the below case, give the brief reasoning process and the correctness score.
# noqa: E501
Question: {question}
Golden Answer(s): {gold}
Golden Answer(s): {gold} # noqa: E501
Model's Answer: {answer}
Your Judgment:
""",
@@ -150,7 +147,7 @@ def judge_multichoice_prompt(
"content": f"""You will be provided with a multiple-choice question, its options, the gold answer, and the model's answer, while the context of the question is not given here. Your task is to extract or judge which option is chosen by the model based on its response, and to determine whether or not the model answered correctly. The model scores can either be 0 (incorrect) or 1 (correct). The correctness score must strictly follow this format: "[[score]]", e.g., "The correctness score: [[1]]".
Question: {question}
Options:
Options: # noqa: E501
{parsed_options}
Golden Answer: {gold}
Model's Answer: {answer}

View file

@@ -18,22 +18,16 @@ Supports optional thinking mode with <think></think> tags for extended reasoning
import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -879,13 +873,13 @@ class MMLUEvalEnv(BaseEnv):
status = "✓" if is_correct else "✗"
format_status = "✓" if format_valid else "✗"
print(
f" [{status}] {subject}: gold={gold_letter}, extracted={extracted_answer} ({extraction_method}), format={format_status}"
f" [{status}] {subject}: gold={gold_letter}, extracted={extracted_answer} ({extraction_method}), format={format_status}" # noqa: E501
)
return {"is_correct": is_correct, "sample": sample}
except Exception as e:
if self.config.full_debug:
if self.config.full_debug: # noqa: E501
print(f"Error in rollout_and_score_eval: {e}")
import traceback

View file

@@ -23,22 +23,16 @@ Supports optional thinking mode with <think></think> tags for extended reasoning
import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio

View file

@@ -24,13 +24,11 @@ import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
@@ -50,7 +48,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# MT-Bench categories
@@ -87,7 +84,7 @@ Your job is to evaluate a task carried out by an AI system powered by a large la
You will be provided with the inputs and output of the task, as well as the evaluation criteria and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation criteria and scoring rubric provided.
# INPUT
Below are the inputs required for performing the task:
Below are the inputs required for performing the task: # noqa: E501
<inputs>
{question}
</inputs>
@@ -116,11 +113,11 @@ How well the response answers the question?{' ' + reference_text if reference_te
1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. Review the evaluation criteria and scoring rubric to understand the different levels of performance and the descriptions for each score.
2. Review the inputs and output: Look at the inputs provided for the task. Examine the output generated from completing the task.
3. Compare output to score descriptions: Compare the output against the criteria and score descriptions in the scoring rubric. For each criterion, decide which description best matches the output.
4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score.
5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric.
6. Assign a final score based on the scoring rubric.
## FORMAT FOR THE EVALUATION
4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score. # noqa: E501
5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric. # noqa: E501
6. Assign a final score based on the scoring rubric. # noqa: E501
# noqa: E501
## FORMAT FOR THE EVALUATION # noqa: E501
- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
- Write the numeric score inside <score> tags, without any additional surrounding text and always after the feedback.
@@ -309,7 +306,7 @@ class MTBenchEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nMT-Bench Evaluation Setup (Multi-Turn with LLM Judge):")
print("\nMT-Bench Evaluation Setup (Multi-Turn with LLM Judge):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Categories: {self.config.categories or 'all'}")
print(f" Evaluation split: {self.config.eval_split}")
@@ -556,7 +553,7 @@
judge_question = turn_prompt
else:
# For turn 2, include context from turn 1
judge_question = f"Context from previous turn:\nUser: {turns[0]}\nAssistant: {turn_responses[0]}\n\nCurrent turn:\nUser: {turn_prompt}"
judge_question = f"Context from previous turn:\nUser: {turns[0]}\nAssistant: {turn_responses[0]}\n\nCurrent turn:\nUser: {turn_prompt}" # noqa: E501
# Get reference for this turn if available
turn_reference = (
@@ -690,12 +687,12 @@
if self.config.thinking_mode:
print(f" Format Compliance (T1): {format_valid_t1 / total:.2%}")
print(f" Format Compliance (T2): {format_valid_t2 / total:.2%}")
print(f"\n Per-Category Breakdown:")
print("\n Per-Category Breakdown:")
for cat, data in sorted(
category_metrics.items(), key=lambda x: -x[1]["avg_score"]
):
print(
f" {cat}: {data['avg_score']:.2f} (T1: {data['avg_turn_1']:.2f}, T2: {data['avg_turn_2']:.2f}) [{data['count']} items]"
f" {cat}: {data['avg_score']:.2f} (T1: {data['avg_turn_1']:.2f}, T2: {data['avg_turn_2']:.2f}) [{data['count']} items]" # noqa: E501
)
print(f"{'='*60}\n")
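
Note the contrast within this file: the two F541 pairs drop an f-prefix that formats nothing, while the closing print keeps its prefix because {'='*60} is a real expression field. The distinction in two lines:

print(f"{'='*60}")                   # genuine replacement field: keep the f
print("\n Per-Category Breakdown:")  # no fields at all: plain literal (F541)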

View file

@@ -25,17 +25,13 @@ import asyncio
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
extract_number_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -291,7 +287,7 @@ class MuSREvalEnv(BaseEnv):
valid_numbers = ", ".join(str(i + 1) for i in range(num_choices))
query = "Read the narrative and answer the question. Think step by step before answering.\n\n"
query += f"Provide your final answer within <answer></answer> tags, containing only the number ({valid_numbers}).\n\n"
query += f"Provide your final answer within <answer></answer> tags, containing only the number ({valid_numbers}).\n\n" # noqa: E501
query += "Example format:\n<answer>1</answer>\n\n"
query += f"{narrative}\n\n{question}\n\n"
for i, choice in enumerate(choices):

View file

@@ -23,18 +23,13 @@ import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -278,8 +273,8 @@ class OBQAEvalEnv(BaseEnv):
num_choices = len(choice_texts)
valid_letters = ", ".join(ascii_uppercase[:num_choices])
query = "Answer the following multiple choice question about common sense. Think step by step before answering.\n\n"
query += f"Provide your final answer within <answer></answer> tags, containing only the letter ({valid_letters}).\n\n"
query = "Answer the following multiple choice question about common sense. Think step by step before answering.\n\n" # noqa: E501
query += f"Provide your final answer within <answer></answer> tags, containing only the letter ({valid_letters}).\n\n" # noqa: E501
query += "Example format:\n<answer>A</answer>\n\n"
query += f"Question: {question}\n"
@@ -542,7 +537,7 @@ class OBQAEvalEnv(BaseEnv):
if self.config.full_debug:
status = "✓" if is_correct else "✗"
print(
f" [{status}] Q: {eval_item.get('question_stem', '')[:50]}... | Pred: {extracted_answer}, Gold: {gold_answer}"
f" [{status}] Q: {eval_item.get('question_stem', '')[:50]}... | Pred: {extracted_answer}, Gold: {gold_answer}" # noqa: E501
)
return {"result": {"correct": is_correct}, "sample": sample}

View file

@@ -18,10 +18,8 @@ Theorem proving (TP) problems are not included as they require different evaluat
"""
import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
@@ -41,7 +39,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Available text-only subsets in OlympiadBench
@@ -219,7 +216,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
if self.config.subset not in AVAILABLE_SUBSETS:
print(
f"Warning: Subset '{self.config.subset}' may not be text-only. Available text-only subsets: {AVAILABLE_SUBSETS}"
f"Warning: Subset '{self.config.subset}' may not be text-only. Available text-only subsets: {AVAILABLE_SUBSETS}" # noqa: E501
)
try:
@@ -303,7 +300,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
multiple_answer_text += "(单位)"
unit_text = ",注意答案的单位不要放在\\boxed{}"
instruction = f"以下是{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。"
instruction = f"以下是{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。" # noqa: E501
instruction += f"\n\n请将你的最终答案放在<answer></answer>标签中,格式为{multiple_answer_text}{unit_text}"
instruction += "\n\n示例格式:\n<answer>\\boxed{42}</answer>"
else:
@@ -322,8 +319,8 @@ class OlympiadBenchEvalEnv(BaseEnv):
multiple_answer_text += "(unit)"
unit_text = ", note that the unit of the answer should not be included in \\boxed{}"
instruction = f"The following is an open-ended problem from an International {subject} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results."
instruction += f"\n\nProvide your final answer within <answer></answer> tags in the format {multiple_answer_text}{unit_text}."
instruction = f"The following is an open-ended problem from an International {subject} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results." # noqa: E501
instruction += f"\n\nProvide your final answer within <answer></answer> tags in the format {multiple_answer_text}{unit_text}." # noqa: E501
instruction += "\n\nExample format:\n<answer>\\boxed{42}</answer>"
return f"{instruction}\n\n{item['question']}"

View file

@@ -9,7 +9,6 @@ from typing import Dict, List, Optional, Tuple, Union
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
)
from pydantic import Field

View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)

View file

@@ -17,10 +17,8 @@ the gold standard (yes/no/maybe).
"""
import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
@@ -40,7 +38,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Valid answers for PubMedQA
@@ -48,7 +45,7 @@ VALID_ANSWERS = {"yes", "no", "maybe"}
# Prompt template for PubMedQA with answer tag instruction
PUBMEDQA_PROMPT_TEMPLATE = """Answer the following biomedical research question based on the provided context. Think step by step before answering.
PUBMEDQA_PROMPT_TEMPLATE = """Answer the following biomedical research question based on the provided context. Think step by step before answering. # noqa: E501
Provide your final answer within <answer></answer> tags, containing only one of: yes, no, or maybe.

View file

@ -8,9 +8,7 @@ from typing import Dict, List, Optional, Tuple, Union
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
save_eval_results,
)
from pydantic import Field
from tenacity import (

View file

@@ -29,15 +29,10 @@ import time
from typing import Dict, List, Optional, Tuple
import openai
import wandb
from datasets import load_dataset
from eval_helpers import (
ANSWER_TAG_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -59,7 +54,7 @@ Question: {question}"""
# LLM Judge grading template - identical to lighteval's GRADER_TEMPLATE (for optional judge mode)
SIMPLEQA_GRADER_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
SIMPLEQA_GRADER_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. # noqa: E501
First, I will give examples of each grade, and then you will grade a new example.
@@ -69,13 +64,13 @@ Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. # noqa: E501
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. # noqa: E501
The following are examples of INCORRECT predicted answers.
@@ -87,11 +82,11 @@ Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? # noqa: E501
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. # noqa: E501
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. # noqa: E501
The following are examples of NOT_ATTEMPTED predicted answers.
@@ -100,8 +95,8 @@ Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. # noqa: E501
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. # noqa: E501
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
@@ -109,22 +104,22 @@ These predicted answers are all NOT_ATTEMPTED because:
Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". # noqa: E501
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
- Predicted answers "100k" and "113k" are INCORRECT.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. # noqa: E501
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. # noqa: E501
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. # noqa: E501
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". # noqa: E501
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. # noqa: E501
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. # noqa: E501
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. # noqa: E501
- Do not punish for typos in people's name if it's clearly the same name.
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". # noqa: E501
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. # noqa: E501
```
Question: {question}
Gold target: {target}
@@ -734,7 +729,7 @@ class SimpleQAEvalEnv(BaseEnv):
if self.config.full_debug:
status = "✓" if is_correct else "✗"
print(
f" [{status}] {topic[:20]}: exact={match_results['exact_match']}, fuzzy={match_results['fuzzy_match']}"
f" [{status}] {topic[:20]}: exact={match_results['exact_match']}, fuzzy={match_results['fuzzy_match']}" # noqa: E501
)
return {"score": 1.0 if is_correct else 0.0, "sample": sample}
@@ -996,10 +991,10 @@
f"Overall Accuracy: {eval_metrics['eval/accuracy']:.4f} ({correct_count}/{total_count})"
)
print(
f"Exact Match Accuracy: {eval_metrics['eval/exact_match_accuracy']:.4f} ({exact_match_count}/{total_count})"
f"Exact Match Accuracy: {eval_metrics['eval/exact_match_accuracy']:.4f} ({exact_match_count}/{total_count})" # noqa: E501
)
print(
f"Fuzzy Match Accuracy: {eval_metrics['eval/fuzzy_match_accuracy']:.4f} ({fuzzy_match_count}/{total_count})"
f"Fuzzy Match Accuracy: {eval_metrics['eval/fuzzy_match_accuracy']:.4f} ({fuzzy_match_count}/{total_count})" # noqa: E501
)
print(f"\nEvaluation Time: {end_time - start_time:.1f} seconds")
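
The accuracy prints above take the per-line # noqa: E501 route as well; when the long line is an f-string, splitting it into adjacent f-string literals inside the call is the suppression-free equivalent. A sketch with placeholder numbers:

eval_metrics = {"eval/exact_match_accuracy": 0.8125}
exact_match_count, total_count = 13, 16
# adjacent f-strings concatenate, keeping each physical line short
print(
    f"Exact Match Accuracy: {eval_metrics['eval/exact_match_accuracy']:.4f} "
    f"({exact_match_count}/{total_count})"
)  # -> Exact Match Accuracy: 0.8125 (13/16)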

View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)

View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)