hopefully final linter fixes lol

teknium 2025-12-24 23:36:36 +00:00
parent 67869c3a79
commit 85296c519e
29 changed files with 76 additions and 155 deletions

View file

@@ -24,6 +24,8 @@ Supports thinking mode with <think></think> tags for extended reasoning.
import asyncio
import random
import re
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional
import wandb
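
These first two hunks add ProcessPoolExecutor (plus a typing import) at module scope; the call site isn't shown in the diff, but the usual reason an async eval loop needs it is to push CPU-bound scoring off the event loop. A minimal, self-contained sketch of that pattern (function and values are illustrative, not from this repo):

import asyncio
from concurrent.futures import ProcessPoolExecutor

def score(x: int) -> int:
    # stand-in for CPU-bound answer checking
    return x * x

async def main() -> None:
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as pool:
        # run_in_executor keeps the event loop responsive while a
        # worker process does the heavy lifting
        result = await loop.run_in_executor(pool, score, 7)
        print(result)  # -> 49

if __name__ == "__main__":
    asyncio.run(main())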

View file

@@ -18,6 +18,7 @@ Supports thinking mode with <think></think> tags for extended reasoning.
import asyncio
import random
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional
import wandb

View file

@@ -28,7 +28,7 @@ import ast
import asyncio
import json
import re
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset

View file

@@ -30,6 +30,7 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,

View file

@@ -20,6 +20,7 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import re
from typing import Dict, List, Optional, Tuple
import wandb

View file

@@ -125,7 +125,7 @@ def extract_letter_from_answer_tag(
content_to_check = answer_content
prefix_match = ANSWER_PREFIX_PATTERN.match(content_to_check)
if prefix_match:
content_to_check = content_to_check[prefix_match.end() :].strip()
content_to_check = content_to_check[prefix_match.end() :].strip() # noqa: E203
if debug:
print(f" Stripped prefix, remaining content: '{content_to_check}'")
@@ -304,7 +304,7 @@ def extract_number_from_answer_tag(
content_to_check = answer_content
prefix_match = ANSWER_PREFIX_PATTERN.match(content_to_check)
if prefix_match:
content_to_check = content_to_check[prefix_match.end() :].strip()
content_to_check = content_to_check[prefix_match.end() :].strip() # noqa: E203
if debug:
print(f" Stripped prefix, remaining content: '{content_to_check}'")
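
Both hunks above add the same suppression: Black formats a slice whose lower bound is a call as x[expr :], and flake8's E203 (whitespace before ':') flags exactly that, so each line gets a targeted # noqa: E203. A runnable sketch of the conflict (the regex body here is made up; only the slice style mirrors the code above):

import re

ANSWER_PREFIX_PATTERN = re.compile(r"(?:Answer|ANSWER)\s*[:\-]\s*")

content_to_check = "Answer: B"
prefix_match = ANSWER_PREFIX_PATTERN.match(content_to_check)
if prefix_match:
    # Black inserts the space before ':' because the bound is a call;
    # the trailing comment tells flake8 to skip E203 on this line only.
    content_to_check = content_to_check[prefix_match.end() :].strip()  # noqa: E203
print(content_to_check)  # -> "B"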

View file

@@ -18,18 +18,14 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
compare_math_strings,
create_system_content,
extract_boxed_answers,
extract_thinking_content,
@@ -47,8 +43,7 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template following lighteval's structure
# Added boxed instruction for consistency with our math verification
@@ -329,7 +324,7 @@ class GSM8KEvalEnv(BaseEnv):
print(f"Extracted: {extracted_answer}")
print(f"Correct: {is_correct} (method: {method})")
if has_multiple_boxed:
print(f"WARNING: Multiple \\boxed{{}} found - marked incorrect")
print("WARNING: Multiple \\boxed{} found - marked incorrect")
return {
"item_id": item["id"],

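The print pair above is flake8's F541 (f-string without any placeholders): once you notice that {{}} is just an escaped brace pair, the literal has no replacement fields left, so the f prefix is dropped. A short sketch of the rule:

count = 2
# F541: no real placeholder, so a plain literal is correct
print("WARNING: Multiple \\boxed{} found - marked incorrect")
# an f-string is still right once an expression appears in braces
print(f"Found {count} \\boxed{{}} expressions")  # -> Found 2 \boxed{} expressions
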
View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
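
This hunk, like the matching ones in most files below, deletes names flake8 reports as F401 (imported but unused) — here EvalHandlingEnum, elsewhere random, re, time, and narrowed typing imports. A standalone sketch of the rule:

# F401 flags imports that are never referenced in the module
import re          # referenced below, so it stays
# import time      # never used -> deleted, as in the hunks above

def has_boxed(s: str) -> bool:
    return re.search(r"\\boxed\{", s) is not None

print(has_boxed("\\boxed{42}"))  # -> True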

View file

@@ -17,10 +17,8 @@ Note: This implementation uses the text-only questions (filters out image questi
"""
import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
@@ -40,12 +38,11 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template for HLE with answer tag instruction
HLE_PROMPT_TEMPLATE = """Answer the following challenging question. Think step by step and reason carefully before providing your answer.
# noqa: E501
Provide your final answer within <answer></answer> tags.
Example format:
@@ -339,13 +336,13 @@ class HLEEvalEnv(BaseEnv):
return answer, "fallback_pattern"
# Last resort: take the last line/sentence
lines = [l.strip() for l in response.strip().split("\n") if l.strip()]
lines = [line.strip() for line in response.strip().split("\n") if line.strip()]
if lines:
last_line = lines[-1]
# Clean up common prefixes
for prefix in ["Therefore,", "Thus,", "So,", "Hence,"]:
if last_line.startswith(prefix):
last_line = last_line[len(prefix) :].strip()
last_line = last_line[len(prefix) :].strip() # noqa: E203
if debug:
preview = last_line[:50] + "..." if len(last_line) > 50 else last_line
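
The list-comprehension pair above fixes E741 (ambiguous variable name): l is too easy to misread as 1 or I, so it becomes line. With made-up input, the behavior is unchanged:

response = "First line\n\nTherefore, 42"
# same comprehension as the new diff line, with the E741-safe name
lines = [line.strip() for line in response.strip().split("\n") if line.strip()]
print(lines[-1])  # -> "Therefore, 42"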

View file

@@ -31,15 +31,10 @@ import re
import time
from typing import Any, Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio

View file

@@ -918,7 +918,7 @@ class JsonFormat(Instruction):
"""Check the Json format."""
def build_description(self):
self._description_pattern = "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."
self._description_pattern = "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```." # noqa: E501
return self._description_pattern
def get_instruction_args(self):
@@ -1302,7 +1302,7 @@ class EndChecker(Instruction):
)
if self._end_phrase is None:
self._end_phrase = random.choice(_ENDING_OPTIONS)
self._description_pattern = "Finish your response with this exact phrase {ender}. No other words should follow this phrase."
self._description_pattern = "Finish your response with this exact phrase {ender}. No other words should follow this phrase." # noqa: E501
return self._description_pattern.format(ender=self._end_phrase)
def get_instruction_args(self):
@@ -1324,7 +1324,7 @@ class TitleChecker(Instruction):
def build_description(self):
"""Build the instruction description."""
self._description_pattern = "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>."
self._description_pattern = "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>." # noqa: E501
return self._description_pattern
def get_instruction_args(self):
@@ -1390,7 +1390,7 @@ class LetterFrequencyChecker(Instruction):
else:
self._comparison_relation = let_relation
self._description_pattern = "In your response, the letter {letter} should appear {let_relation} {let_frequency} times."
self._description_pattern = "In your response, the letter {letter} should appear {let_relation} {let_frequency} times." # noqa: E501
return self._description_pattern.format(
letter=self._letter,
@@ -1459,7 +1459,7 @@ class LowercaseLettersEnglishChecker(Instruction):
def build_description(self):
"""Build the instruction description."""
self._description_pattern = "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed."
self._description_pattern = "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed." # noqa: E501
return self._description_pattern
def get_instruction_args(self):
@@ -1539,7 +1539,7 @@ class CapitalWordFrequencyChecker(Instruction):
f"{_COMPARISON_RELATION}, but {capital_relation} is given."
)
self._description_pattern = "In your response, words with all capital letters should appear {relation} {frequency} times."
self._description_pattern = "In your response, words with all capital letters should appear {relation} {frequency} times." # noqa: E501
return self._description_pattern.format(
frequency=self._frequency, relation=self._comparison_relation
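
Every pair in this file is the same E501 (line too long) suppression on a description string. The alternative the commit avoids, presumably to keep the diff mechanical, is implicit string concatenation, which stays under the limit with no comment at all; a sketch:

# the commit's approach: suppress the length check per line
DESCRIPTION = "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."  # noqa: E501

# suppression-free alternative: adjacent literals concatenate
DESCRIPTION_WRAPPED = (
    "Entire output should be wrapped in JSON format. "
    "You can use markdown ticks such as ```."
)
assert DESCRIPTION == DESCRIPTION_WRAPPED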

View file

@@ -1604,7 +1604,7 @@ LANGUAGE_CODES = {
_ALPHABETS = "([A-Za-z])"
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" # noqa: E501
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
_DIGITS = "([0-9])"

View file

@@ -34,16 +34,14 @@ import re
import statistics
import traceback
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import numpy as np
import openai
import scipy.stats
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
save_eval_results,
)
from pydantic import Field

View file

@@ -18,12 +18,9 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
@@ -46,7 +43,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt template following lighteval's MATH-500 structure
@@ -54,7 +50,7 @@ MATH500_PROMPT_TEMPLATE = """Solve the following problem. The final line of your
"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
However, for reliable parsing, also put your final answer in \\boxed{{}} format.
# noqa: E501
{problem}"""
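
Here (and in the HLE, judge-prompt, PubMedQA, and SimpleQA templates elsewhere in this commit) the # noqa: E501 lands inside a triple-quoted string. Flake8 matches noqa on physical lines, so the suppression does take effect, but the marker also becomes literal prompt text. A minimal sketch of that side effect, with a hypothetical template:

TEMPLATE = """Solve the following problem, thinking step by step before you answer.  # noqa: E501
{problem}"""

# the check is silenced, but the marker is now part of the prompt
print("# noqa" in TEMPLATE)  # -> True
# a config-level ignore (e.g. extend-ignore or per-file-ignores in
# setup.cfg) would suppress E501 without mutating the string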

View file

@@ -21,12 +21,9 @@ Supports thinking mode with <think></think> tags for extended reasoning.
"""
import asyncio
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional
import wandb
from datasets import load_dataset
@@ -49,7 +46,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# All available MATH subsets

View file

@@ -31,14 +31,12 @@ import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
@@ -58,7 +56,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Prompt construction helpers
@@ -125,9 +122,9 @@ def judge_freeform_prompt(question: str, answer: str, gold: str) -> List[Dict]:
"content": f"""You will be provided with a question, its golden answer(s), and the model's answer, while the context of the question is not given here. Your task is to judge how correct the model's answer is based on the golden answer(s), without seeing the context of the question, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Your should first briefly give your reasoning process regarding how the model's answer conforms to or contradicts the golden answer(s), and then give the correctness score. The correctness score must strictly follow this format: "[[score]]", e.g., "The correctness score: [[0.5]]".
Note that each one of the golden answers is considered correct. Thus if the model's answer matches any one of the golden answers, it should be considered correct. Judge the below case, give the brief reasoning process and the correctness score.
# noqa: E501
Question: {question}
Golden Answer(s): {gold}
Golden Answer(s): {gold} # noqa: E501
Model's Answer: {answer}
Your Judgment:
""",
@@ -150,7 +147,7 @@ def judge_multichoice_prompt(
"content": f"""You will be provided with a multiple-choice question, its options, the gold answer, and the model's answer, while the context of the question is not given here. Your task is to extract or judge which option is chosen by the model based on its response, and to determine whether or not the model answered correctly. The model scores can either be 0 (incorrect) or 1 (correct). The correctness score must strictly follow this format: "[[score]]", e.g., "The correctness score: [[1]]".
Question: {question}
Options:
Options: # noqa: E501
{parsed_options}
Golden Answer: {gold}
Model's Answer: {answer}

View file

@@ -18,22 +18,16 @@ Supports optional thinking mode with <think></think> tags for extended reasoning
import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -879,13 +873,13 @@ class MMLUEvalEnv(BaseEnv):
status = "✓" if is_correct else "✗"
format_status = "✓" if format_valid else "✗"
print(
f" [{status}] {subject}: gold={gold_letter}, extracted={extracted_answer} ({extraction_method}), format={format_status}"
f" [{status}] {subject}: gold={gold_letter}, extracted={extracted_answer} ({extraction_method}), format={format_status}" # noqa: E501
)
return {"is_correct": is_correct, "sample": sample}
except Exception as e:
if self.config.full_debug:
if self.config.full_debug: # noqa: E501
print(f"Error in rollout_and_score_eval: {e}")
import traceback

View file

@@ -23,22 +23,16 @@ Supports optional thinking mode with <think></think> tags for extended reasoning
import asyncio
import os
import random
import re
import time
from string import ascii_uppercase
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio

View file

@@ -24,13 +24,11 @@ import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
THINK_CONTENT_AFTER_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
@@ -50,7 +48,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# MT-Bench categories
@@ -87,7 +84,7 @@ Your job is to evaluate a task carried out by an AI system powered by a large la
You will be provided with the inputs and output of the task, as well as the evaluation criteria and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation criteria and scoring rubric provided.
# INPUT
Below are the inputs required for performing the task:
Below are the inputs required for performing the task: # noqa: E501
<inputs>
{question}
</inputs>
@@ -116,11 +113,11 @@ How well the response answers the question?{' ' + reference_text if reference_te
1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. Review the evaluation criteria and scoring rubric to understand the different levels of performance and the descriptions for each score.
2. Review the inputs and output: Look at the inputs provided for the task. Examine the output generated from completing the task.
3. Compare output to score descriptions: Compare the output against the criteria and score descriptions in the scoring rubric. For each criterion, decide which description best matches the output.
4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score.
5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric.
6. Assign a final score based on the scoring rubric.
## FORMAT FOR THE EVALUATION
4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score. # noqa: E501
5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric. # noqa: E501
6. Assign a final score based on the scoring rubric. # noqa: E501
# noqa: E501
## FORMAT FOR THE EVALUATION # noqa: E501
- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
- Write the numeric score inside <score> tags, without any additional surrounding text and always after the feedback.
@@ -309,7 +306,7 @@ class MTBenchEvalEnv(BaseEnv):
if not self._dataset_loaded:
await self._load_dataset()
print(f"\nMT-Bench Evaluation Setup (Multi-Turn with LLM Judge):")
print("\nMT-Bench Evaluation Setup (Multi-Turn with LLM Judge):")
print(f" Dataset: {self.config.dataset_name}")
print(f" Categories: {self.config.categories or 'all'}")
print(f" Evaluation split: {self.config.eval_split}")
@@ -556,7 +553,7 @@
judge_question = turn_prompt
else:
# For turn 2, include context from turn 1
judge_question = f"Context from previous turn:\nUser: {turns[0]}\nAssistant: {turn_responses[0]}\n\nCurrent turn:\nUser: {turn_prompt}"
judge_question = f"Context from previous turn:\nUser: {turns[0]}\nAssistant: {turn_responses[0]}\n\nCurrent turn:\nUser: {turn_prompt}" # noqa: E501
# Get reference for this turn if available
turn_reference = (
@@ -690,12 +687,12 @@
if self.config.thinking_mode:
print(f" Format Compliance (T1): {format_valid_t1 / total:.2%}")
print(f" Format Compliance (T2): {format_valid_t2 / total:.2%}")
print(f"\n Per-Category Breakdown:")
print("\n Per-Category Breakdown:")
for cat, data in sorted(
category_metrics.items(), key=lambda x: -x[1]["avg_score"]
):
print(
f" {cat}: {data['avg_score']:.2f} (T1: {data['avg_turn_1']:.2f}, T2: {data['avg_turn_2']:.2f}) [{data['count']} items]"
f" {cat}: {data['avg_score']:.2f} (T1: {data['avg_turn_1']:.2f}, T2: {data['avg_turn_2']:.2f}) [{data['count']} items]" # noqa: E501
)
print(f"{'='*60}\n")
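
Note the contrast within this file: the two F541 pairs drop an f-prefix that formats nothing, while the closing print keeps its prefix because {'='*60} is a real expression field. The distinction in two lines:

print(f"{'='*60}")                   # genuine replacement field: keep the f
print("\n Per-Category Breakdown:")  # no fields at all: plain literal (F541)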

View file

@@ -25,17 +25,13 @@ import asyncio
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
extract_number_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -291,7 +287,7 @@ class MuSREvalEnv(BaseEnv):
valid_numbers = ", ".join(str(i + 1) for i in range(num_choices))
query = "Read the narrative and answer the question. Think step by step before answering.\n\n"
query += f"Provide your final answer within <answer></answer> tags, containing only the number ({valid_numbers}).\n\n"
query += f"Provide your final answer within <answer></answer> tags, containing only the number ({valid_numbers}).\n\n" # noqa: E501
query += "Example format:\n<answer>1</answer>\n\n"
query += f"{narrative}\n\n{question}\n\n"
for i, choice in enumerate(choices):

View file

@@ -23,18 +23,13 @@ import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -278,8 +273,8 @@ class OBQAEvalEnv(BaseEnv):
num_choices = len(choice_texts)
valid_letters = ", ".join(ascii_uppercase[:num_choices])
query = "Answer the following multiple choice question about common sense. Think step by step before answering.\n\n"
query += f"Provide your final answer within <answer></answer> tags, containing only the letter ({valid_letters}).\n\n"
query = "Answer the following multiple choice question about common sense. Think step by step before answering.\n\n" # noqa: E501
query += f"Provide your final answer within <answer></answer> tags, containing only the letter ({valid_letters}).\n\n" # noqa: E501
query += "Example format:\n<answer>A</answer>\n\n"
query += f"Question: {question}\n"
@@ -542,7 +537,7 @@ class OBQAEvalEnv(BaseEnv):
if self.config.full_debug:
status = "✓" if is_correct else "✗"
print(
f" [{status}] Q: {eval_item.get('question_stem', '')[:50]}... | Pred: {extracted_answer}, Gold: {gold_answer}"
f" [{status}] Q: {eval_item.get('question_stem', '')[:50]}... | Pred: {extracted_answer}, Gold: {gold_answer}" # noqa: E501
)
return {"result": {"correct": is_correct}, "sample": sample}

View file

@@ -18,10 +18,8 @@ Theorem proving (TP) problems are not included as they require different evaluat
"""
import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
@@ -41,7 +39,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Available text-only subsets in OlympiadBench
@@ -219,7 +216,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
if self.config.subset not in AVAILABLE_SUBSETS:
print(
f"Warning: Subset '{self.config.subset}' may not be text-only. Available text-only subsets: {AVAILABLE_SUBSETS}"
f"Warning: Subset '{self.config.subset}' may not be text-only. Available text-only subsets: {AVAILABLE_SUBSETS}" # noqa: E501
)
try:
@@ -303,7 +300,7 @@ class OlympiadBenchEvalEnv(BaseEnv):
multiple_answer_text += "(单位)"
unit_text = ",注意答案的单位不要放在\\boxed{}"
instruction = f"以下是{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。"
instruction = f"以下是{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。" # noqa: E501
instruction += f"\n\n请将你的最终答案放在<answer></answer>标签中,格式为{multiple_answer_text}{unit_text}"
instruction += "\n\n示例格式:\n<answer>\\boxed{42}</answer>"
else:
@@ -322,8 +319,8 @@ class OlympiadBenchEvalEnv(BaseEnv):
multiple_answer_text += "(unit)"
unit_text = ", note that the unit of the answer should not be included in \\boxed{}"
instruction = f"The following is an open-ended problem from an International {subject} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results."
instruction += f"\n\nProvide your final answer within <answer></answer> tags in the format {multiple_answer_text}{unit_text}."
instruction = f"The following is an open-ended problem from an International {subject} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results." # noqa: E501
instruction += f"\n\nProvide your final answer within <answer></answer> tags in the format {multiple_answer_text}{unit_text}." # noqa: E501
instruction += "\n\nExample format:\n<answer>\\boxed{42}</answer>"
return f"{instruction}\n\n{item['question']}"

View file

@@ -9,7 +9,6 @@ from typing import Dict, List, Optional, Tuple, Union
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
)
from pydantic import Field

View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)

View file

@@ -17,10 +17,8 @@ the gold standard (yes/no/maybe).
"""
import asyncio
import os
import random
import re
import time
from typing import Dict, List, Optional, Tuple
import wandb
@@ -40,7 +38,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)
# Valid answers for PubMedQA
@@ -48,7 +45,7 @@ VALID_ANSWERS = {"yes", "no", "maybe"}
# Prompt template for PubMedQA with answer tag instruction
PUBMEDQA_PROMPT_TEMPLATE = """Answer the following biomedical research question based on the provided context. Think step by step before answering.
PUBMEDQA_PROMPT_TEMPLATE = """Answer the following biomedical research question based on the provided context. Think step by step before answering. # noqa: E501
Provide your final answer within <answer></answer> tags, containing only one of: yes, no, or maybe.

View file

@ -8,9 +8,7 @@ from typing import Dict, List, Optional, Tuple, Union
import wandb
from datasets import load_dataset
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
save_eval_results,
)
from pydantic import Field
from tenacity import (

View file

@@ -29,15 +29,10 @@ import time
from typing import Dict, List, Optional, Tuple
import openai
import wandb
from datasets import load_dataset
from eval_helpers import (
ANSWER_TAG_PATTERN,
create_system_content,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
@@ -59,7 +54,7 @@ Question: {question}"""
# LLM Judge grading template - identical to lighteval's GRADER_TEMPLATE (for optional judge mode)
SIMPLEQA_GRADER_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
SIMPLEQA_GRADER_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. # noqa: E501
First, I will give examples of each grade, and then you will grade a new example.
@@ -69,13 +64,13 @@ Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. # noqa: E501
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. # noqa: E501
The following are examples of INCORRECT predicted answers.
@@ -87,11 +82,11 @@ Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? # noqa: E501
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. # noqa: E501
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. # noqa: E501
The following are examples of NOT_ATTEMPTED predicted answers.
@@ -100,8 +95,8 @@ Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. # noqa: E501
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. # noqa: E501
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
@@ -109,22 +104,22 @@ These predicted answers are all NOT_ATTEMPTED because:
Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". # noqa: E501
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
- Predicted answers "100k" and "113k" are INCORRECT.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. # noqa: E501
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. # noqa: E501
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. # noqa: E501
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". # noqa: E501
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. # noqa: E501
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. # noqa: E501
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. # noqa: E501
- Do not punish for typos in people's name if it's clearly the same name.
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". # noqa: E501
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. # noqa: E501
```
Question: {question}
Gold target: {target}
@@ -734,7 +729,7 @@ class SimpleQAEvalEnv(BaseEnv):
if self.config.full_debug:
status = "✓" if is_correct else "✗"
print(
f" [{status}] {topic[:20]}: exact={match_results['exact_match']}, fuzzy={match_results['fuzzy_match']}"
f" [{status}] {topic[:20]}: exact={match_results['exact_match']}, fuzzy={match_results['fuzzy_match']}" # noqa: E501
)
return {"score": 1.0 if is_correct else 0.0, "sample": sample}
@@ -996,10 +991,10 @@
f"Overall Accuracy: {eval_metrics['eval/accuracy']:.4f} ({correct_count}/{total_count})"
)
print(
f"Exact Match Accuracy: {eval_metrics['eval/exact_match_accuracy']:.4f} ({exact_match_count}/{total_count})"
f"Exact Match Accuracy: {eval_metrics['eval/exact_match_accuracy']:.4f} ({exact_match_count}/{total_count})" # noqa: E501
)
print(
f"Fuzzy Match Accuracy: {eval_metrics['eval/fuzzy_match_accuracy']:.4f} ({fuzzy_match_count}/{total_count})"
f"Fuzzy Match Accuracy: {eval_metrics['eval/fuzzy_match_accuracy']:.4f} ({fuzzy_match_count}/{total_count})" # noqa: E501
)
print(f"\nEvaluation Time: {end_time - start_time:.1f} seconds")
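
The accuracy prints above take the per-line # noqa: E501 route as well; when the long line is an f-string, splitting it into adjacent f-string literals inside the call is the suppression-free equivalent. A sketch with placeholder numbers:

eval_metrics = {"eval/exact_match_accuracy": 0.8125}
exact_match_count, total_count = 13, 16
# adjacent f-strings concatenate, keeping each physical line short
print(
    f"Exact Match Accuracy: {eval_metrics['eval/exact_match_accuracy']:.4f} "
    f"({exact_match_count}/{total_count})"
)  # -> Exact Match Accuracy: 0.8125 (13/16)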

View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)

View file

@@ -20,11 +20,8 @@ Supports optional thinking mode with <think></think> tags.
"""
import asyncio
import os
import re
import time
from string import ascii_uppercase
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import wandb
from datasets import load_dataset
@@ -32,7 +29,6 @@ from eval_helpers import (
build_mcqa_fallback_patterns,
create_system_content,
extract_letter_from_answer_tag,
extract_thinking_content,
get_default_thinking_prompt,
save_eval_results,
validate_thinking_format,
@@ -44,7 +40,6 @@ from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
EvalHandlingEnum,
)