mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
update name of eval example
This commit is contained in:
parent
8ec5066998
commit
cb6bf37e68
1 changed file with 0 additions and 0 deletions
170
environments/eval_environments/aime24_hermes_example.py
Normal file
170
environments/eval_environments/aime24_hermes_example.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
import asyncio
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from datasets import load_dataset
|
||||
from latex2sympy2_extended import NormalizationConfig
|
||||
from math_verify import LatexExtractionConfig, parse, verify
|
||||
from math_verify.errors import TimeoutException
|
||||
|
||||
from atroposlib.envs.eval import EvalBase, eval_runner, pass_at_k
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
|
||||
# System prompt that asks the model to reason inside <think> </think> tags
# before giving its final response. Only sent when `use_system_prompt` is set.
hermes_system_prompt = (
    "You are a deep thinking AI, you may use extremely long chains of thought "
    "to deeply consider the problem and deliberate with yourself via systematic "
    "reasoning processes to help come to a correct solution prior to answering. "
    "You should enclose your thoughts and internal monologue inside <think> </think> tags, "
    "and then provide your solution or response to the problem."
)
|
||||
|
||||
|
||||
def score_answer(gold, resp) -> Optional[bool]:
    """Compare a model response against the gold answer using math_verify.

    Args:
        gold: Reference answer text handed to ``parse``.
        resp: Model response text handed to ``parse``.

    Returns:
        The result of ``verify(answer_parsed, gold_parsed)`` when both sides
        parse, or ``None`` when the gold answer yields an empty parse or any
        parse/verify step raises.
    """
    # The same set of failures is treated as "cannot score" at every step.
    unscorable = (
        Exception,
        TimeoutException,
        KeyError,
        TypeError,
        NotImplementedError,
    )

    try:
        gold_parsed = parse(
            gold,
            extraction_mode="first_match",
            extraction_config=[LatexExtractionConfig()],
        )
    except unscorable:
        return None

    # Nothing extracted from the reference: there is nothing to compare with.
    if len(gold_parsed) == 0:
        return None

    try:
        normalization = NormalizationConfig(
            nits=False,
            malformed_operators=False,
            basic_latex=True,
            boxed="all",
            units=True,
        )
        response_extraction = LatexExtractionConfig(
            normalization_config=normalization,
            # Ensures that boxed is tried first
            boxed_match_priority=0,
            try_extract_without_anchor=False,
        )
        answer_parsed = parse(
            resp,
            extraction_config=[response_extraction],
            extraction_mode="first_match",
        )
    except unscorable:
        # Can't parse, so we skip
        return None

    # Reward 1 if the content is the same as the ground truth, 0 otherwise
    try:
        return verify(answer_parsed, gold_parsed)
    except unscorable:
        return None
|
||||
|
||||
|
||||
class AIME24(EvalBase):
    """
    AIME24 Eval Environment

    For each problem, requests chat completions, scores the text following the
    last ``</think>`` tag of every choice with ``score_answer``, and reports a
    pass@k value computed with ``pass_at_k``.

    kwargs:
        use_system_prompt (bool): Whether to use the system prompt in the evaluation.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Answer scoring is submitted to this process pool from run_item so it
        # does not execute on the event loop.
        self.mp_executor = ProcessPoolExecutor(8)

    def setup_data(self):
        """Load the AIME 2024 dataset and keep only the problem/answer fields.

        Returns:
            list: One ``{"problem": ..., "answer": ...}`` dict per dataset row.
        """
        aime_test_data = load_dataset("HuggingFaceH4/aime_2024", split="train")
        return [
            {"problem": item["problem"], "answer": item["answer"]}
            for item in aime_test_data
        ]

    async def run_item(
        self, server: ServerManager, data_item: dict
    ) -> Tuple[dict, list]:
        """
        Generate completions for one AIME problem and score them.

        Args:
            server (ServerManager): Server manager used to obtain a managed
                server for the chat completion request.
            data_item (dict): Item with a "problem" key (the question text)
                and an "answer" key (the reference answer).

        Returns:
            Tuple[dict, list]: A tuple where the first element is a dictionary
                with a single ``"pass@k"`` entry (keyed ``"pass@k:n"`` when the
                number of generations ``n`` differs from ``k``), and the second
                element is a list with one dict per completion choice holding
                the request "messages", the raw "answer" text and its "score"
                (1.0 or 0.0).
        """
        # str() keeps the concatenation below safe if the dataset stores the
        # answer as a non-string value.
        answer = str(data_item["answer"])
        question = data_item["problem"]
        use_sys_prompt = getattr(self, "use_system_prompt", False)

        messages = (
            [{"role": "system", "content": hermes_system_prompt}]
            if use_sys_prompt
            else []
        )
        messages.append({"role": "user", "content": question})

        async with server.managed_server() as managed:
            completion = await self.chat_completion(managed, messages)

        # We are inside a coroutine, so the running loop is the one to use.
        loop = asyncio.get_running_loop()
        gold = answer if "\\boxed" in answer else "\\boxed{" + answer + "}"

        tasks = []
        for choice in completion.choices:
            # A choice without text content is scored as an empty response
            # instead of raising on None.
            content = choice.message.content or ""
            # Only the text after the final </think> tag is scored.
            resp = content.split("</think>")[-1]
            tasks.append(
                loop.run_in_executor(self.mp_executor, score_answer, gold, resp)
            )

        # score_answer may return None (unscorable); that counts as a miss.
        rewards = [1.0 if reward else 0.0 for reward in await asyncio.gather(*tasks)]
        passing = sum(rewards)

        n = self.get_generation_params()["n"]
        # NOTE(review): the __main__ entry point passes pass_at_n /
        # pass_at_n_samples, while this reads `pass_at_k` - confirm against
        # EvalBase that the attribute name is the intended one.
        pass_at_k_val = getattr(self, "pass_at_k", 1)
        acc_at_k = pass_at_k(n, passing, pass_at_k_val)
        print(acc_at_k, n, passing, pass_at_k_val)

        key = f"pass@{pass_at_k_val}"
        if n != pass_at_k_val:
            key += f":{n}"

        samples = [
            {
                "messages": messages,
                "answer": choice.message.content,
                "score": reward,
            }
            for choice, reward in zip(completion.choices, rewards)
        ]
        return {key: acc_at_k}, samples
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Example run: build the environment with its generation settings, then
    # hand it to the eval runner.
    aime24_env = AIME24(
        pass_at_n=1,
        pass_at_n_samples=4,
        temperature=1.0,
        max_tokens=32768,
        use_system_prompt=True,
    )
    asyncio.run(eval_runner(aime24_env))
|
||||
Loading…
Add table
Add a link
Reference in a new issue