Merge pull request #31 from NousResearch/fix-math-evals-due-to-updated-dataset

fix olympiadbench due to upstream changes
This commit is contained in:
dmahan93 2025-05-09 09:42:06 -05:00 committed by GitHub
commit b959c30ebf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -9,6 +9,7 @@ import re
 from concurrent.futures import ProcessPoolExecutor
 from typing import Dict, List, Optional, Tuple

+import wandb
 from datasets import load_dataset
 from latex2sympy2_extended import NormalizationConfig
 from math_verify import LatexExtractionConfig, parse, verify
@@ -16,13 +17,12 @@ from math_verify.errors import TimeoutException
 from pydantic import Field
 from tqdm.asyncio import tqdm_asyncio

-import wandb
 from atroposlib.envs.base import (
     BaseEnv,
     BaseEnvConfig,
     EvalHandlingEnum,
-    OpenaiConfig,
     ScoredDataGroup,
+    ServerBaseline,
 )

 prompt_format = (
@@ -115,7 +115,7 @@ class MathEnv(BaseEnv):
     def __init__(
         self,
         config: RSConfig,
-        server_configs: List[OpenaiConfig],
+        server_configs: ServerBaseline,
         slurm=True,
         testing=False,
     ):
@@ -133,7 +133,7 @@ class MathEnv(BaseEnv):
         self.iter = 0

     @classmethod
-    def config_init(cls) -> Tuple[RSConfig, List[OpenaiConfig]]:
+    def config_init(cls) -> Tuple[RSConfig, ServerBaseline]:
         env_config = RSConfig(
             tokenizer_name="Qwen/Qwen2.5-7B",
             group_size=8,
@@ -147,14 +147,10 @@ class MathEnv(BaseEnv):
             eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
             eval_limit_ratio=0.1,
         )
-        server_configs = [
-            OpenaiConfig(
-                model_name="default",
-                base_url="http://localhost:9004/v1",
-                api_key="x",
-                num_requests_for_eval=256,  # since evaling only on one...
-            ),
-        ]
+        server_configs = ServerBaseline(
+            model_name="default",
+            num_requests_for_eval=256,  # since evaling only on one...
+        )
         return env_config, server_configs
@@ -222,8 +218,8 @@ class MathEnv(BaseEnv):
             )
         )
         for name, t_dataset in zip(
-            ["amc23", "minerva", "olympiad"],
-            [amc_test_data, minerva_test_data, olympiad_test_data],
+            ["amc23", "minerva"],
+            [amc_test_data, minerva_test_data],
         ):
             for item in t_dataset:
                 self.test.append(
@@ -235,6 +231,17 @@ class MathEnv(BaseEnv):
                         name,
                     )
                 )
+        for name, t_dataset in zip(["olympiad"], [olympiad_test_data]):
+            for item in t_dataset:
+                self.test.append(
+                    (
+                        prompt_format.format(
+                            prompt=problem_format.format(problem=item["question"])
+                        ),
+                        item["final_answer"][0],
+                        name,
+                    )
+                )
         return

     async def rollout_and_score_eval(self, question, answer, subset):