diff --git a/eval/eval.py b/eval/eval.py index d2e24555..53571dd0 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -2,6 +2,7 @@ import argparse import asyncio import json import os +import re import time from datetime import datetime from typing import Any, Dict, List @@ -10,6 +11,7 @@ from openai import AsyncOpenAI from tqdm.asyncio import tqdm_asyncio from reasoning_gym.factory import create_dataset +from reasoning_gym.utils import SYSTEM_PROMPTS class AsyncOpenRouterEvaluator: @@ -25,22 +27,33 @@ class AsyncOpenRouterEvaluator: async with self.semaphore: try: completion = await self.client.chat.completions.create( - extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}] + extra_headers=self.extra_headers, + model=self.model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPTS["default"]}, + {"role": "user", "content": prompt}, + ], ) return completion.choices[0].message.content except Exception as e: print(f"Error calling OpenRouter API: {str(e)}") raise + def parse_model_response(self, response: str) -> str: + """Gather the final answer between the <answer> and </answer> tags.""" + match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL) + return match.group(1).strip() if match else response + async def process_single_question(self, entry: Dict, dataset) -> Dict: """Process a single question and return the result.""" response = await self.get_model_response(entry["question"]) - score = dataset.score_answer(answer=response, entry=entry) + answer = self.parse_model_response(response) + score = dataset.score_answer(answer=answer, entry=entry) return { "question": entry["question"], "expected_answer": entry["answer"], - "model_answer": response, + "model_answer": answer, "score": score, "metadata": entry["metadata"], } diff --git a/eval/eval.sh b/eval/eval.sh old mode 100644 new mode 100755 diff --git a/eval/results/summary_openai_o1_20250212_103017.json b/eval/results/summary_openai_o1_20250212_103017.json new file mode 100644 index 
00000000..b85dc37f --- /dev/null +++ b/eval/results/summary_openai_o1_20250212_103017.json @@ -0,0 +1,61 @@ +[ + { + "dataset_name": "letter_counting", + "model": "openai/o1", + "average_score": 0.99, + "total_examples": 50, + "timestamp": "2025-02-12T10:26:39.897674", + "config": { + "min_words": 5, + "max_words": 15, + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "openai/o1", + "average_score": 0.010000000000000004, + "total_examples": 50, + "timestamp": "2025-02-12T10:27:45.054740", + "config": { + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "openai/o1", + "average_score": 0.802, + "total_examples": 50, + "timestamp": "2025-02-12T10:28:06.199253", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "group_anagrams", + "model": "openai/o1", + "average_score": 0.94, + "total_examples": 50, + "timestamp": "2025-02-12T10:30:02.084562", + "config": { + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "spell_backward", + "model": "openai/o1", + "average_score": 0.9802000000000001, + "total_examples": 50, + "timestamp": "2025-02-12T10:30:17.839014", + "config": { + "size": 50, + "seed": 42 + } + } +] \ No newline at end of file diff --git a/reasoning_gym/utils.py b/reasoning_gym/utils.py index 320a553d..457004ce 100644 --- a/reasoning_gym/utils.py +++ b/reasoning_gym/utils.py @@ -4,12 +4,15 @@ from decimal import Decimal, InvalidOperation from fractions import Fraction from typing import Any, Optional, Union -# DeepSeek Zero system prompt SYSTEM_PROMPTS = { "DeepSeekZero": """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. 
The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer> -""" +""", + "default": """Given a problem, your task is to answer the question by thinking step-by-step in a clear and specific manner. +Once you have thought about the reasoning process, provide the answer in the following format: +<answer> answer here </answer> +""", }