diff --git a/eval/eval.py b/eval/eval.py
index d2e24555..53571dd0 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -2,6 +2,7 @@ import argparse
import asyncio
import json
import os
+import re
import time
from datetime import datetime
from typing import Any, Dict, List
@@ -10,6 +11,7 @@ from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from reasoning_gym.factory import create_dataset
+from reasoning_gym.utils import SYSTEM_PROMPTS
class AsyncOpenRouterEvaluator:
@@ -25,22 +27,33 @@ class AsyncOpenRouterEvaluator:
async with self.semaphore:
try:
completion = await self.client.chat.completions.create(
- extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}]
+ extra_headers=self.extra_headers,
+ model=self.model,
+ messages=[
+ {"role": "system", "content": SYSTEM_PROMPTS["default"]},
+ {"role": "user", "content": prompt},
+ ],
)
return completion.choices[0].message.content
except Exception as e:
print(f"Error calling OpenRouter API: {str(e)}")
raise
+ def parse_model_response(self, response: str) -> str:
+ """Gather the final answer between the <answer> and </answer> tags."""
+ match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+ return match.group(1).strip() if match else response
+
async def process_single_question(self, entry: Dict, dataset) -> Dict:
"""Process a single question and return the result."""
response = await self.get_model_response(entry["question"])
- score = dataset.score_answer(answer=response, entry=entry)
+ answer = self.parse_model_response(response)
+ score = dataset.score_answer(answer=answer, entry=entry)
return {
"question": entry["question"],
"expected_answer": entry["answer"],
- "model_answer": response,
+ "model_answer": answer,
"score": score,
"metadata": entry["metadata"],
}
diff --git a/eval/eval.sh b/eval/eval.sh
old mode 100644
new mode 100755
diff --git a/eval/results/summary_openai_o1_20250212_103017.json b/eval/results/summary_openai_o1_20250212_103017.json
new file mode 100644
index 00000000..b85dc37f
--- /dev/null
+++ b/eval/results/summary_openai_o1_20250212_103017.json
@@ -0,0 +1,61 @@
+[
+ {
+ "dataset_name": "letter_counting",
+ "model": "openai/o1",
+ "average_score": 0.99,
+ "total_examples": 50,
+ "timestamp": "2025-02-12T10:26:39.897674",
+ "config": {
+ "min_words": 5,
+ "max_words": 15,
+ "size": 50,
+ "seed": 42
+ }
+ },
+ {
+ "dataset_name": "propositional_logic",
+ "model": "openai/o1",
+ "average_score": 0.010000000000000004,
+ "total_examples": 50,
+ "timestamp": "2025-02-12T10:27:45.054740",
+ "config": {
+ "size": 50,
+ "seed": 42
+ }
+ },
+ {
+ "dataset_name": "leg_counting",
+ "model": "openai/o1",
+ "average_score": 0.802,
+ "total_examples": 50,
+ "timestamp": "2025-02-12T10:28:06.199253",
+ "config": {
+ "min_animals": 3,
+ "max_animals": 8,
+ "size": 50,
+ "seed": 42
+ }
+ },
+ {
+ "dataset_name": "group_anagrams",
+ "model": "openai/o1",
+ "average_score": 0.94,
+ "total_examples": 50,
+ "timestamp": "2025-02-12T10:30:02.084562",
+ "config": {
+ "size": 50,
+ "seed": 42
+ }
+ },
+ {
+ "dataset_name": "spell_backward",
+ "model": "openai/o1",
+ "average_score": 0.9802000000000001,
+ "total_examples": 50,
+ "timestamp": "2025-02-12T10:30:17.839014",
+ "config": {
+ "size": 50,
+ "seed": 42
+ }
+ }
+]
\ No newline at end of file
diff --git a/reasoning_gym/utils.py b/reasoning_gym/utils.py
index 320a553d..457004ce 100644
--- a/reasoning_gym/utils.py
+++ b/reasoning_gym/utils.py
@@ -4,12 +4,15 @@ from decimal import Decimal, InvalidOperation
from fractions import Fraction
from typing import Any, Optional, Union
-# DeepSeek Zero system prompt
SYSTEM_PROMPTS = {
"DeepSeekZero": """A conversation between User and Assistant. The user asks a question, and the Assistant solves it.
The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>
<answer> answer here </answer>
-"""
+""",
+ "default": """Given a problem, your task is to answer the question by thinking step-by-step in a clear and specific manner.
+Once you have thought about the reasoning process, provide the answer in the following format:
+<answer> answer here </answer>
+""",
}