diff --git a/eval/eval.py b/eval/eval.py
index d2e24555..53571dd0 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -2,6 +2,7 @@ import argparse
 import asyncio
 import json
 import os
+import re
 import time
 from datetime import datetime
 from typing import Any, Dict, List
@@ -10,6 +11,7 @@ from openai import AsyncOpenAI
 from tqdm.asyncio import tqdm_asyncio
 
 from reasoning_gym.factory import create_dataset
+from reasoning_gym.utils import SYSTEM_PROMPTS
 
 
 class AsyncOpenRouterEvaluator:
@@ -25,22 +27,33 @@ class AsyncOpenRouterEvaluator:
         async with self.semaphore:
             try:
                 completion = await self.client.chat.completions.create(
-                    extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}]
+                    extra_headers=self.extra_headers,
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": SYSTEM_PROMPTS["default"]},
+                        {"role": "user", "content": prompt},
+                    ],
                 )
                 return completion.choices[0].message.content
             except Exception as e:
                 print(f"Error calling OpenRouter API: {str(e)}")
                 raise
 
+    def parse_model_response(self, response: str) -> str:
+        """Return the stripped text of the first <answer>...</answer> pair, else the response as-is ("" if None)."""
+        match = re.search(r"<answer>(.*?)</answer>", response or "", re.DOTALL)  # API content may be None
+        return match.group(1).strip() if match else (response or "")
+
     async def process_single_question(self, entry: Dict, dataset) -> Dict:
         """Process a single question and return the result."""
         response = await self.get_model_response(entry["question"])
-        score = dataset.score_answer(answer=response, entry=entry)
+        answer = self.parse_model_response(response)  # score the <answer>...</answer> payload when present
+        score = dataset.score_answer(answer=answer, entry=entry)
 
         return {
             "question": entry["question"],
             "expected_answer": entry["answer"],
-            "model_answer": response,
+            "model_answer": answer,  # parsed answer; the raw response is not stored in the result
             "score": score,
             "metadata": entry["metadata"],
         }
diff --git a/eval/eval.sh b/eval/eval.sh
old mode 100644
new mode 100755
diff --git a/eval/results/summary_openai_o1_20250212_103017.json b/eval/results/summary_openai_o1_20250212_103017.json
new file mode 100644
index 00000000..b85dc37f
--- /dev/null
+++ b/eval/results/summary_openai_o1_20250212_103017.json
@@ -0,0 +1,61 @@
+[
+  {
+    "dataset_name": "letter_counting",
+    "model": "openai/o1",
+    "average_score": 0.99,
+    "total_examples": 50,
+    "timestamp": "2025-02-12T10:26:39.897674",
+    "config": {
+      "min_words": 5,
+      "max_words": 15,
+      "size": 50,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "propositional_logic",
+    "model": "openai/o1",
+    "average_score": 0.010000000000000004,
+    "total_examples": 50,
+    "timestamp": "2025-02-12T10:27:45.054740",
+    "config": {
+      "size": 50,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "leg_counting",
+    "model": "openai/o1",
+    "average_score": 0.802,
+    "total_examples": 50,
+    "timestamp": "2025-02-12T10:28:06.199253",
+    "config": {
+      "min_animals": 3,
+      "max_animals": 8,
+      "size": 50,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "group_anagrams",
+    "model": "openai/o1",
+    "average_score": 0.94,
+    "total_examples": 50,
+    "timestamp": "2025-02-12T10:30:02.084562",
+    "config": {
+      "size": 50,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "spell_backward",
+    "model": "openai/o1",
+    "average_score": 0.9802000000000001,
+    "total_examples": 50,
+    "timestamp": "2025-02-12T10:30:17.839014",
+    "config": {
+      "size": 50,
+      "seed": 42
+    }
+  }
+]
\ No newline at end of file
diff --git a/reasoning_gym/utils.py b/reasoning_gym/utils.py
index 320a553d..457004ce 100644
--- a/reasoning_gym/utils.py
+++ b/reasoning_gym/utils.py
@@ -4,12 +4,15 @@ from decimal import Decimal, InvalidOperation
 from fractions import Fraction
 from typing import Any, Optional, Union
 
-# DeepSeek Zero system prompt
 SYSTEM_PROMPTS = {
     "DeepSeekZero": """A conversation between User and Assistant. The user asks a question, and the Assistant solves it.
 The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>
 <answer> answer here </answer>
-"""
+""",  # "DeepSeekZero": reasoning in <think> tags, then the answer in <answer> tags
+    "default": """Given a problem, your task is to answer the question by thinking step-by-step in a clear and specific manner.
+Once you have thought about the reasoning process, provide the answer in the following format:
+<answer> answer here </answer>
+""",  # "default": step-by-step reasoning, then the answer in <answer> tags; used by eval/eval.py
 }