mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-05-01 17:45:24 +00:00
system prompt for structured output, and parse such outputs
This commit is contained in:
parent
56ba500959
commit
3d84816f95
4 changed files with 82 additions and 5 deletions
19
eval/eval.py
19
eval/eval.py
|
|
@ -2,6 +2,7 @@ import argparse
|
|||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List
|
||||
|
|
@ -10,6 +11,7 @@ from openai import AsyncOpenAI
|
|||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
||||
from reasoning_gym.factory import create_dataset
|
||||
from reasoning_gym.utils import SYSTEM_PROMPTS
|
||||
|
||||
|
||||
class AsyncOpenRouterEvaluator:
|
||||
|
|
@ -25,22 +27,33 @@ class AsyncOpenRouterEvaluator:
|
|||
async with self.semaphore:
|
||||
try:
|
||||
completion = await self.client.chat.completions.create(
|
||||
extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}]
|
||||
extra_headers=self.extra_headers,
|
||||
model=self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": SYSTEM_PROMPTS["default"]},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
except Exception as e:
|
||||
print(f"Error calling OpenRouter API: {str(e)}")
|
||||
raise
|
||||
|
||||
def parse_model_response(self, response: str) -> str:
|
||||
"""Gather the final answer between the <answer> and </answer> tags."""
|
||||
match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
|
||||
return match.group(1).strip() if match else response
|
||||
|
||||
async def process_single_question(self, entry: Dict, dataset) -> Dict:
|
||||
"""Process a single question and return the result."""
|
||||
response = await self.get_model_response(entry["question"])
|
||||
score = dataset.score_answer(answer=response, entry=entry)
|
||||
answer = self.parse_model_response(response)
|
||||
score = dataset.score_answer(answer=answer, entry=entry)
|
||||
|
||||
return {
|
||||
"question": entry["question"],
|
||||
"expected_answer": entry["answer"],
|
||||
"model_answer": response,
|
||||
"model_answer": answer,
|
||||
"score": score,
|
||||
"metadata": entry["metadata"],
|
||||
}
|
||||
|
|
|
|||
0
eval/eval.sh
Normal file → Executable file
0
eval/eval.sh
Normal file → Executable file
61
eval/results/summary_openai_o1_20250212_103017.json
Normal file
61
eval/results/summary_openai_o1_20250212_103017.json
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
[
|
||||
{
|
||||
"dataset_name": "letter_counting",
|
||||
"model": "openai/o1",
|
||||
"average_score": 0.99,
|
||||
"total_examples": 50,
|
||||
"timestamp": "2025-02-12T10:26:39.897674",
|
||||
"config": {
|
||||
"min_words": 5,
|
||||
"max_words": 15,
|
||||
"size": 50,
|
||||
"seed": 42
|
||||
}
|
||||
},
|
||||
{
|
||||
"dataset_name": "propositional_logic",
|
||||
"model": "openai/o1",
|
||||
"average_score": 0.010000000000000004,
|
||||
"total_examples": 50,
|
||||
"timestamp": "2025-02-12T10:27:45.054740",
|
||||
"config": {
|
||||
"size": 50,
|
||||
"seed": 42
|
||||
}
|
||||
},
|
||||
{
|
||||
"dataset_name": "leg_counting",
|
||||
"model": "openai/o1",
|
||||
"average_score": 0.802,
|
||||
"total_examples": 50,
|
||||
"timestamp": "2025-02-12T10:28:06.199253",
|
||||
"config": {
|
||||
"min_animals": 3,
|
||||
"max_animals": 8,
|
||||
"size": 50,
|
||||
"seed": 42
|
||||
}
|
||||
},
|
||||
{
|
||||
"dataset_name": "group_anagrams",
|
||||
"model": "openai/o1",
|
||||
"average_score": 0.94,
|
||||
"total_examples": 50,
|
||||
"timestamp": "2025-02-12T10:30:02.084562",
|
||||
"config": {
|
||||
"size": 50,
|
||||
"seed": 42
|
||||
}
|
||||
},
|
||||
{
|
||||
"dataset_name": "spell_backward",
|
||||
"model": "openai/o1",
|
||||
"average_score": 0.9802000000000001,
|
||||
"total_examples": 50,
|
||||
"timestamp": "2025-02-12T10:30:17.839014",
|
||||
"config": {
|
||||
"size": 50,
|
||||
"seed": 42
|
||||
}
|
||||
}
|
||||
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue