mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
71 lines
2.1 KiB
Python
71 lines
2.1 KiB
Python
"""
|
|
Tests for the get_score_answer_fn helper with hard-coded sample cases.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from reasoning_gym import get_score_answer_fn
|
|
|
|
TEST_CASES = [
|
|
{
|
|
"dataset": "letter_jumble",
|
|
"entry": {"answer": "second opportunity to receive"},
|
|
"model_answer": "second opportunity to receive",
|
|
"expected": 1.0,
|
|
"id": "rg_4806-correct",
|
|
},
|
|
{
|
|
"dataset": "word_sorting",
|
|
"entry": {
|
|
"answer": "arrive, burdens, computers, federal, louder, paragraphs, side, specified, virus",
|
|
"metadata": {
|
|
"sorted_words": [
|
|
"arrive",
|
|
"burdens",
|
|
"computers",
|
|
"federal",
|
|
"louder",
|
|
"paragraphs",
|
|
"side",
|
|
"specified",
|
|
"virus",
|
|
]
|
|
},
|
|
},
|
|
"model_answer": "arrive, burdens, computers, federal, louder, paragraphs, side, specified, virus",
|
|
"expected": 1.0,
|
|
"id": "rg_16004-word_sorting-correct",
|
|
},
|
|
{
|
|
"dataset": "spell_backward",
|
|
"entry": {"answer": "ssiknu"},
|
|
"model_answer": "ssiknu",
|
|
"expected": 1.0,
|
|
"id": "rg_14211-correct",
|
|
},
|
|
{
|
|
"dataset": "letter_jumble",
|
|
"entry": {"answer": "second opportunity to receive"},
|
|
"model_answer": "completely wrong answer here",
|
|
"expected": 0.0,
|
|
"id": "rg_4806-incorrect",
|
|
},
|
|
{
|
|
"dataset": "spell_backward",
|
|
"entry": {"answer": "ssiknu"},
|
|
"model_answer": "unkiss",
|
|
"expected": 0.0,
|
|
"id": "rg_14211-incorrect",
|
|
},
|
|
]
|
|
|
|
|
|
@pytest.mark.parametrize("case", TEST_CASES, ids=lambda c: c["id"])
|
|
def test_get_score_answer_fn_hardcoded(case):
|
|
"""
|
|
Ensure the dataset-specific scorer returns the expected value
|
|
for the given model answer and entry.
|
|
"""
|
|
scorer = get_score_answer_fn(case["dataset"])
|
|
returned = scorer(case["model_answer"], case["entry"])
|
|
assert returned == pytest.approx(case["expected"], abs=1e-8)
|