mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
100 lines
3.5 KiB
Python
100 lines
3.5 KiB
Python
import pytest
|
|
|
|
from reasoning_gym.games.wikirace import WikiraceConfig, WikiraceCurriculum, WikiraceDataset
|
|
|
|
|
|
def test_wikirace_game_config_validation():
    """Check that each invalid configuration is rejected with an AssertionError."""
    invalid_kwargs = (
        {"min_distance": 0},  # zero minimum distance
        {"min_distance": 3, "max_distance": 2},  # minimum above maximum
        {"max_tries": -2},  # negative number of tries
    )

    for kwargs in invalid_kwargs:
        # Construction stays inside the context manager, so an assertion raised
        # by either the constructor or validate() satisfies the expectation.
        with pytest.raises(AssertionError):
            rejected = WikiraceConfig(**kwargs)
            rejected.validate()
|
|
|
|
|
|
def test_wikirace_game_deterministic():
    """Check that two datasets built from identical configs yield identical items."""
    # Build each dataset from its own, separately constructed config so the two
    # instances share nothing except the seed and size values.
    first, second = (WikiraceDataset(WikiraceConfig(seed=42, size=2)) for _ in range(2))

    # Compare position by position through indexed access.
    for idx in range(len(first)):
        assert first[idx] == second[idx]
|
|
|
|
|
|
def test_wikirace_game_items():
    """Check the structure of every generated item and how answers to it are scored."""
    config = WikiraceConfig(seed=42, size=2)
    dataset = WikiraceDataset(config)

    top_level_keys = ("question", "answer", "metadata")
    metadata_keys = ("source", "links", "target", "current", "distance")

    def score(answer, entry):
        """Score `answer` against `entry` using the dataset under test."""
        return dataset.score_answer(answer=answer, entry=entry)

    for item in dataset:
        assert isinstance(item, dict)
        for key in top_level_keys:
            assert key in item

        # The metadata must expose every field the checks below rely on.
        metadata = item["metadata"]
        for key in metadata_keys:
            assert key in metadata

        # The reported distance must lie within the configured bounds.
        assert config.min_distance <= metadata["distance"] <= config.max_distance

        # An answer that is not an integer is scored 0.01.
        assert score("nope", item) == 0.01

        # A negative answer is scored 0.01.
        assert score("-1", item) == 0.01

        # An out-of-bounds answer (equal to the number of links) is scored 0.01.
        assert score(str(len(metadata["links"])), item) == 0.01

        # A parsable answer is scored at least 0.1.
        assert score("0", item) >= 0.1

        # The expected answer is scored 1.0.
        assert score(item["answer"], item) == 1.0
|
|
|
|
|
|
def test_wikirace_game_single():
    """Pin the first item generated for seed 42 and the scores of selected answers."""
    dataset = WikiraceDataset(WikiraceConfig(seed=42, size=1))
    item = dataset[0]
    metadata = item["metadata"]

    # Generation snapshot: if these assertions fail, the generation algorithm
    # has probably changed, which is fine -- this test just has to be updated.
    assert metadata["source"] == "Vadim Bakatin"
    assert metadata["target"] == "Azerbaijan Technological University"
    assert metadata["distance"] == 3
    assert len(metadata["path"]) == 0

    # Scoring checks: if these assertions fail, it is most likely an actual error.

    # Link 4 (Moscow) is the only valid answer.
    assert dataset.score_answer(answer="4", entry=item) == 1.0

    # NOTE(review): the original comments describe two different choices here --
    # link 8 (Russians), said to lead further away from the target, and link 0
    # (Communist Party of the Soviet Union), said to lead neither closer nor
    # further away -- yet both assertions submit "2". The calls are kept exactly
    # as they were; confirm which indices were intended before changing them.
    assert dataset.score_answer(answer="2", entry=item) == 0.1
    assert dataset.score_answer(answer="2", entry=item) == 0.1

    # To inspect the score of every link when updating this test, run with
    # `pytest -s` and temporarily enable:
    # for (i,_) in item['metadata']['links']:
    #     print(i, dataset.score_answer(answer=str(i), entry=item))
|