reasoning-gym/tests/test_wikirace.py

100 lines
3.5 KiB
Python

import pytest
from reasoning_gym.games.wikirace import WikiraceConfig, WikiraceCurriculum, WikiraceDataset
def test_wikirace_game_config_validation():
    """Check that each invalid config raises ``AssertionError``."""
    invalid_kwargs = (
        {"min_distance": 0},
        {"min_distance": 3, "max_distance": 2},
        {"max_tries": -2},
    )
    for kwargs in invalid_kwargs:
        # Construction is kept inside the context manager, so an assertion
        # raised by either the constructor or validate() satisfies the check.
        with pytest.raises(AssertionError):
            bad_config = WikiraceConfig(**kwargs)
            bad_config.validate()
def test_wikirace_game_deterministic():
    """Two datasets built from identical seeded configs yield equal items."""
    first_config = WikiraceConfig(seed=42, size=2)
    first = WikiraceDataset(first_config)
    second_config = WikiraceConfig(seed=42, size=2)
    second = WikiraceDataset(second_config)

    # Compare item by item through index access on both datasets.
    for position in range(len(first)):
        left = first[position]
        right = second[position]
        assert left == right
def test_wikirace_game_items():
    """Check the structure and the scoring of every generated item."""
    config = WikiraceConfig(seed=42, size=2)
    dataset = WikiraceDataset(config)

    top_level_keys = ("question", "answer", "metadata")
    metadata_keys = ("source", "links", "target", "current", "distance")

    for sample in dataset:
        assert isinstance(sample, dict)
        for key in top_level_keys:
            assert key in sample

        # Check metadata contains the required fields
        metadata = sample["metadata"]
        for key in metadata_keys:
            assert key in metadata

        # Verify the distance is within the configured range
        assert config.min_distance <= metadata["distance"] <= config.max_distance

        # A non-integer answer scores 0.01
        assert dataset.score_answer(answer="nope", entry=sample) == 0.01

        # A negative answer scores 0.01
        assert dataset.score_answer(answer="-1", entry=sample) == 0.01

        # An out-of-bounds answer (one past the last link index) scores 0.01
        one_past_last = str(len(metadata["links"]))
        assert dataset.score_answer(answer=one_past_last, entry=sample) == 0.01

        # A parsable answer scores at least 0.1
        assert dataset.score_answer(answer="0", entry=sample) >= 0.1

        # The expected answer scores 1.0
        expected = sample["answer"]
        assert dataset.score_answer(answer=expected, entry=sample) == 1.0
def test_wikirace_game_single():
    """Pin the first item generated with seed 42 and the score of two answers.

    The first group of assertions fixes the generated content and is expected
    to need updating whenever the generation algorithm changes. The second
    group checks the scoring of that item.
    """
    config = WikiraceConfig(seed=42, size=1)
    dataset = WikiraceDataset(config)
    item = dataset[0]
    metadata = item["metadata"]

    # If these asserts fail, it probably just means you changed the generation
    # algorithm, which is fine: you will have to update this test.
    assert metadata["source"] == "Vadim Bakatin"
    assert metadata["target"] == "Azerbaijan Technological University"
    assert metadata["distance"] == 3
    assert len(metadata["path"]) == 0

    # If these asserts fail, it is most likely an actual error.
    # Only valid answer is 4 - Moscow
    assert dataset.score_answer(answer="4", entry=item) == 1.0
    # Link 2 is parsable but is not the valid answer, so it scores 0.1.
    # NOTE(review): the earlier comments described link 8 (Russians, moving
    # further from the target) and link 0 (Communist Party of the Soviet Union,
    # neither closer nor further), yet both assertions scored answer "2". The
    # identical duplicate was dropped; confirm the intended link indices and
    # add them back with scores verified via the snippet below.
    assert dataset.score_answer(answer="2", entry=item) == 0.1

    # Use this to check the results if you need to update this test
    # (with pytest -s)
    # for (i, _) in item["metadata"]["links"]:
    #     print(i, dataset.score_answer(answer=str(i), entry=item))