mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
106 lines
3.3 KiB
Python
106 lines
3.3 KiB
Python
from fractions import Fraction
|
|
|
|
import pytest
|
|
|
|
from reasoning_gym.probability import CoinFlipConfig, CoinFlipCurriculum, CoinFlipDataset
|
|
|
|
|
|
def test_coin_flip_config_validation():
    """Every invalid configuration must be rejected with an AssertionError."""
    invalid_kwargs = (
        {"size": 0},  # size of zero
        {"min_trials": 0},  # min_trials of zero
        {"min_trials": 5, "max_trials": 3},  # min_trials greater than max_trials
        {"allow_exact": False, "allow_at_least": False},  # both problem types disabled
    )

    for kwargs in invalid_kwargs:
        # Construction stays inside the context manager, so an assertion raised
        # by the constructor is accepted just like one raised by validate().
        with pytest.raises(AssertionError):
            CoinFlipConfig(**kwargs).validate()
|
|
|
|
|
|
def test_coin_flip_deterministic():
    """Two datasets built from the same seeded config yield equal items at every index."""
    seeded = CoinFlipConfig(size=10, seed=42)
    first, second = CoinFlipDataset(seeded), CoinFlipDataset(seeded)

    # Compare item by item so a mismatch is reported at the first differing index.
    for index in range(len(first)):
        assert first[index] == second[index]
|
|
|
|
|
|
def test_coin_flip_items():
    """Check the structure and value ranges of every generated item."""
    dataset = CoinFlipDataset(
        CoinFlipConfig(min_trials=3, max_trials=6, size=7, seed=42)
    )

    for index in range(len(dataset)):
        entry = dataset[index]

        # Top-level shape: a dict with question, answer and metadata.
        assert isinstance(entry, dict)
        assert "question" in entry
        assert "answer" in entry
        # The answer must convert to a float in the closed interval [0, 1].
        answer_value = float(entry["answer"])
        assert 0.0 <= answer_value <= 1.0
        assert "metadata" in entry

        meta = entry["metadata"]
        for key in ("num_trials", "k_heads", "problem_type"):
            assert key in meta
        assert meta["problem_type"] in ("exact", "at_least")

        # The rational form keeps the denominator at 2 ** num_trials and
        # requires a strictly positive numerator.
        fraction = meta["rational"]
        assert fraction["denominator"] == 2 ** meta["num_trials"]
        assert fraction["numerator"] > 0
|
|
|
|
|
|
def test_coin_flip_score_answer():
    """The stored answer scores exactly 1.0; a 0.01-shifted answer scores within [0, 1]."""
    dataset = CoinFlipDataset(CoinFlipConfig(size=200, seed=42))

    for index in range(len(dataset)):
        entry = dataset[index]
        expected = entry["answer"]

        # The dataset's own answer must earn the full reward.
        assert dataset.score_answer(expected, entry) == 1.0

        # Shift the answer by 0.01: upward when the result stays <= 1.0,
        # downward otherwise.
        value = float(expected)
        shifted = value + 0.01 if value + 0.01 <= 1.0 else value - 0.01
        partial = dataset.score_answer(str(shifted), entry)
        assert 0.0 <= partial <= 1.0
|
|
|
|
|
|
def test_coin_flip_curriculum():
    """Curriculum yields a CoinFlipConfig whose max_trials follows the num_trials level."""
    curriculum = CoinFlipCurriculum()
    base_value = {"size": 100, "seed": 32}

    initial = curriculum.generate_configuration(base_value)
    assert isinstance(initial, CoinFlipConfig)
    # Base values are carried over into the generated configuration.
    assert (initial.size, initial.seed) == (100, 32)
    assert (initial.min_trials, initial.max_trials) == (3, 3)

    # One level up on num_trials: min_trials stays at 3, max_trials becomes 4.
    curriculum.increment_attr_level("num_trials")
    raised = curriculum.generate_configuration(base_value)
    assert (raised.min_trials, raised.max_trials) == (3, 4)

    # One level back down: the range returns to (3, 3).
    curriculum.decrement_attr_level("num_trials")
    lowered = curriculum.generate_configuration(base_value)
    assert (lowered.min_trials, lowered.max_trials) == (3, 3)
|