mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
* math prompt improvements * ignore brackets in complex_arithmetic results * improve additional instruction in prompt of polynomial_equations * more strict tests for score_answer in polynomial_equations * simplify special reward handling * fix test_intermediate_integration * fix sokoban dataset * add common dataset score_answer consistency test
91 lines
3.2 KiB
Python
91 lines
3.2 KiB
Python
import pytest
|
|
|
|
from reasoning_gym.algorithmic.palindrome_generation import PalindromeConfig, PalindromeDataset
|
|
|
|
|
|
def test_palindrome_config_validation():
    """Invalid palindrome configs must fail validation with AssertionError."""
    invalid_kwargs = (
        {"min_length": 0},  # Too short
        {"min_length": 5, "max_length": 3},  # Invalid range
    )

    for kwargs in invalid_kwargs:
        # Construction and validation both happen inside the context, so an
        # AssertionError raised by either step satisfies the expectation.
        with pytest.raises(AssertionError):
            PalindromeConfig(**kwargs).validate()
|
|
|
|
|
|
def test_palindrome_deterministic():
    """Two datasets built from the same seeded config yield equal items."""
    shared_config = PalindromeConfig(seed=42, size=10)
    first, second = (PalindromeDataset(shared_config) for _ in range(2))

    # Compare item by item, indexed over the length of the first dataset.
    for idx in range(len(first)):
        assert first[idx] == second[idx]
|
|
|
|
|
|
def test_palindrome_items():
    """Each generated item is a dict with the expected keys and a palindromic answer."""
    dataset = PalindromeDataset(
        PalindromeConfig(min_length=3, max_length=7, size=10, seed=42)
    )

    for item in dataset:
        assert isinstance(item, dict)

        # Top-level keys every item must expose.
        for key in ("question", "answer", "metadata"):
            assert key in item

        # Metadata fields every item must carry.
        metadata = item["metadata"]
        for field in ("letters", "generated_palindrome"):
            assert field in metadata

        # The answer must read the same forwards and backwards.
        answer = item["answer"]
        assert answer == answer[::-1], f"{answer} is not a palindrome"
|
|
|
|
|
|
def test_palindrome_randomization():
    """The letters given in the question match those of the generated palindrome."""
    dataset = PalindromeDataset(
        PalindromeConfig(min_length=4, max_length=4, size=10, seed=42)
    )

    for item in dataset:
        metadata = item["metadata"]

        # Sorting both sides compares them as multisets: the same letters in
        # the same quantities, regardless of the order they are listed in.
        assert sorted(metadata["letters"]) == sorted(metadata["generated_palindrome"])
|
|
|
|
|
|
def test_score_answer():
    """Test the scoring mechanism for palindrome answers.

    Expected behavior (as asserted below):
    - Correct answer (palindrome with only correct letters in the correct quantities) gives 1.0
    - An answer that is a palindrome, but not with the same letters as provided, gives 0.05
    - An answer that is a string, but not a palindrome gives 0.02
    - An empty string gives 0.0
    - None gives 0.0
    """
    config = PalindromeConfig(min_length=4, max_length=6, size=10, seed=42)
    dataset = PalindromeDataset(config)

    for item in dataset:
        correct_answer = item["answer"]

        # Correct answer should score 1.0
        assert dataset.score_answer(correct_answer, entry=item) == 1.0

        # Incorrect answer (palindrome, but not the correct one) should score 0.05.
        # Fall back to "aba" so the probe never equals the expected answer.
        pal_letters = "racecar" if "racecar" != correct_answer else "aba"
        assert dataset.score_answer(pal_letters, entry=item) == 0.05

        # Incorrect answer (not a palindrome) should score 0.02
        wrong_letters = "abcd" if "abcd" != correct_answer else "efgh"
        assert dataset.score_answer(wrong_letters, entry=item) == 0.02

        # Empty string input should score 0.0
        assert dataset.score_answer("", entry=item) == 0.0

        # None (no answer given) should score 0.0
        assert dataset.score_answer(None, entry=item) == 0.0
|