reasoning-gym/tests/test_palindrome.py

91 lines
3.2 KiB
Python

import pytest
from reasoning_gym.algorithmic.palindrome_generation import PalindromeConfig, PalindromeDataset
def test_palindrome_config_validation():
    """Check that out-of-range length settings are rejected with AssertionError."""
    invalid_settings = (
        {"min_length": 0},  # Too short
        {"min_length": 5, "max_length": 3},  # Invalid range (min above max)
    )
    for bad_kwargs in invalid_settings:
        # Construction stays inside the context manager so a failure raised
        # either while building the config or while validating it counts.
        with pytest.raises(AssertionError):
            bad_config = PalindromeConfig(**bad_kwargs)
            bad_config.validate()
def test_palindrome_deterministic():
    """Check that two datasets built from one seeded config yield equal items."""
    shared_config = PalindromeConfig(seed=42, size=10)
    first = PalindromeDataset(shared_config)
    second = PalindromeDataset(shared_config)

    # Compare position by position so indexed lookup is what gets exercised.
    for index in range(len(first)):
        assert first[index] == second[index]
def test_palindrome_items():
    """Check the structure of each generated item and that its answer is a palindrome."""
    top_level_keys = ("question", "answer", "metadata")
    metadata_keys = ("letters", "generated_palindrome")

    config = PalindromeConfig(min_length=3, max_length=7, size=10, seed=42)
    for entry in PalindromeDataset(config):
        assert isinstance(entry, dict)

        # Every item must expose the three top-level fields.
        for key in top_level_keys:
            assert key in entry

        # The metadata must carry the required fields.
        for key in metadata_keys:
            assert key in entry["metadata"]

        # The answer has to read the same forwards and backwards.
        palindrome = entry["answer"]
        assert palindrome == palindrome[::-1], f"{palindrome} is not a palindrome"
def test_palindrome_randomization():
    """Check that the question's letters are a rearrangement of the palindrome."""
    config = PalindromeConfig(min_length=4, max_length=4, size=10, seed=42)
    for entry in PalindromeDataset(config):
        metadata = entry["metadata"]
        offered_letters = metadata["letters"]
        source_palindrome = metadata["generated_palindrome"]

        # Sorting both sides compares them as multisets: the same letters in
        # the same quantities, irrespective of the order they appear in.
        assert sorted(offered_letters) == sorted(source_palindrome)
def test_score_answer():
    """Check the graded scores returned by ``score_answer`` for palindrome items.

    Scores asserted for every generated item:
    - 1.0 for the item's own answer (a palindrome using exactly the given letters).
    - 0.05 for a palindrome that is not the item's answer.
    - 0.02 for a non-empty string that is not a palindrome.
    - 0.01 for the empty string.
    - 0.0 for ``None``.
    """
    config = PalindromeConfig(min_length=4, max_length=6, size=10, seed=42)
    dataset = PalindromeDataset(config)

    for entry in dataset:
        expected = entry["answer"]

        # Pick stand-in answers that can never coincide with the real one.
        other_palindrome = "aba" if expected == "racecar" else "racecar"
        non_palindrome = "efgh" if expected == "abcd" else "abcd"

        graded_cases = (
            (expected, 1.0),  # correct answer
            (other_palindrome, 0.05),  # palindrome, but not the correct one
            (non_palindrome, 0.02),  # not a palindrome at all
            ("", 0.01),  # empty string
            (None, 0.0),  # no answer given
        )
        for candidate, expected_score in graded_cases:
            assert dataset.score_answer(candidate, entry=entry) == expected_score