mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
* init * fix tests * unify codeio * filtered for libraries not present in reasoning-gym * fix more bounds * puzzle24 * knight swap curriculum * fix number sorting * fix attributes * add validation of config in creation of dataset * dry run for instantiating and validating the datasets * remove unused imports * fix curriculum tests to reference newly updated attribute names
373 lines
13 KiB
Python
373 lines
13 KiB
Python
"""Tests for syllogism task generation"""
|
|
|
|
import pytest
|
|
|
|
from reasoning_gym.logic.syllogisms import Quantifier, SyllogismConfig, SyllogismCurriculum, SyllogismDataset, Term
|
|
|
|
|
|
def test_syllogism_config_validation():
|
|
"""Test that invalid configs raise appropriate errors"""
|
|
with pytest.raises(AssertionError):
|
|
config = SyllogismConfig(
|
|
allow_all=False,
|
|
allow_no=False,
|
|
allow_some=False,
|
|
allow_some_not=False,
|
|
) # No quantifiers allowed
|
|
config.validate()
|
|
|
|
with pytest.raises(AssertionError):
|
|
config = SyllogismConfig(invalid_ratio=-0.1) # Invalid ratio
|
|
config.validate()
|
|
|
|
with pytest.raises(AssertionError):
|
|
config = SyllogismConfig(invalid_ratio=1.1) # Invalid ratio
|
|
config.validate()
|
|
|
|
|
|
def test_syllogism_dataset_deterministic():
|
|
"""Test that dataset generates same items with same seed"""
|
|
config = SyllogismConfig(seed=42, size=10)
|
|
dataset1 = SyllogismDataset(config)
|
|
dataset2 = SyllogismDataset(config)
|
|
|
|
for i in range(len(dataset1)):
|
|
assert dataset1[i] == dataset2[i]
|
|
|
|
|
|
def test_syllogism_dataset_items():
|
|
"""Test basic properties of generated items"""
|
|
config = SyllogismConfig(size=10, seed=42)
|
|
dataset = SyllogismDataset(config)
|
|
|
|
for i in range(len(dataset)):
|
|
item = dataset[i]
|
|
# Check item structure
|
|
assert isinstance(item, dict)
|
|
assert "question" in item
|
|
assert "answer" in item
|
|
assert "metadata" in item
|
|
|
|
# Check metadata
|
|
assert "premise1" in item["metadata"]
|
|
assert "conclusion" in item["metadata"]
|
|
assert "is_valid" in item["metadata"]
|
|
assert "type" in item["metadata"]
|
|
|
|
# For traditional syllogisms, check for premise2
|
|
if item["metadata"]["type"] == "syllogism":
|
|
assert "premise2" in item["metadata"]
|
|
|
|
# Verify answer format
|
|
assert item["answer"] in ("Yes", "No")
|
|
|
|
# Verify question format
|
|
assert "Consider these statements:" in item["question"]
|
|
assert "1." in item["question"]
|
|
if item["metadata"]["type"] == "syllogism":
|
|
assert "2." in item["question"]
|
|
assert "Does it logically follow that:" in item["question"]
|
|
|
|
|
|
def test_valid_syllogism_forms():
|
|
"""Test specific valid syllogistic forms"""
|
|
config = SyllogismConfig(size=1, seed=42)
|
|
dataset = SyllogismDataset(config)
|
|
|
|
# Create some test terms
|
|
A = Term("mortal", "mortals")
|
|
B = Term("human", "humans")
|
|
C = Term("animal", "animals")
|
|
|
|
# Test Barbara (AAA-1)
|
|
# Major premise: All M are P
|
|
# Minor premise: All S are M
|
|
# Conclusion: All S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.ALL, B, C), # All B (M) are C (P)
|
|
(Quantifier.ALL, A, B), # All A (S) are B (M)
|
|
(Quantifier.ALL, A, C), # All A (S) are C (P)
|
|
)
|
|
|
|
# Test Celarent (EAE-1)
|
|
# Major premise: No M are P
|
|
# Minor premise: All S are M
|
|
# Conclusion: No S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.NO, B, C), # No B (M) are C (P)
|
|
(Quantifier.ALL, A, B), # All A (S) are B (M)
|
|
(Quantifier.NO, A, C), # No A (S) are C (P)
|
|
)
|
|
|
|
# Test Cesare (EAE-2) — corrected order
|
|
# Major premise: No P are M
|
|
# Minor premise: All S are M
|
|
# Conclusion: No S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.NO, C, B), # No C (P) are B (M) [Major premise]
|
|
(Quantifier.ALL, A, B), # All A (S) are B (M) [Minor premise]
|
|
(Quantifier.NO, A, C), # No A (S) are C (P)
|
|
)
|
|
|
|
# Test Darii (AII-1)
|
|
# Major premise: All M are P
|
|
# Minor premise: Some S are M
|
|
# Conclusion: Some S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.ALL, B, C), # All B (M) are C (P)
|
|
(Quantifier.SOME, A, B), # Some A (S) are B (M)
|
|
(Quantifier.SOME, A, C), # Some A (S) are C (P)
|
|
)
|
|
|
|
# Test Disamis (IAI-3)
|
|
# Major premise: Some M are P
|
|
# Minor premise: All M are S
|
|
# Conclusion: Some S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.SOME, B, C), # Some B (M) are C (P)
|
|
(Quantifier.ALL, B, A), # All B (M) are A (S)
|
|
(Quantifier.SOME, A, C), # Some A (S) are C (P)
|
|
)
|
|
|
|
# Test Ferio (EIO-1)
|
|
# Major premise: No M are P
|
|
# Minor premise: Some S are M
|
|
# Conclusion: Some S are not P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.NO, B, C), # No B (M) are C (P)
|
|
(Quantifier.SOME, A, B), # Some A (S) are B (M)
|
|
(Quantifier.SOME_NOT, A, C), # Some A (S) are not C (P)
|
|
)
|
|
|
|
# Test Festino (EIO-2)
|
|
# Major premise: No P are M
|
|
# Minor premise: Some S are M
|
|
# Conclusion: Some S are not P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.NO, C, B), # No C (P) are B (M)
|
|
(Quantifier.SOME, A, B), # Some A (S) are B (M)
|
|
(Quantifier.SOME_NOT, A, C), # Some A (S) are not C (P)
|
|
)
|
|
|
|
# Test Datisi (AII-3)
|
|
# Major premise: All M are P
|
|
# Minor premise: Some M are S
|
|
# Conclusion: Some S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.ALL, B, C), # All B (M) are C (P)
|
|
(Quantifier.SOME, B, A), # Some B (M) are A (S)
|
|
(Quantifier.SOME, A, C), # Some A (S) are C (P)
|
|
)
|
|
|
|
# Test Bocardo (OAO-3)
|
|
# Major premise: Some M are not P
|
|
# Minor premise: All M are S
|
|
# Conclusion: Some S are not P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.SOME_NOT, B, C), # Some B (M) are not C (P)
|
|
(Quantifier.ALL, B, A), # All B (M) are A (S)
|
|
(Quantifier.SOME_NOT, A, C), # Some A (S) are not C (P)
|
|
)
|
|
|
|
# Test Baroco (AOO-2)
|
|
# Major premise: All P are M
|
|
# Minor premise: Some S are not M
|
|
# Conclusion: Some S are not P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.ALL, C, B), # All C (P) are B (M)
|
|
(Quantifier.SOME_NOT, A, B), # Some A (S) are not B (M)
|
|
(Quantifier.SOME_NOT, A, C), # Some A (S) are not C (P)
|
|
)
|
|
|
|
# Test Camestres (AEE-2)
|
|
# Major premise: All P are M
|
|
# Minor premise: No S are M
|
|
# Conclusion: No S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.ALL, C, B), # All C (P) are B (M)
|
|
(Quantifier.NO, A, B), # No A (S) are B (M)
|
|
(Quantifier.NO, A, C), # No A (S) are C (P)
|
|
)
|
|
|
|
# Test Dimaris (IAI-4)
|
|
# Major premise: Some P are M
|
|
# Minor premise: All M are S
|
|
# Conclusion: Some S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.SOME, C, B), # Some C (P) are B (M)
|
|
(Quantifier.ALL, B, A), # All B (M) are A (S)
|
|
(Quantifier.SOME, A, C), # Some A (S) are C (P)
|
|
)
|
|
|
|
# Test Ferison (EIO-3)
|
|
# Major premise: No M are P
|
|
# Minor premise: Some M are S
|
|
# Conclusion: Some S are not P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.NO, B, C), # No B (M) are C (P)
|
|
(Quantifier.SOME, B, A), # Some B (M) are A (S)
|
|
(Quantifier.SOME_NOT, A, C), # Some A (S) are not C (P)
|
|
)
|
|
|
|
# Test Fresison (EIO-4)
|
|
# Major premise: No P are M
|
|
# Minor premise: Some M are S
|
|
# Conclusion: Some S are not P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.NO, C, B), # No C (P) are B (M)
|
|
(Quantifier.SOME, B, A), # Some B (M) are A (S)
|
|
(Quantifier.SOME_NOT, A, C), # Some A (S) are not C (P)
|
|
)
|
|
|
|
# Test Camenes (AEE-4)
|
|
# Major premise: All P are M
|
|
# Minor premise: No M are S
|
|
# Conclusion: No S are P
|
|
assert dataset._is_valid_syllogism(
|
|
(Quantifier.ALL, C, B), # All C (P) are B (M)
|
|
(Quantifier.NO, B, A), # No B (M) are A (S)
|
|
(Quantifier.NO, A, C), # No A (S) are C (P)
|
|
)
|
|
|
|
# Test invalid forms
|
|
assert not dataset._is_valid_syllogism(
|
|
(Quantifier.SOME, B, C), # Some B are C
|
|
(Quantifier.SOME, A, B), # Some A are B
|
|
(Quantifier.SOME, A, C), # Some A are C (invalid: two particular premises)
|
|
)
|
|
|
|
assert not dataset._is_valid_syllogism(
|
|
(Quantifier.NO, B, C), # No B are C
|
|
(Quantifier.NO, A, B), # No A are B
|
|
(Quantifier.NO, A, C), # No A are C (invalid: two negative premises)
|
|
)
|
|
|
|
# Test specific invalid case with two negative premises
|
|
S = Term("student", "students")
|
|
M = Term("human", "humans")
|
|
P = Term("chef", "chefs")
|
|
assert not dataset._is_valid_syllogism(
|
|
(Quantifier.NO, S, M), # No students are humans
|
|
(Quantifier.NO, M, P), # No humans are chefs
|
|
(Quantifier.NO, S, P), # No students are chefs (invalid!)
|
|
)
|
|
|
|
child = Term("child", "children")
|
|
animal = Term("animal", "animals")
|
|
doctor = Term("doctor", "doctors")
|
|
|
|
# Premise 1: Some children are not animals
|
|
# Premise 2: All animals are doctors
|
|
# Conclusion: Some children are not doctors
|
|
# We expect this NOT to be a valid syllogism
|
|
assert not dataset._is_valid_syllogism(
|
|
(Quantifier.SOME_NOT, child, animal), # Some children are not animals
|
|
(Quantifier.ALL, animal, doctor), # All animals are doctors
|
|
(Quantifier.SOME_NOT, child, doctor), # Some children are not doctors
|
|
)
|
|
|
|
|
|
def test_logical_equivalence():
|
|
"""Test logical equivalence rules for inversions"""
|
|
config = SyllogismConfig(size=1, seed=42)
|
|
dataset = SyllogismDataset(config)
|
|
|
|
# Create test terms
|
|
A = Term("student", "students")
|
|
B = Term("human", "humans")
|
|
|
|
# Test direct inversion of NO statements
|
|
assert dataset._check_logical_equivalence(
|
|
(Quantifier.NO, A, B), # No students are humans
|
|
(Quantifier.NO, B, A), # No humans are students
|
|
)
|
|
|
|
# Test particular inversion of ALL statements
|
|
assert dataset._check_logical_equivalence(
|
|
(Quantifier.ALL, A, B), # All students are humans
|
|
(Quantifier.SOME, B, A), # Some humans are students
|
|
)
|
|
|
|
# Test direct inversion of SOME statements
|
|
assert dataset._check_logical_equivalence(
|
|
(Quantifier.SOME, A, B), # Some students are humans
|
|
(Quantifier.SOME, B, A), # Some humans are students
|
|
)
|
|
|
|
# Test invalid inversions
|
|
assert not dataset._check_logical_equivalence(
|
|
(Quantifier.SOME_NOT, A, B), # Some students are not humans
|
|
(Quantifier.SOME_NOT, B, A), # Some humans are not students (invalid)
|
|
)
|
|
|
|
assert not dataset._check_logical_equivalence(
|
|
(Quantifier.ALL, A, B), # All students are humans
|
|
(Quantifier.ALL, B, A), # All humans are students (invalid)
|
|
)
|
|
|
|
|
|
def test_inversion_generation():
|
|
"""Test generation of inversion problems"""
|
|
# Force inversion problems by setting probability to 1.0
|
|
config = SyllogismConfig(size=10, seed=42, inversion_probability=1.0)
|
|
dataset = SyllogismDataset(config)
|
|
|
|
for item in dataset:
|
|
# Check type is marked as inversion
|
|
assert item["metadata"]["type"] == "inversion"
|
|
# Check both premises and selection
|
|
assert "premise1" in item["metadata"]
|
|
assert "premise2" in item["metadata"]
|
|
assert "selected_premise" in item["metadata"]
|
|
assert item["metadata"]["selected_premise"] in (1, 2)
|
|
# Check format
|
|
assert item["answer"] in ("Yes", "No")
|
|
assert "Consider these statements:" in item["question"]
|
|
assert "1." in item["question"]
|
|
assert "2." in item["question"] # Inversion questions now show both premises
|
|
assert "Does it logically follow that:" in item["question"]
|
|
|
|
|
|
def test_syllogism_dataset_iteration():
|
|
"""Test that iteration respects dataset size"""
|
|
config = SyllogismConfig(size=5, seed=42)
|
|
dataset = SyllogismDataset(config)
|
|
|
|
items = list(dataset)
|
|
assert len(items) == config.size
|
|
|
|
# Test multiple iterations yield same items
|
|
assert items == list(dataset)
|
|
|
|
|
|
def test_syllogism_curriculum():
|
|
curriculum = SyllogismCurriculum()
|
|
|
|
base_value = {"size": 150, "seed": 1}
|
|
|
|
base_cfg: SyllogismConfig = curriculum.generate_configuration(base_value)
|
|
assert base_cfg.seed == 1
|
|
assert base_cfg.size == 150
|
|
assert base_cfg.allow_all == True
|
|
assert base_cfg.allow_no == False
|
|
assert base_cfg.allow_some == False
|
|
assert base_cfg.allow_some_not == False
|
|
|
|
# test incrementing attribute levels
|
|
curriculum.increment_attr_level("allow_all")
|
|
curriculum.increment_attr_level("allow_no")
|
|
curriculum.increment_attr_level("allow_some")
|
|
curriculum.increment_attr_level("allow_some_not")
|
|
increased_cfg = curriculum.generate_configuration(base_value)
|
|
assert increased_cfg.allow_all == True
|
|
assert increased_cfg.allow_no == True
|
|
assert increased_cfg.allow_some == False
|
|
assert increased_cfg.allow_some_not == False
|
|
|
|
# test decrementing attribute levels
|
|
curriculum.decrement_attr_level("allow_no")
|
|
partially_decreased_cfg = curriculum.generate_configuration(base_value)
|
|
assert partially_decreased_cfg.allow_all == True
|
|
assert partially_decreased_cfg.allow_no == False
|
|
assert partially_decreased_cfg.allow_some == False
|
|
assert partially_decreased_cfg.allow_some_not == False
|