reasoning-gym/tests/test_syllogisms.py

"""Tests for syllogism task generation"""

import pytest

from reasoning_gym.logic.syllogisms import Quantifier, SyllogismConfig, SyllogismDataset, Term


def test_syllogism_config_validation():
    """Test that invalid configs raise appropriate errors"""
    with pytest.raises(AssertionError):
        config = SyllogismConfig(
            allow_all=False,
            allow_no=False,
            allow_some=False,
            allow_some_not=False,
        )  # No quantifiers allowed
        config.validate()

    with pytest.raises(AssertionError):
        config = SyllogismConfig(invalid_ratio=-0.1)  # Invalid ratio
        config.validate()

    with pytest.raises(AssertionError):
        config = SyllogismConfig(invalid_ratio=1.1)  # Invalid ratio
        config.validate()


def test_syllogism_dataset_deterministic():
    """Test that dataset generates same items with same seed"""
    config = SyllogismConfig(seed=42, size=10)
    dataset1 = SyllogismDataset(config)
    dataset2 = SyllogismDataset(config)

    for i in range(len(dataset1)):
        assert dataset1[i] == dataset2[i]


def test_syllogism_dataset_items():
    """Test basic properties of generated items"""
    config = SyllogismConfig(size=10, seed=42)
    dataset = SyllogismDataset(config)

    for i in range(len(dataset)):
        item = dataset[i]
        # Check item structure
        assert isinstance(item, dict)
        assert "question" in item
        assert "answer" in item
        assert "metadata" in item

        # Check metadata
        assert "premise1" in item["metadata"]
        assert "conclusion" in item["metadata"]
        assert "is_valid" in item["metadata"]
        assert "type" in item["metadata"]

        # For traditional syllogisms, check for premise2
        if item["metadata"]["type"] == "syllogism":
            assert "premise2" in item["metadata"]

        # Verify answer format
        assert item["answer"] in ("Yes", "No")

        # Verify question format
        assert "Consider these statements:" in item["question"]
        assert "1." in item["question"]
        if item["metadata"]["type"] == "syllogism":
            assert "2." in item["question"]
        assert "Does it logically follow that:" in item["question"]


def test_valid_syllogism_forms():
    """Test specific valid syllogistic forms"""
    config = SyllogismConfig(size=1, seed=42)
    dataset = SyllogismDataset(config)

    # Create some test terms
    A = Term("mortal", "mortals")
    B = Term("human", "humans")
    C = Term("animal", "animals")

    # Test Barbara (AAA-1)
    # Major premise: All M are P
    # Minor premise: All S are M
    # Conclusion:    All S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.ALL, B, C),  # All B (M) are C (P)
        (Quantifier.ALL, A, B),  # All A (S) are B (M)
        (Quantifier.ALL, A, C),  # All A (S) are C (P)
    )

    # Test Celarent (EAE-1)
    # Major premise: No M are P
    # Minor premise: All S are M
    # Conclusion:    No S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.NO, B, C),  # No B (M) are C (P)
        (Quantifier.ALL, A, B),  # All A (S) are B (M)
        (Quantifier.NO, A, C),  # No A (S) are C (P)
    )

    # Test Cesare (EAE-2) — corrected order
    # Major premise: No P are M
    # Minor premise: All S are M
    # Conclusion:    No S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.NO, C, B),  # No C (P) are B (M)  [Major premise]
        (Quantifier.ALL, A, B),  # All A (S) are B (M) [Minor premise]
        (Quantifier.NO, A, C),  # No A (S) are C (P)
    )

    # Test Darii (AII-1)
    # Major premise: All M are P
    # Minor premise: Some S are M
    # Conclusion:    Some S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.ALL, B, C),  # All B (M) are C (P)
        (Quantifier.SOME, A, B),  # Some A (S) are B (M)
        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
    )

    # Test Disamis (IAI-3)
    # Major premise: Some M are P
    # Minor premise: All M are S
    # Conclusion:    Some S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.SOME, B, C),  # Some B (M) are C (P)
        (Quantifier.ALL, B, A),  # All B (M) are A (S)
        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
    )

    # Test Ferio (EIO-1)
    # Major premise: No M are P
    # Minor premise: Some S are M
    # Conclusion:    Some S are not P
    assert dataset._is_valid_syllogism(
        (Quantifier.NO, B, C),  # No B (M) are C (P)
        (Quantifier.SOME, A, B),  # Some A (S) are B (M)
        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
    )

    # Test Festino (EIO-2)
    # Major premise: No P are M
    # Minor premise: Some S are M
    # Conclusion:    Some S are not P
    assert dataset._is_valid_syllogism(
        (Quantifier.NO, C, B),  # No C (P) are B (M)
        (Quantifier.SOME, A, B),  # Some A (S) are B (M)
        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
    )

    # Test Datisi (AII-3)
    # Major premise: All M are P
    # Minor premise: Some M are S
    # Conclusion:    Some S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.ALL, B, C),  # All B (M) are C (P)
        (Quantifier.SOME, B, A),  # Some B (M) are A (S)
        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
    )

    # Test Bocardo (OAO-3)
    # Major premise: Some M are not P
    # Minor premise: All M are S
    # Conclusion:    Some S are not P
    assert dataset._is_valid_syllogism(
        (Quantifier.SOME_NOT, B, C),  # Some B (M) are not C (P)
        (Quantifier.ALL, B, A),  # All B (M) are A (S)
        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
    )

    # Test Baroco (AOO-2)
    # Major premise: All P are M
    # Minor premise: Some S are not M
    # Conclusion:    Some S are not P
    assert dataset._is_valid_syllogism(
        (Quantifier.ALL, C, B),  # All C (P) are B (M)
        (Quantifier.SOME_NOT, A, B),  # Some A (S) are not B (M)
        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
    )

    # Test Camestres (AEE-2)
    # Major premise: All P are M
    # Minor premise: No S are M
    # Conclusion:    No S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.ALL, C, B),  # All C (P) are B (M)
        (Quantifier.NO, A, B),  # No A (S) are B (M)
        (Quantifier.NO, A, C),  # No A (S) are C (P)
    )

    # Test Dimaris (IAI-4)
    # Major premise: Some P are M
    # Minor premise: All M are S
    # Conclusion:    Some S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.SOME, C, B),  # Some C (P) are B (M)
        (Quantifier.ALL, B, A),  # All B (M) are A (S)
        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
    )

    # Test Ferison (EIO-3)
    # Major premise: No M are P
    # Minor premise: Some M are S
    # Conclusion:    Some S are not P
    assert dataset._is_valid_syllogism(
        (Quantifier.NO, B, C),  # No B (M) are C (P)
        (Quantifier.SOME, B, A),  # Some B (M) are A (S)
        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
    )

    # Test Fresison (EIO-4)
    # Major premise: No P are M
    # Minor premise: Some M are S
    # Conclusion:    Some S are not P
    assert dataset._is_valid_syllogism(
        (Quantifier.NO, C, B),  # No C (P) are B (M)
        (Quantifier.SOME, B, A),  # Some B (M) are A (S)
        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
    )

    # Test Camenes (AEE-4)
    # Major premise: All P are M
    # Minor premise: No M are S
    # Conclusion:    No S are P
    assert dataset._is_valid_syllogism(
        (Quantifier.ALL, C, B),  # All C (P) are B (M)
        (Quantifier.NO, B, A),  # No B (M) are A (S)
        (Quantifier.NO, A, C),  # No A (S) are C (P)
    )

    # Test invalid forms
    assert not dataset._is_valid_syllogism(
        (Quantifier.SOME, B, C),  # Some B are C
        (Quantifier.SOME, A, B),  # Some A are B
        (Quantifier.SOME, A, C),  # Some A are C (invalid: two particular premises)
    )

    assert not dataset._is_valid_syllogism(
        (Quantifier.NO, B, C),  # No B are C
        (Quantifier.NO, A, B),  # No A are B
        (Quantifier.NO, A, C),  # No A are C (invalid: two negative premises)
    )

    # Test specific invalid case with two negative premises
    S = Term("student", "students")
    M = Term("human", "humans")
    P = Term("chef", "chefs")
    assert not dataset._is_valid_syllogism(
        (Quantifier.NO, S, M),  # No students are humans
        (Quantifier.NO, M, P),  # No humans are chefs
        (Quantifier.NO, S, P),  # No students are chefs (invalid!)
    )

    child = Term("child", "children")
    animal = Term("animal", "animals")
    doctor = Term("doctor", "doctors")

    # Premise 1: Some children are not animals
    # Premise 2: All animals are doctors
    # Conclusion: Some children are not doctors
    # We expect this NOT to be a valid syllogism
    assert not dataset._is_valid_syllogism(
        (Quantifier.SOME_NOT, child, animal),  # Some children are not animals
        (Quantifier.ALL, animal, doctor),  # All animals are doctors
        (Quantifier.SOME_NOT, child, doctor),  # Some children are not doctors
    )


def test_logical_equivalence():
    """Test logical equivalence rules for inversions"""
    config = SyllogismConfig(size=1, seed=42)
    dataset = SyllogismDataset(config)

    # Create test terms
    A = Term("student", "students")
    B = Term("human", "humans")

    # Test direct inversion of NO statements
    assert dataset._check_logical_equivalence(
        (Quantifier.NO, A, B),  # No students are humans
        (Quantifier.NO, B, A),  # No humans are students
    )

    # Test particular inversion of ALL statements
    assert dataset._check_logical_equivalence(
        (Quantifier.ALL, A, B),  # All students are humans
        (Quantifier.SOME, B, A),  # Some humans are students
    )

    # Test direct inversion of SOME statements
    assert dataset._check_logical_equivalence(
        (Quantifier.SOME, A, B),  # Some students are humans
        (Quantifier.SOME, B, A),  # Some humans are students
    )

    # Test invalid inversions
    assert not dataset._check_logical_equivalence(
        (Quantifier.SOME_NOT, A, B),  # Some students are not humans
        (Quantifier.SOME_NOT, B, A),  # Some humans are not students (invalid)
    )

    assert not dataset._check_logical_equivalence(
        (Quantifier.ALL, A, B),  # All students are humans
        (Quantifier.ALL, B, A),  # All humans are students (invalid)
    )


def test_inversion_generation():
    """Test generation of inversion problems"""
    # Force inversion problems by setting probability to 1.0
    config = SyllogismConfig(size=10, seed=42, inversion_probability=1.0)
    dataset = SyllogismDataset(config)

    for item in dataset:
        # Check type is marked as inversion
        assert item["metadata"]["type"] == "inversion"
        # Check both premises and selection
        assert "premise1" in item["metadata"]
        assert "premise2" in item["metadata"]
        assert "selected_premise" in item["metadata"]
        assert item["metadata"]["selected_premise"] in (1, 2)
        # Check format
        assert item["answer"] in ("Yes", "No")
        assert "Consider these statements:" in item["question"]
        assert "1." in item["question"]
        assert "2." in item["question"]  # Inversion questions now show both premises
        assert "Does it logically follow that:" in item["question"]


def test_syllogism_dataset_iteration():
    """Test that iteration respects dataset size"""
    config = SyllogismConfig(size=5, seed=42)
    dataset = SyllogismDataset(config)

    items = list(dataset)
    assert len(items) == config.size

    # Test multiple iterations yield same items
    assert items == list(dataset)