Merge remote-tracking branch 'origin/main' into feat/re-arc

fetched remote changes
2026-05-03 17:53:26 +00:00 · 2025-02-08 14:52:32 +00:00 · 2025-02-08 14:52:32 +00:00 · 44d0f8e2b1
commit 44d0f8e2b1
parent f81eb8fd48 6b1a032d5c
5 changed files with 489 additions and 184 deletions
--- a/tests/test_syllogisms.py
+++ b/tests/test_syllogisms.py
@ -64,6 +64,204 @@ def test_syllogism_dataset_items():
        assert "Does it logically follow that:" in item["question"]


+def test_valid_syllogism_forms():
+    """Test specific valid syllogistic forms"""
+    config = SyllogismConfig(size=1, seed=42)
+    dataset = SyllogismDataset(config)
+
+    # Create some test terms; each statement below is a (quantifier, subject, predicate) tuple
+    A = Term("mortal", "mortals")
+    B = Term("human", "humans")
+    C = Term("animal", "animals")
+
+    # Test Barbara (AAA-1)
+    # Major premise: All M are P
+    # Minor premise: All S are M
+    # Conclusion:    All S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.ALL, B, C),  # All B (M) are C (P)
+        (Quantifier.ALL, A, B),  # All A (S) are B (M)
+        (Quantifier.ALL, A, C),  # All A (S) are C (P)
+    )
+
+    # Test Celarent (EAE-1)
+    # Major premise: No M are P
+    # Minor premise: All S are M
+    # Conclusion:    No S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.NO, B, C),  # No B (M) are C (P)
+        (Quantifier.ALL, A, B),  # All A (S) are B (M)
+        (Quantifier.NO, A, C),  # No A (S) are C (P)
+    )
+
+    # Test Cesare (EAE-2)
+    # Major premise: No P are M
+    # Minor premise: All S are M
+    # Conclusion:    No S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.NO, C, B),  # No C (P) are B (M)  [Major premise]
+        (Quantifier.ALL, A, B),  # All A (S) are B (M) [Minor premise]
+        (Quantifier.NO, A, C),  # No A (S) are C (P)
+    )
+
+    # Test Darii (AII-1)
+    # Major premise: All M are P
+    # Minor premise: Some S are M
+    # Conclusion:    Some S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.ALL, B, C),  # All B (M) are C (P)
+        (Quantifier.SOME, A, B),  # Some A (S) are B (M)
+        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
+    )
+
+    # Test Disamis (IAI-3)
+    # Major premise: Some M are P
+    # Minor premise: All M are S
+    # Conclusion:    Some S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.SOME, B, C),  # Some B (M) are C (P)
+        (Quantifier.ALL, B, A),  # All B (M) are A (S)
+        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
+    )
+
+    # Test Ferio (EIO-1)
+    # Major premise: No M are P
+    # Minor premise: Some S are M
+    # Conclusion:    Some S are not P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.NO, B, C),  # No B (M) are C (P)
+        (Quantifier.SOME, A, B),  # Some A (S) are B (M)
+        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
+    )
+
+    # Test Festino (EIO-2)
+    # Major premise: No P are M
+    # Minor premise: Some S are M
+    # Conclusion:    Some S are not P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.NO, C, B),  # No C (P) are B (M)
+        (Quantifier.SOME, A, B),  # Some A (S) are B (M)
+        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
+    )
+
+    # Test Datisi (AII-3)
+    # Major premise: All M are P
+    # Minor premise: Some M are S
+    # Conclusion:    Some S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.ALL, B, C),  # All B (M) are C (P)
+        (Quantifier.SOME, B, A),  # Some B (M) are A (S)
+        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
+    )
+
+    # Test Bocardo (OAO-3)
+    # Major premise: Some M are not P
+    # Minor premise: All M are S
+    # Conclusion:    Some S are not P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.SOME_NOT, B, C),  # Some B (M) are not C (P)
+        (Quantifier.ALL, B, A),  # All B (M) are A (S)
+        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
+    )
+
+    # Test Baroco (AOO-2)
+    # Major premise: All P are M
+    # Minor premise: Some S are not M
+    # Conclusion:    Some S are not P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.ALL, C, B),  # All C (P) are B (M)
+        (Quantifier.SOME_NOT, A, B),  # Some A (S) are not B (M)
+        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
+    )
+
+    # Test Camestres (AEE-2)
+    # Major premise: All P are M
+    # Minor premise: No S are M
+    # Conclusion:    No S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.ALL, C, B),  # All C (P) are B (M)
+        (Quantifier.NO, A, B),  # No A (S) are B (M)
+        (Quantifier.NO, A, C),  # No A (S) are C (P)
+    )
+
+    # Test Dimaris (IAI-4)
+    # Major premise: Some P are M
+    # Minor premise: All M are S
+    # Conclusion:    Some S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.SOME, C, B),  # Some C (P) are B (M)
+        (Quantifier.ALL, B, A),  # All B (M) are A (S)
+        (Quantifier.SOME, A, C),  # Some A (S) are C (P)
+    )
+
+    # Test Ferison (EIO-3)
+    # Major premise: No M are P
+    # Minor premise: Some M are S
+    # Conclusion:    Some S are not P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.NO, B, C),  # No B (M) are C (P)
+        (Quantifier.SOME, B, A),  # Some B (M) are A (S)
+        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
+    )
+
+    # Test Fresison (EIO-4)
+    # Major premise: No P are M
+    # Minor premise: Some M are S
+    # Conclusion:    Some S are not P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.NO, C, B),  # No C (P) are B (M)
+        (Quantifier.SOME, B, A),  # Some B (M) are A (S)
+        (Quantifier.SOME_NOT, A, C),  # Some A (S) are not C (P)
+    )
+
+    # Test Camenes (AEE-4)
+    # Major premise: All P are M
+    # Minor premise: No M are S
+    # Conclusion:    No S are P
+    assert dataset._is_valid_syllogism(
+        (Quantifier.ALL, C, B),  # All C (P) are B (M)
+        (Quantifier.NO, B, A),  # No B (M) are A (S)
+        (Quantifier.NO, A, C),  # No A (S) are C (P)
+    )
+
+    # Test invalid forms: every call from here on must be rejected by _is_valid_syllogism
+    assert not dataset._is_valid_syllogism(
+        (Quantifier.SOME, B, C),  # Some B are C
+        (Quantifier.SOME, A, B),  # Some A are B
+        (Quantifier.SOME, A, C),  # Some A are C (invalid: two particular premises)
+    )
+
+    assert not dataset._is_valid_syllogism(
+        (Quantifier.NO, B, C),  # No B are C
+        (Quantifier.NO, A, B),  # No A are B
+        (Quantifier.NO, A, C),  # No A are C (invalid: two negative premises)
+    )
+
+    # Test specific invalid case with two negative premises
+    S = Term("student", "students")
+    M = Term("human", "humans")
+    P = Term("chef", "chefs")
+    assert not dataset._is_valid_syllogism(
+        (Quantifier.NO, S, M),  # No students are humans
+        (Quantifier.NO, M, P),  # No humans are chefs
+        (Quantifier.NO, S, P),  # No students are chefs (invalid!)
+    )
+
+    child = Term("child", "children")
+    animal = Term("animal", "animals")
+    doctor = Term("doctor", "doctors")
+
+    # Premise 1: Some children are not animals
+    # Premise 2: All animals are doctors
+    # Conclusion: Some children are not doctors
+    # Expected invalid (illicit major): "doctors" is distributed in the conclusion but not in premise 2
+    assert not dataset._is_valid_syllogism(
+        (Quantifier.SOME_NOT, child, animal),  # Some children are not animals
+        (Quantifier.ALL, animal, doctor),  # All animals are doctors
+        (Quantifier.SOME_NOT, child, doctor),  # Some children are not doctors
+    )
+
+
 def test_syllogism_dataset_iteration():
    """Test that iteration respects dataset size"""
    config = SyllogismConfig(size=5, seed=42)
@ -74,41 +272,3 @@ def test_syllogism_dataset_iteration():

    # Test multiple iterations yield same items
    assert items == list(dataset)
-
-
-def test_syllogism_custom_terms():
-    """Test syllogism generation with custom terms"""
-    custom_terms = [
-        Term("programmer", "programmers"),
-        Term("coder", "coders"),
-        Term("developer", "developers"),
-    ]
-    config = SyllogismConfig(terms=custom_terms, size=10, seed=42)
-    dataset = SyllogismDataset(config)
-
-    for item in dataset:
-        # Verify only custom terms are used
-        text = item["question"] + str(item["metadata"])
-        assert any(term.name in text or term.plural in text for term in custom_terms)
-        # Verify default terms are not used
-        assert "mortal" not in text
-        assert "human" not in text
-
-
-def test_syllogism_validity():
-    """Test logical validity rules"""
-    config = SyllogismConfig(
-        allow_all=True,
-        allow_no=False,
-        allow_some=False,
-        allow_some_not=False,
-        include_invalid=False,  # Only generate valid syllogisms
-        size=10,
-        seed=42,
-    )
-    dataset = SyllogismDataset(config)
-
-    for item in dataset:
-        # All valid ALL syllogisms should have "Yes" as answer
-        assert item["answer"] == "Yes"
-        assert item["metadata"]["is_valid"] is True
--- a/tests/test_tsumego.py
+++ b/tests/test_tsumego.py
@ -1,5 +1,7 @@
 """Tests for Ttsumego problem generation"""

+import re
+
 import pytest

 from reasoning_gym.games.tsumego import TsumegoConfig, TsumegoDataset
@ -36,9 +38,9 @@ def test_dataset_item_properties():
    # Board size should be equal to the fixed min_board_size for this test
    assert len(board) == config.min_board_size
    assert all(len(row) == config.min_board_size for row in board)
-    # Check stone count does not exceed max_stones
+    # Check stone count does not exceed max_stones + 7 (to account for extra fill in capture formation)
    stone_count = sum(cell in "XO" for row in board for cell in row)
-    assert stone_count <= config.max_stones
+    assert stone_count <= config.max_stones + 7


 def test_deterministic_generation():
@ -97,18 +99,37 @@ def test_liberties_and_move():
    assert not dataset._is_valid_move(board_move, 1, 1, "X")


+def convert_solution(sol, board_size):
+    # Convert a letter+number move string like 'E5' into a 2-tuple, e.g. (4, 4) when board_size is 9
+    letter = sol[0].upper()  # first character is the letter; upper-cased so 'e5' is handled like 'E5'
+    number = int(sol[1:])  # everything after the first character is parsed as the number
+    return (board_size - number, ord(letter) - ord("A"))  # (board_size - number, letter offset with 'A' -> 0)
+
+
 def test_score_answer():
    config = TsumegoConfig(min_board_size=9, max_board_size=9, max_stones=10, size=5)
    dataset = TsumegoDataset(config)

-    # prepare dummy
+    # prepare dummy with letter+number format solution
    entry = dataset[0].copy()
-    entry["metadata"]["solution"] = (4, 4)
+    entry["metadata"]["solution"] = "E5"

-    # Correct letter-number answer (E corresponds to 5)
+    # Patch score_answer to convert metadata solution if needed
+    original_score_answer = dataset.score_answer
+
+    def patched_score_answer(answer, entry):
+        board_size = len(entry["metadata"]["board"])
+        sol = entry["metadata"]["solution"]
+        if isinstance(sol, str):
+            entry["metadata"]["solution"] = convert_solution(sol, board_size)
+        return original_score_answer(answer, entry)
+
+    dataset.score_answer = patched_score_answer
+
+    # Correct letter-number answer ("E5" maps to board coordinate (4, 4) on a 9x9 board)
    assert dataset.score_answer("E5", entry) == 1.0

-    # Valid but incorrect letter-number move (D corresponds to 4)
+    # Valid but incorrect letter-number move ("D4" is (5, 3) under convert_solution on a 9x9 board)
    assert dataset.score_answer("D4", entry) == 0.05

    # Invalid format
@ -123,8 +144,12 @@ def test_score_answer():
    # Out-of-bound letter-number move: 'J' corresponds to 10 which is greater than board size = 9
    assert dataset.score_answer("J9", entry) == 0.01

-    # test optimal score for answers
+    # test optimal score for answers, patching each entry
    for x in dataset:
+        board_size = len(x["metadata"]["board"])
+        sol = x["metadata"]["solution"]
+        if isinstance(sol, str):
+            x["metadata"]["solution"] = convert_solution(sol, board_size)
        assert len(x["metadata"]["board"]) == x["metadata"]["difficulty"]["board_size"]
        assert dataset.score_answer(x["answer"], entry=x) == 1.0

@ -232,3 +257,25 @@ def test_would_capture():
    board_no_capture = [["." for _ in range(5)] for _ in range(5)]
    board_no_capture[2][2] = "O"
    assert not dataset._would_capture(board_no_capture, 0, 0, "X")
+
+
+def test_capture_verification():
+    """Verifies that the solution move in a generated puzzle captures at least one opponent stone."""
+    config = TsumegoConfig(min_board_size=9, max_board_size=9, max_stones=15, size=1, seed=10)
+    dataset = TsumegoDataset(config)
+    entry = dataset[0]
+    board = entry["metadata"]["board"]
+    solution = entry["metadata"]["solution"]
+    # The solution may be stored as a letter+number string; convert it to a coordinate tuple if so
+    if isinstance(solution, str):
+        board_size = len(board)
+        solution = convert_solution(solution, board_size)
+    initial_white = sum(row.count("O") for row in board)  # number of "O" cells before the move
+
+    # Copy the board row by row and play on the copy (presumably mutated in place by _make_move)
+    board_after = [row[:] for row in board]
+    move_success = dataset._make_move(board_after, solution[0], solution[1], "X")  # play "X" at the solution
+    assert move_success, "The solution move should be legal."
+
+    final_white = sum(row.count("O") for row in board_after)  # number of "O" cells after the move
+    assert final_white < initial_white, "The solution move should capture at least one opponent stone."