mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-05-03 17:53:26 +00:00
Merge remote-tracking branch 'origin/main' into feat/re-arc
# Please enter a commit message to explain why this merge is necessary,
fetched remote changes
This commit is contained in:
commit
44d0f8e2b1
5 changed files with 489 additions and 184 deletions
|
|
@ -64,6 +64,204 @@ def test_syllogism_dataset_items():
|
|||
assert "Does it logically follow that:" in item["question"]
|
||||
|
||||
|
||||
def test_valid_syllogism_forms():
    """Check `_is_valid_syllogism` against a catalogue of valid and invalid forms.

    Each case is a (major premise, minor premise, conclusion) triple, where a
    statement is a `(Quantifier, subject Term, predicate Term)` tuple.
    """
    dataset = SyllogismDataset(SyllogismConfig(size=1, seed=42))

    # Roles of the three terms used by the catalogue below:
    #   S = minor term (subject of the conclusion)
    #   M = middle term (appears in both premises)
    #   P = major term (predicate of the conclusion)
    minor = Term("mortal", "mortals")
    middle = Term("human", "humans")
    major = Term("animal", "animals")

    every = Quantifier.ALL
    none = Quantifier.NO
    some = Quantifier.SOME
    some_not = Quantifier.SOME_NOT

    valid_forms = [
        # Barbara (AAA-1): All M are P; All S are M; therefore All S are P
        ((every, middle, major), (every, minor, middle), (every, minor, major)),
        # Celarent (EAE-1): No M are P; All S are M; therefore No S are P
        ((none, middle, major), (every, minor, middle), (none, minor, major)),
        # Cesare (EAE-2): No P are M; All S are M; therefore No S are P
        ((none, major, middle), (every, minor, middle), (none, minor, major)),
        # Darii (AII-1): All M are P; Some S are M; therefore Some S are P
        ((every, middle, major), (some, minor, middle), (some, minor, major)),
        # Disamis (IAI-3): Some M are P; All M are S; therefore Some S are P
        ((some, middle, major), (every, middle, minor), (some, minor, major)),
        # Ferio (EIO-1): No M are P; Some S are M; therefore Some S are not P
        ((none, middle, major), (some, minor, middle), (some_not, minor, major)),
        # Festino (EIO-2): No P are M; Some S are M; therefore Some S are not P
        ((none, major, middle), (some, minor, middle), (some_not, minor, major)),
        # Datisi (AII-3): All M are P; Some M are S; therefore Some S are P
        ((every, middle, major), (some, middle, minor), (some, minor, major)),
        # Bocardo (OAO-3): Some M are not P; All M are S; therefore Some S are not P
        ((some_not, middle, major), (every, middle, minor), (some_not, minor, major)),
        # Baroco (AOO-2): All P are M; Some S are not M; therefore Some S are not P
        ((every, major, middle), (some_not, minor, middle), (some_not, minor, major)),
        # Camestres (AEE-2): All P are M; No S are M; therefore No S are P
        ((every, major, middle), (none, minor, middle), (none, minor, major)),
        # Dimaris (IAI-4): Some P are M; All M are S; therefore Some S are P
        ((some, major, middle), (every, middle, minor), (some, minor, major)),
        # Ferison (EIO-3): No M are P; Some M are S; therefore Some S are not P
        ((none, middle, major), (some, middle, minor), (some_not, minor, major)),
        # Fresison (EIO-4): No P are M; Some M are S; therefore Some S are not P
        ((none, major, middle), (some, middle, minor), (some_not, minor, major)),
        # Camenes (AEE-4): All P are M; No M are S; therefore No S are P
        ((every, major, middle), (none, middle, minor), (none, minor, major)),
    ]
    for major_premise, minor_premise, conclusion in valid_forms:
        assert dataset._is_valid_syllogism(major_premise, minor_premise, conclusion)

    invalid_forms = [
        # Two particular premises: Some M are P; Some S are M; Some S are P
        ((some, middle, major), (some, minor, middle), (some, minor, major)),
        # Two negative premises: No M are P; No S are M; No S are P
        ((none, middle, major), (none, minor, middle), (none, minor, major)),
    ]
    for first_premise, second_premise, conclusion in invalid_forms:
        assert not dataset._is_valid_syllogism(first_premise, second_premise, conclusion)

    # Concrete two-negative-premise case:
    # No students are humans; No humans are chefs; No students are chefs
    student = Term("student", "students")
    human = Term("human", "humans")
    chef = Term("chef", "chefs")
    assert not dataset._is_valid_syllogism(
        (none, student, human),
        (none, human, chef),
        (none, student, chef),
    )

    # Some children are not animals; All animals are doctors;
    # Some children are not doctors -- expected to be rejected as invalid.
    child = Term("child", "children")
    animal = Term("animal", "animals")
    doctor = Term("doctor", "doctors")
    assert not dataset._is_valid_syllogism(
        (some_not, child, animal),
        (every, animal, doctor),
        (some_not, child, doctor),
    )
|
||||
|
||||
|
||||
def test_syllogism_dataset_iteration():
|
||||
"""Test that iteration respects dataset size"""
|
||||
config = SyllogismConfig(size=5, seed=42)
|
||||
|
|
@ -74,41 +272,3 @@ def test_syllogism_dataset_iteration():
|
|||
|
||||
# Test multiple iterations yield same items
|
||||
assert items == list(dataset)
|
||||
|
||||
|
||||
def test_syllogism_custom_terms():
    """Items generated from a custom vocabulary mention it and avoid default words."""
    vocabulary = [
        Term(singular, plural)
        for singular, plural in (
            ("programmer", "programmers"),
            ("coder", "coders"),
            ("developer", "developers"),
        )
    ]
    dataset = SyllogismDataset(SyllogismConfig(terms=vocabulary, size=10, seed=42))

    for entry in dataset:
        # Search the question together with the stringified metadata.
        haystack = entry["question"] + str(entry["metadata"])

        # At least one custom term (singular or plural) must appear.
        mentioned = False
        for term in vocabulary:
            if term.name in haystack or term.plural in haystack:
                mentioned = True
                break
        assert mentioned

        # Words from the default vocabulary must not appear.
        assert "mortal" not in haystack
        assert "human" not in haystack
|
||||
|
||||
|
||||
def test_syllogism_validity():
    """With only ALL allowed and invalid forms excluded, every answer is "Yes"."""
    options = {
        "allow_all": True,
        "allow_no": False,
        "allow_some": False,
        "allow_some_not": False,
        "include_invalid": False,  # Only generate valid syllogisms
        "size": 10,
        "seed": 42,
    }
    dataset = SyllogismDataset(SyllogismConfig(**options))

    for entry in dataset:
        # The answer and the validity flag in the metadata must agree.
        assert entry["answer"] == "Yes"
        assert entry["metadata"]["is_valid"] is True
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
"""Tests for Ttsumego problem generation"""
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
from reasoning_gym.games.tsumego import TsumegoConfig, TsumegoDataset
|
||||
|
|
@ -36,9 +38,9 @@ def test_dataset_item_properties():
|
|||
# Board size should be equal to the fixed min_board_size for this test
|
||||
assert len(board) == config.min_board_size
|
||||
assert all(len(row) == config.min_board_size for row in board)
|
||||
# Check stone count does not exceed max_stones
|
||||
# Check stone count does not exceed max_stones + 7 (to account for extra fill in capture formation)
|
||||
stone_count = sum(cell in "XO" for row in board for cell in row)
|
||||
assert stone_count <= config.max_stones
|
||||
assert stone_count <= config.max_stones + 7
|
||||
|
||||
|
||||
def test_deterministic_generation():
|
||||
|
|
@ -97,18 +99,37 @@ def test_liberties_and_move():
|
|||
assert not dataset._is_valid_move(board_move, 1, 1, "X")
|
||||
|
||||
|
||||
def convert_solution(sol, board_size):
    """Translate a letter+number move such as 'E5' into a (row, col) index pair.

    The first character is the column letter (case-insensitive, counted from
    'A' with no letters skipped); the remainder is the row number, counted so
    that the result is ``board_size`` minus that number.
    """
    column_char = sol[0].upper()
    row_number = int(sol[1:])
    row_index = board_size - row_number
    col_index = ord(column_char) - ord("A")
    return row_index, col_index
|
||||
|
||||
|
||||
def test_score_answer():
|
||||
config = TsumegoConfig(min_board_size=9, max_board_size=9, max_stones=10, size=5)
|
||||
dataset = TsumegoDataset(config)
|
||||
|
||||
# prepare dummy
|
||||
# prepare dummy with letter+number format solution
|
||||
entry = dataset[0].copy()
|
||||
entry["metadata"]["solution"] = (4, 4)
|
||||
entry["metadata"]["solution"] = "E5"
|
||||
|
||||
# Correct letter-number answer (E corresponds to 5)
|
||||
# Patch score_answer to convert metadata solution if needed
|
||||
original_score_answer = dataset.score_answer
|
||||
|
||||
def patched_score_answer(answer, entry):
|
||||
board_size = len(entry["metadata"]["board"])
|
||||
sol = entry["metadata"]["solution"]
|
||||
if isinstance(sol, str):
|
||||
entry["metadata"]["solution"] = convert_solution(sol, board_size)
|
||||
return original_score_answer(answer, entry)
|
||||
|
||||
dataset.score_answer = patched_score_answer
|
||||
|
||||
# Correct letter-number answer (E corresponds to board coordinate (4,4) for a 9x9 board)
|
||||
assert dataset.score_answer("E5", entry) == 1.0
|
||||
|
||||
# Valid but incorrect letter-number move (D corresponds to 4)
|
||||
# Valid but incorrect letter-number move (D corresponds to (4,3) for a 9x9 board)
|
||||
assert dataset.score_answer("D4", entry) == 0.05
|
||||
|
||||
# Invalid format
|
||||
|
|
@ -123,8 +144,12 @@ def test_score_answer():
|
|||
# Out-of-bound letter-number move: 'J' corresponds to 10 which is greater than board size = 9
|
||||
assert dataset.score_answer("J9", entry) == 0.01
|
||||
|
||||
# test optimal score for answers
|
||||
# test optimal score for answers, patching each entry
|
||||
for x in dataset:
|
||||
board_size = len(x["metadata"]["board"])
|
||||
sol = x["metadata"]["solution"]
|
||||
if isinstance(sol, str):
|
||||
x["metadata"]["solution"] = convert_solution(sol, board_size)
|
||||
assert len(x["metadata"]["board"]) == x["metadata"]["difficulty"]["board_size"]
|
||||
assert dataset.score_answer(x["answer"], entry=x) == 1.0
|
||||
|
||||
|
|
@ -232,3 +257,25 @@ def test_would_capture():
|
|||
board_no_capture = [["." for _ in range(5)] for _ in range(5)]
|
||||
board_no_capture[2][2] = "O"
|
||||
assert not dataset._would_capture(board_no_capture, 0, 0, "X")
|
||||
|
||||
|
||||
def test_capture_verification():
    """Playing the puzzle's solution as "X" must remove at least one "O" stone."""
    dataset = TsumegoDataset(
        TsumegoConfig(min_board_size=9, max_board_size=9, max_stones=15, size=1, seed=10)
    )
    metadata = dataset[0]["metadata"]
    board = metadata["board"]
    move = metadata["solution"]

    # The solution may be stored as a letter+number string; convert it to
    # the (row, col) form that _make_move is called with below.
    if isinstance(move, str):
        move = convert_solution(move, len(board))

    def count_white(grid):
        return sum(row.count("O") for row in grid)

    white_before = count_white(board)

    # Play on a row-by-row copy so the generated board is left untouched.
    scratch = [row[:] for row in board]
    played = dataset._make_move(scratch, move[0], move[1], "X")
    assert played, "The solution move should be legal."

    white_after = count_white(scratch)
    assert white_after < white_before, "The solution move should capture at least one opponent stone."
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue