# reasoning-gym/tests/test_tower_of_hanoi.py

"""Tests for Tower of Hanoi puzzle generation"""

import re

import pytest

from reasoning_gym.games.tower_of_hanoi import HanoiConfig, HanoiDataset


def test_toh_config_validation():
    """Check that inconsistent HanoiConfig settings are rejected.

    Every case builds a config with one out-of-range or inverted setting and
    expects an AssertionError from construction or from validate().
    """
    # Zero disks: at least one disk is required.
    with pytest.raises(AssertionError):
        HanoiConfig(min_disks=0).validate()

    # Inverted disk range: max_disks below min_disks.
    with pytest.raises(AssertionError):
        HanoiConfig(min_disks=5, max_disks=3).validate()

    # Too few pegs: min_pegs below 3.
    with pytest.raises(AssertionError):
        HanoiConfig(min_pegs=2).validate()

    # Inverted peg range: max_pegs below min_pegs.
    with pytest.raises(AssertionError):
        HanoiConfig(min_pegs=3, max_pegs=2).validate()

    # NOTE: further cases (e.g. move-related settings, if such validations
    # exist) should be added based on the actual logic in HanoiConfig.


def test_toh_dataset_deterministic():
    """Two datasets built from the same seeded config must agree item by item."""
    shared_config = HanoiConfig(seed=42, size=10)
    first = HanoiDataset(shared_config)
    second = HanoiDataset(shared_config)

    # Index access (rather than iteration) is deliberate: it compares what
    # item lookup returns for each position in both datasets.
    for position in range(len(first)):
        assert first[position] == second[position], f"Mismatch found in instance {position} with seed 42."


def test_toh_dataset_items():
    """Check the structure and internal consistency of generated items.

    For every item this verifies that:
      - it is a dict with 'question', 'answer' and 'metadata' keys;
      - the metadata carries all expected fields;
      - num_pegs equals the auxiliary peg count plus the start and target pegs;
      - 'solution_length' equals len() of the answer;
      - the start and target pegs differ.
    """
    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
    dataset = HanoiDataset(config)

    item_keys = ("question", "answer", "metadata")
    metadata_keys = (
        "num_disks",
        "num_pegs",
        "start_peg",
        "target_peg",
        "auxiliary_pegs",
        "solution_length",
    )

    for i in range(len(dataset)):
        item = dataset[i]

        # Check item structure
        assert isinstance(item, dict), f"Item {i} is not a dictionary."
        for key in item_keys:
            assert key in item, f"Item {i} missing '{key}' key."

        # Check metadata fields are all present before reading any of them
        metadata = item["metadata"]
        for key in metadata_keys:
            assert key in metadata, f"Item {i} metadata missing '{key}'."

        num_pegs = metadata["num_pegs"]
        start_peg = metadata["start_peg"]
        target_peg = metadata["target_peg"]
        auxiliary_pegs = metadata["auxiliary_pegs"]
        solution_length = metadata["solution_length"]

        # Every peg is either the start, the target, or an auxiliary peg
        assert num_pegs == len(auxiliary_pegs) + 2, f"Item {i} has inconsistent peg counts."

        # Verify solution_length consistency
        assert solution_length == len(
            item["answer"]
        ), f"Item {i} metadata 'solution_length' does not match actual number of moves."

        # A puzzle whose start and target coincide would be trivially solved
        assert start_peg != target_peg, f"Item {i} has identical start and target pegs."


def test_toh_move_validity():
    """Replay each generated solution and check every move obeys the rules.

    The replay assumes pegs are numbered 1..num_pegs and that all disks start
    on the start peg, largest at the bottom. Each move must reference existing
    pegs, take the disk that is currently on top of the source peg, and never
    place a disk on top of a smaller one.
    """
    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
    dataset = HanoiDataset(config)

    for idx, instance in enumerate(dataset):
        metadata = instance["metadata"]
        pegs = list(range(1, metadata["num_pegs"] + 1))

        # Each peg is a stack whose last element is the top disk; the start
        # peg initially holds every disk in descending size order.
        pegs_state = {peg: [] for peg in pegs}
        pegs_state[metadata["start_peg"]].extend(range(metadata["num_disks"], 0, -1))

        for move_num, move in enumerate(instance["answer"], start=1):
            disk, from_peg, to_peg = parse_move(move)
            source = f"Move {move_num} in Instance {idx}"

            # Both pegs named by the move must exist
            assert from_peg in pegs, f"{source} references non-existent from_peg {from_peg}."
            assert to_peg in pegs, f"{source} references non-existent to_peg {to_peg}."

            # There must be something to pick up
            assert pegs_state[from_peg], f"{source} attempts to move from an empty Peg {from_peg}."

            # Only the top disk of the source peg may be moved
            top_disk = pegs_state[from_peg][-1]
            assert disk == top_disk, (
                f"{source} attempts to move disk {disk} "
                f"which is not on top of Peg {from_peg} (top disk: {top_disk})."
            )

            # A disk may only land on an empty peg or on a larger disk
            if pegs_state[to_peg]:
                top_to_disk = pegs_state[to_peg][-1]
                assert top_to_disk > disk, (
                    f"{source} attempts to place disk {disk} "
                    f"on top of smaller disk {top_to_disk} on Peg {to_peg}."
                )

            # Perform the move
            pegs_state[from_peg].pop()
            pegs_state[to_peg].append(disk)


def test_toh_final_state_correct():
    """Replay each solution and check it ends with every disk on the target peg.

    The replay assumes pegs are numbered 1..num_pegs. Moves are applied
    without rule checking; this test only looks at the resulting state: the
    target peg must hold all disks in descending order and every other peg
    must be empty.
    """
    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
    dataset = HanoiDataset(config)

    for idx, instance in enumerate(dataset):
        metadata = instance["metadata"]
        num_disks = metadata["num_disks"]
        target_peg = metadata["target_peg"]
        pegs = list(range(1, metadata["num_pegs"] + 1))

        # Each peg is a stack whose last element is the top disk; the start
        # peg initially holds every disk in descending size order.
        pegs_state = {peg: [] for peg in pegs}
        pegs_state[metadata["start_peg"]].extend(range(num_disks, 0, -1))

        # Perform all moves
        for move in instance["answer"]:
            disk, from_peg, to_peg = parse_move(move)
            pegs_state[from_peg].pop()
            pegs_state[to_peg].append(disk)

        # After all moves, all disks should be on the target peg
        final_pegs = pegs_state[target_peg]
        assert len(final_pegs) == num_disks, f"Instance {idx} does not have all disks on the target Peg {target_peg}."

        # ... and stacked largest-to-smallest from the bottom up
        expected_final = list(range(num_disks, 0, -1))
        assert final_pegs == expected_final, f"Instance {idx} has disks on Peg {target_peg} in incorrect order."

        # Ensure all other pegs are empty
        for peg in pegs:
            if peg == target_peg:
                continue
            assert not pegs_state[peg], f"Instance {idx} has disks remaining on Peg {peg}, which should be empty."


def test_toh_dataset_iteration():
    """Check that the dataset reports the configured size and iterates repeatably."""
    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=5, seed=42)
    dataset = HanoiDataset(config)

    # The reported length must match the configured size
    assert len(dataset) == config.size, f"Dataset size mismatch: expected {config.size}, got {len(dataset)}."

    # Two full passes over the dataset must produce equal item lists
    first_pass = list(dataset)
    second_pass = list(dataset)
    assert first_pass == second_pass, "Multiple iterations over the dataset do not yield the same items."


def parse_move(move_str: str) -> tuple:
    """Parse a move string and extract disk number, from peg, and to peg.

    The whole string must be a single move instruction; trailing whitespace
    is tolerated, but any other leading or trailing text is rejected so that
    malformed moves are not silently accepted.

    Args:
        move_str (str): Move instruction, e.g., "Move disk 2 from Peg 1 to Peg 3".

    Returns:
        tuple: (disk, from_peg, to_peg) as integers.

    Raises:
        AssertionError: If the string does not match the expected format.
    """
    pattern = r"Move disk (\d+) from Peg (\d+) to Peg (\d+)\s*"
    # fullmatch anchors both ends; a plain match would accept "... Peg 3x".
    match = re.fullmatch(pattern, move_str)
    if match is None:
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(f"Move string '{move_str}' does not match the expected format.")
    disk, from_peg, to_peg = (int(number) for number in match.groups())
    return disk, from_peg, to_peg


def is_valid_final_state(pegs_state: dict, target_peg: int, num_disks: int) -> bool:
    """Report whether the target peg holds the complete, correctly ordered stack.

    Only the target peg is inspected; the contents of other pegs are not
    checked here.

    Args:
        pegs_state (dict): Mapping of peg number to its stack of disks, with
            the top disk last.
        target_peg (int): The peg that should hold the finished tower.
        num_disks (int): Total number of disks in the puzzle.

    Returns:
        bool: True if the target peg holds disks num_disks..1 from bottom to
        top, False otherwise.
    """
    stack = pegs_state[target_peg]
    solved_stack = list(range(num_disks, 0, -1))
    # Compare the size first so a wrong disk count short-circuits to False.
    return len(stack) == num_disks and stack == solved_stack


def test_score_answer():
    """
    Test that the score_answer method returns the expected reward values.

    Expected behavior, as stated by the original test author:
      - Correct answer (i.e. equivalent in length, or better, than the one provided in the dataset item) gives 1.0.
      - A correct solution that is suboptimal length gives a proportional reward of optimal_move_count/user_move_count
        (not exercised by this test).
      - A badly formatted answer gives a minimal reward (0.01).
      - An answer that is syntactically valid but does not solve the puzzle gives a partial reward (0.05).
      - An empty string gives 0.01.
      - None gives 0.0.
    """
    # Build a small seeded dataset with explicit disk/peg ranges.
    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=5, seed=42)
    dataset = HanoiDataset(config)
    # The first item supplies both the puzzle and its reference solution.
    item = dataset[0]
    reference_solution = item["answer"]

    # (label used in the failure message, submitted answer, expected score)
    cases = [
        # 1. The reference solution earns the full reward.
        ("Correct answer", reference_solution, 1.0),
        # 2. Text that is not in move format earns the minimal reward.
        ("Badly formatted answer", "a wrong solution", 0.01),
        # 3. Dropping the final element of the reference solution leaves a
        #    well-formed but unsolved attempt.
        ("Unsolved answer", reference_solution[:-1], 0.05),
        # 4. An empty string is scored like a badly formatted answer.
        ("Empty answer", "", 0.01),
        # 5. A missing answer earns nothing.
        ("None answer", None, 0.0),
    ]

    for label, submitted, expected in cases:
        score = dataset.score_answer(answer=submitted, entry=item)
        assert score == expected, f"{label} score {score} is not {expected}."