diff --git a/README.md b/README.md
index 9335a1d2..0552092e 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 - `WordSequenceReversalDataset`: Reverse word order in text spans
 - `WordLadderDataset`: Generate word ladder puzzles where one word is transformed into another by changing one letter at a time
 - `GroupAnagramsDataset`: Group anagrams together in a list of words
+- `IsomorphicStringsDataset`: Check if two strings are isomorphic (the characters of one map one-to-one onto the other)
 
 ### Code Tasks
 
diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py
index 4e4688bf..60247372 100644
--- a/reasoning_gym/algorithmic/__init__.py
+++ b/reasoning_gym/algorithmic/__init__.py
@@ -9,6 +9,7 @@ Algorithmic tasks for training reasoning capabilities:
 from .base_conversion import BaseConversionConfig, BaseConversionDataset
 from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset
 from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset
+from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset
 from .letter_counting import LetterCountingConfig, LetterCountingDataset
 from .letter_jumble import LetterJumbleConfig, LetterJumbleDataset
 from .number_filtering import NumberFilteringConfig, NumberFilteringDataset
@@ -48,4 +49,6 @@ __all__ = [
     "PalindromeDataset",
     "GroupAnagramsConfig",
     "GroupAnagramsDataset",
+    "IsomorphicStringsConfig",
+    "IsomorphicStringsDataset",
 ]
diff --git a/reasoning_gym/algorithmic/isomorphic_strings.py b/reasoning_gym/algorithmic/isomorphic_strings.py
new file mode 100644
index 00000000..3b4a59e5
--- /dev/null
+++ b/reasoning_gym/algorithmic/isomorphic_strings.py
@@ -0,0 +1,129 @@
+"""Check if two strings are isomorphic.
+
+Two strings are isomorphic if the characters in one string can be replaced to get the second string.
+
+A popular Leetcode problem:
+https://leetcode.com/problems/isomorphic-strings/description/
+"""
+
+from dataclasses import dataclass
+from random import Random
+from typing import Optional
+
+from ..factory import ProceduralDataset, register_dataset
+
+QUESTION_TEMPLATE = """Two strings are isomorphic if the characters in one string can be replaced to get the second string.
+
+All occurrences of a character must be replaced with another character while preserving the order of characters.
+
+No two characters may map to the same character, but a character may map to itself.
+
+Example 1:
+Input: egg add
+Output: True
+Explanation: The strings s and t can be made identical by:
+    - Mapping 'e' to 'a'.
+    - Mapping 'g' to 'd'.
+
+Example 2:
+Input: foo bar
+Output: False
+Explanation:
+    - The strings cannot be made identical as 'o' needs to be mapped to both 'a' and 'r'.
+
+Return True if the following two strings are isomorphic, or False otherwise:
+{s} {t}
+"""
+
+
+@dataclass
+class IsomorphicStringsConfig:
+    """Configuration for Isomorphic Strings dataset generation"""
+
+    max_string_length: int = 10  # Maximum length of the strings
+    p_solvable: float = 0.5  # Probability that the generated question is solvable
+
+    size: int = 500  # Virtual dataset size
+    seed: Optional[int] = None
+
+    def validate(self):
+        """Validate configuration parameters"""
+        assert 2 <= self.max_string_length, "max_string_length must be at least 2"
+        assert 0 <= self.p_solvable <= 1, "p_solvable must be between 0 and 1"
+
+
+class IsomorphicStringsDataset(ProceduralDataset):
+    """Generates Isomorphic Strings exercises with configurable difficulty"""
+
+    def __init__(self, config: IsomorphicStringsConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self.letters = {chr(i) for i in range(ord("a"), ord("z") + 1)}
+
+    def _check_isomorphic(self, s: str, t: str) -> bool:
+        """Check if two strings are isomorphic"""
+        if len(s) != len(t):
+            return False
+
+        mapping, inverse_mapping = {}, {}  # s -> t, t -> s
+        for char_s, char_t in zip(s, t):
+            # setdefault records a first-seen pairing and returns any earlier one,
+            # so a mismatch in either direction means the mapping is not one-to-one
+            if mapping.setdefault(char_s, char_t) != char_t:
+                return False
+            if inverse_mapping.setdefault(char_t, char_s) != char_s:
+                return False
+
+        return True
+
+    def _generate_inputs(self, rng: Random, solvable: bool) -> tuple[str, str]:
+        """Generate the two input strings
+
+        With solvable=True the pair is isomorphic; otherwise one extra position is
+        inserted that maps an already used letter of s to a different letter in t.
+        """
+        # Iteration order of a set of str depends on the per-process hash seed, so
+        # sample from a sorted list to get the same strings for the same seed
+        alphabet = sorted(self.letters)
+        s, t = [], []
+        mapping = {}
+
+        # Generate a valid isomorphic pair first (leave one character for potential conflict)
+        for _ in range(rng.randint(1, self.config.max_string_length - 1)):
+            char_s = rng.choice(alphabet)
+            if char_s not in mapping:
+                # Choose a random character that is not already mapped
+                used = set(mapping.values())
+                char_t = rng.choice([c for c in alphabet if c not in used])
+                mapping[char_s] = char_t
+            else:
+                # Use the existing mapping
+                char_t = mapping[char_s]
+            s.append(char_s)
+            t.append(char_t)
+
+        if not solvable:
+            # Solution should be unsolvable, create conflict
+            letter = rng.choice(list(mapping))
+            conflict = rng.choice([c for c in alphabet if c != mapping[letter]])
+            insert_idx = rng.randint(0, len(s))
+            s.insert(insert_idx, letter)
+            t.insert(insert_idx, conflict)
+
+        return "".join(s), "".join(t)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Generate a single Isomorphic Strings question"""
+        rng = Random(self.seed + idx)
+
+        solvable = rng.random() < self.config.p_solvable
+        s, t = self._generate_inputs(rng, solvable)
+        answer = self._check_isomorphic(s, t)
+
+        return {
+            "question": QUESTION_TEMPLATE.format(s=s, t=t),
+            "answer": str(answer),
+            "metadata": {"words": [s, t], "solution": answer, "solvable": solvable},
+        }
+
+
+register_dataset("isomorphic_strings", IsomorphicStringsDataset, IsomorphicStringsConfig)
diff --git a/tests/test_isomorphic_strings.py b/tests/test_isomorphic_strings.py
new file mode 100644
index 00000000..6e515cf7
--- /dev/null
+++ b/tests/test_isomorphic_strings.py
@@ -0,0 +1,124 @@
+"""Tests for Isomorphic Strings questions generation"""
+
+import json
+import os
+import subprocess
+import sys
+
+import pytest
+
+from reasoning_gym.algorithmic.isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset
+
+
+def test_isomorphic_strings_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = IsomorphicStringsConfig(max_string_length=-1)  # Negative not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IsomorphicStringsConfig(max_string_length=0)  # Zero not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IsomorphicStringsConfig(max_string_length=1)  # One not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IsomorphicStringsConfig(p_solvable=-0.01)  # < 0 not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IsomorphicStringsConfig(p_solvable=1.01)  # > 1 not allowed
+        config.validate()
+
+
+def test_isomorphic_strings_dataset_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = IsomorphicStringsConfig(seed=42, size=10)
+    dataset1 = IsomorphicStringsDataset(config)
+    dataset2 = IsomorphicStringsDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_isomorphic_strings_dataset_independent_of_hash_seed():
+    """Test that generated words do not depend on the interpreter's string hash seed"""
+    script = (
+        "import json\n"
+        "from reasoning_gym.algorithmic import isomorphic_strings as iso\n"
+        "dataset = iso.IsomorphicStringsDataset(iso.IsomorphicStringsConfig(seed=42, size=5))\n"
+        "print(json.dumps([item['metadata']['words'] for item in dataset]))\n"
+    )
+    outputs = []
+    for hash_seed in ("1", "2"):
+        env = dict(os.environ, PYTHONHASHSEED=hash_seed, PYTHONPATH=os.pathsep.join(filter(None, sys.path)))
+        result = subprocess.run([sys.executable, "-c", script], env=env, capture_output=True, text=True)
+        assert result.returncode == 0, result.stderr
+        outputs.append(json.loads(result.stdout.strip().splitlines()[-1]))
+
+    assert outputs[0] == outputs[1]
+
+
+def test_isomorphic_strings_dataset_items():
+    """Test basic properties of generated items"""
+    config = IsomorphicStringsConfig(max_string_length=10, size=10, seed=42)
+    dataset = IsomorphicStringsDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        # Check item structure
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # Check metadata
+        assert "words" in item["metadata"]
+        assert "solution" in item["metadata"]
+        assert "solvable" in item["metadata"]
+
+        words = item["metadata"]["words"]
+        solution = item["metadata"]["solution"]
+        solvable = item["metadata"]["solvable"]
+
+        # Verify list dimensions
+        assert len(words) == 2
+        assert len(words[0]) == len(words[1])
+        assert 1 <= len(words[0]) <= config.max_string_length
+        assert solution in {True, False}
+        assert solvable in {True, False}
+        assert solution == solvable
+        assert item["answer"] == str(solution)
+
+
+def test_isomorphic_strings_dataset_iteration():
+    """Test that iteration respects dataset size"""
+    config = IsomorphicStringsConfig(size=5, seed=42)
+    dataset = IsomorphicStringsDataset(config)
+
+    items = list(dataset)
+    assert len(items) == config.size
+
+    # Test multiple iterations yield same items
+    assert items == list(dataset)
+
+
+def test_isomorphic_strings_answer():
+    """Test the _check_isomorphic method"""
+    config = IsomorphicStringsConfig(seed=42)
+    dataset = IsomorphicStringsDataset(config)
+
+    # General use case
+    assert dataset._check_isomorphic("foo", "bar") is False
+    assert dataset._check_isomorphic("foo", "baa") is True
+
+    # Two different characters may not map to the same character
+    assert dataset._check_isomorphic("ab", "cc") is False
+
+    # Unequal lengths
+    assert dataset._check_isomorphic("foo", "bo") is False
+
+    # Empty strings
+    assert dataset._check_isomorphic("", "") is True