mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
isomorphic strings
This commit is contained in:
parent
071c22a809
commit
d78ce0a9f7
4 changed files with 233 additions and 0 deletions
|
|
@ -99,6 +99,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
|
|||
- `WordSequenceReversalDataset`: Reverse word order in text spans
|
||||
- `WordLadderDataset`: Generate word ladder puzzles where one word is transformed into another by changing one letter at a time
|
||||
- `GroupAnagramsDataset`: Group anagrams together in a list of words
|
||||
- `IsomorphicStringsDataset`: Check if two strings are isomorphic (a one-to-one character mapping turns one string into the other)
|
||||
|
||||
### <small>Code Tasks</small>
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ Algorithmic tasks for training reasoning capabilities:
|
|||
from .base_conversion import BaseConversionConfig, BaseConversionDataset
|
||||
from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset
|
||||
from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset
|
||||
from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset
|
||||
from .letter_counting import LetterCountingConfig, LetterCountingDataset
|
||||
from .letter_jumble import LetterJumbleConfig, LetterJumbleDataset
|
||||
from .number_filtering import NumberFilteringConfig, NumberFilteringDataset
|
||||
|
|
@ -48,4 +49,6 @@ __all__ = [
|
|||
"PalindromeDataset",
|
||||
"GroupAnagramsConfig",
|
||||
"GroupAnagramsDataset",
|
||||
"IsomorphicStringsConfig",
|
||||
"IsomorphicStringsDataset",
|
||||
]
|
||||
|
|
|
|||
121
reasoning_gym/algorithmic/isomorphic_strings.py
Normal file
121
reasoning_gym/algorithmic/isomorphic_strings.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
"""Check if two strings are isomorphic.
|
||||
|
||||
Two strings are isomorphic if the characters in one string can be replaced to get the second string.
|
||||
|
||||
A popular Leetcode problem:
|
||||
https://leetcode.com/problems/isomorphic-strings/description/
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from random import Random
|
||||
from typing import Optional
|
||||
|
||||
from ..factory import ProceduralDataset, register_dataset
|
||||
|
||||
QUESTION_TEMPLATE = """Two strings are isomorphic if the characters in one string can be replaced to get the second string.
|
||||
|
||||
All occurrences of a character must be replaced with another character while preserving the order of characters.
|
||||
|
||||
No two characters may map to the same character, but a character may map to itself.
|
||||
|
||||
Example 1:
|
||||
Input: egg add
|
||||
Output: True
|
||||
Explanation: The strings s and t can be made identical by:
|
||||
- Mapping 'e' to 'a'.
|
||||
- Mapping 'g' to 'd'.
|
||||
|
||||
Example 2:
|
||||
Input: foo bar
|
||||
Output: False
|
||||
Explanation:
|
||||
- The strings cannot be made identical as 'o' needs to be mapped to both 'a' and 'r'.
|
||||
|
||||
Return True if the following two strings are isomorphic, or False otherwise:
|
||||
{s} {t}
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
class IsomorphicStringsConfig:
    """Settings that control how Isomorphic Strings items are generated."""

    # Upper bound on the length of each generated string (at least 2)
    max_string_length: int = 10
    # Chance, within [0, 1], that a generated pair is isomorphic
    p_solvable: float = 0.5

    # Number of items exposed by the virtual dataset
    size: int = 500
    # Seed used for generation; defaults to None
    seed: Optional[int] = None

    def validate(self):
        """Check every setting and raise AssertionError on the first bad one."""
        assert self.max_string_length >= 2, "max_string_length must be at least 2"
        assert 0 <= self.p_solvable <= 1, "p_solvable must be between 0 and 1"
|
||||
|
||||
|
||||
class IsomorphicStringsDataset(ProceduralDataset):
    """Generates Isomorphic Strings exercises with configurable difficulty.

    Each item asks whether two lowercase strings are isomorphic, i.e. whether
    a one-to-one character mapping turns the first string into the second.
    """

    def __init__(self, config: IsomorphicStringsConfig):
        """Set up the dataset from ``config`` and build the lowercase alphabet."""
        super().__init__(config=config, seed=config.seed, size=config.size)
        # Kept as a set so existing users of this attribute keep working; any
        # random draw must go through a sorted copy (see `_generate_inputs`).
        self.letters = {chr(i) for i in range(ord("a"), ord("z") + 1)}

    def _check_isomorphic(self, s: str, t: str) -> bool:
        """Return True if ``s`` and ``t`` are isomorphic.

        Strings of different lengths are never isomorphic; two empty strings
        are. The mapping must be consistent in both directions: a character
        of ``s`` always maps to the same character of ``t``, and no two
        characters of ``s`` share a target.
        """
        if len(s) != len(t):
            return False

        mapping, inverse_mapping = {}, {}  # s -> t, t -> s
        for char_s, char_t in zip(s, t):
            # setdefault records a first-time pairing and otherwise returns
            # the earlier one, so a mismatch means the mapping is inconsistent.
            if mapping.setdefault(char_s, char_t) != char_t:
                return False
            if inverse_mapping.setdefault(char_t, char_s) != char_s:
                return False

        return True

    def _generate_inputs(self, rng: Random, solvable: bool) -> tuple[str, str]:
        """Generate the two input strings.

        Args:
            rng: Random source; all randomness comes from it.
            solvable: If True, the returned pair is isomorphic. If False, one
                conflicting position is inserted so that it is not.

        Returns:
            The pair ``(s, t)`` of equal-length lowercase strings.
        """
        # Set iteration order for strings depends on per-process hash
        # randomization, so drawing from `list(some_set)` would make the same
        # seed produce different items in different interpreter runs. Always
        # draw from a sorted sequence instead.
        alphabet = sorted(self.letters)

        s, t = [], []
        mapping = {}
        used_targets = set()

        # Generate a valid isomorphic pair first (leave one character for potential conflict)
        for _ in range(rng.randint(1, self.config.max_string_length - 1)):
            char_s = rng.choice(alphabet)
            if char_s not in mapping:
                # Choose a random character that is not already mapped, which
                # keeps the mapping one-to-one.
                char_t = rng.choice([c for c in alphabet if c not in used_targets])
                mapping[char_s] = char_t
                used_targets.add(char_t)
            else:
                # Use the existing mapping
                char_t = mapping[char_s]
            s.append(char_s)
            t.append(char_t)

        if not solvable:
            # Re-use a character already present in `s`, but pair it with a
            # different target: that character now needs two images, so the
            # pair cannot be isomorphic. Dict order is insertion order, so
            # this draw is reproducible as is.
            letter = rng.choice(list(mapping))
            conflict = rng.choice([c for c in alphabet if c != mapping[letter]])
            insert_idx = rng.randint(0, len(s))
            s.insert(insert_idx, letter)
            t.insert(insert_idx, conflict)

        return "".join(s), "".join(t)

    def __getitem__(self, idx: int) -> dict:
        """Generate a single Isomorphic Strings question.

        Args:
            idx: Item index; it is added to the dataset seed to seed the
                generator for this item.

        Returns:
            A dict with the formatted ``question``, the ``answer`` as the
            string ``"True"`` or ``"False"``, and ``metadata`` holding the two
            words, the boolean solution and the requested ``solvable`` flag.
        """
        rng = Random(self.seed + idx)

        solvable = rng.random() < self.config.p_solvable
        s, t = self._generate_inputs(rng, solvable)
        # Recomputed rather than copied from `solvable`, so the stored answer
        # always reflects the strings that were actually generated.
        answer = self._check_isomorphic(s, t)

        return {
            "question": QUESTION_TEMPLATE.format(s=s, t=t),
            "answer": str(answer),
            "metadata": {"words": [s, t], "solution": answer, "solvable": solvable},
        }
|
||||
|
||||
|
||||
# Register the dataset class together with its config class under the name
# "isomorphic_strings".
register_dataset("isomorphic_strings", IsomorphicStringsDataset, IsomorphicStringsConfig)
|
||||
108
tests/test_isomorphic_strings.py
Normal file
108
tests/test_isomorphic_strings.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
"""Tests for Isomorphic Strings questions generation"""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from reasoning_gym.algorithmic.isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset
|
||||
|
||||
|
||||
def test_isomorphic_strings_config_validation():
    """Every out-of-range setting must be rejected with an AssertionError"""
    rejected_settings = (
        {"max_string_length": -1},  # Negative not allowed
        {"max_string_length": 0},  # Zero not allowed
        {"max_string_length": 1},  # One not allowed
        {"p_solvable": -0.01},  # < 0 not allowed
        {"p_solvable": 1.01},  # > 1 not allowed
    )

    for overrides in rejected_settings:
        with pytest.raises(AssertionError):
            bad_config = IsomorphicStringsConfig(**overrides)
            bad_config.validate()
|
||||
|
||||
|
||||
def test_isomorphic_strings_dataset_deterministic():
    """Two datasets built from one seeded config must agree item by item"""
    shared_config = IsomorphicStringsConfig(seed=42, size=10)
    first, second = (
        IsomorphicStringsDataset(shared_config),
        IsomorphicStringsDataset(shared_config),
    )

    for position in range(len(first)):
        assert first[position] == second[position]
|
||||
|
||||
|
||||
def test_isomorphic_strings_dataset_items():
    """Check the structure and internal consistency of generated items"""
    dataset = IsomorphicStringsDataset(IsomorphicStringsConfig(max_string_length=10, size=10, seed=42))

    for position in range(len(dataset)):
        entry = dataset[position]

        # Top-level layout of an item
        assert isinstance(entry, dict)
        for top_level_key in ("question", "answer", "metadata"):
            assert top_level_key in entry

        # Layout of the metadata
        for metadata_key in ("words", "solution", "solvable"):
            assert metadata_key in entry["metadata"]

        metadata = entry["metadata"]
        words = metadata["words"]
        solution = metadata["solution"]
        solvable = metadata["solvable"]

        # Exactly two strings, and both flags are booleans
        assert len(words) == 2
        assert solution in {True, False}
        assert solvable in {True, False}
        # The computed answer must match what the generator was asked for
        assert solution == solvable
|
||||
|
||||
|
||||
def test_isomorphic_strings_dataset_iteration():
    """Iterating yields `size` items, and the same items on every pass"""
    config = IsomorphicStringsConfig(size=5, seed=42)
    dataset = IsomorphicStringsDataset(config)

    first_pass = list(dataset)
    assert len(first_pass) == config.size

    # A second full iteration must reproduce the first one
    second_pass = list(dataset)
    assert first_pass == second_pass
|
||||
|
||||
|
||||
def test_isomorphic_strings_answer():
    """Test the _check_isomorphic method"""
    config = IsomorphicStringsConfig(seed=42)
    dataset = IsomorphicStringsDataset(config)

    # Forward conflict: 'o' would have to map to both 'a' and 'r'
    assert dataset._check_isomorphic("foo", "bar") is False

    # Consistent one-to-one mapping: f -> b, o -> a
    assert dataset._check_isomorphic("foo", "baa") is True

    # Inverse conflict: 'a' and 'r' would both have to map to 'o'
    assert dataset._check_isomorphic("bar", "foo") is False

    # Unequal lengths
    assert dataset._check_isomorphic("foo", "bo") is False

    # Empty strings
    assert dataset._check_isomorphic("", "") is True
|
||||
Loading…
Add table
Add a link
Reference in a new issue