"""Letter counting task generator"""

import re
from dataclasses import dataclass
from random import Random
from typing import List, Optional

# Public API of this module. The package-level
# ``reasoning_gym/algorithmic/__init__.py`` re-exports exactly these names.
__all__ = ["LetterCountingConfig", "LetterCountingDataset", "letter_counting_dataset"]


@dataclass
class LetterCountingConfig:
    """Configuration for letter counting task generation."""

    min_words: int = 5  # Minimum words in span
    max_words: int = 15  # Maximum words in span
    seed: Optional[int] = None  # Base seed; None lets the dataset draw a random one
    size: int = 500  # Virtual dataset size

    def validate(self) -> None:
        """Validate configuration parameters.

        Raises:
            AssertionError: If ``min_words`` is not positive or ``max_words``
                is smaller than ``min_words``.
        """
        # Raised explicitly rather than via ``assert`` so the checks are not
        # stripped when Python runs with -O; the exception type is unchanged.
        if not self.min_words > 0:
            raise AssertionError("min_words must be positive")
        if not self.max_words >= self.min_words:
            raise AssertionError("max_words must be >= min_words")


class LetterCountingDataset:
    """Generates letter counting tasks from text spans.

    Every item is derived from ``seed + idx`` only, so indexing the same
    position twice yields the same task. The iteration cursor is stored on the
    dataset itself, which means two simultaneous loops over one instance share
    that cursor.
    """

    def __init__(self, config: LetterCountingConfig):
        """Validate ``config``, fix the base seed and load the word list.

        Args:
            config: Generation parameters; ``config.validate()`` is called here.

        Raises:
            AssertionError: If ``config`` fails validation.
        """
        self.config = config
        self.config.validate()
        self.seed = config.seed if config.seed is not None else Random().randint(0, 2**32)
        # Initialised here so that next() before iter() does not fail with
        # AttributeError.
        self._current_idx = 0

        # Load and preprocess text
        self.words: List[str] = self._load_words()

    @staticmethod
    def _load_words() -> List[str]:
        """Read the bundled corpus and split it into ``\\w+`` tokens."""
        # Imported lazily so that importing this module (and exercising the
        # config / indexing logic) does not require the packaged data file.
        from reasoning_gym.data import read_data_file

        text = read_data_file("in_the_year_2889.txt")
        return re.findall(r"\b\w+\b", text)

    def __len__(self) -> int:
        """Return the configured virtual dataset size."""
        return self.config.size

    def __iter__(self) -> "LetterCountingDataset":
        """Reset the iteration cursor and return the dataset as its own iterator."""
        self._current_idx = 0
        return self

    def __next__(self) -> dict:
        """Return the next task.

        Raises:
            StopIteration: Once ``config.size`` items have been produced.
        """
        if self._current_idx >= self.config.size:
            raise StopIteration
        item = self[self._current_idx]
        self._current_idx += 1
        return item

    def __getitem__(self, idx: int) -> dict:
        """Generate a single letter counting task.

        Args:
            idx: Item index; combined with the base seed to seed the RNG.

        Returns:
            A dict with the keys ``"question"``, ``"answer"`` (the count as a
            string) and ``"metadata"`` (``span_length``, ``target_letter`` and
            ``span``).
        """
        rng = Random(self.seed + idx)
        word_count = len(self.words)

        # Select random span of words. The length is drawn first and clamped
        # afterwards: the RNG stream is unchanged for a normal corpus, while a
        # corpus shorter than the drawn length no longer makes randint() fail
        # on an empty range.
        span_length = min(rng.randint(self.config.min_words, self.config.max_words), word_count)
        start_idx = rng.randint(0, word_count - span_length)
        span = self.words[start_idx : start_idx + span_length]

        # Candidate targets: alphabetic characters only (``\w`` also matches
        # digits and "_"). Sorted because set order of strings depends on hash
        # randomisation, which would make rng.choice() differ between runs
        # even with an identical seed.
        letters = sorted({ch for ch in "".join(span).lower() if ch.isalpha()})
        if not letters:
            letters = ["a"]  # Fallback if span has no letters

        # Select random letter that appears in the span
        target_letter = rng.choice(letters)

        # Count occurrences (case-insensitive)
        count = sum(word.lower().count(target_letter) for word in span)

        text = " ".join(span)
        return {
            "question": f'How many times does the letter "{target_letter}" appear in the text: "{text}"?',
            "answer": str(count),
            "metadata": {
                "span_length": span_length,
                "target_letter": target_letter,
                "span": span,
            },
        }


def letter_counting_dataset(
    min_words: int = 5,
    max_words: int = 15,
    seed: Optional[int] = None,
    size: int = 500,
) -> LetterCountingDataset:
    """Create a LetterCountingDataset with the given configuration.

    Args:
        min_words: Minimum number of words in a span.
        max_words: Maximum number of words in a span.
        seed: Base seed; ``None`` lets the dataset draw a random one.
        size: Virtual dataset size.

    Returns:
        A dataset built from a ``LetterCountingConfig`` holding these values.
    """
    config = LetterCountingConfig(
        min_words=min_words,
        max_words=max_words,
        seed=seed,
        size=size,
    )
    return LetterCountingDataset(config)