fix: Ensure letter counting dataset extracts only alphanumeric words

2026-04-23 16:55:05 +00:00 · 2025-01-23 19:17:45 +01:00 · 2025-01-23 19:17:45 +01:00 · 102d4dae00
commit 102d4dae00
parent 3488d4339c
1 changed files with 2 additions and 1 deletions
--- a/reasoning_gym/algorithmic/letter_counting.py
+++ b/reasoning_gym/algorithmic/letter_counting.py
@@ -30,7 +30,8 @@ class LetterCountingDataset:
        
        # Load and preprocess text
        text = read_data_file("in_the_year_2889.txt")
-        self.words = [word for word in re.findall(r'\b\w+\b', text)]
+        # Extract word tokens and keep only the purely alphanumeric ones (drops tokens containing '_')
+        self.words = [word for word in re.findall(r'\b\w+\b', text) if word.isalnum()]

    def __len__(self) -> int:
        return self.config.size