From 102d4dae002db34e86a6a5399084fe7db65e488a Mon Sep 17 00:00:00 2001 From: "Andreas Koepf (aider)" Date: Thu, 23 Jan 2025 19:17:45 +0100 Subject: [PATCH] fix: Ensure letter counting dataset extracts only alphanumeric words --- reasoning_gym/algorithmic/letter_counting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reasoning_gym/algorithmic/letter_counting.py b/reasoning_gym/algorithmic/letter_counting.py index 449b1b1b..5f2c372c 100644 --- a/reasoning_gym/algorithmic/letter_counting.py +++ b/reasoning_gym/algorithmic/letter_counting.py @@ -30,7 +30,8 @@ class LetterCountingDataset: # Load and preprocess text text = read_data_file("in_the_year_2889.txt") - self.words = [word for word in re.findall(r'\b\w+\b', text)] + # Extract words, keeping only tokens made up entirely of alphanumeric characters (tokens containing "_" are dropped) + self.words = [word for word in re.findall(r'\b\w+\b', text) if word.isalnum()] def __len__(self) -> int: return self.config.size