fix: Ensure letter counting dataset extracts only alphanumeric words

This commit is contained in:
Andreas Koepf (aider) 2025-01-23 19:17:45 +01:00
parent 3488d4339c
commit 102d4dae00

View file

@ -30,7 +30,8 @@ class LetterCountingDataset:
# Load and preprocess text
text = read_data_file("in_the_year_2889.txt")
self.words = [word for word in re.findall(r'\b\w+\b', text)]
# Extract words, keeping only those made up entirely of alphanumeric characters
self.words = [word for word in re.findall(r'\b\w+\b', text) if word.isalnum()]
def __len__(self) -> int:
    """Report the length of the dataset.

    The value is read from the ``size`` attribute of ``self.config``.
    """
    configured_size = self.config.size
    return configured_size