Ensure only words are considered

This commit is contained in:
abdulhakeem 2025-01-25 23:08:41 -06:00
parent 5347fb7a2e
commit 384a00ec71
2 changed files with 3 additions and 2 deletions

View file

@@ -81,7 +81,7 @@ Available dataset names (which can be used with `create_dataset()`):
- `NumberFilteringDataset`: Filter numbers based on comparison with threshold - `NumberFilteringDataset`: Filter numbers based on comparison with threshold
- `NumberSortingDataset`: Sort lists of numbers in ascending or descending order - `NumberSortingDataset`: Sort lists of numbers in ascending or descending order
- `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled - `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled
- `SentenceReorderingDataset`: Reorder sentence after words it in have been randomly shuffled - `SentenceReorderingDataset`: Reorder sentence after words in it have been randomly shuffled
- `WordReversalDataset`: Reverse word order in text spans - `WordReversalDataset`: Reverse word order in text spans
#### Cognition Tasks #### Cognition Tasks

View file

@@ -29,10 +29,11 @@ class SentenceReorderingDataset(ProceduralDataset):
# Load and preprocess text # Load and preprocess text
text = read_data_file("in_the_year_2889.txt") text = read_data_file("in_the_year_2889.txt")
# Extract sentences, keeping only those with at least the configured number of words # Extract sentences, keeping only those with at least the configured number of words
# Count only alphanumeric words (not punctuation-only tokens) when measuring sentence length
self.sentences = [ self.sentences = [
sentence sentence
for sentence in re.findall(r"[^.!?]+", text) for sentence in re.findall(r"[^.!?]+", text)
if len(sentence.split()) >= self.config.num_of_words_in_sentence if len(re.findall(r"\b\w+\b", sentence)) >= self.config.num_of_words_in_sentence
] ]
def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True): def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True):