diff --git a/README.md b/README.md index 1426cac9..90088113 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ Available dataset names (which can be used with `create_dataset()`): - `NumberFilteringDataset`: Filter numbers based on comparison with threshold - `NumberSortingDataset`: Sort lists of numbers in ascending or descending order - `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled -- `SentenceReorderingDataset`: Reorder sentence after words it in have been randomly shuffled +- `SentenceReorderingDataset`: Reorder sentence after words in it have been randomly shuffled - `WordReversalDataset`: Reverse word order in text spans #### Cognition Tasks diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py index c376fcd6..04dfbff3 100644 --- a/reasoning_gym/algorithmic/sentence_reordering.py +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -29,10 +29,11 @@ class SentenceReorderingDataset(ProceduralDataset): # Load and preprocess text text = read_data_file("in_the_year_2889.txt") # Extract sentences make sure they are greater than or equal to the number of words in a sentence + # Ensure that only the length of alphanumeric characters in the sentence is considered self.sentences = [ sentence for sentence in re.findall(r"[^.!?]+", text) - if len(sentence.split()) >= self.config.num_of_words_in_sentence + if len(re.findall(r"\b\w+\b", sentence)) >= self.config.num_of_words_in_sentence ] def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True):