Ensure only words are considered

2026-04-28 17:29:39 +00:00 · 2025-01-25 23:08:41 -06:00 · 2025-01-25 23:08:41 -06:00 · 384a00ec71
commit 384a00ec71
parent 5347fb7a2e
2 changed files with 3 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ Available dataset names (which can be used with `create_dataset()`):
 - `NumberFilteringDataset`: Filter numbers based on comparison with threshold
 - `NumberSortingDataset`: Sort lists of numbers in ascending or descending order
 - `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled
- `SentenceReorderingDataset`: Reorder sentence after words it in have been randomly shuffled
+- `SentenceReorderingDataset`: Reorder sentence after words in it have been randomly shuffled
 - `WordReversalDataset`: Reverse word order in text spans
 #### Cognition Tasks
--- a/reasoning_gym/algorithmic/sentence_reordering.py
+++ b/reasoning_gym/algorithmic/sentence_reordering.py
@@ -29,10 +29,11 @@ class SentenceReorderingDataset(ProceduralDataset):
        # Load and preprocess text
        text = read_data_file("in_the_year_2889.txt")
        # Extract sentences, keeping only those with at least the configured number of words
        # Count only alphanumeric word tokens (not stray punctuation) when measuring sentence length
        self.sentences = [
            sentence
            for sentence in re.findall(r"[^.!?]+", text)
-            if len(sentence.split()) >= self.config.num_of_words_in_sentence
+            if len(re.findall(r"\b\w+\b", sentence)) >= self.config.num_of_words_in_sentence
        ]
    def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True):