mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-28 17:29:39 +00:00
Ensure only words are considered
This commit is contained in:
parent
5347fb7a2e
commit
384a00ec71
2 changed files with 3 additions and 2 deletions
|
|
@ -81,7 +81,7 @@ Available dataset names (which can be used with `create_dataset()`):
|
||||||
- `NumberFilteringDataset`: Filter numbers based on comparison with threshold
|
- `NumberFilteringDataset`: Filter numbers based on comparison with threshold
|
||||||
- `NumberSortingDataset`: Sort lists of numbers in ascending or descending order
|
- `NumberSortingDataset`: Sort lists of numbers in ascending or descending order
|
||||||
- `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled
|
- `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled
|
||||||
- `SentenceReorderingDataset`: Reorder sentence after words it in have been randomly shuffled
|
- `SentenceReorderingDataset`: Reorder sentence after words in it have been randomly shuffled
|
||||||
- `WordReversalDataset`: Reverse word order in text spans
|
- `WordReversalDataset`: Reverse word order in text spans
|
||||||
|
|
||||||
#### Cognition Tasks
|
#### Cognition Tasks
|
||||||
|
|
|
||||||
|
|
@ -29,10 +29,11 @@ class SentenceReorderingDataset(ProceduralDataset):
|
||||||
# Load and preprocess text
|
# Load and preprocess text
|
||||||
text = read_data_file("in_the_year_2889.txt")
|
text = read_data_file("in_the_year_2889.txt")
|
||||||
# Extract sentences make sure they are greater than or equal to the number of words in a sentence
|
# Extract sentences make sure they are greater than or equal to the number of words in a sentence
|
||||||
|
# Ensure that only the length of alphanumeric characters in the sentence is considered
|
||||||
self.sentences = [
|
self.sentences = [
|
||||||
sentence
|
sentence
|
||||||
for sentence in re.findall(r"[^.!?]+", text)
|
for sentence in re.findall(r"[^.!?]+", text)
|
||||||
if len(sentence.split()) >= self.config.num_of_words_in_sentence
|
if len(re.findall(r"\b\w+\b", sentence)) >= self.config.num_of_words_in_sentence
|
||||||
]
|
]
|
||||||
|
|
||||||
def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True):
|
def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue