From 09634db2cff10c335457d8706a5fadd49eb2afff Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 22:52:35 -0600 Subject: [PATCH 1/8] Add sentence reordering and unit tests to validate it --- README.md | 1 + reasoning_gym/algorithmic/__init__.py | 3 + .../algorithmic/sentence_reordering.py | 79 +++++++++++++++++++ tests/test_sentence_reordering.py | 45 +++++++++++ 4 files changed, 128 insertions(+) create mode 100644 reasoning_gym/algorithmic/sentence_reordering.py create mode 100644 tests/test_sentence_reordering.py diff --git a/README.md b/README.md index dae6975a..1426cac9 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ Available dataset names (which can be used with `create_dataset()`): - `NumberFilteringDataset`: Filter numbers based on comparison with threshold - `NumberSortingDataset`: Sort lists of numbers in ascending or descending order - `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled +- `SentenceReorderingDataset`: Reorder sentence after words it in have been randomly shuffled - `WordReversalDataset`: Reverse word order in text spans #### Cognition Tasks diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py index 78136d66..5eb551f0 100644 --- a/reasoning_gym/algorithmic/__init__.py +++ b/reasoning_gym/algorithmic/__init__.py @@ -13,6 +13,7 @@ from .letter_jumble import LetterJumbleConfig, LetterJumbleDataset from .number_filtering import NumberFilteringConfig, NumberFilteringDataset from .number_sorting import NumberSortingConfig, NumberSortingDataset from .word_reversal import WordReversalConfig, WordReversalDataset +from .sentence_reordering import SentenceReorderingConfig, SentenceReorderingDataset __all__ = [ "BaseConversionConfig", @@ -29,4 +30,6 @@ __all__ = [ "NumberSortingDataset", "WordReversalConfig", "WordReversalDataset", + "SentenceReorderingConfig", + "SentenceReorderingDataset", ] diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py new file mode 100644 index 00000000..36043c8e --- /dev/null +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -0,0 +1,79 @@ +"""Sentence re-ordering task generator""" + +import re +from dataclasses import dataclass +from random import Random +from typing import List, Optional + +from ..data import read_data_file +from ..factory import ProceduralDataset, register_dataset + +@dataclass +class SentenceReorderingConfig: + """Configuration for sentence reordering task generation""" + num_of_words_in_sentence: int = 10 + seed: Optional[int] = None + size: int = 500 + + def validate(self) -> None: + """Validate configuration parameters""" + pass + + +class SentenceReorderingDataset(ProceduralDataset): + """Generates sentence reordering tasks from text spans""" + + def __init__(self, config: SentenceReorderingConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + # Load and preprocess text + text = read_data_file("in_the_year_2889.txt") + # Extract sentences make sure they are greater than or equal to the number of words in a sentence + self.sentences = [ + sentence + for sentence in re.findall(r"[^.!?]+", text) + if len(sentence.split()) >= self.config.num_of_words_in_sentence + ] + + def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True): + """ + Generate a procedural dataset by shuffling the words in the input sentence. + + Args: + sentence (str): The correct sentence to use for dataset generation. + shuffle (bool): Whether to shuffle the words to create the input sentence. + + Returns: + dict: A dictionary containing the input sentence and the correct sentence (goal). + """ + rng = Random(seed + idx) + words = sentence.split() # Split the sentence into words + scrambled_words = words.copy() + if shuffle: + rng.shuffle(scrambled_words) # Shuffle the words to generate the input + input_sentence = " ".join(scrambled_words) + goal_sentence = " ".join(words) + return {"input": input_sentence, "goal": goal_sentence} + + def __getitem__(self, idx: int) -> dict: + """Generate a single sentence reordering task""" + rng = Random(self.seed + idx) + sentence_dataset = self._generate_sentence_dataset(rng.choice(self.sentences), self.seed, idx) + + # Ensure only 'input' and 'goal' keys are present + if set(sentence_dataset.keys()) != {'input', 'goal'}: + raise KeyError("The dictionary must contain only 'input' and 'goal' keys") + + # Solve the task by sorting words to match the goal sentence + input_words = sentence_dataset['input'].split() + question = " ".join(input_words) + goal_words = sentence_dataset['goal'].split() + solved_sentence = " ".join(sorted(input_words, key=lambda word: goal_words.index(word))) + + return { + "question": f"Correct the following sentence: {question}", + "answer": solved_sentence, + "metadata": {"num_of_words_in_sentence": len(goal_words)}, + } + +register_dataset("sentence_reordering", SentenceReorderingDataset, SentenceReorderingConfig) diff --git a/tests/test_sentence_reordering.py b/tests/test_sentence_reordering.py new file mode 100644 index 00000000..8ebcc3d1 --- /dev/null +++ b/tests/test_sentence_reordering.py @@ -0,0 +1,45 @@ +import pytest +from reasoning_gym.algorithmic.sentence_reordering import ( + SentenceReorderingConfig, + SentenceReorderingDataset, +) + +@pytest.fixture +def config(): + return SentenceReorderingConfig(num_of_words_in_sentence=5, seed=42, size=10) + +@pytest.fixture +def dataset(config): + return SentenceReorderingDataset(config=config) + +def test_config_validation(config): + # Test that the config validation does not raise any exceptions + try: + config.validate() + except Exception as e: + pytest.fail(f"Config validation raised an exception: {e}") + +def test_generate_sentence_dataset(dataset): + sentence = "This is a test sentence for reordering" + result = dataset._generate_sentence_dataset(sentence, seed=42, idx=0, shuffle=True) + assert "input" in result + assert "goal" in result + assert result["input"] != result["goal"] + assert sorted(result["input"].split()) == sorted(result["goal"].split()) + +def test_getitem(dataset, config): + item = dataset[0] + assert "question" in item + assert "answer" in item + assert "metadata" in item + assert item["metadata"]["num_of_words_in_sentence"] >= config.num_of_words_in_sentence + +def test_key_error_in_getitem(dataset): + # Modify the dataset to include an incorrect key + def mock_generate_sentence_dataset(*args, **kwargs): + return {"input": "mock input", "goal": "mock goal", "extra": "extra key"} + + dataset._generate_sentence_dataset = mock_generate_sentence_dataset + + with pytest.raises(KeyError): + dataset[0] \ No newline at end of file From b96b6720295459505a7ae969cbea22efbbd99664 Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 22:58:47 -0600 Subject: [PATCH 2/8] Add line end to test file for sentence reordering --- tests/test_sentence_reordering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_sentence_reordering.py b/tests/test_sentence_reordering.py index 8ebcc3d1..08142abc 100644 --- a/tests/test_sentence_reordering.py +++ b/tests/test_sentence_reordering.py @@ -42,4 +42,5 @@ def test_key_error_in_getitem(dataset): dataset._generate_sentence_dataset = mock_generate_sentence_dataset with pytest.raises(KeyError): - dataset[0] \ No newline at end of file + dataset[0] + \ No newline at end of file From d00856cab49220956ebb7aa8f5be0a4bd19ed80a Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 23:02:17 -0600 Subject: [PATCH 3/8] Add assertion to ensure number of words in sentence is positive --- reasoning_gym/algorithmic/sentence_reordering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py index 36043c8e..c376fcd6 100644 --- a/reasoning_gym/algorithmic/sentence_reordering.py +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -17,7 +17,7 @@ class SentenceReorderingConfig: def validate(self) -> None: """Validate configuration parameters""" - pass + assert self.num_of_words_in_sentence > 0, "num_of_words_in_sentence must be positive" class SentenceReorderingDataset(ProceduralDataset): From 8debeef91399af363cfb8586df1c75b636c7171f Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 23:03:12 -0600 Subject: [PATCH 4/8] Fix code smell --- tests/test_sentence_reordering.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_sentence_reordering.py b/tests/test_sentence_reordering.py index 08142abc..ffa22c7c 100644 --- a/tests/test_sentence_reordering.py +++ b/tests/test_sentence_reordering.py @@ -43,4 +43,3 @@ def test_key_error_in_getitem(dataset): with pytest.raises(KeyError): dataset[0] - \ No newline at end of file From 9d025b43fa35183fa56b1153f036ece98b9b91d4 Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 23:08:41 -0600 Subject: [PATCH 5/8] Ensure only words are considered --- README.md | 2 +- reasoning_gym/algorithmic/sentence_reordering.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1426cac9..90088113 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ Available dataset names (which can be used with `create_dataset()`): - `NumberFilteringDataset`: Filter numbers based on comparison with threshold - `NumberSortingDataset`: Sort lists of numbers in ascending or descending order - `LetterJumbleDataset`: Unscramble words that have had their letters randomly jumbled -- `SentenceReorderingDataset`: Reorder sentence after words it in have been randomly shuffled +- `SentenceReorderingDataset`: Reorder sentence after words in it have been randomly shuffled - `WordReversalDataset`: Reverse word order in text spans #### Cognition Tasks diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py index c376fcd6..04dfbff3 100644 --- a/reasoning_gym/algorithmic/sentence_reordering.py +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -29,10 +29,11 @@ class SentenceReorderingDataset(ProceduralDataset): # Load and preprocess text text = read_data_file("in_the_year_2889.txt") # Extract sentences make sure they are greater than or equal to the number of words in a sentence + # Ensure that only the length of alpganumeric characters in the sentence is considered self.sentences = [ sentence for sentence in re.findall(r"[^.!?]+", text) - if len(sentence.split()) >= self.config.num_of_words_in_sentence + if len(re.findall(r"\b\w+\b", sentence)) >= self.config.num_of_words_in_sentence ] def _generate_sentence_dataset(self, sentence: str, seed: int, idx: int, shuffle=True): From 05004b7a51c61c46524c0f2803b41782f5d250f1 Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 23:17:39 -0600 Subject: [PATCH 6/8] Add parameters to _generate_sentence_dataset --- reasoning_gym/algorithmic/sentence_reordering.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py index 04dfbff3..6856a264 100644 --- a/reasoning_gym/algorithmic/sentence_reordering.py +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -42,6 +42,8 @@ class SentenceReorderingDataset(ProceduralDataset): Args: sentence (str): The correct sentence to use for dataset generation. + seed (int): The seed to use for random number generation. + idx (int): The index to add to the seed for random number generation. shuffle (bool): Whether to shuffle the words to create the input sentence. Returns: From acc3f5269afcb416753c7c9077666e9fb7a1c0cc Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 23:22:16 -0600 Subject: [PATCH 7/8] Correct logic for number of words in sentence --- reasoning_gym/algorithmic/sentence_reordering.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py index 6856a264..e8f8ce1c 100644 --- a/reasoning_gym/algorithmic/sentence_reordering.py +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -29,7 +29,7 @@ class SentenceReorderingDataset(ProceduralDataset): # Load and preprocess text text = read_data_file("in_the_year_2889.txt") # Extract sentences make sure they are greater than or equal to the number of words in a sentence - # Ensure that only the length of alpganumeric characters in the sentence is considered + # Ensure that only the length of alphanumeric characters in the sentence is considered self.sentences = [ sentence for sentence in re.findall(r"[^.!?]+", text) @@ -72,11 +72,14 @@ class SentenceReorderingDataset(ProceduralDataset): question = " ".join(input_words) goal_words = sentence_dataset['goal'].split() solved_sentence = " ".join(sorted(input_words, key=lambda word: goal_words.index(word))) + # Check for length alphanumeric characters in the solved sentence + num_of_words_in_sentence = len(re.findall(r"\b\w+\b", solved_sentence)) + return { "question": f"Correct the following sentence: {question}", "answer": solved_sentence, - "metadata": {"num_of_words_in_sentence": len(goal_words)}, + "metadata": {"num_of_words_in_sentence": num_of_words_in_sentence}, } register_dataset("sentence_reordering", SentenceReorderingDataset, SentenceReorderingConfig) From 03d4a5d8ac3689b3ec20a4e7b1ed30e8de319412 Mon Sep 17 00:00:00 2001 From: abdulhakeem Date: Sat, 25 Jan 2025 23:25:55 -0600 Subject: [PATCH 8/8] Make more tiny correction --- reasoning_gym/algorithmic/sentence_reordering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py index e8f8ce1c..f7f9ff30 100644 --- a/reasoning_gym/algorithmic/sentence_reordering.py +++ b/reasoning_gym/algorithmic/sentence_reordering.py @@ -72,7 +72,7 @@ class SentenceReorderingDataset(ProceduralDataset): question = " ".join(input_words) goal_words = sentence_dataset['goal'].split() solved_sentence = " ".join(sorted(input_words, key=lambda word: goal_words.index(word))) - # Check for length alphanumeric characters in the solved sentence + # Check for length of alphanumeric characters in the solved sentence num_of_words_in_sentence = len(re.findall(r"\b\w+\b", solved_sentence))