diff --git a/GALLERY.md b/GALLERY.md index 5aa6fc66..8a14d81a 100644 --- a/GALLERY.md +++ b/GALLERY.md @@ -15,6 +15,7 @@ This gallery shows examples from all available datasets using their default conf - [chain_sum](#chain_sum) - [color_cube_rotation](#color_cube_rotation) - [complex_arithmetic](#complex_arithmetic) +- [count_bits](#count_bits) - [countdown](#countdown) - [course_schedule](#course_schedule) - [family_relationships](#family_relationships) @@ -32,6 +33,7 @@ This gallery shows examples from all available datasets using their default conf - [leg_counting](#leg_counting) - [letter_counting](#letter_counting) - [letter_jumble](#letter_jumble) +- [manipulate_matrix](#manipulate_matrix) - [maze](#maze) - [mini_sudoku](#mini_sudoku) - [n_queens](#n_queens) @@ -41,6 +43,7 @@ This gallery shows examples from all available datasets using their default conf - [palindrome](#palindrome) - [polynomial_equations](#polynomial_equations) - [polynomial_multiplication](#polynomial_multiplication) +- [power_function](#power_function) - [prime_factorization](#prime_factorization) - [propositional_logic](#propositional_logic) - [quantum_lock](#quantum_lock) @@ -1041,6 +1044,35 @@ Metadata: {'num1': (-7.0, -79.0), 'num2': (-7.0, -5.0), 'operation': '/', 'resul ```` +### count_bits +Generates Count Bits exercises with configurable difficulty + +Default configuration: +```python +max_n = 2147483647 +size = 500 +seed = 42 +``` + +Example tasks: +```` +Example 1: +Question: How many 1 bits are there in the binary representation of the number 1373158607? +Answer: 18 +Metadata: {'number': 1373158607, 'solution': 18, 'binary': '1010001110110001011110011001111'} + +Example 2: +Question: How many 1 bits are there in the binary representation of the number 82789451? +Answer: 14 +Metadata: {'number': 82789451, 'solution': 14, 'binary': '100111011110100010001001011'} + +Example 3: +Question: How many 1 bits are there in the binary representation of the number 877324117? 
+Answer: 16 +Metadata: {'number': 877324117, 'solution': 16, 'binary': '110100010010101110011101010101'} + +```` + ### countdown Generates Countdown Number Game tasks @@ -2033,6 +2065,81 @@ Metadata: {'num_words': 16, 'corruption_level': 0.516016391169858, 'scrambled_wo ```` +### manipulate_matrix +Generates Manipulate Matrix exercises with configurable difficulty + +Default configuration: +```python +min_rows = 1 +min_cols = 1 +max_rows = 10 +max_cols = 10 +max_transforms = 5 +p_rotate = 0.2 +p_hmirror = 0.2 +p_vmirror = 0.2 +p_dmirror = 0.2 +p_cmirror = 0.2 +p_map = 0.2 +p_crop = 0.2 +p_remove_every_nth_row = 0.2 +p_remove_every_nth_col = 0.2 +p_zero_divisible = 0.2 +size = 500 +seed = 42 +``` + +Example tasks: +```` +Example 1: +Question: For the following matrix: +4 +3 + +Perform the following series of operations in order: +- Identity transformation, i.e. no change + + +Answer: 4 +3 +Metadata: {'matrix': [[4], [3]], 'solution': [[4], [3]], 'operations': []} + +Example 2: +Question: For the following matrix: +2 7 5 1 7 + +Perform the following series of operations in order: +- Identity transformation, i.e. no change + + +Answer: 2 7 5 1 7 +Metadata: {'matrix': [[2, 7, 5, 1, 7]], 'solution': [[2, 7, 5, 1, 7]], 'operations': []} + +Example 3: +Question: For the following matrix: +8 1 2 6 3 4 0 3 1 +9 0 1 2 8 4 6 9 6 +5 5 1 5 4 9 2 1 8 +1 9 1 4 5 1 4 0 5 +6 1 7 7 3 3 2 4 3 +0 0 6 0 5 5 7 7 9 +8 2 3 7 7 5 9 0 4 + +Perform the following series of operations in order: +- Identity transformation, i.e. 
no change + + +Answer: 8 1 2 6 3 4 0 3 1 +9 0 1 2 8 4 6 9 6 +5 5 1 5 4 9 2 1 8 +1 9 1 4 5 1 4 0 5 +6 1 7 7 3 3 2 4 3 +0 0 6 0 5 5 7 7 9 +8 2 3 7 7 5 9 0 4 +Metadata: {'matrix': [[8, 1, 2, 6, 3, 4, 0, 3, 1], [9, 0, 1, 2, 8, 4, 6, 9, 6], [5, 5, 1, 5, 4, 9, 2, 1, 8], [1, 9, 1, 4, 5, 1, 4, 0, 5], [6, 1, 7, 7, 3, 3, 2, 4, 3], [0, 0, 6, 0, 5, 5, 7, 7, 9], [8, 2, 3, 7, 7, 5, 9, 0, 4]], 'solution': [[8, 1, 2, 6, 3, 4, 0, 3, 1], [9, 0, 1, 2, 8, 4, 6, 9, 6], [5, 5, 1, 5, 4, 9, 2, 1, 8], [1, 9, 1, 4, 5, 1, 4, 0, 5], [6, 1, 7, 7, 3, 3, 2, 4, 3], [0, 0, 6, 0, 5, 5, 7, 7, 9], [8, 2, 3, 7, 7, 5, 9, 0, 4]], 'operations': []} + +```` + ### maze Generates mazes with guaranteed shortest path distance from start to goal within [min_dist, max_dist]. @@ -2485,6 +2592,38 @@ Metadata: {'polynomial_expr': '(43 - 91*x)*(3*x**2 - 10*x)*(71*x**3 - 2*x - 29)' ```` +### power_function +Generates Power Function exercises with configurable difficulty + +Default configuration: +```python +min_base = -1000.0 +max_base = 1000.0 +min_exponent = -8 +max_exponent = 8 +size = 500 +seed = 42 +``` + +Example tasks: +```` +Example 1: +Question: Compute 278.8535969157674^-8 +Answer: 2.735205704728613e-20 +Metadata: {'base': 278.8535969157674, 'exponent': -8, 'solution': 2.735205704728613e-20} + +Example 2: +Question: Compute -922.8963213252399^-4 +Answer: 1.3784415023500506e-12 +Metadata: {'base': -922.8963213252399, 'exponent': -4, 'solution': 1.3784415023500506e-12} + +Example 3: +Question: Compute -182.9282414910125^-5 +Answer: -4.881982323540115e-12 +Metadata: {'base': -182.9282414910125, 'exponent': -5, 'solution': -4.881982323540115e-12} + +```` + ### prime_factorization Generates prime factorization tasks @@ -3655,7 +3794,7 @@ Metadata: {'task_type': 'datetime_tz', 'start_time': datetime.datetime(2964, 6, Example 2: Question: A video call started at 09:44 and ended at 12:22. How long was the call? Answer in HH:MM. 
Answer: 02:38 -Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 9, 9, 44), 'end_time': datetime.datetime(2025, 2, 9, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'} +Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 10, 9, 44), 'end_time': datetime.datetime(2025, 2, 10, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'} Example 3: Question: Calculate the time difference between Sat Dec 22 2677 and Thu Mar 21 2678. Express the result in D days. diff --git a/eval/r1/__init__.py b/eval/r1/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eval/r1/eval.py b/eval/r1/eval.py new file mode 100644 index 00000000..737707c7 --- /dev/null +++ b/eval/r1/eval.py @@ -0,0 +1,139 @@ +import argparse +import json +import logging +import os +from dataclasses import asdict +from datetime import datetime +from typing import Any, Dict, List + +import requests +from eval_config import EvalConfig +from requests.exceptions import RequestException +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential + +import reasoning_gym +from reasoning_gym.utils import extract_answer + + +class OpenRouterEvaluator: + def __init__(self, model: str, config: EvalConfig): + self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}") + self.config = config + self.output_dir = f"{config.eval_dir}/{config.category}" + os.makedirs(self.output_dir, exist_ok=True) + self.base_url = "https://openrouter.ai/api/v1/chat/completions" + self.api_key = os.getenv("OPENROUTER_API_KEY") + self.model = model + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "HTTP-Referer": os.getenv("OR_SITE_URL", "localhost"), + "X-Title": os.getenv("OR_APP_NAME", "Model Evaluation"), + "Content-Type": "application/json", + } + + def save_results(self, results: List[Dict[str, Any]], dataset, dataset_name) -> Dict[str, Any]: + + file_name = f"{self.output_dir}/{dataset_name}.json" + total_score = 
sum(r["score"] for r in results) + + metrics = { + "dataset_name": dataset_name, + "model": self.model, + "size": dataset.size, + "provider": self.config.provider, + "average_score": total_score / len(results) if results else 0, + "total_examples": len(results), + "timestamp": datetime.now().isoformat(), + "config": asdict(dataset.config), + "results": results, # save results to allow for performance recalculation + } + + with open(file_name, "w") as f: + json.dump(metrics, f, indent=2) + return metrics + + def prepare_messages(self, prompt: str) -> List[Dict[str, str]]: + messages = [ + {"role": self.config.developer_role, "content": self.config.developer_prompt}, + {"role": "user", "content": prompt}, + ] + payload = { + "model": self.model, + "messages": messages, + "provider": {"order": ["Nebius"], "allow_fallbacks": False}, + } # make sure only one provider is used + + return payload + + @retry( + retry=retry_if_exception_type(RequestException), + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=60), + ) + def get_model_response(self, prompt: str) -> str: + """Get response from the model via OpenRouter API.""" + + payload = self.prepare_messages(prompt) + try: + response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30) + response.raise_for_status() + except requests.exceptions.RequestException as e: + raise RequestException( + f"API request failed: {str(e)}", {"endpoint": self.base_url, "model": self.model} + ) from e + return response.json()["choices"][0]["message"]["content"] + + def evaluate_datasets(self) -> List[Dict[str, Any]]: + """Evaluate model on multiple datasets with their respective configurations.""" + all_results = [] + + for dataset_name in self.config.datasets: + self.logger.info(f"\nEvaluating dataset: {dataset_name}") + + # Create dataset with its specific configuration + dataset = reasoning_gym.create_dataset( + dataset_name, size=self.config.dataset_size, 
seed=self.config.dataset_seed + ) + results = [] + + for i, entry in enumerate(dataset): + print(f"On example {i+1} of {len(dataset)}") + response = self.get_model_response(entry["question"]) + model_answer = extract_answer(response) + + score = dataset.score_answer(answer=model_answer, entry=entry) + + result = { + "question": entry["question"], + "expected_answer": str(entry["answer"]), + "model_answer": model_answer, + "score": score, + "metadata": str(entry["metadata"]), + } + results.append(result) + + metrics = self.save_results(results, dataset, dataset_name) + + all_results.append({"metrics": metrics, "results": results}) + + return all_results + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets") + parser.add_argument("--yaml", required=True, help="Path to YAML configuration file") + + args = parser.parse_args() + config = EvalConfig.from_yaml(args.yaml) + output_dir = f"{config.eval_dir}/{config.category}" + os.makedirs(output_dir, exist_ok=True) + + evaluator = OpenRouterEvaluator(model=config.model, config=config) + all_results = evaluator.evaluate_datasets() + + with open(f"{output_dir}/summary.json", "w") as f: + json.dump(all_results, f, indent=2) + + +if __name__ == "__main__": + main() diff --git a/eval/r1/eval_config.py b/eval/r1/eval_config.py new file mode 100644 index 00000000..9a6c96f6 --- /dev/null +++ b/eval/r1/eval_config.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from typing import List, Union + +import yaml + +from reasoning_gym.utils import SYSTEM_PROMPTS + + +@dataclass +class EvalConfig: + category: str + datasets: Union[str, List[str]] + eval_dir: str + dataset_size: int + dataset_seed: int + model: str = "deepseek/deepseek-r1" + provider: str = "Nebius" + developer_role: str = "system" + developer_prompt: str = SYSTEM_PROMPTS["DeepSeekZero"] + + @classmethod + def from_yaml(cls, yaml_path: str): + with open(yaml_path, "r") as f: + config = yaml.safe_load(f) + return 
cls(**config) diff --git a/eval/r1/yaml/algebra.yaml b/eval/r1/yaml/algebra.yaml new file mode 100644 index 00000000..b95c1e31 --- /dev/null +++ b/eval/r1/yaml/algebra.yaml @@ -0,0 +1,13 @@ +model: deepseek/deepseek-r1 +category: algebra +datasets: + - intermediate_integration + - polynomial_equations + - polynomial_multiplication + - simple_equations + - simple_integration + - complex_arithmetic +eval_dir: eval/r1 +dataset_size: 50 +dataset_seed: 42 +developer_role: system diff --git a/eval/r1/yaml/algorithmic.yaml b/eval/r1/yaml/algorithmic.yaml new file mode 100644 index 00000000..c1c043ce --- /dev/null +++ b/eval/r1/yaml/algorithmic.yaml @@ -0,0 +1,25 @@ +model: deepseek/deepseek-r1 +category: algorithmic +datasets: + - base_conversion + - binary_matrix + - caesar_cipher + - group_anagrams + - isomorphic_strings + - letter_counting + - letter_jumble + - number_filtering + - number_sorting + - palindrome + - ransom_note + - rotate_matrix + - sentence_reordering + - spell_backward + - spiral_matrix + - word_ladder + - word_sequence_reversal + - word_sorting +eval_dir: eval/r1 +dataset_size: 50 +dataset_seed: 42 +developer_role: system diff --git a/eval/r1/yaml/cognition.yaml b/eval/r1/yaml/cognition.yaml new file mode 100644 index 00000000..911a92e5 --- /dev/null +++ b/eval/r1/yaml/cognition.yaml @@ -0,0 +1,11 @@ +model: deepseek/deepseek-r1 +category: cognition +datasets: + - color_cube_rotation + - figlet_font + - number_sequence + - rubiks_cube +eval_dir: eval/r1 +dataset_size: 50 +dataset_seed: 42 +developer_role: system diff --git a/eval/r1/yaml/logic.yaml b/eval/r1/yaml/logic.yaml new file mode 100644 index 00000000..400c4ff3 --- /dev/null +++ b/eval/r1/yaml/logic.yaml @@ -0,0 +1,11 @@ +model: deepseek/deepseek-r1 +category: logic +datasets: + - propositional_logic + - self_reference + - syllogism + - zebra_puzzles +eval_dir: eval/r1 +dataset_size: 50 +dataset_seed: 42 +developer_role: system diff --git a/reasoning_gym/algebra/polynomial_equations.py 
b/reasoning_gym/algebra/polynomial_equations.py index a1822958..eec45285 100644 --- a/reasoning_gym/algebra/polynomial_equations.py +++ b/reasoning_gym/algebra/polynomial_equations.py @@ -59,7 +59,7 @@ class PolynomialEquationsDataset(ProceduralDataset): self._prompt_templates = [ "Find the real value(s) of {variable} in the equation: {polynomial_expanded} = 0", "Solve for real {variable}: {polynomial_expanded} = 0", - "Determine the real value(s) of {variable} tha satisfies: {polynomial_expanded} = 0", + "Determine the real value(s) of {variable} that satisfies: {polynomial_expanded} = 0", "Solve the polynomial equation for real {variable}:\n{polynomial_expanded} = 0", ] super().__init__(config=config, seed=config.seed, size=config.size) diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py index decfd4d2..ad76199f 100644 --- a/reasoning_gym/algorithmic/__init__.py +++ b/reasoning_gym/algorithmic/__init__.py @@ -13,6 +13,7 @@ from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset from .letter_counting import LetterCountingConfig, LetterCountingDataset from .letter_jumble import LetterJumbleConfig, LetterJumbleDataset +from .manipulate_matrix import ManipulateMatrixConfig, ManipulateMatrixDataset from .number_filtering import NumberFilteringConfig, NumberFilteringDataset from .number_sorting import NumberSortingConfig, NumberSortingDataset from .palindrome_generation import PalindromeConfig, PalindromeDataset @@ -61,6 +62,8 @@ __all__ = [ "IsomorphicStringsDataset", "RotateMatrixConfig", "RotateMatrixDataset", + "ManipulateMatrixConfig", + "ManipulateMatrixDataset", "BinaryMatrixConfig", "BinaryMatrixDataset", ] diff --git a/reasoning_gym/algorithmic/manipulate_matrix.py b/reasoning_gym/algorithmic/manipulate_matrix.py new file mode 100644 index 00000000..1b5412f4 --- /dev/null +++ b/reasoning_gym/algorithmic/manipulate_matrix.py @@ 
-0,0 +1,273 @@ +"""Manipulate matrices by performing augmentations such as rotations, flips, mapping, etc.""" + +from copy import deepcopy +from dataclasses import dataclass +from random import Random +from typing import Optional + +from ..factory import ProceduralDataset, register_dataset + +QUESTION_TEMPLATE = """For the following matrix: +{matrix} + +Perform the following series of operations in order: +- Identity transformation, i.e. no change +{operations} +""" + + +def num_rows(matrix: list[list[int]]) -> int: + return len(matrix) + + +def num_cols(matrix: list[list[int]]) -> int: + return len(matrix[0]) if matrix else 0 + + +@dataclass +class ManipulateMatrixConfig: + """Configuration for Manipulate Matrix dataset generation""" + + min_rows: int = 1 # Minimum number of rows + min_cols: int = 1 # Minimum number of columns + max_rows: int = 10 # Maximum number of rows + max_cols: int = 10 # Maximum number of columns + max_transforms: int = 5 # Maximum number of transformations to apply + p_rotate: float = 0.2 # Probability of rotating the matrix + p_hmirror: float = 0.2 # Probability of horizontally mirroring the matrix + p_vmirror: float = 0.2 # Probability of vertically mirroring the matrix + p_dmirror: float = 0.2 # Probability of mirroring along the diagonal + p_cmirror: float = 0.2 # Probability of mirroring along the counterdiagonal + p_map: float = 0.2 # Probability of mapping a certain value to another + p_crop: float = 0.2 # Probability of cropping the matrix + p_remove_every_nth_row: float = 0.2 # Probability of removing every nth row + p_remove_every_nth_col: float = 0.2 # Probability of removing every nth column + p_zero_divisible: float = 0.2 # Probability of setting elements divisible by some number to zero + + size: int = 500 # Virtual dataset size + seed: Optional[int] = None + + def validate(self): + """Validate configuration parameters""" + assert 1 <= self.min_rows, "min_rows must be at least 1" + assert 1 <= self.min_cols, "min_cols must be 
at least 1" + assert self.min_rows <= self.max_rows, "max_rows must be at least min_rows" + assert self.min_cols <= self.max_cols, "max_cols must be at least min_cols" + assert 0 <= self.max_transforms, "max_transforms must be non-negative" + assert 0 <= self.p_rotate <= 1, "p_rotate must be between 0 and 1" + assert 0 <= self.p_hmirror <= 1, "p_hmirror must be between 0 and 1" + assert 0 <= self.p_vmirror <= 1, "p_vmirror must be between 0 and 1" + assert 0 <= self.p_dmirror <= 1, "p_dmirror must be between 0 and 1" + assert 0 <= self.p_cmirror <= 1, "p_cmirror must be between 0 and 1" + assert 0 <= self.p_map <= 1, "p_map must be between 0 and 1" + assert 0 <= self.p_crop <= 1, "p_crop must be between 0 and 1" + assert 0 <= self.p_remove_every_nth_row <= 1, "p_remove_every_nth_row must be between 0 and 1" + assert 0 <= self.p_remove_every_nth_col <= 1, "p_remove_nth_col must be between 0 and 1" + assert 0 <= self.p_zero_divisible <= 1, "p_zero_divisible must be between 0 and 1" + + +class ManipulateMatrixDataset(ProceduralDataset): + """Generates Manipulate Matrix exercises with configurable difficulty""" + + def __init__(self, config: ManipulateMatrixConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + self._rotations = { + "90": self._rot90, + "180": self._rot180, + "270": self._rot270, + "360": self._identity, + } + self._all_transforms = [ + "rotate", + "hmirror", + "vmirror", + "dmirror", + "cmirror", + "map", + "zero_divisible", + "crop", + "remove_every_nth_row", + "remove_every_nth_col", + ] + + def _get_matrix(self, rng: Random) -> list[list[int]]: + """Generate a random matrix""" + rows = rng.randint(self.config.min_rows, self.config.max_rows) + cols = rng.randint(self.config.min_cols, self.config.max_cols) + numbers = [rng.randint(0, 9) for _ in range(rows * cols)] + matrix = [numbers[i * cols : (i + 1) * cols] for i in range(rows)] + return matrix + + def _matrix_to_str(self, matrix: list[list[int]]) -> str: + """Get a 
string representation of the matrix""" + return "\n".join(" ".join(str(x) for x in row) for row in matrix) + + def _identity(self, matrix: list[list[int]]) -> list[list[int]]: + """Identity transformation""" + return matrix + + def _rot90(self, matrix: list[list[int]]) -> list[list[int]]: + """quarter clockwise rotation""" + return [list(row) for row in zip(*matrix[::-1])] + + def _rot180(self, matrix: list[list[int]]) -> list[list[int]]: + """half rotation""" + return [list(row[::-1]) for row in matrix[::-1]] + + def _rot270(self, matrix: list[list[int]]) -> list[list[int]]: + """quarter anticlockwise rotation""" + return [list(row[::-1]) for row in zip(*matrix[::-1])][::-1] + + def _hmirror(self, matrix: list[list[int]]) -> list[list[int]]: + """mirroring along horizontal""" + return matrix[::-1] + + def _vmirror(self, matrix: list[list[int]]) -> list[list[int]]: + """mirroring along vertical""" + return [row[::-1] for row in matrix] + + def _dmirror(self, matrix: list[list[int]]) -> list[list[int]]: + """mirroring along diagonal""" + return list(list(row) for row in zip(*matrix)) + + def _cmirror(self, matrix: list[list[int]]) -> list[list[int]]: + """mirroring along counterdiagonal""" + return list(list(row) for row in zip(*[r[::-1] for r in matrix[::-1]])) + + def _map(self, matrix: list[list[int]], a: int, b: int) -> list[list[int]]: + """mapping a to b""" + return [[b if x == a else x for x in row] for row in matrix] + + def _zero_divisible(self, matrix: list[list[int]], k: int) -> list[list[int]]: + """set elements divisible by k to zero""" + return [[0 if x % k == 0 else x for x in row] for row in matrix] + + def _crop( + self, matrix: list[list[int]], row_start: int, row_end: int, col_start: int, col_end: int + ) -> list[list[int]]: + """crop the matrix (1-indexed)""" + return [row[col_start - 1 : col_end] for row in matrix[row_start - 1 : row_end]] + + def _remove_every_nth_row(self, matrix: list[list[int]], n: int) -> list[list[int]]: + """remove every 
nth row (1-indexed)""" + return [row for i, row in enumerate(matrix, start=1) if i % n != 0] + + def _remove_every_nth_col(self, matrix: list[list[int]], n: int) -> list[list[int]]: + """remove every nth column (1-indexed)""" + return [[col for i, col in enumerate(row, start=1) if i % n != 0] for row in matrix] + + def __getitem__(self, idx: int) -> dict: + """Generate a single Manipulate Matrix question""" + rng = Random(self.seed + idx) + + matrix = self._get_matrix(rng) + matrix_str = self._matrix_to_str(matrix) + + num_transforms = rng.randint(0, self.config.max_transforms) + transforms = rng.sample(self._all_transforms, num_transforms) + operations = [] + + answer = deepcopy(matrix) + + for transform in transforms: + # Rotate + if transform == "rotate" and rng.random() < self.config.p_rotate: + rotation = rng.choice(list(self._rotations.keys())) + answer = self._rotations[rotation](answer) + operations.append( + { + "transform": transform, + "degrees": rotation, + "instruction": f"- Rotate the matrix {rotation} degrees", + } + ) + # Horizontal mirror + if transform == "hmirror" and rng.random() < self.config.p_hmirror: + answer = self._hmirror(answer) + operations.append({"transform": transform, "instruction": "- Horizontally mirror the matrix"}) + # Vertical mirror + if transform == "vmirror" and rng.random() < self.config.p_vmirror: + answer = self._vmirror(answer) + operations.append({"transform": transform, "instruction": "- Vertically mirror the matrix"}) + # Diagonal mirror + if transform == "dmirror" and rng.random() < self.config.p_dmirror: + answer = self._dmirror(answer) + operations.append({"transform": transform, "instruction": "- Mirror the matrix along the diagonal"}) + # Counterdiagonal mirror + if transform == "cmirror" and rng.random() < self.config.p_cmirror: + answer = self._cmirror(answer) + operations.append( + {"transform": transform, "instruction": "- Mirror the matrix along the counterdiagonal"} + ) + # Map a value to another + if 
transform == "map" and rng.random() < self.config.p_map: + a, b = rng.sample(range(10), 2) + answer = self._map(answer, a, b) + operations.append( + {"transform": transform, "from": a, "to": b, "instruction": f"- Map each occurrence of {a} to {b}"} + ) + # Set elements divisible by k to zero + if transform == "zero_divisible" and rng.random() < self.config.p_zero_divisible: + k = rng.randint(1, 9) + answer = self._zero_divisible(answer, k) + operations.append( + {"transform": transform, "k": k, "instruction": f"- Set all elements divisible by {k} to zero"} + ) + # Crop the matrix + if transform == "crop" and rng.random() < self.config.p_crop: + row_start = rng.randint(1, num_rows(answer)) + row_end = rng.randint(row_start, num_rows(answer)) + col_start = rng.randint(1, num_cols(answer)) + col_end = rng.randint(col_start, num_cols(answer)) + answer = self._crop(answer, row_start, row_end, col_start, col_end) + operations.append( + { + "transform": transform, + "row_start": row_start, + "row_end": row_end, + "col_start": col_start, + "col_end": col_end, + "instruction": f"- Crop the matrix to rows {row_start}-{row_end} and columns {col_start}-{col_end} (1-indexed)", + } + ) + # Remove every nth row + if ( + transform == "remove_every_nth_row" + and rng.random() < self.config.p_remove_every_nth_row + and num_rows(answer) > 1 + ): + n = rng.randint(2, num_rows(answer)) + answer = self._remove_every_nth_row(answer, n) + formatting = "st" if n == 1 else "nd" if n == 2 else "th" + operations.append( + {"transform": transform, "n": n, "instruction": f"- Remove every {n}-{formatting} row (1-indexed)"} + ) + # Remove every nth column + if ( + transform == "remove_every_nth_col" + and rng.random() < self.config.p_remove_every_nth_col + and num_cols(answer) > 1 + ): + n = rng.randint(2, num_cols(answer)) + answer = self._remove_every_nth_col(answer, n) + formatting = "st" if n == 1 else "nd" if n == 2 else "th" + operations.append( + { + "transform": transform, + "n": n, + 
"instruction": f"- Remove every {n}-{formatting} column (1-indexed)", + } + ) + + answer_str = self._matrix_to_str(answer) + + return { + "question": QUESTION_TEMPLATE.format( + matrix=matrix_str, operations="\n".join(op["instruction"] for op in operations) + ), + "answer": answer_str, + "metadata": {"matrix": matrix, "solution": answer, "operations": operations}, + } + + +register_dataset("manipulate_matrix", ManipulateMatrixDataset, ManipulateMatrixConfig) diff --git a/reasoning_gym/arithmetic/__init__.py b/reasoning_gym/arithmetic/__init__.py index f5e3eb1f..05d321da 100644 --- a/reasoning_gym/arithmetic/__init__.py +++ b/reasoning_gym/arithmetic/__init__.py @@ -5,11 +5,13 @@ Arithmetic tasks for training reasoning capabilities: from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset from .chain_sum import ChainSum, ChainSumConfig +from .count_bits import CountBitsConfig, CountBitsDataset from .fraction_simplification import FractionSimplificationConfig, FractionSimplificationDataset from .gcd import GCDConfig, GCDDataset from .gsm_symbolic.gsm_symbolic import GSMSymbolicDataset, GSMSymbolicDatasetConfig from .lcm import LCMConfig, LCMDataset from .leg_counting import LegCountingConfig, LegCountingDataset +from .power_function import PowerFunctionConfig, PowerFunctionDataset from .prime_factorization import PrimeFactorizationConfig, PrimeFactorizationDataset from .time_intervals import TimeIntervalsConfig, TimeIntervalsDataset @@ -34,4 +36,6 @@ __all__ = [ "GSMSymbolicDataset", "TimeIntervalsConfig", "TimeIntervalsDataset", + "CountBitsConfig", + "CountBitsDataset", ] diff --git a/reasoning_gym/arithmetic/count_bits.py b/reasoning_gym/arithmetic/count_bits.py new file mode 100644 index 00000000..5dc2c099 --- /dev/null +++ b/reasoning_gym/arithmetic/count_bits.py @@ -0,0 +1,47 @@ +"""Count number of 1 bits in a number.""" + +from dataclasses import 
dataclass +from random import Random +from typing import Optional + +from ..factory import ProceduralDataset, register_dataset + +QUESTION_TEMPLATE = """How many 1 bits are there in the binary representation of the number {number}?""" + + +@dataclass +class CountBitsConfig: + """Configuration for Count Bits dataset generation""" + + max_n: int = 2**31 - 1 # Maximum number to consider + + size: int = 500 # Virtual dataset size + seed: Optional[int] = None + + def validate(self): + """Validate configuration parameters""" + assert 1 <= self.max_n, "max_n must be at least 1" + + +class CountBitsDataset(ProceduralDataset): + """Generates Count Bits exercises with configurable difficulty""" + + def __init__(self, config: CountBitsConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + def __getitem__(self, idx: int) -> dict: + """Generate a single Count Bits question""" + rng = Random(self.seed + idx) + + number = rng.randint(1, self.config.max_n) + binary = bin(number)[2:] + answer = binary.count("1") + + return { + "question": QUESTION_TEMPLATE.format(number=number), + "answer": str(answer), + "metadata": {"number": number, "solution": answer, "binary": binary}, + } + + +register_dataset("count_bits", CountBitsDataset, CountBitsConfig) diff --git a/reasoning_gym/arithmetic/power_function.py b/reasoning_gym/arithmetic/power_function.py new file mode 100644 index 00000000..adbda12d --- /dev/null +++ b/reasoning_gym/arithmetic/power_function.py @@ -0,0 +1,62 @@ +"""Compute the power of a number.""" + +from dataclasses import dataclass +from math import pow +from random import Random +from typing import Dict, Optional + +from ..factory import ProceduralDataset, register_dataset + +QUESTION_TEMPLATE = """Compute {base}^{exponent}""" + + +@dataclass +class PowerFunctionConfig: + """Configuration for Power Function dataset generation""" + + min_base: float = -1e3 # Minimum base value + max_base: float = 1e3 # Maximum base value + min_exponent: int 
= -8 # Minimum exponent value + max_exponent: int = 8 # Maximum exponent value + + size: int = 500 # Virtual dataset size + seed: Optional[int] = None + + +class PowerFunctionDataset(ProceduralDataset): + """Generates Power Function exercises with configurable difficulty""" + + def __init__(self, config: PowerFunctionConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + """Overwrite this method in derived classes if a single oracle answer is not available.""" + oracle_answer = entry["answer"] + reward = 0.0 + if answer is not None: + difference = abs(float(answer) - float(oracle_answer)) + if difference < 1e-6: + reward = 1.0 + elif difference < 1e-1: + reward = 0.5 + else: + reward = 0.01 + + return reward + + def __getitem__(self, idx: int) -> dict: + """Generate a single Power Function question""" + rng = Random(self.seed + idx) + + base = rng.uniform(self.config.min_base, self.config.max_base) + exponent = rng.randint(self.config.min_exponent, self.config.max_exponent) + answer = pow(base, exponent) + + return { + "question": f"Compute {base}^{exponent}", + "answer": str(answer), + "metadata": {"base": base, "exponent": exponent, "solution": answer}, + } + + +register_dataset("power_function", PowerFunctionDataset, PowerFunctionConfig) diff --git a/tests/test_count_bits.py b/tests/test_count_bits.py new file mode 100644 index 00000000..6a36c886 --- /dev/null +++ b/tests/test_count_bits.py @@ -0,0 +1,83 @@ +"""Tests for Count bits questions generation""" + +import pytest + +from reasoning_gym.arithmetic.count_bits import CountBitsConfig, CountBitsDataset + + +def test_count_bits_config_validation(): + """Test that invalid configs raise appropriate errors""" + with pytest.raises(AssertionError): + config = CountBitsConfig(max_n=-1) # Negative not allowed + config.validate() + + with pytest.raises(AssertionError): + config = CountBitsConfig(max_n=0) # Zero 
not allowed + config.validate() + + +def test_count_bits_dataset_deterministic(): + """Test that dataset generates same items with same seed""" + config = CountBitsConfig(seed=42, size=10) + dataset1 = CountBitsDataset(config) + dataset2 = CountBitsDataset(config) + + for i in range(len(dataset1)): + assert dataset1[i] == dataset2[i] + + +def test_count_bits_dataset_items(): + """Test basic properties of generated items""" + config = CountBitsConfig(max_n=10, size=10, seed=42) + dataset = CountBitsDataset(config) + + for i in range(len(dataset)): + item = dataset[i] + # Check item structure + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Check metadata + assert "number" in item["metadata"] + assert "solution" in item["metadata"] + assert "binary" in item["metadata"] + + number = item["metadata"]["number"] + solution = item["metadata"]["solution"] + binary = item["metadata"]["binary"] + + # Verify values + assert number <= config.max_n + assert solution >= 0 + assert set(binary) <= {"0", "1"} + + +def test_count_bits_dataset_iteration(): + """Test that iteration respects dataset size""" + config = CountBitsConfig(size=5, seed=42) + dataset = CountBitsDataset(config) + + items = list(dataset) + assert len(items) == config.size + + # Test multiple iterations yield same items + assert items == list(dataset) + + +def test_count_bits_answer(): + """Verify the number of 1 bits in the binary representation of a number""" + config = CountBitsConfig(size=5, seed=42) + dataset = CountBitsDataset(config) + + for item in dataset: + number = item["metadata"]["number"] + solution = item["metadata"]["solution"] + + # Count number of 1 bits in the number by shifting + count = 0 + while number: + count += number & 1 + number >>= 1 + assert solution == count diff --git a/tests/test_manipulate_matrix.py b/tests/test_manipulate_matrix.py new file mode 100644 index 00000000..9dfc3655 --- /dev/null +++ 
def test_manipulate_matrix_config_validation():
    """Every out-of-range config value must make ``validate()`` assert."""
    dimension_fields = ["min_rows", "min_cols", "max_rows", "max_cols"]
    # NOTE(review): "p_rotate" is not in this list — confirm whether
    # validate() checks it and add it here if so.
    probability_fields = [
        "p_hmirror",
        "p_vmirror",
        "p_dmirror",
        "p_cmirror",
        "p_map",
        "p_crop",
        "p_remove_every_nth_row",
        "p_remove_every_nth_col",
        "p_zero_divisible",
    ]

    # (field name, invalid value) pairs, in the order they are checked.
    invalid_cases = [("max_transforms", -1)]  # max_transforms should be non-negative
    # Dimensions should be positive integers
    invalid_cases += [(name, bad) for name in dimension_fields for bad in (-1, 0)]
    # Probabilities should be between 0 and 1 inclusive
    invalid_cases += [(name, bad) for name in probability_fields for bad in (-0.01, 1.01)]

    for name, bad in invalid_cases:
        with pytest.raises(AssertionError):
            ManipulateMatrixConfig(**{name: bad}).validate()
def test_manipulate_matrix_transforms():
    """Check each matrix transform helper against a hand-written 5x5 result."""
    dataset = ManipulateMatrixDataset(ManipulateMatrixConfig(seed=42))
    grid = [
        [1, 2, 3, 4, 5],
        [6, 7, 8, 9, 10],
        [11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20],
        [21, 22, 23, 24, 25],
    ]

    # Transforms that take only the matrix, mapped to their expected output.
    # Dict order is the order in which the helpers are exercised.
    expected_by_method = {
        "_identity": grid,
        "_rot90": [
            [21, 16, 11, 6, 1],
            [22, 17, 12, 7, 2],
            [23, 18, 13, 8, 3],
            [24, 19, 14, 9, 4],
            [25, 20, 15, 10, 5],
        ],
        "_rot180": [
            [25, 24, 23, 22, 21],
            [20, 19, 18, 17, 16],
            [15, 14, 13, 12, 11],
            [10, 9, 8, 7, 6],
            [5, 4, 3, 2, 1],
        ],
        "_rot270": [
            [5, 10, 15, 20, 25],
            [4, 9, 14, 19, 24],
            [3, 8, 13, 18, 23],
            [2, 7, 12, 17, 22],
            [1, 6, 11, 16, 21],
        ],
        "_hmirror": [
            [21, 22, 23, 24, 25],
            [16, 17, 18, 19, 20],
            [11, 12, 13, 14, 15],
            [6, 7, 8, 9, 10],
            [1, 2, 3, 4, 5],
        ],
        "_vmirror": [
            [5, 4, 3, 2, 1],
            [10, 9, 8, 7, 6],
            [15, 14, 13, 12, 11],
            [20, 19, 18, 17, 16],
            [25, 24, 23, 22, 21],
        ],
        "_dmirror": [
            [1, 6, 11, 16, 21],
            [2, 7, 12, 17, 22],
            [3, 8, 13, 18, 23],
            [4, 9, 14, 19, 24],
            [5, 10, 15, 20, 25],
        ],
        "_cmirror": [
            [25, 20, 15, 10, 5],
            [24, 19, 14, 9, 4],
            [23, 18, 13, 8, 3],
            [22, 17, 12, 7, 2],
            [21, 16, 11, 6, 1],
        ],
    }
    for method_name, expected in expected_by_method.items():
        assert getattr(dataset, method_name)(grid) == expected, method_name

    # map: only the cell holding 13 is replaced by 0
    mapped = dataset._map(grid, a=13, b=0)
    assert mapped == [
        [1, 2, 3, 4, 5],
        [6, 7, 8, 9, 10],
        [11, 12, 0, 14, 15],
        [16, 17, 18, 19, 20],
        [21, 22, 23, 24, 25],
    ]

    # zero divisible: every multiple of 3 becomes 0
    zeroed = dataset._zero_divisible(grid, k=3)
    assert zeroed == [
        [1, 2, 0, 4, 5],
        [0, 7, 8, 0, 10],
        [11, 0, 13, 14, 0],
        [16, 17, 0, 19, 20],
        [0, 22, 23, 0, 25],
    ]

    # crop: rows 2..4 and columns 1..3 of the grid are kept
    cropped = dataset._crop(grid, row_start=2, row_end=4, col_start=1, col_end=3)
    assert cropped == [
        [6, 7, 8],
        [11, 12, 13],
        [16, 17, 18],
    ]

    # remove every nth row (n=2): the 2nd and 4th rows disappear
    fewer_rows = dataset._remove_every_nth_row(grid, n=2)
    assert fewer_rows == [
        [1, 2, 3, 4, 5],
        [11, 12, 13, 14, 15],
        [21, 22, 23, 24, 25],
    ]

    # remove every nth col (n=2): the 2nd and 4th columns disappear
    fewer_cols = dataset._remove_every_nth_col(grid, n=2)
    assert fewer_cols == [
        [1, 3, 5],
        [6, 8, 10],
        [11, 13, 15],
        [16, 18, 20],
        [21, 23, 25],
    ]
def test_power_function_score_function():
    """Exercise every reward tier returned by ``score_answer``."""
    dataset = PowerFunctionDataset(PowerFunctionConfig(seed=42))
    item = dataset[0]
    solution = item["metadata"]["solution"]

    # (distance subtracted from the true value, reward expected for it)
    reward_tiers = [
        (1e-7, 1.0),  # within 1e-6 of the solution
        (1e-2, 0.5),  # within 1e-1 of the solution
        (1, 0.01),  # far from the solution
    ]
    for offset, expected_reward in reward_tiers:
        candidate = str(solution - offset)
        assert dataset.score_answer(candidate, item) == expected_reward

    # A missing answer earns nothing at all.
    assert dataset.score_answer(None, item) == 0.0