diff --git a/GALLERY.md b/GALLERY.md index e7ba9238..5aa6fc66 100644 --- a/GALLERY.md +++ b/GALLERY.md @@ -9,6 +9,7 @@ This gallery shows examples from all available datasets using their default conf - [base_conversion](#base_conversion) - [basic_arithmetic](#basic_arithmetic) - [bf](#bf) +- [binary_matrix](#binary_matrix) - [caesar_cipher](#caesar_cipher) - [calendar_arithmetic](#calendar_arithmetic) - [chain_sum](#chain_sum) @@ -751,6 +752,98 @@ Metadata: {'bfit_code': '\nint main() {\n print("under");\n}\n', 'bf_program' ```` +### binary_matrix +Generates Binary Matrix exercises with configurable difficulty + +Default configuration: +```python +max_n = 10 +p_zero = 0.25 +size = 500 +seed = 42 +``` + +Example tasks: +```` +Example 1: +Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. + +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +0 0 +1 0 + +Answer: 0 0 +1 0 +Metadata: {'matrix': [[0, 0], [1, 0]], 'solution': [[0, 0], [1, 0]]} + +Example 2: +Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. + +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +0 + +Answer: 0 +Metadata: {'matrix': [[0]], 'solution': [[0]]} + +Example 3: +Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. 
+ +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +1 0 1 1 0 1 1 +1 0 1 1 1 1 1 +1 1 1 1 0 1 1 +1 1 1 1 0 0 1 +0 1 1 1 1 1 0 +1 0 1 1 1 1 0 +1 1 1 1 1 1 1 + +Answer: 1 0 1 1 0 1 2 +1 0 1 2 1 2 3 +2 1 2 1 0 1 2 +1 2 2 1 0 0 1 +0 1 2 2 1 1 0 +1 0 1 2 2 1 0 +2 1 2 3 3 2 1 +Metadata: {'matrix': [[1, 0, 1, 1, 0, 1, 1], [1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 0, 1], [0, 1, 1, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1]], 'solution': [[1, 0, 1, 1, 0, 1, 2], [1, 0, 1, 2, 1, 2, 3], [2, 1, 2, 1, 0, 1, 2], [1, 2, 2, 1, 0, 0, 1], [0, 1, 2, 2, 1, 1, 0], [1, 0, 1, 2, 2, 1, 0], [2, 1, 2, 3, 3, 2, 1]]} + +```` + ### caesar_cipher Generates Caesar cipher encryption/decryption tasks diff --git a/eval/.gitignore b/eval/.gitignore new file mode 100644 index 00000000..7db6c7ce --- /dev/null +++ b/eval/.gitignore @@ -0,0 +1,2 @@ +results/* +!results/summary* diff --git a/eval/eval.py b/eval/eval.py new file mode 100644 index 00000000..f8952e10 --- /dev/null +++ b/eval/eval.py @@ -0,0 +1,143 @@ +import argparse +import json +import os +from datetime import datetime +from typing import Any, Dict, List + +from openai import OpenAI + +from reasoning_gym.factory import DATASETS, create_dataset + + +class OpenRouterEvaluator: + def __init__(self, model: str): + self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY")) + self.model = model + self.extra_headers = {} + + def get_model_response(self, prompt: str) -> str: + """Get response from the model via OpenRouter API.""" + try: + completion = self.client.chat.completions.create( + extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}] + ) + return completion.choices[0].message.content + except Exception as e: + print(f"Error calling OpenRouter API: 
{str(e)}") + raise + + def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Evaluate model on multiple datasets with their respective configurations.""" + all_results = [] + + for dataset_config in dataset_configs: + dataset_name = dataset_config.pop("name") + print(f"\nEvaluating dataset: {dataset_name}") + + try: + # Create dataset with its specific configuration + data = create_dataset(dataset_name, **dataset_config) + results = [] + + for entry in data: + try: + response = self.get_model_response(entry["question"]) + score = data.score_answer(answer=response, entry=entry) + + result = { + "question": entry["question"], + "expected_answer": entry["answer"], + "model_answer": response, + "score": score, + "metadata": entry["metadata"], + } + results.append(result) + print(f"Processed question {len(results)}/{len(data)}. Score: {score}") + + except Exception as e: + print(f"Error processing question: {entry['question']}") + print(f"Error: {str(e)}") + + # Calculate aggregate metrics + total_score = sum(r["score"] for r in results) + metrics = { + "dataset_name": dataset_name, + "model": self.model, + "size": len(data), + "average_score": total_score / len(results) if results else 0, + "total_examples": len(results), + "timestamp": datetime.now().isoformat(), + "config": dataset_config, + } + + all_results.append({"metrics": metrics, "results": results}) + + except Exception as e: + print(f"Error evaluating dataset {dataset_name}: {str(e)}") + continue + + return all_results + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets") + parser.add_argument("--model", required=True, help="Model to evaluate") + parser.add_argument("--config", required=True, help="Path to JSON configuration file") + parser.add_argument("--output-dir", default="results", help="Output directory") + + args = parser.parse_args() + + # Create output directory if it doesn't exist + 
os.makedirs(args.output_dir, exist_ok=True) + + # Load dataset configurations + with open(args.config, "r") as f: + dataset_configs = json.load(f) + + evaluator = OpenRouterEvaluator(model=args.model) + all_results = evaluator.evaluate_datasets(dataset_configs) + + # Save results + output_file = os.path.join( + args.output_dir, f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + ) + + # Save detailed results + with open(output_file, "w") as f: + json.dump(all_results, f, indent=2) + + # Create summary + summary = [] + for result in all_results: + metrics = result["metrics"] + summary_entry = { + "dataset_name": metrics["dataset_name"], + "model": metrics["model"], + "average_score": metrics["average_score"], + "total_examples": metrics["total_examples"], + "timestamp": metrics["timestamp"], + "config": metrics["config"], + } + summary.append(summary_entry) + + # Save summary to a separate file + summary_file = os.path.join( + args.output_dir, f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + ) + + with open(summary_file, "w") as f: + json.dump(summary, f, indent=2) + + # Print summary + print("\nEvaluation Summary:") + for entry in summary: + print(f"\nDataset: {entry['dataset_name']}") + print(f"Average Score: {entry['average_score']:.2%}") + print(f"Total Examples: {entry['total_examples']}") + + print(f"\nDetailed results saved to: {output_file}") + print(f"Summary saved to: {summary_file}") + + +if __name__ == "__main__": + main() diff --git a/eval/eval.sh b/eval/eval.sh new file mode 100644 index 00000000..1d2a0beb --- /dev/null +++ b/eval/eval.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Check if OPENROUTER_API_KEY is set +if [ -z "$OPENROUTER_API_KEY" ]; then + echo "Error: OPENROUTER_API_KEY environment variable is not set" + echo "Please set it using: export OPENROUTER_API_KEY=your-api-key" + exit 1 +fi + +# Configuration +OUTPUT_DIR="results" + +# List of models to evaluate 
+MODELS=( + "google/gemini-2.0-flash-001" +) + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +# Run evaluations +for model in "${MODELS[@]}"; do + echo "Evaluating $model..." + python eval.py \ + --model "$model" \ + --config "eval_basic.json" \ + --output-dir "$OUTPUT_DIR" +done + +echo "All evaluations completed!" diff --git a/eval/eval_basic.json b/eval/eval_basic.json new file mode 100644 index 00000000..6240ce9d --- /dev/null +++ b/eval/eval_basic.json @@ -0,0 +1,21 @@ +[ + { + "name": "letter_counting", + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + }, + { + "name": "propositional_logic", + "size": 10, + "seed": 42 + }, + { + "name": "leg_counting", + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + ] diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json new file mode 100644 index 00000000..8c9d6a5c --- /dev/null +++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json @@ -0,0 +1,39 @@ +[ + { + "dataset_name": "letter_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.20600000000000002, + "total_examples": 10, + "timestamp": "2025-02-10T06:34:37.091554", + "config": { + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059, + "total_examples": 10, + "timestamp": "2025-02-10T06:35:11.432275", + "config": { + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.40199999999999997, + "total_examples": 10, + "timestamp": "2025-02-10T06:35:27.087469", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + } +] diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json 
b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json new file mode 100644 index 00000000..32086f46 --- /dev/null +++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json @@ -0,0 +1,39 @@ +[ + { + "dataset_name": "letter_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.20600000000000002, + "total_examples": 10, + "timestamp": "2025-02-09T22:47:25.934820", + "config": { + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059, + "total_examples": 10, + "timestamp": "2025-02-09T22:47:57.473560", + "config": { + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.40199999999999997, + "total_examples": 10, + "timestamp": "2025-02-09T22:48:13.546006", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + } +] diff --git a/reasoning_gym/algebra/polynomial_equations.py b/reasoning_gym/algebra/polynomial_equations.py index ed7e857f..a1822958 100644 --- a/reasoning_gym/algebra/polynomial_equations.py +++ b/reasoning_gym/algebra/polynomial_equations.py @@ -1,7 +1,8 @@ +import math import random import string from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Dict, List, Optional, Tuple from sympy import Eq, Symbol, expand, solve @@ -26,6 +27,9 @@ class PolynomialEquationsConfig: ) # Allowed operators between terms, Avoid adding '*' or '/' because they will affect the degree seed: Optional[int] = None size: int = 500 + # reward function hyperparameters + penalty_missing_factor = 0.1 + penalty_extra_factor = 0.05 def validate(self) -> None: """Validate configuration parameters.""" @@ -146,5 +150,101 @@ class PolynomialEquationsDataset(ProceduralDataset): return polynomial_expr + def _parse_score_to_list(self, answer: Optional[str]) -> 
List[float]: + """Parses a comma-separated string of scores into a sorted list of floats. + + This method takes a string containing comma-separated numeric values, + attempts to convert each value to a float, and returns a sorted list of these floats. + Any values that cannot be converted to a float are ignored. + Handles empty strings gracefully. + + Args: + answer: An optional string containing comma-separated numeric values. + Can be None or an empty string. + Returns: + A sorted list of floats parsed from the input string. + Returns an empty list if the input is None, empty, or contains no valid numeric values. + """ + + if answer is None or len(answer) == 0: # Handle None or empty input + return [] + + output_float_vals = [] + for output_val in answer.split(","): + try: + # Convert to float, strip whitespace + output_float_vals.append(float(output_val.strip())) + except ValueError: + # Ignore values that cannot be converted to float + continue + + return sorted(output_float_vals) # Return the sorted list of floats + + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + """ + Score an answer based on its numerical distance to oracle solutions using exponential decay. + This function compares a predicted answer (or list of answers) to a set of oracle solutions + (also a list of numbers). It calculates a reward based on how close the predicted solutions + are to the oracle solutions, using an exponential decay function. It also applies penalties + for missing or extra predicted solutions. The implementation is a greedy algorithm where we + find the closest matching oracle solution for a given predicted solution and only allow an + oracle solution to match once. + + Args: + answer: The predicted answer (or a string that can be parsed into a list of numbers). + May be None. + entry: A dictionary containing the oracle solution(s) under the key "answer" + (which can be a string that can be parsed into a list of numbers). 
+ + Returns: + A float representing the final score. The score is non-negative. + """ + oracle_solutions = self._parse_score_to_list(entry["answer"]) # Parse oracle solutions + predicted_solutions = self._parse_score_to_list(answer) # Parse predicted solutions + + total_reward = 0.0 + matched_solutions = 0 + extra_solutions = 0 + missing_solutions = 0 + + for predicted_solution in predicted_solutions: + + # find the closest matching solution from the oracle solutions. + # this is a greedy approach to computing the score + matched_distance = float("inf") + matched_distance_index = None + for oracle_solution_index, oracle_solution in enumerate(oracle_solutions): + if matched_distance > abs(predicted_solution - oracle_solution): + matched_distance = abs(predicted_solution - oracle_solution) + matched_distance_index = oracle_solution_index + + if matched_distance_index is not None: + matched_solutions += 1 + # Remove matched oracle solution + oracle_solutions.pop(matched_distance_index) + # Exponential decay reward + total_reward += math.exp(-matched_distance) + else: + # Extra predicted solution + extra_solutions += 1 + + # Count remaining oracle solutions as missing + for oracle_solution in oracle_solutions: + missing_solutions += 1 + + # Calculate penalty for either missing or extra solutions + penalty = missing_solutions * self.config.penalty_missing_factor + penalty += extra_solutions * self.config.penalty_extra_factor + + if matched_solutions > 0: + # normalize the rewards that we found matching solutions for + # so that the value is bounded between 0 and 1 + total_reward = total_reward / matched_solutions + + # Final reward is floored at 0 (never negative) + final_reward = max(0, total_reward - penalty) + + return final_reward + register_dataset("polynomial_equations", PolynomialEquationsDataset, PolynomialEquationsConfig) diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py index dac1b62c..ad76199f 100644 ---
a/reasoning_gym/algorithmic/__init__.py +++ b/reasoning_gym/algorithmic/__init__.py @@ -7,6 +7,7 @@ Algorithmic tasks for training reasoning capabilities: """ from .base_conversion import BaseConversionConfig, BaseConversionDataset +from .binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset @@ -63,4 +64,6 @@ __all__ = [ "RotateMatrixDataset", "ManipulateMatrixConfig", "ManipulateMatrixDataset", + "BinaryMatrixConfig", + "BinaryMatrixDataset", ] diff --git a/reasoning_gym/algorithmic/binary_matrix.py b/reasoning_gym/algorithmic/binary_matrix.py new file mode 100644 index 00000000..8ae122bb --- /dev/null +++ b/reasoning_gym/algorithmic/binary_matrix.py @@ -0,0 +1,125 @@ +"""Find the distance to the nearest 0 for each cell in a binary matrix. + +A popular Leetcode problem: +https://leetcode.com/problems/01-matrix/description/ +""" + +from collections import deque +from dataclasses import dataclass +from random import Random +from typing import Optional + +from ..factory import ProceduralDataset, register_dataset + +QUESTION_TEMPLATE = """Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. 
+ +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +{matrix} +""" + + +@dataclass +class BinaryMatrixConfig: + """Configuration for Binary Matrix dataset generation""" + + max_n: int = 10 # Maximum size of the matrix + p_zero: float = 0.25 # Probability of a cell being 0 + + size: int = 500 # Virtual dataset size + seed: Optional[int] = None + + def validate(self): + """Validate configuration parameters""" + assert 1 <= self.max_n, "max_n must be at least 1" + assert 0 < self.p_zero <= 1, "p_zero must be between 0 and 1" + + +class BinaryMatrixDataset(ProceduralDataset): + """Generates Binary Matrix exercises with configurable difficulty""" + + def __init__(self, config: BinaryMatrixConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + def _get_binary_matrix(self, rng: Random) -> list[list[int]]: + """Generate a random binary matrix""" + n = rng.randint(1, self.config.max_n) + # Ensure at least one 0 in the matrix, so that a solution exists + numbers = [0] + [0 if rng.random() < self.config.p_zero else 1 for _ in range(n**2 - 1)] + rng.shuffle(numbers) + matrix = [numbers[i * n : (i + 1) * n] for i in range(n)] + return matrix + + def _get_distances(self, matrix: list[list[int]]) -> list[list[int]]: + """Get the distance to the nearest 0 for each cell in the matrix""" + n = len(matrix) + directions = [[1, 0], [-1, 0], [0, 1], [0, -1]] + visited = set() + queue = deque() + + output = [[float("inf")] * n for _ in range(n)] + + for r in range(n): + for c in range(n): + if matrix[r][c] == 0: + output[r][c] = 0 + visited.add((r, c)) + queue.append((r, c)) + + clock = 1 + while True: + temp = deque() + while queue: + r, c = queue.popleft() + for dr, dc in directions: + new_r, new_c = r + dr, c + dc + if ( + 0 <= new_r < n + and 0 <= new_c < n + and (new_r, new_c) not in visited 
+ and matrix[new_r][new_c] == 1 + ): + output[new_r][new_c] = clock + visited.add((new_r, new_c)) + temp.append((new_r, new_c)) + if temp: + queue = temp + else: + break + clock += 1 + + return output + + def _matrix_to_str(self, matrix: list[list[int]]) -> str: + """Get a string representation of the matrix""" + return "\n".join(" ".join(str(x) for x in row) for row in matrix) + + def __getitem__(self, idx: int) -> dict: + """Generate a single Binary Matrix question""" + rng = Random(self.seed + idx) + + matrix = self._get_binary_matrix(rng) + matrix_str = self._matrix_to_str(matrix) + + answer = self._get_distances(matrix) + answer_str = self._matrix_to_str(answer) + + return { + "question": QUESTION_TEMPLATE.format(matrix=matrix_str), + "answer": answer_str, + "metadata": {"matrix": matrix, "solution": answer}, + } + + +register_dataset("binary_matrix", BinaryMatrixDataset, BinaryMatrixConfig) diff --git a/reasoning_gym/algorithmic/rotate_matrix.py b/reasoning_gym/algorithmic/rotate_matrix.py index ac50a281..4fdf651e 100644 --- a/reasoning_gym/algorithmic/rotate_matrix.py +++ b/reasoning_gym/algorithmic/rotate_matrix.py @@ -83,7 +83,7 @@ class RotateMatrixDataset(ProceduralDataset): return "\n".join(" ".join(str(x) for x in row) for row in matrix) def __getitem__(self, idx: int) -> dict: - """Generate a single Spiral Matrix question""" + """Generate a single Rotate Matrix question""" rng = Random(self.seed + idx) matrix = self._get_matrix(rng) diff --git a/reasoning_gym/algorithmic/word_ladder.py b/reasoning_gym/algorithmic/word_ladder.py index a0b000c2..64c65326 100644 --- a/reasoning_gym/algorithmic/word_ladder.py +++ b/reasoning_gym/algorithmic/word_ladder.py @@ -5,8 +5,7 @@ from dataclasses import dataclass from random import Random from typing import Dict, List, Optional, Set, Tuple -from reasoning_gym.data import read_data_file - +from ..data import get_data_file_path from ..factory import ProceduralDataset, register_dataset @@ -64,6 +63,7 @@ class 
WordLadderDataset(ProceduralDataset): self.config = config self.word_sets = {} self.word_graphs = {} + self._vocabulary = None # A large list of dictionary words to validate words against # Load words from CSV self.word_sets = self._load_words_from_csv( @@ -84,28 +84,24 @@ class WordLadderDataset(ProceduralDataset): assert 3 <= min_length <= max_length <= 5, "Word length must be between 3 and 5 inclusive" import csv - from io import StringIO word_sets = {} try: # Get CSV content as string - csv_content = read_data_file("words.csv") + with get_data_file_path("words.csv").open("r", encoding="utf-8") as csv_file: + reader = csv.DictReader(csv_file) - # Use StringIO to create a file-like object from the string - csv_file = StringIO(csv_content) - reader = csv.DictReader(csv_file) + for row in reader: + # Process each word length column using config range + for length in range(min_length, max_length + 1): + col_name = f"{length}_letter" + word = row.get(col_name, "") - for row in reader: - # Process each word length column using config range - for length in range(min_length, max_length + 1): - col_name = f"{length}_letter" - word = row.get(col_name, "") + if not word: # Skip empty entries + continue - if not word: # Skip empty entries - continue - - word_sets.setdefault(length, set()).add(word.upper()) + word_sets.setdefault(length, set()).add(word.upper()) except Exception as e: raise RuntimeError(f"Error processing words.csv content: {e}") from e @@ -220,5 +216,43 @@ class WordLadderDataset(ProceduralDataset): "metadata": {"start_word": start, "end_word": end, "word_length": length, "chain_length": len(path)}, } + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + if answer is None: + return 0 + + answer_words = tuple(s.strip() for s in answer.upper().split(",")) + + metadata = entry["metadata"] + start_word = metadata["start_word"] + end_word = metadata["end_word"] + word_length = len(end_word) + known_words = self.word_sets[word_length] 
+ + # Check conditions: + # 1. start and end word match question + # 2. all words have the correct length + # 3. every changed word is a single letter change from the previous word + # 4. all words are in our vocabulary + + if len(answer_words) < 2: + return 0 + + if answer_words[0] != start_word or answer_words[-1] != end_word: + return 0.01 + + if not all(len(w) == word_length for w in answer_words): + return 0.01 + + for i in range(1, len(answer_words)): + if sum(1 for a, b in zip(answer_words[i - 1], answer_words[i]) if a != b) != 1: + return 0.01 + + reward = 1.0 + for word in answer_words: + if word not in known_words: + reward *= 0.5 + + return reward + register_dataset("word_ladder", WordLadderDataset, WordLadderConfig) diff --git a/requirements-dev.txt b/requirements-dev.txt index b96fc1c2..18cbc82d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ isort>=5.13.2 flake8>=7.1.1 mypy>=1.14.1 pre-commit>=4.1.0 +openai>=1.61.1 diff --git a/tests/test_binary_matrix.py b/tests/test_binary_matrix.py new file mode 100644 index 00000000..60c700d7 --- /dev/null +++ b/tests/test_binary_matrix.py @@ -0,0 +1,100 @@ +"""Tests for Binary Matrix questions generation""" + +import pytest + +from reasoning_gym.algorithmic.binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset + + +def test_binary_matrix_config_validation(): + """Test that invalid configs raise appropriate errors""" + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(max_n=-1) # Negative not allowed + config.validate() + + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(max_n=0) # Zero not allowed + config.validate() + + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(p_zero=0) # <= 0 not allowed + config.validate() + + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(p_zero=1.01) # > 1 not allowed + config.validate() + + +def test_binary_matrix_dataset_deterministic(): + """Test that dataset generates same
items with same seed""" + config = BinaryMatrixConfig(seed=42, size=10) + dataset1 = BinaryMatrixDataset(config) + dataset2 = BinaryMatrixDataset(config) + + for i in range(len(dataset1)): + assert dataset1[i] == dataset2[i] + + +def test_binary_matrix_dataset_items(): + """Test basic properties of generated items""" + config = BinaryMatrixConfig(max_n=5, size=10, seed=42) + dataset = BinaryMatrixDataset(config) + + for i in range(len(dataset)): + item = dataset[i] + # Check item structure + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Check metadata + assert "matrix" in item["metadata"] + assert "solution" in item["metadata"] + + matrix = item["metadata"]["matrix"] + solution = item["metadata"]["solution"] + + # Verify list dimensions + assert len(matrix) <= config.max_n + assert all(len(row) <= config.max_n for row in matrix) + assert all(len(row) <= config.max_n for row in solution) + + # Verify matrix values + for r in range(len(matrix)): + for c in range(len(matrix[r])): + assert matrix[r][c] in {0, 1} + assert solution[r][c] >= matrix[r][c] + + +def test_binary_matrix_dataset_iteration(): + """Test that iteration respects dataset size""" + config = BinaryMatrixConfig(size=5, seed=42) + dataset = BinaryMatrixDataset(config) + + items = list(dataset) + assert len(items) == config.size + + # Test multiple iterations yield same items + assert items == list(dataset) + + +def test_binary_matrix_answer(): + """Test the _get_distances method""" + config = BinaryMatrixConfig(seed=42) + dataset = BinaryMatrixDataset(config) + + # 1x1 matrix + matrix = [[0]] + assert dataset._get_distances(matrix) == [[0]] + + # 2x2 matrix + matrix = [[0, 1], [1, 1]] + assert dataset._get_distances(matrix) == [[0, 1], [1, 2]] + + # 3x3 matrix + matrix = [[0, 0, 0], [0, 1, 0], [1, 1, 1]] + assert dataset._get_distances(matrix) == [[0, 0, 0], [0, 1, 0], [1, 2, 1]] + + # Empty matrix + matrix = [[0, 0, 0], [0, 0, 0], 
[0, 0, 0]] + assert dataset._get_distances(matrix) == [[0, 0, 0], [0, 0, 0], [0, 0, 0]] diff --git a/tests/test_polynomial_equations.py b/tests/test_polynomial_equations.py index 6e1bb0c0..e7caf654 100644 --- a/tests/test_polynomial_equations.py +++ b/tests/test_polynomial_equations.py @@ -1,4 +1,5 @@ import pytest +from pytest import approx from sympy import Symbol, sympify from reasoning_gym import create_dataset @@ -115,3 +116,25 @@ def test_polynomial_solutions_evaluation(): f"Solution {solution} does not satisfy the polynomial {poly_str}. " f"Evaluated value: {evaluated_value}" ) + + +@pytest.mark.parametrize( + "oracle_answer, predicted_answer, expected_reward", + [ + ("4,-4.12", "4,-4.12", 1.0), # Exact match + ("4,-4.12", "4.0001,-4.120001", approx(0.9999, rel=1e-3)), # Very close match + ("4,-4.12", "4.1,-4.2", approx(0.9139, rel=1e-3)), + ("4,8", "4", approx(0.9, rel=1e-3)), # Missing an oracle solution -> missing solution penalty applies + ("4", "4,8", approx(0.95, rel=1e-3)), # extra solution -> extra solution penalty + ("-1,-2", "1,4", approx(0.06890, rel=1e-3)), # -1 matched w/ 1 and -2 matched w/ 4 + ("", "1", approx(0, rel=1e-4)), # oracle no solution, predicted extra solution + ("1", "", approx(0, rel=1e-4)), # oracle has a solution, predicted no solution + ], +) +def test_polynomial_solutions_score_answer(oracle_answer, predicted_answer, expected_reward): + # You might want to parameterize cfg as well + cfg = PolynomialEquationsConfig(seed=999, size=3) + ds = PolynomialEquationsDataset(cfg) + + actual_reward = ds.score_answer(predicted_answer, {"answer": oracle_answer}) + assert actual_reward == pytest.approx(expected_reward, rel=1e-3) # Fuzzy comparison for floats diff --git a/tests/test_word_ladder.py b/tests/test_word_ladder.py index d42108ea..1aba4cf3 100644 --- a/tests/test_word_ladder.py +++ b/tests/test_word_ladder.py @@ -355,5 +355,45 @@ def test_word_ladder_edge_cases(): assert max_length > 3, "No challenging word pairs generated" -if 
__name__ == "__main__": - pytest.main([__file__]) +def test_word_ladder_score_answer(): + """Test the score_answer method""" + config = WordLadderConfig(min_word_length=4, max_word_length=4) + dataset = WordLadderDataset(config) + + # Create a test entry + entry = { + "question": "Transform the word ladder 'COLD' to 'WARM' by changing one letter at a time.", + "answer": "COLD,CORD,CARD,WARD,WARM", + "metadata": {"start_word": "COLD", "end_word": "WARM", "word_length": 4, "chain_length": 5}, + } + + # Test perfect answer + assert dataset.score_answer("COLD,CORD,CARD,WARD,WARM", entry) == 1.0 + + # Test None answer + assert dataset.score_answer(None, entry) == 0.0 + + # Test empty answer + assert dataset.score_answer("", entry) == 0.0 + + # Test single word answer + assert dataset.score_answer("COLD", entry) == 0.0 + + # Test wrong start word + assert dataset.score_answer("BOLD,CORD,CARD,WARD,WARM", entry) == 0.01 + + # Test wrong end word + assert dataset.score_answer("COLD,CORD,CARD,WARD,WARP", entry) == 0.01 + + # Test wrong word length + assert dataset.score_answer("COLD,CORDS,CARDS,WARD,WARM", entry) == 0.01 + + # Test invalid transitions (more than one letter change) + assert dataset.score_answer("COLD,WARD,WARM", entry) == 0.01 + + # Test case insensitivity + assert dataset.score_answer("cold,cord,card,ward,warm", entry) == 1.0 + + # Test with unknown words (should return partial credit) + assert dataset.score_answer("COLD,COXD,CARD,WARD,WARM", entry) < 1.0 + assert dataset.score_answer("COLD,COXD,CARD,WARD,WARM", entry) > 0.0