diff --git a/GALLERY.md b/GALLERY.md index e7ba9238..5aa6fc66 100644 --- a/GALLERY.md +++ b/GALLERY.md @@ -9,6 +9,7 @@ This gallery shows examples from all available datasets using their default conf - [base_conversion](#base_conversion) - [basic_arithmetic](#basic_arithmetic) - [bf](#bf) +- [binary_matrix](#binary_matrix) - [caesar_cipher](#caesar_cipher) - [calendar_arithmetic](#calendar_arithmetic) - [chain_sum](#chain_sum) @@ -751,6 +752,98 @@ Metadata: {'bfit_code': '\nint main() {\n print("under");\n}\n', 'bf_program' ```` +### binary_matrix +Generates Binary Matrix exercises with configurable difficulty + +Default configuration: +```python +max_n = 10 +p_zero = 0.25 +size = 500 +seed = 42 +``` + +Example tasks: +```` +Example 1: +Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. + +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +0 0 +1 0 + +Answer: 0 0 +1 0 +Metadata: {'matrix': [[0, 0], [1, 0]], 'solution': [[0, 0], [1, 0]]} + +Example 2: +Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. + +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +0 + +Answer: 0 +Metadata: {'matrix': [[0]], 'solution': [[0]]} + +Example 3: +Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. 
+ +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +1 0 1 1 0 1 1 +1 0 1 1 1 1 1 +1 1 1 1 0 1 1 +1 1 1 1 0 0 1 +0 1 1 1 1 1 0 +1 0 1 1 1 1 0 +1 1 1 1 1 1 1 + +Answer: 1 0 1 1 0 1 2 +1 0 1 2 1 2 3 +2 1 2 1 0 1 2 +1 2 2 1 0 0 1 +0 1 2 2 1 1 0 +1 0 1 2 2 1 0 +2 1 2 3 3 2 1 +Metadata: {'matrix': [[1, 0, 1, 1, 0, 1, 1], [1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 0, 1], [0, 1, 1, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1]], 'solution': [[1, 0, 1, 1, 0, 1, 2], [1, 0, 1, 2, 1, 2, 3], [2, 1, 2, 1, 0, 1, 2], [1, 2, 2, 1, 0, 0, 1], [0, 1, 2, 2, 1, 1, 0], [1, 0, 1, 2, 2, 1, 0], [2, 1, 2, 3, 3, 2, 1]]} + +```` + ### caesar_cipher Generates Caesar cipher encryption/decryption tasks diff --git a/eval/.gitignore b/eval/.gitignore new file mode 100644 index 00000000..7db6c7ce --- /dev/null +++ b/eval/.gitignore @@ -0,0 +1,2 @@ +results/* +!results/summary* diff --git a/eval/eval.py b/eval/eval.py new file mode 100644 index 00000000..f8952e10 --- /dev/null +++ b/eval/eval.py @@ -0,0 +1,143 @@ +import argparse +import json +import os +from datetime import datetime +from typing import Any, Dict, List + +from openai import OpenAI + +from reasoning_gym.factory import DATASETS, create_dataset + + +class OpenRouterEvaluator: + def __init__(self, model: str): + self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY")) + self.model = model + self.extra_headers = {} + + def get_model_response(self, prompt: str) -> str: + """Get response from the model via OpenRouter API.""" + try: + completion = self.client.chat.completions.create( + extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}] + ) + return completion.choices[0].message.content + except Exception as e: + print(f"Error calling OpenRouter API: 
{str(e)}") + raise + + def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Evaluate model on multiple datasets with their respective configurations.""" + all_results = [] + + for dataset_config in dataset_configs: + dataset_name = dataset_config.pop("name") + print(f"\nEvaluating dataset: {dataset_name}") + + try: + # Create dataset with its specific configuration + data = create_dataset(dataset_name, **dataset_config) + results = [] + + for entry in data: + try: + response = self.get_model_response(entry["question"]) + score = data.score_answer(answer=response, entry=entry) + + result = { + "question": entry["question"], + "expected_answer": entry["answer"], + "model_answer": response, + "score": score, + "metadata": entry["metadata"], + } + results.append(result) + print(f"Processed question {len(results)}/{len(data)}. Score: {score}") + + except Exception as e: + print(f"Error processing question: {entry['question']}") + print(f"Error: {str(e)}") + + # Calculate aggregate metrics + total_score = sum(r["score"] for r in results) + metrics = { + "dataset_name": dataset_name, + "model": self.model, + "size": len(data), + "average_score": total_score / len(results) if results else 0, + "total_examples": len(results), + "timestamp": datetime.now().isoformat(), + "config": dataset_config, + } + + all_results.append({"metrics": metrics, "results": results}) + + except Exception as e: + print(f"Error evaluating dataset {dataset_name}: {str(e)}") + continue + + return all_results + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets") + parser.add_argument("--model", required=True, help="Model to evaluate") + parser.add_argument("--config", required=True, help="Path to JSON configuration file") + parser.add_argument("--output-dir", default="results", help="Output directory") + + args = parser.parse_args() + + # Create output directory if it doesn't exist + 
os.makedirs(args.output_dir, exist_ok=True) + + # Load dataset configurations + with open(args.config, "r") as f: + dataset_configs = json.load(f) + + evaluator = OpenRouterEvaluator(model=args.model) + all_results = evaluator.evaluate_datasets(dataset_configs) + + # Save results + output_file = os.path.join( + args.output_dir, f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + ) + + # Save detailed results + with open(output_file, "w") as f: + json.dump(all_results, f, indent=2) + + # Create summary + summary = [] + for result in all_results: + metrics = result["metrics"] + summary_entry = { + "dataset_name": metrics["dataset_name"], + "model": metrics["model"], + "average_score": metrics["average_score"], + "total_examples": metrics["total_examples"], + "timestamp": metrics["timestamp"], + "config": metrics["config"], + } + summary.append(summary_entry) + + # Save summary to a separate file + summary_file = os.path.join( + args.output_dir, f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + ) + + with open(summary_file, "w") as f: + json.dump(summary, f, indent=2) + + # Print summary + print("\nEvaluation Summary:") + for entry in summary: + print(f"\nDataset: {entry['dataset_name']}") + print(f"Average Score: {entry['average_score']:.2%}") + print(f"Total Examples: {entry['total_examples']}") + + print(f"\nDetailed results saved to: {output_file}") + print(f"Summary saved to: {summary_file}") + + +if __name__ == "__main__": + main() diff --git a/eval/eval.sh b/eval/eval.sh new file mode 100644 index 00000000..1d2a0beb --- /dev/null +++ b/eval/eval.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Check if OPENROUTER_API_KEY is set +if [ -z "$OPENROUTER_API_KEY" ]; then + echo "Error: OPENROUTER_API_KEY environment variable is not set" + echo "Please set it using: export OPENROUTER_API_KEY=your-api-key" + exit 1 +fi + +# Configuration +OUTPUT_DIR="results" + +# List of models to evaluate 
+MODELS=( + "google/gemini-2.0-flash-001" +) + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +# Run evaluations +for model in "${MODELS[@]}"; do + echo "Evaluating $model..." + python eval.py \ + --model "$model" \ + --config "eval_basic.json" \ + --output-dir "$OUTPUT_DIR" +done + +echo "All evaluations completed!" diff --git a/eval/eval_basic.json b/eval/eval_basic.json new file mode 100644 index 00000000..6240ce9d --- /dev/null +++ b/eval/eval_basic.json @@ -0,0 +1,21 @@ +[ + { + "name": "letter_counting", + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + }, + { + "name": "propositional_logic", + "size": 10, + "seed": 42 + }, + { + "name": "leg_counting", + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + ] diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json new file mode 100644 index 00000000..8c9d6a5c --- /dev/null +++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json @@ -0,0 +1,39 @@ +[ + { + "dataset_name": "letter_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.20600000000000002, + "total_examples": 10, + "timestamp": "2025-02-10T06:34:37.091554", + "config": { + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059, + "total_examples": 10, + "timestamp": "2025-02-10T06:35:11.432275", + "config": { + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.40199999999999997, + "total_examples": 10, + "timestamp": "2025-02-10T06:35:27.087469", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + } +] diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json 
b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json new file mode 100644 index 00000000..32086f46 --- /dev/null +++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json @@ -0,0 +1,39 @@ +[ + { + "dataset_name": "letter_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.20600000000000002, + "total_examples": 10, + "timestamp": "2025-02-09T22:47:25.934820", + "config": { + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059, + "total_examples": 10, + "timestamp": "2025-02-09T22:47:57.473560", + "config": { + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.40199999999999997, + "total_examples": 10, + "timestamp": "2025-02-09T22:48:13.546006", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + } +] diff --git a/reasoning_gym/algebra/polynomial_equations.py b/reasoning_gym/algebra/polynomial_equations.py index ed7e857f..a1822958 100644 --- a/reasoning_gym/algebra/polynomial_equations.py +++ b/reasoning_gym/algebra/polynomial_equations.py @@ -1,7 +1,8 @@ +import math import random import string from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Dict, List, Optional, Tuple from sympy import Eq, Symbol, expand, solve @@ -26,6 +27,9 @@ class PolynomialEquationsConfig: ) # Allowed operators between terms, Avoid adding '*' or '/' because they will affect the degree seed: Optional[int] = None size: int = 500 + # reward function hyperparameters + penalty_missing_factor = 0.1 + penalty_extra_factor = 0.05 def validate(self) -> None: """Validate configuration parameters.""" @@ -146,5 +150,101 @@ class PolynomialEquationsDataset(ProceduralDataset): return polynomial_expr + def _parse_score_to_list(self, answer: Optional[str]) -> 
List[float]: + """Parses a comma-separated string of scores into a sorted list of floats. + + This method takes a string containing comma-separated numeric values, + attempts to convert each value to a float, and returns a sorted list of these floats. + Any values that cannot be converted to a float are ignored. + Handles empty strings gracefully. + + Args: + answer: An optional string containing comma-separated numeric values. + Can be None or an empty string. + Returns: + A sorted list of floats parsed from the input string. + Returns an empty list if the input is None, empty, or contains no valid numeric values. + """ + + if answer is None or len(answer) == 0: # Handle None or empty input + return [] + + output_float_vals = [] + for output_val in answer.split(","): + try: + # Convert to float, strip whitespace + output_float_vals.append(float(output_val.strip())) + except ValueError: + # Ignore values that cannot be converted to float + continue + + return sorted(output_float_vals) # Return the sorted list of floats + + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + """ + Score an answer based on its numerical distance to oracle solutions using exponential decay. + This function compares a predicted answer (or list of answers) to a set of oracle solutions + (also a list of numbers). It calculates a reward based on how close the predicted solutions + are to the oracle solutions, using an exponential decay function. It also applies penalties + for missing or extra predicted solutions. The implementation is a greedy algorithm where we + find the closest matching oracle solution for a given predicted solution and only allow an + oracle solution to match once. + + Args: + answer: The predicted answer (or a string that can be parsed into a list of numbers). + May be None. + entry: A dictionary containing the oracle solution(s) under the key "answer" + (which can be a string that can be parsed into a list of numbers). 
+ + Returns: + A float representing the final score. The score is non-negative. + """ + oracle_solutions = self._parse_score_to_list(entry["answer"]) # Parse oracle solutions + predicted_solutions = self._parse_score_to_list(answer) # Parse predicted solutions + + total_reward = 0.0 + matched_solutions = 0 + extra_solutions = 0 + missing_solutions = 0 + + for predicted_solution in predicted_solutions: + + # find the closest matching solution from the oracle solutions. + # this is a greedy approach to computing the score + matched_distance = float("inf") + matched_distance_index = None + for oracle_solution_index, oracle_solution in enumerate(oracle_solutions): + if matched_distance > abs(predicted_solution - oracle_solution): + matched_distance = abs(predicted_solution - oracle_solution) + matched_distance_index = oracle_solution_index + + if matched_distance_index is not None: + matched_solutions += 1 + # Remove matched oracle solution + oracle_solutions.pop(matched_distance_index) + # Exponential decay reward + total_reward += math.exp(-matched_distance) + else: + # Extra predicted solution + extra_solutions += 1 + + # Count remaining oracle solutions as missing + for oracle_solution in oracle_solutions: + missing_solutions += 1 + + # Calculate penalty for either missing or extra solutions + penalty = missing_solutions * self.config.penalty_missing_factor + penalty += extra_solutions * self.config.penalty_extra_factor + + if matched_solutions > 0: + # normalize the rewards that we found matching solutions for + # so that the value is bounded between 0 and 1 + total_reward = total_reward / matched_solutions + + # Final reward is floored at 0 (never negative) + final_reward = max(0, total_reward - penalty) + + return final_reward + register_dataset("polynomial_equations", PolynomialEquationsDataset, PolynomialEquationsConfig) diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py index dac1b62c..ad76199f 100644 ---
a/reasoning_gym/algorithmic/__init__.py +++ b/reasoning_gym/algorithmic/__init__.py @@ -7,6 +7,7 @@ Algorithmic tasks for training reasoning capabilities: """ from .base_conversion import BaseConversionConfig, BaseConversionDataset +from .binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset @@ -63,4 +64,6 @@ __all__ = [ "RotateMatrixDataset", "ManipulateMatrixConfig", "ManipulateMatrixDataset", + "BinaryMatrixConfig", + "BinaryMatrixDataset", ] diff --git a/reasoning_gym/algorithmic/binary_matrix.py b/reasoning_gym/algorithmic/binary_matrix.py new file mode 100644 index 00000000..8ae122bb --- /dev/null +++ b/reasoning_gym/algorithmic/binary_matrix.py @@ -0,0 +1,125 @@ +"""Find the distance to the nearest 0 for each cell in a binary matrix. + +A popular Leetcode problem: +https://leetcode.com/problems/01-matrix/description/ +""" + +from collections import deque +from dataclasses import dataclass +from random import Random +from typing import Optional + +from ..factory import ProceduralDataset, register_dataset + +QUESTION_TEMPLATE = """Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell. 
+ +Example: + +Input: Find the distance to the nearest 0 for each cell in the matrix below: +0 0 0 +0 1 0 +1 1 1 + +Output: +0 0 0 +0 1 0 +1 2 1 + +Find the distance to the nearest 0 for each cell in the matrix below: +{matrix} +""" + + +@dataclass +class BinaryMatrixConfig: + """Configuration for Binary Matrix dataset generation""" + + max_n: int = 10 # Maximum size of the matrix + p_zero: float = 0.25 # Probability of a cell being 0 + + size: int = 500 # Virtual dataset size + seed: Optional[int] = None + + def validate(self): + """Validate configuration parameters""" + assert 1 <= self.max_n, "max_n must be at least 1" + assert 0 < self.p_zero <= 1, "p_zero must be between 0 and 1" + + +class BinaryMatrixDataset(ProceduralDataset): + """Generates Binary Matrix exercises with configurable difficulty""" + + def __init__(self, config: BinaryMatrixConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + def _get_binary_matrix(self, rng: Random) -> list[list[int]]: + """Generate a random binary matrix""" + n = rng.randint(1, self.config.max_n) + # Ensure at least one 0 in the matrix, so that a solution exists + numbers = [0] + [0 if rng.random() < self.config.p_zero else 1 for _ in range(n**2 - 1)] + rng.shuffle(numbers) + matrix = [numbers[i * n : (i + 1) * n] for i in range(n)] + return matrix + + def _get_distances(self, matrix: list[list[int]]) -> list[list[int]]: + """Get the distance to the nearest 0 for each cell in the matrix""" + n = len(matrix) + directions = [[1, 0], [-1, 0], [0, 1], [0, -1]] + visited = set() + queue = deque() + + output = [[float("inf")] * n for _ in range(n)] + + for r in range(n): + for c in range(n): + if matrix[r][c] == 0: + output[r][c] = 0 + visited.add((r, c)) + queue.append((r, c)) + + clock = 1 + while True: + temp = deque() + while queue: + r, c = queue.popleft() + for dr, dc in directions: + new_r, new_c = r + dr, c + dc + if ( + 0 <= new_r < n + and 0 <= new_c < n + and (new_r, new_c) not in visited 
+ and matrix[new_r][new_c] == 1 + ): + output[new_r][new_c] = clock + visited.add((new_r, new_c)) + temp.append((new_r, new_c)) + if temp: + queue = temp + else: + break + clock += 1 + + return output + + def _matrix_to_str(self, matrix: list[list[int]]) -> str: + """Get a string representation of the matrix""" + return "\n".join(" ".join(str(x) for x in row) for row in matrix) + + def __getitem__(self, idx: int) -> dict: + """Generate a single Binary Matrix question""" + rng = Random(self.seed + idx) + + matrix = self._get_binary_matrix(rng) + matrix_str = self._matrix_to_str(matrix) + + answer = self._get_distances(matrix) + answer_str = self._matrix_to_str(answer) + + return { + "question": QUESTION_TEMPLATE.format(matrix=matrix_str), + "answer": answer_str, + "metadata": {"matrix": matrix, "solution": answer}, + } + + +register_dataset("binary_matrix", BinaryMatrixDataset, BinaryMatrixConfig) diff --git a/reasoning_gym/algorithmic/rotate_matrix.py b/reasoning_gym/algorithmic/rotate_matrix.py index ac50a281..4fdf651e 100644 --- a/reasoning_gym/algorithmic/rotate_matrix.py +++ b/reasoning_gym/algorithmic/rotate_matrix.py @@ -83,7 +83,7 @@ class RotateMatrixDataset(ProceduralDataset): return "\n".join(" ".join(str(x) for x in row) for row in matrix) def __getitem__(self, idx: int) -> dict: - """Generate a single Spiral Matrix question""" + """Generate a single Rotate Matrix question""" rng = Random(self.seed + idx) matrix = self._get_matrix(rng) diff --git a/reasoning_gym/algorithmic/word_ladder.py b/reasoning_gym/algorithmic/word_ladder.py index a0b000c2..64c65326 100644 --- a/reasoning_gym/algorithmic/word_ladder.py +++ b/reasoning_gym/algorithmic/word_ladder.py @@ -5,8 +5,7 @@ from dataclasses import dataclass from random import Random from typing import Dict, List, Optional, Set, Tuple -from reasoning_gym.data import read_data_file - +from ..data import get_data_file_path from ..factory import ProceduralDataset, register_dataset @@ -64,6 +63,7 @@ class 
WordLadderDataset(ProceduralDataset): self.config = config self.word_sets = {} self.word_graphs = {} + self._vocabulary = None # A large list of dictionary words to validate words against # Load words from CSV self.word_sets = self._load_words_from_csv( @@ -84,28 +84,24 @@ class WordLadderDataset(ProceduralDataset): assert 3 <= min_length <= max_length <= 5, "Word length must be between 3 and 5 inclusive" import csv - from io import StringIO word_sets = {} try: # Get CSV content as string - csv_content = read_data_file("words.csv") + with get_data_file_path("words.csv").open("r", encoding="utf-8") as csv_file: + reader = csv.DictReader(csv_file) - # Use StringIO to create a file-like object from the string - csv_file = StringIO(csv_content) - reader = csv.DictReader(csv_file) + for row in reader: + # Process each word length column using config range + for length in range(min_length, max_length + 1): + col_name = f"{length}_letter" + word = row.get(col_name, "") - for row in reader: - # Process each word length column using config range - for length in range(min_length, max_length + 1): - col_name = f"{length}_letter" - word = row.get(col_name, "") + if not word: # Skip empty entries + continue - if not word: # Skip empty entries - continue - - word_sets.setdefault(length, set()).add(word.upper()) + word_sets.setdefault(length, set()).add(word.upper()) except Exception as e: raise RuntimeError(f"Error processing words.csv content: {e}") from e @@ -220,5 +216,43 @@ class WordLadderDataset(ProceduralDataset): "metadata": {"start_word": start, "end_word": end, "word_length": length, "chain_length": len(path)}, } + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + if answer is None: + return 0 + + answer_words = tuple(s.strip() for s in answer.upper().split(",")) + + metadata = entry["metadata"] + start_word = metadata["start_word"] + end_word = metadata["end_word"] + word_length = len(end_word) + known_words = self.word_sets[word_length] 
+ + # Check conditions: + # 1. start and end word match question + # 2. all words have the correct length + # 3. every changed word is a single letter change from the previous word + # 4. all words are in our vocabulary + + if len(answer_words) < 2: + return 0 + + if answer_words[0] != start_word or answer_words[-1] != end_word: + return 0.01 + + if not all(len(w) == word_length for w in answer_words): + return 0.01 + + for i in range(1, len(answer_words)): + if sum(1 for a, b in zip(answer_words[i - 1], answer_words[i]) if a != b) != 1: + return 0.01 + + reward = 1.0 + for word in answer_words: + if word not in known_words: + reward *= 0.5 + + return reward + register_dataset("word_ladder", WordLadderDataset, WordLadderConfig) diff --git a/requirements-dev.txt b/requirements-dev.txt index b96fc1c2..18cbc82d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ isort>=5.13.2 flake8>=7.1.1 mypy>=1.14.1 pre-commit>=4.1.0 +openai>=1.61.1 diff --git a/tests/test_binary_matrix.py b/tests/test_binary_matrix.py new file mode 100644 index 00000000..60c700d7 --- /dev/null +++ b/tests/test_binary_matrix.py @@ -0,0 +1,100 @@ +"""Tests for Binary Matrix questions generation""" + +import pytest + +from reasoning_gym.algorithmic.binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset + + +def test_binary_matrix_config_validation(): + """Test that invalid configs raise appropriate errors""" + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(max_n=-1) # Negative not allowed + config.validate() + + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(max_n=0) # Zero not allowed + config.validate() + + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(p_zero=0) # <= 0 not allowed + config.validate() + + with pytest.raises(AssertionError): + config = BinaryMatrixConfig(p_zero=1.01) # > 1 not allowed + config.validate() + + +def test_binary_matrix_dataset_deterministic(): + """Test that dataset generates same
items with same seed""" + config = BinaryMatrixConfig(seed=42, size=10) + dataset1 = BinaryMatrixDataset(config) + dataset2 = BinaryMatrixDataset(config) + + for i in range(len(dataset1)): + assert dataset1[i] == dataset2[i] + + +def test_binary_matrix_dataset_items(): + """Test basic properties of generated items""" + config = BinaryMatrixConfig(max_n=5, size=10, seed=42) + dataset = BinaryMatrixDataset(config) + + for i in range(len(dataset)): + item = dataset[i] + # Check item structure + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Check metadata + assert "matrix" in item["metadata"] + assert "solution" in item["metadata"] + + matrix = item["metadata"]["matrix"] + solution = item["metadata"]["solution"] + + # Verify list dimensions + assert len(matrix) <= config.max_n + assert all(len(row) <= config.max_n for row in matrix) + assert all(len(row) <= config.max_n for row in solution) + + # Verify matrix values + for r in range(len(matrix)): + for c in range(len(matrix[r])): + assert matrix[r][c] in {0, 1} + assert solution[r][c] >= matrix[r][c] + + +def test_binary_matrix_dataset_iteration(): + """Test that iteration respects dataset size""" + config = BinaryMatrixConfig(size=5, seed=42) + dataset = BinaryMatrixDataset(config) + + items = list(dataset) + assert len(items) == config.size + + # Test multiple iterations yield same items + assert items == list(dataset) + + +def test_binary_matrix_answer(): + """Test the _get_distances method""" + config = BinaryMatrixConfig(seed=42) + dataset = BinaryMatrixDataset(config) + + # 1x1 matrix + matrix = [[0]] + assert dataset._get_distances(matrix) == [[0]] + + # 2x2 matrix + matrix = [[0, 1], [1, 1]] + assert dataset._get_distances(matrix) == [[0, 1], [1, 2]] + + # 3x3 matrix + matrix = [[0, 0, 0], [0, 1, 0], [1, 1, 1]] + assert dataset._get_distances(matrix) == [[0, 0, 0], [0, 1, 0], [1, 2, 1]] + + # Empty matrix + matrix = [[0, 0, 0], [0, 0, 0], 
[0, 0, 0]] + assert dataset._get_distances(matrix) == [[0, 0, 0], [0, 0, 0], [0, 0, 0]] diff --git a/tests/test_polynomial_equations.py b/tests/test_polynomial_equations.py index 6e1bb0c0..e7caf654 100644 --- a/tests/test_polynomial_equations.py +++ b/tests/test_polynomial_equations.py @@ -1,4 +1,5 @@ import pytest +from pytest import approx from sympy import Symbol, sympify from reasoning_gym import create_dataset @@ -115,3 +116,25 @@ def test_polynomial_solutions_evaluation(): f"Solution {solution} does not satisfy the polynomial {poly_str}. " f"Evaluated value: {evaluated_value}" ) + + +@pytest.mark.parametrize( + "oracle_answer, predicted_answer, expected_reward", + [ + ("4,-4.12", "4,-4.12", 1.0), # Exact match + ("4,-4.12", "4.0001,-4.120001", approx(0.9999, rel=1e-3)), # Very close match + ("4,-4.12", "4.1,-4.2", approx(0.9139, rel=1e-3)), + ("4,8", "4", approx(0.9, rel=1e-3)), # Missing an oracle solution -> missing solution penalty applies + ("4", "4,8", approx(0.95, rel=1e-3)), # extra solution -> extra solution penalty + ("-1,-2", "1,4", approx(0.06890, rel=1e-3)), # -1 matched w/ 1 and -2 matched w/ 4 + ("", "1", approx(0, rel=1e-4)), # oracle no solution, predicted extra solution + ("1", "", approx(0, rel=1e-4)), # oracle has a solution, predicted no solution + ], +) +def test_polynomial_solutions_score_answer(oracle_answer, predicted_answer, expected_reward): + # You might want to parameterize cfg as well + cfg = PolynomialEquationsConfig(seed=999, size=3) + ds = PolynomialEquationsDataset(cfg) + + actual_reward = ds.score_answer(predicted_answer, {"answer": oracle_answer}) + assert actual_reward == pytest.approx(expected_reward, rel=1e-3) # Fuzzy comparison for floats diff --git a/tests/test_word_ladder.py b/tests/test_word_ladder.py index d42108ea..1aba4cf3 100644 --- a/tests/test_word_ladder.py +++ b/tests/test_word_ladder.py @@ -355,5 +355,45 @@ def test_word_ladder_edge_cases(): assert max_length > 3, "No challenging word pairs generated" -if 
__name__ == "__main__": - pytest.main([__file__]) +def test_word_ladder_score_answer(): + """Test the score_answer method""" + config = WordLadderConfig(min_word_length=4, max_word_length=4) + dataset = WordLadderDataset(config) + + # Create a test entry + entry = { + "question": "Transform the word ladder 'COLD' to 'WARM' by changing one letter at a time.", + "answer": "COLD,CORD,CARD,WARD,WARM", + "metadata": {"start_word": "COLD", "end_word": "WARM", "word_length": 4, "chain_length": 5}, + } + + # Test perfect answer + assert dataset.score_answer("COLD,CORD,CARD,WARD,WARM", entry) == 1.0 + + # Test None answer + assert dataset.score_answer(None, entry) == 0.0 + + # Test empty answer + assert dataset.score_answer("", entry) == 0.0 + + # Test single word answer + assert dataset.score_answer("COLD", entry) == 0.0 + + # Test wrong start word + assert dataset.score_answer("BOLD,CORD,CARD,WARD,WARM", entry) == 0.01 + + # Test wrong end word + assert dataset.score_answer("COLD,CORD,CARD,WARD,WARP", entry) == 0.01 + + # Test wrong word length + assert dataset.score_answer("COLD,CORDS,CARDS,WARD,WARM", entry) == 0.01 + + # Test invalid transitions (more than one letter change) + assert dataset.score_answer("COLD,WARD,WARM", entry) == 0.01 + + # Test case insensitivity + assert dataset.score_answer("cold,cord,card,ward,warm", entry) == 1.0 + + # Test with unknown words (should return partial credit) + assert dataset.score_answer("COLD,COXD,CARD,WARD,WARM", entry) < 1.0 + assert dataset.score_answer("COLD,COXD,CARD,WARD,WARM", entry) > 0.0