From f7836e17d0922379b0265277c7ff09c5e4521acb Mon Sep 17 00:00:00 2001
From: Zafir Stojanovski <zaf.stojano@gmail.com>
Date: Sun, 9 Feb 2025 01:10:57 +0100
Subject: [PATCH 01/11] binary matrix

---
 reasoning_gym/algorithmic/__init__.py      |   3 +
 reasoning_gym/algorithmic/binary_matrix.py | 125 +++++++++++++++++++++
 tests/test_binary_matrix.py                | 100 +++++++++++++++++
 3 files changed, 228 insertions(+)
 create mode 100644 reasoning_gym/algorithmic/binary_matrix.py
 create mode 100644 tests/test_binary_matrix.py

diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py
index 9acc5007..decfd4d2 100644
--- a/reasoning_gym/algorithmic/__init__.py
+++ b/reasoning_gym/algorithmic/__init__.py
@@ -7,6 +7,7 @@ Algorithmic tasks for training reasoning capabilities:
 """
 
 from .base_conversion import BaseConversionConfig, BaseConversionDataset
+from .binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset
 from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset
 from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset
 from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset
@@ -60,4 +61,6 @@ __all__ = [
     "IsomorphicStringsDataset",
     "RotateMatrixConfig",
     "RotateMatrixDataset",
+    "BinaryMatrixConfig",
+    "BinaryMatrixDataset",
 ]
diff --git a/reasoning_gym/algorithmic/binary_matrix.py b/reasoning_gym/algorithmic/binary_matrix.py
new file mode 100644
index 00000000..c34d0566
--- /dev/null
+++ b/reasoning_gym/algorithmic/binary_matrix.py
@@ -0,0 +1,125 @@
+"""Find the distance to the nearest 0 for each cell in a binary matrix.
+
+A popular Leetcode problem:
+https://leetcode.com/problems/01-matrix/description/
+"""
+
+from collections import deque
+from copy import deepcopy
+from dataclasses import dataclass
+from random import Random
+from typing import Optional
+
+from ..factory import ProceduralDataset, register_dataset
+
+QUESTION_TEMPLATE = """Given a square matrix, your job is to find the distance of the nearest 0 for each cell.
+
+Example:
+
+Input: Rotate the matrix below by 90 degrees clockwise:
+0 0 0
+0 1 0
+1 1 1
+
+Output:
+0 0 0
+0 1 0
+1 2 1
+
+Find the distance to the nearest 0 for each cell in the matrix below:
+{matrix}
+"""
+
+
+@dataclass
+class BinaryMatrixConfig:
+    """Configuration for Binary Matrix dataset generation"""
+
+    max_n: int = 10  # Maximum size of the matrix
+    p_zero: float = 0.7  # Probability of a cell being 0
+
+    size: int = 500  # Virtual dataset size
+    seed: Optional[int] = None
+
+    def validate(self):
+        """Validate configuration parameters"""
+        assert 1 <= self.max_n, "max_n must be at least 1"
+        assert 0 < self.p_zero <= 1, "p_zero must be between 0 and 1"
+
+
+class BinaryMatrixDataset(ProceduralDataset):
+    """Generates Binary Matrix exercises with configurable difficulty"""
+
+    def __init__(self, config: BinaryMatrixConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+    def _get_binary_matrix(self, rng: Random) -> list[list[int]]:
+        """Generate a random binary matrix"""
+        n = rng.randint(1, self.config.max_n)
+        # Ensure at least one 0 in the matrix, so that a solution exists
+        numbers = [0] + [0 if rng.random() < self.config.p_zero else 1 for _ in range(n**2 - 1)]
+        matrix = [numbers[i * n : (i + 1) * n] for i in range(n)]
+        return matrix
+
+    def _get_distances(self, matrix: list[list[int]]) -> list[list[int]]:
+        """Get the distance to the nearest 0 for each cell in the matrix"""
+        n = len(matrix)
+        directions = [[1, 0], [-1, 0], [0, 1], [0, -1]]
+        visited = set()
+        queue = deque()
+
+        output = [[float("inf")] * n for _ in range(n)]
+
+        for r in range(n):
+            for c in range(n):
+                if matrix[r][c] == 0:
+                    output[r][c] = 0
+                    visited.add((r, c))
+                    queue.append((r, c))
+
+        clock = 1
+        while True:
+            temp = deque()
+            while queue:
+                r, c = queue.popleft()
+                for dr, dc in directions:
+                    new_r, new_c = r + dr, c + dc
+                    if (
+                        0 <= new_r < n
+                        and 0 <= new_c < n
+                        and (new_r, new_c) not in visited
+                        and matrix[new_r][new_c] == 1
+                    ):
+                        output[new_r][new_c] = clock
+                        visited.add((new_r, new_c))
+                        temp.append((new_r, new_c))
+            if temp:
+                queue = temp
+            else:
+                break
+            clock += 1
+
+        return output
+
+    def _matrix_to_str(self, matrix: list[list[int]]) -> str:
+        """Get a string representation of the matrix"""
+        return "\n".join(" ".join(str(x) for x in row) for row in matrix)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Generate a single Binary Matrix question"""
+        rng = Random(self.seed + idx)
+
+        matrix = self._get_binary_matrix(rng)
+        matrix_str = self._matrix_to_str(matrix)
+
+        answer = self._get_distances(matrix)
+        answer_str = self._matrix_to_str(answer)
+
+        return {
+            "question": QUESTION_TEMPLATE.format(matrix=matrix_str),
+            "answer": answer_str,
+            "metadata": {"matrix": matrix, "solution": answer},
+        }
+
+
+register_dataset("binary_matrix", BinaryMatrixDataset, BinaryMatrixConfig)
diff --git a/tests/test_binary_matrix.py b/tests/test_binary_matrix.py
new file mode 100644
index 00000000..60c700d7
--- /dev/null
+++ b/tests/test_binary_matrix.py
@@ -0,0 +1,100 @@
+"""Tests for Binary Matrix questions generation"""
+
+import pytest
+
+from reasoning_gym.algorithmic.binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset
+
+
+def test_binary_matrix_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = BinaryMatrixConfig(max_n=-1)  # Negative not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = BinaryMatrixConfig(max_n=0)  # Zero not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = BinaryMatrixConfig(p_zero=0)  # <= 0 not allowed
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = BinaryMatrixConfig(p_zero=1.01)  # > 1 not allowed
+        config.validate()
+
+
+def test_binary_matrix_dataset_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = BinaryMatrixConfig(seed=42, size=10)
+    dataset1 = BinaryMatrixDataset(config)
+    dataset2 = BinaryMatrixDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_binary_matrix_dataset_items():
+    """Test basic properties of generated items"""
+    config = BinaryMatrixConfig(max_n=5, size=10, seed=42)
+    dataset = BinaryMatrixDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        # Check item structure
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # Check metadata
+        assert "matrix" in item["metadata"]
+        assert "solution" in item["metadata"]
+
+        matrix = item["metadata"]["matrix"]
+        solution = item["metadata"]["solution"]
+
+        # Verify list dimensions
+        assert len(matrix) <= config.max_n
+        assert all(len(row) <= config.max_n for row in matrix)
+        assert all(len(row) <= config.max_n for row in solution)
+
+        # Verify matrix values
+        for r in range(len(matrix)):
+            for c in range(len(matrix[r])):
+                assert matrix[r][c] in {0, 1}
+                assert solution[r][c] >= matrix[r][c]
+
+
+def test_binary_matrix_dataset_iteration():
+    """Test that iteration respects dataset size"""
+    config = BinaryMatrixConfig(size=5, seed=42)
+    dataset = BinaryMatrixDataset(config)
+
+    items = list(dataset)
+    assert len(items) == config.size
+
+    # Test multiple iterations yield same items
+    assert items == list(dataset)
+
+
+def test_binary_matrix_answer():
+    """Test the _get_distances method"""
+    config = BinaryMatrixConfig(seed=42)
+    dataset = BinaryMatrixDataset(config)
+
+    # 1x1 matrix
+    matrix = [[0]]
+    assert dataset._get_distances(matrix) == [[0]]
+
+    # 2x2 matrix
+    matrix = [[0, 1], [1, 1]]
+    assert dataset._get_distances(matrix) == [[0, 1], [1, 2]]
+
+    # 3x3 matrix
+    matrix = [[0, 0, 0], [0, 1, 0], [1, 1, 1]]
+    assert dataset._get_distances(matrix) == [[0, 0, 0], [0, 1, 0], [1, 2, 1]]
+
+    # All-zero matrix (every cell is already at distance 0)
+    matrix = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
+    assert dataset._get_distances(matrix) == [[0, 0, 0], [0, 0, 0], [0, 0, 0]]

From 89fd56f8e9eb5cd72f3a59455e0026bd37ac2a92 Mon Sep 17 00:00:00 2001
From: Zafir Stojanovski <zaf.stojano@gmail.com>
Date: Sun, 9 Feb 2025 01:11:06 +0100
Subject: [PATCH 02/11] RotateMatrix typo

---
 reasoning_gym/algorithmic/rotate_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reasoning_gym/algorithmic/rotate_matrix.py b/reasoning_gym/algorithmic/rotate_matrix.py
index ac50a281..4fdf651e 100644
--- a/reasoning_gym/algorithmic/rotate_matrix.py
+++ b/reasoning_gym/algorithmic/rotate_matrix.py
@@ -83,7 +83,7 @@ class RotateMatrixDataset(ProceduralDataset):
         return "\n".join(" ".join(str(x) for x in row) for row in matrix)
 
     def __getitem__(self, idx: int) -> dict:
-        """Generate a single Spiral Matrix question"""
+        """Generate a single Rotate Matrix question"""
         rng = Random(self.seed + idx)
 
         matrix = self._get_matrix(rng)

From 0dd4c0589771f7614dcd08ba3e0dca90b93de0b3 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 8 Feb 2025 21:36:21 -0800
Subject: [PATCH 03/11] [poly-reward] add a greedy strategy scoring function
 for polynomial equations

---
 reasoning_gym/algebra/polynomial_equations.py | 113 +++++++++++++++++-
 tests/test_polynomial_equations.py            |  25 ++++
 2 files changed, 135 insertions(+), 3 deletions(-)

diff --git a/reasoning_gym/algebra/polynomial_equations.py b/reasoning_gym/algebra/polynomial_equations.py
index ed7e857f..2a20fdf4 100644
--- a/reasoning_gym/algebra/polynomial_equations.py
+++ b/reasoning_gym/algebra/polynomial_equations.py
@@ -1,7 +1,8 @@
 import random
 import string
+import math
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional, Tuple, List, Dict
 
 from sympy import Eq, Symbol, expand, solve
 
@@ -26,6 +27,9 @@ class PolynomialEquationsConfig:
     )  # Allowed operators between terms, Avoid adding '*' or '/' because they will affect the degree
     seed: Optional[int] = None
     size: int = 500
+    # reward function hyperparameters (type-annotated so @dataclass registers them as configurable fields)
+    penalty_missing_factor: float = 0.1  # subtracted once per oracle solution left unmatched
+    penalty_extra_factor: float = 0.05  # subtracted once per predicted solution with no oracle left to match
 
     def validate(self) -> None:
         """Validate configuration parameters."""
@@ -40,7 +44,8 @@ class PolynomialEquationsConfig:
 
         allowed_ops = {"+", "-"}
         assert len(self.operators) > 0, "operators tuple cannot be empty."
-        assert all(op in allowed_ops for op in self.operators), "Invalid operator found. Must be a subset of {+, -}."
+        assert all(
+            op in allowed_ops for op in self.operators), "Invalid operator found. Must be a subset of {+, -}."
 
 
 class PolynomialEquationsDataset(ProceduralDataset):
@@ -146,5 +151,107 @@ class PolynomialEquationsDataset(ProceduralDataset):
 
         return polynomial_expr
 
+    def _parse_score_to_list(self, answer: Optional[str]) -> List[float]:
+        """Parses a comma-separated string of scores into a sorted list of floats.
 
-register_dataset("polynomial_equations", PolynomialEquationsDataset, PolynomialEquationsConfig)
+        This method takes a string containing comma-separated numeric values, attempts to convert each value to a float,
+        and returns a sorted list of these floats.  Any values that cannot be converted to a float are ignored.
+        Handles empty strings gracefully.
+
+        Args:
+            answer: An optional string containing comma-separated numeric values.  Can be None or an empty string.
+
+        Returns:
+            A sorted list of floats parsed from the input string. Returns an empty list if the input is None, empty, or contains no valid numeric values.
+        """
+
+        if answer is None or len(answer) == 0:  # Handle None or empty input
+            return []
+
+        output_float_vals = []
+        for output_val in answer.split(","):
+            try:
+                # Convert to float, strip whitespace
+                output_float_vals.append(float(output_val.strip()))
+            except ValueError:
+                # Ignore values that cannot be converted to float
+                continue  # Continue to the next value in the string
+
+        return sorted(output_float_vals)  # Return the sorted list of floats
+
+    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+        """
+        Score an answer based on its numerical distance to oracle solutions using exponential decay.
+        This function compares a predicted answer (or list of answers) to a set of oracle solutions 
+        (also a list of numbers). It calculates a reward based on how close the predicted solutions
+        are to the oracle solutions, using an exponential decay function.  It also applies penalties
+        for missing or extra predicted solutions.
+
+        Args:
+            answer: The predicted answer (or a string that can be parsed into a list of numbers).  
+                    May be None.
+            entry: A dictionary containing the oracle solution(s) under the key "answer"
+                (which can be a string that can be parsed into a list of numbers).
+
+        Returns:
+            A float representing the final score. The score is non-negative.
+
+        Raises:
+        TypeError: If the 'answer' in entry is not a string or list.
+        ValueError: If the 'answer' in entry cannot be parsed to a number.
+        TypeError: If the answer is not a string or list.
+        ValueError: If the answer cannot be parsed to a number.
+        """
+        oracle_solutions = self._parse_score_to_list(
+            entry["answer"])  # Parse oracle solutions
+        predicted_solutions = self._parse_score_to_list(
+            answer)  # Parse predicted solutions
+
+        total_reward = 0.0
+        matched_solutions = 0
+        extra_solutions = 0
+        missing_solutions = 0
+
+        for predicted_solution in predicted_solutions:
+
+            # find the closest matching solution from the oracle solutions.
+            # this is a greedy approach to computing the score
+            matched_distance = float('inf')
+            matched_distance_index = None
+            for oracle_solution_index, oracle_solution in enumerate(oracle_solutions):
+                if matched_distance > abs(predicted_solution - oracle_solution):
+                    matched_distance = abs(
+                        predicted_solution - oracle_solution)
+                    matched_distance_index = oracle_solution_index
+
+            if matched_distance_index is not None:
+                matched_solutions += 1
+                # Remove matched oracle solution
+                oracle_solutions.pop(matched_distance_index)
+                # Exponential decay reward
+                total_reward += math.exp(-matched_distance)
+            else:
+                # Extra predicted solution
+                extra_solutions += 1
+
+        # Count remaining oracle solutions as missing
+        for oracle_solution in oracle_solutions:
+            missing_solutions += 1
+
+         # Calculate penalty for either missing or extra solutions
+        penalty = missing_solutions * self.config.penalty_missing_factor
+        penalty += extra_solutions * self.config.penalty_extra_factor
+
+        if matched_solutions > 0:
+            # normalize the rewards that we found matching solutions for
+            # so that the value is bounded between 0 and 1
+            total_reward = total_reward / matched_solutions
+
+        # Final reward is clamped so it never drops below 0
+        final_reward = max(0, total_reward - penalty)
+
+        return final_reward
+
+
+register_dataset("polynomial_equations",
+                 PolynomialEquationsDataset, PolynomialEquationsConfig)
diff --git a/tests/test_polynomial_equations.py b/tests/test_polynomial_equations.py
index 6e1bb0c0..556932a7 100644
--- a/tests/test_polynomial_equations.py
+++ b/tests/test_polynomial_equations.py
@@ -1,4 +1,5 @@
 import pytest
+from pytest import approx
 from sympy import Symbol, sympify
 
 from reasoning_gym import create_dataset
@@ -115,3 +116,27 @@ def test_polynomial_solutions_evaluation():
                 f"Solution {solution} does not satisfy the polynomial {poly_str}. "
                 f"Evaluated value: {evaluated_value}"
             )
+
+
+@pytest.mark.parametrize(
+    "oracle_answer, predicted_answer, expected_reward",
+    [
+        ("4,-4.12", "4,-4.12", 1.0),  # Exact match
+        ("4,-4.12", "4.0001,-4.120001", approx(0.9999, rel=1e-3)),  # Very close match
+        ("4,-4.12", "4.1,-4.2", approx(0.9139, rel=1e-3)),
+        ("4,8", "4", approx(0.9, rel=1e-3)), # Missing an oracle solution -> missing solution penalty applies
+        ("4", "4,8", approx(0.95, rel=1e-3)), # extra solution -> extra solution penalty
+        ("-1,-2", "1,4", approx(0.06890, rel=1e-3)), # -1 matched w/ 1 and -2 matched w/ 4
+        ("", "1", approx(0, rel=1e-4)), # oracle no solution, predicted extra solution
+        ("1", "", approx(0, rel=1e-4)), # oracle has a solution, predicted no solution
+    ],
+)
+def test_polynomial_solutions_score_answer(oracle_answer, predicted_answer, expected_reward):
+    # You might want to parameterize cfg as well
+    cfg = PolynomialEquationsConfig(seed=999, size=3)
+    ds = PolynomialEquationsDataset(cfg)
+
+    actual_reward = ds.score_answer(
+        predicted_answer, {"answer": oracle_answer})
+    assert actual_reward == pytest.approx(
+        expected_reward, rel=1e-3)  # Fuzzy comparison for floats

From 7a6f7ea9da746fccbb88992b3d9cae647d4688c5 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 8 Feb 2025 21:41:18 -0800
Subject: [PATCH 04/11] [poly-reward] minor updates to the docstrings

---
 reasoning_gym/algebra/polynomial_equations.py | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/reasoning_gym/algebra/polynomial_equations.py b/reasoning_gym/algebra/polynomial_equations.py
index 2a20fdf4..96498148 100644
--- a/reasoning_gym/algebra/polynomial_equations.py
+++ b/reasoning_gym/algebra/polynomial_equations.py
@@ -154,15 +154,17 @@ class PolynomialEquationsDataset(ProceduralDataset):
     def _parse_score_to_list(self, answer: Optional[str]) -> List[float]:
         """Parses a comma-separated string of scores into a sorted list of floats.
 
-        This method takes a string containing comma-separated numeric values, attempts to convert each value to a float,
-        and returns a sorted list of these floats.  Any values that cannot be converted to a float are ignored.
+        This method takes a string containing comma-separated numeric values, 
+        attempts to convert each value to a float, and returns a sorted list of these floats.  
+        Any values that cannot be converted to a float are ignored.
         Handles empty strings gracefully.
 
         Args:
-            answer: An optional string containing comma-separated numeric values.  Can be None or an empty string.
-
+            answer: An optional string containing comma-separated numeric values.  
+            Can be None or an empty string.
         Returns:
-            A sorted list of floats parsed from the input string. Returns an empty list if the input is None, empty, or contains no valid numeric values.
+            A sorted list of floats parsed from the input string. 
+            Returns an empty list if the input is None, empty, or contains no valid numeric values.
         """
 
         if answer is None or len(answer) == 0:  # Handle None or empty input
@@ -175,7 +177,7 @@ class PolynomialEquationsDataset(ProceduralDataset):
                 output_float_vals.append(float(output_val.strip()))
             except ValueError:
                 # Ignore values that cannot be converted to float
-                continue  # Continue to the next value in the string
+                continue
 
         return sorted(output_float_vals)  # Return the sorted list of floats
 
@@ -185,7 +187,9 @@ class PolynomialEquationsDataset(ProceduralDataset):
         This function compares a predicted answer (or list of answers) to a set of oracle solutions 
         (also a list of numbers). It calculates a reward based on how close the predicted solutions
         are to the oracle solutions, using an exponential decay function.  It also applies penalties
-        for missing or extra predicted solutions.
+        for missing or extra predicted solutions. The implementation is a greedy algorithm where we
+        find the closest matching oracle solution for a given predicted solution and only allow an
+        oracle solution to match once.
 
         Args:
             answer: The predicted answer (or a string that can be parsed into a list of numbers).  
@@ -195,12 +199,6 @@ class PolynomialEquationsDataset(ProceduralDataset):
 
         Returns:
             A float representing the final score. The score is non-negative.
-
-        Raises:
-        TypeError: If the 'answer' in entry is not a string or list.
-        ValueError: If the 'answer' in entry cannot be parsed to a number.
-        TypeError: If the answer is not a string or list.
-        ValueError: If the answer cannot be parsed to a number.
         """
         oracle_solutions = self._parse_score_to_list(
             entry["answer"])  # Parse oracle solutions

From e5862371ed9283c296a6a26775a47e6392611166 Mon Sep 17 00:00:00 2001
From: Zafir Stojanovski <zaf.stojano@gmail.com>
Date: Sun, 9 Feb 2025 13:00:46 +0100
Subject: [PATCH 05/11] update instruction and shuffle numbers

---
 reasoning_gym/algorithmic/binary_matrix.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/reasoning_gym/algorithmic/binary_matrix.py b/reasoning_gym/algorithmic/binary_matrix.py
index c34d0566..0fbe96b3 100644
--- a/reasoning_gym/algorithmic/binary_matrix.py
+++ b/reasoning_gym/algorithmic/binary_matrix.py
@@ -16,7 +16,7 @@ QUESTION_TEMPLATE = """Given a square matrix, your job is to find the distance o
 
 Example:
 
-Input: Rotate the matrix below by 90 degrees clockwise:
+Input: Find the distance to the nearest 0 for each cell in the matrix below:
 0 0 0
 0 1 0
 1 1 1
@@ -58,6 +58,7 @@ class BinaryMatrixDataset(ProceduralDataset):
         n = rng.randint(1, self.config.max_n)
         # Ensure at least one 0 in the matrix, so that a solution exists
         numbers = [0] + [0 if rng.random() < self.config.p_zero else 1 for _ in range(n**2 - 1)]
+        rng.shuffle(numbers)
         matrix = [numbers[i * n : (i + 1) * n] for i in range(n)]
         return matrix
 

From 04b332384448b5abf7823bda5b409016caa16830 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sun, 9 Feb 2025 07:30:18 -0800
Subject: [PATCH 06/11] [poly-reward] run pre-commit hooks

---
 reasoning_gym/algebra/polynomial_equations.py | 35 ++++++++-----------
 tests/test_polynomial_equations.py            | 16 ++++-----
 2 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/reasoning_gym/algebra/polynomial_equations.py b/reasoning_gym/algebra/polynomial_equations.py
index 96498148..a1822958 100644
--- a/reasoning_gym/algebra/polynomial_equations.py
+++ b/reasoning_gym/algebra/polynomial_equations.py
@@ -1,8 +1,8 @@
+import math
 import random
 import string
-import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, List, Dict
+from typing import Dict, List, Optional, Tuple
 
 from sympy import Eq, Symbol, expand, solve
 
@@ -44,8 +44,7 @@ class PolynomialEquationsConfig:
 
         allowed_ops = {"+", "-"}
         assert len(self.operators) > 0, "operators tuple cannot be empty."
-        assert all(
-            op in allowed_ops for op in self.operators), "Invalid operator found. Must be a subset of {+, -}."
+        assert all(op in allowed_ops for op in self.operators), "Invalid operator found. Must be a subset of {+, -}."
 
 
 class PolynomialEquationsDataset(ProceduralDataset):
@@ -154,16 +153,16 @@ class PolynomialEquationsDataset(ProceduralDataset):
     def _parse_score_to_list(self, answer: Optional[str]) -> List[float]:
         """Parses a comma-separated string of scores into a sorted list of floats.
 
-        This method takes a string containing comma-separated numeric values, 
-        attempts to convert each value to a float, and returns a sorted list of these floats.  
+        This method takes a string containing comma-separated numeric values,
+        attempts to convert each value to a float, and returns a sorted list of these floats.
         Any values that cannot be converted to a float are ignored.
         Handles empty strings gracefully.
 
         Args:
-            answer: An optional string containing comma-separated numeric values.  
+            answer: An optional string containing comma-separated numeric values.
             Can be None or an empty string.
         Returns:
-            A sorted list of floats parsed from the input string. 
+            A sorted list of floats parsed from the input string.
             Returns an empty list if the input is None, empty, or contains no valid numeric values.
         """
 
@@ -184,7 +183,7 @@ class PolynomialEquationsDataset(ProceduralDataset):
     def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
         """
         Score an answer based on its numerical distance to oracle solutions using exponential decay.
-        This function compares a predicted answer (or list of answers) to a set of oracle solutions 
+        This function compares a predicted answer (or list of answers) to a set of oracle solutions
         (also a list of numbers). It calculates a reward based on how close the predicted solutions
         are to the oracle solutions, using an exponential decay function.  It also applies penalties
         for missing or extra predicted solutions. The implementation is a greedy algorithm where we
@@ -192,7 +191,7 @@ class PolynomialEquationsDataset(ProceduralDataset):
         oracle solution to match once.
 
         Args:
-            answer: The predicted answer (or a string that can be parsed into a list of numbers).  
+            answer: The predicted answer (or a string that can be parsed into a list of numbers).
                     May be None.
             entry: A dictionary containing the oracle solution(s) under the key "answer"
                 (which can be a string that can be parsed into a list of numbers).
@@ -200,10 +199,8 @@ class PolynomialEquationsDataset(ProceduralDataset):
         Returns:
             A float representing the final score. The score is non-negative.
         """
-        oracle_solutions = self._parse_score_to_list(
-            entry["answer"])  # Parse oracle solutions
-        predicted_solutions = self._parse_score_to_list(
-            answer)  # Parse predicted solutions
+        oracle_solutions = self._parse_score_to_list(entry["answer"])  # Parse oracle solutions
+        predicted_solutions = self._parse_score_to_list(answer)  # Parse predicted solutions
 
         total_reward = 0.0
         matched_solutions = 0
@@ -214,12 +211,11 @@ class PolynomialEquationsDataset(ProceduralDataset):
 
             # find the closest matching solution from the oracle solutions.
             # this is a greedy approach to computing the score
-            matched_distance = float('inf')
+            matched_distance = float("inf")
             matched_distance_index = None
             for oracle_solution_index, oracle_solution in enumerate(oracle_solutions):
                 if matched_distance > abs(predicted_solution - oracle_solution):
-                    matched_distance = abs(
-                        predicted_solution - oracle_solution)
+                    matched_distance = abs(predicted_solution - oracle_solution)
                     matched_distance_index = oracle_solution_index
 
             if matched_distance_index is not None:
@@ -236,7 +232,7 @@ class PolynomialEquationsDataset(ProceduralDataset):
         for oracle_solution in oracle_solutions:
             missing_solutions += 1
 
-         # Calculate penalty for either missing or extra solutions
+        # Calculate penalty for either missing or extra solutions
         penalty = missing_solutions * self.config.penalty_missing_factor
         penalty += extra_solutions * self.config.penalty_extra_factor
 
@@ -251,5 +247,4 @@ class PolynomialEquationsDataset(ProceduralDataset):
         return final_reward
 
 
-register_dataset("polynomial_equations",
-                 PolynomialEquationsDataset, PolynomialEquationsConfig)
+register_dataset("polynomial_equations", PolynomialEquationsDataset, PolynomialEquationsConfig)
diff --git a/tests/test_polynomial_equations.py b/tests/test_polynomial_equations.py
index 556932a7..e7caf654 100644
--- a/tests/test_polynomial_equations.py
+++ b/tests/test_polynomial_equations.py
@@ -124,11 +124,11 @@ def test_polynomial_solutions_evaluation():
         ("4,-4.12", "4,-4.12", 1.0),  # Exact match
         ("4,-4.12", "4.0001,-4.120001", approx(0.9999, rel=1e-3)),  # Very close match
         ("4,-4.12", "4.1,-4.2", approx(0.9139, rel=1e-3)),
-        ("4,8", "4", approx(0.9, rel=1e-3)), # Missing an oracle solution -> missing solution penalty applies
-        ("4", "4,8", approx(0.95, rel=1e-3)), # extra solution -> extra solution penalty
-        ("-1,-2", "1,4", approx(0.06890, rel=1e-3)), # -1 matched w/ 1 and -2 matched w/ 4
-        ("", "1", approx(0, rel=1e-4)), # oracle no solution, predicted extra solution
-        ("1", "", approx(0, rel=1e-4)), # oracle has a solution, predicted no solution
+        ("4,8", "4", approx(0.9, rel=1e-3)),  # Missing an oracle solution -> missing solution penalty applies
+        ("4", "4,8", approx(0.95, rel=1e-3)),  # extra solution -> extra solution penalty
+        ("-1,-2", "1,4", approx(0.06890, rel=1e-3)),  # -1 matched w/ 1 and -2 matched w/ 4
+        ("", "1", approx(0, rel=1e-4)),  # oracle no solution, predicted extra solution
+        ("1", "", approx(0, rel=1e-4)),  # oracle has a solution, predicted no solution
     ],
 )
 def test_polynomial_solutions_score_answer(oracle_answer, predicted_answer, expected_reward):
@@ -136,7 +136,5 @@ def test_polynomial_solutions_score_answer(oracle_answer, predicted_answer, expe
     cfg = PolynomialEquationsConfig(seed=999, size=3)
     ds = PolynomialEquationsDataset(cfg)
 
-    actual_reward = ds.score_answer(
-        predicted_answer, {"answer": oracle_answer})
-    assert actual_reward == pytest.approx(
-        expected_reward, rel=1e-3)  # Fuzzy comparison for floats
+    actual_reward = ds.score_answer(predicted_answer, {"answer": oracle_answer})
+    assert actual_reward == pytest.approx(expected_reward, rel=1e-3)  # Fuzzy comparison for floats

From c59db00196f1b9f99df3535c1083db4ed1bee274 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 9 Feb 2025 20:05:56 +0100
Subject: [PATCH 07/11] reduce default zero probability for binary matrix

---
 GALLERY.md                                 | 93 ++++++++++++++++++++++
 reasoning_gym/algorithmic/binary_matrix.py |  5 +-
 2 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index e7ba9238..5aa6fc66 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -9,6 +9,7 @@ This gallery shows examples from all available datasets using their default conf
 - [base_conversion](#base_conversion)
 - [basic_arithmetic](#basic_arithmetic)
 - [bf](#bf)
+- [binary_matrix](#binary_matrix)
 - [caesar_cipher](#caesar_cipher)
 - [calendar_arithmetic](#calendar_arithmetic)
 - [chain_sum](#chain_sum)
@@ -751,6 +752,98 @@ Metadata: {'bfit_code': '\nint main() {\n    print("under");\n}\n', 'bf_program'
 
 ````
 
+### binary_matrix
+Generates Binary Matrix exercises with configurable difficulty
+
+Default configuration:
+```python
+max_n = 10
+p_zero = 0.25
+size = 500
+seed = 42
+```
+
+Example tasks:
+````
+Example 1:
+Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell.
+
+Example:
+
+Input: Find the distance to the nearest 0 for each cell in the matrix below:
+0 0 0
+0 1 0
+1 1 1
+
+Output:
+0 0 0
+0 1 0
+1 2 1
+
+Find the distance to the nearest 0 for each cell in the matrix below:
+0 0
+1 0
+
+Answer: 0 0
+1 0
+Metadata: {'matrix': [[0, 0], [1, 0]], 'solution': [[0, 0], [1, 0]]}
+
+Example 2:
+Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell.
+
+Example:
+
+Input: Find the distance to the nearest 0 for each cell in the matrix below:
+0 0 0
+0 1 0
+1 1 1
+
+Output:
+0 0 0
+0 1 0
+1 2 1
+
+Find the distance to the nearest 0 for each cell in the matrix below:
+0
+
+Answer: 0
+Metadata: {'matrix': [[0]], 'solution': [[0]]}
+
+Example 3:
+Question: Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell.
+
+Example:
+
+Input: Find the distance to the nearest 0 for each cell in the matrix below:
+0 0 0
+0 1 0
+1 1 1
+
+Output:
+0 0 0
+0 1 0
+1 2 1
+
+Find the distance to the nearest 0 for each cell in the matrix below:
+1 0 1 1 0 1 1
+1 0 1 1 1 1 1
+1 1 1 1 0 1 1
+1 1 1 1 0 0 1
+0 1 1 1 1 1 0
+1 0 1 1 1 1 0
+1 1 1 1 1 1 1
+
+Answer: 1 0 1 1 0 1 2
+1 0 1 2 1 2 3
+2 1 2 1 0 1 2
+1 2 2 1 0 0 1
+0 1 2 2 1 1 0
+1 0 1 2 2 1 0
+2 1 2 3 3 2 1
+Metadata: {'matrix': [[1, 0, 1, 1, 0, 1, 1], [1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 0, 1], [0, 1, 1, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1]], 'solution': [[1, 0, 1, 1, 0, 1, 2], [1, 0, 1, 2, 1, 2, 3], [2, 1, 2, 1, 0, 1, 2], [1, 2, 2, 1, 0, 0, 1], [0, 1, 2, 2, 1, 1, 0], [1, 0, 1, 2, 2, 1, 0], [2, 1, 2, 3, 3, 2, 1]]}
+
+````
+
 ### caesar_cipher
 Generates Caesar cipher encryption/decryption tasks
 
diff --git a/reasoning_gym/algorithmic/binary_matrix.py b/reasoning_gym/algorithmic/binary_matrix.py
index 0fbe96b3..8ae122bb 100644
--- a/reasoning_gym/algorithmic/binary_matrix.py
+++ b/reasoning_gym/algorithmic/binary_matrix.py
@@ -5,14 +5,13 @@ https://leetcode.com/problems/01-matrix/description/
 """
 
 from collections import deque
-from copy import deepcopy
 from dataclasses import dataclass
 from random import Random
 from typing import Optional
 
 from ..factory import ProceduralDataset, register_dataset
 
-QUESTION_TEMPLATE = """Given a square matrix, your job is to find the distance of the nearest 0 for each cell.
+QUESTION_TEMPLATE = """Given a square matrix, your job is to find the taxicab distance of the nearest 0 for each cell.
 
 Example:
 
@@ -36,7 +35,7 @@ class BinaryMatrixConfig:
     """Configuration for Binary Matrix dataset generation"""
 
     max_n: int = 10  # Maximum size of the matrix
-    p_zero: float = 0.7  # Probability of a cell being 0
+    p_zero: float = 0.25  # Probability of a cell being 0
 
     size: int = 500  # Virtual dataset size
     seed: Optional[int] = None

From 94f07ed35dcc335178ad3fbec7f06ce847526f40 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sun, 9 Feb 2025 22:36:27 -0800
Subject: [PATCH 08/11] [eval-basic] initial scripts for evaluating models on
 reasoning gym

---
 eval/__init__.py                              |   0
 eval/eval_basic.json                          |  21 +
 eval/eval_basic.py                            | 157 +++++
 eval/eval_basic.sh                            |  30 +
 ..._gemini-2.0-flash-001_20250209_223527.json | 647 ++++++++++++++++++
 ..._gemini-2.0-flash-001_20250209_220608.json |  12 +
 ..._gemini-2.0-flash-001_20250209_220753.json | 186 +++++
 ..._gemini-2.0-flash-001_20250209_220610.json |  12 +
 ..._gemini-2.0-flash-001_20250209_220824.json | 201 ++++++
 ..._gemini-2.0-flash-001_20250209_223527.json |  39 ++
 requirements-dev.txt                          |   1 +
 11 files changed, 1306 insertions(+)
 create mode 100644 eval/__init__.py
 create mode 100644 eval/eval_basic.json
 create mode 100644 eval/eval_basic.py
 create mode 100644 eval/eval_basic.sh
 create mode 100644 eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
 create mode 100644 eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
 create mode 100644 eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
 create mode 100644 eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
 create mode 100644 eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
 create mode 100644 eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json

diff --git a/eval/__init__.py b/eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/eval/eval_basic.json b/eval/eval_basic.json
new file mode 100644
index 00000000..8e96863c
--- /dev/null
+++ b/eval/eval_basic.json
@@ -0,0 +1,21 @@
+[
+    {
+      "name": "letter_counting",
+      "min_words": 5,
+      "max_words": 15,
+      "size": 10,
+      "seed": 42
+    },
+    {
+      "name": "propositional_logic",
+      "size": 10,
+      "seed": 42
+    },
+    {
+      "name": "leg_counting",
+      "min_animals": 3,
+      "max_animals": 8,
+      "size": 10,
+      "seed": 42
+    }
+  ]
\ No newline at end of file
diff --git a/eval/eval_basic.py b/eval/eval_basic.py
new file mode 100644
index 00000000..6b605487
--- /dev/null
+++ b/eval/eval_basic.py
@@ -0,0 +1,157 @@
+import argparse
+from datetime import datetime
+import json
+import os
+from openai import OpenAI
+from typing import Any, Dict, List
+
+from reasoning_gym.factory import DATASETS, create_dataset
+
+class OpenRouterEvaluator:
+    """Evaluate one model on reasoning-gym datasets through the OpenRouter API."""
+
+    def __init__(self, model: str):
+        """Create the OpenAI-compatible client for ``model`` (an OpenRouter model id)."""
+        # os.getenv returns None when OPENROUTER_API_KEY is not set.
+        self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY"))
+        self.model = model
+        self.extra_headers = {}
+
+    def get_model_response(self, prompt: str) -> str:
+        """Send ``prompt`` as a single user message and return the reply text.
+
+        API errors are printed and re-raised so the caller decides how to proceed.
+        """
+        try:
+            completion = self.client.chat.completions.create(
+                extra_headers=self.extra_headers,
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"Error calling OpenRouter API: {str(e)}")
+            raise
+
+    def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Evaluate the model on every dataset described in ``dataset_configs``.
+
+        Each config holds a ``name`` key plus the keyword arguments passed to
+        ``create_dataset``; the caller's dicts are left unmodified.
+
+        Returns one ``{"metrics": ..., "results": ...}`` dict per dataset that was
+        evaluated. Questions or datasets that raise are reported and skipped.
+        """
+        all_results = []
+
+        for raw_config in dataset_configs:
+            # Copy before popping so the caller's config (and any reuse of it) keeps "name".
+            dataset_config = dict(raw_config)
+            dataset_name = dataset_config.pop("name")
+            print(f"\nEvaluating dataset: {dataset_name}")
+
+            try:
+                # Create dataset with its specific configuration
+                data = create_dataset(dataset_name, **dataset_config)
+                results = []
+
+                for entry in data:
+                    try:
+                        response = self.get_model_response(entry["question"])
+                        score = data.score_answer(answer=response, entry=entry)
+                        result = {
+                            "question": entry["question"],
+                            "expected_answer": entry["answer"],
+                            "model_answer": response,
+                            "score": score,
+                            "metadata": entry["metadata"],
+                        }
+                        results.append(result)
+                        print(f"Processed question {len(results)}/{len(data)}. Score: {score}")
+                    except Exception as e:
+                        print(f"Error processing question: {entry['question']}")
+                        print(f"Error: {str(e)}")
+
+                # Average over answered questions only; failed ones are not in `results`.
+                total_score = sum(r["score"] for r in results)
+                metrics = {
+                    "dataset_name": dataset_name,
+                    "model": self.model,
+                    "size": len(data),
+                    "average_score": total_score / len(results) if results else 0,
+                    "total_examples": len(results),
+                    "timestamp": datetime.now().isoformat(),
+                    "config": dataset_config,
+                }
+                all_results.append({"metrics": metrics, "results": results})
+            except Exception as e:
+                print(f"Error evaluating dataset {dataset_name}: {str(e)}")
+
+
+def main():
+    """Command-line entry point: evaluate one model on the datasets listed in a JSON config.
+
+    Writes two files into ``--output-dir``, both tagged with the model name and a
+    shared timestamp:
+      * ``evaluation_<model>_<timestamp>.json`` - metrics plus per-question results
+      * ``summary_<model>_<timestamp>.json`` - per-dataset metrics only
+    """
+    parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
+    parser.add_argument("--model", required=True, help="Model to evaluate")
+    parser.add_argument("--config", required=True, help="Path to JSON configuration file")
+    parser.add_argument("--output-dir", default="results", help="Output directory")
+
+    args = parser.parse_args()
+
+    # Create output directory if it doesn't exist
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load dataset configurations (JSON is UTF-8; don't depend on the locale default)
+    with open(args.config, "r", encoding="utf-8") as f:
+        dataset_configs = json.load(f)
+
+    evaluator = OpenRouterEvaluator(model=args.model)
+    all_results = evaluator.evaluate_datasets(dataset_configs)
+
+    # Build the file-name suffix once so both output files of a run share the same
+    # timestamp; calling datetime.now() per file can straddle a second boundary.
+    # "/" in model ids (e.g. "google/gemini-...") is replaced to keep a flat file name.
+    run_tag = f"{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+    # Save detailed results
+    output_file = os.path.join(args.output_dir, f"evaluation_{run_tag}.json")
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(all_results, f, indent=2)
+
+    # Create summary: a subset of each dataset's metrics, without per-question results
+    summary = []
+    for result in all_results:
+        metrics = result["metrics"]
+        summary_entry = {
+            "dataset_name": metrics["dataset_name"],
+            "model": metrics["model"],
+            "average_score": metrics["average_score"],
+            "total_examples": metrics["total_examples"],
+            "timestamp": metrics["timestamp"],
+            "config": metrics["config"],
+        }
+        summary.append(summary_entry)
+
+    # Save summary to a separate file
+    summary_file = os.path.join(args.output_dir, f"summary_{run_tag}.json")
+    with open(summary_file, "w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2)
+
+    # Print summary
+    print("\nEvaluation Summary:")
+    for entry in summary:
+        print(f"\nDataset: {entry['dataset_name']}")
+        print(f"Average Score: {entry['average_score']:.2%}")
+        print(f"Total Examples: {entry['total_examples']}")
+
+    print(f"\nDetailed results saved to: {output_file}")
+    print(f"Summary saved to: {summary_file}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/eval/eval_basic.sh b/eval/eval_basic.sh
new file mode 100644
index 00000000..2a7eb9bc
--- /dev/null
+++ b/eval/eval_basic.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Evaluate each model in MODELS on eval_basic.json; run from the eval/ directory.
+
+if [ -z "$OPENROUTER_API_KEY" ]; then
+    echo "Error: OPENROUTER_API_KEY environment variable is not set" >&2
+    echo "Please set it using: export OPENROUTER_API_KEY=your-api-key" >&2
+    exit 1
+fi
+
+OUTPUT_DIR="results"
+
+# List of models to evaluate
+MODELS=(
+    "google/gemini-2.0-flash-001"
+)
+
+mkdir -p "$OUTPUT_DIR"
+
+# Run evaluations; remember failures instead of reporting success unconditionally
+status=0
+for model in "${MODELS[@]}"; do
+    echo "Evaluating $model..."
+    python eval_basic.py --model "$model" --config "eval_basic.json" --output-dir "$OUTPUT_DIR" \
+        || { echo "Error: evaluation failed for $model" >&2; status=1; }
+done
+
+if [ "$status" -eq 0 ]; then
+    echo "All evaluations completed!"
+fi
+exit "$status"
\ No newline at end of file
diff --git a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
new file mode 100644
index 00000000..2d4a5a93
--- /dev/null
+++ b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
@@ -0,0 +1,647 @@
+[
+  {
+    "metrics": {
+      "dataset_name": "letter_counting",
+      "model": "google/gemini-2.0-flash-001",
+      "size": 10,
+      "average_score": 0.20600000000000002,
+      "total_examples": 10,
+      "timestamp": "2025-02-10T06:34:37.091554",
+      "config": {
+        "min_words": 5,
+        "max_words": 15,
+        "size": 10,
+        "seed": 42
+      }
+    },
+    "results": [
+      {
+        "question": "How many times does the letter \"a\" appear in the text: \"bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed\"?",
+        "expected_answer": "6",
+        "model_answer": "The letter \"a\" appears 7 times in the text: \"bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed\".\n",
+        "score": 0.01,
+        "metadata": {
+          "span_length": 15,
+          "target_letter": "a",
+          "span": [
+            "bed",
+            "and",
+            "enters",
+            "his",
+            "mechanical",
+            "dresser",
+            "Two",
+            "minutes",
+            "later",
+            "the",
+            "machine",
+            "deposited",
+            "him",
+            "all",
+            "dressed"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"w\" appear in the text: \"it into a watering place\"?",
+        "expected_answer": "1",
+        "model_answer": "The letter \"w\" appears 1 time in the text \"it into a watering place\".\n",
+        "score": 0.5,
+        "metadata": {
+          "span_length": 5,
+          "target_letter": "w",
+          "span": [
+            "it",
+            "into",
+            "a",
+            "watering",
+            "place"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"t\" appear in the text: \"readable form accessible by the widest array of equipment including outdated\"?",
+        "expected_answer": "5",
+        "model_answer": "Let's count the \"t\"s in the sentence \"readable form accessible by the widest array of equipment including outdated\":\n\n*   readable: 0\n*   form: 0\n*   accessible: 0\n*   by: 0\n*   the: 1\n*   widest: 1\n*   array: 0\n*   of: 0\n*   equipment: 1\n*   including: 0\n*   outdated: 2\n\nTotal: 1 + 1 + 1 + 2 = 5\n\nTherefore, the letter \"t\" appears 5 times in the text.",
+        "score": 0.5,
+        "metadata": {
+          "span_length": 11,
+          "target_letter": "t",
+          "span": [
+            "readable",
+            "form",
+            "accessible",
+            "by",
+            "the",
+            "widest",
+            "array",
+            "of",
+            "equipment",
+            "including",
+            "outdated"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"n\" appear in the text: \"does not contain a notice indicating that it is\"?",
+        "expected_answer": "6",
+        "model_answer": "The letter \"n\" appears 7 times in the text \"does not contain a notice indicating that it is\".\n",
+        "score": 0.01,
+        "metadata": {
+          "span_length": 9,
+          "target_letter": "n",
+          "span": [
+            "does",
+            "not",
+            "contain",
+            "a",
+            "notice",
+            "indicating",
+            "that",
+            "it",
+            "is"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"a\" appear in the text: \"keeping this work in the same\"?",
+        "expected_answer": "1",
+        "model_answer": "The letter \"a\" appears **3** times in the text \"keeping this work in the same\".\n",
+        "score": 0.01,
+        "metadata": {
+          "span_length": 6,
+          "target_letter": "a",
+          "span": [
+            "keeping",
+            "this",
+            "work",
+            "in",
+            "the",
+            "same"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"r\" appear in the text: \"years ago its subscription list contained but a few hundred\"?",
+        "expected_answer": "3",
+        "model_answer": "The letter \"r\" appears 4 times in the text \"years ago its subscription list contained but a few hundred\".\n",
+        "score": 0.01,
+        "metadata": {
+          "span_length": 10,
+          "target_letter": "r",
+          "span": [
+            "years",
+            "ago",
+            "its",
+            "subscription",
+            "list",
+            "contained",
+            "but",
+            "a",
+            "few",
+            "hundred"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"e\" appear in the text: \"me As in the morning the table rose out of the floor Again\"?",
+        "expected_answer": "6",
+        "model_answer": "The letter \"e\" appears 6 times in the text \"me As in the morning the table rose out of the floor Again\".\n",
+        "score": 0.5,
+        "metadata": {
+          "span_length": 13,
+          "target_letter": "e",
+          "span": [
+            "me",
+            "As",
+            "in",
+            "the",
+            "morning",
+            "the",
+            "table",
+            "rose",
+            "out",
+            "of",
+            "the",
+            "floor",
+            "Again"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"t\" appear in the text: \"said Faithburn was dead quite certainly\"?",
+        "expected_answer": "3",
+        "model_answer": "The letter \"t\" appears 5 times in the text \"said Faithburn was dead quite certainly\".\n",
+        "score": 0.01,
+        "metadata": {
+          "span_length": 6,
+          "target_letter": "t",
+          "span": [
+            "said",
+            "Faithburn",
+            "was",
+            "dead",
+            "quite",
+            "certainly"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"n\" appear in the text: \"these eight had only impracticable schemes to propose In fact one of\"?",
+        "expected_answer": "3",
+        "model_answer": "Let's count the \"n\"s:\n\n*   **n**\n*   o**n**ly\n*   i**n**\n*   o**n**e\n\nThere are **4** \"n\"s in the text.",
+        "score": 0.01,
+        "metadata": {
+          "span_length": 12,
+          "target_letter": "n",
+          "span": [
+            "these",
+            "eight",
+            "had",
+            "only",
+            "impracticable",
+            "schemes",
+            "to",
+            "propose",
+            "In",
+            "fact",
+            "one",
+            "of"
+          ]
+        }
+      },
+      {
+        "question": "How many times does the letter \"g\" appear in the text: \"computers including obsolete old middle aged and new\"?",
+        "expected_answer": "2",
+        "model_answer": "The letter \"g\" appears 2 times in the text: \"computers including obsolete old middle aged and new\".\n",
+        "score": 0.5,
+        "metadata": {
+          "span_length": 8,
+          "target_letter": "g",
+          "span": [
+            "computers",
+            "including",
+            "obsolete",
+            "old",
+            "middle",
+            "aged",
+            "and",
+            "new"
+          ]
+        }
+      }
+    ]
+  },
+  {
+    "metrics": {
+      "dataset_name": "propositional_logic",
+      "model": "google/gemini-2.0-flash-001",
+      "size": 10,
+      "average_score": 0.059,
+      "total_examples": 10,
+      "timestamp": "2025-02-10T06:35:11.432275",
+      "config": {
+        "size": 10,
+        "seed": 42
+      }
+    },
+    "results": [
+      {
+        "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
+        "expected_answer": "(P \u2228 Q)",
+        "model_answer": "Given just \"R\" and \"Q\" as individual statements, we can't directly *conclude* anything meaningful in a logical or mathematical sense *unless* we know more context. Here's why and some possibilities:\n\n**Without Context:**\n\n* **Independent Statements:**  They are simply two distinct statements that might be true or false depending on what \"R\" and \"Q\" represent. They exist independently.\n\n**With Possible Context (and Possible Conclusions):**\n\n* **Logic/Propositional Logic:** If \"R\" and \"Q\" represent propositions (statements that can be true or false), then we can form compound statements:\n    * **R AND Q (R \u2227 Q):**  The statement \"R is true AND Q is true.\"  This is only true if *both* R and Q are true.\n    * **R OR Q (R \u2228 Q):** The statement \"R is true OR Q is true (or both).\" This is true if *either* R or Q or both are true.\n    * **IF R THEN Q (R \u2192 Q):** The statement \"If R is true, then Q is true.\"  This is only false if R is true and Q is false.\n    * **IF Q THEN R (Q \u2192 R):** The statement \"If Q is true, then R is true.\" This is only false if Q is true and R is false.\n    * **R IFF Q (R \u2194 Q):** The statement \"R is true if and only if Q is true.\"  This is true if R and Q have the same truth value (both true or both false).\n    * **NOT R (\u00acR):** The statement \"R is false.\"\n    * **NOT Q (\u00acQ):** The statement \"Q is false.\"\n\n    *From the information given, we cannot deduce the individual truth values of `R` and `Q` (e.g., whether `R` is true or false, whether `Q` is true or false).*\n\n* **Set Theory:** If R and Q are sets, then we can discuss:\n    * **Intersection (R \u2229 Q):**  The set containing elements that are *both* in R and in Q.  We can't *conclude* what's in this intersection without knowing the elements of R and Q.\n    * **Union (R \u222a Q):**  The set containing elements that are in R, in Q, or in both. 
We can't *conclude* what's in this union without knowing the elements of R and Q.\n\n* **Mathematics (Variables):** If R and Q are variables, then we can't conclude anything about their relationship or values *unless* we have an equation or further information linking them.  We don't know if R = Q, R < Q, R > Q, R + Q = some value, etc.\n\n* **Premises in an Argument:** If R and Q are presented as premises (assumptions) in a larger argument, we can only draw conclusions if there are *other* premises or rules of inference involved.\n\n**Example Scenarios and Conclusions:**\n\n* **Scenario:**  R: \"It is raining.\"  Q: \"The ground is wet.\"\n    * **Possible Conclusion with an additional premise:** If we know \"IF it is raining, THEN the ground is wet (R \u2192 Q)\", then we can say *if* it is raining (we know R), *then* the ground is wet (we can conclude Q).  But *given only* R and Q, we cannot derive this conditional.\n\n* **Scenario:** R:  x > 5   Q: x > 0 (where 'x' is a real number)\n    * **Conclusion:** If both R and Q are true for the same value of x, then x > 5 implies x > 0. Therefore we can deduce that Q is true if R is.  R -> Q.\n\n**In summary, the best answer is:**\n\nWithout any further context or relationship between R and Q, we can only acknowledge them as two independent statements, values or sets. We cannot draw any meaningful logical conclusions *solely* from the information given.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "R",
+            "Q"
+          ],
+          "variables": [
+            "P",
+            "Q",
+            "R",
+            "S"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. ((Q \u2192 P) \u2228 (Q \u2192 P))\n2. ((Q \u2194 Q) \u2192 (P \u2192 P))\n3. P\nWhat can we conclude?",
+        "expected_answer": "(P \u2192 P)",
+        "model_answer": "Let's analyze each statement and see what conclusions can be drawn from them and the given information.\n\n1. **((Q \u2192 P) \u2228 (Q \u2192 P))** simplifies to **(Q \u2192 P)**.  This means \"if Q is true, then P is true.\"  Since we know P is true (given in statement 3), then this statement is true whether Q is true or false. (Remember, a conditional is only false when the antecedent is true and the consequent is false).\n\n2. **((Q \u2194 Q) \u2192 (P \u2192 P))** simplifies significantly.  `Q \u2194 Q` is always true (it's a tautology).  `P \u2192 P` is also always true (it's another tautology).  So, this becomes `(True \u2192 True)`, which is always true.  This statement doesn't help us determine anything about the relationship between Q and P beyond what is already implied by P being true.\n\n3. **P** is given as true.\n\nTherefore, based on the given information and the simplification of the statements, the main conclusion is that **P is true**. The truth value of Q is indeterminate. Q can be either true or false, and all the given statements will still hold.\n**Therefore, we can conclude: P.**\n",
+        "score": 0.5,
+        "metadata": {
+          "premises": [
+            "((Q \u2192 P) \u2228 (Q \u2192 P))",
+            "((Q \u2194 Q) \u2192 (P \u2192 P))",
+            "P"
+          ],
+          "variables": [
+            "P",
+            "Q"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. ((Q \u2228 P) \u2227 \u00acP)\n2. P\n3. ((P \u2227 R) \u2227 \u00acR)\n4. ((Q \u2194 R) \u2192 \u00acQ)\nWhat can we conclude?",
+        "expected_answer": "(Q \u2227 Q)",
+        "model_answer": "Let's analyze the given statements and see what conclusions we can draw using logical deduction:\n\n1.  **((Q \u2228 P) \u2227 \u00acP)**: This says that either Q or P is true, AND P is false.  Since P is false, Q must be true for the entire statement to be true. Thus, we can conclude: **Q is true.**\n\n2.  **P**: This says that P is true.\n\n3.  **((P \u2227 R) \u2227 \u00acR)**: This says that both P and R are true, AND R is false.  This entire statement is a contradiction, as R cannot be both true and false. This statement implies: **False (Contradiction)**.\n\n4.  **((Q \u2194 R) \u2192 \u00acQ)**: This says that IF Q is equivalent to R (i.e., Q and R have the same truth value), THEN Q is false.\n\nNow, let's combine our findings:\n\n*   From (1), we deduced **Q**. However from (2) and (3), we found that statement (3) resulted in a contradiction. This contradiction stems from assuming both Q and R are false, despite Q already being false as a logical conclusion from (1). Therefore something must be wrong with the premises given.\n\nHere's a more methodical analysis, considering the contradiction:\n\nSince statement (3) leads to a contradiction, we must question our assumptions and how they interact.  Here's where things get interesting:\n\n*   We initially deduced Q from statement (1). Thus since P is false and P is true (2), this creates another contradiction. Thus we are concluding contradictory truths when all assumptions are in effect.\n\nSince we arrive at a contradiction, the set of premises is inconsistent.  Therefore, anything can be formally derived. In essence, we can conclude anything we want via *ex falso quod libet*.\n\n**Therefore, due to the contradictory statements provided, we can conclude anything.** However, within a standard logic system, we would first need to address which statement must be removed to allow the system to converge.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "((Q \u2228 P) \u2227 \u00acP)",
+            "P",
+            "((P \u2227 R) \u2227 \u00acR)",
+            "((Q \u2194 R) \u2192 \u00acQ)"
+          ],
+          "variables": [
+            "P",
+            "Q",
+            "R"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. \u00acP\n2. \u00acP\n3. Q\nWhat can we conclude?",
+        "expected_answer": "(Q \u2228 R)",
+        "model_answer": "From the given premises:\n\n*   \u00acP (P is false)\n*   \u00acP (P is false)\n*   Q (Q is true)\n\nWe can conclude the following:\n\n*   **P is false.** (This is directly from \u00acP)\n*   **Q is true.** (This is directly from Q)\n*   **\u00acQ is false.** (Since Q is true)\n\nNow we can also conclude the following with propositional logic principles:\n\n*   **P \u2228 Q** is true. (Since Q is true, the disjunction is true regardless of the truth value of P).\n*   **P \u2192 Q** is true. (Since P is false, the conditional is true).\n*   **\u00acP \u2228 Q** is true. (Since both \u00acP and Q are true, the disjunction is true).\n*   **\u00acP \u2192 Q** is true. (Since \u00acP is true, the conditional is true).\n\nIn summary, we can validly conclude many things, the most straightforward being the truth values of P and Q:\n\n*   **P is false.**\n*   **Q is true.**\n\nAnd other valid conclusions such as those related to ORs and implication.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "\u00acP",
+            "\u00acP",
+            "Q"
+          ],
+          "variables": [
+            "P",
+            "Q",
+            "R"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. P\n2. ((P \u2228 P) \u2194 (Q \u2227 P))\n3. P\nWhat can we conclude?",
+        "expected_answer": "(Q \u2194 Q)",
+        "model_answer": "Let's analyze the given statements:\n\n1.  P: This statement is true.\n2.  ((P \u2228 P) \u2194 (Q \u2227 P)): This is a biconditional. Let's break it down:\n    *   (P \u2228 P) is logically equivalent to P (by the idempotent law).\n    *   So the statement simplifies to (P \u2194 (Q \u2227 P)).\n    *   Since we know P is true (from statement 1 and 3), the statement becomes (True \u2194 (Q \u2227 True)).\n    *   (Q \u2227 True) is logically equivalent to Q.\n    *   Therefore, the statement simplifies further to (True \u2194 Q).\n    *   For the biconditional (True \u2194 Q) to be true, Q must be true.\n\nTherefore, we can conclude:\n\n*   **Q** is true.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "P",
+            "((P \u2228 P) \u2194 (Q \u2227 P))",
+            "P"
+          ],
+          "variables": [
+            "P",
+            "Q"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. (Q \u2194 R)\n2. \u00acR\nWhat can we conclude?",
+        "expected_answer": "(Q \u2192 R)",
+        "model_answer": "From the given premises:\n\n1.  Q \u2194 R (Q is logically equivalent to R, meaning Q is true if and only if R is true)\n2.  \u00acR (R is false)\n\nSince Q \u2194 R, Q and R have the same truth value.  Because \u00acR, we know R is false. Therefore, Q must also be false.\n\nConclusion: **\u00acQ**\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "(Q \u2194 R)",
+            "\u00acR"
+          ],
+          "variables": [
+            "P",
+            "Q",
+            "R"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. R\n2. ((Q \u2192 Q) \u2228 (S \u2227 Q))\n3. Q\nWhat can we conclude?",
+        "expected_answer": "(S \u2228 Q)",
+        "model_answer": "Let's analyze the given statements:\n\n1.  **R:** This statement simply asserts that R is true.\n\n2.  **(Q \u2192 Q) \u2228 (S \u2227 Q):**  This is a disjunction (OR). Let's break it down:\n    *   **Q \u2192 Q:**  This is a tautology.  A statement implies itself is always true.  Therefore, the entire disjunction is true regardless of the value of (S \u2227 Q).\n\n3.  **Q:** This statement says Q is true.\n\nSince Q is true:\n\n* **(Q -> Q)** is true (since Q is true implies Q is true).\n\n* **(S ^ Q)** is true if and only if *both* S and Q are true.  We know Q is true, so if S is also true then this is true.  If S is false this is false.\n\n* **((Q -> Q) v (S ^ Q))** is true, since **(Q -> Q)** which is part of the disjunction is true. We already knew that from line 2.\n\nTherefore, from these premises, we can definitively conclude:\n\n*   **R** is true (given).\n*   **Q** is true (given).\n*   **(Q \u2192 Q) \u2228 (S \u2227 Q)** is true (given, and verifiable).\n*   **(Q -> Q)** is true (implied from Q being true, since this is a tautology).\n*   The truth value of S is undetermined.  If S is true, then (S ^ Q) is true.  If S is false, then (S ^ Q) is false.\n\nFinal Answer: The conclusions are **R**, **Q**, and **(Q \u2192 Q) \u2228 (S \u2227 Q)** are all true. Additionally, we know **(Q \u2192 Q)** is true. We cannot determine the truth value of S.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "R",
+            "((Q \u2192 Q) \u2228 (S \u2227 Q))",
+            "Q"
+          ],
+          "variables": [
+            "P",
+            "Q",
+            "R",
+            "S"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. (Q \u2227 P)\n2. (\u00acP \u2227 (P \u2192 P))\n3. \u00ac(Q \u2194 Q)\nWhat can we conclude?",
+        "expected_answer": "(P \u2227 P)",
+        "model_answer": "Let's analyze each statement to see what we can derive:\n\n*   **1. (Q \u2227 P)** This statement tells us that both Q and P are true.\n    *   Q is True\n    *   P is True\n\n*   **2. (\u00acP \u2227 (P \u2192 P))** Let's break this down:\n    *   `\u00acP`: This means \"Not P\".\n    *   `P \u2192 P`: This is a tautology (always true).  If P is true, then P implies P is true. If P is false then P implies P is true.\n\n    So the statement is essentially `(\u00acP) \u2227 (True)`.  This simplifies to just `\u00acP`. Therefore, the statement tells us that P is false.\n\n*   **3. \u00ac(Q \u2194 Q)** The biconditional `Q \u2194 Q`  means \"Q if and only if Q\". This is always true, regardless of Q's value (Q is true, so \"true if and only if true\" is true; Q is false, so \"false if and only if false\" is true). Therefore \u00ac(Q \u2194 Q) always resolves to `\u00ac(True)`, meaning that `False`.\n\nNow we consider these conclusions together:\n\n*   From statement 1: `Q` is True and `P` is True.\n*   From statement 2: `P` is False.\n*   From statment 3: the statement is `False`.\n\nWe have a **Contradiction**.\n\nStatement 1 tells us that P is true, while Statement 2 tells us that P is false.  This means the entire set of statements is inconsistent.  Therefore, we can conclude the system is flawed.\nFinal Answer: The final answer is $\\boxed{Contradiction}$",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "(Q \u2227 P)",
+            "(\u00acP \u2227 (P \u2192 P))",
+            "\u00ac(Q \u2194 Q)"
+          ],
+          "variables": [
+            "P",
+            "Q"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. (R \u2228 Q)\n2. (R \u2227 Q)\n3. R\nWhat can we conclude?",
+        "expected_answer": "(P \u2194 P)",
+        "model_answer": "From the given statements:\n\n1.  (R \u2228 Q):  R or Q (or both) is true.\n2.  (R \u2227 Q):  R and Q are both true.\n3.  R: R is true.\n\nWe can conclude the following:\n\n*   **Q is true.**  Statement (R \u2227 Q) tells us that R and Q are both true.  Since statement 3 tells us that R is true, Q must also be true.\n\nTherefore, the main conclusion is **Q**.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "(R \u2228 Q)",
+            "(R \u2227 Q)",
+            "R"
+          ],
+          "variables": [
+            "P",
+            "Q",
+            "R"
+          ],
+          "complexity": 3
+        }
+      },
+      {
+        "question": "Given:\n1. ((P \u2228 Q) \u2228 (Q \u2192 Q))\n2. \u00ac(Q \u2192 Q)\n3. ((P \u2227 Q) \u2227 (Q \u2227 P))\n4. (Q \u2194 Q)\nWhat can we conclude?",
+        "expected_answer": "(P \u2192 P)",
+        "model_answer": "Let's analyze each statement:\n\n*   **1. ((P \u2228 Q) \u2228 (Q \u2192 Q))**:  `Q \u2192 Q` is always true (tautology). Therefore, `(P \u2228 Q) \u2228 True` is always true, regardless of the values of P and Q. So, this statement provides no specific information about P or Q.\n\n*   **2. \u00ac(Q \u2192 Q)**:  `Q \u2192 Q` is always true.  Therefore `\u00ac(Q \u2192 Q)` means \u00acTrue which is False. So, the statement asserts a contradiction.\n\n*   **3. ((P \u2227 Q) \u2227 (Q \u2227 P))**: This statement is equivalent to (P \u2227 Q).  Both (P \u2227 Q) and (Q \u2227 P) are true only when both P and Q are true.\n\n*   **4. (Q \u2194 Q)**: This statement is a tautology (always true). It provides no specific information.\n\nNow let's consider the implications of having all four statements:\n\nStatements 1 and 4 are always true, so they do not provide any useful information.  Statement 2 says that a tautology is false, which is a contradiction.  Statement 3 indicates that both P and Q are true.  However, statement 2 means the entire set of statements is contradictory.\n\nTherefore, we can conclude that the given set of premises is **inconsistent (or contradictory).**  You cannot simultaneously assert these four statements without creating a logical absurdity.\n",
+        "score": 0.01,
+        "metadata": {
+          "premises": [
+            "((P \u2228 Q) \u2228 (Q \u2192 Q))",
+            "\u00ac(Q \u2192 Q)",
+            "((P \u2227 Q) \u2227 (Q \u2227 P))",
+            "(Q \u2194 Q)"
+          ],
+          "variables": [
+            "P",
+            "Q"
+          ],
+          "complexity": 3
+        }
+      }
+    ]
+  },
+  {
+    "metrics": {
+      "dataset_name": "leg_counting",
+      "model": "google/gemini-2.0-flash-001",
+      "size": 10,
+      "average_score": 0.40199999999999997,
+      "total_examples": 10,
+      "timestamp": "2025-02-10T06:35:27.087469",
+      "config": {
+        "min_animals": 3,
+        "max_animals": 8,
+        "size": 10,
+        "seed": 42
+      }
+    },
+    "results": [
+      {
+        "question": "How many legs are there in total if you have 3 humans, 3 sea slugs, 3 deers, 1 giraffe, 3 elephants, 2 ducks, 1 beetle, 1 flatworm?",
+        "expected_answer": "44",
+        "model_answer": "Okay, let's break it down:\n\n*   **Humans:** 3 humans * 2 legs/human = 6 legs\n*   **Sea Slugs:** 3 sea slugs * 0 legs/sea slug = 0 legs\n*   **Deer:** 3 deer * 4 legs/deer = 12 legs\n*   **Giraffe:** 1 giraffe * 4 legs/giraffe = 4 legs\n*   **Elephants:** 3 elephants * 4 legs/elephant = 12 legs\n*   **Ducks:** 2 ducks * 2 legs/duck = 4 legs\n*   **Beetle:** 1 beetle * 6 legs/beetle = 6 legs\n*   **Flatworm:** 1 flatworm * 0 legs/flatworm = 0 legs\n\nNow, let's add them all up:\n\n6 + 0 + 12 + 4 + 12 + 4 + 6 + 0 = 44 legs\n\nSo, there are a total of 44 legs.",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 8
+          },
+          "animals": {
+            "human": 3,
+            "sea slug": 3,
+            "deer": 3,
+            "giraffe": 1,
+            "elephant": 3,
+            "duck": 2,
+            "beetle": 1,
+            "flatworm": 1
+          },
+          "total_legs": 44
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 2 sheeps, 3 dogs, 3 praying mantiss?",
+        "expected_answer": "38",
+        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Sheep:** 2 sheep * 4 legs/sheep = 8 legs\n*   **Dogs:** 3 dogs * 4 legs/dog = 12 legs\n*   **Praying Mantises:** 3 mantises * 6 legs/mantis = 18 legs\n\n**Total:** 8 + 12 + 18 = 38 legs\n",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 3
+          },
+          "animals": {
+            "sheep": 2,
+            "dog": 3,
+            "praying mantis": 3
+          },
+          "total_legs": 38
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 2 crabs, 1 lobster, 1 human, 1 cow, 3 bees, 1 elephant?",
+        "expected_answer": "58",
+        "model_answer": "Here's the breakdown:\n\n*   **Crabs:** 2 crabs * 10 legs/crab = 20 legs\n*   **Lobster:** 1 lobster * 10 legs/lobster = 10 legs\n*   **Human:** 1 human * 2 legs/human = 2 legs\n*   **Cow:** 1 cow * 4 legs/cow = 4 legs\n*   **Bees:** 3 bees * 6 legs/bee = 18 legs\n*   **Elephant:** 1 elephant * 4 legs/elephant = 4 legs\n\n**Total:** 20 + 10 + 2 + 4 + 18 + 4 = **58 legs**",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 6
+          },
+          "animals": {
+            "crab": 2,
+            "lobster": 1,
+            "human": 1,
+            "cow": 1,
+            "bee": 3,
+            "elephant": 1
+          },
+          "total_legs": 58
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 2 grasshoppers, 1 spider, 1 tiger, 2 chickens, 1 starfish?",
+        "expected_answer": "33",
+        "model_answer": "Here's the breakdown:\n\n*   **Grasshoppers:** 2 grasshoppers * 6 legs/grasshopper = 12 legs\n*   **Spider:** 1 spider * 8 legs/spider = 8 legs\n*   **Tiger:** 1 tiger * 4 legs/tiger = 4 legs\n*   **Chickens:** 2 chickens * 2 legs/chicken = 4 legs\n*   **Starfish:** 1 starfish * 0 legs/starfish = 0 legs (starfish have tube feet but not legs)\n\n**Total:** 12 + 8 + 4 + 4 + 0 = **28 legs**\n",
+        "score": 0.01,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 5
+          },
+          "animals": {
+            "grasshopper": 2,
+            "spider": 1,
+            "tiger": 1,
+            "chicken": 2,
+            "starfish": 1
+          },
+          "total_legs": 33
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 3 wasps, 3 jellyfishs, 1 elephant?",
+        "expected_answer": "22",
+        "model_answer": "Here's how to solve this:\n\n*   **Wasps:** Each wasp has 6 legs, so 3 wasps have 3 * 6 = 18 legs.\n*   **Jellyfish:** Jellyfish don't have legs, so they have 0 legs.\n*   **Elephant:** An elephant has 4 legs.\n\n**Total:** 18 + 0 + 4 = 22 legs\n\n**Answer:** There are a total of 22 legs.",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 3
+          },
+          "animals": {
+            "wasp": 3,
+            "jellyfish": 3,
+            "elephant": 1
+          },
+          "total_legs": 22
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 2 leechs, 2 crickets, 3 shrimps, 2 praying mantiss, 2 woodlouses?",
+        "expected_answer": "82",
+        "model_answer": "Here's the breakdown of the legs:\n\n*   **Leeches:** 0 legs each, so 2 leeches have 2 * 0 = 0 legs\n*   **Crickets:** 6 legs each, so 2 crickets have 2 * 6 = 12 legs\n*   **Shrimps:** 10 legs each, so 3 shrimps have 3 * 10 = 30 legs\n*   **Praying Mantises:** 6 legs each, so 2 praying mantises have 2 * 6 = 12 legs\n*   **Woodlice (Pillbugs/Roly-Polies):** 14 legs each, so 2 woodlice have 2 * 14 = 28 legs\n\nTotal: 0 + 12 + 30 + 12 + 28 = 82 legs\n\nSo the answer is 82",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 5
+          },
+          "animals": {
+            "leech": 2,
+            "cricket": 2,
+            "shrimp": 3,
+            "praying mantis": 2,
+            "woodlouse": 2
+          },
+          "total_legs": 82
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 2 insects, 1 duck, 1 lobster, 3 starfishs, 1 scorpion, 2 horses, 3 butterflys?",
+        "expected_answer": "73",
+        "model_answer": "Here's the breakdown:\n\n*   **Insects:** 2 insects * 6 legs/insect = 12 legs\n*   **Duck:** 1 duck * 2 legs/duck = 2 legs\n*   **Lobster:** 1 lobster * 10 legs/lobster = 10 legs\n*   **Starfish:** 3 starfish * 0 legs/starfish = 0 legs\n*   **Scorpion:** 1 scorpion * 8 legs/scorpion = 8 legs\n*   **Horses:** 2 horses * 4 legs/horse = 8 legs\n*   **Butterflies:** 3 butterflies * 6 legs/butterfly = 18 legs\n\n**Total:** 12 + 2 + 10 + 0 + 8 + 8 + 18 = **58 legs**",
+        "score": 0.01,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 7
+          },
+          "animals": {
+            "insect": 2,
+            "duck": 1,
+            "lobster": 1,
+            "starfish": 3,
+            "scorpion": 1,
+            "horse": 2,
+            "butterfly": 3
+          },
+          "total_legs": 73
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 2 butterflys, 3 grasshoppers, 3 humans?",
+        "expected_answer": "36",
+        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Butterflies:** 2 butterflies * 6 legs/butterfly = 12 legs\n*   **Grasshoppers:** 3 grasshoppers * 6 legs/grasshopper = 18 legs\n*   **Humans:** 3 humans * 2 legs/human = 6 legs\n\n**Total:** 12 + 18 + 6 = 36 legs\n\nThere are a total of 36 legs.",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 3
+          },
+          "animals": {
+            "butterfly": 2,
+            "grasshopper": 3,
+            "human": 3
+          },
+          "total_legs": 36
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 3 deers, 2 beetles, 1 giraffe, 3 fireflys, 3 ants, 1 chicken?",
+        "expected_answer": "66",
+        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Deer:** 3 deer * 4 legs/deer = 12 legs\n*   **Beetles:** 2 beetles * 6 legs/beetle = 12 legs\n*   **Giraffe:** 1 giraffe * 4 legs/giraffe = 4 legs\n*   **Fireflys:** 3 fireflys * 6 legs/firefly = 18 legs\n*   **Ants:** 3 ants * 6 legs/ant = 18 legs\n*   **Chicken:** 1 chicken * 2 legs/chicken = 2 legs\n\n**Total:** 12 + 12 + 4 + 18 + 18 + 2 = **66 legs**",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 6
+          },
+          "animals": {
+            "deer": 3,
+            "beetle": 2,
+            "giraffe": 1,
+            "firefly": 3,
+            "ant": 3,
+            "chicken": 1
+          },
+          "total_legs": 66
+        }
+      },
+      {
+        "question": "How many legs are there in total if you have 1 scorpion, 3 shrimps, 2 cats, 2 giraffes?",
+        "expected_answer": "54",
+        "model_answer": "Let's count the legs of each animal and then add them up:\n\n*   **Scorpion:** 8 legs\n*   **Shrimp:** 10 legs\n*   **Cat:** 4 legs\n*   **Giraffe:** 4 legs\n\nNow let's calculate the total:\n\n*   1 scorpion * 8 legs/scorpion = 8 legs\n*   3 shrimps * 10 legs/shrimp = 30 legs\n*   2 cats * 4 legs/cat = 8 legs\n*   2 giraffes * 4 legs/giraffe = 8 legs\n\nFinally, add them all together: 8 + 30 + 8 + 8 = 54 legs\n\nSo the answer is $\\boxed{54}$\n",
+        "score": 0.5,
+        "metadata": {
+          "difficulty": {
+            "num_animals": 4
+          },
+          "animals": {
+            "scorpion": 1,
+            "shrimp": 3,
+            "cat": 2,
+            "giraffe": 2
+          },
+          "total_legs": 54
+        }
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
new file mode 100644
index 00000000..92d49c86
--- /dev/null
+++ b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
@@ -0,0 +1,12 @@
+{
+  "metrics": {
+    "dataset_name": "leg_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "size": 10,
+    "seed": 42,
+    "average_score": 0,
+    "total_examples": 0,
+    "timestamp": "2025-02-10T06:06:08.539389"
+  },
+  "results": []
+}
\ No newline at end of file
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
new file mode 100644
index 00000000..cbac4104
--- /dev/null
+++ b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
@@ -0,0 +1,186 @@
+{
+  "metrics": {
+    "dataset_name": "leg_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "size": 10,
+    "seed": 42,
+    "average_score": 0.45099999999999996,
+    "total_examples": 10,
+    "timestamp": "2025-02-10T06:07:53.290258"
+  },
+  "results": [
+    {
+      "question": "How many legs are there in total if you have 1 sea slug, 1 deer?",
+      "expected_answer": "4",
+      "model_answer": "*   A sea slug has no legs.\n*   A deer has 4 legs.\n\nTherefore, there are a total of 0 + 4 = $\\boxed{4}$ legs.\n",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 2
+        },
+        "animals": {
+          "sea slug": 1,
+          "deer": 1
+        },
+        "total_legs": 4
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 2 sheeps, 2 dogs?",
+      "expected_answer": "16",
+      "model_answer": "* Sheep have 4 legs each, so 2 sheep have 2 * 4 = 8 legs\n* Dogs have 4 legs each, so 2 dogs have 2 * 4 = 8 legs\n\nIn total, there are 8 + 8 = **16** legs.\n",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 2
+        },
+        "animals": {
+          "sheep": 2,
+          "dog": 2
+        },
+        "total_legs": 16
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 1 crab, 2 lobsters, 1 human, 1 cow, 1 bee?",
+      "expected_answer": "42",
+      "model_answer": "Here's the breakdown:\n\n*   **Crab:** 10 legs\n*   **Lobster:** 10 legs each, so 2 * 10 = 20 legs\n*   **Human:** 2 legs\n*   **Cow:** 4 legs\n*   **Bee:** 6 legs\n\n**Total:** 10 + 20 + 2 + 4 + 6 = **42 legs**",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 5
+        },
+        "animals": {
+          "crab": 1,
+          "lobster": 2,
+          "human": 1,
+          "cow": 1,
+          "bee": 1
+        },
+        "total_legs": 42
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 2 grasshoppers, 2 spiders, 1 tiger, 1 chicken?",
+      "expected_answer": "34",
+      "model_answer": "Here's the breakdown:\n\n*   **Grasshopper:** 6 legs each, so 2 grasshoppers have 2 * 6 = 12 legs\n*   **Spider:** 8 legs each, so 2 spiders have 2 * 8 = 16 legs\n*   **Tiger:** 4 legs\n*   **Chicken:** 2 legs\n\nTotal: 12 + 16 + 4 + 2 = 34 legs\n\nThere are a total of $\\boxed{34}$ legs.",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 4
+        },
+        "animals": {
+          "grasshopper": 2,
+          "spider": 2,
+          "tiger": 1,
+          "chicken": 1
+        },
+        "total_legs": 34
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 3 wasps, 3 jellyfishs?",
+      "expected_answer": "18",
+      "model_answer": "*   **Wasps:** Each wasp has 6 legs, so 3 wasps have 3 * 6 = 18 legs.\n*   **Jellyfish:** Jellyfish do not have legs.\n\n**Total:** 18 + 0 = 18 legs\n\nThere are a total of $\\boxed{18}$ legs.\n",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 2
+        },
+        "animals": {
+          "wasp": 3,
+          "jellyfish": 3
+        },
+        "total_legs": 18
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 3 leechs, 2 crickets, 2 shrimps, 3 praying mantiss?",
+      "expected_answer": "50",
+      "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Leeches:** Leeches don't have legs.\n*   **Crickets:** 2 Crickets * 6 legs/cricket = 12 legs\n*   **Shrimp:** 2 Shrimps * 10 legs/shrimp = 20 legs\n*   **Praying Mantises:** 3 Mantises * 6 legs/mantis = 18 legs\n\n**Total:** 0 + 12 + 20 + 18 = 50 legs\n\nThere are a total of 50 legs.",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 4
+        },
+        "animals": {
+          "leech": 3,
+          "cricket": 2,
+          "shrimp": 2,
+          "praying mantis": 3
+        },
+        "total_legs": 50
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 3 ducks, 1 shrimp, 3 lobsters, 2 starfishs?",
+      "expected_answer": "56",
+      "model_answer": "Here's how to calculate the total number of legs:\n\n*   Ducks: 3 ducks * 2 legs/duck = 6 legs\n*   Shrimp: 1 shrimp * 10 legs/shrimp = 10 legs\n*   Lobsters: 3 lobsters * 10 legs/lobster = 30 legs\n*   Starfish: 2 starfish * 0 legs/starfish = 0 legs\n\nTotal: 6 + 10 + 30 + 0 = 46 legs\n\nThere are a total of 46 legs.",
+      "score": 0.01,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 4
+        },
+        "animals": {
+          "duck": 3,
+          "shrimp": 1,
+          "lobster": 3,
+          "starfish": 2
+        },
+        "total_legs": 56
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 1 butterfly, 2 grasshoppers?",
+      "expected_answer": "18",
+      "model_answer": "* A butterfly has 6 legs.\n* A grasshopper has 6 legs.\n\nSo, 1 butterfly has 6 legs, and 2 grasshoppers have 2 * 6 = 12 legs.\n\nIn total, there are 6 + 12 = $\\boxed{18}$ legs.\n",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 2
+        },
+        "animals": {
+          "butterfly": 1,
+          "grasshopper": 2
+        },
+        "total_legs": 18
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 1 deer, 3 beetles, 2 giraffes, 1 firefly, 3 ants?",
+      "expected_answer": "54",
+      "model_answer": "Let's count the legs for each animal:\n* Deer: 1 deer * 4 legs/deer = 4 legs\n* Beetles: 3 beetles * 6 legs/beetle = 18 legs\n* Giraffes: 2 giraffes * 4 legs/giraffe = 8 legs\n* Firefly: 1 firefly * 6 legs/firefly = 6 legs\n* Ants: 3 ants * 6 legs/ant = 18 legs\n\nNow, add up the legs for all the animals:\n4 + 18 + 8 + 6 + 18 = 54 legs\n\nTherefore, there are a total of 54 legs.\n\nFinal Answer: The final answer is $\\boxed{54}$",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 5
+        },
+        "animals": {
+          "deer": 1,
+          "beetle": 3,
+          "giraffe": 2,
+          "firefly": 1,
+          "ant": 3
+        },
+        "total_legs": 54
+      }
+    },
+    {
+      "question": "How many legs are there in total if you have 1 scorpion, 1 shrimp, 3 cats?",
+      "expected_answer": "30",
+      "model_answer": "Here's how to figure that out:\n\n*   **Scorpion:** Scorpions have 8 legs.\n*   **Shrimp:** Shrimp have 10 legs.\n*   **Cats:** Cats have 4 legs each, so 3 cats have 3 * 4 = 12 legs.\n\n**Total:** 8 + 10 + 12 = 30 legs\n\nThere are a total of 30 legs.",
+      "score": 0.5,
+      "metadata": {
+        "difficulty": {
+          "num_animals": 3
+        },
+        "animals": {
+          "scorpion": 1,
+          "shrimp": 1,
+          "cat": 3
+        },
+        "total_legs": 30
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
new file mode 100644
index 00000000..590a48d6
--- /dev/null
+++ b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
@@ -0,0 +1,12 @@
+{
+  "metrics": {
+    "dataset_name": "propositional_logic",
+    "model": "google/gemini-2.0-flash-001",
+    "size": 10,
+    "seed": 42,
+    "average_score": 0,
+    "total_examples": 0,
+    "timestamp": "2025-02-10T06:06:10.638347"
+  },
+  "results": []
+}
\ No newline at end of file
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
new file mode 100644
index 00000000..8144479d
--- /dev/null
+++ b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
@@ -0,0 +1,201 @@
+{
+  "metrics": {
+    "dataset_name": "propositional_logic",
+    "model": "google/gemini-2.0-flash-001",
+    "size": 10,
+    "seed": 42,
+    "average_score": 0.059,
+    "total_examples": 10,
+    "timestamp": "2025-02-10T06:08:24.037697"
+  },
+  "results": [
+    {
+      "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
+      "expected_answer": "(P \u2228 Q)",
+      "model_answer": "Given only R and Q, we can conclude that **R and Q are true** (according to propositional logic).\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "R",
+          "Q"
+        ],
+        "variables": [
+          "P",
+          "Q",
+          "R",
+          "S"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. ((Q \u2192 P) \u2228 (Q \u2192 P))\n2. ((Q \u2194 Q) \u2192 (P \u2192 P))\n3. P\nWhat can we conclude?",
+      "expected_answer": "(P \u2192 P)",
+      "model_answer": "Let's break down each premise and see what we can derive.\n\n* **1. ((Q \u2192 P) \u2228 (Q \u2192 P))** This is logically equivalent to (Q \u2192 P).  This is because any statement 'A or A' is simply 'A'.  So we now have:  **Q \u2192 P**\n\n* **2. ((Q \u2194 Q) \u2192 (P \u2192 P))**  The statement (Q \u2194 Q) is a tautology (always true).  The statement (P \u2192 P) is also a tautology.  Therefore, (True \u2192 True) is True. This premise is always true and doesn't provide us with any new information that helps us directly derive something else.  We can essentially ignore it.\n\n* **3. P** This is simply **P**.\n\nNow we have:\n\n* Q \u2192 P\n* P\n\nFrom these two statements, we can't directly conclude anything.\n\nRemember the Modus Ponens and Modus Tollens rules:\n\n*   **Modus Ponens:** If you have P \u2192 Q and P, you can conclude Q.\n*   **Modus Tollens:** If you have P \u2192 Q and \u00acQ, you can conclude \u00acP.\n\nIn our case, we have Q \u2192 P and P.   This matches the *reverse* of Modus Ponens, which is *not* a valid inference rule.  Knowing that the *consequent* (P) is true *doesn't* tell us anything definitively about the *antecedent* (Q).  Q could be either true or false.\n\nTherefore, based on the given premises, we cannot conclude anything further about Q's truth value.\n",
+      "score": 0.5,
+      "metadata": {
+        "premises": [
+          "((Q \u2192 P) \u2228 (Q \u2192 P))",
+          "((Q \u2194 Q) \u2192 (P \u2192 P))",
+          "P"
+        ],
+        "variables": [
+          "P",
+          "Q"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. ((Q \u2228 P) \u2227 \u00acP)\n2. P\n3. ((P \u2227 R) \u2227 \u00acR)\n4. ((Q \u2194 R) \u2192 \u00acQ)\nWhat can we conclude?",
+      "expected_answer": "(Q \u2227 Q)",
+      "model_answer": "Let's analyze each given statement:\n\n1. **((Q \u2228 P) \u2227 \u00acP)**:  This says that either Q or P is true *and* P is false.  Since P is false, Q must be true.  So, we can conclude **Q**.\n\n2. **P**:  This simply states that P is true.\n\n3. **((P \u2227 R) \u2227 \u00acR)**: This reads as \"Both P and R are true, *and* R is false.\"  This is a contradiction. Since P and R are true, you cannot conclude that R is false.\n\n4. **((Q \u2194 R) \u2192 \u00acQ)**: This says \"If Q is equivalent to R (i.e., they have the same truth value), then Q is false.\"\n\nNow, let's see what we can derive, considering information from all the premises.\n\n* **From premise 1:**  We *appear* to conclude Q.  However premise 2 states that P is true. This contradicts the information provided in premise 1 that states P is false. Therefore, we have contradictory information. \n\n* **Premise 3 is a contradiction.**\n\nLet's consider the assumptions independently.\n\n* **Assuming premise 1 is true in isolation:**  Then, P is false and Q is true.\n\n* **Assuming premise 2 is true in isolation:** Then, P is true.\n\n* **Assuming premise 3 is true in isolation:**  Then, P and R are true and R is false. 
This is itself a contradiction and therefore cannot be true in isolation.\n\n* **Assuming premise 4 is true in isolation:**  It tells us the dependence between the truth values of Q and R, but doesn't give us the true/false result of either variable on their own.\n\nSince we have contradicting information in the premises, we must be able to reach a conclusion.\n\n**Analyzing for Contradiction:**\n\nSince premise 1 stated that Q is true only if \u00acP is True, yet premise 2 asserts that P is True, then we know that premise (1\u22272) \u22a2 False.\n\nPremise 3 also claims that ((P \u2227 R) \u2227 \u00acR) is true which again cannot be true since (R\u2227\u00acR) evaluates to False.\n\nPremise 4 will only create a contradiction if Q \u2194 R evaluates to True but \u00acQ evaluates to False. Since Q is true from Premise 1, this implies that R must also be True, yet we cannot determine whether this is indeed true or false. We can only determine this, if we remove the contradictory information given regarding Q and R across all the premises.\n\n**Conclusion with contradiction:**\n\nWith contradictory premises, we can conclude *anything*.  This is a principle of classical logic called \"ex falso quodlibet\" (from falsehood, anything follows), often simplified as \"explosion\".  Since the collection of premises contains a contradiction, we can use it derive any conclusion.\n\n**Example:**\n\n1. Q is true (from premise 1, under the assumption it's only true premise)\n2. P is true is also true (from premise 2 under the assumption it's the only true premise)\n3. P and R is true and R is false (from premise 3 with the same provision).\n4. 
(Q \u2194 R) \u2192 \u00acQ (from premise 4 and depends on the premise results from the above to determine whether Q is true, but we can only determine that when the contradictory information is omitted from the statement/statements/premises).\n\nBecause premises do not provide consistent rules to determine truth values of P, Q, R, and therefore contradiction, then anything follows.\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "((Q \u2228 P) \u2227 \u00acP)",
+          "P",
+          "((P \u2227 R) \u2227 \u00acR)",
+          "((Q \u2194 R) \u2192 \u00acQ)"
+        ],
+        "variables": [
+          "P",
+          "Q",
+          "R"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. \u00acP\n2. \u00acP\n3. Q\nWhat can we conclude?",
+      "expected_answer": "(Q \u2228 R)",
+      "model_answer": "Given the premises:\n\n1.  \u00acP\n2.  \u00acP\n3.  Q\n\nWe can conclude **Q**.\n\nThe repetition of \u00acP doesn't add any new information. This is a very simple case where 'Q' is stated directly in the premises. There's no need for any complex logical deductions.\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "\u00acP",
+          "\u00acP",
+          "Q"
+        ],
+        "variables": [
+          "P",
+          "Q",
+          "R"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. P\n2. ((P \u2228 P) \u2194 (Q \u2227 P))\n3. P\nWhat can we conclude?",
+      "expected_answer": "(Q \u2194 Q)",
+      "model_answer": "Let's analyze the given statements:\n\n1. P is true.\n2. ((P \u2228 P) \u2194 (Q \u2227 P)) - This statement tells us that the truth value of (P \u2228 P) is equivalent to the truth value of (Q \u2227 P).\n\nSince we know P is true from premise 1, we can simplify:\n\n* **P \u2228 P is true:**  Because P is true, disjoining it with itself makes the disjunction true.\n* **Since (P \u2228 P) \u2194 (Q \u2227 P) is true and (P \u2228 P) is true, then (Q \u2227 P) must also be true.**\n* **(Q \u2227 P) is true, and P is true, therefore Q must be true.**  For the conjunction (Q \u2227 P) to be true, both Q and P must be true.\n\nTherefore, we can conclude that **Q is true**.\n\nSo the conclusion includes P and Q.\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "P",
+          "((P \u2228 P) \u2194 (Q \u2227 P))",
+          "P"
+        ],
+        "variables": [
+          "P",
+          "Q"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. (Q \u2194 R)\n2. \u00acR\nWhat can we conclude?",
+      "expected_answer": "(Q \u2192 R)",
+      "model_answer": "From the given premises:\n\n1.  Q \u2194 R (Q is equivalent to R)\n2.  \u00acR (R is false)\n\nSince Q is equivalent to R, this means that if R is false, then Q must also be false.\n\nTherefore, we can conclude:\n\n\u00acQ (Q is false)\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "(Q \u2194 R)",
+          "\u00acR"
+        ],
+        "variables": [
+          "P",
+          "Q",
+          "R"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. R\n2. ((Q \u2192 Q) \u2228 (S \u2227 Q))\n3. Q\nWhat can we conclude?",
+      "expected_answer": "(S \u2228 Q)",
+      "model_answer": "Let's analyze the given statements:\n\n*   **1. R** - This is a simple assertion: R is true.\n\n*   **2. ((Q \u2192 Q) \u2228 (S \u2227 Q))** -  Let's break this down:\n    *   `(Q \u2192 Q)`: This is a tautology.  Any proposition implying itself is always true.\n    *   `(S \u2227 Q)`: This is true if and only if both S and Q are true.\n    *   `\u2228`:  The entire statement is true if either `(Q \u2192 Q)` is true, or `(S \u2227 Q)` is true.  Since `(Q \u2192 Q)` is always true, this entire statement is always true, regardless of the truth value of S.\n\n*   **3. Q** - This asserts that Q is true.\n\nNow, what can we conclude?\n\n*   **We know R is true.** (Directly from statement 1).\n*   **We know Q is true.** (Directly from statement 3).\n\nFrom the second statement, `((Q \u2192 Q) \u2228 (S \u2227 Q))`, we can note the following:\n\n*   Because `Q \u2192 Q` is always true (a tautology), `((Q \u2192 Q) \u2228 (S \u2227 Q))` is always true, regardless of the truth value of S. This statement doesn't help us conclude anything new about S, but it does provide some information about the relationship between Q and itself, and Q and S.\n* Because we also know that Q is true, and that `((Q \u2192 Q) \u2228 (S \u2227 Q))` is true, the `(S \u2227 Q)` portion could be true or false. If it's true, then S must also be true. If it's false, then S must be false. However, because `((Q \u2192 Q) \u2228 (S \u2227 Q))` is true, and Q is true, we cannot know for sure if S must be true or false. \n\nTherefore, the key conclusions we can draw are:\n\n*   **R is true.**\n*   **Q is true.**\n\nWe **cannot** definitively conclude anything about the truth value of S.\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "R",
+          "((Q \u2192 Q) \u2228 (S \u2227 Q))",
+          "Q"
+        ],
+        "variables": [
+          "P",
+          "Q",
+          "R",
+          "S"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. (Q \u2227 P)\n2. (\u00acP \u2227 (P \u2192 P))\n3. \u00ac(Q \u2194 Q)\nWhat can we conclude?",
+      "expected_answer": "(P \u2227 P)",
+      "model_answer": "Let's analyze each statement:\n\n1. **(Q \u2227 P):** This statement tells us that both Q and P are true.\n\n2. **(\u00acP \u2227 (P \u2192 P)):**\n    * `\u00acP`: This means P is false.\n    * `(P \u2192 P)`: This is always true (a tautology).  Any proposition implies itself.\n\n3. **\u00ac(Q \u2194 Q):**\n    * `(Q \u2194 Q)`:  This means \"Q if and only if Q\". This is always true (a tautology).\n    * `\u00ac(Q \u2194 Q)`: This means \"not (Q if and only if Q)\".  Since `(Q \u2194 Q)` is always true, its negation is always false.\n\nNow let's look for contradictions:\n\n* Statement 1 says P is true.\n* Statement 2 says P is false.\n\nTherefore, **we have a contradiction**. The set of premises is **inconsistent**.  We can conclude that the statements cannot all be true simultaneously; there's no truth assignment that will satisfy all three statements.\n\nTherefore, we can conclude that the premises are inconsistent.\n\n**Final Answer: The set of premises is inconsistent.**\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "(Q \u2227 P)",
+          "(\u00acP \u2227 (P \u2192 P))",
+          "\u00ac(Q \u2194 Q)"
+        ],
+        "variables": [
+          "P",
+          "Q"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. (R \u2228 Q)\n2. (R \u2227 Q)\n3. R\nWhat can we conclude?",
+      "expected_answer": "(P \u2194 P)",
+      "model_answer": "From the given statements:\n\n*   **1. (R \u2228 Q)** - This tells us that R or Q (or both) is true.\n*   **2. (R \u2227 Q)** - This tells us that both R and Q are true.\n*   **3. R** - This tells us that R is true.\n\nWhile statement 3. R is already confirmed in statement 2., so, knowing that both R and Q are true from statement 2, we can definitively conclude:\n\n*   **Q** is true.\n\nHere's a breakdown:\n\nStatement 2 tells us R \u2227 Q.  For a conjunction (\u2227) to be true, both parts must be true therefore R and Q must be true.\n\nSo, the most direct and strongest conclusion we can make is:\n\n*  **Q is true**\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "(R \u2228 Q)",
+          "(R \u2227 Q)",
+          "R"
+        ],
+        "variables": [
+          "P",
+          "Q",
+          "R"
+        ],
+        "complexity": 3
+      }
+    },
+    {
+      "question": "Given:\n1. ((P \u2228 Q) \u2228 (Q \u2192 Q))\n2. \u00ac(Q \u2192 Q)\n3. ((P \u2227 Q) \u2227 (Q \u2227 P))\n4. (Q \u2194 Q)\nWhat can we conclude?",
+      "expected_answer": "(P \u2192 P)",
+      "model_answer": "Let's analyze each statement:\n\n1.  **((P \u2228 Q) \u2228 (Q \u2192 Q))**:  We know that `Q \u2192 Q` is always true (a tautology). Therefore, `(Q \u2192 Q)` is equivalent to `T` (True). The statement becomes `((P \u2228 Q) \u2228 T)`.  Since anything ORed with True is always True, this whole statement simplifies to **T (True)**.  It tells us nothing about the values of P or Q.\n\n2.  **\u00ac(Q \u2192 Q)**: We know that `Q \u2192 Q` is always true.  Therefore, `\u00ac(Q \u2192 Q)` is equivalent to `\u00acT`, which is **F (False)**.  This is a contradiction given that the argument is valid.\n\n3.  **((P \u2227 Q) \u2227 (Q \u2227 P))**:  This statement is equivalent to `(P \u2227 Q)`. This is because `P \u2227 Q` is logically equivalent to `Q \u2227 P`. So, we can say that `P` is True **AND** `Q` is True.\n\n4.  **(Q \u2194 Q)**:  This is a biconditional, meaning \"Q if and only if Q\".  This statement is always true (a tautology) regardless of the value of Q. So this is equivalent to **T (True)**, and again, it tells us nothing about the value of Q.\n\n**Conclusion**:\n\nThe argument presents contradictory statements:\n\nStatement 1: Always True.\nStatement 2: Always False.\nStatement 3: P is true AND Q is true.\nStatement 4: Always true.\n\nStatement 2 contradicts both statement 1 and statement 4.\n\n**Therefore, we can conclude that the set of premises is inconsistent due to the contradiction in premise 2 \"\u00ac(Q \u2192 Q)\".**\n\nWithout premise 2, we could conclude that P is true AND Q is true from premise 3. The logical equivalence `(P \u2227 Q) \u2194 (Q \u2194 Q)` can be derived by simply checking truth tables.\n",
+      "score": 0.01,
+      "metadata": {
+        "premises": [
+          "((P \u2228 Q) \u2228 (Q \u2192 Q))",
+          "\u00ac(Q \u2192 Q)",
+          "((P \u2227 Q) \u2227 (Q \u2227 P))",
+          "(Q \u2194 Q)"
+        ],
+        "variables": [
+          "P",
+          "Q"
+        ],
+        "complexity": 3
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
new file mode 100644
index 00000000..661bdc24
--- /dev/null
+++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
@@ -0,0 +1,39 @@
+[
+  {
+    "dataset_name": "letter_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.20600000000000002,
+    "total_examples": 10,
+    "timestamp": "2025-02-10T06:34:37.091554",
+    "config": {
+      "min_words": 5,
+      "max_words": 15,
+      "size": 10,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "propositional_logic",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.059,
+    "total_examples": 10,
+    "timestamp": "2025-02-10T06:35:11.432275",
+    "config": {
+      "size": 10,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "leg_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.40199999999999997,
+    "total_examples": 10,
+    "timestamp": "2025-02-10T06:35:27.087469",
+    "config": {
+      "min_animals": 3,
+      "max_animals": 8,
+      "size": 10,
+      "seed": 42
+    }
+  }
+]
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b96fc1c2..b082f034 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -5,3 +5,4 @@ isort>=5.13.2
 flake8>=7.1.1
 mypy>=1.14.1
 pre-commit>=4.1.0
+openai>=1.61.1
\ No newline at end of file

From 2308ed99fb31014ee92bd8ac58bc08a3821e9579 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sun, 9 Feb 2025 22:40:45 -0800
Subject: [PATCH 09/11] [eval-basic] run precommit formatting

---
 eval/eval_basic.json                          |   2 +-
 eval/eval_basic.py                            | 102 ++++++++----------
 eval/eval_basic.sh                            |   2 +-
 ..._gemini-2.0-flash-001_20250209_223527.json |   2 +-
 ..._gemini-2.0-flash-001_20250209_220608.json |   2 +-
 ..._gemini-2.0-flash-001_20250209_220753.json |   2 +-
 ..._gemini-2.0-flash-001_20250209_220610.json |   2 +-
 ..._gemini-2.0-flash-001_20250209_220824.json |   2 +-
 ..._gemini-2.0-flash-001_20250209_223527.json |   2 +-
 requirements-dev.txt                          |   2 +-
 10 files changed, 53 insertions(+), 67 deletions(-)

diff --git a/eval/eval_basic.json b/eval/eval_basic.json
index 8e96863c..6240ce9d 100644
--- a/eval/eval_basic.json
+++ b/eval/eval_basic.json
@@ -18,4 +18,4 @@
       "size": 10,
       "seed": 42
     }
-  ]
\ No newline at end of file
+  ]
diff --git a/eval/eval_basic.py b/eval/eval_basic.py
index 6b605487..f8952e10 100644
--- a/eval/eval_basic.py
+++ b/eval/eval_basic.py
@@ -1,18 +1,17 @@
 import argparse
-from datetime import datetime
 import json
 import os
-from openai import OpenAI
+from datetime import datetime
 from typing import Any, Dict, List
 
+from openai import OpenAI
+
 from reasoning_gym.factory import DATASETS, create_dataset
 
+
 class OpenRouterEvaluator:
     def __init__(self, model: str):
-        self.client = OpenAI(
-            base_url="https://openrouter.ai/api/v1",
-            api_key=os.getenv('OPENROUTER_API_KEY')
-        )
+        self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY"))
         self.model = model
         self.extra_headers = {}
 
@@ -20,12 +19,7 @@ class OpenRouterEvaluator:
         """Get response from the model via OpenRouter API."""
         try:
             completion = self.client.chat.completions.create(
-                extra_headers=self.extra_headers,
-                model=self.model,
-                messages=[{
-                    "role": "user",
-                    "content": prompt
-                }]
+                extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}]
             )
             return completion.choices[0].message.content
         except Exception as e:
@@ -35,27 +29,27 @@ class OpenRouterEvaluator:
     def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Evaluate model on multiple datasets with their respective configurations."""
         all_results = []
-        
+
         for dataset_config in dataset_configs:
-            dataset_name = dataset_config.pop('name')
+            dataset_name = dataset_config.pop("name")
             print(f"\nEvaluating dataset: {dataset_name}")
-            
+
             try:
                 # Create dataset with its specific configuration
                 data = create_dataset(dataset_name, **dataset_config)
                 results = []
-                
+
                 for entry in data:
                     try:
-                        response = self.get_model_response(entry['question'])
+                        response = self.get_model_response(entry["question"])
                         score = data.score_answer(answer=response, entry=entry)
 
                         result = {
-                            'question': entry['question'],
-                            'expected_answer': entry['answer'],
-                            'model_answer': response,
-                            'score': score,
-                            'metadata': entry['metadata']
+                            "question": entry["question"],
+                            "expected_answer": entry["answer"],
+                            "model_answer": response,
+                            "score": score,
+                            "metadata": entry["metadata"],
                         }
                         results.append(result)
                         print(f"Processed question {len(results)}/{len(data)}. Score: {score}")
@@ -65,21 +59,18 @@ class OpenRouterEvaluator:
                         print(f"Error: {str(e)}")
 
                 # Calculate aggregate metrics
-                total_score = sum(r['score'] for r in results)
+                total_score = sum(r["score"] for r in results)
                 metrics = {
-                    'dataset_name': dataset_name,
-                    'model': self.model,
-                    'size': len(data),
-                    'average_score': total_score / len(results) if results else 0,
-                    'total_examples': len(results),
-                    'timestamp': datetime.now().isoformat(),
-                    'config': dataset_config
+                    "dataset_name": dataset_name,
+                    "model": self.model,
+                    "size": len(data),
+                    "average_score": total_score / len(results) if results else 0,
+                    "total_examples": len(results),
+                    "timestamp": datetime.now().isoformat(),
+                    "config": dataset_config,
                 }
 
-                all_results.append({
-                    'metrics': metrics,
-                    'results': results
-                })
+                all_results.append({"metrics": metrics, "results": results})
 
             except Exception as e:
                 print(f"Error evaluating dataset {dataset_name}: {str(e)}")
@@ -89,13 +80,10 @@ class OpenRouterEvaluator:
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description='Evaluate models on reasoning datasets')
-    parser.add_argument('--model', required=True, help='Model to evaluate')
-    parser.add_argument('--config', required=True, 
-                       help='Path to JSON configuration file')
-    parser.add_argument('--output-dir', default='results',
-                       help='Output directory')
+    parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
+    parser.add_argument("--model", required=True, help="Model to evaluate")
+    parser.add_argument("--config", required=True, help="Path to JSON configuration file")
+    parser.add_argument("--output-dir", default="results", help="Output directory")
 
     args = parser.parse_args()
 
@@ -103,7 +91,7 @@ def main():
     os.makedirs(args.output_dir, exist_ok=True)
 
     # Load dataset configurations
-    with open(args.config, 'r') as f:
+    with open(args.config, "r") as f:
         dataset_configs = json.load(f)
 
     evaluator = OpenRouterEvaluator(model=args.model)
@@ -111,35 +99,33 @@ def main():
 
     # Save results
     output_file = os.path.join(
-        args.output_dir,
-        f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        args.output_dir, f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     )
 
     # Save detailed results
-    with open(output_file, 'w') as f:
+    with open(output_file, "w") as f:
         json.dump(all_results, f, indent=2)
 
     # Create summary
     summary = []
     for result in all_results:
-        metrics = result['metrics']
+        metrics = result["metrics"]
         summary_entry = {
-            'dataset_name': metrics['dataset_name'],
-            'model': metrics['model'],
-            'average_score': metrics['average_score'],
-            'total_examples': metrics['total_examples'],
-            'timestamp': metrics['timestamp'],
-            'config': metrics['config']
+            "dataset_name": metrics["dataset_name"],
+            "model": metrics["model"],
+            "average_score": metrics["average_score"],
+            "total_examples": metrics["total_examples"],
+            "timestamp": metrics["timestamp"],
+            "config": metrics["config"],
         }
         summary.append(summary_entry)
 
     # Save summary to a separate file
     summary_file = os.path.join(
-        args.output_dir,
-        f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        args.output_dir, f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     )
-    
-    with open(summary_file, 'w') as f:
+
+    with open(summary_file, "w") as f:
         json.dump(summary, f, indent=2)
 
     # Print summary
@@ -148,10 +134,10 @@ def main():
         print(f"\nDataset: {entry['dataset_name']}")
         print(f"Average Score: {entry['average_score']:.2%}")
         print(f"Total Examples: {entry['total_examples']}")
-    
+
     print(f"\nDetailed results saved to: {output_file}")
     print(f"Summary saved to: {summary_file}")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/eval/eval_basic.sh b/eval/eval_basic.sh
index 2a7eb9bc..69f5251e 100644
--- a/eval/eval_basic.sh
+++ b/eval/eval_basic.sh
@@ -27,4 +27,4 @@ for model in "${MODELS[@]}"; do
         --output-dir "$OUTPUT_DIR"
 done
 
-echo "All evaluations completed!"
\ No newline at end of file
+echo "All evaluations completed!"
diff --git a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
index 2d4a5a93..5e82c747 100644
--- a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
+++ b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
@@ -644,4 +644,4 @@
       }
     ]
   }
-]
\ No newline at end of file
+]
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
index 92d49c86..cb8b45cc 100644
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
+++ b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
@@ -9,4 +9,4 @@
     "timestamp": "2025-02-10T06:06:08.539389"
   },
   "results": []
-}
\ No newline at end of file
+}
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
index cbac4104..16bd308e 100644
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
+++ b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
@@ -183,4 +183,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
index 590a48d6..49697c4b 100644
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
+++ b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
@@ -9,4 +9,4 @@
     "timestamp": "2025-02-10T06:06:10.638347"
   },
   "results": []
-}
\ No newline at end of file
+}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
index 8144479d..13f80c7a 100644
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
+++ b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
@@ -198,4 +198,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
index 661bdc24..8c9d6a5c 100644
--- a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
+++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
@@ -36,4 +36,4 @@
       "seed": 42
     }
   }
-]
\ No newline at end of file
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b082f034..18cbc82d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -5,4 +5,4 @@ isort>=5.13.2
 flake8>=7.1.1
 mypy>=1.14.1
 pre-commit>=4.1.0
-openai>=1.61.1
\ No newline at end of file
+openai>=1.61.1

From 03f87dbc075a3cb3a17e1a8f65b6689651c38bad Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sun, 9 Feb 2025 22:52:10 -0800
Subject: [PATCH 10/11] [eval-basic] remove large results files, add gitignore,
 only leave summary

---
 eval/.gitignore                               |   2 +
 eval/__init__.py                              |   0
 eval/{eval_basic.py => eval.py}               |   0
 eval/{eval_basic.sh => eval.sh}               |   2 +-
 ..._gemini-2.0-flash-001_20250209_223527.json | 647 ------------------
 ..._gemini-2.0-flash-001_20250209_220608.json |  12 -
 ..._gemini-2.0-flash-001_20250209_220753.json | 186 -----
 ..._gemini-2.0-flash-001_20250209_220610.json |  12 -
 ..._gemini-2.0-flash-001_20250209_220824.json | 201 ------
 ..._gemini-2.0-flash-001_20250209_224813.json |  39 ++
 10 files changed, 42 insertions(+), 1059 deletions(-)
 create mode 100644 eval/.gitignore
 delete mode 100644 eval/__init__.py
 rename eval/{eval_basic.py => eval.py} (100%)
 rename eval/{eval_basic.sh => eval.sh} (95%)
 delete mode 100644 eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
 delete mode 100644 eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
 delete mode 100644 eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
 delete mode 100644 eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
 delete mode 100644 eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
 create mode 100644 eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json

diff --git a/eval/.gitignore b/eval/.gitignore
new file mode 100644
index 00000000..7db6c7ce
--- /dev/null
+++ b/eval/.gitignore
@@ -0,0 +1,2 @@
+results/*
+!results/summary*
diff --git a/eval/__init__.py b/eval/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/eval/eval_basic.py b/eval/eval.py
similarity index 100%
rename from eval/eval_basic.py
rename to eval/eval.py
diff --git a/eval/eval_basic.sh b/eval/eval.sh
similarity index 95%
rename from eval/eval_basic.sh
rename to eval/eval.sh
index 69f5251e..1d2a0beb 100644
--- a/eval/eval_basic.sh
+++ b/eval/eval.sh
@@ -21,7 +21,7 @@ mkdir -p "$OUTPUT_DIR"
 # Run evaluations
 for model in "${MODELS[@]}"; do
     echo "Evaluating $model..."
-    python eval_basic.py \
+    python eval.py \
         --model "$model" \
         --config "eval_basic.json" \
         --output-dir "$OUTPUT_DIR"
diff --git a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
deleted file mode 100644
index 5e82c747..00000000
--- a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
+++ /dev/null
@@ -1,647 +0,0 @@
-[
-  {
-    "metrics": {
-      "dataset_name": "letter_counting",
-      "model": "google/gemini-2.0-flash-001",
-      "size": 10,
-      "average_score": 0.20600000000000002,
-      "total_examples": 10,
-      "timestamp": "2025-02-10T06:34:37.091554",
-      "config": {
-        "min_words": 5,
-        "max_words": 15,
-        "size": 10,
-        "seed": 42
-      }
-    },
-    "results": [
-      {
-        "question": "How many times does the letter \"a\" appear in the text: \"bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed\"?",
-        "expected_answer": "6",
-        "model_answer": "The letter \"a\" appears 7 times in the text: \"bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 15,
-          "target_letter": "a",
-          "span": [
-            "bed",
-            "and",
-            "enters",
-            "his",
-            "mechanical",
-            "dresser",
-            "Two",
-            "minutes",
-            "later",
-            "the",
-            "machine",
-            "deposited",
-            "him",
-            "all",
-            "dressed"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"w\" appear in the text: \"it into a watering place\"?",
-        "expected_answer": "1",
-        "model_answer": "The letter \"w\" appears 1 time in the text \"it into a watering place\".\n",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 5,
-          "target_letter": "w",
-          "span": [
-            "it",
-            "into",
-            "a",
-            "watering",
-            "place"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"t\" appear in the text: \"readable form accessible by the widest array of equipment including outdated\"?",
-        "expected_answer": "5",
-        "model_answer": "Let's count the \"t\"s in the sentence \"readable form accessible by the widest array of equipment including outdated\":\n\n*   readable: 0\n*   form: 0\n*   accessible: 0\n*   by: 0\n*   the: 1\n*   widest: 1\n*   array: 0\n*   of: 0\n*   equipment: 1\n*   including: 0\n*   outdated: 2\n\nTotal: 1 + 1 + 1 + 2 = 5\n\nTherefore, the letter \"t\" appears 5 times in the text.",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 11,
-          "target_letter": "t",
-          "span": [
-            "readable",
-            "form",
-            "accessible",
-            "by",
-            "the",
-            "widest",
-            "array",
-            "of",
-            "equipment",
-            "including",
-            "outdated"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"n\" appear in the text: \"does not contain a notice indicating that it is\"?",
-        "expected_answer": "6",
-        "model_answer": "The letter \"n\" appears 7 times in the text \"does not contain a notice indicating that it is\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 9,
-          "target_letter": "n",
-          "span": [
-            "does",
-            "not",
-            "contain",
-            "a",
-            "notice",
-            "indicating",
-            "that",
-            "it",
-            "is"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"a\" appear in the text: \"keeping this work in the same\"?",
-        "expected_answer": "1",
-        "model_answer": "The letter \"a\" appears **3** times in the text \"keeping this work in the same\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 6,
-          "target_letter": "a",
-          "span": [
-            "keeping",
-            "this",
-            "work",
-            "in",
-            "the",
-            "same"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"r\" appear in the text: \"years ago its subscription list contained but a few hundred\"?",
-        "expected_answer": "3",
-        "model_answer": "The letter \"r\" appears 4 times in the text \"years ago its subscription list contained but a few hundred\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 10,
-          "target_letter": "r",
-          "span": [
-            "years",
-            "ago",
-            "its",
-            "subscription",
-            "list",
-            "contained",
-            "but",
-            "a",
-            "few",
-            "hundred"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"e\" appear in the text: \"me As in the morning the table rose out of the floor Again\"?",
-        "expected_answer": "6",
-        "model_answer": "The letter \"e\" appears 6 times in the text \"me As in the morning the table rose out of the floor Again\".\n",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 13,
-          "target_letter": "e",
-          "span": [
-            "me",
-            "As",
-            "in",
-            "the",
-            "morning",
-            "the",
-            "table",
-            "rose",
-            "out",
-            "of",
-            "the",
-            "floor",
-            "Again"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"t\" appear in the text: \"said Faithburn was dead quite certainly\"?",
-        "expected_answer": "3",
-        "model_answer": "The letter \"t\" appears 5 times in the text \"said Faithburn was dead quite certainly\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 6,
-          "target_letter": "t",
-          "span": [
-            "said",
-            "Faithburn",
-            "was",
-            "dead",
-            "quite",
-            "certainly"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"n\" appear in the text: \"these eight had only impracticable schemes to propose In fact one of\"?",
-        "expected_answer": "3",
-        "model_answer": "Let's count the \"n\"s:\n\n*   **n**\n*   o**n**ly\n*   i**n**\n*   o**n**e\n\nThere are **4** \"n\"s in the text.",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 12,
-          "target_letter": "n",
-          "span": [
-            "these",
-            "eight",
-            "had",
-            "only",
-            "impracticable",
-            "schemes",
-            "to",
-            "propose",
-            "In",
-            "fact",
-            "one",
-            "of"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"g\" appear in the text: \"computers including obsolete old middle aged and new\"?",
-        "expected_answer": "2",
-        "model_answer": "The letter \"g\" appears 2 times in the text: \"computers including obsolete old middle aged and new\".\n",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 8,
-          "target_letter": "g",
-          "span": [
-            "computers",
-            "including",
-            "obsolete",
-            "old",
-            "middle",
-            "aged",
-            "and",
-            "new"
-          ]
-        }
-      }
-    ]
-  },
-  {
-    "metrics": {
-      "dataset_name": "propositional_logic",
-      "model": "google/gemini-2.0-flash-001",
-      "size": 10,
-      "average_score": 0.059,
-      "total_examples": 10,
-      "timestamp": "2025-02-10T06:35:11.432275",
-      "config": {
-        "size": 10,
-        "seed": 42
-      }
-    },
-    "results": [
-      {
-        "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
-        "expected_answer": "(P \u2228 Q)",
-        "model_answer": "Given just \"R\" and \"Q\" as individual statements, we can't directly *conclude* anything meaningful in a logical or mathematical sense *unless* we know more context. Here's why and some possibilities:\n\n**Without Context:**\n\n* **Independent Statements:**  They are simply two distinct statements that might be true or false depending on what \"R\" and \"Q\" represent. They exist independently.\n\n**With Possible Context (and Possible Conclusions):**\n\n* **Logic/Propositional Logic:** If \"R\" and \"Q\" represent propositions (statements that can be true or false), then we can form compound statements:\n    * **R AND Q (R \u2227 Q):**  The statement \"R is true AND Q is true.\"  This is only true if *both* R and Q are true.\n    * **R OR Q (R \u2228 Q):** The statement \"R is true OR Q is true (or both).\" This is true if *either* R or Q or both are true.\n    * **IF R THEN Q (R \u2192 Q):** The statement \"If R is true, then Q is true.\"  This is only false if R is true and Q is false.\n    * **IF Q THEN R (Q \u2192 R):** The statement \"If Q is true, then R is true.\" This is only false if Q is true and R is false.\n    * **R IFF Q (R \u2194 Q):** The statement \"R is true if and only if Q is true.\"  This is true if R and Q have the same truth value (both true or both false).\n    * **NOT R (\u00acR):** The statement \"R is false.\"\n    * **NOT Q (\u00acQ):** The statement \"Q is false.\"\n\n    *From the information given, we cannot deduce the individual truth values of `R` and `Q` (e.g., whether `R` is true or false, whether `Q` is true or false).*\n\n* **Set Theory:** If R and Q are sets, then we can discuss:\n    * **Intersection (R \u2229 Q):**  The set containing elements that are *both* in R and in Q.  We can't *conclude* what's in this intersection without knowing the elements of R and Q.\n    * **Union (R \u222a Q):**  The set containing elements that are in R, in Q, or in both. 
We can't *conclude* what's in this union without knowing the elements of R and Q.\n\n* **Mathematics (Variables):** If R and Q are variables, then we can't conclude anything about their relationship or values *unless* we have an equation or further information linking them.  We don't know if R = Q, R < Q, R > Q, R + Q = some value, etc.\n\n* **Premises in an Argument:** If R and Q are presented as premises (assumptions) in a larger argument, we can only draw conclusions if there are *other* premises or rules of inference involved.\n\n**Example Scenarios and Conclusions:**\n\n* **Scenario:**  R: \"It is raining.\"  Q: \"The ground is wet.\"\n    * **Possible Conclusion with an additional premise:** If we know \"IF it is raining, THEN the ground is wet (R \u2192 Q)\", then we can say *if* it is raining (we know R), *then* the ground is wet (we can conclude Q).  But *given only* R and Q, we cannot derive this conditional.\n\n* **Scenario:** R:  x > 5   Q: x > 0 (where 'x' is a real number)\n    * **Conclusion:** If both R and Q are true for the same value of x, then x > 5 implies x > 0. Therefore we can deduce that Q is true if R is.  R -> Q.\n\n**In summary, the best answer is:**\n\nWithout any further context or relationship between R and Q, we can only acknowledge them as two independent statements, values or sets. We cannot draw any meaningful logical conclusions *solely* from the information given.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "R",
-            "Q"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R",
-            "S"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. ((Q \u2192 P) \u2228 (Q \u2192 P))\n2. ((Q \u2194 Q) \u2192 (P \u2192 P))\n3. P\nWhat can we conclude?",
-        "expected_answer": "(P \u2192 P)",
-        "model_answer": "Let's analyze each statement and see what conclusions can be drawn from them and the given information.\n\n1. **((Q \u2192 P) \u2228 (Q \u2192 P))** simplifies to **(Q \u2192 P)**.  This means \"if Q is true, then P is true.\"  Since we know P is true (given in statement 3), then this statement is true whether Q is true or false. (Remember, a conditional is only false when the antecedent is true and the consequent is false).\n\n2. **((Q \u2194 Q) \u2192 (P \u2192 P))** simplifies significantly.  `Q \u2194 Q` is always true (it's a tautology).  `P \u2192 P` is also always true (it's another tautology).  So, this becomes `(True \u2192 True)`, which is always true.  This statement doesn't help us determine anything about the relationship between Q and P beyond what is already implied by P being true.\n\n3. **P** is given as true.\n\nTherefore, based on the given information and the simplification of the statements, the main conclusion is that **P is true**. The truth value of Q is indeterminate. Q can be either true or false, and all the given statements will still hold.\n**Therefore, we can conclude: P.**\n",
-        "score": 0.5,
-        "metadata": {
-          "premises": [
-            "((Q \u2192 P) \u2228 (Q \u2192 P))",
-            "((Q \u2194 Q) \u2192 (P \u2192 P))",
-            "P"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. ((Q \u2228 P) \u2227 \u00acP)\n2. P\n3. ((P \u2227 R) \u2227 \u00acR)\n4. ((Q \u2194 R) \u2192 \u00acQ)\nWhat can we conclude?",
-        "expected_answer": "(Q \u2227 Q)",
-        "model_answer": "Let's analyze the given statements and see what conclusions we can draw using logical deduction:\n\n1.  **((Q \u2228 P) \u2227 \u00acP)**: This says that either Q or P is true, AND P is false.  Since P is false, Q must be true for the entire statement to be true. Thus, we can conclude: **Q is true.**\n\n2.  **P**: This says that P is true.\n\n3.  **((P \u2227 R) \u2227 \u00acR)**: This says that both P and R are true, AND R is false.  This entire statement is a contradiction, as R cannot be both true and false. This statement implies: **False (Contradiction)**.\n\n4.  **((Q \u2194 R) \u2192 \u00acQ)**: This says that IF Q is equivalent to R (i.e., Q and R have the same truth value), THEN Q is false.\n\nNow, let's combine our findings:\n\n*   From (1), we deduced **Q**. However from (2) and (3), we found that statement (3) resulted in a contradiction. This contradiction stems from assuming both Q and R are false, despite Q already being false as a logical conclusion from (1). Therefore something must be wrong with the premises given.\n\nHere's a more methodical analysis, considering the contradiction:\n\nSince statement (3) leads to a contradiction, we must question our assumptions and how they interact.  Here's where things get interesting:\n\n*   We initially deduced Q from statement (1). Thus since P is false and P is true (2), this creates another contradiction. Thus we are concluding contradictory truths when all assumptions are in effect.\n\nSince we arrive at a contradiction, the set of premises is inconsistent.  Therefore, anything can be formally derived. In essence, we can conclude anything we want via *ex falso quod libet*.\n\n**Therefore, due to the contradictory statements provided, we can conclude anything.** However, within a standard logic system, we would first need to address which statement must be removed to allow the system to converge.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "((Q \u2228 P) \u2227 \u00acP)",
-            "P",
-            "((P \u2227 R) \u2227 \u00acR)",
-            "((Q \u2194 R) \u2192 \u00acQ)"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. \u00acP\n2. \u00acP\n3. Q\nWhat can we conclude?",
-        "expected_answer": "(Q \u2228 R)",
-        "model_answer": "From the given premises:\n\n*   \u00acP (P is false)\n*   \u00acP (P is false)\n*   Q (Q is true)\n\nWe can conclude the following:\n\n*   **P is false.** (This is directly from \u00acP)\n*   **Q is true.** (This is directly from Q)\n*   **\u00acQ is false.** (Since Q is true)\n\nNow we can also conclude the following with propositional logic principles:\n\n*   **P \u2228 Q** is true. (Since Q is true, the disjunction is true regardless of the truth value of P).\n*   **P \u2192 Q** is true. (Since P is false, the conditional is true).\n*   **\u00acP \u2228 Q** is true. (Since both \u00acP and Q are true, the disjunction is true).\n*   **\u00acP \u2192 Q** is true. (Since \u00acP is true, the conditional is true).\n\nIn summary, we can validly conclude many things, the most straightforward being the truth values of P and Q:\n\n*   **P is false.**\n*   **Q is true.**\n\nAnd other valid conclusions such as those related to ORs and implication.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "\u00acP",
-            "\u00acP",
-            "Q"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. P\n2. ((P \u2228 P) \u2194 (Q \u2227 P))\n3. P\nWhat can we conclude?",
-        "expected_answer": "(Q \u2194 Q)",
-        "model_answer": "Let's analyze the given statements:\n\n1.  P: This statement is true.\n2.  ((P \u2228 P) \u2194 (Q \u2227 P)): This is a biconditional. Let's break it down:\n    *   (P \u2228 P) is logically equivalent to P (by the idempotent law).\n    *   So the statement simplifies to (P \u2194 (Q \u2227 P)).\n    *   Since we know P is true (from statement 1 and 3), the statement becomes (True \u2194 (Q \u2227 True)).\n    *   (Q \u2227 True) is logically equivalent to Q.\n    *   Therefore, the statement simplifies further to (True \u2194 Q).\n    *   For the biconditional (True \u2194 Q) to be true, Q must be true.\n\nTherefore, we can conclude:\n\n*   **Q** is true.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "P",
-            "((P \u2228 P) \u2194 (Q \u2227 P))",
-            "P"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. (Q \u2194 R)\n2. \u00acR\nWhat can we conclude?",
-        "expected_answer": "(Q \u2192 R)",
-        "model_answer": "From the given premises:\n\n1.  Q \u2194 R (Q is logically equivalent to R, meaning Q is true if and only if R is true)\n2.  \u00acR (R is false)\n\nSince Q \u2194 R, Q and R have the same truth value.  Because \u00acR, we know R is false. Therefore, Q must also be false.\n\nConclusion: **\u00acQ**\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "(Q \u2194 R)",
-            "\u00acR"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. R\n2. ((Q \u2192 Q) \u2228 (S \u2227 Q))\n3. Q\nWhat can we conclude?",
-        "expected_answer": "(S \u2228 Q)",
-        "model_answer": "Let's analyze the given statements:\n\n1.  **R:** This statement simply asserts that R is true.\n\n2.  **(Q \u2192 Q) \u2228 (S \u2227 Q):**  This is a disjunction (OR). Let's break it down:\n    *   **Q \u2192 Q:**  This is a tautology.  A statement implies itself is always true.  Therefore, the entire disjunction is true regardless of the value of (S \u2227 Q).\n\n3.  **Q:** This statement says Q is true.\n\nSince Q is true:\n\n* **(Q -> Q)** is true (since Q is true implies Q is true).\n\n* **(S ^ Q)** is true if and only if *both* S and Q are true.  We know Q is true, so if S is also true then this is true.  If S is false this is false.\n\n* **((Q -> Q) v (S ^ Q))** is true, since **(Q -> Q)** which is part of the disjunction is true. We already knew that from line 2.\n\nTherefore, from these premises, we can definitively conclude:\n\n*   **R** is true (given).\n*   **Q** is true (given).\n*   **(Q \u2192 Q) \u2228 (S \u2227 Q)** is true (given, and verifiable).\n*   **(Q -> Q)** is true (implied from Q being true, since this is a tautology).\n*   The truth value of S is undetermined.  If S is true, then (S ^ Q) is true.  If S is false, then (S ^ Q) is false.\n\nFinal Answer: The conclusions are **R**, **Q**, and **(Q \u2192 Q) \u2228 (S \u2227 Q)** are all true. Additionally, we know **(Q \u2192 Q)** is true. We cannot determine the truth value of S.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "R",
-            "((Q \u2192 Q) \u2228 (S \u2227 Q))",
-            "Q"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R",
-            "S"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. (Q \u2227 P)\n2. (\u00acP \u2227 (P \u2192 P))\n3. \u00ac(Q \u2194 Q)\nWhat can we conclude?",
-        "expected_answer": "(P \u2227 P)",
-        "model_answer": "Let's analyze each statement to see what we can derive:\n\n*   **1. (Q \u2227 P)** This statement tells us that both Q and P are true.\n    *   Q is True\n    *   P is True\n\n*   **2. (\u00acP \u2227 (P \u2192 P))** Let's break this down:\n    *   `\u00acP`: This means \"Not P\".\n    *   `P \u2192 P`: This is a tautology (always true).  If P is true, then P implies P is true. If P is false then P implies P is true.\n\n    So the statement is essentially `(\u00acP) \u2227 (True)`.  This simplifies to just `\u00acP`. Therefore, the statement tells us that P is false.\n\n*   **3. \u00ac(Q \u2194 Q)** The biconditional `Q \u2194 Q`  means \"Q if and only if Q\". This is always true, regardless of Q's value (Q is true, so \"true if and only if true\" is true; Q is false, so \"false if and only if false\" is true). Therefore \u00ac(Q \u2194 Q) always resolves to `\u00ac(True)`, meaning that `False`.\n\nNow we consider these conclusions together:\n\n*   From statement 1: `Q` is True and `P` is True.\n*   From statement 2: `P` is False.\n*   From statment 3: the statement is `False`.\n\nWe have a **Contradiction**.\n\nStatement 1 tells us that P is true, while Statement 2 tells us that P is false.  This means the entire set of statements is inconsistent.  Therefore, we can conclude the system is flawed.\nFinal Answer: The final answer is $\\boxed{Contradiction}$",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "(Q \u2227 P)",
-            "(\u00acP \u2227 (P \u2192 P))",
-            "\u00ac(Q \u2194 Q)"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. (R \u2228 Q)\n2. (R \u2227 Q)\n3. R\nWhat can we conclude?",
-        "expected_answer": "(P \u2194 P)",
-        "model_answer": "From the given statements:\n\n1.  (R \u2228 Q):  R or Q (or both) is true.\n2.  (R \u2227 Q):  R and Q are both true.\n3.  R: R is true.\n\nWe can conclude the following:\n\n*   **Q is true.**  Statement (R \u2227 Q) tells us that R and Q are both true.  Since statement 3 tells us that R is true, Q must also be true.\n\nTherefore, the main conclusion is **Q**.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "(R \u2228 Q)",
-            "(R \u2227 Q)",
-            "R"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. ((P \u2228 Q) \u2228 (Q \u2192 Q))\n2. \u00ac(Q \u2192 Q)\n3. ((P \u2227 Q) \u2227 (Q \u2227 P))\n4. (Q \u2194 Q)\nWhat can we conclude?",
-        "expected_answer": "(P \u2192 P)",
-        "model_answer": "Let's analyze each statement:\n\n*   **1. ((P \u2228 Q) \u2228 (Q \u2192 Q))**:  `Q \u2192 Q` is always true (tautology). Therefore, `(P \u2228 Q) \u2228 True` is always true, regardless of the values of P and Q. So, this statement provides no specific information about P or Q.\n\n*   **2. \u00ac(Q \u2192 Q)**:  `Q \u2192 Q` is always true.  Therefore `\u00ac(Q \u2192 Q)` means \u00acTrue which is False. So, the statement asserts a contradiction.\n\n*   **3. ((P \u2227 Q) \u2227 (Q \u2227 P))**: This statement is equivalent to (P \u2227 Q).  Both (P \u2227 Q) and (Q \u2227 P) are true only when both P and Q are true.\n\n*   **4. (Q \u2194 Q)**: This statement is a tautology (always true). It provides no specific information.\n\nNow let's consider the implications of having all four statements:\n\nStatements 1 and 4 are always true, so they do not provide any useful information.  Statement 2 says that a tautology is false, which is a contradiction.  Statement 3 indicates that both P and Q are true.  However, statement 2 means the entire set of statements is contradictory.\n\nTherefore, we can conclude that the given set of premises is **inconsistent (or contradictory).**  You cannot simultaneously assert these four statements without creating a logical absurdity.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "((P \u2228 Q) \u2228 (Q \u2192 Q))",
-            "\u00ac(Q \u2192 Q)",
-            "((P \u2227 Q) \u2227 (Q \u2227 P))",
-            "(Q \u2194 Q)"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      }
-    ]
-  },
-  {
-    "metrics": {
-      "dataset_name": "leg_counting",
-      "model": "google/gemini-2.0-flash-001",
-      "size": 10,
-      "average_score": 0.40199999999999997,
-      "total_examples": 10,
-      "timestamp": "2025-02-10T06:35:27.087469",
-      "config": {
-        "min_animals": 3,
-        "max_animals": 8,
-        "size": 10,
-        "seed": 42
-      }
-    },
-    "results": [
-      {
-        "question": "How many legs are there in total if you have 3 humans, 3 sea slugs, 3 deers, 1 giraffe, 3 elephants, 2 ducks, 1 beetle, 1 flatworm?",
-        "expected_answer": "44",
-        "model_answer": "Okay, let's break it down:\n\n*   **Humans:** 3 humans * 2 legs/human = 6 legs\n*   **Sea Slugs:** 3 sea slugs * 0 legs/sea slug = 0 legs\n*   **Deer:** 3 deer * 4 legs/deer = 12 legs\n*   **Giraffe:** 1 giraffe * 4 legs/giraffe = 4 legs\n*   **Elephants:** 3 elephants * 4 legs/elephant = 12 legs\n*   **Ducks:** 2 ducks * 2 legs/duck = 4 legs\n*   **Beetle:** 1 beetle * 6 legs/beetle = 6 legs\n*   **Flatworm:** 1 flatworm * 0 legs/flatworm = 0 legs\n\nNow, let's add them all up:\n\n6 + 0 + 12 + 4 + 12 + 4 + 6 + 0 = 44 legs\n\nSo, there are a total of 44 legs.",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 8
-          },
-          "animals": {
-            "human": 3,
-            "sea slug": 3,
-            "deer": 3,
-            "giraffe": 1,
-            "elephant": 3,
-            "duck": 2,
-            "beetle": 1,
-            "flatworm": 1
-          },
-          "total_legs": 44
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 sheeps, 3 dogs, 3 praying mantiss?",
-        "expected_answer": "38",
-        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Sheep:** 2 sheep * 4 legs/sheep = 8 legs\n*   **Dogs:** 3 dogs * 4 legs/dog = 12 legs\n*   **Praying Mantises:** 3 mantises * 6 legs/mantis = 18 legs\n\n**Total:** 8 + 12 + 18 = 38 legs\n",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 3
-          },
-          "animals": {
-            "sheep": 2,
-            "dog": 3,
-            "praying mantis": 3
-          },
-          "total_legs": 38
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 crabs, 1 lobster, 1 human, 1 cow, 3 bees, 1 elephant?",
-        "expected_answer": "58",
-        "model_answer": "Here's the breakdown:\n\n*   **Crabs:** 2 crabs * 10 legs/crab = 20 legs\n*   **Lobster:** 1 lobster * 10 legs/lobster = 10 legs\n*   **Human:** 1 human * 2 legs/human = 2 legs\n*   **Cow:** 1 cow * 4 legs/cow = 4 legs\n*   **Bees:** 3 bees * 6 legs/bee = 18 legs\n*   **Elephant:** 1 elephant * 4 legs/elephant = 4 legs\n\n**Total:** 20 + 10 + 2 + 4 + 18 + 4 = **58 legs**",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 6
-          },
-          "animals": {
-            "crab": 2,
-            "lobster": 1,
-            "human": 1,
-            "cow": 1,
-            "bee": 3,
-            "elephant": 1
-          },
-          "total_legs": 58
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 grasshoppers, 1 spider, 1 tiger, 2 chickens, 1 starfish?",
-        "expected_answer": "33",
-        "model_answer": "Here's the breakdown:\n\n*   **Grasshoppers:** 2 grasshoppers * 6 legs/grasshopper = 12 legs\n*   **Spider:** 1 spider * 8 legs/spider = 8 legs\n*   **Tiger:** 1 tiger * 4 legs/tiger = 4 legs\n*   **Chickens:** 2 chickens * 2 legs/chicken = 4 legs\n*   **Starfish:** 1 starfish * 0 legs/starfish = 0 legs (starfish have tube feet but not legs)\n\n**Total:** 12 + 8 + 4 + 4 + 0 = **28 legs**\n",
-        "score": 0.01,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 5
-          },
-          "animals": {
-            "grasshopper": 2,
-            "spider": 1,
-            "tiger": 1,
-            "chicken": 2,
-            "starfish": 1
-          },
-          "total_legs": 33
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 3 wasps, 3 jellyfishs, 1 elephant?",
-        "expected_answer": "22",
-        "model_answer": "Here's how to solve this:\n\n*   **Wasps:** Each wasp has 6 legs, so 3 wasps have 3 * 6 = 18 legs.\n*   **Jellyfish:** Jellyfish don't have legs, so they have 0 legs.\n*   **Elephant:** An elephant has 4 legs.\n\n**Total:** 18 + 0 + 4 = 22 legs\n\n**Answer:** There are a total of 22 legs.",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 3
-          },
-          "animals": {
-            "wasp": 3,
-            "jellyfish": 3,
-            "elephant": 1
-          },
-          "total_legs": 22
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 leechs, 2 crickets, 3 shrimps, 2 praying mantiss, 2 woodlouses?",
-        "expected_answer": "82",
-        "model_answer": "Here's the breakdown of the legs:\n\n*   **Leeches:** 0 legs each, so 2 leeches have 2 * 0 = 0 legs\n*   **Crickets:** 6 legs each, so 2 crickets have 2 * 6 = 12 legs\n*   **Shrimps:** 10 legs each, so 3 shrimps have 3 * 10 = 30 legs\n*   **Praying Mantises:** 6 legs each, so 2 praying mantises have 2 * 6 = 12 legs\n*   **Woodlice (Pillbugs/Roly-Polies):** 14 legs each, so 2 woodlice have 2 * 14 = 28 legs\n\nTotal: 0 + 12 + 30 + 12 + 28 = 82 legs\n\nSo the answer is 82",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 5
-          },
-          "animals": {
-            "leech": 2,
-            "cricket": 2,
-            "shrimp": 3,
-            "praying mantis": 2,
-            "woodlouse": 2
-          },
-          "total_legs": 82
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 insects, 1 duck, 1 lobster, 3 starfishs, 1 scorpion, 2 horses, 3 butterflys?",
-        "expected_answer": "73",
-        "model_answer": "Here's the breakdown:\n\n*   **Insects:** 2 insects * 6 legs/insect = 12 legs\n*   **Duck:** 1 duck * 2 legs/duck = 2 legs\n*   **Lobster:** 1 lobster * 10 legs/lobster = 10 legs\n*   **Starfish:** 3 starfish * 0 legs/starfish = 0 legs\n*   **Scorpion:** 1 scorpion * 8 legs/scorpion = 8 legs\n*   **Horses:** 2 horses * 4 legs/horse = 8 legs\n*   **Butterflies:** 3 butterflies * 6 legs/butterfly = 18 legs\n\n**Total:** 12 + 2 + 10 + 0 + 8 + 8 + 18 = **58 legs**",
-        "score": 0.01,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 7
-          },
-          "animals": {
-            "insect": 2,
-            "duck": 1,
-            "lobster": 1,
-            "starfish": 3,
-            "scorpion": 1,
-            "horse": 2,
-            "butterfly": 3
-          },
-          "total_legs": 73
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 butterflys, 3 grasshoppers, 3 humans?",
-        "expected_answer": "36",
-        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Butterflies:** 2 butterflies * 6 legs/butterfly = 12 legs\n*   **Grasshoppers:** 3 grasshoppers * 6 legs/grasshopper = 18 legs\n*   **Humans:** 3 humans * 2 legs/human = 6 legs\n\n**Total:** 12 + 18 + 6 = 36 legs\n\nThere are a total of 36 legs.",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 3
-          },
-          "animals": {
-            "butterfly": 2,
-            "grasshopper": 3,
-            "human": 3
-          },
-          "total_legs": 36
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 3 deers, 2 beetles, 1 giraffe, 3 fireflys, 3 ants, 1 chicken?",
-        "expected_answer": "66",
-        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Deer:** 3 deer * 4 legs/deer = 12 legs\n*   **Beetles:** 2 beetles * 6 legs/beetle = 12 legs\n*   **Giraffe:** 1 giraffe * 4 legs/giraffe = 4 legs\n*   **Fireflys:** 3 fireflys * 6 legs/firefly = 18 legs\n*   **Ants:** 3 ants * 6 legs/ant = 18 legs\n*   **Chicken:** 1 chicken * 2 legs/chicken = 2 legs\n\n**Total:** 12 + 12 + 4 + 18 + 18 + 2 = **66 legs**",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 6
-          },
-          "animals": {
-            "deer": 3,
-            "beetle": 2,
-            "giraffe": 1,
-            "firefly": 3,
-            "ant": 3,
-            "chicken": 1
-          },
-          "total_legs": 66
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 1 scorpion, 3 shrimps, 2 cats, 2 giraffes?",
-        "expected_answer": "54",
-        "model_answer": "Let's count the legs of each animal and then add them up:\n\n*   **Scorpion:** 8 legs\n*   **Shrimp:** 10 legs\n*   **Cat:** 4 legs\n*   **Giraffe:** 4 legs\n\nNow let's calculate the total:\n\n*   1 scorpion * 8 legs/scorpion = 8 legs\n*   3 shrimps * 10 legs/shrimp = 30 legs\n*   2 cats * 4 legs/cat = 8 legs\n*   2 giraffes * 4 legs/giraffe = 8 legs\n\nFinally, add them all together: 8 + 30 + 8 + 8 = 54 legs\n\nSo the answer is $\\boxed{54}$\n",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 4
-          },
-          "animals": {
-            "scorpion": 1,
-            "shrimp": 3,
-            "cat": 2,
-            "giraffe": 2
-          },
-          "total_legs": 54
-        }
-      }
-    ]
-  }
-]
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
deleted file mode 100644
index cb8b45cc..00000000
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "leg_counting",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0,
-    "total_examples": 0,
-    "timestamp": "2025-02-10T06:06:08.539389"
-  },
-  "results": []
-}
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
deleted file mode 100644
index 16bd308e..00000000
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
+++ /dev/null
@@ -1,186 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "leg_counting",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0.45099999999999996,
-    "total_examples": 10,
-    "timestamp": "2025-02-10T06:07:53.290258"
-  },
-  "results": [
-    {
-      "question": "How many legs are there in total if you have 1 sea slug, 1 deer?",
-      "expected_answer": "4",
-      "model_answer": "*   A sea slug has no legs.\n*   A deer has 4 legs.\n\nTherefore, there are a total of 0 + 4 = $\\boxed{4}$ legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "sea slug": 1,
-          "deer": 1
-        },
-        "total_legs": 4
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 2 sheeps, 2 dogs?",
-      "expected_answer": "16",
-      "model_answer": "* Sheep have 4 legs each, so 2 sheep have 2 * 4 = 8 legs\n* Dogs have 4 legs each, so 2 dogs have 2 * 4 = 8 legs\n\nIn total, there are 8 + 8 = **16** legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "sheep": 2,
-          "dog": 2
-        },
-        "total_legs": 16
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 crab, 2 lobsters, 1 human, 1 cow, 1 bee?",
-      "expected_answer": "42",
-      "model_answer": "Here's the breakdown:\n\n*   **Crab:** 10 legs\n*   **Lobster:** 10 legs each, so 2 * 10 = 20 legs\n*   **Human:** 2 legs\n*   **Cow:** 4 legs\n*   **Bee:** 6 legs\n\n**Total:** 10 + 20 + 2 + 4 + 6 = **42 legs**",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 5
-        },
-        "animals": {
-          "crab": 1,
-          "lobster": 2,
-          "human": 1,
-          "cow": 1,
-          "bee": 1
-        },
-        "total_legs": 42
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 2 grasshoppers, 2 spiders, 1 tiger, 1 chicken?",
-      "expected_answer": "34",
-      "model_answer": "Here's the breakdown:\n\n*   **Grasshopper:** 6 legs each, so 2 grasshoppers have 2 * 6 = 12 legs\n*   **Spider:** 8 legs each, so 2 spiders have 2 * 8 = 16 legs\n*   **Tiger:** 4 legs\n*   **Chicken:** 2 legs\n\nTotal: 12 + 16 + 4 + 2 = 34 legs\n\nThere are a total of $\\boxed{34}$ legs.",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 4
-        },
-        "animals": {
-          "grasshopper": 2,
-          "spider": 2,
-          "tiger": 1,
-          "chicken": 1
-        },
-        "total_legs": 34
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 3 wasps, 3 jellyfishs?",
-      "expected_answer": "18",
-      "model_answer": "*   **Wasps:** Each wasp has 6 legs, so 3 wasps have 3 * 6 = 18 legs.\n*   **Jellyfish:** Jellyfish do not have legs.\n\n**Total:** 18 + 0 = 18 legs\n\nThere are a total of $\\boxed{18}$ legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "wasp": 3,
-          "jellyfish": 3
-        },
-        "total_legs": 18
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 3 leechs, 2 crickets, 2 shrimps, 3 praying mantiss?",
-      "expected_answer": "50",
-      "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Leeches:** Leeches don't have legs.\n*   **Crickets:** 2 Crickets * 6 legs/cricket = 12 legs\n*   **Shrimp:** 2 Shrimps * 10 legs/shrimp = 20 legs\n*   **Praying Mantises:** 3 Mantises * 6 legs/mantis = 18 legs\n\n**Total:** 0 + 12 + 20 + 18 = 50 legs\n\nThere are a total of 50 legs.",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 4
-        },
-        "animals": {
-          "leech": 3,
-          "cricket": 2,
-          "shrimp": 2,
-          "praying mantis": 3
-        },
-        "total_legs": 50
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 3 ducks, 1 shrimp, 3 lobsters, 2 starfishs?",
-      "expected_answer": "56",
-      "model_answer": "Here's how to calculate the total number of legs:\n\n*   Ducks: 3 ducks * 2 legs/duck = 6 legs\n*   Shrimp: 1 shrimp * 10 legs/shrimp = 10 legs\n*   Lobsters: 3 lobsters * 10 legs/lobster = 30 legs\n*   Starfish: 2 starfish * 0 legs/starfish = 0 legs\n\nTotal: 6 + 10 + 30 + 0 = 46 legs\n\nThere are a total of 46 legs.",
-      "score": 0.01,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 4
-        },
-        "animals": {
-          "duck": 3,
-          "shrimp": 1,
-          "lobster": 3,
-          "starfish": 2
-        },
-        "total_legs": 56
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 butterfly, 2 grasshoppers?",
-      "expected_answer": "18",
-      "model_answer": "* A butterfly has 6 legs.\n* A grasshopper has 6 legs.\n\nSo, 1 butterfly has 6 legs, and 2 grasshoppers have 2 * 6 = 12 legs.\n\nIn total, there are 6 + 12 = $\\boxed{18}$ legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "butterfly": 1,
-          "grasshopper": 2
-        },
-        "total_legs": 18
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 deer, 3 beetles, 2 giraffes, 1 firefly, 3 ants?",
-      "expected_answer": "54",
-      "model_answer": "Let's count the legs for each animal:\n* Deer: 1 deer * 4 legs/deer = 4 legs\n* Beetles: 3 beetles * 6 legs/beetle = 18 legs\n* Giraffes: 2 giraffes * 4 legs/giraffe = 8 legs\n* Firefly: 1 firefly * 6 legs/firefly = 6 legs\n* Ants: 3 ants * 6 legs/ant = 18 legs\n\nNow, add up the legs for all the animals:\n4 + 18 + 8 + 6 + 18 = 54 legs\n\nTherefore, there are a total of 54 legs.\n\nFinal Answer: The final answer is $\\boxed{54}$",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 5
-        },
-        "animals": {
-          "deer": 1,
-          "beetle": 3,
-          "giraffe": 2,
-          "firefly": 1,
-          "ant": 3
-        },
-        "total_legs": 54
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 scorpion, 1 shrimp, 3 cats?",
-      "expected_answer": "30",
-      "model_answer": "Here's how to figure that out:\n\n*   **Scorpion:** Scorpions have 8 legs.\n*   **Shrimp:** Shrimp have 10 legs.\n*   **Cats:** Cats have 4 legs each, so 3 cats have 3 * 4 = 12 legs.\n\n**Total:** 8 + 10 + 12 = 30 legs\n\nThere are a total of 30 legs.",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 3
-        },
-        "animals": {
-          "scorpion": 1,
-          "shrimp": 1,
-          "cat": 3
-        },
-        "total_legs": 30
-      }
-    }
-  ]
-}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
deleted file mode 100644
index 49697c4b..00000000
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "propositional_logic",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0,
-    "total_examples": 0,
-    "timestamp": "2025-02-10T06:06:10.638347"
-  },
-  "results": []
-}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
deleted file mode 100644
index 13f80c7a..00000000
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
+++ /dev/null
@@ -1,201 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "propositional_logic",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0.059,
-    "total_examples": 10,
-    "timestamp": "2025-02-10T06:08:24.037697"
-  },
-  "results": [
-    {
-      "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
-      "expected_answer": "(P \u2228 Q)",
-      "model_answer": "Given only R and Q, we can conclude that **R and Q are true** (according to propositional logic).\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "R",
-          "Q"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R",
-          "S"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. ((Q \u2192 P) \u2228 (Q \u2192 P))\n2. ((Q \u2194 Q) \u2192 (P \u2192 P))\n3. P\nWhat can we conclude?",
-      "expected_answer": "(P \u2192 P)",
-      "model_answer": "Let's break down each premise and see what we can derive.\n\n* **1. ((Q \u2192 P) \u2228 (Q \u2192 P))** This is logically equivalent to (Q \u2192 P).  This is because any statement 'A or A' is simply 'A'.  So we now have:  **Q \u2192 P**\n\n* **2. ((Q \u2194 Q) \u2192 (P \u2192 P))**  The statement (Q \u2194 Q) is a tautology (always true).  The statement (P \u2192 P) is also a tautology.  Therefore, (True \u2192 True) is True. This premise is always true and doesn't provide us with any new information that helps us directly derive something else.  We can essentially ignore it.\n\n* **3. P** This is simply **P**.\n\nNow we have:\n\n* Q \u2192 P\n* P\n\nFrom these two statements, we can't directly conclude anything.\n\nRemember the Modus Ponens and Modus Tollens rules:\n\n*   **Modus Ponens:** If you have P \u2192 Q and P, you can conclude Q.\n*   **Modus Tollens:** If you have P \u2192 Q and \u00acQ, you can conclude \u00acP.\n\nIn our case, we have Q \u2192 P and P.   This matches the *reverse* of Modus Ponens, which is *not* a valid inference rule.  Knowing that the *consequent* (P) is true *doesn't* tell us anything definitively about the *antecedent* (Q).  Q could be either true or false.\n\nTherefore, based on the given premises, we cannot conclude anything further about Q's truth value.\n",
-      "score": 0.5,
-      "metadata": {
-        "premises": [
-          "((Q \u2192 P) \u2228 (Q \u2192 P))",
-          "((Q \u2194 Q) \u2192 (P \u2192 P))",
-          "P"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. ((Q \u2228 P) \u2227 \u00acP)\n2. P\n3. ((P \u2227 R) \u2227 \u00acR)\n4. ((Q \u2194 R) \u2192 \u00acQ)\nWhat can we conclude?",
-      "expected_answer": "(Q \u2227 Q)",
-      "model_answer": "Let's analyze each given statement:\n\n1. **((Q \u2228 P) \u2227 \u00acP)**:  This says that either Q or P is true *and* P is false.  Since P is false, Q must be true.  So, we can conclude **Q**.\n\n2. **P**:  This simply states that P is true.\n\n3. **((P \u2227 R) \u2227 \u00acR)**: This reads as \"Both P and R are true, *and* R is false.\"  This is a contradiction. Since P and R are true, you cannot conclude that R is false.\n\n4. **((Q \u2194 R) \u2192 \u00acQ)**: This says \"If Q is equivalent to R (i.e., they have the same truth value), then Q is false.\"\n\nNow, let's see what we can derive, considering information from all the premises.\n\n* **From premise 1:**  We *appear* to conclude Q.  However premise 2 states that P is true. This contradicts the information provided in premise 1 that states P is false. Therefore, we have contradictory information. \n\n* **Premise 3 is a contradiction.**\n\nLet's consider the assumptions independently.\n\n* **Assuming premise 1 is true in isolation:**  Then, P is false and Q is true.\n\n* **Assuming premise 2 is true in isolation:** Then, P is true.\n\n* **Assuming premise 3 is true in isolation:**  Then, P and R are true and R is false. 
This is itself a contradiction and therefore cannot be true in isolation.\n\n* **Assuming premise 4 is true in isolation:**  It tells us the dependence between the truth values of Q and R, but doesn't give us the true/false result of either variable on their own.\n\nSince we have contradicting information in the premises, we must be able to reach a conclusion.\n\n**Analyzing for Contradiction:**\n\nSince premise 1 stated that Q is true only if \u00acP is True, yet premise 2 asserts that P is True, then we know that premise (1\u22272) \u22a2 False.\n\nPremise 3 also claims that ((P \u2227 R) \u2227 \u00acR) is true which again cannot be true since (R\u2227\u00acR) evaluates to False.\n\nPremise 4 will only create a contradiction if Q \u2194 R evaluates to True but \u00acQ evaluates to False. Since Q is true from Premise 1, this implies that R must also be True, yet we cannot determine whether this is indeed true or false. We can only determine this, if we remove the contradictory information given regarding Q and R across all the premises.\n\n**Conclusion with contradiction:**\n\nWith contradictory premises, we can conclude *anything*.  This is a principle of classical logic called \"ex falso quodlibet\" (from falsehood, anything follows), often simplified as \"explosion\".  Since the collection of premises contains a contradiction, we can use it derive any conclusion.\n\n**Example:**\n\n1. Q is true (from premise 1, under the assumption it's only true premise)\n2. P is true is also true (from premise 2 under the assumption it's the only true premise)\n3. P and R is true and R is false (from premise 3 with the same provision).\n4. 
(Q \u2194 R) \u2192 \u00acQ (from premise 4 and depends on the premise results from the above to determine whether Q is true, but we can only determine that when the contradictory information is omitted from the statement/statements/premises).\n\nBecause premises do not provide consistent rules to determine truth values of P, Q, R, and therefore contradiction, then anything follows.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "((Q \u2228 P) \u2227 \u00acP)",
-          "P",
-          "((P \u2227 R) \u2227 \u00acR)",
-          "((Q \u2194 R) \u2192 \u00acQ)"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. \u00acP\n2. \u00acP\n3. Q\nWhat can we conclude?",
-      "expected_answer": "(Q \u2228 R)",
-      "model_answer": "Given the premises:\n\n1.  \u00acP\n2.  \u00acP\n3.  Q\n\nWe can conclude **Q**.\n\nThe repetition of \u00acP doesn't add any new information. This is a very simple case where 'Q' is stated directly in the premises. There's no need for any complex logical deductions.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "\u00acP",
-          "\u00acP",
-          "Q"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. P\n2. ((P \u2228 P) \u2194 (Q \u2227 P))\n3. P\nWhat can we conclude?",
-      "expected_answer": "(Q \u2194 Q)",
-      "model_answer": "Let's analyze the given statements:\n\n1. P is true.\n2. ((P \u2228 P) \u2194 (Q \u2227 P)) - This statement tells us that the truth value of (P \u2228 P) is equivalent to the truth value of (Q \u2227 P).\n\nSince we know P is true from premise 1, we can simplify:\n\n* **P \u2228 P is true:**  Because P is true, disjoining it with itself makes the disjunction true.\n* **Since (P \u2228 P) \u2194 (Q \u2227 P) is true and (P \u2228 P) is true, then (Q \u2227 P) must also be true.**\n* **(Q \u2227 P) is true, and P is true, therefore Q must be true.**  For the conjunction (Q \u2227 P) to be true, both Q and P must be true.\n\nTherefore, we can conclude that **Q is true**.\n\nSo the conclusion includes P and Q.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "P",
-          "((P \u2228 P) \u2194 (Q \u2227 P))",
-          "P"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. (Q \u2194 R)\n2. \u00acR\nWhat can we conclude?",
-      "expected_answer": "(Q \u2192 R)",
-      "model_answer": "From the given premises:\n\n1.  Q \u2194 R (Q is equivalent to R)\n2.  \u00acR (R is false)\n\nSince Q is equivalent to R, this means that if R is false, then Q must also be false.\n\nTherefore, we can conclude:\n\n\u00acQ (Q is false)\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "(Q \u2194 R)",
-          "\u00acR"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. R\n2. ((Q \u2192 Q) \u2228 (S \u2227 Q))\n3. Q\nWhat can we conclude?",
-      "expected_answer": "(S \u2228 Q)",
-      "model_answer": "Let's analyze the given statements:\n\n*   **1. R** - This is a simple assertion: R is true.\n\n*   **2. ((Q \u2192 Q) \u2228 (S \u2227 Q))** -  Let's break this down:\n    *   `(Q \u2192 Q)`: This is a tautology.  Any proposition implying itself is always true.\n    *   `(S \u2227 Q)`: This is true if and only if both S and Q are true.\n    *   `\u2228`:  The entire statement is true if either `(Q \u2192 Q)` is true, or `(S \u2227 Q)` is true.  Since `(Q \u2192 Q)` is always true, this entire statement is always true, regardless of the truth value of S.\n\n*   **3. Q** - This asserts that Q is true.\n\nNow, what can we conclude?\n\n*   **We know R is true.** (Directly from statement 1).\n*   **We know Q is true.** (Directly from statement 3).\n\nFrom the second statement, `((Q \u2192 Q) \u2228 (S \u2227 Q))`, we can note the following:\n\n*   Because `Q \u2192 Q` is always true (a tautology), `((Q \u2192 Q) \u2228 (S \u2227 Q))` is always true, regardless of the truth value of S. This statement doesn't help us conclude anything new about S, but it does provide some information about the relationship between Q and itself, and Q and S.\n* Because we also know that Q is true, and that `((Q \u2192 Q) \u2228 (S \u2227 Q))` is true, the `(S \u2227 Q)` portion could be true or false. If it's true, then S must also be true. If it's false, then S must be false. However, because `((Q \u2192 Q) \u2228 (S \u2227 Q))` is true, and Q is true, we cannot know for sure if S must be true or false. \n\nTherefore, the key conclusions we can draw are:\n\n*   **R is true.**\n*   **Q is true.**\n\nWe **cannot** definitively conclude anything about the truth value of S.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "R",
-          "((Q \u2192 Q) \u2228 (S \u2227 Q))",
-          "Q"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R",
-          "S"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. (Q \u2227 P)\n2. (\u00acP \u2227 (P \u2192 P))\n3. \u00ac(Q \u2194 Q)\nWhat can we conclude?",
-      "expected_answer": "(P \u2227 P)",
-      "model_answer": "Let's analyze each statement:\n\n1. **(Q \u2227 P):** This statement tells us that both Q and P are true.\n\n2. **(\u00acP \u2227 (P \u2192 P)):**\n    * `\u00acP`: This means P is false.\n    * `(P \u2192 P)`: This is always true (a tautology).  Any proposition implies itself.\n\n3. **\u00ac(Q \u2194 Q):**\n    * `(Q \u2194 Q)`:  This means \"Q if and only if Q\". This is always true (a tautology).\n    * `\u00ac(Q \u2194 Q)`: This means \"not (Q if and only if Q)\".  Since `(Q \u2194 Q)` is always true, its negation is always false.\n\nNow let's look for contradictions:\n\n* Statement 1 says P is true.\n* Statement 2 says P is false.\n\nTherefore, **we have a contradiction**. The set of premises is **inconsistent**.  We can conclude that the statements cannot all be true simultaneously; there's no truth assignment that will satisfy all three statements.\n\nTherefore, we can conclude that the premises are inconsistent.\n\n**Final Answer: The set of premises is inconsistent.**\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "(Q \u2227 P)",
-          "(\u00acP \u2227 (P \u2192 P))",
-          "\u00ac(Q \u2194 Q)"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. (R \u2228 Q)\n2. (R \u2227 Q)\n3. R\nWhat can we conclude?",
-      "expected_answer": "(P \u2194 P)",
-      "model_answer": "From the given statements:\n\n*   **1. (R \u2228 Q)** - This tells us that R or Q (or both) is true.\n*   **2. (R \u2227 Q)** - This tells us that both R and Q are true.\n*   **3. R** - This tells us that R is true.\n\nWhile statement 3. R is already confirmed in statement 2., so, knowing that both R and Q are true from statement 2, we can definitively conclude:\n\n*   **Q** is true.\n\nHere's a breakdown:\n\nStatement 2 tells us R \u2227 Q.  For a conjunction (\u2227) to be true, both parts must be true therefore R and Q must be true.\n\nSo, the most direct and strongest conclusion we can make is:\n\n*  **Q is true**\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "(R \u2228 Q)",
-          "(R \u2227 Q)",
-          "R"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. ((P \u2228 Q) \u2228 (Q \u2192 Q))\n2. \u00ac(Q \u2192 Q)\n3. ((P \u2227 Q) \u2227 (Q \u2227 P))\n4. (Q \u2194 Q)\nWhat can we conclude?",
-      "expected_answer": "(P \u2192 P)",
-      "model_answer": "Let's analyze each statement:\n\n1.  **((P \u2228 Q) \u2228 (Q \u2192 Q))**:  We know that `Q \u2192 Q` is always true (a tautology). Therefore, `(Q \u2192 Q)` is equivalent to `T` (True). The statement becomes `((P \u2228 Q) \u2228 T)`.  Since anything ORed with True is always True, this whole statement simplifies to **T (True)**.  It tells us nothing about the values of P or Q.\n\n2.  **\u00ac(Q \u2192 Q)**: We know that `Q \u2192 Q` is always true.  Therefore, `\u00ac(Q \u2192 Q)` is equivalent to `\u00acT`, which is **F (False)**.  This is a contradiction given that the argument is valid.\n\n3.  **((P \u2227 Q) \u2227 (Q \u2227 P))**:  This statement is equivalent to `(P \u2227 Q)`. This is because `P \u2227 Q` is logically equivalent to `Q \u2227 P`. So, we can say that `P` is True **AND** `Q` is True.\n\n4.  **(Q \u2194 Q)**:  This is a biconditional, meaning \"Q if and only if Q\".  This statement is always true (a tautology) regardless of the value of Q. So this is equivalent to **T (True)**, and again, it tells us nothing about the value of Q.\n\n**Conclusion**:\n\nThe argument presents contradictory statements:\n\nStatement 1: Always True.\nStatement 2: Always False.\nStatement 3: P is true AND Q is true.\nStatement 4: Always true.\n\nStatement 2 contradicts both statement 1 and statement 4.\n\n**Therefore, we can conclude that the set of premises is inconsistent due to the contradiction in premise 2 \"\u00ac(Q \u2192 Q)\".**\n\nWithout premise 2, we could conclude that P is true AND Q is true from premise 3. The logical equivalence `(P \u2227 Q) \u2194 (Q \u2194 Q)` can be derived by simply checking truth tables.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "((P \u2228 Q) \u2228 (Q \u2192 Q))",
-          "\u00ac(Q \u2192 Q)",
-          "((P \u2227 Q) \u2227 (Q \u2227 P))",
-          "(Q \u2194 Q)"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    }
-  ]
-}
diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json
new file mode 100644
index 00000000..32086f46
--- /dev/null
+++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json
@@ -0,0 +1,39 @@
+[
+  {
+    "dataset_name": "letter_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.20600000000000002,
+    "total_examples": 10,
+    "timestamp": "2025-02-09T22:47:25.934820",
+    "config": {
+      "min_words": 5,
+      "max_words": 15,
+      "size": 10,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "propositional_logic",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.059,
+    "total_examples": 10,
+    "timestamp": "2025-02-09T22:47:57.473560",
+    "config": {
+      "size": 10,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "leg_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.40199999999999997,
+    "total_examples": 10,
+    "timestamp": "2025-02-09T22:48:13.546006",
+    "config": {
+      "min_animals": 3,
+      "max_animals": 8,
+      "size": 10,
+      "seed": 42
+    }
+  }
+]

From 767c34297ff553d4f583701dcec78d36beaa1a23 Mon Sep 17 00:00:00 2001
From: Adefioye <47661641+Adefioye@users.noreply.github.com>
Date: Mon, 10 Feb 2025 08:15:23 -0600
Subject: [PATCH 11/11] Add score_answer method to word_ladder (#93)

* Add score_answer method to word_ladder
* add unit test for WordLadderDataset::score_answer()

---------

Co-authored-by: Andreas Koepf <andreas.koepf@provisio.com>
---
 reasoning_gym/algorithmic/word_ladder.py | 66 ++++++++++++++++++------
 tests/test_word_ladder.py                | 44 +++++++++++++++-
 2 files changed, 92 insertions(+), 18 deletions(-)

diff --git a/reasoning_gym/algorithmic/word_ladder.py b/reasoning_gym/algorithmic/word_ladder.py
index a0b000c2..64c65326 100644
--- a/reasoning_gym/algorithmic/word_ladder.py
+++ b/reasoning_gym/algorithmic/word_ladder.py
@@ -5,8 +5,7 @@ from dataclasses import dataclass
 from random import Random
 from typing import Dict, List, Optional, Set, Tuple
 
-from reasoning_gym.data import read_data_file
-
+from ..data import get_data_file_path
 from ..factory import ProceduralDataset, register_dataset
 
 
@@ -64,6 +63,7 @@ class WordLadderDataset(ProceduralDataset):
         self.config = config
         self.word_sets = {}
         self.word_graphs = {}
+        self._vocabulary = None  # A large list of dictionary words to validate words against
 
         # Load words from CSV
         self.word_sets = self._load_words_from_csv(
@@ -84,28 +84,24 @@ class WordLadderDataset(ProceduralDataset):
         assert 3 <= min_length <= max_length <= 5, "Word length must be between 3 and 5 inclusive"
 
         import csv
-        from io import StringIO
 
         word_sets = {}
 
         try:
             # Get CSV content as string
-            csv_content = read_data_file("words.csv")
+            with get_data_file_path("words.csv").open("r", encoding="utf-8") as csv_file:
+                reader = csv.DictReader(csv_file)
 
-            # Use StringIO to create a file-like object from the string
-            csv_file = StringIO(csv_content)
-            reader = csv.DictReader(csv_file)
+                for row in reader:
+                    # Process each word length column using config range
+                    for length in range(min_length, max_length + 1):
+                        col_name = f"{length}_letter"
+                        word = row.get(col_name, "")
 
-            for row in reader:
-                # Process each word length column using config range
-                for length in range(min_length, max_length + 1):
-                    col_name = f"{length}_letter"
-                    word = row.get(col_name, "")
+                        if not word:  # Skip empty entries
+                            continue
 
-                    if not word:  # Skip empty entries
-                        continue
-
-                    word_sets.setdefault(length, set()).add(word.upper())
+                        word_sets.setdefault(length, set()).add(word.upper())
 
         except Exception as e:
             raise RuntimeError(f"Error processing words.csv content: {e}") from e
@@ -220,5 +216,43 @@ class WordLadderDataset(ProceduralDataset):
             "metadata": {"start_word": start, "end_word": end, "word_length": length, "chain_length": len(path)},
         }
 
+    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+        """Score a comma-separated word ladder against the puzzle in `entry`.
+
+        Returns 0.0 for a missing answer or fewer than two words; 0.01 if the
+        ends do not match the entry, a word's length differs from the end
+        word's, or a step is not exactly one letter change; otherwise 1.0
+        halved once per answer word missing from the word set of that length.
+        """
+        if answer is None:
+            return 0.0
+
+        # Upper-case the answer so matching against the word sets ignores case.
+        answer_words = tuple(s.strip() for s in answer.upper().split(","))
+
+        metadata = entry["metadata"]
+        start_word = metadata["start_word"]
+        end_word = metadata["end_word"]
+        word_length = len(end_word)
+
+        if len(answer_words) < 2:
+            return 0.0
+
+        if answer_words[0] != start_word or answer_words[-1] != end_word:
+            return 0.01
+
+        if any(len(w) != word_length for w in answer_words):
+            return 0.01
+
+        # All words share one length here, so zip() compares every position.
+        for prev, curr in zip(answer_words, answer_words[1:]):
+            if sum(a != b for a, b in zip(prev, curr)) != 1:
+                return 0.01
+
+        # Look up the vocabulary only for well-formed ladders.
+        known_words = self.word_sets[word_length]
+        unknown_count = sum(1 for word in answer_words if word not in known_words)
+        return 0.5**unknown_count
+
 
 register_dataset("word_ladder", WordLadderDataset, WordLadderConfig)
diff --git a/tests/test_word_ladder.py b/tests/test_word_ladder.py
index d42108ea..1aba4cf3 100644
--- a/tests/test_word_ladder.py
+++ b/tests/test_word_ladder.py
@@ -355,5 +355,45 @@ def test_word_ladder_edge_cases():
     assert max_length > 3, "No challenging word pairs generated"
 
 
-if __name__ == "__main__":
-    pytest.main([__file__])
+def test_word_ladder_score_answer():
+    """Check score_answer() on full-credit, zero, 0.01 and partial-credit ladders."""
+    config = WordLadderConfig(min_word_length=4, max_word_length=4)
+    dataset = WordLadderDataset(config)
+
+    # Hand-built entry for the COLD -> WARM puzzle
+    entry = {
+        "question": "Transform the word ladder 'COLD' to 'WARM' by changing one letter at a time.",
+        "answer": "COLD,CORD,CARD,WARD,WARM",
+        "metadata": {"start_word": "COLD", "end_word": "WARM", "word_length": 4, "chain_length": 5},
+    }
+
+    # The reference ladder earns full credit
+    assert dataset.score_answer("COLD,CORD,CARD,WARD,WARM", entry) == 1.0
+
+    # A missing answer earns nothing
+    assert dataset.score_answer(None, entry) == 0.0
+
+    # An empty string splits into a single empty word, which is not a ladder
+    assert dataset.score_answer("", entry) == 0.0
+
+    # A single word is not a ladder either
+    assert dataset.score_answer("COLD", entry) == 0.0
+
+    # First word differs from the start word
+    assert dataset.score_answer("BOLD,CORD,CARD,WARD,WARM", entry) == 0.01
+
+    # Last word differs from the end word
+    assert dataset.score_answer("COLD,CORD,CARD,WARD,WARP", entry) == 0.01
+
+    # Some words have the wrong length
+    assert dataset.score_answer("COLD,CORDS,CARDS,WARD,WARM", entry) == 0.01
+
+    # A step changes more than one letter
+    assert dataset.score_answer("COLD,WARD,WARM", entry) == 0.01
+
+    # Lower-case input is accepted
+    assert dataset.score_answer("cold,cord,card,ward,warm", entry) == 1.0
+
+    # Unknown word on a valid one-letter-per-step path: partial credit, not the 0.01 floor
+    partial = dataset.score_answer("COLD,CORD,CXRD,CARD,WARD,WARM", entry)
+    assert 0.01 < partial < 1.0