diff --git a/eval/.gitignore b/eval/.gitignore
new file mode 100644
index 00000000..7db6c7ce
--- /dev/null
+++ b/eval/.gitignore
@@ -0,0 +1,2 @@
+results/*
+!results/summary*
diff --git a/eval/__init__.py b/eval/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/eval/eval_basic.py b/eval/eval.py
similarity index 100%
rename from eval/eval_basic.py
rename to eval/eval.py
diff --git a/eval/eval_basic.sh b/eval/eval.sh
similarity index 95%
rename from eval/eval_basic.sh
rename to eval/eval.sh
index 69f5251e..1d2a0beb 100644
--- a/eval/eval_basic.sh
+++ b/eval/eval.sh
@@ -21,7 +21,7 @@ mkdir -p "$OUTPUT_DIR"
 # Run evaluations
 for model in "${MODELS[@]}"; do
     echo "Evaluating $model..."
-    python eval_basic.py \
+    python eval.py \
         --model "$model" \
         --config "eval_basic.json" \
         --output-dir "$OUTPUT_DIR"
diff --git a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
deleted file mode 100644
index 5e82c747..00000000
--- a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
+++ /dev/null
@@ -1,647 +0,0 @@
-[
-  {
-    "metrics": {
-      "dataset_name": "letter_counting",
-      "model": "google/gemini-2.0-flash-001",
-      "size": 10,
-      "average_score": 0.20600000000000002,
-      "total_examples": 10,
-      "timestamp": "2025-02-10T06:34:37.091554",
-      "config": {
-        "min_words": 5,
-        "max_words": 15,
-        "size": 10,
-        "seed": 42
-      }
-    },
-    "results": [
-      {
-        "question": "How many times does the letter \"a\" appear in the text: \"bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed\"?",
-        "expected_answer": "6",
-        "model_answer": "The letter \"a\" appears 7 times in the text: \"bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 15,
-          "target_letter": "a",
-          "span": [
-            "bed",
-            "and",
-            "enters",
-            "his",
-            "mechanical",
-            "dresser",
-            "Two",
-            "minutes",
-            "later",
-            "the",
-            "machine",
-            "deposited",
-            "him",
-            "all",
-            "dressed"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"w\" appear in the text: \"it into a watering place\"?",
-        "expected_answer": "1",
-        "model_answer": "The letter \"w\" appears 1 time in the text \"it into a watering place\".\n",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 5,
-          "target_letter": "w",
-          "span": [
-            "it",
-            "into",
-            "a",
-            "watering",
-            "place"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"t\" appear in the text: \"readable form accessible by the widest array of equipment including outdated\"?",
-        "expected_answer": "5",
-        "model_answer": "Let's count the \"t\"s in the sentence \"readable form accessible by the widest array of equipment including outdated\":\n\n*   readable: 0\n*   form: 0\n*   accessible: 0\n*   by: 0\n*   the: 1\n*   widest: 1\n*   array: 0\n*   of: 0\n*   equipment: 1\n*   including: 0\n*   outdated: 2\n\nTotal: 1 + 1 + 1 + 2 = 5\n\nTherefore, the letter \"t\" appears 5 times in the text.",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 11,
-          "target_letter": "t",
-          "span": [
-            "readable",
-            "form",
-            "accessible",
-            "by",
-            "the",
-            "widest",
-            "array",
-            "of",
-            "equipment",
-            "including",
-            "outdated"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"n\" appear in the text: \"does not contain a notice indicating that it is\"?",
-        "expected_answer": "6",
-        "model_answer": "The letter \"n\" appears 7 times in the text \"does not contain a notice indicating that it is\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 9,
-          "target_letter": "n",
-          "span": [
-            "does",
-            "not",
-            "contain",
-            "a",
-            "notice",
-            "indicating",
-            "that",
-            "it",
-            "is"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"a\" appear in the text: \"keeping this work in the same\"?",
-        "expected_answer": "1",
-        "model_answer": "The letter \"a\" appears **3** times in the text \"keeping this work in the same\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 6,
-          "target_letter": "a",
-          "span": [
-            "keeping",
-            "this",
-            "work",
-            "in",
-            "the",
-            "same"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"r\" appear in the text: \"years ago its subscription list contained but a few hundred\"?",
-        "expected_answer": "3",
-        "model_answer": "The letter \"r\" appears 4 times in the text \"years ago its subscription list contained but a few hundred\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 10,
-          "target_letter": "r",
-          "span": [
-            "years",
-            "ago",
-            "its",
-            "subscription",
-            "list",
-            "contained",
-            "but",
-            "a",
-            "few",
-            "hundred"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"e\" appear in the text: \"me As in the morning the table rose out of the floor Again\"?",
-        "expected_answer": "6",
-        "model_answer": "The letter \"e\" appears 6 times in the text \"me As in the morning the table rose out of the floor Again\".\n",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 13,
-          "target_letter": "e",
-          "span": [
-            "me",
-            "As",
-            "in",
-            "the",
-            "morning",
-            "the",
-            "table",
-            "rose",
-            "out",
-            "of",
-            "the",
-            "floor",
-            "Again"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"t\" appear in the text: \"said Faithburn was dead quite certainly\"?",
-        "expected_answer": "3",
-        "model_answer": "The letter \"t\" appears 5 times in the text \"said Faithburn was dead quite certainly\".\n",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 6,
-          "target_letter": "t",
-          "span": [
-            "said",
-            "Faithburn",
-            "was",
-            "dead",
-            "quite",
-            "certainly"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"n\" appear in the text: \"these eight had only impracticable schemes to propose In fact one of\"?",
-        "expected_answer": "3",
-        "model_answer": "Let's count the \"n\"s:\n\n*   **n**\n*   o**n**ly\n*   i**n**\n*   o**n**e\n\nThere are **4** \"n\"s in the text.",
-        "score": 0.01,
-        "metadata": {
-          "span_length": 12,
-          "target_letter": "n",
-          "span": [
-            "these",
-            "eight",
-            "had",
-            "only",
-            "impracticable",
-            "schemes",
-            "to",
-            "propose",
-            "In",
-            "fact",
-            "one",
-            "of"
-          ]
-        }
-      },
-      {
-        "question": "How many times does the letter \"g\" appear in the text: \"computers including obsolete old middle aged and new\"?",
-        "expected_answer": "2",
-        "model_answer": "The letter \"g\" appears 2 times in the text: \"computers including obsolete old middle aged and new\".\n",
-        "score": 0.5,
-        "metadata": {
-          "span_length": 8,
-          "target_letter": "g",
-          "span": [
-            "computers",
-            "including",
-            "obsolete",
-            "old",
-            "middle",
-            "aged",
-            "and",
-            "new"
-          ]
-        }
-      }
-    ]
-  },
-  {
-    "metrics": {
-      "dataset_name": "propositional_logic",
-      "model": "google/gemini-2.0-flash-001",
-      "size": 10,
-      "average_score": 0.059,
-      "total_examples": 10,
-      "timestamp": "2025-02-10T06:35:11.432275",
-      "config": {
-        "size": 10,
-        "seed": 42
-      }
-    },
-    "results": [
-      {
-        "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
-        "expected_answer": "(P \u2228 Q)",
-        "model_answer": "Given just \"R\" and \"Q\" as individual statements, we can't directly *conclude* anything meaningful in a logical or mathematical sense *unless* we know more context. Here's why and some possibilities:\n\n**Without Context:**\n\n* **Independent Statements:**  They are simply two distinct statements that might be true or false depending on what \"R\" and \"Q\" represent. They exist independently.\n\n**With Possible Context (and Possible Conclusions):**\n\n* **Logic/Propositional Logic:** If \"R\" and \"Q\" represent propositions (statements that can be true or false), then we can form compound statements:\n    * **R AND Q (R \u2227 Q):**  The statement \"R is true AND Q is true.\"  This is only true if *both* R and Q are true.\n    * **R OR Q (R \u2228 Q):** The statement \"R is true OR Q is true (or both).\" This is true if *either* R or Q or both are true.\n    * **IF R THEN Q (R \u2192 Q):** The statement \"If R is true, then Q is true.\"  This is only false if R is true and Q is false.\n    * **IF Q THEN R (Q \u2192 R):** The statement \"If Q is true, then R is true.\" This is only false if Q is true and R is false.\n    * **R IFF Q (R \u2194 Q):** The statement \"R is true if and only if Q is true.\"  This is true if R and Q have the same truth value (both true or both false).\n    * **NOT R (\u00acR):** The statement \"R is false.\"\n    * **NOT Q (\u00acQ):** The statement \"Q is false.\"\n\n    *From the information given, we cannot deduce the individual truth values of `R` and `Q` (e.g., whether `R` is true or false, whether `Q` is true or false).*\n\n* **Set Theory:** If R and Q are sets, then we can discuss:\n    * **Intersection (R \u2229 Q):**  The set containing elements that are *both* in R and in Q.  We can't *conclude* what's in this intersection without knowing the elements of R and Q.\n    * **Union (R \u222a Q):**  The set containing elements that are in R, in Q, or in both. We can't *conclude* what's in this union without knowing the elements of R and Q.\n\n* **Mathematics (Variables):** If R and Q are variables, then we can't conclude anything about their relationship or values *unless* we have an equation or further information linking them.  We don't know if R = Q, R < Q, R > Q, R + Q = some value, etc.\n\n* **Premises in an Argument:** If R and Q are presented as premises (assumptions) in a larger argument, we can only draw conclusions if there are *other* premises or rules of inference involved.\n\n**Example Scenarios and Conclusions:**\n\n* **Scenario:**  R: \"It is raining.\"  Q: \"The ground is wet.\"\n    * **Possible Conclusion with an additional premise:** If we know \"IF it is raining, THEN the ground is wet (R \u2192 Q)\", then we can say *if* it is raining (we know R), *then* the ground is wet (we can conclude Q).  But *given only* R and Q, we cannot derive this conditional.\n\n* **Scenario:** R:  x > 5   Q: x > 0 (where 'x' is a real number)\n    * **Conclusion:** If both R and Q are true for the same value of x, then x > 5 implies x > 0. Therefore we can deduce that Q is true if R is.  R -> Q.\n\n**In summary, the best answer is:**\n\nWithout any further context or relationship between R and Q, we can only acknowledge them as two independent statements, values or sets. We cannot draw any meaningful logical conclusions *solely* from the information given.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "R",
-            "Q"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R",
-            "S"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. ((Q \u2192 P) \u2228 (Q \u2192 P))\n2. ((Q \u2194 Q) \u2192 (P \u2192 P))\n3. P\nWhat can we conclude?",
-        "expected_answer": "(P \u2192 P)",
-        "model_answer": "Let's analyze each statement and see what conclusions can be drawn from them and the given information.\n\n1. **((Q \u2192 P) \u2228 (Q \u2192 P))** simplifies to **(Q \u2192 P)**.  This means \"if Q is true, then P is true.\"  Since we know P is true (given in statement 3), then this statement is true whether Q is true or false. (Remember, a conditional is only false when the antecedent is true and the consequent is false).\n\n2. **((Q \u2194 Q) \u2192 (P \u2192 P))** simplifies significantly.  `Q \u2194 Q` is always true (it's a tautology).  `P \u2192 P` is also always true (it's another tautology).  So, this becomes `(True \u2192 True)`, which is always true.  This statement doesn't help us determine anything about the relationship between Q and P beyond what is already implied by P being true.\n\n3. **P** is given as true.\n\nTherefore, based on the given information and the simplification of the statements, the main conclusion is that **P is true**. The truth value of Q is indeterminate. Q can be either true or false, and all the given statements will still hold.\n**Therefore, we can conclude: P.**\n",
-        "score": 0.5,
-        "metadata": {
-          "premises": [
-            "((Q \u2192 P) \u2228 (Q \u2192 P))",
-            "((Q \u2194 Q) \u2192 (P \u2192 P))",
-            "P"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. ((Q \u2228 P) \u2227 \u00acP)\n2. P\n3. ((P \u2227 R) \u2227 \u00acR)\n4. ((Q \u2194 R) \u2192 \u00acQ)\nWhat can we conclude?",
-        "expected_answer": "(Q \u2227 Q)",
-        "model_answer": "Let's analyze the given statements and see what conclusions we can draw using logical deduction:\n\n1.  **((Q \u2228 P) \u2227 \u00acP)**: This says that either Q or P is true, AND P is false.  Since P is false, Q must be true for the entire statement to be true. Thus, we can conclude: **Q is true.**\n\n2.  **P**: This says that P is true.\n\n3.  **((P \u2227 R) \u2227 \u00acR)**: This says that both P and R are true, AND R is false.  This entire statement is a contradiction, as R cannot be both true and false. This statement implies: **False (Contradiction)**.\n\n4.  **((Q \u2194 R) \u2192 \u00acQ)**: This says that IF Q is equivalent to R (i.e., Q and R have the same truth value), THEN Q is false.\n\nNow, let's combine our findings:\n\n*   From (1), we deduced **Q**. However from (2) and (3), we found that statement (3) resulted in a contradiction. This contradiction stems from assuming both Q and R are false, despite Q already being false as a logical conclusion from (1). Therefore something must be wrong with the premises given.\n\nHere's a more methodical analysis, considering the contradiction:\n\nSince statement (3) leads to a contradiction, we must question our assumptions and how they interact.  Here's where things get interesting:\n\n*   We initially deduced Q from statement (1). Thus since P is false and P is true (2), this creates another contradiction. Thus we are concluding contradictory truths when all assumptions are in effect.\n\nSince we arrive at a contradiction, the set of premises is inconsistent.  Therefore, anything can be formally derived. In essence, we can conclude anything we want via *ex falso quod libet*.\n\n**Therefore, due to the contradictory statements provided, we can conclude anything.** However, within a standard logic system, we would first need to address which statement must be removed to allow the system to converge.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "((Q \u2228 P) \u2227 \u00acP)",
-            "P",
-            "((P \u2227 R) \u2227 \u00acR)",
-            "((Q \u2194 R) \u2192 \u00acQ)"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. \u00acP\n2. \u00acP\n3. Q\nWhat can we conclude?",
-        "expected_answer": "(Q \u2228 R)",
-        "model_answer": "From the given premises:\n\n*   \u00acP (P is false)\n*   \u00acP (P is false)\n*   Q (Q is true)\n\nWe can conclude the following:\n\n*   **P is false.** (This is directly from \u00acP)\n*   **Q is true.** (This is directly from Q)\n*   **\u00acQ is false.** (Since Q is true)\n\nNow we can also conclude the following with propositional logic principles:\n\n*   **P \u2228 Q** is true. (Since Q is true, the disjunction is true regardless of the truth value of P).\n*   **P \u2192 Q** is true. (Since P is false, the conditional is true).\n*   **\u00acP \u2228 Q** is true. (Since both \u00acP and Q are true, the disjunction is true).\n*   **\u00acP \u2192 Q** is true. (Since \u00acP is true, the conditional is true).\n\nIn summary, we can validly conclude many things, the most straightforward being the truth values of P and Q:\n\n*   **P is false.**\n*   **Q is true.**\n\nAnd other valid conclusions such as those related to ORs and implication.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "\u00acP",
-            "\u00acP",
-            "Q"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. P\n2. ((P \u2228 P) \u2194 (Q \u2227 P))\n3. P\nWhat can we conclude?",
-        "expected_answer": "(Q \u2194 Q)",
-        "model_answer": "Let's analyze the given statements:\n\n1.  P: This statement is true.\n2.  ((P \u2228 P) \u2194 (Q \u2227 P)): This is a biconditional. Let's break it down:\n    *   (P \u2228 P) is logically equivalent to P (by the idempotent law).\n    *   So the statement simplifies to (P \u2194 (Q \u2227 P)).\n    *   Since we know P is true (from statement 1 and 3), the statement becomes (True \u2194 (Q \u2227 True)).\n    *   (Q \u2227 True) is logically equivalent to Q.\n    *   Therefore, the statement simplifies further to (True \u2194 Q).\n    *   For the biconditional (True \u2194 Q) to be true, Q must be true.\n\nTherefore, we can conclude:\n\n*   **Q** is true.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "P",
-            "((P \u2228 P) \u2194 (Q \u2227 P))",
-            "P"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. (Q \u2194 R)\n2. \u00acR\nWhat can we conclude?",
-        "expected_answer": "(Q \u2192 R)",
-        "model_answer": "From the given premises:\n\n1.  Q \u2194 R (Q is logically equivalent to R, meaning Q is true if and only if R is true)\n2.  \u00acR (R is false)\n\nSince Q \u2194 R, Q and R have the same truth value.  Because \u00acR, we know R is false. Therefore, Q must also be false.\n\nConclusion: **\u00acQ**\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "(Q \u2194 R)",
-            "\u00acR"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. R\n2. ((Q \u2192 Q) \u2228 (S \u2227 Q))\n3. Q\nWhat can we conclude?",
-        "expected_answer": "(S \u2228 Q)",
-        "model_answer": "Let's analyze the given statements:\n\n1.  **R:** This statement simply asserts that R is true.\n\n2.  **(Q \u2192 Q) \u2228 (S \u2227 Q):**  This is a disjunction (OR). Let's break it down:\n    *   **Q \u2192 Q:**  This is a tautology.  A statement implies itself is always true.  Therefore, the entire disjunction is true regardless of the value of (S \u2227 Q).\n\n3.  **Q:** This statement says Q is true.\n\nSince Q is true:\n\n* **(Q -> Q)** is true (since Q is true implies Q is true).\n\n* **(S ^ Q)** is true if and only if *both* S and Q are true.  We know Q is true, so if S is also true then this is true.  If S is false this is false.\n\n* **((Q -> Q) v (S ^ Q))** is true, since **(Q -> Q)** which is part of the disjunction is true. We already knew that from line 2.\n\nTherefore, from these premises, we can definitively conclude:\n\n*   **R** is true (given).\n*   **Q** is true (given).\n*   **(Q \u2192 Q) \u2228 (S \u2227 Q)** is true (given, and verifiable).\n*   **(Q -> Q)** is true (implied from Q being true, since this is a tautology).\n*   The truth value of S is undetermined.  If S is true, then (S ^ Q) is true.  If S is false, then (S ^ Q) is false.\n\nFinal Answer: The conclusions are **R**, **Q**, and **(Q \u2192 Q) \u2228 (S \u2227 Q)** are all true. Additionally, we know **(Q \u2192 Q)** is true. We cannot determine the truth value of S.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "R",
-            "((Q \u2192 Q) \u2228 (S \u2227 Q))",
-            "Q"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R",
-            "S"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. (Q \u2227 P)\n2. (\u00acP \u2227 (P \u2192 P))\n3. \u00ac(Q \u2194 Q)\nWhat can we conclude?",
-        "expected_answer": "(P \u2227 P)",
-        "model_answer": "Let's analyze each statement to see what we can derive:\n\n*   **1. (Q \u2227 P)** This statement tells us that both Q and P are true.\n    *   Q is True\n    *   P is True\n\n*   **2. (\u00acP \u2227 (P \u2192 P))** Let's break this down:\n    *   `\u00acP`: This means \"Not P\".\n    *   `P \u2192 P`: This is a tautology (always true).  If P is true, then P implies P is true. If P is false then P implies P is true.\n\n    So the statement is essentially `(\u00acP) \u2227 (True)`.  This simplifies to just `\u00acP`. Therefore, the statement tells us that P is false.\n\n*   **3. \u00ac(Q \u2194 Q)** The biconditional `Q \u2194 Q`  means \"Q if and only if Q\". This is always true, regardless of Q's value (Q is true, so \"true if and only if true\" is true; Q is false, so \"false if and only if false\" is true). Therefore \u00ac(Q \u2194 Q) always resolves to `\u00ac(True)`, meaning that `False`.\n\nNow we consider these conclusions together:\n\n*   From statement 1: `Q` is True and `P` is True.\n*   From statement 2: `P` is False.\n*   From statment 3: the statement is `False`.\n\nWe have a **Contradiction**.\n\nStatement 1 tells us that P is true, while Statement 2 tells us that P is false.  This means the entire set of statements is inconsistent.  Therefore, we can conclude the system is flawed.\nFinal Answer: The final answer is $\\boxed{Contradiction}$",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "(Q \u2227 P)",
-            "(\u00acP \u2227 (P \u2192 P))",
-            "\u00ac(Q \u2194 Q)"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. (R \u2228 Q)\n2. (R \u2227 Q)\n3. R\nWhat can we conclude?",
-        "expected_answer": "(P \u2194 P)",
-        "model_answer": "From the given statements:\n\n1.  (R \u2228 Q):  R or Q (or both) is true.\n2.  (R \u2227 Q):  R and Q are both true.\n3.  R: R is true.\n\nWe can conclude the following:\n\n*   **Q is true.**  Statement (R \u2227 Q) tells us that R and Q are both true.  Since statement 3 tells us that R is true, Q must also be true.\n\nTherefore, the main conclusion is **Q**.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "(R \u2228 Q)",
-            "(R \u2227 Q)",
-            "R"
-          ],
-          "variables": [
-            "P",
-            "Q",
-            "R"
-          ],
-          "complexity": 3
-        }
-      },
-      {
-        "question": "Given:\n1. ((P \u2228 Q) \u2228 (Q \u2192 Q))\n2. \u00ac(Q \u2192 Q)\n3. ((P \u2227 Q) \u2227 (Q \u2227 P))\n4. (Q \u2194 Q)\nWhat can we conclude?",
-        "expected_answer": "(P \u2192 P)",
-        "model_answer": "Let's analyze each statement:\n\n*   **1. ((P \u2228 Q) \u2228 (Q \u2192 Q))**:  `Q \u2192 Q` is always true (tautology). Therefore, `(P \u2228 Q) \u2228 True` is always true, regardless of the values of P and Q. So, this statement provides no specific information about P or Q.\n\n*   **2. \u00ac(Q \u2192 Q)**:  `Q \u2192 Q` is always true.  Therefore `\u00ac(Q \u2192 Q)` means \u00acTrue which is False. So, the statement asserts a contradiction.\n\n*   **3. ((P \u2227 Q) \u2227 (Q \u2227 P))**: This statement is equivalent to (P \u2227 Q).  Both (P \u2227 Q) and (Q \u2227 P) are true only when both P and Q are true.\n\n*   **4. (Q \u2194 Q)**: This statement is a tautology (always true). It provides no specific information.\n\nNow let's consider the implications of having all four statements:\n\nStatements 1 and 4 are always true, so they do not provide any useful information.  Statement 2 says that a tautology is false, which is a contradiction.  Statement 3 indicates that both P and Q are true.  However, statement 2 means the entire set of statements is contradictory.\n\nTherefore, we can conclude that the given set of premises is **inconsistent (or contradictory).**  You cannot simultaneously assert these four statements without creating a logical absurdity.\n",
-        "score": 0.01,
-        "metadata": {
-          "premises": [
-            "((P \u2228 Q) \u2228 (Q \u2192 Q))",
-            "\u00ac(Q \u2192 Q)",
-            "((P \u2227 Q) \u2227 (Q \u2227 P))",
-            "(Q \u2194 Q)"
-          ],
-          "variables": [
-            "P",
-            "Q"
-          ],
-          "complexity": 3
-        }
-      }
-    ]
-  },
-  {
-    "metrics": {
-      "dataset_name": "leg_counting",
-      "model": "google/gemini-2.0-flash-001",
-      "size": 10,
-      "average_score": 0.40199999999999997,
-      "total_examples": 10,
-      "timestamp": "2025-02-10T06:35:27.087469",
-      "config": {
-        "min_animals": 3,
-        "max_animals": 8,
-        "size": 10,
-        "seed": 42
-      }
-    },
-    "results": [
-      {
-        "question": "How many legs are there in total if you have 3 humans, 3 sea slugs, 3 deers, 1 giraffe, 3 elephants, 2 ducks, 1 beetle, 1 flatworm?",
-        "expected_answer": "44",
-        "model_answer": "Okay, let's break it down:\n\n*   **Humans:** 3 humans * 2 legs/human = 6 legs\n*   **Sea Slugs:** 3 sea slugs * 0 legs/sea slug = 0 legs\n*   **Deer:** 3 deer * 4 legs/deer = 12 legs\n*   **Giraffe:** 1 giraffe * 4 legs/giraffe = 4 legs\n*   **Elephants:** 3 elephants * 4 legs/elephant = 12 legs\n*   **Ducks:** 2 ducks * 2 legs/duck = 4 legs\n*   **Beetle:** 1 beetle * 6 legs/beetle = 6 legs\n*   **Flatworm:** 1 flatworm * 0 legs/flatworm = 0 legs\n\nNow, let's add them all up:\n\n6 + 0 + 12 + 4 + 12 + 4 + 6 + 0 = 44 legs\n\nSo, there are a total of 44 legs.",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 8
-          },
-          "animals": {
-            "human": 3,
-            "sea slug": 3,
-            "deer": 3,
-            "giraffe": 1,
-            "elephant": 3,
-            "duck": 2,
-            "beetle": 1,
-            "flatworm": 1
-          },
-          "total_legs": 44
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 sheeps, 3 dogs, 3 praying mantiss?",
-        "expected_answer": "38",
-        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Sheep:** 2 sheep * 4 legs/sheep = 8 legs\n*   **Dogs:** 3 dogs * 4 legs/dog = 12 legs\n*   **Praying Mantises:** 3 mantises * 6 legs/mantis = 18 legs\n\n**Total:** 8 + 12 + 18 = 38 legs\n",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 3
-          },
-          "animals": {
-            "sheep": 2,
-            "dog": 3,
-            "praying mantis": 3
-          },
-          "total_legs": 38
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 crabs, 1 lobster, 1 human, 1 cow, 3 bees, 1 elephant?",
-        "expected_answer": "58",
-        "model_answer": "Here's the breakdown:\n\n*   **Crabs:** 2 crabs * 10 legs/crab = 20 legs\n*   **Lobster:** 1 lobster * 10 legs/lobster = 10 legs\n*   **Human:** 1 human * 2 legs/human = 2 legs\n*   **Cow:** 1 cow * 4 legs/cow = 4 legs\n*   **Bees:** 3 bees * 6 legs/bee = 18 legs\n*   **Elephant:** 1 elephant * 4 legs/elephant = 4 legs\n\n**Total:** 20 + 10 + 2 + 4 + 18 + 4 = **58 legs**",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 6
-          },
-          "animals": {
-            "crab": 2,
-            "lobster": 1,
-            "human": 1,
-            "cow": 1,
-            "bee": 3,
-            "elephant": 1
-          },
-          "total_legs": 58
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 grasshoppers, 1 spider, 1 tiger, 2 chickens, 1 starfish?",
-        "expected_answer": "33",
-        "model_answer": "Here's the breakdown:\n\n*   **Grasshoppers:** 2 grasshoppers * 6 legs/grasshopper = 12 legs\n*   **Spider:** 1 spider * 8 legs/spider = 8 legs\n*   **Tiger:** 1 tiger * 4 legs/tiger = 4 legs\n*   **Chickens:** 2 chickens * 2 legs/chicken = 4 legs\n*   **Starfish:** 1 starfish * 0 legs/starfish = 0 legs (starfish have tube feet but not legs)\n\n**Total:** 12 + 8 + 4 + 4 + 0 = **28 legs**\n",
-        "score": 0.01,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 5
-          },
-          "animals": {
-            "grasshopper": 2,
-            "spider": 1,
-            "tiger": 1,
-            "chicken": 2,
-            "starfish": 1
-          },
-          "total_legs": 33
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 3 wasps, 3 jellyfishs, 1 elephant?",
-        "expected_answer": "22",
-        "model_answer": "Here's how to solve this:\n\n*   **Wasps:** Each wasp has 6 legs, so 3 wasps have 3 * 6 = 18 legs.\n*   **Jellyfish:** Jellyfish don't have legs, so they have 0 legs.\n*   **Elephant:** An elephant has 4 legs.\n\n**Total:** 18 + 0 + 4 = 22 legs\n\n**Answer:** There are a total of 22 legs.",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 3
-          },
-          "animals": {
-            "wasp": 3,
-            "jellyfish": 3,
-            "elephant": 1
-          },
-          "total_legs": 22
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 leechs, 2 crickets, 3 shrimps, 2 praying mantiss, 2 woodlouses?",
-        "expected_answer": "82",
-        "model_answer": "Here's the breakdown of the legs:\n\n*   **Leeches:** 0 legs each, so 2 leeches have 2 * 0 = 0 legs\n*   **Crickets:** 6 legs each, so 2 crickets have 2 * 6 = 12 legs\n*   **Shrimps:** 10 legs each, so 3 shrimps have 3 * 10 = 30 legs\n*   **Praying Mantises:** 6 legs each, so 2 praying mantises have 2 * 6 = 12 legs\n*   **Woodlice (Pillbugs/Roly-Polies):** 14 legs each, so 2 woodlice have 2 * 14 = 28 legs\n\nTotal: 0 + 12 + 30 + 12 + 28 = 82 legs\n\nSo the answer is 82",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 5
-          },
-          "animals": {
-            "leech": 2,
-            "cricket": 2,
-            "shrimp": 3,
-            "praying mantis": 2,
-            "woodlouse": 2
-          },
-          "total_legs": 82
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 insects, 1 duck, 1 lobster, 3 starfishs, 1 scorpion, 2 horses, 3 butterflys?",
-        "expected_answer": "73",
-        "model_answer": "Here's the breakdown:\n\n*   **Insects:** 2 insects * 6 legs/insect = 12 legs\n*   **Duck:** 1 duck * 2 legs/duck = 2 legs\n*   **Lobster:** 1 lobster * 10 legs/lobster = 10 legs\n*   **Starfish:** 3 starfish * 0 legs/starfish = 0 legs\n*   **Scorpion:** 1 scorpion * 8 legs/scorpion = 8 legs\n*   **Horses:** 2 horses * 4 legs/horse = 8 legs\n*   **Butterflies:** 3 butterflies * 6 legs/butterfly = 18 legs\n\n**Total:** 12 + 2 + 10 + 0 + 8 + 8 + 18 = **58 legs**",
-        "score": 0.01,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 7
-          },
-          "animals": {
-            "insect": 2,
-            "duck": 1,
-            "lobster": 1,
-            "starfish": 3,
-            "scorpion": 1,
-            "horse": 2,
-            "butterfly": 3
-          },
-          "total_legs": 73
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 2 butterflys, 3 grasshoppers, 3 humans?",
-        "expected_answer": "36",
-        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Butterflies:** 2 butterflies * 6 legs/butterfly = 12 legs\n*   **Grasshoppers:** 3 grasshoppers * 6 legs/grasshopper = 18 legs\n*   **Humans:** 3 humans * 2 legs/human = 6 legs\n\n**Total:** 12 + 18 + 6 = 36 legs\n\nThere are a total of 36 legs.",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 3
-          },
-          "animals": {
-            "butterfly": 2,
-            "grasshopper": 3,
-            "human": 3
-          },
-          "total_legs": 36
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 3 deers, 2 beetles, 1 giraffe, 3 fireflys, 3 ants, 1 chicken?",
-        "expected_answer": "66",
-        "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Deer:** 3 deer * 4 legs/deer = 12 legs\n*   **Beetles:** 2 beetles * 6 legs/beetle = 12 legs\n*   **Giraffe:** 1 giraffe * 4 legs/giraffe = 4 legs\n*   **Fireflys:** 3 fireflys * 6 legs/firefly = 18 legs\n*   **Ants:** 3 ants * 6 legs/ant = 18 legs\n*   **Chicken:** 1 chicken * 2 legs/chicken = 2 legs\n\n**Total:** 12 + 12 + 4 + 18 + 18 + 2 = **66 legs**",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 6
-          },
-          "animals": {
-            "deer": 3,
-            "beetle": 2,
-            "giraffe": 1,
-            "firefly": 3,
-            "ant": 3,
-            "chicken": 1
-          },
-          "total_legs": 66
-        }
-      },
-      {
-        "question": "How many legs are there in total if you have 1 scorpion, 3 shrimps, 2 cats, 2 giraffes?",
-        "expected_answer": "54",
-        "model_answer": "Let's count the legs of each animal and then add them up:\n\n*   **Scorpion:** 8 legs\n*   **Shrimp:** 10 legs\n*   **Cat:** 4 legs\n*   **Giraffe:** 4 legs\n\nNow let's calculate the total:\n\n*   1 scorpion * 8 legs/scorpion = 8 legs\n*   3 shrimps * 10 legs/shrimp = 30 legs\n*   2 cats * 4 legs/cat = 8 legs\n*   2 giraffes * 4 legs/giraffe = 8 legs\n\nFinally, add them all together: 8 + 30 + 8 + 8 = 54 legs\n\nSo the answer is $\\boxed{54}$\n",
-        "score": 0.5,
-        "metadata": {
-          "difficulty": {
-            "num_animals": 4
-          },
-          "animals": {
-            "scorpion": 1,
-            "shrimp": 3,
-            "cat": 2,
-            "giraffe": 2
-          },
-          "total_legs": 54
-        }
-      }
-    ]
-  }
-]
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
deleted file mode 100644
index cb8b45cc..00000000
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "leg_counting",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0,
-    "total_examples": 0,
-    "timestamp": "2025-02-10T06:06:08.539389"
-  },
-  "results": []
-}
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
deleted file mode 100644
index 16bd308e..00000000
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
+++ /dev/null
@@ -1,186 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "leg_counting",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0.45099999999999996,
-    "total_examples": 10,
-    "timestamp": "2025-02-10T06:07:53.290258"
-  },
-  "results": [
-    {
-      "question": "How many legs are there in total if you have 1 sea slug, 1 deer?",
-      "expected_answer": "4",
-      "model_answer": "*   A sea slug has no legs.\n*   A deer has 4 legs.\n\nTherefore, there are a total of 0 + 4 = $\\boxed{4}$ legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "sea slug": 1,
-          "deer": 1
-        },
-        "total_legs": 4
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 2 sheeps, 2 dogs?",
-      "expected_answer": "16",
-      "model_answer": "* Sheep have 4 legs each, so 2 sheep have 2 * 4 = 8 legs\n* Dogs have 4 legs each, so 2 dogs have 2 * 4 = 8 legs\n\nIn total, there are 8 + 8 = **16** legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "sheep": 2,
-          "dog": 2
-        },
-        "total_legs": 16
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 crab, 2 lobsters, 1 human, 1 cow, 1 bee?",
-      "expected_answer": "42",
-      "model_answer": "Here's the breakdown:\n\n*   **Crab:** 10 legs\n*   **Lobster:** 10 legs each, so 2 * 10 = 20 legs\n*   **Human:** 2 legs\n*   **Cow:** 4 legs\n*   **Bee:** 6 legs\n\n**Total:** 10 + 20 + 2 + 4 + 6 = **42 legs**",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 5
-        },
-        "animals": {
-          "crab": 1,
-          "lobster": 2,
-          "human": 1,
-          "cow": 1,
-          "bee": 1
-        },
-        "total_legs": 42
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 2 grasshoppers, 2 spiders, 1 tiger, 1 chicken?",
-      "expected_answer": "34",
-      "model_answer": "Here's the breakdown:\n\n*   **Grasshopper:** 6 legs each, so 2 grasshoppers have 2 * 6 = 12 legs\n*   **Spider:** 8 legs each, so 2 spiders have 2 * 8 = 16 legs\n*   **Tiger:** 4 legs\n*   **Chicken:** 2 legs\n\nTotal: 12 + 16 + 4 + 2 = 34 legs\n\nThere are a total of $\\boxed{34}$ legs.",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 4
-        },
-        "animals": {
-          "grasshopper": 2,
-          "spider": 2,
-          "tiger": 1,
-          "chicken": 1
-        },
-        "total_legs": 34
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 3 wasps, 3 jellyfishs?",
-      "expected_answer": "18",
-      "model_answer": "*   **Wasps:** Each wasp has 6 legs, so 3 wasps have 3 * 6 = 18 legs.\n*   **Jellyfish:** Jellyfish do not have legs.\n\n**Total:** 18 + 0 = 18 legs\n\nThere are a total of $\\boxed{18}$ legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "wasp": 3,
-          "jellyfish": 3
-        },
-        "total_legs": 18
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 3 leechs, 2 crickets, 2 shrimps, 3 praying mantiss?",
-      "expected_answer": "50",
-      "model_answer": "Here's how to calculate the total number of legs:\n\n*   **Leeches:** Leeches don't have legs.\n*   **Crickets:** 2 Crickets * 6 legs/cricket = 12 legs\n*   **Shrimp:** 2 Shrimps * 10 legs/shrimp = 20 legs\n*   **Praying Mantises:** 3 Mantises * 6 legs/mantis = 18 legs\n\n**Total:** 0 + 12 + 20 + 18 = 50 legs\n\nThere are a total of 50 legs.",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 4
-        },
-        "animals": {
-          "leech": 3,
-          "cricket": 2,
-          "shrimp": 2,
-          "praying mantis": 3
-        },
-        "total_legs": 50
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 3 ducks, 1 shrimp, 3 lobsters, 2 starfishs?",
-      "expected_answer": "56",
-      "model_answer": "Here's how to calculate the total number of legs:\n\n*   Ducks: 3 ducks * 2 legs/duck = 6 legs\n*   Shrimp: 1 shrimp * 10 legs/shrimp = 10 legs\n*   Lobsters: 3 lobsters * 10 legs/lobster = 30 legs\n*   Starfish: 2 starfish * 0 legs/starfish = 0 legs\n\nTotal: 6 + 10 + 30 + 0 = 46 legs\n\nThere are a total of 46 legs.",
-      "score": 0.01,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 4
-        },
-        "animals": {
-          "duck": 3,
-          "shrimp": 1,
-          "lobster": 3,
-          "starfish": 2
-        },
-        "total_legs": 56
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 butterfly, 2 grasshoppers?",
-      "expected_answer": "18",
-      "model_answer": "* A butterfly has 6 legs.\n* A grasshopper has 6 legs.\n\nSo, 1 butterfly has 6 legs, and 2 grasshoppers have 2 * 6 = 12 legs.\n\nIn total, there are 6 + 12 = $\\boxed{18}$ legs.\n",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 2
-        },
-        "animals": {
-          "butterfly": 1,
-          "grasshopper": 2
-        },
-        "total_legs": 18
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 deer, 3 beetles, 2 giraffes, 1 firefly, 3 ants?",
-      "expected_answer": "54",
-      "model_answer": "Let's count the legs for each animal:\n* Deer: 1 deer * 4 legs/deer = 4 legs\n* Beetles: 3 beetles * 6 legs/beetle = 18 legs\n* Giraffes: 2 giraffes * 4 legs/giraffe = 8 legs\n* Firefly: 1 firefly * 6 legs/firefly = 6 legs\n* Ants: 3 ants * 6 legs/ant = 18 legs\n\nNow, add up the legs for all the animals:\n4 + 18 + 8 + 6 + 18 = 54 legs\n\nTherefore, there are a total of 54 legs.\n\nFinal Answer: The final answer is $\\boxed{54}$",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 5
-        },
-        "animals": {
-          "deer": 1,
-          "beetle": 3,
-          "giraffe": 2,
-          "firefly": 1,
-          "ant": 3
-        },
-        "total_legs": 54
-      }
-    },
-    {
-      "question": "How many legs are there in total if you have 1 scorpion, 1 shrimp, 3 cats?",
-      "expected_answer": "30",
-      "model_answer": "Here's how to figure that out:\n\n*   **Scorpion:** Scorpions have 8 legs.\n*   **Shrimp:** Shrimp have 10 legs.\n*   **Cats:** Cats have 4 legs each, so 3 cats have 3 * 4 = 12 legs.\n\n**Total:** 8 + 10 + 12 = 30 legs\n\nThere are a total of 30 legs.",
-      "score": 0.5,
-      "metadata": {
-        "difficulty": {
-          "num_animals": 3
-        },
-        "animals": {
-          "scorpion": 1,
-          "shrimp": 1,
-          "cat": 3
-        },
-        "total_legs": 30
-      }
-    }
-  ]
-}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
deleted file mode 100644
index 49697c4b..00000000
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "propositional_logic",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0,
-    "total_examples": 0,
-    "timestamp": "2025-02-10T06:06:10.638347"
-  },
-  "results": []
-}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
deleted file mode 100644
index 13f80c7a..00000000
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
+++ /dev/null
@@ -1,201 +0,0 @@
-{
-  "metrics": {
-    "dataset_name": "propositional_logic",
-    "model": "google/gemini-2.0-flash-001",
-    "size": 10,
-    "seed": 42,
-    "average_score": 0.059,
-    "total_examples": 10,
-    "timestamp": "2025-02-10T06:08:24.037697"
-  },
-  "results": [
-    {
-      "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
-      "expected_answer": "(P \u2228 Q)",
-      "model_answer": "Given only R and Q, we can conclude that **R and Q are true** (according to propositional logic).\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "R",
-          "Q"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R",
-          "S"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. ((Q \u2192 P) \u2228 (Q \u2192 P))\n2. ((Q \u2194 Q) \u2192 (P \u2192 P))\n3. P\nWhat can we conclude?",
-      "expected_answer": "(P \u2192 P)",
-      "model_answer": "Let's break down each premise and see what we can derive.\n\n* **1. ((Q \u2192 P) \u2228 (Q \u2192 P))** This is logically equivalent to (Q \u2192 P).  This is because any statement 'A or A' is simply 'A'.  So we now have:  **Q \u2192 P**\n\n* **2. ((Q \u2194 Q) \u2192 (P \u2192 P))**  The statement (Q \u2194 Q) is a tautology (always true).  The statement (P \u2192 P) is also a tautology.  Therefore, (True \u2192 True) is True. This premise is always true and doesn't provide us with any new information that helps us directly derive something else.  We can essentially ignore it.\n\n* **3. P** This is simply **P**.\n\nNow we have:\n\n* Q \u2192 P\n* P\n\nFrom these two statements, we can't directly conclude anything.\n\nRemember the Modus Ponens and Modus Tollens rules:\n\n*   **Modus Ponens:** If you have P \u2192 Q and P, you can conclude Q.\n*   **Modus Tollens:** If you have P \u2192 Q and \u00acQ, you can conclude \u00acP.\n\nIn our case, we have Q \u2192 P and P.   This matches the *reverse* of Modus Ponens, which is *not* a valid inference rule.  Knowing that the *consequent* (P) is true *doesn't* tell us anything definitively about the *antecedent* (Q).  Q could be either true or false.\n\nTherefore, based on the given premises, we cannot conclude anything further about Q's truth value.\n",
-      "score": 0.5,
-      "metadata": {
-        "premises": [
-          "((Q \u2192 P) \u2228 (Q \u2192 P))",
-          "((Q \u2194 Q) \u2192 (P \u2192 P))",
-          "P"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. ((Q \u2228 P) \u2227 \u00acP)\n2. P\n3. ((P \u2227 R) \u2227 \u00acR)\n4. ((Q \u2194 R) \u2192 \u00acQ)\nWhat can we conclude?",
-      "expected_answer": "(Q \u2227 Q)",
-      "model_answer": "Let's analyze each given statement:\n\n1. **((Q \u2228 P) \u2227 \u00acP)**:  This says that either Q or P is true *and* P is false.  Since P is false, Q must be true.  So, we can conclude **Q**.\n\n2. **P**:  This simply states that P is true.\n\n3. **((P \u2227 R) \u2227 \u00acR)**: This reads as \"Both P and R are true, *and* R is false.\"  This is a contradiction. Since P and R are true, you cannot conclude that R is false.\n\n4. **((Q \u2194 R) \u2192 \u00acQ)**: This says \"If Q is equivalent to R (i.e., they have the same truth value), then Q is false.\"\n\nNow, let's see what we can derive, considering information from all the premises.\n\n* **From premise 1:**  We *appear* to conclude Q.  However premise 2 states that P is true. This contradicts the information provided in premise 1 that states P is false. Therefore, we have contradictory information. \n\n* **Premise 3 is a contradiction.**\n\nLet's consider the assumptions independently.\n\n* **Assuming premise 1 is true in isolation:**  Then, P is false and Q is true.\n\n* **Assuming premise 2 is true in isolation:** Then, P is true.\n\n* **Assuming premise 3 is true in isolation:**  Then, P and R are true and R is false. This is itself a contradiction and therefore cannot be true in isolation.\n\n* **Assuming premise 4 is true in isolation:**  It tells us the dependence between the truth values of Q and R, but doesn't give us the true/false result of either variable on their own.\n\nSince we have contradicting information in the premises, we must be able to reach a conclusion.\n\n**Analyzing for Contradiction:**\n\nSince premise 1 stated that Q is true only if \u00acP is True, yet premise 2 asserts that P is True, then we know that premise (1\u22272) \u22a2 False.\n\nPremise 3 also claims that ((P \u2227 R) \u2227 \u00acR) is true which again cannot be true since (R\u2227\u00acR) evaluates to False.\n\nPremise 4 will only create a contradiction if Q \u2194 R evaluates to True but \u00acQ evaluates to False. Since Q is true from Premise 1, this implies that R must also be True, yet we cannot determine whether this is indeed true or false. We can only determine this, if we remove the contradictory information given regarding Q and R across all the premises.\n\n**Conclusion with contradiction:**\n\nWith contradictory premises, we can conclude *anything*.  This is a principle of classical logic called \"ex falso quodlibet\" (from falsehood, anything follows), often simplified as \"explosion\".  Since the collection of premises contains a contradiction, we can use it derive any conclusion.\n\n**Example:**\n\n1. Q is true (from premise 1, under the assumption it's only true premise)\n2. P is true is also true (from premise 2 under the assumption it's the only true premise)\n3. P and R is true and R is false (from premise 3 with the same provision).\n4. (Q \u2194 R) \u2192 \u00acQ (from premise 4 and depends on the premise results from the above to determine whether Q is true, but we can only determine that when the contradictory information is omitted from the statement/statements/premises).\n\nBecause premises do not provide consistent rules to determine truth values of P, Q, R, and therefore contradiction, then anything follows.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "((Q \u2228 P) \u2227 \u00acP)",
-          "P",
-          "((P \u2227 R) \u2227 \u00acR)",
-          "((Q \u2194 R) \u2192 \u00acQ)"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. \u00acP\n2. \u00acP\n3. Q\nWhat can we conclude?",
-      "expected_answer": "(Q \u2228 R)",
-      "model_answer": "Given the premises:\n\n1.  \u00acP\n2.  \u00acP\n3.  Q\n\nWe can conclude **Q**.\n\nThe repetition of \u00acP doesn't add any new information. This is a very simple case where 'Q' is stated directly in the premises. There's no need for any complex logical deductions.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "\u00acP",
-          "\u00acP",
-          "Q"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. P\n2. ((P \u2228 P) \u2194 (Q \u2227 P))\n3. P\nWhat can we conclude?",
-      "expected_answer": "(Q \u2194 Q)",
-      "model_answer": "Let's analyze the given statements:\n\n1. P is true.\n2. ((P \u2228 P) \u2194 (Q \u2227 P)) - This statement tells us that the truth value of (P \u2228 P) is equivalent to the truth value of (Q \u2227 P).\n\nSince we know P is true from premise 1, we can simplify:\n\n* **P \u2228 P is true:**  Because P is true, disjoining it with itself makes the disjunction true.\n* **Since (P \u2228 P) \u2194 (Q \u2227 P) is true and (P \u2228 P) is true, then (Q \u2227 P) must also be true.**\n* **(Q \u2227 P) is true, and P is true, therefore Q must be true.**  For the conjunction (Q \u2227 P) to be true, both Q and P must be true.\n\nTherefore, we can conclude that **Q is true**.\n\nSo the conclusion includes P and Q.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "P",
-          "((P \u2228 P) \u2194 (Q \u2227 P))",
-          "P"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. (Q \u2194 R)\n2. \u00acR\nWhat can we conclude?",
-      "expected_answer": "(Q \u2192 R)",
-      "model_answer": "From the given premises:\n\n1.  Q \u2194 R (Q is equivalent to R)\n2.  \u00acR (R is false)\n\nSince Q is equivalent to R, this means that if R is false, then Q must also be false.\n\nTherefore, we can conclude:\n\n\u00acQ (Q is false)\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "(Q \u2194 R)",
-          "\u00acR"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. R\n2. ((Q \u2192 Q) \u2228 (S \u2227 Q))\n3. Q\nWhat can we conclude?",
-      "expected_answer": "(S \u2228 Q)",
-      "model_answer": "Let's analyze the given statements:\n\n*   **1. R** - This is a simple assertion: R is true.\n\n*   **2. ((Q \u2192 Q) \u2228 (S \u2227 Q))** -  Let's break this down:\n    *   `(Q \u2192 Q)`: This is a tautology.  Any proposition implying itself is always true.\n    *   `(S \u2227 Q)`: This is true if and only if both S and Q are true.\n    *   `\u2228`:  The entire statement is true if either `(Q \u2192 Q)` is true, or `(S \u2227 Q)` is true.  Since `(Q \u2192 Q)` is always true, this entire statement is always true, regardless of the truth value of S.\n\n*   **3. Q** - This asserts that Q is true.\n\nNow, what can we conclude?\n\n*   **We know R is true.** (Directly from statement 1).\n*   **We know Q is true.** (Directly from statement 3).\n\nFrom the second statement, `((Q \u2192 Q) \u2228 (S \u2227 Q))`, we can note the following:\n\n*   Because `Q \u2192 Q` is always true (a tautology), `((Q \u2192 Q) \u2228 (S \u2227 Q))` is always true, regardless of the truth value of S. This statement doesn't help us conclude anything new about S, but it does provide some information about the relationship between Q and itself, and Q and S.\n* Because we also know that Q is true, and that `((Q \u2192 Q) \u2228 (S \u2227 Q))` is true, the `(S \u2227 Q)` portion could be true or false. If it's true, then S must also be true. If it's false, then S must be false. However, because `((Q \u2192 Q) \u2228 (S \u2227 Q))` is true, and Q is true, we cannot know for sure if S must be true or false. \n\nTherefore, the key conclusions we can draw are:\n\n*   **R is true.**\n*   **Q is true.**\n\nWe **cannot** definitively conclude anything about the truth value of S.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "R",
-          "((Q \u2192 Q) \u2228 (S \u2227 Q))",
-          "Q"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R",
-          "S"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. (Q \u2227 P)\n2. (\u00acP \u2227 (P \u2192 P))\n3. \u00ac(Q \u2194 Q)\nWhat can we conclude?",
-      "expected_answer": "(P \u2227 P)",
-      "model_answer": "Let's analyze each statement:\n\n1. **(Q \u2227 P):** This statement tells us that both Q and P are true.\n\n2. **(\u00acP \u2227 (P \u2192 P)):**\n    * `\u00acP`: This means P is false.\n    * `(P \u2192 P)`: This is always true (a tautology).  Any proposition implies itself.\n\n3. **\u00ac(Q \u2194 Q):**\n    * `(Q \u2194 Q)`:  This means \"Q if and only if Q\". This is always true (a tautology).\n    * `\u00ac(Q \u2194 Q)`: This means \"not (Q if and only if Q)\".  Since `(Q \u2194 Q)` is always true, its negation is always false.\n\nNow let's look for contradictions:\n\n* Statement 1 says P is true.\n* Statement 2 says P is false.\n\nTherefore, **we have a contradiction**. The set of premises is **inconsistent**.  We can conclude that the statements cannot all be true simultaneously; there's no truth assignment that will satisfy all three statements.\n\nTherefore, we can conclude that the premises are inconsistent.\n\n**Final Answer: The set of premises is inconsistent.**\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "(Q \u2227 P)",
-          "(\u00acP \u2227 (P \u2192 P))",
-          "\u00ac(Q \u2194 Q)"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. (R \u2228 Q)\n2. (R \u2227 Q)\n3. R\nWhat can we conclude?",
-      "expected_answer": "(P \u2194 P)",
-      "model_answer": "From the given statements:\n\n*   **1. (R \u2228 Q)** - This tells us that R or Q (or both) is true.\n*   **2. (R \u2227 Q)** - This tells us that both R and Q are true.\n*   **3. R** - This tells us that R is true.\n\nWhile statement 3. R is already confirmed in statement 2., so, knowing that both R and Q are true from statement 2, we can definitively conclude:\n\n*   **Q** is true.\n\nHere's a breakdown:\n\nStatement 2 tells us R \u2227 Q.  For a conjunction (\u2227) to be true, both parts must be true therefore R and Q must be true.\n\nSo, the most direct and strongest conclusion we can make is:\n\n*  **Q is true**\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "(R \u2228 Q)",
-          "(R \u2227 Q)",
-          "R"
-        ],
-        "variables": [
-          "P",
-          "Q",
-          "R"
-        ],
-        "complexity": 3
-      }
-    },
-    {
-      "question": "Given:\n1. ((P \u2228 Q) \u2228 (Q \u2192 Q))\n2. \u00ac(Q \u2192 Q)\n3. ((P \u2227 Q) \u2227 (Q \u2227 P))\n4. (Q \u2194 Q)\nWhat can we conclude?",
-      "expected_answer": "(P \u2192 P)",
-      "model_answer": "Let's analyze each statement:\n\n1.  **((P \u2228 Q) \u2228 (Q \u2192 Q))**:  We know that `Q \u2192 Q` is always true (a tautology). Therefore, `(Q \u2192 Q)` is equivalent to `T` (True). The statement becomes `((P \u2228 Q) \u2228 T)`.  Since anything ORed with True is always True, this whole statement simplifies to **T (True)**.  It tells us nothing about the values of P or Q.\n\n2.  **\u00ac(Q \u2192 Q)**: We know that `Q \u2192 Q` is always true.  Therefore, `\u00ac(Q \u2192 Q)` is equivalent to `\u00acT`, which is **F (False)**.  This is a contradiction given that the argument is valid.\n\n3.  **((P \u2227 Q) \u2227 (Q \u2227 P))**:  This statement is equivalent to `(P \u2227 Q)`. This is because `P \u2227 Q` is logically equivalent to `Q \u2227 P`. So, we can say that `P` is True **AND** `Q` is True.\n\n4.  **(Q \u2194 Q)**:  This is a biconditional, meaning \"Q if and only if Q\".  This statement is always true (a tautology) regardless of the value of Q. So this is equivalent to **T (True)**, and again, it tells us nothing about the value of Q.\n\n**Conclusion**:\n\nThe argument presents contradictory statements:\n\nStatement 1: Always True.\nStatement 2: Always False.\nStatement 3: P is true AND Q is true.\nStatement 4: Always true.\n\nStatement 2 contradicts both statement 1 and statement 4.\n\n**Therefore, we can conclude that the set of premises is inconsistent due to the contradiction in premise 2 \"\u00ac(Q \u2192 Q)\".**\n\nWithout premise 2, we could conclude that P is true AND Q is true from premise 3. The logical equivalence `(P \u2227 Q) \u2194 (Q \u2194 Q)` can be derived by simply checking truth tables.\n",
-      "score": 0.01,
-      "metadata": {
-        "premises": [
-          "((P \u2228 Q) \u2228 (Q \u2192 Q))",
-          "\u00ac(Q \u2192 Q)",
-          "((P \u2227 Q) \u2227 (Q \u2227 P))",
-          "(Q \u2194 Q)"
-        ],
-        "variables": [
-          "P",
-          "Q"
-        ],
-        "complexity": 3
-      }
-    }
-  ]
-}
diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json
new file mode 100644
index 00000000..32086f46
--- /dev/null
+++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_224813.json
@@ -0,0 +1,39 @@
+[
+  {
+    "dataset_name": "letter_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.20600000000000002,
+    "total_examples": 10,
+    "timestamp": "2025-02-09T22:47:25.934820",
+    "config": {
+      "min_words": 5,
+      "max_words": 15,
+      "size": 10,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "propositional_logic",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.059,
+    "total_examples": 10,
+    "timestamp": "2025-02-09T22:47:57.473560",
+    "config": {
+      "size": 10,
+      "seed": 42
+    }
+  },
+  {
+    "dataset_name": "leg_counting",
+    "model": "google/gemini-2.0-flash-001",
+    "average_score": 0.40199999999999997,
+    "total_examples": 10,
+    "timestamp": "2025-02-09T22:48:13.546006",
+    "config": {
+      "min_animals": 3,
+      "max_animals": 8,
+      "size": 10,
+      "seed": 42
+    }
+  }
+]