Minor question template & score_answer improvements (#261)

* math prompt improvements
* ignore brackets in complex_arithmetic results
* improve additional instruction in prompt of polynomial_equations
* more strict tests for score_answer in polynomial_equations
* simplify special reward handling
* fix test_intermediate_integration
* fix sokoban dataset
* add common dataset score_answer consistency test
2026-04-22 16:49:06 +00:00 · 2025-03-04 21:55:09 +01:00 · 2025-03-04 21:55:09 +01:00 · 5d7fbac0ad
commit 5d7fbac0ad
parent 061282e373
106 changed files with 403 additions and 507 deletions
--- a/reasoning_gym/code/bf.py
+++ b/reasoning_gym/code/bf.py
@ -121,20 +121,23 @@ int main() {{
            float: The computed score between 0.0 and 1.0.
        """

-        if answer == None:
+        if not isinstance(answer, str):
            return 0.0
-        if answer != entry["answer"]:
-            if entry["answer"] in answer.splitlines():
-                # We can be quite confident that the correct answer was given
-                # It was likely just given alongside an explanation
-                return max(0.9 * len(answer) / len(entry["answer"]), 0.1)
-            if entry["answer"] in answer:
-                # Since answers are English words, some risk of the response coincidentally containing the answer
-                return max(0.5 * len(answer) / len(entry["answer"]), 0.1)
-            return 0.01
-        else:
+
+        if answer == entry["answer"]:
            return 1.0  # Yay

+        if entry["answer"] in answer.splitlines():
+            # We can be quite confident that the correct answer was given
+            # It was likely just given alongside an explanation
+            return max(0.9 * len(answer) / len(entry["answer"]), 0.1)
+
+        if entry["answer"] in answer:
+            # Since answers are English words, some risk of the response coincidentally containing the answer
+            return max(0.5 * len(answer) / len(entry["answer"]), 0.1)
+
+        return 0.0
+

 # Register the dataset
 register_dataset("bf", BFDataset, BFConfig)