Formatting/scoring improvements for BF & family

2026-04-23 16:55:05 +00:00 · 2025-02-17 19:08:15 +00:00 · 2025-02-17 19:08:15 +00:00 · 0de0044d52
commit 0de0044d52
parent b40c44059d
2 changed files with 12 additions and 4 deletions
--- a/reasoning_gym/code/bf.py
+++ b/reasoning_gym/code/bf.py
@ -28,7 +28,8 @@ class BFDataset(ProceduralDataset):

    def __init__(self, config: BFConfig):
        self._prompt_templates = [
-            "This is a BF (Brainf*ck) computer program. What is the output? \n\n{bf_program}",
+            "This is a BF (Brainf*ck) computer program. What is the output?\n\n{bf_program}\n\nRespond only with the exact output of the program.",
+            "Consider the following BF (Brainf*ck) code. What would it output?\n\n{bf_program}\n\nProvide only the exact output of the code.",
        ]
        super().__init__(config=config, seed=config.seed, size=config.size)

@ -123,6 +124,13 @@ int main() {{
        if answer == None:
            return 0.0
        if answer != entry["answer"]:
+            if entry["answer"] in answer.splitlines():
+                # The correct answer sits on its own line, most likely alongside an explanation.
+                # Scale by expected/actual length: extra text lowers the score, which stays below 0.9.
+                return 0.9 * len(entry["answer"]) / len(answer)
+            if entry["answer"] in answer:
+                # Answers are English words, so a substring hit may be coincidental: stays below 0.5.
+                return 0.5 * len(entry["answer"]) / len(answer)
            return 0.01
        else:
            return 1.0  # Yay