From 99bf648989565a7209034f6ad9abecdf08ed689c Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 15:38:03 +0100
Subject: [PATCH 01/94] initial bf working, contrib not committed

---
 .gitignore                     |  3 ++
 reasoning_gym/code/__init__.py |  7 +++
 reasoning_gym/code/bf.py       | 96 ++++++++++++++++++++++++++++++++++
 tests/test_bf.py               | 37 +++++++++++++
 4 files changed, 143 insertions(+)
 create mode 100644 reasoning_gym/code/__init__.py
 create mode 100644 reasoning_gym/code/bf.py
 create mode 100644 tests/test_bf.py

diff --git a/.gitignore b/.gitignore
index be4071bb..c3ff9440 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,6 @@ ENV/
 .coverage
 htmlcov/
 .pytest_cache/
+
+# OSX
+.DS_Store
diff --git a/reasoning_gym/code/__init__.py b/reasoning_gym/code/__init__.py
new file mode 100644
index 00000000..19aca9df
--- /dev/null
+++ b/reasoning_gym/code/__init__.py
@@ -0,0 +1,7 @@
+"""
+Code tasks for training reasoning capabilities:
+- Code Analysis
+- Code Interpretation
+- Code Execution
+"""
+
diff --git a/reasoning_gym/code/bf.py b/reasoning_gym/code/bf.py
new file mode 100644
index 00000000..9f2f3381
--- /dev/null
+++ b/reasoning_gym/code/bf.py
@@ -0,0 +1,96 @@
+from dataclasses import dataclass
+from random import Random
+from typing import Dict, Optional
+
+import bfi
+from .contrib.bfit.Compiler import Compiler, Minify
+
+from ..data.wordle_words import wordle_words
+from ..factory import ProceduralDataset, register_dataset
+
+
+@dataclass
+class BFConfig:
+    """Configuration for BF (Brainf*ck) task generation"""
+
+    seed: Optional[int] = None  # passed to ProceduralDataset as `seed` by BFDataset.__init__
+    size: int = 500  # passed to ProceduralDataset as `size` — presumably the number of items; confirm in factory
+
+
+class BFDataset(ProceduralDataset):
+    """Generates BF (Brainf*ck) "what does this program output?" tasks"""
+
+    def __init__(self, config: BFConfig):
+        self._prompt_templates = [
+            "This is a BF (Brainf*ck) computer program. What is the output? \n\n{bf_program}",
+        ]
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Generate a single BF task
+
+        Returns:
+            dict with keys:
+                - question: str, a prompt template filled in with the compiled BF program
+                - answer: the output returned by bfi.interpret for that program
+                - metadata: dict with the BFit source ("bfit_code") and the compiled "bf_program"
+        """
+        rng = Random(self.seed + idx)
+
+        bfit_code = self.generate_bfit_code(rng)
+        bf_program = self.compile_bfit_code_to_bf(bfit_code)
+
+        result = bfi.interpret(bf_program, buffer_output=True)
+
+        return {
+            "question": rng.choice(self._prompt_templates).format(bf_program=bf_program),
+            "answer": result,
+            "metadata": {"bfit_code": bfit_code, "bf_program": bf_program},
+        }
+
+    def generate_bfit_code(self, rng: Random) -> str:
+        """Return the BFit (C-like) source of a task; currently one fixed program, `rng` is not used yet."""
+        bfit_template = """
+int main() {
+    int acc = 0;
+    int target = 15;
+    int x = 2;
+    int y = 3;
+    while (acc < target) {
+        acc = acc + x;
+        acc = acc + y;
+    }
+    printint(acc);
+}
+"""
+        rendered_bfit = bfit_template
+        return rendered_bfit
+
+    def compile_bfit_code_to_bf(self, bfit: str) -> str:
+        bf = Compiler.compile(bfit, optimize_code=True)
+        # bf = Minify.minify(bf) # Is this necessary?
+        return bf
+
+    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+        """Score an answer against the expected output of the BF program.
+
+        The function awards 1.0 when the answer equals the expected output exactly, 0.01 for any
+        other answer, and 0.0 when no answer is given (None). There is no partial credit.
+
+        Args:
+            answer (Optional[str]): The user's answer.
+            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+
+        Returns:
+            float: 1.0, 0.01 or 0.0 as described above.
+        """
+
+        if answer is None:  # identity check is the idiomatic (and __eq__-proof) test for None
+            return 0.0
+        if answer != entry['answer']:
+            return 0.01
+        # exact match with the interpreter's output
+        return 1.0
+
+# Register the dataset under its own name
+register_dataset("bf", BFDataset, BFConfig)
diff --git a/tests/test_bf.py b/tests/test_bf.py
new file mode 100644
index 00000000..9340e9c4
--- /dev/null
+++ b/tests/test_bf.py
@@ -0,0 +1,37 @@
+import pytest
+
+from reasoning_gym.code.bf import BFConfig, BFDataset
+
+
+# def test_figlet_deterministic():
+#     """Test that dataset generates same items with same seed"""
+#     config = FigletFontConfig(seed=42, size=15)
+#     dataset1 = FigletFontDataset(config)
+#     dataset2 = FigletFontDataset(config)
+
+#     for i in range(15):  # Only check first 15 entries for speed
+#         assert dataset1[i] == dataset2[i]
+
+
+def test_bf():
+    """Test basic properties and scoring of generated BF items"""
+    config = BFConfig(seed=42, size=40)
+    dataset = BFDataset(config)
+
+    for item in dataset:
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # Check metadata contains required fields
+        assert "bfit_code" in item["metadata"]
+        assert "bf_program" in item["metadata"]
+
+        print(item["answer"])  # debugging aid; pytest only shows it with -s or on failure
+
+        # Test the scoring
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0  # exact match
+        assert dataset.score_answer(answer=None, entry=item) == 0.0  # no answer given
+        assert dataset.score_answer(answer="Love is a battlefield", entry=item) == 0.01  # wrong answer
+

From 574df8de2327aec48300bb301ff870ab5c3ad1fd Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 15:42:11 +0100
Subject: [PATCH 02/94] add contrib

---
 reasoning_gym/code/contrib/__init__.py        |    0
 reasoning_gym/code/contrib/bfit/BF-it.py      |   58 +
 .../code/contrib/bfit/Compiler/Compiler.py    |  165 +++
 .../code/contrib/bfit/Compiler/Exceptions.py  |    6 +
 .../contrib/bfit/Compiler/FunctionCompiler.py | 1159 +++++++++++++++
 .../code/contrib/bfit/Compiler/Functions.py   |   33 +
 .../code/contrib/bfit/Compiler/General.py     | 1258 +++++++++++++++++
 .../code/contrib/bfit/Compiler/Globals.py     |   82 ++
 .../contrib/bfit/Compiler/Lexical_analyzer.py |  183 +++
 .../bfit/Compiler/LibraryFunctionCompiler.py  |  127 ++
 .../code/contrib/bfit/Compiler/Minify.py      |   14 +
 .../code/contrib/bfit/Compiler/Node.py        |  398 ++++++
 .../code/contrib/bfit/Compiler/Optimizer.py   |   89 ++
 .../code/contrib/bfit/Compiler/Parser.py      |  129 ++
 .../code/contrib/bfit/Compiler/Token.py       |   70 +
 .../code/contrib/bfit/Compiler/__init__.py    |    0
 .../code/contrib/bfit/Interpreter.py          |   78 +
 reasoning_gym/code/contrib/bfit/LICENSE.md    |   21 +
 reasoning_gym/code/contrib/bfit/README.md     |  101 ++
 reasoning_gym/code/contrib/bfit/__init__.py   |    0
 20 files changed, 3971 insertions(+)
 create mode 100644 reasoning_gym/code/contrib/__init__.py
 create mode 100644 reasoning_gym/code/contrib/bfit/BF-it.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Compiler.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Exceptions.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Functions.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/General.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Globals.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Minify.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Node.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Parser.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/Token.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Compiler/__init__.py
 create mode 100644 reasoning_gym/code/contrib/bfit/Interpreter.py
 create mode 100644 reasoning_gym/code/contrib/bfit/LICENSE.md
 create mode 100644 reasoning_gym/code/contrib/bfit/README.md
 create mode 100644 reasoning_gym/code/contrib/bfit/__init__.py

diff --git a/reasoning_gym/code/contrib/__init__.py b/reasoning_gym/code/contrib/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/reasoning_gym/code/contrib/bfit/BF-it.py b/reasoning_gym/code/contrib/bfit/BF-it.py
new file mode 100644
index 00000000..46545a29
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/BF-it.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import Interpreter
+from Compiler import Compiler
+from Compiler import Minify
+
+
+def process_args():  # parse the CLI; returns (input_file, output_file, run_file, minify_file, optimize)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("filepath", metavar="input_file", nargs=1, help="Path to the input code file")
+    parser.add_argument("--output", "-o", metavar="output_file", nargs=1, help="Path to output Brainfuck file")
+    parser.add_argument("--run", "-r", action="store_true", help="Run the Brainfuck file after compilation")
+    parser.add_argument("--minify", "-m", action="store_true", help="Minifies the compiled code")
+    parser.add_argument("--optimize", "-opt", action="store_true", help="syntax optimization")
+
+    args = parser.parse_args()
+
+    input_file = args.filepath[0]  # nargs=1 yields a one-element list
+    if args.output:
+        output_file = args.output[0]
+    else:  # default: the input's directory and base name, with a ".bf" extension
+        output_file_basename = os.path.splitext(os.path.basename(input_file))[0] + ".bf"
+        output_file = os.path.join(os.path.dirname(input_file), output_file_basename)
+
+    run_file = args.run
+    minify_file = args.minify
+    optimize = args.optimize
+
+    return input_file, output_file, run_file, minify_file, optimize
+
+
+def compile_file():  # CLI driver: compile the input file to Brainfuck, write it out, optionally run it
+    input_file, output_file, run_file, minify_bf_code, optimize_code = process_args()
+    print("Compiling file '%s'..." % input_file)
+
+    with open(input_file, "rb") as f:
+        code = f.read().decode("utf8")  # read raw bytes and decode explicitly as UTF-8
+
+    brainfuck_code = Compiler.compile(code, optimize_code)
+    brainfuck_code += "\n"
+
+    if minify_bf_code:
+        brainfuck_code = Minify.minify(brainfuck_code)
+
+    with open(output_file, "wt") as f:
+        f.write(brainfuck_code)
+
+    print("Compiled successfully to '%s'" % output_file)
+
+    if run_file:
+        print("Running compiled code...")
+        Interpreter.brainfuck(brainfuck_code)  # runs the in-memory code, not the file just written
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script, not on import
+    compile_file()
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py b/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py
new file mode 100644
index 00000000..276fae88
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+from .Exceptions import BFSyntaxError, BFSemanticError
+from .FunctionCompiler import FunctionCompiler
+from .Functions import check_function_exists, get_function_object, insert_function_object
+from .General import is_token_literal, get_literal_token_code, unpack_literal_tokens_to_array_dimensions
+from .Globals import get_global_variables_size, get_variable_size, get_variable_dimensions, insert_global_variable, create_variable_from_definition
+from .Lexical_analyzer import analyze
+from .Optimizer import optimize
+from .LibraryFunctionCompiler import insert_library_functions
+from .Parser import Parser
+from .Token import Token
+
+"""
+This file is responsible for creating FunctionCompiler objects and global variables objects
+And finally, return the code of the main function
+"""
+
+
+class Compiler:
+    def __init__(self, code, optimize_code=False):
+        tokens = analyze(code)  # lexical analysis of the source string
+        if optimize_code:
+            optimize(tokens)  # return value is ignored — presumably rewrites the token list in place
+        self.parser = Parser(tokens)
+
+    # global variables and functions
+    def create_function_object(self):
+        # function: (INT | VOID) ID LPAREN expression_list RPAREN LBRACE statements RBRACE
+        # returns a FunctionCompiler object that holds the tokens of this function
+
+        if self.parser.current_token().type not in [Token.VOID, Token.INT]:
+            raise BFSemanticError("Function return type can be either void or int, not '%s'" % str(self.parser.current_token()))
+
+        self.parser.check_next_tokens_are([Token.ID, Token.LPAREN])
+
+        # save all tokens of this function
+        function_name = self.parser.next_token(next_amount=1).data
+        RPAREN_index = self.parser.find_matching(starting_index=self.parser.current_token_index+2)  # first find RPAREN
+        self.parser.check_next_token_is(Token.LBRACE, starting_index=RPAREN_index)
+        RBRACE_index = self.parser.find_matching(starting_index=RPAREN_index+1)  # then find RBRACE
+
+        # take all tokens between INT and RBRACE and pass them to function object
+        function_tokens = self.parser.tokens[self.parser.current_token_index:RBRACE_index+1]
+        # skip function definition
+        self.parser.advance_to_token_at_index(RBRACE_index+1)
+
+        function = FunctionCompiler(function_name, function_tokens)
+        return function
+
+    def compile_global_variable_definition(self):
+        # INT ID (ASSIGN NUM | (LBRACK NUM RBRACK)+ (ASSIGN LBRACE ... RBRACE)?)? SEMICOLON
+        # returns code that initializes this variable, and advances pointer according to variable size
+
+        self.parser.check_current_tokens_are([Token.INT, Token.ID])
+        ID_token = self.parser.next_token()  # kept for unpack_literal_tokens_to_array_dimensions below
+        variable = create_variable_from_definition(self.parser, advance_tokens=True)
+        insert_global_variable(variable)
+
+        # if this is set to False, then the compiler assumes that initially all cells are set to zero
+        # if this is set to True, then the compiler zeros each cell before using it (may generate a lot of unnecessary BF code)
+        ZERO_CELLS_BEFORE_USE = False
+
+        code = '[-]' if ZERO_CELLS_BEFORE_USE else ''
+        if get_variable_size(variable) > 1:  # its an array
+            if self.parser.current_token().type == Token.SEMICOLON:
+                # array definition - INT ID (LBRACK NUM RBRACK)+ SEMICOLON
+                self.parser.advance_token()  # skip SEMICOLON
+                code = (code + '>') * get_variable_size(variable)  # advance to after this variable
+                return code
+            elif self.parser.current_token().type == Token.ASSIGN and self.parser.current_token().data == "=":
+                # array definition and initialization - INT ID (LBRACK NUM RBRACK)+ ASSIGN ((LBRACE ... RBRACE)+|STRING) SEMICOLON
+                self.parser.advance_token()  # skip ASSIGN
+
+                if self.parser.current_token().type not in [Token.LBRACE, Token.STRING]:
+                    raise BFSyntaxError("Expected LBRACE or STRING at '%s'" % self.parser.current_token())
+
+                literal_tokens_list = self.parser.compile_array_initialization_list()
+                self.parser.check_current_token_is(Token.SEMICOLON)
+                self.parser.advance_token()  # skip SEMICOLON
+
+                array_dimensions = get_variable_dimensions(variable)
+                unpacked_literals_list = unpack_literal_tokens_to_array_dimensions(ID_token, array_dimensions, literal_tokens_list)
+
+                for literal in unpacked_literals_list:
+                    code += get_literal_token_code(literal)  # evaluate this literal and point to next array element
+                return code
+            else:
+                raise BFSyntaxError("Unexpected %s in array definition. Expected SEMICOLON (;) or ASSIGN (=)" % self.parser.current_token())
+
+        elif self.parser.current_token().type == Token.SEMICOLON:  # no need to initialize
+            self.parser.advance_token()  # skip SEMICOLON
+            code += '>'  # advance to after this variable
+        else:
+            self.parser.check_current_token_is(Token.ASSIGN)
+            if self.parser.current_token().data != "=":  # an ASSIGN token whose data is not "=" is rejected here
+                raise BFSyntaxError("Unexpected %s when initializing global variable. Expected ASSIGN (=)" % self.parser.current_token())
+            self.parser.advance_token()  # skip ASSIGN
+
+            if not is_token_literal(self.parser.current_token()):
+                raise BFSemanticError("Unexpected '%s'. expected literal (NUM | CHAR | TRUE | FALSE )" % str(self.parser.current_token()))
+
+            code += get_literal_token_code(self.parser.current_token())
+
+            self.parser.check_next_token_is(Token.SEMICOLON)
+            self.parser.advance_token(amount=2)  # skip (NUM|CHAR|TRUE|FALSE) SEMICOLON
+
+        return code
+
+    def process_global_definitions(self):
+        """
+        Iterate through all tokens
+        When encountering function definition - create Function object and pass it the function's tokens
+        When encountering global variable definition - create Variable object
+        Returns code that initializes global variables and advances the pointer to after them
+        """
+        code = ''
+        token = self.parser.current_token()
+        while token is not None and token.type in [Token.VOID, Token.INT, Token.SEMICOLON]:
+            if token.type == Token.SEMICOLON:  # can have random semicolons ;)
+                self.parser.advance_token()
+                token = self.parser.current_token()
+                continue
+            self.parser.check_next_token_is(Token.ID)
+
+            if self.parser.next_token(next_amount=2).type == Token.LPAREN:  # "(INT|VOID) ID (" starts a function
+                function = self.create_function_object()
+                insert_function_object(function)
+            elif token.type is Token.INT and self.parser.next_token(next_amount=2).type in [Token.SEMICOLON, Token.ASSIGN, Token.LBRACK]:
+                code += self.compile_global_variable_definition()
+            else:
+                raise BFSyntaxError("Unexpected '%s' after '%s'. Expected '(' (function definition) or one of: '=', ';', '[' (global variable definition)" % (str(self.parser.next_token(next_amount=2)), str(self.parser.next_token())))
+
+            token = self.parser.current_token()
+
+        if self.parser.current_token() is not None:  # we have not reached the last token
+            untouched_tokens = [str(t) for t in self.parser.tokens[self.parser.current_token_index:]]
+            raise BFSyntaxError("Did not reach the end of the code. Untouched tokens:\n%s" % untouched_tokens)
+
+        return code
+
+    def compile(self):
+        insert_library_functions()
+        code = self.process_global_definitions()  # code that initializes global variables and advances pointer to after them
+
+        check_function_exists(Token(Token.ID, 0, 0, "main"), 0)  # synthetic ID token for "main"; the trailing 0 is presumably the expected parameter count
+        code += get_function_object("main").get_code(get_global_variables_size())  # main's stack starts right after the globals
+        code += "<" * get_global_variables_size()  # point to the first cell to end the program nicely :)
+        return code
+
+
+def compile(code, optimize_code=False):
+    """
+    Compile C-like source code into Brainfuck code.
+
+    :param code:  C-like code (string)
+    :param optimize_code:  syntax optimization (bool)
+    :return code:  Brainfuck code (string)
+    """
+    return Compiler(code, optimize_code).compile()
+
+
+if __name__ == '__main__':  # this module is a library; running it directly only prints usage help
+    print("This file cannot be directly run")
+    print("Please import it and use the 'compile' function")
+    print("Which receives a C-like code (string) and returns Brainfuck code (string)")
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Exceptions.py b/reasoning_gym/code/contrib/bfit/Compiler/Exceptions.py
new file mode 100644
index 00000000..5d11702c
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Exceptions.py
@@ -0,0 +1,6 @@
+class BFSyntaxError(Exception):  # raised by the compiler for unexpected or missing tokens in the source
+    pass
+
+
+class BFSemanticError(Exception):  # raised for well-formed but invalid code (e.g. bad return type, wrong array dimensions)
+    pass
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py b/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py
new file mode 100644
index 00000000..7eaa9877
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py
@@ -0,0 +1,1159 @@
+from collections import namedtuple
+from functools import reduce
+from .Exceptions import BFSyntaxError, BFSemanticError
+from .Functions import check_function_exists, get_function_object
+from .General import get_variable_dimensions_from_token, get_move_to_return_value_cell_code, get_print_string_code, get_variable_from_ID_token
+from .General import get_literal_token_value, process_switch_cases, is_token_literal
+from .Globals import create_variable_from_definition, get_global_variables, get_variable_size, is_variable_array
+from .Node import NodeToken, NodeTernary, NodeArraySetElement, NodeUnaryPrefix, NodeUnaryPostfix, NodeArrayGetElement, NodeFunctionCall, NodeArrayAssignment
+from .Parser import Parser
+from .Token import Token
+
+"""
+This file implements the FunctionCompiler object
+This is where we actually compile the code - statements, assignments, calculations, etc
+The syntax of the language is defined here as compilation rules
+
+A function is position-dependent - it needs to know where on the tape it runs
+So that it can access global variables, which are at the beginning of the stack, correctly
+Because of that, the function's code is dependent on when/where we call it
+The idea is that we compile the function on demand - every time it is called
+And every time we compile it - we pass the current stack pointer to it
+This is implemented in the get_code() function
+
+The FunctionCompiler object holds tokens that correspond to the function so that we can compile it on demand
+"""
+
+
+class FunctionCompiler:
+    def __init__(self, name, tokens):
+        self.name = name
+        self.tokens = tokens  # kept so the function can be compiled on demand in get_code()
+        self.parser = Parser(self.tokens)
+        self.ids_map_list = list()  # stack of scopes; the innermost scope sits at index 0
+        self.type = None
+        self.parameters = None
+        self.process_function_definition()  # sets type and parameters
+        self.return_value_cell = None  # will be set on every call to this function
+
+    """
+    ids_map_list is a list of named tuples. Each tuple represents a scope, and holds 2 items:
+        1. an index of the next available cell. (if we want to insert a new ID to the ids_map_list, it will be in that index)
+        2. a dictionary that maps an ID (string) to an index - the cell where we hold that variable
+
+    We use this list as a stack:
+        when entering a scope, we insert a (available_cell, dictionary) to the BEGINNING of the list
+        when exiting a scope, we pop the last inserted tuple (the one at the BEGINNING of the list)
+
+    When declaring a variable in the current scope, we add it to the dictionary at the beginning of the list,
+    and increase the 'next_available_cell' by 1
+    When retrieving a variable, we go through the list and return the first occurrence that matches the ID
+    """
+
+    def process_function_definition(self):
+        # sets function type and parameters, advances parser
+
+        function_return_type = self.parser.current_token()  # the whole token is stored, not just its .type
+        self.parser.advance_token()  # skip return type
+        function_name = self.parser.current_token().data
+        assert function_name == self.name  # sanity check: tokens must belong to the function named at construction
+        self.parser.advance_token()  # skip ID
+        parameters = self.get_function_parameters_declaration()
+        # parser now points to LBRACE = beginning of function scope
+
+        self.type = function_return_type
+        self.parameters = parameters
+
+    def get_code(self, current_stack_pointer):
+        """
+        Compile this function for a call at the given stack position and return the generated code
+        layout:
+                    current_stack_pointer -------
+                                                |
+                                                v
+        global1 global2 unknown1 unknown2 my_return_value param1 param2 local1 local2
+
+        current_stack_pointer is current next available cell
+        which is the value of the caller's current_stack_pointer plus this function's parameters
+        create ids map for global variables
+        make room for return_value
+        """
+        self.insert_global_variables_to_function_scope()
+
+        # self.current_stack_pointer is now equal to the size of the global variables plus 1 (next_available_cell)
+        # new stack pointer should be at least that size
+        assert self.current_stack_pointer() <= current_stack_pointer
+        self.return_value_cell = current_stack_pointer
+        self.set_stack_pointer(current_stack_pointer+1)  # make room for return_value cell. next available cell is the next one after it.
+        function_code = self.compile_function_scope(self.parameters)
+        self.remove_ids_map()  # Global variables
+        return function_code
+
+    # =================
+    # helper functions
+    # =================
+    def insert_global_variables_to_function_scope(self):
+        self.add_ids_map()  # open a new scope that will hold the globals
+        for variable in get_global_variables():
+            self.insert_to_ids_map(variable)  # assigns the variable its cell index in that scope
+
+    def get_array_index_expression(self):
+        """
+        the idea - address the multi-dimensional array as a one-dimensional array
+        calculate the appropriate index in the one-dimensional array
+        by multiplying the index in each dimension by its size (i.e the multiplication of all sizes of the following dimensions)
+        and then using the NodeArrayGetElement/NodeArraySetElement class which gets an element in a one-dimensional array
+
+        in order to do that, we need to create our own sub-tree of multiplications,
+        and pass it as the "index expression"
+
+        e.g if the array is: arr[10][5][2] and we want to get arr[4][3][1]
+        then we want to calculate index = (4*(5*2) + 3*(2) + 1)
+        """
+        ID_token = self.parser.current_token()
+        self.parser.advance_token(2)  # skip ID, LBRACK
+        first_index_expression = index_expression = self.expression()  # first dimension
+        self.parser.check_current_token_is(Token.RBRACK)
+        self.parser.advance_token()  # skip RBRACK
+
+        # now handle the next dimensions (if multi-dimensional array)
+        dimensions = get_variable_dimensions_from_token(self.ids_map_list, ID_token)
+        if len(dimensions) > 1:
+            multiply_token = Token(Token.BINOP, ID_token.line, ID_token.column, data="*")  # synthetic tokens reuse the ID's position
+            add_token = Token(Token.BINOP, ID_token.line, ID_token.column, data="+")
+
+            # multiply by next dimensions sizes
+            multiply_amount = reduce(lambda x, y: x * y, dimensions[1:])  # size of the following dimensions
+            node_token_multiply_amount = NodeToken(self.ids_map_list, token=Token(Token.NUM, ID_token.line, ID_token.column, data=str(multiply_amount)))
+            index_expression = NodeToken(self.ids_map_list, token=multiply_token, left=first_index_expression, right=node_token_multiply_amount)
+
+            # handle next dimensions
+            dimension = 1
+            while dimension < len(dimensions):
+                if self.parser.current_token().type != Token.LBRACK:  # too few indexes given...
+                    if dimension == 1:
+                        return first_index_expression  # allow use of only one dimension for multi-dimensional array
+                    raise BFSemanticError("%s is a %s-dimensional array, but only %s dimension(s) given as index" %
+                                          (str(ID_token), len(dimensions), dimension))
+                self.parser.check_current_token_is(Token.LBRACK)
+                self.parser.advance_token()  # skip LBRACK
+                exp = self.expression()
+
+                self.parser.check_current_token_is(Token.RBRACK)
+                self.parser.advance_token()  # skip RBRACK
+
+                # current_dimension_index *= size_of_following_dimensions
+                if dimension + 1 < len(dimensions):  # not last dimension - need to multiply and add
+                    multiply_amount = reduce(lambda x, y: x * y, dimensions[dimension + 1:])  # size of the following dimensions
+                    node_token_multiply_amount = NodeToken(self.ids_map_list, token=Token(Token.NUM, ID_token.line, ID_token.column, data=str(multiply_amount)))
+                    multiply_node = NodeToken(self.ids_map_list, token=multiply_token, left=exp, right=node_token_multiply_amount)
+
+                    # prev_dimensions_index += current_dimension_index
+                    index_expression = NodeToken(self.ids_map_list, token=add_token, left=index_expression, right=multiply_node)
+                else:  # last dimension - no need to multiply, just add
+                    index_expression = NodeToken(self.ids_map_list, token=add_token, left=index_expression, right=exp)
+                dimension += 1
+
+        if self.parser.current_token().type == Token.LBRACK:  # too many indexes given...
+            raise BFSemanticError("%s is a %s-dimensional array. Unexpected %s" %
+                                  (str(ID_token), len(dimensions), self.parser.current_token()))
+        return index_expression
+
+    def get_token_after_array_access(self, offset=0):
+        # in case we have: "ID[a][b][c]...[z] next_token", return "next_token"
+        idx = self.parser.current_token_index + offset  # index of the ID token; the parser itself is not advanced
+        self.parser.check_next_tokens_are([Token.ID, Token.LBRACK], starting_index=idx - 1)
+        idx += 1  # point to LBRACK
+        while self.parser.token_at_index(idx).type == Token.LBRACK:
+            idx = self.parser.find_matching(idx)  # point to RBRACK
+            idx += 1  # advance to one after the RBRACK
+
+        return self.parser.token_at_index(idx)
+
+    def compile_array_assignment(self, token_id):
+        # int id[a][b][c]... = {1, 2, 3, ...};
+        # or int id[a][b][c]... = "\1\2\3...";
+        # or int id[a][b][c]... = {{1, 2}, {3, 4}, ...};
+        # or array assignment: id = {1, 2, 3, ...};
+        self.parser.check_current_token_is(Token.ASSIGN)
+        if self.parser.current_token().data != "=":  # an ASSIGN token whose data is not "=" is rejected here
+            raise BFSyntaxError("Unexpected %s when assigning array. Expected ASSIGN (=)" % self.parser.current_token())
+
+        if self.parser.next_token().type not in [Token.LBRACE, Token.STRING]:
+            raise BFSyntaxError("Expected LBRACE or STRING at '%s'" % self.parser.next_token())
+
+        self.parser.advance_token()  # skip to LBRACE or STRING
+        literal_tokens_list = self.parser.compile_array_initialization_list()
+
+        return NodeArrayAssignment(self.ids_map_list, token_id, literal_tokens_list)  # returns a node; the caller generates its code
+
+    def compile_variable_declaration(self):
+        self.parser.check_next_token_is(Token.ID)
+        self.parser.advance_token()  # skip "INT" (now points to ID)
+        assert self.parser.current_token().type == Token.ID
+
+        if self.parser.next_token().type == Token.SEMICOLON:  # INT ID SEMICOLON
+            self.parser.advance_token(2)  # skip ID SEMICOLON
+            return ''  # no code is generated here. code was generated for defining this variable when we entered the scope
+
+        elif self.parser.next_token().type == Token.ASSIGN and self.parser.next_token().data == "=":  # INT ID = EXPRESSION SEMICOLON
+            return self.compile_expression_as_statement()  # compile_expression_as_statement skips the SEMICOLON
+
+        elif self.parser.next_token().type == Token.LBRACK:  # INT ID (LBRACK NUM RBRACK)+ (= ARRAY_INITIALIZATION)? SEMICOLON
+            # array definition (int arr[2][3]...[];) or array definition and initialization (arr[2][3]...[] = {...};)
+            token_id = self.parser.current_token()
+            self.parser.advance_token()  # skip ID
+            while self.parser.current_token().type == Token.LBRACK:  # loop to skip to after last RBRACK ]
+                self.parser.check_current_tokens_are([Token.LBRACK, Token.NUM, Token.RBRACK])  # each dimension must be a NUM token
+                self.parser.advance_token(3)  # skip LBRACK, NUM, RBRACK
+
+            if self.parser.current_token().type == Token.ASSIGN:  # initialization
+                initialization_node = self.compile_array_assignment(token_id)
+                code = initialization_node.get_code(self.current_stack_pointer()) + "<"  # discard expression value
+            else:
+                code = ''  # just array definition
+                # no code is generated here. code was generated for defining this variable when we entered the scope
+            self.parser.check_current_token_is(Token.SEMICOLON)
+            self.parser.advance_token()  # skip SEMICOLON
+            return code
+        else:
+            raise BFSyntaxError("Unexpected %s after %s" % (self.parser.next_token(), self.parser.current_token()))
+
+    def add_ids_map(self):
+        """
+        Push a new, empty ids map at the front of self.ids_map_list.
+
+        The new map starts allocating at the next available cell of the map
+        that is currently first in the list, or at cell 0 when the list is empty.
+
+        Original layout note:
+        the first cells are global variable cells (index 0 to n)
+        the next cell (index n+1) is the return_value cell
+        every function assumes that these cells exist
+        """
+
+        if len(self.ids_map_list) == 0:
+            first_free_cell = 0
+        else:
+            first_free_cell = self.ids_map_list[0].next_available_cell
+
+        # NOTE: the namedtuple class itself (never instantiated) is used as a mutable attribute holder
+        scope_map = namedtuple("ids_map", ["next_available_cell", "IDs_dict"])
+        scope_map.IDs_dict = dict()
+        scope_map.next_available_cell = first_free_cell
+
+        self.ids_map_list.insert(0, scope_map)
+
+    def remove_ids_map(self):
+        # drop the first ids map in the list (the one most recently added by add_ids_map)
+        self.ids_map_list.pop(0)
+
+    def insert_to_ids_map(self, variable):
+        """
+        Register variable in the first ids map and assign it a cell.
+
+        The variable gets the map's next available cell as its cell_index, and
+        the map's next available cell advances by the variable's size.
+        check_id_doesnt_exist rejects a name already present in this map.
+        """
+        scope_map = self.ids_map_list[0]
+
+        self.check_id_doesnt_exist(variable.name)
+
+        variable.cell_index = scope_map.next_available_cell
+        scope_map.next_available_cell = scope_map.next_available_cell + get_variable_size(variable)
+        scope_map.IDs_dict[variable.name] = variable
+
+    def reserve_cell_in_ids_map(self):
+        """
+        Reserve one cell in the first ids map by bumping its next available cell.
+        This is used for making room for the return_value cell.
+        """
+        self.ids_map_list[0].next_available_cell += 1
+
+    def variables_dict_size(self, variables_dict_index):
+        """
+        Return the total size of all variables stored in the ids map found at
+        position variables_dict_index of self.ids_map_list.
+        """
+        scope_variables = self.ids_map_list[variables_dict_index].IDs_dict.values()
+        return sum(get_variable_size(variable) for variable in scope_variables)
+
+    def size_of_variables_current_scope(self):
+        # total size of the variables in the first ids map (index 0 is where add_ids_map inserts new maps)
+        return self.variables_dict_size(0)
+
+    def size_of_global_variables(self):
+        # total size of the variables in the last ids map of the list (the oldest one, since new maps go to the front)
+        return self.variables_dict_size(-1)
+
+    def increase_stack_pointer(self, amount=1):
+        """
+        Move the next available cell of the first ids map forward by amount.
+
+        Sometimes the stack pointer has to be increased explicitly, e.g.:
+        - when compiling "if ... else ...", 2 temporary cells are needed before
+          the inner scope code of both the if and the else
+        - when evaluating an expression list in a function call, each expression
+          is evaluated while pointing to a different cell
+        so the stack pointer is "updated" to represent the new pointer.
+        """
+        top_map = self.ids_map_list[0]
+        top_map.next_available_cell = top_map.next_available_cell + amount
+
+    def decrease_stack_pointer(self, amount=1):
+        # counterpart of increase_stack_pointer: move the first ids map's next available cell back by amount
+        self.ids_map_list[0].next_available_cell -= amount
+
+    def set_stack_pointer(self, new_value):
+        """
+        Set the next available cell of the first ids map to new_value.
+        The pointer may only move forward (asserted).
+        """
+        top_map = self.ids_map_list[0]
+        assert new_value >= top_map.next_available_cell
+        top_map.next_available_cell = new_value
+
+    def current_stack_pointer(self):
+        # the "stack pointer" is the next available cell of the first ids map
+        return self.ids_map_list[0].next_available_cell
+
+    def insert_scope_variables_into_ids_map(self):
+        """
+        Scan the scope that starts at the current LBRACE and add every variable
+        defined directly in it (sub-scopes are skipped) to the ids map.
+
+        Only the LBRACE itself is consumed from the parser; the scan walks
+        self.tokens by index. Returns the code that moves the pointer to the
+        next available cell (one '>' per cell of the current scope's variables).
+        """
+
+        assert self.parser.current_token().type == Token.LBRACE
+        self.parser.advance_token()
+
+        index = self.parser.current_token_index
+        while index < len(self.tokens):
+            token_type = self.tokens[index].type
+
+            if token_type == Token.RBRACE:
+                break  # we have reached the end of the scope
+
+            if token_type == Token.LBRACE:
+                # jump over the whole sub-scope; its variables belong to it, not to us
+                index = self.parser.find_matching(starting_index=index)
+
+            elif token_type == Token.INT and self.tokens[index - 2].type != Token.FOR:
+                # skip definitions inside a FOR statement (for (int i = 0...))
+                self.insert_to_ids_map(create_variable_from_definition(self.parser, index=index))
+
+            index += 1
+
+        return ">" * self.size_of_variables_current_scope()  # advance pointer to the next available cell
+
+    def enter_scope(self):
+        # create an ids map for the current scope, and then insert the scope variables into it
+        # returns the code produced by insert_scope_variables_into_ids_map (pointer moved past the scope's variables)
+        self.add_ids_map()
+        return self.insert_scope_variables_into_ids_map()
+
+    def exit_scope(self):
+        """
+        Leave the current scope: remove its ids map and return the code that
+        moves the pointer back to the previous scope's next available cell.
+        """
+        scope_size = self.size_of_variables_current_scope()  # must be read before the map is removed
+        self.remove_ids_map()
+        return "<" * scope_size
+
+    def enter_function_scope(self, parameters):
+        """
+        Enter the scope of a function body.
+
+        Creates an ids map for the function scope, inserts the parameters and
+        then the scope variables into it. The returned code first skips the
+        return_value cell and then moves the pointer right by the size of BOTH
+        the parameters and the scope variables.
+        """
+
+        self.add_ids_map()
+        for parameter in parameters:
+            self.insert_to_ids_map(parameter)
+
+        skip_return_value_cell = '>'
+        return skip_return_value_cell + self.insert_scope_variables_into_ids_map()
+
+    def check_id_doesnt_exist(self, ID):
+        """
+        Make sure that ID is not already defined in the current scope.
+        Used when defining a variable; raises BFSemanticError on a duplicate.
+        """
+        current_scope_ids = self.ids_map_list[0].IDs_dict
+        if ID not in current_scope_ids:
+            return
+        raise BFSemanticError("ID %s is already defined" % ID)
+
+    # =================
+    # compilation rules
+    # =================
+
+    # expression
+    def function_call(self):
+        """
+        function_call: ID LPAREN expression_list RPAREN
+
+        Returns a NodeFunctionCall. Raises BFSemanticError when the called
+        function is the one being compiled (recursion is not supported).
+        """
+        assert self.parser.current_token().type == Token.ID
+
+        callee_token = self.parser.current_token()
+        self.parser.advance_token()  # skip ID
+
+        if callee_token.data == self.name:
+            raise BFSemanticError("No support for recursion yet :(.... in function call '%s'" % str(callee_token))
+
+        arguments = self.compile_expression_list()
+
+        # validate name and arity before fetching the function object
+        check_function_exists(callee_token, len(arguments))
+        callee = get_function_object(callee_token.data)
+
+        return NodeFunctionCall(self.ids_map_list, callee, arguments)
+
+    def literal(self):
+        """
+        literal: NUM | CHAR | ID | ID (LBRACK expression RBRACK)+ | TRUE | FALSE | function_call | ( expression )
+
+        Returns the node for the literal. Raises BFSyntaxError when the current
+        token cannot start a literal.
+        """
+
+        token = self.parser.current_token()
+
+        if token.type == Token.ID:
+            following_type = self.parser.next_token().type
+
+            if following_type == Token.LPAREN:
+                return self.function_call()
+
+            if following_type == Token.LBRACK:  # array - ID(LBRACK expression RBRACK)+
+                index_expression = self.get_array_index_expression()
+                return NodeArrayGetElement(self.ids_map_list, token, index_expression)
+
+        if is_token_literal(token) or token.type == Token.ID:
+            self.parser.advance_token()
+            return NodeToken(self.ids_map_list, token=token)
+
+        if token.type != Token.LPAREN:
+            raise BFSyntaxError("Unexpected '%s'. expected literal (NUM | ID | ID(LBRACK expression RBRACK)+ | TRUE | FALSE | function_call | ( expression ))" % str(token))
+
+        # ( expression )
+        self.parser.check_current_token_is(Token.LPAREN)
+        self.parser.advance_token()  # skip LPAREN
+        inner_expression = self.expression()
+        self.parser.check_current_token_is(Token.RPAREN)
+        self.parser.advance_token()  # skip RPAREN
+
+        return inner_expression
+
+    def unary_postfix(self):
+        """
+        unary_postfix: literal ( ++ | -- | UNARY_MULTIPLICATIVE)?
+
+        Returns the literal node itself when no postfix operator follows it.
+        """
+
+        operand = self.literal()
+        operator = self.parser.current_token()
+
+        if operator.type not in [Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]:
+            return operand
+
+        self.parser.advance_token()  # skip the postfix operator
+        return NodeUnaryPostfix(self.ids_map_list, operation=operator, literal=operand)
+
+    def unary_prefix(self):
+        """
+        unary_prefix:  ( (!|~|+|-) unary_prefix ) | ( ( ++ | -- | UNARY_MULTIPLICATIVE ) literal ) | unary_postfix
+
+        Raises BFSyntaxError when a BINOP other than + or - is used as a prefix.
+        """
+
+        token = self.parser.current_token()
+
+        if token.type in [Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]:
+            # these operators apply to a literal only
+            self.parser.advance_token()
+            operand = self.literal()
+            return NodeUnaryPrefix(self.ids_map_list, operation=token, literal=operand)
+
+        if token.type not in [Token.NOT, Token.BITWISE_NOT, Token.BINOP]:
+            return self.unary_postfix()
+
+        if token.type == Token.BINOP and token.data not in ["+", "-"]:
+            raise BFSyntaxError("Expected either + or - as unary prefix instead of token %s" % self.parser.current_token())
+
+        # !, ~, + and - may be chained, so recurse into unary_prefix
+        self.parser.advance_token()
+        operand = self.unary_prefix()
+        return NodeUnaryPrefix(self.ids_map_list, operation=token, literal=operand)
+
+    def multiplicative(self):
+        """
+        multiplicative: unary_prefix ((MUL|DIV|MOD) unary_prefix)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        unary_prefix node when no * / % operator follows.
+        """
+
+        left = self.unary_prefix()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.BINOP or operator.data not in ["*", "/", "%"]:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.unary_prefix()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def additive(self):
+        """
+        additive: multiplicative ((PLUS|MINUS) multiplicative)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        multiplicative node when no + or - operator follows.
+        """
+
+        left = self.multiplicative()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.BINOP or operator.data not in ["+", "-"]:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.multiplicative()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def shift(self):
+        """
+        shift: additive ((<<|>>) additive)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        additive node when no BITWISE_SHIFT token follows.
+        """
+
+        left = self.additive()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.BITWISE_SHIFT:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.additive()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def relational(self):
+        """
+        relational: shift (==|!=|<|>|<=|>= shift)?
+
+        Returns the bare shift node when no RELOP token follows (including when
+        the token stream is exhausted), otherwise a NodeToken comparing the
+        two shift operands. At most one relational operator is consumed.
+        """
+
+        left = self.shift()
+
+        operator = self.parser.current_token()
+        # guard against None like the sibling rules do (multiplicative, additive, ...):
+        # without it, running out of tokens raises AttributeError here
+        if operator is None or operator.type != Token.RELOP:  # just an arithmetic expression
+            return left
+
+        self.parser.advance_token()  # skip the relational operator
+        right = self.shift()
+
+        return NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def bitwise_and(self):
+        """
+        bitwise_and: relational (& relational)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        relational node when no BITWISE_AND token follows.
+        """
+
+        left = self.relational()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.BITWISE_AND:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.relational()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def bitwise_xor(self):
+        """
+        bitwise_xor: bitwise_and (^ bitwise_and)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        bitwise_and node when no BITWISE_XOR token follows.
+        """
+
+        left = self.bitwise_and()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.BITWISE_XOR:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.bitwise_and()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def bitwise_or(self):
+        """
+        bitwise_or: bitwise_xor (| bitwise_xor)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        bitwise_xor node when no BITWISE_OR token follows.
+        """
+
+        left = self.bitwise_xor()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.BITWISE_OR:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.bitwise_xor()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def logical_and(self):
+        """
+        logical_and: bitwise_or (&& bitwise_or)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        bitwise_or node when no AND token follows.
+        """
+
+        left = self.bitwise_or()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.AND:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.bitwise_or()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def logical_or(self):
+        """
+        logical_or: logical_and (|| logical_and)*
+
+        Builds a left-associative chain of NodeToken nodes. Returns the bare
+        logical_and node when no OR token follows.
+        """
+
+        left = self.logical_and()
+
+        while True:
+            operator = self.parser.current_token()
+            if operator is None or operator.type != Token.OR:
+                return left
+
+            self.parser.advance_token()  # skip the operator
+            right = self.logical_and()
+            left = NodeToken(self.ids_map_list, token=operator, left=left, right=right)
+
+    def ternary_expression(self):
+        """
+        ternary_expression: logical_or (? expression : ternary_expression)?
+
+        Returns the bare logical_or node when no TERNARY token follows
+        (including when the token stream is exhausted), otherwise a NodeTernary
+        built from the condition, the true branch and the false branch.
+        """
+        condition = self.logical_or()
+
+        token = self.parser.current_token()
+        # guard against None like the sibling rules do (logical_or, logical_and, ...):
+        # without it, running out of tokens raises AttributeError here
+        if token is None or token.type != Token.TERNARY:
+            return condition
+
+        self.parser.advance_token()  # skip ?
+        node_true = self.expression()
+        self.parser.check_current_token_is(Token.COLON)
+        self.parser.advance_token()  # skip :
+        node_false = self.ternary_expression()  # recursing here makes nested ternaries group to the right
+        return NodeTernary(self.ids_map_list, condition, node_true, node_false)
+
+    def assignment(self):
+        """
+        assignment: ID ASSIGN expression | ID ASSIGN ARRAY_INITIALIZATION | ID (LBRACK expression RBRACK)+ ASSIGN expression | ternary_expression
+
+        Raises BFSemanticError when an array initialization is assigned to a
+        variable that is not an array.
+        """
+        parser = self.parser
+        first_token = parser.current_token()
+
+        if first_token.type != Token.ID:
+            return self.ternary_expression()
+
+        second_token = parser.next_token()
+
+        if second_token.type == Token.ASSIGN:
+            if parser.next_token(2).type in [Token.LBRACE, Token.STRING]:  # ID ASSIGN ARRAY_INITIALIZATION
+                parser.advance_token()  # skip ID
+                target_variable = get_variable_from_ID_token(self.ids_map_list, first_token)
+                if not is_variable_array(target_variable):
+                    raise BFSemanticError("Trying to assign array to non-array variable %s" % first_token)
+                return self.compile_array_assignment(first_token)
+
+            # ID ASSIGN expression
+            parser.advance_token(amount=2)  # skip ID ASSIGN
+            value_node = self.expression()
+            target_node = NodeToken(self.ids_map_list, token=first_token)
+            return NodeToken(self.ids_map_list, left=target_node, token=second_token, right=value_node)
+
+        if second_token.type == Token.LBRACK and self.get_token_after_array_access().type == Token.ASSIGN:
+            # ID (LBRACK expression RBRACK)+ ASSIGN value_expression
+            index_expression = self.get_array_index_expression()
+            parser.check_current_token_is(Token.ASSIGN)
+            assign_token = parser.current_token()
+            parser.advance_token()  # skip ASSIGN
+            value_expression = self.expression()
+
+            return NodeArraySetElement(self.ids_map_list, first_token, index_expression, assign_token, value_expression)
+
+        return self.ternary_expression()
+
+    def expression(self):
+        # expression: assignment
+        # entry point of the expression grammar; assignment is its lowest-precedence rule
+        return self.assignment()
+
+    def compile_expression(self):
+        # parses mathematical expressions (+-*/% ())
+        # increments/decrements (++, --)
+        # relational operations (==, !=, <, >, <=, >=)
+        # bitwise operations (|, &, ^, ~, <<, >>)
+        # logical operations (!, &&, ||)
+        # ternary expression (?)
+        # assignment (=, +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^=)
+        # this is implemented using a Node class that represents a parse tree
+        # returns the code generated from the parse tree, relative to the current stack pointer
+
+        """
+        (used reference: https://introcs.cs.princeton.edu/java/11precedence/)
+        order of operations (lowest precedence to highest precedence)
+            assignment (=, +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^=)
+            ternary_expression (?)
+            logical_or (||)
+            logical_and (&&)
+            bitwise_or (|)
+            bitwise_xor (^)
+            bitwise_and (&)
+            relational (==|!=|<|>|<=|>=)
+            shift (<<|>>)
+            additive (+-)
+            multiplicative (*/%)
+            unary_prefix (!, ~, +, -, ++, --)
+            unary_postfix (++, --)
+
+        expression: assignment
+        assignment: ID ASSIGN expression | ID ASSIGN ARRAY_INITIALIZATION | ID (LBRACK expression RBRACK)+ ASSIGN expression | ternary_expression
+        ternary_expression: logical_or (? expression : ternary_expression)?
+        logical_or: logical_and (|| logical_and)*
+        logical_and: bitwise_or (&& bitwise_or)*
+        bitwise_or: bitwise_xor (| bitwise_xor)*
+        bitwise_xor: bitwise_and (^ bitwise_and)*
+        bitwise_and: relational (& relational)*
+        relational: shift (==|!=|<|>|<=|>= shift)?
+        shift: additive ((<<|>>) additive)*
+        additive: multiplicative ((PLUS|MINUS) multiplicative)*
+        multiplicative: unary_prefix ((MUL|DIV|MOD) unary_prefix)*
+        unary_prefix:  ( (!|~|+|-) unary_prefix ) | ( ( ++ | -- | UNARY_MULTIPLICATIVE ) literal ) | unary_postfix
+        unary_postfix: literal ( ++ | -- | UNARY_MULTIPLICATIVE )?
+        literal: NUM | CHAR | ID | ID (LBRACK expression RBRACK)+ | TRUE | FALSE | function_call | ( expression )
+        """
+
+        parse_tree = self.expression()
+        expression_code = parse_tree.get_code(self.current_stack_pointer())
+        return expression_code
+
+    # functions-related
+    def get_function_parameters_declaration(self):
+        """
+        parameters declaration: LPAREN (int ID (LBRACK NUM RBRACK)? (COMMA int ID)*)? RPAREN
+
+        Returns the list of parameters, in the same order as declared, each
+        built by create_variable_from_definition. Raises BFSemanticError when a
+        parameter does not start with INT. The closing RPAREN is consumed.
+        """
+
+        parser = self.parser
+        assert parser.current_token().type == Token.LPAREN
+        parser.advance_token()
+
+        parameters = []
+
+        while True:
+            token = parser.current_token()
+            if token.type == Token.RPAREN:
+                break
+
+            if token.type != Token.INT:
+                raise BFSemanticError("Only int type is supported as a function parameter, and not '%s'" % str(token))
+
+            parameters.append(create_variable_from_definition(parser, advance_tokens=True))
+
+            # a parameter is followed either by a COMMA or by the closing RPAREN
+            if parser.current_token().type == Token.COMMA:
+                parser.advance_token()
+            else:
+                parser.check_current_token_is(Token.RPAREN)
+
+        parser.advance_token()  # skip RPAREN
+        return parameters
+
+    def compile_expression_list(self):
+        """
+        expression_list: ( expression (COMMA expression)* )?
+
+        The parser must point at the opening LPAREN. Returns a list of Nodes,
+        one node for each expression. The closing RPAREN is consumed.
+        """
+        parser = self.parser
+        assert parser.current_token().type == Token.LPAREN
+        parser.advance_token()
+
+        nodes = []
+
+        while parser.current_token().type != Token.RPAREN:
+            nodes.append(self.expression())
+
+            # an expression is followed either by a COMMA or by the closing RPAREN
+            if parser.current_token().type == Token.COMMA:
+                parser.advance_token()
+            else:
+                parser.check_current_token_is(Token.RPAREN)
+
+        parser.advance_token()  # skip RPAREN
+        return nodes
+
+    def compile_return(self):
+        """
+        Compile "return;" or "return expression;".
+        This assumes that the return is the last statement in the function.
+
+        Returns '' for a bare return; otherwise the expression code followed by
+        the code that moves its value into the return_value cell.
+        """
+
+        self.parser.advance_token()  # skip return
+        if self.parser.current_token().type == Token.SEMICOLON:
+            # return;
+            self.parser.advance_token()  # skip ;
+            return ''  # nothing to do
+
+        # return exp;
+        value_code = self.compile_expression()  # after this, we point to next available cell
+        self.parser.check_current_token_is(Token.SEMICOLON)
+        self.parser.advance_token()  # skip ;
+
+        point_to_value = "<"  # point to value to return
+        move_code = get_move_to_return_value_cell_code(self.return_value_cell, self.current_stack_pointer())
+
+        return value_code + point_to_value + move_code
+
+    # statements
+    def compile_expression_as_statement(self):
+        """
+        Compile an expression that is used as a statement,
+        e.g: x+=5;  or  x++ or ++x;
+
+        The trailing SEMICOLON is consumed and the expression's value is discarded.
+        """
+
+        allowed_first_tokens = [Token.ID, Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]
+        assert self.parser.current_token().type in allowed_first_tokens
+
+        expression_code = self.compile_expression()
+        self.parser.check_current_token_is(Token.SEMICOLON)
+        self.parser.advance_token()  # skip ;
+
+        return expression_code + "<"  # discard the expression's value
+
+    def compile_print_string(self):
+        """
+        Compile the statement: print(string);
+        Returns the code produced by get_print_string_code for the string literal.
+        """
+        parser = self.parser
+        parser.check_next_tokens_are([Token.LPAREN, Token.STRING, Token.RPAREN, Token.SEMICOLON])
+
+        parser.advance_token(amount=2)  # skip print (
+        text = parser.current_token().data
+        parser.advance_token(amount=3)  # skip string ) ;
+
+        return get_print_string_code(text)
+
+    def compile_function_call_statement(self):
+        """
+        Compile the statement: function_call SEMICOLON
+        The return value of the call is discarded.
+        """
+        call_node = self.function_call()
+        # at the end of this code, we point to one after the return value
+        call_code = call_node.get_code(current_pointer=self.current_stack_pointer())
+
+        self.parser.check_current_token_is(Token.SEMICOLON)
+        self.parser.advance_token()  # skip ;
+
+        return call_code + "<"  # discard return value
+
+    def compile_if(self):
+        # if (expression) statement (else statement)?   note - statement can be scope { }
+        # the parser points at the IF token on entry
+        # returns the generated code; the inner statements are compiled with the stack pointer
+        # raised by 2, to leave room for the two temp cells (expression, execute_else)
+
+        self.parser.check_next_token_is(Token.LPAREN)
+        self.parser.advance_token(amount=2)  # skip to after LPAREN
+
+        expression_code = self.compile_expression()
+        self.parser.check_current_token_is(Token.RPAREN)
+        self.parser.advance_token()  # point to after RPAREN
+
+        # if ... (else ...)?
+        # need to use 2 temp cells
+        # expression, execute_else
+
+        self.increase_stack_pointer(amount=2)
+        inside_if_code = self.compile_statement()
+
+        have_else = self.parser.current_token().type == Token.ELSE
+        if have_else:
+            self.parser.advance_token()  # skip the 'else'
+            inside_else_code = self.compile_statement()
+        self.decrease_stack_pointer(amount=2)
+
+        code = expression_code  # evaluate expression. after this we point to "execute_else" cell
+        if have_else:
+            code += "[-]+"  # execute_else = 1
+        code += "<"  # point to the expression
+        code += "["  # if it is non-zero
+        code += ">"  # point to execute_else
+        if have_else:
+            code += "-"  # execute_else = 0
+        code += ">"  # point to next available cell
+        code += inside_if_code  # after this we point to the same cell (one after execute_else)
+        code += "<<"  # point to expression
+        code += "[-]"  # expression = 0
+        code += "]"  # end if
+        # now we point to next available cell (what used to be expression_code)
+
+        if have_else:
+            # inside_else_code is only bound (and only used) when have_else is True
+            code += ">"  # point to execute_else
+            code += "["  # if it is non-zero
+            code += ">"  # point to next available cell
+            code += inside_else_code  # after this we point to the same cell (one after execute_else)
+            code += "<"  # point to execute_else
+            code += "-"  # execute_else = 0
+            code += "]"  # end if
+            code += "<"  # point to next available cell (what used to be expression_code)
+
+        return code
+
+    def compile_while(self):  # while (expression) statement       note - statement can be scope { }
+        """
+        Compile a while loop; the parser points at the WHILE token on entry.
+
+        The condition code is emitted twice: once before the loop and once at
+        the end of the loop body, to re-evaluate it after every iteration.
+        """
+        self.parser.check_next_token_is(Token.LPAREN)
+        self.parser.advance_token(amount=2)  # skip to after LPAREN
+
+        condition_code = self.compile_expression()
+
+        self.parser.check_current_token_is(Token.RPAREN)
+        self.parser.advance_token()  # point to after RPAREN
+
+        body_code = self.compile_statement()
+
+        # "<" points to the expression value; "[" jumps to after the <while> scope when it is 0
+        evaluate_and_point = condition_code + "<"
+
+        # after body_code the pointer is at the next available cell, i.e. one after the expression
+        return evaluate_and_point + "[" + body_code + evaluate_and_point + "]"
+
+    def compile_do_while(self):  # do statement while (expression) semicolon      note - statement can be scope { }
+        """
+        Compile a do-while loop; the parser must point at the DO token.
+        The trailing "while (expression);" is consumed, including the SEMICOLON.
+        """
+        self.parser.check_current_token_is(Token.DO)
+        self.parser.advance_token()
+
+        body_code = self.compile_statement()
+
+        self.parser.check_current_tokens_are([Token.WHILE, Token.LPAREN])
+        self.parser.advance_token(amount=2)  # point to after LPAREN
+
+        condition_code = self.compile_expression()
+
+        self.parser.check_current_tokens_are([Token.RPAREN, Token.SEMICOLON])
+        self.parser.advance_token(amount=2)  # point to after SEMICOLON
+
+        pieces = [
+            "[-]+",  # set expression to 1. since do while loops executes the scope code first.
+            "[",  # go in scope
+            body_code,  # <do-while> scope code. after this code, pointer points to the same cell. i.e the expression
+            condition_code,  # evaluate the expression, after this code, the pointer is pointing to the next cell
+            "<",  # point to the expression
+            "]",  # after <do-while> scope
+        ]
+
+        return "".join(pieces)
+
+    def compile_switch(self):  # switch (expression) { ((default | case literal): statements* break;? statements*)* }
+        # the parser must point at the SWITCH token
+        # collects every case as (value, code, has_break) and hands the list, together with the
+        # code of the switch expression, to process_switch_cases which builds the final code
+        # raises BFSemanticError for a non-literal case value, a duplicate case value or a second default
+        # raises BFSyntaxError when the cases are not followed by RBRACE
+        self.parser.check_current_tokens_are([Token.SWITCH, Token.LPAREN])
+        self.parser.advance_token(amount=2)  # point to after LPAREN
+
+        self.increase_stack_pointer()  # use 1 temp cell before evaluating the expression
+        expression_code = self.compile_expression()
+        self.parser.check_current_tokens_are([Token.RPAREN, Token.LBRACE])
+        self.parser.advance_token(amount=2)  # point to after LBRACE
+
+        self.increase_stack_pointer()  # use 1 additional temp cell for indicating we need to execute a case
+        cases = list()  # list of tuples: (value/"default" (int or string), case_code (string), has_break(bool))
+
+        while self.parser.current_token().type in [Token.CASE, Token.DEFAULT]:  # (default | CASE literal) COLON statement* break;? statements*
+            if self.parser.current_token().type == Token.CASE:
+                self.parser.advance_token()  # skip CASE
+                constant_value_token = self.parser.current_token()
+                if not is_token_literal(constant_value_token):
+                    raise BFSemanticError("Switch case value is not a literal. Token is %s" % constant_value_token)
+
+                value = get_literal_token_value(constant_value_token)
+                if value in [case for (case, _, _) in cases]:
+                    raise BFSemanticError("Case %d already exists. Token is %s" % (value, constant_value_token))
+            else:
+                assert self.parser.current_token().type == Token.DEFAULT
+                value = "default"
+                if value in [case for (case, _, _) in cases]:
+                    raise BFSemanticError("default case %s already exists." % self.parser.current_token())
+
+            # in both branches the parser now points at the token right before the COLON (the literal or DEFAULT)
+            self.parser.check_next_token_is(Token.COLON)
+            self.parser.advance_token(amount=2)  # point to after COLON
+
+            inner_case_code = ""
+            while self.parser.current_token().type not in [Token.CASE, Token.DEFAULT, Token.RBRACE, Token.BREAK]:
+                inner_case_code += self.compile_statement(allow_declaration=False)  # not allowed to declare variables directly inside case
+
+            has_break = False
+            if self.parser.current_token().type == Token.BREAK:  # ignore all statements after break
+                self.parser.check_next_token_is(Token.SEMICOLON)
+                self.parser.advance_token(amount=2)  # skip break SEMICOLON
+                has_break = True
+                while self.parser.current_token().type not in [Token.CASE, Token.DEFAULT, Token.RBRACE]:
+                    self.compile_statement()  # advance the parser and discard the code
+            cases.append((value, inner_case_code, has_break))
+
+        # the loop above only exits on a token that is neither CASE nor DEFAULT, so this reports anything other than RBRACE
+        if self.parser.current_token().type not in [Token.CASE, Token.DEFAULT, Token.RBRACE]:
+            raise BFSyntaxError("Expected case / default / RBRACE (}) instead of token %s" % self.parser.current_token())
+        self.parser.check_current_token_is(Token.RBRACE)
+        self.parser.advance_token()
+        self.decrease_stack_pointer(amount=2)  # release the 2 temp cells reserved above
+
+        return process_switch_cases(expression_code, cases)
+
+    def compile_break(self):
+        """
+        Always raises NotImplementedError: a break statement is only handled
+        by compile_switch, directly inside a case.
+        """
+        # TODO: Make the break statement in scopes inside switch-case (including if/else), and for/do/while
+        offending_token = self.parser.current_token()
+        message = "Break statement found outside of switch case first scope.\nBreak is not currently implemented for while/for/do statements.\nToken is %s" % offending_token
+        raise NotImplementedError(message)
+
+    def compile_for(self):
+        # for (statement expression; expression) inner_scope_code   note: statement contains ;, and inner_scope_code can be scope { }
+        # (the statement/second expression/inner_scope_code can be empty)
+        # (the statement cannot contain scope - { and } )
+        # returns the code of the whole loop: <statement> <condition> [ <inner scope> <second expression> <condition> ]
+        """
+            <for> is a special case of scope
+            the initial code (int i = 0;) is executed INSIDE the scope, but BEFORE the LBRACE
+            so we manually compile the scope instead of using self.compile_scope():
+
+            we first create an ids map, and in the case that there is a variable definition inside the <for> definition:
+            we manually insert the ID into the ids map, and move the pointer to the right once, to make room for it
+            (this needs to be done before the <for> definition's statement)
+            next, inside the for's scope {}:
+            after calling insert_scope_variables_into_ids_map, we move the pointer to the left once, since it counts the ID we entered manually as well
+            after calling exit_scope, we move the pointer to the right, since it counts the ID we entered manually, and we don't want it to be discarded after every iteration
+            finally, at the end of the <for> loop, we move the pointer once to the left, to discard the variable we defined manually
+        """
+
+        self.parser.check_current_tokens_are([Token.FOR, Token.LPAREN])
+        self.parser.advance_token(amount=2)  # skip for (
+
+        manually_inserted_variable_in_for_definition = False
+        variable = None
+        code = ''
+
+        # =============== enter FOR scope ===============
+        self.add_ids_map()
+        # ===============================================
+
+        if self.parser.current_token().type == Token.INT:
+            # we are defining a variable inside the for statement definition (for (int i = 0....))
+            variable = create_variable_from_definition(self.parser, advance_tokens=False)
+            self.insert_to_ids_map(variable)
+            manually_inserted_variable_in_for_definition = True
+            code += ">" * get_variable_size(variable)  # make room for the variable (one move per cell of its size)
+            # warn unless the definition is followed by an assignment (for an array: after the array access) - presumably next_token(2) is the token right after the ID; verify
+            show_side_effect_warning = self.parser.next_token(2).type != Token.ASSIGN
+            if self.parser.next_token(2).type == Token.LBRACK:
+                show_side_effect_warning = self.get_token_after_array_access(offset=1).type != Token.ASSIGN
+
+            if show_side_effect_warning:
+                print("[Warning] For loop variable '%s' isn't assigned to anything and may cause side effects" % self.parser.next_token())
+
+        if self.parser.current_token().type == Token.LBRACE:  # statement is a scope
+            raise BFSyntaxError("Unexpected scope inside for loop statement - %s" % self.parser.current_token())
+        initial_statement = self.compile_statement()
+        # the condition is compiled once here; its code is emitted twice below (before the loop and at the end of every iteration)
+        condition_expression = self.compile_expression()
+        self.parser.check_current_token_is(Token.SEMICOLON)
+        self.parser.advance_token()  # skip ;
+
+        if self.parser.current_token().type == Token.RPAREN:
+            modification_expression = ""  # no modification expression
+        else:
+            modification_expression = self.compile_expression()
+            modification_expression += "<"  # discard expression value
+        self.parser.check_current_token_is(Token.RPAREN)
+        self.parser.advance_token()  # skip )
+
+        inner_scope_code = ""
+        if self.parser.current_token().type == Token.LBRACE:  # do we have {} as for's statement?
+            # compiling <for> scope inside { }:
+            if manually_inserted_variable_in_for_definition:
+                inner_scope_code += "<" * get_variable_size(variable)  # emitted before the scope-entry code, which counts the manually inserted variable again
+            inner_scope_code += self.insert_scope_variables_into_ids_map()
+            inner_scope_code += self.compile_scope_statements()
+        else:
+            inner_scope_code += self.compile_statement()
+        # =============== exit FOR scope ===============
+        inner_scope_code += self.exit_scope()
+        if manually_inserted_variable_in_for_definition:
+            inner_scope_code += ">" * get_variable_size(variable)  # keep the loop variable alive across iterations
+        # ==============================================
+        # assemble the loop out of the parts compiled above
+        code += initial_statement
+        code += condition_expression  # evaluate expression
+        code += "<"  # point to the expression
+        code += "["  # if it is 0, jump to after the <for> scope
+        code += inner_scope_code  # <for> scope code
+        code += modification_expression
+        code += condition_expression  # re-evaluate the expression
+        code += "<"  # point to the expression
+        code += "]"  # after <for> scope
+
+        if manually_inserted_variable_in_for_definition:
+            code += "<" * get_variable_size(variable)  # discard the loop variable
+
+        return code
+
+    def compile_statement(self, allow_declaration=True):
+        # compiles the statement that begins at the current token and returns its code
+        # when that code finishes running, the pointer is back at the location it had before the statement
+        # allow_declaration=False turns a variable definition into a BFSemanticError
+        # (the error message refers to definitions placed directly inside a case)
+        # a token that cannot start a statement raises BFSyntaxError
+
+        current = self.parser.current_token()
+        kind = current.type
+
+        # variable definition
+        if kind == Token.INT:  # INT ID ((= EXPRESSION) | ([NUM])+ (= ARRAY_INITIALIZATION)?)? SEMICOLON
+            if allow_declaration:
+                return self.compile_variable_declaration()
+            raise BFSemanticError("Cannot define variable (%s) directly inside case. "
+                                  "Can define inside new scope {} or outside the switch statement" % current)
+
+        if kind in (Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE):  # ++ID;
+            return self.compile_expression_as_statement()
+
+        # statement that starts with an ID: the token after it decides what it is
+        if kind == Token.ID:
+            following = self.parser.next_token()
+            if following.type in (Token.ASSIGN, Token.LBRACK, Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE):
+                # ID ASSIGN expression; or ID([expression])+ ASSIGN expression; or ID++;
+                return self.compile_expression_as_statement()
+            if following.type == Token.LPAREN:  # ID(...);  (function call)
+                return self.compile_function_call_statement()
+            raise BFSyntaxError("Unexpected '%s' after '%s'. Expected '=|+=|-=|*=|/=|%%=|<<=|>>=|&=|(|=)|^=' (assignment), '++|--' (modification) or '(' (function call)" % (str(following), str(current)))
+
+        # each of these statement kinds is compiled by one dedicated method
+        # (they are compared one by one, in this order)
+        dedicated_compilers = (
+            (Token.PRINT, self.compile_print_string),
+            (Token.IF, self.compile_if),
+            (Token.LBRACE, self.compile_scope),
+            (Token.WHILE, self.compile_while),
+            (Token.DO, self.compile_do_while),
+            (Token.SWITCH, self.compile_switch),
+            (Token.BREAK, self.compile_break),
+            (Token.RETURN, self.compile_return),
+            (Token.FOR, self.compile_for),
+        )
+        for statement_kind, compile_it in dedicated_compilers:
+            if kind == statement_kind:
+                return compile_it()
+
+        if kind == Token.SEMICOLON:
+            # empty statement
+            self.parser.advance_token()  # skip ;
+            return ""
+
+        # case / default reaching this method are reported as misplaced
+        if kind in (Token.CASE, Token.DEFAULT):
+            raise BFSyntaxError("%s not inside a switch statement" % current)
+
+        # no statement starts with this token
+        raise BFSyntaxError("Invalid statement at " + str(current))
+
+    def compile_scope_statements(self):
+        # compiles statements one after another until the scope's closing RBRACE,
+        # consumes that RBRACE and returns the concatenated code of the statements
+        all_tokens = self.tokens  # only needed for the error message below
+        statements_code = ''
+        while True:
+            token = self.parser.current_token()
+            if token is None:
+                # ran out of tokens before the scope was closed (should never get here)
+                raise BFSyntaxError("expected } after the last token in scope " + str(all_tokens[-1]))
+            if token.type == Token.RBRACE:
+                self.parser.advance_token()  # skip RBRACE - we reached the end of our scope
+                return statements_code
+            statements_code += self.compile_statement()
+
+    def compile_scope(self):
+        # compiles a nested scope and returns its code; the current token must be the opening LBRACE
+        assert self.parser.current_token().type == Token.LBRACE
+        # the three parts are produced in this order: enter the scope, compile its statements, exit it
+        entry_code = self.enter_scope()
+        statements_code = self.compile_scope_statements()
+        exit_code = self.exit_scope()
+        return entry_code + statements_code + exit_code
+
+    def compile_function_scope(self, parameters):
+        # compiles the body of the current function and returns its code
+        # parameters - list of the function's parameters, in the order of their declaration
+        # they are inserted into the new scope before the scope itself is compiled
+
+        """
+            example layout:
+                int global_var1;
+                int global_var2;
+                int foo(int a, int b) {
+                    int x;
+                    int y;
+                    return 5;
+                }
+
+                int main() {
+                    int n;
+                    foo(1, 2);
+                }
+
+                global_var1 global_var2 main_return_value n foo_return_value a=1 b=2 x y
+
+                calling convention:
+                caller responsibility: make room for return_value (and zero its cell), place parameters, point to return_value cell
+                callee responsibility: put return value in return_value cell and point to it (thus "cleaning" parameters)
+                    can assume that there is a zeroed cell at current_stack_pointer (return_value_cell) (therefore ids_map starts at index current_stack_pointer+1)
+                    can assume that the next cells match your parameters
+                    assumes that initially, the pointer points to the first cell (return_value_cell).
+                    therefore begin with '>' * (1 + parameters + scope variables)
+        """
+
+        assert self.parser.current_token().type == Token.LBRACE
+        # order matters: the scope is entered, its statements are compiled, then it is exited
+        scope_entry_code = self.enter_function_scope(parameters)
+        statements_code = self.compile_scope_statements()
+        scope_exit_code = self.exit_scope()
+        back_to_return_value_cell = "<"  # point to return_value_cell
+
+        return scope_entry_code + statements_code + scope_exit_code + back_to_return_value_cell
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Functions.py b/reasoning_gym/code/contrib/bfit/Compiler/Functions.py
new file mode 100644
index 00000000..837e3339
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Functions.py
@@ -0,0 +1,33 @@
+from copy import deepcopy
+from .Exceptions import BFSemanticError
+
+functions = {}  # global registry: function_name --> FunctionCompiler object
+
+
+def insert_function_object(function):  # registers the function under its name, replacing any previous entry with that name
+    functions.update({function.name: function})
+
+
+def get_function_object(name):
+    """
+    returns a deep copy of the function object registered under <name> (KeyError if there is none)
+
+    a copy is needed because a function may be compiled recursively,
+    and compilations sharing one object would interfere with each other's current token pointer etc
+    for example:
+        int increase(int n) { return n+1;}
+        int main() {int x = increase(increase(1));}
+    while compiling the first call, we start a compilation of the same function object in the second call
+    """
+    registered = functions[name]
+    return deepcopy(registered)
+
+
+def check_function_exists(function_token, parameters_amount):
+    # validates a call: the function named by function_token must be registered,
+    # and it must be called with exactly as many parameters as it declares (BFSemanticError otherwise)
+    if function_token.data not in functions:
+        raise BFSemanticError("Function '%s' is undefined" % str(function_token))
+    expected_amount = len(functions[function_token.data].parameters)
+    if expected_amount != parameters_amount:
+        raise BFSemanticError("Function '%s' has %s parameters (called it with %s parameters)" % (str(function_token), expected_amount, parameters_amount))
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/General.py b/reasoning_gym/code/contrib/bfit/Compiler/General.py
new file mode 100644
index 00000000..2a182b8a
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/General.py
@@ -0,0 +1,1258 @@
+from .Exceptions import BFSyntaxError, BFSemanticError
+from .Token import Token
+from functools import reduce
+
+"""
+This file holds functions that generate general Brainfuck code
+And general functions that are not dependent on other objects
+"""
+
+
+# =================
+#  Brainfuck code
+# =================
+
+
+def get_set_cell_value_code(new_value, previous_value, zero_next_cell_if_necessary=True):
+    # returns code that changes the current cell from previous_value to new_value
+    # when the returned code finishes, the pointer is back on that same cell
+
+    # two candidates are built, and the shorter one is returned (the "naive" one on a tie):
+    #   "naive"  - a single "+"/"-" repeated <distance> times
+    #   "looped" - the next cell is used as a loop counter: <a> iterations of <b> adds/subs, then <c> more adds/subs
+    # zero_next_cell_if_necessary=False means the next cell is assumed to already hold 0,
+    # so the "looped" candidate does not spend code on clearing it
+
+    def sign_char(number):
+        return "+" if number > 0 else "-"
+
+    delta = new_value - previous_value
+    step_char = sign_char(delta)
+    decreasing = delta < 0
+    distance = abs(delta)
+
+    def split_into_loop(total):
+        # searches for a, b, c such that a*b+c == total, keeping the first candidate with the smallest |a|+|b|+|c|
+        # (c may be negative: the loop overshoots and the remaining actions go the other way)
+        # falls back to a=total, b=1, c=0 when no candidate beats it
+        best_triple, best_cost = (total, 1, 0), total + 1
+
+        low, high = 1, total // 2 - 1
+        while low <= high:
+            a, b = low + 1, high + 1
+            c = total - a * b
+            cost = abs(a) + abs(b) + abs(c)
+
+            if cost < best_cost:
+                best_triple, best_cost = (a, b, c), cost
+
+            # two-pointer step: shrink b when the product overshoots, otherwise grow a
+            if a * b > total:
+                high -= 1
+            else:
+                low += 1
+
+        return best_triple
+
+    a, b, c = split_into_loop(distance)
+    remainder_char = sign_char(-c if decreasing else c)
+
+    # "looped" code is "[<a> times perform <b> adds/subs] and then <c> more adds/subs"
+    looped = "".join([
+        ">",  # point to next cell (loop counter)
+        "[-]" if zero_next_cell_if_necessary else "",  # zero it if necessary
+        "+" * a,  # set loop counter
+        "[<" + step_char * abs(b) + ">-]",  # each iteration: b actions on the target cell, then counter -= 1
+        "<",  # point back to the target cell
+        remainder_char * abs(c),  # c more actions
+    ])
+
+    # "naive" code is simply +/-, <distance> times
+    naive = step_char * distance
+
+    # the looped form is chosen only when it is strictly shorter
+    if len(looped) < len(naive):
+        return looped
+    return naive
+
+
+def get_move_to_offset_code(offset):
+    # returns code that moves the current cell's value into the cell <offset> places to the left
+    # the pointer ends on the original cell, which is left zeroed (it is now the next available cell)
+    go_left = "<" * offset
+    go_right = ">" * offset
+    pieces = [
+        go_left + "[-]",  # point to destination and zero it
+        go_right,  # point back to the source cell
+        "[" + go_left + "+" + go_right + "-]",  # increase destination, zero source
+    ]
+    return "".join(pieces)
+
+
+def get_copy_to_offset_code(offset):
+    # returns code that copies the current cell's value into the cell <offset> places to the left
+    # the cell right after the current one is used as a temp and ends up zeroed
+    # when the code finishes, the pointer is on the original cell, whose value is restored
+    to_destination = "<" * (offset + 1)  # path from temp to destination
+    to_source = ">" * offset  # path from destination to source
+    return "".join([
+        ">[-]",  # point to temp and zero it
+        to_destination + "[-]",  # point to destination and zero it
+        to_source,  # point to source cell
+        "[>+" + to_destination + "+" + to_source + "-]",  # increase temp and destination, zero source
+        ">",  # point to temp
+        "[<+>-]",  # move temp to original cell
+        "<",  # point to original cell
+    ])
+
+
+def get_copy_to_variable_code(ids_map_list, ID_token, current_pointer):
+    # returns code that copies the current cell's value into the cell of the variable ID_token
+    # the pointer ends on the original cell, whose value is unchanged
+    # the variable's cell is addressed by its offset from current_pointer
+
+    return get_copy_to_offset_code(get_offset_to_variable(ids_map_list, ID_token, current_pointer))
+
+
+def get_move_to_return_value_cell_code(return_value_cell, current_stack_pointer):
+    # returns code that moves the current cell's value into the return_value cell
+    # the pointer ends on the original cell, which is now the next available cell
+    # the return_value cell lies <current_stack_pointer - return_value_cell> cells to the left
+    cells_to_the_left = current_stack_pointer - return_value_cell
+    return get_move_to_offset_code(cells_to_the_left)
+
+
+def unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, array_dimensions, literal_tokens_list):
+    # flattens a (possibly nested) list of literal tokens into one list that fills an array of array_dimensions
+    # wherever the initializer is shorter than its dimension, the gap is filled with NUM 0 tokens
+    def zero_tokens(amount):
+        return [Token(Token.NUM, 0, 0, "0")] * amount  # a non-positive amount yields an empty list
+    if not array_dimensions:
+        raise BFSemanticError("Tried to initialize array %s with too many nested sub-arrays" % ID_token)
+    if array_dimensions[0] < len(literal_tokens_list):
+        raise BFSemanticError("Tried to initialize array %s dimension %s with too many elements (%s)"
+                              % (ID_token, str(array_dimensions), str(len(literal_tokens_list))))
+    inner_dimensions = array_dimensions[1:]
+    flat = []
+    for item in literal_tokens_list:
+        if isinstance(item, list):
+            # a nested initializer is unpacked recursively against the sub-dimensions (E.g [3][3] for arr[3][3][3])
+            flat += unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, inner_dimensions, item)
+            continue
+        flat.append(item)
+        if len(array_dimensions) > 1:  # a bare literal opens a sub-array whose remaining elements are zeros
+            flat += zero_tokens(dimensions_to_size(inner_dimensions) - 1)
+    flat += zero_tokens(dimensions_to_size(array_dimensions) - len(flat))  # fill the rest of this dimension with zeros
+    return flat
+
+
+def unpack_literal_tokens_to_array_dimensions(ID_token, array_dimensions, literal_tokens_list):
+    # receives the array's dimensions and its initializer: a list (of lists of ...) of literal tokens
+    # returns one flat list of literal tokens, usable for initializing the array as if it were one dimensional
+    # places the initializer does not cover are filled with NUM 0 tokens
+    # E.g for int arr[3][3] = {{1,2,3}, {}, {7, 8}} (assuming dimensions_to_size is the product of the dimensions)
+    # the arguments are ([3,3] and [[1,2,3],[],[7,8]]) and the result is [1,2,3,0,0,0,7,8,0] (all are tokens)
+    array_size = dimensions_to_size(array_dimensions)  # amount of cells to fill
+    has_nested_initializer = any(isinstance(element, list) for element in literal_tokens_list)
+    if has_nested_initializer:
+        unpacked = unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, array_dimensions, literal_tokens_list)
+    else:
+        # special case - a flat initializer is taken as-is, without caring about the dimensions
+        # E.g arr[3][3] = {1,2,3,4} gives [1,2,3,4,0,0,0,0,0]
+        missing = array_size - len(literal_tokens_list)
+        unpacked = literal_tokens_list + [Token(Token.NUM, 0, 0, "0")] * missing  # fill missing with zeros
+    if len(unpacked) > array_size:
+        raise BFSemanticError("Tried to initialize array %s with incompatible amount of literals."
+                              " (array size is %s and literals size is %s)" % (ID_token, str(array_size), str(len(unpacked))))
+    assert len(unpacked) == array_size
+    return unpacked
+
+
+def process_switch_cases(expression_code, cases):
+    # This function receives expression_code (string) and cases (list of tuples) corresponding to switch cases
+    # Each tuple is (case_value, case_code, has_break); the default case is the one whose case_value is "default"
+    # And it returns code for the switch-case statement (string)
+    # The caller's list is not modified: the sort below is applied to a filtered copy
+    if len(cases) == 0:
+        code = ">"  # point to next cell
+        code += expression_code  # evaluate expression
+        code += "<"  # point to expression
+        code += "<"  # discard result
+        return code
+
+    def process_cases(cases):
+        # This function gets the cases list of tuples
+        # And returns 2 values: default_code (string), all_cases_have_break (bool)
+        # Note - default_code includes code of all relevant cases that are after the default case (if there's no break)
+        all_cases_have_break = all(has_break for (_, _, has_break) in cases)
+
+        has_default, default_code = False, ""
+        for case, case_code, has_break in cases:
+            if case == "default":
+                has_default = True
+            if has_default:
+                default_code += case_code
+                if has_break:
+                    break
+        return default_code, all_cases_have_break
+
+    default_code, all_cases_have_break = process_cases(cases)
+
+    # using 2 temp cells: need_to_execute, expression_value
+    # need_to_execute - initialized with 1, zeroed if running any case. indicating we should execute code for one of the cases
+    # expression_value - initialized with expression's value, this is what we compare our cases' values to
+
+    code = "[-]+"  # need_to_execute = 1
+    code += ">"  # point to next cell
+    code += expression_code  # evaluate expression
+    code += "<"  # point to expression
+
+    if all_cases_have_break:  # small optimization for evaluating the expression
+        cases = [case for case in cases if case[0] != "default"]  # remove default to be able to sort. it is handled differently
+        cases.sort(key=lambda x: x[0], reverse=True)  # Can sort since correct flow is not needed
+    # (the comparison loop below walks the list in reverse, so after this sort it compares in ascending case value)
+    """
+        This loop compares the expression value to each case in the switch-case statement, in reverse order
+        It does so by increasing and decreasing expression, and comparing result to 0
+        E.G. if we have 
+            switch(x) {
+                case 2:
+                case 0:
+                case 5: 
+                case 1:
+            }
+        x will be put in <expression> cell, then:
+        Iteration 1 will "increase" <expression> cell by -1 (0-1) (comparing x with 1)
+        Iteration 2 will "increase" <expression> cell by -4 (1-5) (comparing x with 5)
+        Iteration 3 will increase   <expression> cell by +5 (5-0) (comparing x with 0)
+        Iteration 4 will "increase" <expression> cell by -2 (0-2) (comparing x with 2)
+    """
+
+    # at this point, we point to expression_value cell
+    comparisons = 0  # number of "[" opened by the loop below that still need a matching "]"
+    last_case_val = 0  # what has been added to the expression cell so far (minus the last compared case value)
+    for case, _, _ in reversed(cases):
+        if case == "default":
+            continue  # default is handled differently
+        code += get_set_cell_value_code(-case, last_case_val)
+        last_case_val = -case
+        code += "["  # "if zero then jump to matching code part"
+        comparisons += 1
+
+    """
+    Then we add each case's code in the correct order:
+    <need_to_execute=1>
+    <compare_with_1>    [
+    <compare_with_5>        [
+    <compare_with_0>            [ 
+    <compare_with_2>                [
+                                        <default_code> <expression_value=0> <need_to_execute=0>
+                                    ]   <if need_to_execute> <code_for_2> <need_to_execute=0>
+                                ]       <if need_to_execute> <code_for_0> <need_to_execute=0>
+                            ]           <if need_to_execute> <code_for_5> <need_to_execute=0>
+                        ]               <if need_to_execute> <code_for_1> <need_to_execute=0>
+
+    notice each case uses the next case's ']' instruction to return to the comparisons block
+    for example, the '[' in case 5 line uses the ']' of case 1 code to "return" to the comparisons
+    this is because there is no way to "skip" code
+    """
+
+    # This code will execute after all the comparisons are done and none of the cases executed
+    if default_code:
+        code += ">"  # point to next available cell for running the "default" code
+        code += default_code  # add code for default case (it also includes all the following cases until break)
+        code += "<"  # point to expression_value
+    code += "<-"  # need_to_execute = 0
+    code += ">[-]"  # expression_value = 0. When going back to last comparison, it will be 0, so we skip the default
+    if comparisons > 0:
+        code += "]"  # "jump back address" of the last comparison
+        comparisons -= 1
+
+    # Add all the cases code
+    for case_index, (case, case_code, has_break) in enumerate(cases):
+        if case == "default":
+            continue  # default is handled differently
+        if has_break or case_code or default_code:  # Meaning this case is not identical to the following case
+            # Or there exist a default case. And because it is handled differently, we need to have its code multiple times in different locations
+            # (if they are identical then no need to generate the same code multiple times (one for each case).
+            # this case will use the following case's code in the next loop iteration)
+
+            # Generate code for this case (unique)
+            code += "<"  # point to need_to_execute
+            code += "["  # if its non-zero (i.e need to execute the code for this case)
+            code += ">>"  # point to next available cell for running the code
+
+            # Insert the code from this case and all the following cases until reaching break
+            # This generates a lot of code since each case includes all following cases until reaching break
+            for _, following_case_code, following_has_break in cases[case_index:]:
+                code += following_case_code
+                if following_has_break:
+                    break
+            code += "<<"  # point to need_to_execute
+            code += "-"  # need_to_execute=0
+            code += "]"  # end if
+            code += ">"  # point to expression_value
+
+        if comparisons > 0:
+            code += "]"  # "jump back address" of the comparison before us
+            comparisons -= 1
+
+    # end of the switch-case
+    code += "<"  # point to need_to_execute, which becomes next available cell
+    return code
+
+
+def get_copy_from_variable_code(ids_map_list, ID_token, current_pointer):
+    # returns code that copies the value of the variable ID_token into the current cell (res), using the next cell as temp
+    # the pointer ends on that temp cell, which is zero again and is the next available cell
+    offset = get_offset_to_variable(ids_map_list, ID_token, current_pointer)
+    temp_to_variable = "<" * (offset + 1)
+    variable_to_temp = ">" * (offset + 1)
+    return "".join([
+        "[-]" + ">[-]",  # res = 0, temp (next cell) = 0
+        temp_to_variable,  # point to the variable's cell
+        "[" + ">" * offset + "+>+" + temp_to_variable + "-]",  # increase res and temp, zero the variable
+        variable_to_temp,  # point to temp
+        "[" + temp_to_variable + "+" + variable_to_temp + "-]",  # move temp back into the variable
+    ])
+
+
+def get_token_ID_code(ids_map_list, token, current_pointer):
+    # evaluating an ID token means copying its variable's value to the current cell; the pointer then points to the next available cell
+    return get_copy_from_variable_code(ids_map_list=ids_map_list, ID_token=token, current_pointer=current_pointer)
+
+
+def get_literal_token_code(token):
+    # returns code that stores the literal's value in the current cell and then points to the next cell
+    # TRUE is stored as 1 and FALSE as 0
+    # any other literal is stored as the value returned by get_literal_token_value
+    assert is_token_literal(token)
+
+    zero_current_cell = "[-]"
+    point_to_next_cell = ">"
+
+    # pick the code that sets the (already zeroed) current cell
+    if token.type == Token.TRUE:
+        set_value = "+"  # current cell = 1
+    elif token.type == Token.FALSE:
+        set_value = ""  # current cell stays 0
+    else:
+        value = get_literal_token_value(token)
+        set_value = get_set_cell_value_code(value, 0)  # set current cell to the value
+
+    # every literal compiles to: zero the cell, set it, point to the next cell
+    return zero_current_cell + set_value + point_to_next_cell
+
+
+def get_divmod_code(right_token=None):
+    # given that the current pointer points to a, and the cell after a contains b,
+    # (i.e the cells look like: --> a, b, ?, ?, ?, ?, ...)
+    # returns a code that calculates divmod, and the cells look like this:
+    # --> 0, b-a%b, a%b, a/b, 0, 0
+    # and the pointer points to the first 0 (which is in the same cell as a used to be)
+    ADD_DIVISION_BY_ZERO_CHECK = True
+    # right_token (optional): when it is a NUM token, a zero divisor raises BFSemanticError right here and the generated runtime check is omitted
+    if right_token is not None and right_token.type == Token.NUM:
+        if get_NUM_token_value(right_token) == 0:
+            raise BFSemanticError("Dividing by Zero, at %s" % right_token)
+
+        ADD_DIVISION_BY_ZERO_CHECK = False
+
+    def get_if_equal_to_0_code(inside_if_code, offset_to_temp_cell):
+        """
+        given a <inside_if_code>, wraps it with an "if (current_cell == 0) {<inside_if_code>}"
+
+        in the process, it zeros the current cell
+        additionally, it uses a temp cell
+        the argument <offset_to_temp_cell> is the offset from the current cell to the temp cell
+        *** note that the temp cell must be AFTER the cells that the <inside_if_code> touches ***
+
+        <inside_if_code> should assume it starts running when pointing to the current cell
+        and it should end its run pointing to the same cell
+        """
+
+        # temp cell is initialized to 1, and holds a flag of whether or not we should run <inside_if_code> or not
+        # if cell to evaluate is not zero, we set this flag to 0
+
+        code = ">" * offset_to_temp_cell  # point to temp
+        code += "[-]+"  # temp = 1
+        code += "<" * offset_to_temp_cell  # point to cell to compare to 0
+
+        code += "["  # if it is not zero
+        code += ">" * offset_to_temp_cell  # point to temp
+        code += "-"  # temp = 0
+        code += "<" * offset_to_temp_cell  # point to cell
+        code += "[-]"  # zero the cell
+        code += "]"  # end if
+
+        code += ">" * offset_to_temp_cell  # point to temp cell
+        code += "["  # if it is non zero
+        code += "<" * offset_to_temp_cell  # point to cell
+        code += inside_if_code  # execute desired code
+        # at this point we point to the original cell
+        code += ">" * offset_to_temp_cell  # point to temp cell
+        code += "-"  # temp = 0
+        code += "]"  # end if
+        code += "<" * offset_to_temp_cell  # point back to original cell
+
+        return code
+
+    code = ""
+
+    if ADD_DIVISION_BY_ZERO_CHECK:
+        # create a prefix code: if (b == 0) {print("Error - Division by zero\n");}
+
+        # copy b to temp cell (via another temp cell) and compare that cell to 0. if its 0, execute error print and go to infinite loop
+
+        code += ">>"  # point to empty cell
+        code += "[-]>[-]"  # zero 2 temp cells
+        code += "<<"  # point to b
+        code += "[>+>+<<-]"  # move b to both cells
+        code += ">"  # point to first cell
+        code += "[<+>-]"  # move first cell back to b
+        code += ">"  # point to second cell
+
+        code_inside_if = get_print_string_code("Error - Division by zero\n")
+        code_inside_if += "[]"  # infinite loop (presumably the cell is non-zero after the print code - verify against get_print_string_code)
+
+        code += get_if_equal_to_0_code(code_inside_if, offset_to_temp_cell=1)
+        code += "<<<"  # point to a
+
+        # ======================= end of prefix =======================
+
+    # a, b, w, x, y, z
+
+    code += ">>[-]>[-]>[-]>[-]<<<<<"  # zero w,x,y,z, and point to a
+    code += "["  # while a != 0
+
+    code += "-"  # decrease a by 1
+    code += ">-"  # decrease b by 1
+    code += ">+"  # increase w by 1
+    code += "<"  # point to b
+    code += "[->>>+>+<<<<]>>>>[-<<<<+>>>>]"  # copy b to y (via z)
+    code += "<"  # point to y
+
+    code_inside_if = ""
+    code_inside_if += "<+"  # increase x by 1
+    code_inside_if += "<"  # point to w
+    code_inside_if += "[-<+>]"  # copy w to b (b is already 0) (after this we point to w)
+    code_inside_if += ">>"  # point to y
+
+    # get_if_equal_to_0 also zeros y
+    # i set offset_to_temp_cell = 1 because it can use z, since it is unused inside the if
+    code += get_if_equal_to_0_code(inside_if_code=code_inside_if, offset_to_temp_cell=1)
+
+    code += "<<<<"  # point to a
+    code += "]"  # end while
+
+    """
+    a, b, w, x, y, z
+
+
+    w, x, y, z = 0, 0, 0, 0
+
+    while a != 0
+        a -= 1
+        b -= 1
+        w += 1
+
+        if b == 0:  (this means that w = original b) (implementation: copy b to y (via z) and compare y to 0, (then zero y))
+            x += 1
+            b = w
+            w = 0
+
+    at the end:
+    w = a%b
+    x = a/b
+    b = b-a%b
+    """
+
+    return code
+
+
+def get_bitwise_code(code_logic):
+    # cell layout (pointer starts at a): a, b, c, w, x, y, z, bit1, bitcounter, res
+    # code_logic runs while pointing at z and combines the bits in z and bit1 into y (y is zero when it starts).
+    # code_logic must leave z and bit1 zeroed and end with the pointer at bit1.
+
+    code = ">" * 7  # point to bit1
+    code += "[-]"  # zero bit1
+    code += ">"  # point to bitcounter
+    code += ">[-]<"  # zero res
+
+    code += "[-]--------[++++++++"  # while bitcounter != 8:
+    code += "<"  # point to bit1
+    code += "<[-]" * 5  # clear c, w, x, y, z
+    code += "++"  # c = 2
+    code += "<<"  # point to a
+
+    code += "["  # while a != 0:
+    code +=     "-"  # a -= 1
+    code +=     ">>-"  # c -= 1
+    code +=     "[>+>>+<<<-]>[<+>-]"  # copy c to y (using w)
+    code +=     ">>"  # point to y
+    code +=     ">>+<<"  # bit1 += 1
+
+    code +=     "-["  # if y != 1:
+    code +=         "<+"  # x += 1
+    code +=         "<<++"  # c += 2 (c was 0)
+    code +=         ">" * 5  # point to bit1
+    code +=         "--"  # bit1 -= 2 (bit1 was 2)
+    code +=         "<<"  # point to y
+    code +=         "+"  # set y to 0
+    code +=     "]"  # end if
+
+    code +=     "<<<<<"  # point to a
+    code += "]"  # end while
+
+    code += ">>>>[<<<<+>>>>-]"  # move x to a (x is a/2)
+    code += "<<[-]++"  # c = 2
+    code += "<"  # point to b
+
+    code += "["  # while b != 0:
+    code +=     "-"  # b -= 1
+    code +=     ">-"  # c -= 1
+    code +=     "[>+>>+<<<-]>[<+>-]"  # copy c to y (using w)
+    code +=     ">>"  # point to y
+    code +=     ">+<"  # z += 1
+
+    code +=     "-["  # if y != 1:
+    code +=         ">--<"  # z -= 2 (z was 2)
+    code +=         "<+"  # x += 1
+    code +=         "<<++"  # c += 2 (c was 0)
+    code +=         ">>>"  # point to y
+    code +=         "+"  # set y to 0
+    code +=     "]"  # end if
+
+    code +=     "<<<<"  # point to b
+    code += "]"  # end while
+
+    # z is b % 2 (bit1 already holds a % 2 from the first loop)
+    # x is b / 2
+
+    code += ">>>[<<<+>>>-]"  # move x to b
+
+    code += ">>"  # point to z
+    code += code_logic  # pointer ends at bit1, z and bit1 should be 0 after code
+
+    code += ">[<+<+>>-]<[>+<-]"  # copy bitcounter to z (using bit1 as temp)
+
+    # y = y << z
+    code += "<"  # point to z
+    code += "["  # while z != 0:
+    code += "<"  # point to y
+    code += "[<+>-]"  # move y to x
+    code += "<[>++<-]"  # y = 2 * x (x ends up zero)
+    code += ">>-"  # z -= 1
+    code += "]"  # end while
+
+    code += "<"  # point to y
+    code += "[>>>>+<<<<-]"  # res += y
+
+    code += ">>>"  # point to bitcounter
+    code += "-" * 7  # bitcounter -= 7; the loop head adds 8 back (net +1), so we exit after the pass where it was 7
+
+    code += "]"  # end while
+
+    code += ">[<<<<<<<<<+>>>>>>>>>-]"  # move res to a
+    code += "<<<<<<<<"  # point to b
+
+    return code
+
+
+def get_unary_prefix_op_code(token, offset_to_variable=None):
+    # returns code that:
+    # performs op on an operand that is at the current pointer
+    # the result is placed in the cell of the operand
+    # and the pointer points to the cell right after it (which becomes the next available cell)
+
+    if token.type == Token.NOT:
+        # a temp
+        code = ">"  # point to temp
+        code += "[-]+"  # temp = 1
+        code += "<"  # point to a
+        code += "["  # if a is non-zero
+        code += ">-"  # temp = 0
+        code += "<[-]"  # zero a
+        code += "]"  # end if
+
+        code += ">"  # point to temp
+        code += "["  # if temp is non-zero
+        code += "<+"  # a = 1
+        code += ">-"  # temp = 0
+        code += "]"  # end if
+
+        return code
+
+    elif token.type == Token.INCREMENT:
+        # returns code that copies the value from the variable's cell at the given offset, and adds 1 to both the copied and the original cell
+        assert offset_to_variable is not None
+        offset = offset_to_variable
+
+        code = "[-]"  # res = 0
+        code += ">[-]"  # temp (next pointer) = 0
+        code += "<" * (offset + 1)  # point to destination cell
+        code += "+"  # increase destination by 1
+        code += "[" + ">" * offset + "+>+" + "<" * (offset + 1) + "-]"  # increase res and temp, zero destination
+        code += ">" * (offset + 1)  # point to temp
+        code += "[" + "<" * (offset + 1) + "+" + ">" * (offset + 1) + "-]"  # copy temp back to destination
+        # at this point we point to the next available cell, which is temp, which is now zero
+
+        return code
+
+    elif token.type == Token.DECREMENT:
+        # returns code that copies the value from the variable's cell at the given offset, and subtracts 1 from both the copied and the original cell
+        assert offset_to_variable is not None
+        offset = offset_to_variable
+
+        code = "[-]"  # res = 0
+        code += ">[-]"  # temp (next pointer) = 0
+        code += "<" * (offset + 1)  # point to destination cell
+        code += "-"  # decrease destination by 1
+        code += "[" + ">" * offset + "+>+" + "<" * (offset + 1) + "-]"  # increase res and temp, zero destination
+        code += ">" * (offset + 1)  # point to temp
+        code += "[" + "<" * (offset + 1) + "+" + ">" * (offset + 1) + "-]"  # copy temp back to destination
+        # at this point we point to the next available cell, which is temp, which is now zero
+
+        return code
+
+    elif token.type == Token.UNARY_MULTIPLICATIVE:
+        # returns code that copies the value from the variable's cell at the given offset, modifies both the copied and the original cell depending on the op
+        assert offset_to_variable is not None
+        offset = offset_to_variable
+
+        if token.data in ["**", "//"]:
+            code = "[-]"  # res = 0
+            code += ">[-]"  # temp (next pointer) = 0
+            code += "<" * (offset + 1)  # point to destination cell
+            code += "[" + ">" * offset + "+>+" + "<" * (offset + 1) + "-]"  # increase res and temp, zero destination
+            code += ">" * offset  # point to res
+            code += ">"  # point to temp (**x, //x keep x the same)
+            code += "[" + "<" * (offset + 1) + "+" + ">" * (offset + 1) + "-]"  # copy temp back to destination
+            # at this point we point to the next available cell
+
+            return code
+
+        elif token.data == "%%":
+            code = "[-]"  # res = 0
+            code += "<" * offset  # point to destination cell
+            code += "[-]"  # zero destination
+            code += ">" * offset  # point to res
+            code += ">"  # point the next available cell
+            # at this point we point to the next available cell
+
+            return code
+
+        else:
+            raise BFSyntaxError("Unexpected unary prefix %s" % str(token))
+
+    elif token.type == Token.BITWISE_NOT:
+        # a temp
+        code = ">[-]<[>+<-]"  # zero temp (the next cell may hold leftovers, like the other branches assume), then move a into temp
+        code += ">"  # point to temp
+        code += "+[<->-]"  # a = -(temp + 1), which is ~a with wrapping cells; temp ends at 0
+
+        return code
+
+    elif token.type == Token.BINOP:
+        assert token.data in ["+", "-"]
+        if token.data == "+":
+            # keep value as-is
+            return '>'
+        elif token.data == "-":
+            # a temp
+            code = ">[-]" # zero temp
+            code += "<" # point to a
+            code += "[->-<]" # sub a from temp
+            code += ">" # point to temp
+            code += "[<+>-]" # copy temp to a
+            return code
+    raise NotImplementedError
+
+
+def get_unary_postfix_op_code(token, offset_to_variable):
+    # returns code that:
+    # evaluates a postfix op (x++, x--, x**, x//, x%%) on the variable located offset_to_variable cells to the left
+    # the variable's value from before the op is placed in the current cell (res)
+    # and the pointer points to the cell right after it (which becomes the next available cell)
+
+    if token.type == Token.INCREMENT:
+        # returns code that copies the value from the variable's cell at the given offset, and adds 1 to the original cell
+        offset = offset_to_variable
+
+        code = "[-]"  # res = 0
+        code += ">[-]"  # temp (next pointer) = 0
+        code += "<" * (offset + 1)  # point to destination cell
+        code += "[" + ">" * offset + "+>+" + "<" * (offset + 1) + "-]"  # increase res and temp, zero destination
+        code += ">" * (offset + 1)  # point to temp
+        code += "+"  # increase temp by 1
+        code += "[" + "<" * (offset + 1) + "+" + ">" * (offset + 1) + "-]"  # move temp back to destination
+        # at this point we point to the next available cell, which is temp, which is now zero
+
+        return code
+
+    elif token.type == Token.DECREMENT:
+        # returns code that copies the value from the variable's cell at the given offset, and subtracts 1 from the original cell
+        offset = offset_to_variable
+
+        code = "[-]"  # res = 0
+        code += ">[-]"  # temp (next pointer) = 0
+        code += "<" * (offset + 1)  # point to destination cell
+        code += "[" + ">" * offset + "+>+" + "<" * (offset + 1) + "-]"  # increase res and temp, zero destination
+        code += ">" * (offset + 1)  # point to temp
+        code += "-"  # decrease temp by 1
+        code += "[" + "<" * (offset + 1) + "+" + ">" * (offset + 1) + "-]"  # move temp back to destination
+        # at this point we point to the next available cell, which is temp, which is now zero
+
+        return code
+
+    elif token.type == Token.UNARY_MULTIPLICATIVE:
+        # returns code that copies the value from the variable's cell at the given offset, and modifies the original cell depending on the operation
+        offset = offset_to_variable
+
+        code = "[-]"  # res = 0
+        code += ">[-]"  # temp (next pointer) = 0
+        code += "<" * (offset + 1)  # point to destination cell
+        code += "[" + ">" * offset + "+>+" + "<" * (offset + 1) + "-]"  # increase res and temp, zero destination
+        code += ">" * (offset + 1)  # point to temp
+
+        if token.data in ["**", "//"]:
+            pass  # x**, x// keeps x the same
+        elif token.data == "%%":
+            # at this point we zeroed x and we point to temp (next available cell); temp is left non-zero here
+            return code  # no need to copy anything back to destination - x%% modifies x to 0
+        else:
+            raise BFSyntaxError("Unexpected unary postfix %s" % str(token))
+
+        code += "[" + "<" * (offset + 1) + "+" + ">" * (offset + 1) + "-]"  # move temp back to destination
+        # at this point we point to the next available cell, which is temp, which is now zero
+
+        return code
+
+    raise NotImplementedError
+
+
+def get_op_between_literals_code(op_token, right_token=None):
+    # returns code that:
+    # performs op on 2 operands
+    # the first operand is at current pointer, and the second operand is at current pointer + 1
+    # the code can destroy second operand, and everything after it
+
+    # the result is placed in the cell of the first operand
+    # and the pointer points to the cell right after it (which becomes the next available cell)
+
+    op = op_token.data
+    if op == "+" or op == "-":
+        code = ">[<" + op + ">-]"  # increase/decrease the first operand and decrease the second operand
+        # the pointer now points to the next available cell, which is the second operand, which is 0
+
+        return code
+
+    elif op == "*":
+        # a, b, temp1, temp2
+        code = ">>[-]"  # temp1 = 0
+        code += ">[-]"  # temp2 = 0
+        code += "<<<"  # point to first operand
+        code += "[>>>+<<<-]"  # move first operand to temp2
+        code += ">>>"  # point to temp2
+
+        # do in a loop: as long as temp2 != 0
+        code += "["
+
+        code += "<<"  # point to second operand
+        code += "[<+>>+<-]"  # add it to first operand and temp1
+        code += ">"  # point to temp1
+        code += "[<+>-]"  # move it to second operand
+
+        # end loop
+        code += ">"  # point back to temp2
+        code += "-"  # decrease temp2
+        code += "]"
+
+        code += "<<"  # point back to next available cell (second operand)
+        return code
+
+    elif op == "/":
+        code = get_divmod_code(right_token)
+        code += ">>>"  # point to a/b
+        code += "[<<<+>>>-]"  # move a/b into the first operand's cell
+        code += "<<"  # point to next available cell
+
+        return code
+
+    elif op == "%":
+        code = get_divmod_code(right_token)
+        code += ">>"  # point to a%b
+        code += "[<<+>>-]"  # move a%b into the first operand's cell
+        code += "<"  # point to next available cell
+
+        return code
+
+    # relops
+    elif op == "==":
+        # a, b
+        code = "[->-<]"  # a = 0, b = b - a
+        code += "+"  # a = 1. will hold the result. if a == b, this is unchanged
+        code += ">"  # point to b
+        code += "["  # if b != 0 (the operands differ), enter the following code
+        code += "<->[-]"  # a = 0, b=0
+        code += "]"  # end of "loop"
+
+        return code
+
+    elif op == "!=":
+        # a, b
+        code = "[->-<]"  # a = 0, b = b - a
+        # a will hold the result. if a == b, it stays 0
+        code += ">"  # point to b
+        code += "["  # if b != 0 (the operands differ), enter the following code
+        code += "<+>[-]"  # a = 1, b=0
+        code += "]"  # end of "loop"
+
+        return code
+
+    elif op == ">":
+        # a, b, c, d
+
+        code = ">>[-]"  # c = 0  (will hold res)
+        code += ">[-]"  # d = 0
+        code += "<<<"  # point to a
+
+        code += "["  # while a != 0
+
+        code += ">>[-]"  # c = 0
+        code += "<"  # point to b
+        code += "[>+>+<<-]>[<+>-]"  # copy b to d (via c)
+        code += "+"  # c = 1 (will hold res)
+        code += ">"  # point to d
+        code += "["  # if d != 0
+        code += "[-]"  # d = 0
+        code += "<-"  # c = 0
+        code += "<-"  # b -= 1
+        code += ">>"  # point to d
+        code += "]"  # end if
+
+        code += "<<<"  # point to a
+        code += "-"  # a -= 1
+
+        code += "]"  # end while
+
+        # move c to a
+        code += ">>"  # point to c
+        code += "[<<+>>-]"  # move c to a
+        code += "<"  # point to b (next available cell)
+
+        """
+        x > y?
+
+        res = 0
+        while x != 0:
+            res = 1
+            if y != 0:
+                res = 0
+                y -= 1
+
+            x -= 1
+        """
+
+        return code
+
+    elif op == "<":
+        # similar to >
+
+        # a, b, c, d
+
+        code = ">>[-]"  # c = 0  (will hold res)
+        code += ">[-]"  # d = 0
+        code += "<<"  # point to b
+
+        code += "["  # while b != 0
+
+        code += ">[-]"  # c = 0
+        code += "<<"  # point to a
+        code += "[>>+>+<<<-]>>[<<+>>-]"  # copy a to d (via c)
+        code += "+"  # c = 1 (will hold res)
+        code += ">"  # point to d
+        code += "["  # if d != 0
+        code += "[-]"  # d = 0
+        code += "<-"  # c = 0
+        code += "<<-"  # a -= 1
+        code += ">>>"  # point to d
+        code += "]"  # end if
+
+        code += "<<"  # point to b
+        code += "-"  # b -= 1
+
+        code += "]"  # end while
+
+        # move c to a
+        code += "<"  # point to a
+        code += "[-]"  # a = 0
+        code += ">>"  # point to c
+        code += "[<<+>>-]"  # move c to a
+        code += "<"  # point to b (next available cell)
+
+        """
+        x < y?
+
+        res = 0
+        while y != 0:
+            res = 1
+            if x != 0:
+                res = 0
+                x -= 1
+
+            y -= 1
+        """
+
+        return code
+
+    elif op == "<=":
+        # a, b, c, d
+
+        code = ">>[-]+"  # c = 1  (will hold res)
+        code += ">[-]"  # d = 0
+        code += "<<<"  # point to a
+
+        code += "["  # while a != 0
+
+        code += ">>[-]"  # c = 0
+        code += "<"  # point to b
+        code += "[>+>+<<-]>[<+>-]"  # copy b to d (via c)
+        code += ">"  # point to d
+        code += "["  # if d != 0
+        code += "[-]"  # d = 0
+        code += "<+"  # c = 1
+        code += "<-"  # b -= 1
+        code += ">>"  # point to d
+        code += "]"  # end if
+
+        code += "<<<"  # point to a
+        code += "-"  # a -= 1
+
+        code += "]"  # end while
+
+        # move c to a
+        code += ">>"  # point to c
+        code += "[<<+>>-]"  # move c to a
+        code += "<"  # point to b (next available cell)
+
+        """
+        x <= y?
+
+        res = 1
+        while x != 0:
+            res = 0
+
+            if y != 0:
+                res = 1
+                y -= 1
+
+            x -= 1
+        """
+
+        return code
+
+    elif op == ">=":
+        # similar to <=
+
+        # a, b, c, d
+
+        code = ">>[-]+"  # c = 1  (will hold res)
+        code += ">[-]"  # d = 0
+        code += "<<"  # point to b
+
+        code += "["  # while b != 0
+
+        code += ">[-]"  # c = 0
+        code += "<<"  # point to a
+        code += "[>>+>+<<<-]>>[<<+>>-]"  # copy a to d (via c)
+        code += ">"  # point to d
+        code += "["  # if d != 0
+        code += "[-]"  # d = 0
+        code += "<+"  # c = 1
+        code += "<<-"  # a -= 1
+        code += ">>>"  # point to d
+        code += "]"  # end if
+
+        code += "<<"  # point to b
+        code += "-"  # b -= 1
+
+        code += "]"  # end while
+
+        # move c to a
+        code += "<"  # point to a
+        code += "[-]"  # a = 0
+        code += ">>"  # point to c
+        code += "[<<+>>-]"  # move c to a
+        code += "<"  # point to b (next available cell)
+
+        """
+        x >= y?
+
+        res = 1
+        while y != 0:
+            res = 0
+
+            if x != 0:
+                res = 1
+                x -= 1
+
+            y -= 1
+        """
+
+        return code
+
+    elif op == "<<":
+        # a, b, temp
+
+        code = ">>[-]"  # zero temp
+        code += "<"  # point to b
+
+        code += "["  # while b != 0
+        code += "<"  # point to a
+        code += "[>>+<<-]"  # move a to temp
+        code += ">>"  # point to temp
+        code += "[<<++>>-]"  # a = 2 * temp (temp ends up zero)
+        code += "<-"  # point to b and b -= 1
+        code += "]"  # end while
+
+        return code
+
+    elif op == ">>":
+        # a, b, c, x, y, z
+
+        code = ">"  # point to b
+        code += ">[-]" * 4  # clear 4 cells
+        code += "<" * 4  # point to b
+
+        code += "["  # while b != 0
+        code += ">++"  # set c to 2
+        code += "<<"  # point to a
+
+        code += "["  # while a != 0
+        code += "-"  # a -= 1
+        code += ">>-"  # c -= 1
+        code += "[>>+>+<<<-]>>>[<<<+>>>-]"  # copy c to y (via z)
+        code += "<"  # point to y
+
+        code += "-["  # y -= 1; enter if y was 0 (i.e. c reached 0)
+        code += "<+"  # x += 1
+        code += "<++"  # set c to 2
+        code += ">>"  # point to y
+        code += "+"  # zero y
+        code += "]"  # end if
+
+        code += "<<<<"  # point to a
+        code += "]"  # end while
+
+        code += ">>>"  # point to x
+        code += "[<<<+>>>-]"  # move x to a
+        code += "<[-]"  # zero c
+        code += "<-"  # b -= 1
+        code += "]"  # end while
+
+        return code
+
+    elif op_token.type == Token.BITWISE_AND:
+        code = get_bitwise_code("[->[-<<+>>]<]>[-]")
+
+        return code
+
+    elif op_token.type == Token.BITWISE_OR:
+        code = get_bitwise_code("[>+<-]>[[-]<<+>>]")
+
+        return code
+
+    elif op_token.type == Token.BITWISE_XOR:
+        code = get_bitwise_code("[>-<-]>[[-]<<+>>]")
+
+        return code
+
+    raise NotImplementedError
+
+
+def get_op_boolean_operator_code(node, current_pointer):
+    # short-circuit evaluation of AND and OR: the code of the right operand is only executed when the left one does not decide the result
+    assert node.token.type in [Token.AND, Token.OR]
+
+    if node.token.type == Token.AND:
+        # result, operand
+        code = "[-]"  # zero result
+        code += ">"  # point to next cell
+        code += node.left.get_code(current_pointer + 1)  # evaluate first operand
+        code += "<"  # point to first operand
+        code += "["  # if it is non-zero
+
+        code += "[-]"  # zero first operand
+        code += node.right.get_code(current_pointer + 1)  # evaluate second operand (into the same cell)
+        code += "<"  # point to second operand
+        code += "["  # if it is non-zero
+        code += "<+>"  # result = 1
+        code += "[-]"  # zero second operand
+        code += "]"  # end if
+
+        code += "]"  # end if
+        # now we point to one after result (next available cell)
+        return code
+
+    elif node.token.type == Token.OR:
+        # result, check_second_operand/second_operand, first_operand
+        code = "[-]"  # zero result
+        code += ">"  # point to check_second_operand
+        code += "[-]+"  # check_second_operand = 1
+        code += ">"  # point to next cell
+        code += node.left.get_code(current_pointer + 2)  # evaluate first operand
+        code += "<"  # point to first operand
+
+        code += "["  # if it is non-zero
+        code += "<<+"  # result = 1
+        code += ">-"  # check_second_operand = 0
+        code += ">[-]"  # zero first operand
+        code += "]"  # end if
+
+        code += "<"  # point to check_second_operand
+        code += "["  # if check_second_operand
+        code += node.right.get_code(current_pointer + 1)  # evaluate second operand (overwrites check_second_operand)
+        code += "<"  # point to second operand
+        code += "["  # if it is non-zero
+        code += "<+>"  # result = 1
+        code += "[-]"  # zero second operand
+        code += "]"  # end if
+        code += "]"  # end if
+
+        # now we point to one after result (next available cell)
+        return code
+
+    raise NotImplementedError
+
+
+
+def get_print_string_code(string):
+    # returns code that prints the string, reusing one "character" cell for all of its characters
+    code = "[-]"  # zero the current cell
+    code += ">[-]"  # zero the next cell (will be used for loop counts)
+    code += "<"  # point to the original cell ("character" cell)
+
+    previous = 0  # value the character cell holds before the next character is set
+    for char in string:
+        target = ord(char)
+        # the previously printed value is passed along with the new one (presumably so only the change is emitted)
+        code += get_set_cell_value_code(target, previous, zero_next_cell_if_necessary=False)
+        code += "."
+        previous = target
+
+    return code
+
+
+def get_move_right_index_cells_code(current_pointer, node_index):
+    # used for arrays
+    # returns a code that evaluates the index, then moves the pointer right, <index> amount of cells
+    # at the end of execution, the layout is:
+    # 0 index next_available_cell (point to next available cell)
+
+    # index, steps_taken_counter
+    code = node_index.get_code(current_pointer)  # index (pointer ends one cell after it)
+    code += "[-]"  # counter = 0
+    code += "<"  # point to index
+
+    code += "["  # while index != 0
+    code += ">>"  # point to new_counter (one after current counter)
+    code += "[-]+"  # zero new_counter then add 1 to the new_counter
+    code += "<"  # move to old counter
+    code += "[>+<-]"  # add old counter to new counter
+    code += "<"  # point to old index
+    code += "-"  # sub 1 from old index
+    code += "[>+<-]"  # move old index to new index (the cell of the old counter, which is now 0)
+    code += ">"  # point to new index
+    code += "]"  # end while
+
+    # index=0 counter res (pointing to the index cell, which reached 0)
+    code += ">>"  # point to res
+
+    return code
+
+
+def get_move_left_index_cell_code():
+    # used for arrays
+    # complement of "get_move_right_index_cells_code"
+    # assumes the layout is:
+    # value, index (pointing to index)
+    # moves <index> cells left, and moving <value> along with it
+    # in the end, point to one cell after <value> (which becomes the next available cell)
+
+    # layout: res, index (pointing to index)
+    code = "["  # while new_index != 0
+    code += "<"  # point to res
+    code += "[<+>-]"  # move res to the left
+    code += ">"  # point to new_index
+    code += "-"  # sub 1 from index
+    code += "[<+>-]"  # move new_index to left (into the cell res just left)
+    code += "<"  # point to new index
+    code += "]"  # end while
+
+    # now res is at the desired cell, and we point to the next available cell
+
+    return code
+
+
+# =================
+#     General
+# =================
+
+def get_literal_token_value(token):
+    # literal tokens carry a value that is known at compilation time
+    assert is_token_literal(token)
+    token_type = token.type
+    if token_type == Token.NUM:
+        return get_NUM_token_value(token)
+    if token_type == Token.CHAR:
+        return ord(token.data)
+    if token_type in (Token.TRUE, Token.FALSE):
+        return 1 if token_type == Token.TRUE else 0
+    return None
+
+
+def get_NUM_token_value(token):
+    # parse the text of a NUM token: hexadecimal (0x), octal (0o), binary (0b), otherwise decimal
+    text = token.data
+    bases_by_prefix = (("0x", 16), ("0o", 8), ("0b", 2))
+    for prefix, base in bases_by_prefix:
+        if text.startswith(prefix):
+            return int(text, base)
+
+    return int(text)
+
+
+def get_variable_from_ID_token(ids_map_list, ID_token):
+    # goes through the ids map list in order and returns the variable of the first map that contains the ID
+    ID = ID_token.data
+    for ids_map_holder in ids_map_list:
+        ids_map = ids_map_holder.IDs_dict
+        if ID in ids_map:
+            return ids_map[ID]
+    raise BFSemanticError("'%s' does not exist" % str(ID_token))
+
+
+def dimensions_to_size(dimensions):
+    return reduce(lambda size, dimension: size * dimension, dimensions)  # product of all dimension sizes
+
+
+def get_variable_dimensions_from_token(ids_map_list, ID_token):
+    # dimensions of the variable that ID_token refers to
+    return get_variable_from_ID_token(ids_map_list, ID_token).dimensions
+
+
+def get_id_index(ids_map_list, ID_token):
+    # cell index of the variable that ID_token refers to
+    return get_variable_from_ID_token(ids_map_list, ID_token).cell_index
+
+
+def get_offset_to_variable(ids_map_list, ID_token, current_pointer):
+    # how many cells current_pointer is to the right of the variable's cell
+    return current_pointer - get_id_index(ids_map_list, ID_token)
+
+
+def is_token_literal(token):
+    # literals are the tokens whose value is known at compilation time
+    return token.type in (Token.NUM, Token.CHAR, Token.TRUE, Token.FALSE)
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Globals.py b/reasoning_gym/code/contrib/bfit/Compiler/Globals.py
new file mode 100644
index 00000000..5c37c59e
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Globals.py
@@ -0,0 +1,82 @@
+from collections import namedtuple
+from .Token import Token
+from .General import dimensions_to_size, get_NUM_token_value
+
+"""
+This file holds the program's functions and global variables
+(as global variables, hehe)
+And related functions
+"""
+
+global_variables = []  # Module-level list holding the program's global variables
+
+
+# variables
+def get_global_variables():
+    return global_variables  # the module-level list itself, not a copy
+
+
+def insert_global_variable(variable):
+    get_global_variables().append(variable)  # appended to the end of the global list
+
+
+def get_global_variables_size():
+    return sum(map(get_variable_size, get_global_variables()))  # combined size of all global variables
+
+
+def create_variable(name, type, dimensions):
+    # NOTE: returns a freshly generated namedtuple *class* (not an instance) with the values stored as class attributes
+    variable = namedtuple("variable", ["name", "type", "size", "cell_index"])
+
+    variable.name = name
+    variable.type = type
+    variable.dimensions = dimensions  # list of array dimensions sizes (for non-arrays it will be [1])
+    variable.cell_index = None  # will be updated when we insert this variable into an ids map
+
+    return variable
+
+
+def get_variable_size(variable):
+    # return total variable size: the product of the sizes of all its dimensions
+    return dimensions_to_size(variable.dimensions)
+
+
+def get_variable_dimensions(variable):
+    return variable.dimensions  # list of dimension sizes, as stored by create_variable
+
+
+def is_variable_array(variable):
+    return variable.dimensions != [1]  # non-array variables are created with dimensions [1]
+
+
+def create_variable_from_definition(parser, index=None, advance_tokens=False):
+    """
+    processes the variable definition (INT ID, optionally followed by [NUM] groups) at index, and returns the variable
+    if index is None, then assumes we start at the current_token_index
+    if advance_tokens is True, then modifies current_token_index accordingly using parser.advance_token()
+    """
+    
+    if index is None:
+        index = parser.current_token_index
+
+    assert parser.tokens[index].type == Token.INT  # a definition always starts with the type token
+
+    parser.check_next_token_is(Token.ID, starting_index=index)
+    ID = parser.tokens[index + 1].data
+
+    if advance_tokens:
+        parser.advance_token(amount=2)  # skip INT ID
+
+    if parser.tokens[index + 2].type == Token.LBRACK:  # array (support multi-dimensional arrays)
+        dimensions = []  # element[i] holds the size of dimension[i]
+        while parser.tokens[index + 2].type == Token.LBRACK:
+            parser.check_next_tokens_are([Token.LBRACK, Token.NUM, Token.RBRACK], starting_index=index + 1)
+            dimensions.append(get_NUM_token_value(parser.tokens[index + 3]))
+
+            if advance_tokens:
+                parser.advance_token(amount=3)  # skip LBRACK NUM RBRACK
+            index += 3  # the local index always advances, so index + 2 is the next possible LBRACK
+    else:
+        dimensions = [1]  # not an array
+
+    return create_variable(ID, Token.INT, dimensions)
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py b/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
new file mode 100644
index 00000000..1c3e5e0a
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
@@ -0,0 +1,183 @@
+import re
+from .Token import Token
+from .Optimizer import optimize
+
+
+class LexicalErrorException(Exception):
+    """Raised by analyze() when the text contains a character that no lexical rule identifies."""
+
+
+def analyze(text):
+    """
+    Split text into a list of Token objects; comments and whitespace are dropped. At each position the longest
+    matching rule wins (first rule on a tie). Raises LexicalErrorException on an unidentified character.
+    """
+
+    rules = [
+        (r'\s+', Token.WHITESPACE),
+        ('void',    Token.VOID),
+        ('int',     Token.INT),
+        ('bool', Token.INT),  # treat bool as int
+        ('char', Token.INT),  # treat char as int
+
+        ('true', Token.TRUE),
+        ('false', Token.FALSE),
+        ('&&', Token.AND),
+        (r'\|\|', Token.OR),
+        (r'\!', Token.NOT),
+        ('return', Token.RETURN),
+        ('if', Token.IF),
+        ('else', Token.ELSE),
+        ('while', Token.WHILE),
+        ('for', Token.FOR),
+        ('do', Token.DO),
+        ('print', Token.PRINT),
+        ('switch', Token.SWITCH),
+        ('case', Token.CASE),
+        ('default', Token.DEFAULT),
+        ('break', Token.BREAK),
+        ('continue', Token.CONTINUE),  # todo
+        (':', Token.COLON),
+        (';', Token.SEMICOLON),
+        (',', Token.COMMA),
+
+        (r'\(', Token.LPAREN),
+        (r'\)', Token.RPAREN),
+        (r'\{', Token.LBRACE),
+        (r'\}', Token.RBRACE),
+        (r'\[', Token.LBRACK),
+        (r'\]', Token.RBRACK),
+        (r'=|\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=', Token.ASSIGN),
+        (r'\?', Token.TERNARY),
+
+        ('<=|>=|==|!=|<|>', Token.RELOP),
+        (r'\+\+', Token.INCREMENT),
+        ('--', Token.DECREMENT),
+        (r'\+|-|\*|/|%', Token.BINOP),
+        (r'\*\*|//|%%', Token.UNARY_MULTIPLICATIVE),
+
+        ('<<|>>', Token.BITWISE_SHIFT),
+        ('~', Token.BITWISE_NOT),
+        ('&', Token.BITWISE_AND),
+        (r'\|', Token.BITWISE_OR),
+        (r'\^', Token.BITWISE_XOR),
+
+        ('([a-zA-Z_][a-zA-Z0-9_]*)',    Token.ID),
+        (r'(\d+)',     Token.NUM),
+        (r'(0x[A-Fa-f\d]+)',     Token.NUM),  # hexadecimal number
+        ('(0o[0-7]+)',     Token.NUM),  # octal number
+        ('(0b[01]+)',     Token.NUM),  # binary number
+        (r'\"(\\\"|[^"])*"',   Token.STRING),
+        (r'\'(\\\'|(\\)?[^\'])\'', Token.CHAR),
+        (r'//.*(\n|$)', Token.COMMENT),
+        (r'/\*[\s\S]*?\*/', Token.COMMENT),  # multiline comments
+        ('.',       Token.UNIDENTIFIED)
+    ]
+
+    rules = [(re.compile(r), t) for r, t in rules]  # patterns are raw strings, so backslashes reach re unchanged
+
+    tokens = []
+
+    # create a mapping of [line number] to [offset of that line from the beginning of the text]
+    newline = re.compile('\n')
+    lines = [0] + [m.end() for m in re.finditer(newline, text)]
+
+    i = 0
+    while i < len(text):
+        current_matches = []
+        for regex, token_type in rules:
+            m = regex.match(text, i)
+            if m:
+                current_matches.append((m, token_type))
+
+        # pick the token that fits the longest match
+        # if tie - pick the one defined first in the rules list
+        longest_match, max_i, matched_token = None, i, None
+        for match, token_type in current_matches:
+            if match.end() > max_i:
+                longest_match, max_i, matched_token = match, match.end(), token_type
+
+        # calculate line and column
+        line, column = None, None
+        for line_idx in range(len(lines)-1):
+            if lines[line_idx] <= longest_match.start() < lines[line_idx+1]:
+                line, column = line_idx+1, (longest_match.start() - lines[line_idx])+1  # humans count from 1 :)
+                break
+        if not line:  # no range matched, so the token is on the last line
+            line, column = len(lines), (longest_match.start() - lines[-1])+1
+
+        if matched_token in [Token.COMMENT, Token.WHITESPACE]:
+            pass  # do nothing
+        elif matched_token == Token.UNIDENTIFIED:
+            raise LexicalErrorException("Unidentified Character '%s' (line %s column %s)" % (text[i], line, column))
+        elif matched_token in [Token.STRING, Token.CHAR]:
+            # remove quotes at beginning and end, un-escape characters
+            tokens.append(Token(matched_token, line, column, longest_match.group()[1:-1].encode("utf8").decode("unicode_escape")))
+        elif matched_token in [Token.NUM, Token.ID, Token.BINOP, Token.RELOP, Token.ASSIGN, Token.UNARY_MULTIPLICATIVE, Token.BITWISE_SHIFT]:
+            tokens.append(Token(matched_token, line, column, longest_match.group()))  # these token types keep their text as data
+        else:
+            tokens.append(Token(matched_token, line, column))
+        i = longest_match.end()
+
+    return tokens
+
+
+def tests():
+    """Run the self-checks of the lexer and the optimizer.
+
+    Relies on analyze(), optimize() and Token being in scope. Nothing is returned;
+    the first expectation that does not hold raises AssertionError.
+    """
+    def kinds(token_list):
+        # only the token types, in order: this is what the expectations below are compared against
+        return [token.type for token in token_list]
+
+    def test1():
+        # test token priorities: INT should not be confused with ID even if ID contains "int"
+        found = kinds(analyze("my international int ; int; pints; international;"))
+
+        assert found == [Token.ID, Token.ID, Token.INT, Token.SEMICOLON, Token.INT, Token.SEMICOLON, Token.ID,
+                         Token.SEMICOLON, Token.ID, Token.SEMICOLON]
+
+    def test2():
+        # keywords and operators written right next to each other and to identifiers
+        # ("falsek" is expected to be an ID, "+-" to be two BINOPs)
+        found = kinds(analyze("true !||!false falsek  k||y+-a&&x"))
+
+        assert found == [Token.TRUE, Token.NOT, Token.OR, Token.NOT, Token.FALSE, Token.ID, Token.ID, Token.OR, Token.ID,
+                         Token.BINOP, Token.BINOP, Token.ID, Token.AND, Token.ID]
+
+    def test3():
+        num, op = Token.NUM, Token.BINOP
+
+        # (text, token types straight out of the lexer, data of the single NUM left by the optimizer)
+        folded_completely = [
+            ("1+2", [num, op, num], "3"),
+            ("1+2+3", [num, op, num, op, num], "6"),
+            # make sure it is not optimized to 9 (3*3)
+            ("1+2*3", [num, op, num, op, num], "7"),
+        ]
+        for text, expected, value in folded_completely:
+            tokens = analyze(text)
+            assert kinds(tokens) == expected
+            optimize(tokens)
+            assert len(tokens) == 1 and tokens[0].type == Token.NUM and tokens[0].data == value
+
+        # test all arithmetic operations
+        tokens = analyze("(1+2*3/6)+(1%3)*(6-1)")
+        lparen, rparen = Token.LPAREN, Token.RPAREN
+        assert kinds(tokens) == [lparen, num, op, num, op, num, op, num,
+                                 rparen, op, lparen, num, op, num, rparen,
+                                 op, lparen, num, op, num, rparen]
+        # the expectation: every parenthesized group ends up as one number
+        optimize(tokens)
+        assert tokens[1].data == "2" and tokens[5].data == "1" and tokens[9].data == "5"
+
+    # todo find a better way to test?
+    test1()
+    test2()
+    test3()
+
+
+if __name__ == '__main__':
+    tests()  # executing this module directly runs the self-checks defined in tests()
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py b/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py
new file mode 100644
index 00000000..5b6567d6
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py
@@ -0,0 +1,127 @@
+from .Functions import insert_function_object
+from .Token import Token
+
+
+class LibraryFunctionCompiler:
+    """A library function: its name, return type, parameter types and ready-made code."""
+
+    def __init__(self, name, type, parameters, code):
+        # the four values are stored as they are, under attributes of the same names
+        self.name, self.type, self.parameters, self.code = name, type, parameters, code
+
+    def get_code(self, current_stack_pointer):
+        return self.code  # the stored code is returned as is; current_stack_pointer is not used
+
+
+def get_readint_code():
+    """
+    Return code that reads a number, digit by digit until a newline, into res.
+
+    cells layout: res, tmp, input, loop (tmp is used for multiplication)
+
+    res = 0
+    loop = 1
+
+    while loop
+        loop = 0
+        input = input()
+        if input != newline # todo add a eof check as well. run it in several interpreters to look for common ways for "end of number" input
+            loop = 1
+            res = res * 10 + char_to_digit(input)
+    """
+    rest_of_digit_offset = 0x30 - 10  # a character is its digit + 0x30, and 10 is already subtracted by the newline check
+
+    fragments = [
+        "[-]",  # clear res = 0
+        ">[-]",  # tmp = 0
+        ">>[-]+",  # loop = 1
+
+        "[",  # while loop == 1
+        "[-]",  # loop = 0
+        "<,",  # point to input, and input character
+        "----------",  # sub 10 (check for newline)
+
+        "[",  # if input is not newline
+        ">+",  # point to loop, and loop = 1
+
+        # multiply res by 10 and add the input digit
+        "<<<",  # point to res
+        "[>+<-]",  # move res to tmp
+        ">",  # point to tmp
+        "[<++++++++++>-]",  # res = tmp * 10, tmp = 0
+        ">",  # point to input
+        "-" * rest_of_digit_offset,  # convert character to a digit
+        "[<<+>>-]",  # res += input
+        "]",  # end if
+
+        ">]",  # point to loop, and end while
+        "<<<",  # point to res
+    ]
+    return "".join(fragments)
+
+
+def get_printint_code():
+    """
+    Return code that prints the value of value_to_print_cell as a number of up to 3 digits.
+
+    cells layout: return_cell value_to_print_cell
+    the code starts by moving from return_cell to value_to_print_cell, and ends by pointing to return_cell
+    """
+    # one division by 10 - it is needed twice, once per digit that is split off
+    divide_by_10 = "".join([
+        ">++++++++++<",  # div amount
+        "[->-[>+>>]>[+[<+>-]>+>>]<<<<<]",  # divide
+        ">[-]",  # zero d-n%d
+        ">>",  # point to div result
+    ])
+
+    fragments = [
+        ">",  # point to value_to_print cell
+        ">[-]" * 8 + "<" * 8,  # zero some cells
+        divide_by_10,  # value_to_print/10
+        divide_by_10,  # res/10
+
+        "[",  # if the first digit is not 0
+        ">++++++[<++++++++>-]<.",  # add 48 to the first digit and print it
+        "<<+>",  # set is_over_100 to true
+        "+>",  # add 1 to the second digit so it prints even when it's 0
+        "[-]",  # zero the first digit
+        "]",  # end if
+
+        "<[",  # point to the second digit, and: if the second digit is not 0
+        "<[>-<-]",  # if is_over_100 is true then subtract 1 from the second digit
+        "++++++[>++++++++<-]>.",  # add 48 to the second digit and print it
+        "[-]",  # zero the second digit
+        "]",  # end if
+
+        "<<",  # point to the cell after the third digit
+        "++++++[<++++++++>-]<.[-]",  # add 48 to the third digit and print it, then zero the third digit
+        "<<<",  # point to value_to_print_cell which is 0, and then to return_cell
+    ]
+    return "".join(fragments)
+
+
+def get_readchar_code():
+    """Return code that reads one input character into the "return value cell"."""
+    # the input goes into the cell we already point to, so no need to move the pointer
+    return ","
+
+
+def get_printchar_code():
+    """Return code that outputs the parameter, which is one cell after the "return value cell"."""
+    # point to parameter, output it, and then point back to "return value cell"
+    return ">.<"
+
+
+def insert_library_functions():
+    """Insert the function objects of readint, printint, readchar and printchar, in that order."""
+    # (name, type, parameters, function that generates the code)
+    library = (
+        ("readint", Token.INT, list(), get_readint_code),
+        ("printint", Token.VOID, [Token.INT], get_printint_code),
+        ("readchar", Token.INT, list(), get_readchar_code),
+        ("printchar", Token.VOID, [Token.INT], get_printchar_code),
+    )
+    for name, type_token, parameters, generate_code in library:
+        # the code is generated here, once, and kept inside the function object
+        insert_function_object(LibraryFunctionCompiler(name, type_token, parameters, generate_code()))
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Minify.py b/reasoning_gym/code/contrib/bfit/Compiler/Minify.py
new file mode 100644
index 00000000..ea0411e3
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Minify.py
@@ -0,0 +1,14 @@
+def minify(code):
+    """Return `code` after removing sequences that have no effect.
+
+    Removed: "><", "<>", "+-", "-+", and a "[-]" that comes right after a "]".
+    The replacements are repeated until a whole round leaves the code unchanged.
+    """
+    replacements = (("><", ""), ("<>", ""), ("+-", ""), ("-+", ""), ("][-]", "]"))
+
+    while True:
+        before = code
+        for sequence, shorter in replacements:
+            code = code.replace(sequence, shorter)
+        if code == before:
+            return code
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Node.py b/reasoning_gym/code/contrib/bfit/Compiler/Node.py
new file mode 100644
index 00000000..fadeeff8
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Node.py
@@ -0,0 +1,398 @@
+from .Exceptions import BFSemanticError
+from .General import get_copy_from_variable_code, get_copy_to_variable_code
+from .General import get_move_left_index_cell_code, get_move_right_index_cells_code
+from .General import get_offset_to_variable, get_variable_dimensions_from_token
+from .General import get_op_between_literals_code, get_literal_token_code, get_token_ID_code
+from .General import get_unary_prefix_op_code, get_unary_postfix_op_code, is_token_literal
+from .General import unpack_literal_tokens_to_array_dimensions, get_op_boolean_operator_code
+from .Token import Token
+
+"""
+This file holds classes that are used to create the parse tree of expressions
+Each class implements a get_code() function that receives a "stack pointer" and returns code that evaluates the expression
+"""
+
+
+class Node:
+    """Base class of the parse tree nodes; the subclasses implement get_code()."""
+
+    def __init__(self, ids_map_list):
+        # holds a copy of ids_map_list as it was when we parsed the expression
+        self.ids_map_list = ids_map_list[:]
+
+    def assign_token_to_op_token(self, assign_token):
+        """Return a NodeToken for the operator of a compound assignment (e.g. "+" for "+=").
+
+        The returned node has no operands yet, and its token carries the line and column
+        of `assign_token`.
+        """
+        # compound assignment -> (type of the operator token, extra keyword arguments for Token)
+        operators = {
+            "+=": (Token.BINOP, {"data": "+"}), "-=": (Token.BINOP, {"data": "-"}),
+            "*=": (Token.BINOP, {"data": "*"}), "/=": (Token.BINOP, {"data": "/"}),
+            "%=": (Token.BINOP, {"data": "%"}),
+            "<<=": (Token.BITWISE_SHIFT, {"data": "<<"}), ">>=": (Token.BITWISE_SHIFT, {"data": ">>"}),
+            "&=": (Token.BITWISE_AND, {}), "|=": (Token.BITWISE_OR, {}), "^=": (Token.BITWISE_XOR, {}),
+        }
+        assert assign_token.data in operators
+        token_type, extra = operators[assign_token.data]
+        return NodeToken(self.ids_map_list, token=Token(token_type, assign_token.line, assign_token.column, **extra))
+
+    def get_code(self, *args, **kwargs):
+        pass  # nothing to generate here (returns None)
+
+
+class NodeToken(Node):
+    """Parse tree node that is built around one token, with optional left and right operands."""
+
+    def __init__(self, ids_map_list, left=None, token=None, right=None):
+        Node.__init__(self, ids_map_list)
+        self.left, self.token, self.right = left, token, right
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return the code that evaluates the parse tree.
+
+        The token decides what is generated: a literal or ID leaf, an operator between two operands,
+        a short-circuit AND/OR, or an assignment. Any other token type gives None.
+        """
+        token, ids = self.token, self.ids_map_list
+
+        if is_token_literal(token) or token.type == Token.ID:
+            # its a leaf, so there are no operands
+            assert self.left is None and self.right is None
+            if token.type != Token.ID:
+                return get_literal_token_code(token)
+            return get_token_ID_code(ids, token, current_pointer)
+
+        if token.type in [Token.BINOP, Token.RELOP, Token.BITWISE_SHIFT, Token.BITWISE_AND, Token.BITWISE_OR, Token.BITWISE_XOR]:
+            # the left operand, and after it the right operand (one cell further)
+            code = self.left.get_code(current_pointer)
+            code += self.right.get_code(current_pointer + 1)
+            code += "<<"  # point to the first operand
+
+            # the token of the right operand is passed along only when that operand is a NodeToken
+            right_token = self.right.token if isinstance(self.right, NodeToken) else None
+            return code + get_op_between_literals_code(token, right_token)
+
+        if token.type in [Token.AND, Token.OR]:  # short-circuit evaluation treated differently
+            return get_op_boolean_operator_code(self, current_pointer)
+
+        if token.type != Token.ASSIGN:
+            return None
+
+        assert self.left.token.type == Token.ID
+
+        if token.data != '=':
+            assert token.data in ["+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^="]
+            # id += expression is generated as id = id + expression
+
+            # the node for id + expression
+            op_node = self.assign_token_to_op_token(token)
+            op_node.left, op_node.right = self.left, self.right
+
+            # the node for id = id + expression
+            plain_assign = Token(Token.ASSIGN, token.line, token.column, data="=")
+            return NodeToken(ids, left=self.left, token=plain_assign, right=op_node).get_code(current_pointer)
+
+        # id = expression
+        code = self.right.get_code(current_pointer)
+
+        # copy from the evaluated expression to ID's cell
+        code += "<"  # point to evaluated expression cell
+        code += get_copy_to_variable_code(ids, self.left.token, current_pointer)
+        code += ">"  # point to next available cell
+
+        return code
+
+
+class NodeTernary(Node):
+    """Node for the ternary expression: node_condition ? node_true : node_false;"""
+
+    def __init__(self, ids_map_list, condition, node_true, node_false):
+        Node.__init__(self, ids_map_list)
+        self.condition, self.node_true, self.node_false = condition, node_true, node_false
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return code that evaluates the condition, and then node_true or node_false into result.
+
+        cells layout: result, bool_evaluate_node_false, condition
+        at the end we point to one cell after result - next available cell
+        """
+        fragments = [
+            ">[-]+",  # point to bool_evaluate_node_false, bool_evaluate_node_false=1
+            ">",  # point to condition
+            self.condition.get_code(current_pointer+2),  # evaluate condition
+            "<",  # point to condition
+
+            "[",  # if condition is non-zero
+            "<<",  # point to result
+            self.node_true.get_code(current_pointer),  # evaluate node_true
+            # now we point to bool_evaluate_node_false
+            "[-]",  # zero bool_evaluate_node_false
+            ">[-]",  # point to condition, zero condition
+            "]",  # end if
+
+            "<",  # point to bool_evaluate_node_false
+            "[",  # if bool_evaluate_node_false is 1
+            "<",  # point to result
+            self.node_false.get_code(current_pointer),  # evaluate node_false
+            "[-]",  # now we point to bool_evaluate_node_false: zero it
+            "]",  # end if
+        ]
+        return "".join(fragments)
+
+
+class NodeUnaryPrefix(Node):
+    """Node for an unary prefix operation (!x or ++x or ~x or -x)"""
+
+    def __init__(self, ids_map_list, operation, literal):
+        Node.__init__(self, ids_map_list)
+        self.token_operation, self.node_literal = operation, literal
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return code that applies the prefix operator to the operand.
+
+        NOT, BITWISE_NOT and BINOP are applied on the evaluated operand. The other allowed operators
+        are applied through the offset of an array element or of a variable, so the operand must be one.
+
+        :raises BFSemanticError: if such an operator is applied to something that is not a variable
+        """
+        operation, operand = self.token_operation, self.node_literal
+        assert operation.type in [Token.NOT, Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE, Token.BITWISE_NOT, Token.BINOP]
+
+        if operation.type in [Token.NOT, Token.BITWISE_NOT, Token.BINOP]:
+            # evaluate the operand, point to it ("<"), and apply the operator
+            return operand.get_code(current_pointer) + "<" + get_unary_prefix_op_code(operation)
+
+        if isinstance(operand, NodeArrayGetElement):
+            # the offset is from +2 because in "get_move_right_index_cells_code",
+            # we moved 2 extra cells to the right, for retrieving the value
+            code = get_move_right_index_cells_code(current_pointer, operand.node_expression)
+            offset_to_array = get_offset_to_variable(self.ids_map_list, operand.token_id, current_pointer + 2)
+            code += get_unary_prefix_op_code(operation, offset_to_array)
+
+            code += "<"  # point to res
+            code += "[<<+>>-]"  # move res to old "index cell"
+            code += "<"  # point to new index cell
+            code += get_move_left_index_cell_code()
+            return code
+
+        # the token to apply on must be an ID
+        if not isinstance(operand, NodeToken):
+            raise BFSemanticError("Prefix operator %s can only be applied to a variable" % str(operation))
+        if operand.token.type != Token.ID:
+            raise BFSemanticError("Prefix operator %s cannot be applied to %s, but only to a variable" % (str(operation), str(operand.token)))
+
+        offset_to_ID = get_offset_to_variable(self.ids_map_list, operand.token, current_pointer)
+        return get_unary_prefix_op_code(operation, offset_to_ID)
+
+
+class NodeUnaryPostfix(Node):
+    """Node for an unary postfix operation (x++)"""
+
+    def __init__(self, ids_map_list, operation, literal):
+        Node.__init__(self, ids_map_list)
+        self.token_operation, self.node_literal = operation, literal
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return code that applies the postfix operator to an array element or to a variable.
+
+        :raises BFSemanticError: if the operand is neither an array element nor an ID
+        """
+        operation, operand = self.token_operation, self.node_literal
+        assert operation.type in [Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]
+
+        if isinstance(operand, NodeArrayGetElement):
+            # the offset is from +2 because in "get_move_right_index_cells_code", we moved 2 extra cells to the right, for retrieving the value
+            code = get_move_right_index_cells_code(current_pointer, operand.node_expression)
+            offset_to_array = get_offset_to_variable(self.ids_map_list, operand.token_id, current_pointer + 2)
+            code += get_unary_postfix_op_code(operation, offset_to_array)
+
+            code += "<"  # point to res
+            code += "[<<+>>-]"  # move res to old "index cell"
+            code += "<"  # point to new index cell
+            code += get_move_left_index_cell_code()
+            return code
+
+        # the token to apply on must be an ID
+        if not isinstance(operand, NodeToken):
+            raise BFSemanticError("Postfix operator %s can only be applied to a variable" % str(operation))
+        if operand.token.type != Token.ID:
+            raise BFSemanticError("Postfix operator %s cannot be applied to %s, but only to a variable" % (str(operation), str(operand.token)))
+
+        offset_to_ID = get_offset_to_variable(self.ids_map_list, operand.token, current_pointer)
+        return get_unary_postfix_op_code(operation, offset_to_ID)
+
+
+class NodeFunctionCall(Node):
+    """Node for a function call: the function to call and the expressions of its parameters."""
+
+    def __init__(self, ids_map_list, function_to_call, parameters):
+        """
+            receives a FunctionCompiler object
+                that implements get_code() which gets a stack pointer and returns code
+            receives a list of parameters - Node objects
+                each one gets a stack pointer and returns code that evaluates the parameter
+        """
+        Node.__init__(self, ids_map_list)
+        self.function_to_call, self.parameters = function_to_call, parameters
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return code that evaluates the parameters, runs the function, and points to one cell after the return value."""
+        first_parameter = current_pointer + 1  # one after return_value_cell
+        # evaluate parameters from left to right, and put them on the "stack" in that order
+        # after each parameter code, the pointer points to the next available cell (one after the parameter)
+        parameters_code = "".join(parameter.get_code(first_parameter + i) for i, parameter in enumerate(self.parameters))
+
+        code = "[-]>" + parameters_code  # return_value_cell=0, and after it the parameters
+        code += "<" * len(self.parameters)  # point back to first parameter
+        code += "<"  # point to return_value_cell
+        code += self.function_to_call.get_code(current_stack_pointer=current_pointer)  # after this we point to return value cell
+        code += ">"  # point to next available cell (one after return value)
+        return code
+
+
+class NodeArrayElement(Node):
+    """
+    base class of the nodes that get/set an array element. the idea:
+    1. evaluate index. it is known only in run time, so we need to perform a little trick
+    2. move <index> steps to the right, while counting how many steps we moved so far
+        hold an index, and a steps_counter, and move them to the right while decreasing index and increasing steps_counter
+        e.g: 4,0 --> 3,1 --> 2,2 --> 1,3 --> 0,4
+        (move right until index is 0. counter will hold the old index)
+        this way we know we moved <index> steps, and know how many steps to go back when we are done
+    3. move <offset from stack pointer to array> steps left, to get/set the relevant array element
+        this offset is known at compilation time
+    """
+
+    def __init__(self, ids_map_list):
+        Node.__init__(self, ids_map_list)
+
+
+class NodeArrayGetElement(NodeArrayElement):
+    """
+    class for getting element of a one-dimensional array
+    it receives an expression, indicating the required index
+    and returns a code that gets that element
+    """
+
+    def __init__(self, ids_map_list, token_id, node_expression):
+        Node.__init__(self, ids_map_list)
+        self.token_id, self.node_expression = token_id, node_expression
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return code that moves by the evaluated index, copies the element, and moves back with it."""
+        fragments = [
+            get_move_right_index_cells_code(current_pointer, self.node_expression),
+            # it is +2 because in "get_move_right_index_cells_code", we moved 2 extra cells to the right, for retrieving the value
+            get_copy_from_variable_code(self.ids_map_list, self.token_id, current_pointer + 2),
+            "<",  # point to res
+            "[<<+>>-]",  # move res to old "index cell"
+            "<",  # point to new index cell
+            get_move_left_index_cell_code(),
+        ]
+        return "".join(fragments)
+
+
+class NodeArraySetElement(NodeArrayElement):
+    """
+    class for setting element of a one-dimensional array
+    it receives:
+    1. an expression, indicating the required index
+    2. assignment operator (=|+=|-=|*=|/=|%=|<<=|>>=|&=|(|=)|^=)
+    3. an expression, indicating the value to be used for the assignment
+    and returns a code that sets that element
+    """
+
+    def __init__(self, ids_map_list, token_id, node_expression_index, assign_token, node_expression_value):
+        Node.__init__(self, ids_map_list)
+        self.token_id = token_id
+        self.node_expression_index = node_expression_index
+
+        if assign_token.data == "=":
+            # id[exp] = expression
+            # a plain assignment: the token and the value node are kept as they were received
+            self.assign_token = assign_token
+            self.node_expression_value = node_expression_value
+
+        else:
+            # id[exp] += expression
+            assert assign_token.data in ["+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^="]
+            # it is kept as a plain "=" whose value is the node for id[exp] + expression
+            self.assign_token = Token(Token.ASSIGN, assign_token.line, assign_token.column, data="=")
+
+            # create a node for id[exp] + expression
+            op_node = self.assign_token_to_op_token(assign_token)
+            op_node.left = NodeArrayGetElement(self.ids_map_list, token_id, node_expression_index)
+            op_node.right = node_expression_value
+            # NOTE(review): node_expression_index is now used both here and in get_code - confirm an index with side effects is fine
+            self.node_expression_value = op_node
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        # returns code that evaluates the index and the value, and copies the value to the array element
+        # cells layout: index, steps_taken_counter, value
+        code = self.node_expression_index.get_code(current_pointer)
+        code += "[-]"  # counter = 0
+        code += ">"  # point to value cell
+        code += self.node_expression_value.get_code(current_pointer + 2)
+        code += "<<<"  # point to index
+
+        code += "["  # while index != 0
+        code += ">>>"  # point to new_value (one after current value)
+        code += "[-]"  # zero new_value
+        code += "<"  # move to old value
+        code += "[>+<-]"  # move old value to new value
+        code += "<"  # point to old counter
+        code += "+"  # increase old counter
+        code += "[>+<-]"  # move old counter to new counter
+        code += "<"  # point to old index
+        code += "-"  # decrease old index
+        code += "[>+<-]"  # move old index to new index
+        code += ">"  # point to new index
+        code += "]"  # end while
+
+        code += ">>"  # point to value
+        code += get_copy_to_variable_code(self.ids_map_list, self.token_id, current_pointer + 2)
+        # it is +2 because we moved 2 extra cells to the right, for pointing to value
+
+        # layout: 0, idx, value (pointing to value)
+        # create layout: value, idx
+        code += "[<<+>>-]"  # move value to old "index" cell (which is now 0)
+
+        # value, index (pointing to one after index)
+        code += "<"  # point to index
+        code += "["  # while index != 0
+        code += "<"  # point to value
+        code += "[<+>-]"  # move value to the left
+        code += ">"  # point to index
+        code += "-"  # sub 1 from index
+        code += "[<+>-]"  # move index to left
+        code += "<"  # point to index
+        code += "]"  # end while
+
+        # now value is at the desired cell, and we point to the next available cell
+
+        return code
+
+
+class NodeArrayAssignment(Node):
+    """
+        Used for array assignment
+        E.g arr = { 1, 2, 3... }
+    """
+    def __init__(self, ids_map_list, token_id, literal_tokens_list):
+        Node.__init__(self, ids_map_list)
+        self.token_id, self.literal_tokens_list = token_id, literal_tokens_list
+
+    def get_code(self, current_pointer, *args, **kwargs):
+        """Return code that writes the literals into the array, and then points to the next cell."""
+        dimensions = get_variable_dimensions_from_token(self.ids_map_list, self.token_id)
+        literals = unpack_literal_tokens_to_array_dimensions(self.token_id, dimensions, self.literal_tokens_list)
+        offset = get_offset_to_variable(self.ids_map_list, self.token_id, current_pointer)
+
+        go_to_array = "<" * offset  # point to first array element
+        # the code of each literal evaluates it and points to next array element
+        fill_array = "".join(get_literal_token_code(literal) for literal in literals)
+        go_back = ">" * (offset - len(literals))  # move back to the original position
+        # the last ">" is to point to the next cell
+        return go_to_array + fill_array + go_back + ">"
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py b/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py
new file mode 100644
index 00000000..c2bc5413
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py
@@ -0,0 +1,89 @@
+from .General import get_NUM_token_value
+from .Token import Token
+
+"""
+This file holds functions that optimize code on syntax-level. For example:
+The tokens corresponding to the code "3*5" will be replaced in-place by a token that represents "15"
+"""
+
+
+def optimize_once(tokens):
+    """Perform one pass on `tokens` (a list of Token objects) and optimize them in-place, based on a list of rules."""
+    multiplicative = ["*", "/", "%"]
+
+    def optimize_binop(tokens, start_index):
+        # optimize arithmetic operations. E.g replace 1+2 with 3
+        # returns True only if the 3 tokens at start_index were replaced by one NUM token
+        op = tokens[start_index+1].data
+        # need to be careful not to optimize (1+2*3) to (3*3)
+        if op not in multiplicative and start_index+3 < len(tokens) and tokens[start_index+3].data in multiplicative:
+            return False
+        # the left number may belong to an operation on its left that has to be evaluated first:
+        # a*2+3 is not a*5, a-2+3 is not a-5, a/2*3 is not a/6, ~2+3 is not ~5
+        if start_index > 0:
+            previous = tokens[start_index-1]
+            if previous.type in [Token.NOT, Token.BITWISE_NOT]:
+                return False
+            if previous.type == Token.BINOP and (previous.data in multiplicative or (previous.data == "-" and op in ["+", "-"])):
+                return False
+
+        num1, num2 = get_NUM_token_value(tokens[start_index]), get_NUM_token_value(tokens[start_index+2])
+        if op == "+":
+            val = num1 + num2
+        elif op == "-":
+            val = num1 - num2
+            if val < 0:  # cannot optimize negative values
+                return False
+        elif op == "*":
+            val = num1 * num2
+        elif op in ["/", "%"]:
+            if num2 == 0:
+                print("WARNING (optimizer) - division by zero at %s" % str(tokens[start_index]))
+                return False
+            val = num1 // num2 if op == "/" else num1 % num2
+        else:
+            raise NotImplementedError(op)
+
+        # replace the 3 old tokens with the new one (the old ones are kept as its original_tokens)
+        new_token = Token(Token.NUM, tokens[start_index].line, tokens[start_index].column, data=str(val),
+                          original_tokens=tokens[start_index:start_index+3])
+        tokens[start_index:start_index+3] = [new_token]
+        return True
+
+    def optimize_printint(tokens, start_index):
+        # replace printint(50) with print("50")
+        # since printing strings compiles into less Brainfuck code than printing ints
+        if tokens[start_index].data != "printint":
+            return False
+        name, number = tokens[start_index], tokens[start_index+2]
+        tokens[start_index] = Token(Token.PRINT, name.line, name.column, original_tokens=[name])
+        # print the value of the number and not its spelling: printint(0x10) has to become print("16")
+        tokens[start_index+2] = Token(Token.STRING, name.line, name.column,
+                                      data=str(get_NUM_token_value(number)), original_tokens=[number])
+        return True
+
+    rules = [([Token.NUM, Token.BINOP, Token.NUM], optimize_binop),  # arithmetic operations
+             ([Token.ID, Token.LPAREN, Token.NUM, Token.RPAREN], optimize_printint)]  # printint(50) to print("50")
+
+    # try to match one of the rules to the tokens in a "sliding window" style
+    i = 0
+    while i < len(tokens):
+        optimized = False
+        for tokens_sequence, optimization_function in rules:
+            window = tokens[i:i+len(tokens_sequence)]  # shorter than the rule near the end, and then it cannot match
+            if [token.type for token in window] == tokens_sequence and optimization_function(tokens, i):
+                optimized = True
+        if not optimized:  # after an optimization don't increment i, try to optimize the same location again
+            i += 1
+
+
+def optimize(tokens):
+    """Optimize `tokens` in-place, again and again, until there is nothing left to optimize.
+
+    A "." is printed after every pass. The last pass is the one that leaves the token types as they were.
+    """
+    types_before = None
+    while types_before != [token.type for token in tokens]:
+        types_before = [token.type for token in tokens]
+        optimize_once(tokens)
+        print(".", end='')
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Parser.py b/reasoning_gym/code/contrib/bfit/Compiler/Parser.py
new file mode 100644
index 00000000..a658e04a
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Parser.py
@@ -0,0 +1,129 @@
+from .Exceptions import BFSyntaxError, BFSemanticError
+from .Token import Token
+from .General import is_token_literal
+
+
+class Parser:
+    """
+    Cursor over a list of tokens: keeps a current index and offers lookahead, bracket matching and expectation checks
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.current_token_index = 0  # the cursor starts on the first token
+
+    # parsing tokens
+    def current_token(self):
+        if self.current_token_index >= len(self.tokens):
+            return None  # the cursor is past the last token
+        else:
+            return self.token_at_index(self.current_token_index)
+
+    def advance_token(self, amount=1):
+        self.current_token_index += amount  # no bounds check here; current_token() returns None once past the end
+
+    def advance_to_token_at_index(self, token_index):
+        self.current_token_index = token_index
+
+    def token_at_index(self, index):
+        assert index < len(self.tokens)  # only the upper bound is checked, and only while asserts are enabled
+        return self.tokens[index]
+
+    def next_token(self, next_amount=1):
+        return self.token_at_index(self.current_token_index + next_amount)  # lookahead; the cursor does not move
+
+    def find_matching(self, starting_index=None):
+        """
+        :return: the index of the closing token that matches the opening token at starting_index
+        :param starting_index (optional) - index of the opening token ({, [ or (); defaults to the current token
+        :raises BFSemanticError if that token is not an opening bracket, BFSyntaxError if it is never closed
+        for example, if current token is {
+        it returns the index of the matching }
+        """
+        if starting_index is None:
+            starting_index = self.current_token_index
+
+        tokens = self.tokens
+        token_to_match = tokens[starting_index]
+        if token_to_match.type == Token.LBRACE:
+            inc = Token.LBRACE
+            dec = Token.RBRACE
+        elif token_to_match.type == Token.LBRACK:
+            inc = Token.LBRACK
+            dec = Token.RBRACK
+        elif token_to_match.type == Token.LPAREN:
+            inc = Token.LPAREN
+            dec = Token.RPAREN
+        else:
+            raise BFSemanticError("No support for matching %s" % str(token_to_match))
+
+        i = starting_index
+        cnt = 0  # nesting depth: +1 for every opening token, -1 for every closing token
+        while i < len(tokens):
+            if tokens[i].type == inc:
+                cnt += 1
+            elif tokens[i].type == dec:
+                cnt -= 1
+
+            if cnt == 0:  # depth is back to zero, so tokens[i] closes the token we started on
+                return i
+
+            i += 1
+
+        raise BFSyntaxError("Did not find matching %s for %s" % (dec, str(token_to_match)))
+
+    def check_next_tokens_are(self, tokens_list, starting_index=None):
+        if starting_index is None:
+            starting_index = self.current_token_index
+
+        # Check that the tokens AFTER starting_index have the types in tokens_list; raise BFSyntaxError otherwise
+        if starting_index + len(tokens_list) >= len(self.tokens):  # not enough tokens left to compare against
+            raise BFSyntaxError("Expected %s after %s" % (str(tokens_list), str(self.tokens[starting_index])))
+        for i in range(0, len(tokens_list)):
+            if self.tokens[starting_index + 1 + i].type != tokens_list[i]:
+                raise BFSyntaxError("Expected %s after %s" % (str(tokens_list[i]), [str(t) for t in self.tokens[starting_index: starting_index+1+i]]))
+
+    def check_next_token_is(self, token, starting_index=None):
+        self.check_next_tokens_are([token], starting_index=starting_index)
+
+    def check_current_tokens_are(self, tokens_list):
+        self.check_next_tokens_are(tokens_list, starting_index=self.current_token_index - 1)  # start one token earlier so the current token is included
+
+    def check_current_token_is(self, token):
+        self.check_current_tokens_are([token])
+
+    def compile_array_initialization_list(self):
+        # accepts {1, 2, 3, ...} or {array_initialization_list, array_initialization_list, ...} or a string, starting at the current token
+        # parses the definition and returns a list (of list of list ....) of literal tokens (NUM, CHAR, TRUE, FALSE)
+
+        list_tokens = []
+
+        if self.current_token().type == Token.STRING:
+            string_token = self.current_token()
+            line, column = string_token.line, string_token.column
+            for char in string_token.data:  # a string becomes one NUM token per character
+                list_tokens.append(Token(Token.NUM, line, column, str(ord(char))))  # data is the code point, as text
+
+            self.advance_token()  # point to after STRING
+            return list_tokens
+
+        assert self.current_token().type == Token.LBRACE
+        self.advance_token()  # skip to after LBRACE
+
+        while is_token_literal(self.current_token()) or self.current_token().type == Token.LBRACE:
+            if self.current_token().type == Token.LBRACE:  # list of (literals | list)
+                list_tokens.append(self.compile_array_initialization_list())  # the recursive call consumes the nested { ... }
+            else:  # literal
+                list_tokens.append(self.current_token())
+                self.advance_token()  # skip literal
+
+            if self.current_token().type not in [Token.COMMA, Token.RBRACE]:
+                raise BFSyntaxError("Unexpected %s (expected comma (,) or RBRACE (}))" % self.current_token())
+
+            if self.current_token().type == Token.COMMA:
+                self.advance_token()  # skip comma
+            if self.current_token().type == Token.RBRACE:  # also reached right after a comma, so a trailing comma is accepted
+                break
+
+        self.check_current_token_is(Token.RBRACE)
+        self.advance_token()  # skip RBRACE
+        return list_tokens
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Token.py b/reasoning_gym/code/contrib/bfit/Compiler/Token.py
new file mode 100644
index 00000000..df43a6f5
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Token.py
@@ -0,0 +1,70 @@
+class Token:
+    """A token: a type tag (one of the string constants below), a line/column position and optional data."""
+    INT = "INT"
+    VOID = "VOID"
+    TRUE = "TRUE"
+    FALSE = "FALSE"
+    AND = "AND"
+    OR = "OR"
+    NOT = "NOT"
+    RETURN = "RETURN"
+    IF = "IF"
+    ELSE = "ELSE"
+    WHILE = "WHILE"
+    FOR = "FOR"
+    DO = "DO"
+    BREAK = "BREAK"
+    CONTINUE = "CONTINUE"
+    SWITCH = "SWITCH"
+    CASE = "CASE"
+    DEFAULT = "DEFAULT"
+    COLON = "COLON"
+    SEMICOLON = "SEMICOLON"
+    COMMA = "COMMA"
+
+    LPAREN = "LPAREN"
+    RPAREN = "RPAREN"
+    LBRACE = "LBRACE"
+    RBRACE = "RBRACE"
+    LBRACK = "LBRACK"
+    RBRACK = "RBRACK"
+
+    ASSIGN = "ASSIGN"
+    TERNARY = "TERNARY"
+    RELOP = "RELOP"
+    BINOP = "BINOP"
+    INCREMENT = "INCREMENT"
+    DECREMENT = "DECREMENT"
+    UNARY_MULTIPLICATIVE = "UNARY_MULTIPLICATIVE"
+
+    BITWISE_SHIFT = "BITWISE_SHIFT"
+    BITWISE_NOT = "BITWISE_NOT"
+    BITWISE_AND = "BITWISE_AND"
+    BITWISE_OR = "BITWISE_OR"
+    BITWISE_XOR = "BITWISE_XOR"
+
+    WHITESPACE = "WHITESPACE"
+    ID = "ID"
+    NUM = "NUM"
+    STRING = "STRING"
+    CHAR = "CHAR"
+
+    PRINT = "PRINT"
+    COMMENT = "COMMENT"
+    UNIDENTIFIED = "UNIDENTIFIED"
+
+    def __init__(self, type, line, column, data=None, original_tokens=None):
+        self.type = type  # one of the class-level string constants
+        self.line = line
+        self.column = column
+        self.data = data  # optional payload; __str__ concatenates it, so it is expected to be a str when set
+        self.original_tokens = original_tokens  # optional list of tokens, shown by __str__ as "original tokens"
+
+    def __str__(self):
+        result = self.type
+        if self.data:  # missing or empty data is left out of the description
+            result += " " + self.data  # NOTE(review): raises TypeError if data is a truthy non-str value
+        result += " (line %s column %s)" % (self.line, self.column)
+        if self.original_tokens:
+            result += " (original tokens: " + ", ".join([str(t) for t in self.original_tokens]) + ")"
+        return result
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/__init__.py b/reasoning_gym/code/contrib/bfit/Compiler/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/reasoning_gym/code/contrib/bfit/Interpreter.py b/reasoning_gym/code/contrib/bfit/Interpreter.py
new file mode 100644
index 00000000..02e0520f
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/Interpreter.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+import sys
+import argparse
+
+
+def create_jumps_dictionary(program):
+    lbraces = list()  # stack of indexes of '[' commands that are still open
+    res = dict()  # maps the index of every '[' to its matching ']' and the other way round
+
+    for index, command in enumerate(program):
+        if command == '[':
+            lbraces.append(index)
+        elif command == ']':
+            if len(lbraces) == 0:  # a ']' with no '[' open before it
+                raise SyntaxError("Brainfuck: mismatched parentheses (at index: %s)" % index)
+
+            lbrace_index = lbraces.pop()  # the innermost open '[' pairs with this ']'
+            res[lbrace_index] = index
+            res[index] = lbrace_index
+
+    if len(lbraces) > 0:  # at least one '[' was never closed
+        raise SyntaxError("Brainfuck: mismatched parentheses (at indexes: %s)" % str(lbraces))
+    return res
+
+
+def brainfuck(program, bits=8):
+    """Run the Brainfuck source `program` (str); cells wrap at 2 ** bits, ',' reads stdin, '.' writes stdout."""
+    jumps = create_jumps_dictionary(program)  # raises SyntaxError on unbalanced brackets
+    data = dict()  # sparse tape: cell index -> value; cells never written read as 0
+    data_pointer = 0
+
+    instruction_pointer = 0
+
+    while instruction_pointer < len(program):
+        command = program[instruction_pointer]
+
+        if command == '>':
+            data_pointer += 1
+        elif command == '<':
+            data_pointer -= 1
+        elif command == '+':
+            data[data_pointer] = (data.get(data_pointer, 0) + 1)
+            if data[data_pointer] == 2 ** bits:  # wrap around on overflow
+                data[data_pointer] = 0
+        elif command == '-':
+            data[data_pointer] = (data.get(data_pointer, 0) - 1)
+            if data[data_pointer] == -1:  # wrap around on underflow
+                data[data_pointer] = 2 ** bits - 1
+        elif command == ',':  # end of input stores 0 (read(1) returns '' there) and input is reduced to the cell width
+            data[data_pointer] = ord(sys.stdin.read(1) or '\0') % (2 ** bits)
+        elif command == '.':
+            print(chr(data.get(data_pointer, 0)), end='', flush=True)
+        elif command == '[':
+            if data.get(data_pointer, 0) == 0:  # skip the loop: jump to the matching ']'
+                instruction_pointer = jumps[instruction_pointer]
+        elif command == ']':
+            if data.get(data_pointer, 0) != 0:  # repeat the loop: jump back to the matching '['
+                instruction_pointer = jumps[instruction_pointer]
+        else:  # everything else is comment
+            pass
+
+        instruction_pointer += 1  # after a jump this steps past the matching bracket
+
+    if data_pointer != 0:
+        print("WARNING (interpreter) - at the end of the execution the data pointer is %s instead of 0 (possibly a compiler issue)" % str(data_pointer))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("filepath")  # path of the Brainfuck source file to execute
+    parser.add_argument("--bits", "-b", "--interpreter-bits", type=int, default=8, help="Amount of bits each cell uses")
+
+    args = parser.parse_args()
+    with open(args.filepath, 'r') as f:
+        code = f.read()
+    # run the whole file content; characters that are not Brainfuck commands are ignored by the interpreter
+    brainfuck(code, args.bits)
diff --git a/reasoning_gym/code/contrib/bfit/LICENSE.md b/reasoning_gym/code/contrib/bfit/LICENSE.md
new file mode 100644
index 00000000..e1249203
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 elikaski
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/reasoning_gym/code/contrib/bfit/README.md b/reasoning_gym/code/contrib/bfit/README.md
new file mode 100644
index 00000000..4ad60e98
--- /dev/null
+++ b/reasoning_gym/code/contrib/bfit/README.md
@@ -0,0 +1,101 @@
+<img width="200" height="200" align="left" style="float: left; margin: 0 10px 0 0;" alt="BF-it logo" src="images/logo.png?sanitize=true">
+
+# **BF-it**
+A compiler from a C-like language to Brainfuck
+
+Always heard that Brainfuck is Turing complete but still had doubts about its computational power?
+Those days are over
+
+
+## What is this?
+BF-it is a Compiler of a C-like language to Brainfuck, written in Python
+
+You give it C-like code, and it gives you Brainfuck code with the same functionality :)
+
+
+## Why did you create it?
+There are several reasons why I chose to create this
+
+1. I always heard that Brainfuck is Turing complete, and
+   therefore as strong as any other language
+    * I wanted to put that to the test, and find a concrete
+      proof that it is true
+2. I find it beautiful how a seemingly bunch of nonsense
+   characters can have a logical meaning / functionality
+    * I wanted to take it to the extreme
+3. I wanted to practice my developing skills
+4. It seemed to be like a real challenge (and it was!)
+5. But mostly, and most importantly, I did it for teh lulz
+
+
+## What can I use it for?
+Create a Brainfuck code of your choice, send it to your friends, and tell them to run it (on some online interpreter, for example)
+
+When they receive a bunch of nonsense which does something meaningful, they will (maybe) be amazed
+
+
+## How do I use it?
+
+1. Write a C-like code (example programs included) into a file
+2. Open your favorite command line interface
+3. Run `BF-it.py <path_to_code_file>`
+    * This will compile your file and create a .bf file with
+      Brainfuck code
+    * (optional parameters: `-o` for output file, and `-r`
+      for running the compiled file)
+4. Run `Interpreter.py <path_to_bf_file>`, this will execute
+   the Brainfuck code
+
+Example:
+```
+$ cat helloworld.code
+int main()
+{
+    print("Hello World!\n");
+}
+
+$ ./BF-it.py helloworld.code
+Compiling file 'helloworld.code'...
+Compiled successfully to 'helloworld.bf'
+$ cat helloworld.bf 
+>[-]>[-]<>++++++++[-<+++++++++>]<.>++++[-<+++++++>]
+<+.+++++++..+++.>++++++[-<------------->]<-.>+++++[
+-<+++++++++++>]<.>++++[-<++++++>]<.+++.------.-----
+---.>++++++[-<----------->]<-.>+++[-<------->]<--.<
+
+$ ./Interpreter.py helloworld.bf
+Hello World!
+```
+
+For a more interesting example, check out Tic-Tac-Toe!
+
+<img alt="Tic-Tac-Toe" src="images/tic tac toe.gif?sanitize=true"><br/>
+
+1. Take a look at tic_tac_toe.code in the 'examples/games' directory
+2. Run ```./BF-it.py examples/games/tic_tac_toe.code -r```
+3. Play around
+4. If you're brave, take a look at the generated Brainfuck code
+5. If you're very brave, try to understand it (I can't)
+
+## How do I write code?
+Please refer to the 'examples' directory.
+It contains examples and explanations for syntax, operations, flow, and more
+
+
+## I would like to add a feature / fix a bug
+If you would like to add a feature yourself, perform a pull request and add your changes. I will review them
+
+If you found a bug, or have an idea for a feature, open an issue
+
+
+
+## References I used:
+* https://esolangs.org/wiki/brainfuck_algorithms for basic Brainfuck algorithms
+* https://www.iamcal.com/misc/bf_debug/ for debugging Brainfuck code
+* https://fatiherikli.github.io/brainfuck-visualizer/ for debugging Brainfuck code
+* https://godbolt.org/ for comparing my C-like compiler to other C compilers' implementations
+* https://www.lysator.liu.se/c/ANSI-C-grammar-y.html for creating a grammar
+* https://introcs.cs.princeton.edu/java/11precedence/ for operator precedence
+* https://logomakr.com/ for creating a logo
+* https://www.youtube.com/ for setting the mood
+
diff --git a/reasoning_gym/code/contrib/bfit/__init__.py b/reasoning_gym/code/contrib/bfit/__init__.py
new file mode 100644
index 00000000..e69de29b

From 2393ae05258d5c5c7894fa2ff4ba239a05bf0c95 Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 16:24:28 +0100
Subject: [PATCH 03/94] difficulty levels

---
 pyproject.toml           |  7 ++++-
 reasoning_gym/code/bf.py | 60 ++++++++++++++++++++++++++++++++--------
 tests/test_bf.py         | 28 +++++++++----------
 3 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b86e3c5d..0c2ec1ab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,12 @@ authors = [
 description = "A library of procedural dataset generators for training reasoning models"
 readme = "README.md"
 requires-python = ">=3.11"
-dependencies = ["sympy>=1.13.1", "magiccube==0.3.0", "pyfiglet==1.0.2"]
+dependencies = [
+  "bfi==1.0.4",
+  "sympy>=1.13.1", 
+  "magiccube==0.3.0", 
+  "pyfiglet==1.0.2"
+]
 classifiers = [
   "Programming Language :: Python :: 3",
   "License :: OSI Approved :: Apache Software License",
diff --git a/reasoning_gym/code/bf.py b/reasoning_gym/code/bf.py
index 9f2f3381..fb895421 100644
--- a/reasoning_gym/code/bf.py
+++ b/reasoning_gym/code/bf.py
@@ -15,6 +15,12 @@ class BFConfig:
 
     seed: Optional[int] = None
     size: int = 500
+    difficulty: int = 1
+
+    def validate(self) -> None:
+        """Validate configuration parameters"""
+        assert self.difficulty > 0, "difficulty must be greater than 0"
+        assert self.difficulty < 4, "difficulty must be less than 4"
 
 
 class BFDataset(ProceduralDataset):
@@ -37,9 +43,8 @@ class BFDataset(ProceduralDataset):
         """
         rng = Random(self.seed + idx)
 
-        bfit_code = self.generate_bfit_code(rng)
+        bfit_code = self.generate_bfit_code(self.config.difficulty, rng)
         bf_program = self.compile_bfit_code_to_bf(bfit_code)
-
         result = bfi.interpret(bf_program, buffer_output=True)
 
         return {
@@ -48,20 +53,51 @@ class BFDataset(ProceduralDataset):
             "metadata": {"bfit_code": bfit_code, "bf_program": bf_program},
         }
 
-    def generate_bfit_code(self, rng: Random) -> str:
+    def generate_bfit_code(self, difficulty, rng: Random) -> str:
 
-        bfit_template = """
-int main() {
+        if difficulty == 1:
+            word = rng.choice(wordle_words)
+            bfit_template = f"""
+int main() {{
+    print("{word}");
+}}
+"""
+        elif difficulty == 2:
+            x = rng.randint(1, 4)
+            y = rng.randint(1, 5)
+            target = x * y * rng.randint(1, 9) + rng.randint(1, 9)
+            bfit_template = f"""
+int main() {{
     int acc = 0;
-    int target = 15;
-    int x = 2;
-    int y = 3;
-    while (acc < target) {
+    int target = {target};
+    int x = {x};
+    int y = {y};
+    while (acc < target) {{
         acc = acc + x;
         acc = acc + y;
-    }
+    }}
     printint(acc);
-}
+}}
+"""
+        elif difficulty == 3:
+            x = rng.randint(1, 7)
+            y = rng.randint(1, 9)
+            target = x * y * rng.randint(1, 9) + rng.randint(1, 9) + 50
+            conditional = target - rng.randint(1, 40)
+            bfit_template = f"""
+int main() {{
+    int acc = 0;
+    int target = {target};
+    int x = {x};
+    int y = {y};
+    while (acc < target) {{
+        acc = acc + x;
+        if (acc > {conditional}) {{
+            acc = acc + y;
+        }}
+    }}
+    printint(acc);
+}}
 """
         rendered_bfit = bfit_template
         return rendered_bfit
@@ -93,4 +129,4 @@ int main() {
             return 1.0 # Yay
 
 # Register the dataset
-register_dataset("figlet_font", BFDataset, BFConfig)
+register_dataset("bf", BFDataset, BFConfig)
diff --git a/tests/test_bf.py b/tests/test_bf.py
index 9340e9c4..cefac4c7 100644
--- a/tests/test_bf.py
+++ b/tests/test_bf.py
@@ -2,20 +2,11 @@ import pytest
 
 from reasoning_gym.code.bf import BFConfig, BFDataset
 
-
-# def test_figlet_deterministic():
-#     """Test that dataset generates same items with same seed"""
-#     config = FigletFontConfig(seed=42, size=15)
-#     dataset1 = FigletFontDataset(config)
-#     dataset2 = FigletFontDataset(config)
-
-#     for i in range(15):  # Only check first 15 entries for speed
-#         assert dataset1[i] == dataset2[i]
-
-
 def test_bf():
     """Test basic properties and solution of generated items"""
-    config = BFConfig(seed=42, size=40)
+
+    # Easy
+    config = BFConfig(seed=42, size=20, difficulty=1)
     dataset = BFDataset(config)
 
     for item in dataset:
@@ -28,10 +19,19 @@ def test_bf():
         assert "bfit_code" in item["metadata"]
         assert "bf_program" in item["metadata"]
 
-        print(item["answer"])
-
         # Test the scoring
         assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
         assert dataset.score_answer(answer=None, entry=item) == 0.0
         assert dataset.score_answer(answer="Love is a battlefield", entry=item) == 0.01
 
+    # Medium
+    config = BFConfig(seed=43, size=20, difficulty=2)
+    dataset = BFDataset(config)
+    for item in dataset:
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+
+    # Hard
+    config = BFConfig(seed=44, size=20, difficulty=3)
+    dataset = BFDataset(config)
+    for item in dataset:
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
\ No newline at end of file

From 9d4f8963297bd28dd1664dac1a4fa77fdc783865 Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 17:15:48 +0100
Subject: [PATCH 04/94] init definitions

---
 reasoning_gym/code/__init__.py      | 7 +++++++
 reasoning_gym/cognition/__init__.py | 6 ++++++
 reasoning_gym/graphs/__init__.py    | 8 +++++++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/reasoning_gym/code/__init__.py b/reasoning_gym/code/__init__.py
index 19aca9df..9a4a9044 100644
--- a/reasoning_gym/code/__init__.py
+++ b/reasoning_gym/code/__init__.py
@@ -5,3 +5,10 @@ Cognition tasks for training reasoning capabilities:
 - Code Execution
 """
 
+from .family_relationships import FamilyRelationshipsConfig, FamilyRelationshipsDataset
+from .bf import BFConfig, BFDataset
+
+__all__ = [
+    "BFConfig",
+    "BFDataset"
+]
diff --git a/reasoning_gym/cognition/__init__.py b/reasoning_gym/cognition/__init__.py
index f5d43196..e1f01947 100644
--- a/reasoning_gym/cognition/__init__.py
+++ b/reasoning_gym/cognition/__init__.py
@@ -8,10 +8,16 @@ Cognition tasks for training reasoning capabilities:
 
 from .color_cube_rotation import ColorCubeRotationConfig, ColorCubeRotationDataset
 from .number_sequences import NumberSequenceConfig, NumberSequenceDataset
+from .rubiks_cube import RubiksCubeConfig, RubiksCubeDataset
+from .figlet_fonts import FigletFontConfig, FigletFontDataset
 
 __all__ = [
     "NumberSequenceConfig",
     "NumberSequenceDataset",
     "ColorCubeRotationConfig",
     "ColorCubeRotationDataset",
+    "RubiksCubeConfig",
+    "RubiksCubeDataset",
+    "FigletFontConfig",
+    "FigletFontDataset"
 ]
diff --git a/reasoning_gym/graphs/__init__.py b/reasoning_gym/graphs/__init__.py
index 8ede1fe9..56231a46 100644
--- a/reasoning_gym/graphs/__init__.py
+++ b/reasoning_gym/graphs/__init__.py
@@ -1,3 +1,9 @@
 from .family_relationships import FamilyRelationshipsConfig, FamilyRelationshipsDataset
+from .quantum_lock import QuantumLockConfig, QuantumLockDataset
 
-__all__ = ["FamilyRelationshipsDataset", "FamilyRelationshipsConfig"]
+__all__ = [
+    "FamilyRelationshipsConfig",
+    "FamilyRelationshipsDataset", 
+    "QuantumLockConfig",
+    "QuantumLockDataset"
+]

From 2d9b916f8bba144e4c129ab01ffcdcad6fa10cce Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 17:16:37 +0100
Subject: [PATCH 05/94] rm bad copypaste

---
 reasoning_gym/code/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/reasoning_gym/code/__init__.py b/reasoning_gym/code/__init__.py
index 9a4a9044..680653c6 100644
--- a/reasoning_gym/code/__init__.py
+++ b/reasoning_gym/code/__init__.py
@@ -5,7 +5,6 @@ Cognition tasks for training reasoning capabilities:
 - Code Execution
 """
 
-from .family_relationships import FamilyRelationshipsConfig, FamilyRelationshipsDataset
 from .bf import BFConfig, BFDataset
 
 __all__ = [

From 2f9224127da73dc11933d813a9338693e14877e9 Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 17:20:53 +0100
Subject: [PATCH 06/94] docstrings

---
 reasoning_gym/code/bf.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/reasoning_gym/code/bf.py b/reasoning_gym/code/bf.py
index fb895421..0e47948c 100644
--- a/reasoning_gym/code/bf.py
+++ b/reasoning_gym/code/bf.py
@@ -37,8 +37,8 @@ class BFDataset(ProceduralDataset):
 
         Returns:
             dict with keys:
-                - question: str, the task description with figlet string
-                - answer: str, the figlet encoded word
+                - question: str, the task description with BF program
+                - answer: str, the result of this BF program BFI execution
                 - metadata: dict with generation parameters
         """
         rng = Random(self.seed + idx)
@@ -108,10 +108,9 @@ int main() {{
         return bf
 
     def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
-        """Determine if the solution provided solves the figlet task.
+        """Determine if the solution provided solves the BF task.
 
-        The function awards 1.0 for a correct answer and 0.1 points for each correct letter in the correct position,
-        with a maximum possible score of 1.0.
+        The function awards 1.0 for a correct answer.
 
         Args:
             answer (Optional[str]): The user's answer.

From f886b15fecba4ebb93fdac22ae9950370decd46a Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 17:26:39 +0100
Subject: [PATCH 07/94] update readme

---
 README.md | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index eee342ae..554851e3 100644
--- a/README.md
+++ b/README.md
@@ -4,28 +4,37 @@ We are building a python library of procedural dataset generators and algorithmi
 
 The goal is to generate virtually infinite data with adjustable complexity.
 
-Algorithmic verification allows to train on tasks like Rubik‘s cube or [Countdown](https://en.wikipedia.org/wiki/Countdown_(game_show)#Numbers_Round) which have many correct solutions.
+Algorithmic verification allows to train on tasks like Rubik‘s cube or [Countdown](<https://en.wikipedia.org/wiki/Countdown_(game_show)#Numbers_Round>) which have many correct solutions.
 
 ## Set up for development
+
 1. Clone the project
+
 ```
 git clone https://github.com/open-thought/reasoning-gym.git
 ```
+
 2. Create a virtual environment (here we use conda)
+
 ```
 conda create --name reasoning_gym python=3.11 -y
 conda activate reasoning_gym
 ```
+
 3. Link project and install dependencies
+
 ```
 pip install -e .
 ```
+
 4. Install development dependencies
+
 ```
 pip install -r requirements-dev.txt
 ```
 
->NOTE: To consume the APIs in reasoning_gym, just install from pip using the following
+> NOTE: To consume the APIs in reasoning_gym, just install from pip using the following
+
 ```
 pip install reasoning-gym
 ```
@@ -116,10 +125,16 @@ Available dataset names (which can be used with `create_dataset()`):
 - `SpellBackwardDataset`: Spell individual words backward (e.g. "sun" -> "nus")
 - `WordSequenceReversalDataset`: Reverse word order in text spans
 
+### <small>Code Tasks</small>
+
+- `BFDataset`: Generates BF programs of varying difficulty, from simple string printing to loops and conditional logic
+
 ### <small>Cognition Tasks</small>
 
 - `NumberSequenceDataset`: Generate number sequences with discoverable patterns
 - `ColorCubeRotationDataset`: Generate 3D spatial reasoning tasks with colored cube rotations and orientation tracking
+- `RubiksCubeDataset`: Generate Rubik's Cube configurations and check correct solutions
+- `FigletFontDataset`: Generate random words in different "Figlet" fonts for reasoning about the structure of letters
 
 ### <small>Logic Tasks</small>
 
@@ -128,6 +143,7 @@ Available dataset names (which can be used with `create_dataset()`):
 ### <small>Graph Tasks</small>
 
 - `FamilyRelationshipsDataset`: Generate family relationship reasoning tasks with family trees
+- `QuantumLockDataset`: Generates puzzles which involve stateful arithmetic and a correct sequence of operations
 
 ### <small>Game Tasks</small>
 
@@ -170,6 +186,7 @@ Example output:
 {'question': 'Solve the polynomial equation for real i:\n3*i**4 + 4*i**3 - 1 = 0', 'answer': '[]', 'metadata': {'polynomial_expr': '3*i**4 + 4*i**3 - 1', 'variable': 'i', 'degree': 4, 'real_solutions': []}}
 {'question': 'Solve the polynomial equation for real h:\n7*h**4 - 2*h**2 + h = 0', 'answer': '[-0.6998793469266564, 0.0]', 'metadata': {'polynomial_expr': '7*h**4 - 2*h**2 + h', 'variable': 'h', 'degree': 4, 'real_solutions': [-0.6998793469266564, 0.0]}}
 ```
+
 </details>
 
 <details>
@@ -205,6 +222,7 @@ Example output:
 {'question': '-22 - -94 + -97 =', 'answer': '-25', 'metadata': {'num_terms': 3, 'num_digits': 2, 'expression': '-22 - -94 + -97'}}
 {'question': '51 * 63 =', 'answer': '3213', 'metadata': {'num_terms': 2, 'num_digits': 2, 'expression': '51 * 63'}}
 ```
+
 </details>
 
 <details>
@@ -245,6 +263,7 @@ Example data:
     "metadata": { "num_terms": 2, "num_digits": 3, "expression": "426 + 562" }
 }
 ```
+
 </details>
 
 <details>
@@ -285,6 +304,7 @@ Example data:
     "metadata": {"rule": "add 6", "complexity": 1, "sequence": [8, 14, 20, 26, 32, 38, 44, 50]},
 }
 ```
+
 </details>
 
 <details>
@@ -322,6 +342,7 @@ Example data:
     }
 }
 ```
+
 </details>
 
 <details>
@@ -366,6 +387,7 @@ Example data:
     },
 }
 ```
+
 </details>
 
 <details>
@@ -419,6 +441,7 @@ Legend: '<' = Wall, 'w' = Path
 
 {'question': "Navigate from 'J' (start) to '_' (goal):\n\n<<<<<\n<<J<<\n<www<\n<<w_<\n<<<<<\nLegend: '<' = Wall, 'w' = Path\n", 'answer': '3', 'metadata': {'grid_size': 5, 'grid': ['<<<<<', '<<J<<', '<www<', '<<w_<', '<<<<<'], 'shortest_path_length': 3, 'start': 'J', 'goal': '_', 'wall': '<', 'path': 'w'}}
 ```
+
 </details>
 
 ## Future Generator Ideas

From fdb556fd331c36a9fdd693d9162427b056c65558 Mon Sep 17 00:00:00 2001
From: Schmeitzke <cjf.schmeitz@student.maastrichtuniversity.nl>
Date: Thu, 30 Jan 2025 17:51:23 +0100
Subject: [PATCH 08/94] Implemented simple and advanced geometry dataset makers,
 including test scripts

---
 reasoning_gym/geometry/__init__.py          |   9 +
 reasoning_gym/geometry/advanced_geometry.py | 212 ++++++++++++++++++++
 reasoning_gym/geometry/simple_geometry.py   | 140 +++++++++++++
 tests/test_advanced_geometry.py             |  88 ++++++++
 tests/test_simple_geometry.py               |  92 +++++++++
 5 files changed, 541 insertions(+)
 create mode 100644 reasoning_gym/geometry/__init__.py
 create mode 100644 reasoning_gym/geometry/advanced_geometry.py
 create mode 100644 reasoning_gym/geometry/simple_geometry.py
 create mode 100644 tests/test_advanced_geometry.py
 create mode 100644 tests/test_simple_geometry.py

diff --git a/reasoning_gym/geometry/__init__.py b/reasoning_gym/geometry/__init__.py
new file mode 100644
index 00000000..d6539df1
--- /dev/null
+++ b/reasoning_gym/geometry/__init__.py
@@ -0,0 +1,9 @@
+from .simple_geometry import SimpleGeometryConfig, SimpleGeometryDataset
+from .advanced_geometry import AdvancedGeometryConfig, AdvancedGeometryDataset
+
+__all__ = [
+    "SimpleGeometryConfig",
+    "SimpleGeometryDataset",
+    "AdvancedGeometryConfig",
+    "AdvancedGeometryDataset",
+]
diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
new file mode 100644
index 00000000..f8221614
--- /dev/null
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -0,0 +1,212 @@
+import random
+from dataclasses import dataclass
+from typing import Optional, List
+
+import sympy
+from sympy.geometry import Point, Triangle
+
+from ..factory import ProceduralDataset, register_dataset
+
+
+@dataclass
+class AdvancedGeometryConfig:
+    """
+    Configuration for generating advanced geometry tasks.
+    """
+    min_coord: int = -10  # Minimum x/y coordinate
+    max_coord: int = 10   # Maximum x/y coordinate
+    size: int = 50        # Number of problems to generate
+    seed: Optional[int] = None
+
+    # Probability or list of tasks we want to generate
+    # For demonstration, we have three categories:
+    task_types: List[str] = None
+
+    def __post_init__(self):
+        if self.task_types is None:
+            # Default set of advanced tasks
+            self.task_types = [
+                "orthocenter",
+                "incircle_radius",
+                "angle_measure",
+            ]
+
+    def validate(self):
+        assert self.min_coord < self.max_coord, "min_coord must be < max_coord."
+        assert self.size > 0, "Size of dataset must be positive."
+        assert len(self.task_types) > 0, "Must specify at least one task type."
+
+
+class AdvancedGeometryDataset(ProceduralDataset):
+    """
+    A dataset for advanced geometry tasks using coordinate geometry.
+    """
+
+    def __init__(self, config: AdvancedGeometryConfig):
+        self._prompt_templates = {
+            "orthocenter": [
+                "Given triangle ABC with coordinates A={A}, B={B}, and C={C}, find the coordinates of its orthocenter.",
+                "For triangle with vertices A={A}, B={B}, and C={C}, determine the orthocenter (intersection of altitudes).",
+            ],
+            "incircle_radius": [
+                "Consider triangle ABC with coordinates A={A}, B={B}, and C={C}. Compute the radius of its incircle.",
+                "Find the incircle radius of triangle ABC whose vertices are A={A}, B={B}, and C={C}.",
+            ],
+            "angle_measure": [
+                "In triangle ABC with coordinates A={A}, B={B}, and C={C}, find the measure (in degrees) of angle ABC.",
+                "Given a triangle with vertices A={A}, B={B}, C={C}, determine the angle at B in degrees.",
+            ],
+        }
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+    def __getitem__(self, idx: int) -> dict:
+        """
+        Generate a single advanced geometry item based on the config's task types.
+        """
+        rng = random.Random(self.seed + idx)
+        task_type = rng.choice(self.config.task_types)
+
+        # Randomly generate coordinates for a triangle
+        A, B, C = self._generate_non_degenerate_triangle(rng)
+
+        # Build a question and compute the solution
+        if task_type == "orthocenter":
+            question, answer, metadata = self._build_orthocenter_task(rng, A, B, C)
+        elif task_type == "incircle_radius":
+            question, answer, metadata = self._build_incircle_radius_task(rng, A, B, C)
+        elif task_type == "angle_measure":
+            question, answer, metadata = self._build_angle_measure_task(rng, A, B, C)
+        else:
+            raise ValueError(f"Unknown task_type: {task_type}")
+
+        return {
+            "question": question,
+            "answer": answer,
+            "metadata": metadata,
+        }
+
+    def _generate_non_degenerate_triangle(self, rng: random.Random):
+        """
+        Generate a random non-degenerate triangle with integer coordinates
+        in [min_coord, max_coord] x [min_coord, max_coord].
+        """
+        max_attempts = 100
+        for _ in range(max_attempts):
+            xA = rng.randint(self.config.min_coord, self.config.max_coord)
+            yA = rng.randint(self.config.min_coord, self.config.max_coord)
+            xB = rng.randint(self.config.min_coord, self.config.max_coord)
+            yB = rng.randint(self.config.min_coord, self.config.max_coord)
+            xC = rng.randint(self.config.min_coord, self.config.max_coord)
+            yC = rng.randint(self.config.min_coord, self.config.max_coord)
+
+            A = Point(xA, yA)
+            B = Point(xB, yB)
+            C = Point(xC, yC)
+            tri = Triangle(A, B, C)
+
+            # Check that the triangle is non-degenerate (area != 0)
+            if tri.area != 0:
+                return A, B, C
+
+        raise ValueError(f"Failed to generate a non-degenerate triangle after {max_attempts} attempts.")
+
+    def _build_orthocenter_task(self, rng: random.Random, A: Point, B: Point, C: Point):
+        """
+        Build a question about finding the orthocenter of triangle ABC.
+        """
+        tri = Triangle(A, B, C)
+        # Sympy can give altitudes or direct concurrency point
+        ortho = tri.orthocenter
+        # Format the answer
+        # The orthocenter may have rational coordinates, so let's convert to float or simplified fraction
+        # We'll store both numeric approximations and exact forms in metadata
+        x_ortho_approx = float(ortho.x.evalf())
+        y_ortho_approx = float(ortho.y.evalf())
+
+        # Choose a prompt
+        question_template = rng.choice(self._prompt_templates["orthocenter"])
+        question = question_template.format(
+            A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
+        )
+        # Round to e.g. 3 decimals or keep a string representation
+        answer_str = f"({x_ortho_approx:.3f}, {y_ortho_approx:.3f})"
+
+        metadata = {
+            "A": (A.x, A.y),
+            "B": (B.x, B.y),
+            "C": (C.x, C.y),
+            "orthocenter_exact": (str(ortho.x), str(ortho.y)),
+            "orthocenter_approx": (x_ortho_approx, y_ortho_approx),
+        }
+        return question, answer_str, metadata
+
+    def _build_incircle_radius_task(self, rng: random.Random, A: Point, B: Point, C: Point):
+        """
+        Build a question about finding the incircle radius of triangle ABC.
+        """
+        tri = Triangle(A, B, C)
+        incircle = tri.incircle()
+        # incircle is a Circle object; radius is incircle.radius
+        radius = incircle.radius
+
+        # Convert to float for final answer
+        radius_approx = float(radius.evalf())
+
+        question_template = rng.choice(self._prompt_templates["incircle_radius"])
+        question = question_template.format(
+            A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
+        )
+        answer_str = f"{radius_approx:.3f}"
+
+        metadata = {
+            "A": (A.x, A.y),
+            "B": (B.x, B.y),
+            "C": (C.x, C.y),
+            "incircle_radius_exact": str(radius),
+            "incircle_radius_approx": radius_approx,
+        }
+        return question, answer_str, metadata
+
+    def _build_angle_measure_task(self, rng: random.Random, A: Point, B: Point, C: Point):
+        """
+        Build a question about finding the measure of angle ABC in degrees.
+        """
+        # Angle at B means the angle ∠ABC
+        # Vector BA = A - B, BC = C - B
+        BA = A - B
+        BC = C - B
+
+        # Use vector dot product to find angle between BA and BC
+        # angle = arccos((BA · BC) / (|BA| * |BC|))
+        dot_val = BA.dot(BC)
+        mag_ba = BA.distance(Point(0, 0))
+        mag_bc = BC.distance(Point(0, 0))
+
+        # numerical check
+        if mag_ba == 0 or mag_bc == 0:
+            # degenerate, but theoretically we forced a non-degenerate triangle
+            angle_deg = 0
+        else:
+            cos_theta = dot_val / (mag_ba * mag_bc)
+            # clamp cos_theta to [-1, 1] to avoid floating rounding errors
+            cos_theta = max(-1, min(1, cos_theta))
+            angle_rad = sympy.acos(cos_theta)
+            angle_deg = float(angle_rad.evalf() * 180 / sympy.pi)
+
+        question_template = rng.choice(self._prompt_templates["angle_measure"])
+        question = question_template.format(
+            A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
+        )
+
+        answer_str = f"{angle_deg:.2f}°"
+        metadata = {
+            "A": (A.x, A.y),
+            "B": (B.x, B.y),
+            "C": (C.x, C.y),
+            "angle_ABC_degrees": angle_deg,
+        }
+        return question, answer_str, metadata
+
+
+# Register the dataset
+register_dataset("advanced_geometry", AdvancedGeometryDataset, AdvancedGeometryConfig)
diff --git a/reasoning_gym/geometry/simple_geometry.py b/reasoning_gym/geometry/simple_geometry.py
new file mode 100644
index 00000000..3714abe8
--- /dev/null
+++ b/reasoning_gym/geometry/simple_geometry.py
@@ -0,0 +1,140 @@
+import random
+from dataclasses import dataclass
+from typing import Optional
+
+from ..factory import ProceduralDataset, register_dataset
+
+@dataclass
+class SimpleGeometryConfig:
+    """
+    Configuration for generating basic geometry (angle-finding) tasks.
+    Produces a random convex polygon with N sides, random angles
+    for the first (N-1) sides, and asks the solver to find the last angle.
+    """
+
+    min_sides: int = 3          # Minimum number of sides (e.g. triangle)
+    max_sides: int = 6          # Maximum number of sides (e.g. hexagon)
+    min_angle: int = 10         # Minimum angle (in degrees) for each of the first (N-1) angles
+    max_angle: int = 170        # Maximum angle (in degrees) for each of the first (N-1) angles
+    seed: Optional[int] = None  # Random seed
+    size: int = 100             # Number of geometry tasks to generate
+
+    def validate(self) -> None:
+        """
+        Validate configuration parameters.
+        """
+        assert self.min_sides >= 3, "min_sides must be at least 3 (triangle)."
+        assert self.max_sides >= self.min_sides, "max_sides must be >= min_sides."
+        assert 0 < self.min_angle < 180, "min_angle must be in (0, 180)."
+        assert self.max_angle <= 179, "max_angle should be less than 180."
+        assert self.max_angle >= self.min_angle, "max_angle must be >= min_angle."
+
+
+class SimpleGeometryDataset(ProceduralDataset):
+    """
+    A dataset for simple polygon angle-finding tasks.
+    We randomly choose the number of sides N within [min_sides, max_sides].
+    We then generate (N-1) random angles (in degrees), ensuring their sum is
+    strictly less than the total sum for an (N)-sided convex polygon (which is 180*(N-2)).
+    The question asks for the missing angle; the answer is computed by subtracting the
+    sum of known angles from 180*(N-2).
+    """
+
+    def __init__(self, config: SimpleGeometryConfig):
+        self._prompt_templates = [
+            (
+                "Given a convex polygon with {n_sides} sides, its first {n_minus_1} interior angles "
+                "are: {angle_list}. What is the measure of the remaining interior angle (in degrees)?"
+            ),
+            (
+                "A convex polygon has {n_sides} sides. The measures of "
+                "the first {n_minus_1} interior angles are: {angle_list}. "
+                "Find the measure of the last interior angle."
+            ),
+            (
+                "Consider a convex {n_sides}-gon whose first {n_minus_1} interior angles "
+                "are: {angle_list}. Determine the measure of the remaining angle."
+            ),
+        ]
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+    def __getitem__(self, idx: int) -> dict:
+        """
+        Generate a single geometry angle-finding item.
+
+        Returns:
+            A dict with:
+                - question: str
+                - answer: str (the missing angle, as an integer or float in degrees)
+                - metadata: dict (n_sides, angles, sum_of_known, missing_angle, etc.)
+        """
+        rng = random.Random(self.seed + idx)
+
+        # Randomly pick the number of sides
+        n_sides = rng.randint(self.config.min_sides, self.config.max_sides)
+
+        # Total interior angle sum for a convex n_sides-gon
+        total_sum = 180 * (n_sides - 2)
+
+        # Generate (n_sides - 1) random angles, ensuring their sum < total_sum
+        known_angles = self._generate_valid_angles(rng, n_sides, total_sum)
+
+        # Missing angle
+        missing_angle = total_sum - sum(known_angles)
+
+        # Build the question string
+        angle_list_str = ", ".join(f"{a:.1f}°" for a in known_angles)
+        prompt = rng.choice(self._prompt_templates).format(
+            n_sides=n_sides,
+            n_minus_1=n_sides - 1,
+            angle_list=angle_list_str
+        )
+
+        # Round the missing angle to one decimal place or integer if it is very close to an integer
+        # so that the answer remains consistent and clean
+        missing_angle_rounded = round(missing_angle, 1)
+        if abs(missing_angle_rounded - round(missing_angle_rounded)) < 1e-6:
+            # If it is effectively an integer, keep it as int
+            missing_angle_rounded = int(missing_angle_rounded)
+
+        answer_str = str(missing_angle_rounded)
+
+        return {
+            "question": prompt,
+            "answer": answer_str,
+            "metadata": {
+                "n_sides": n_sides,
+                "known_angles": known_angles,
+                "sum_of_known_angles": sum(known_angles),
+                "missing_angle_raw": missing_angle,
+                "missing_angle_rounded": missing_angle_rounded,
+                "total_interior_sum": total_sum,
+            },
+        }
+
+    def _generate_valid_angles(self, rng: random.Random, n_sides: int, total_sum: int):
+        """
+        Generate (n_sides - 1) random angles in [min_angle, max_angle],
+        ensuring the sum is strictly less than total_sum to keep a valid missing angle.
+        We keep retrying until we find a valid set or reach a max attempt limit.
+        """
+        max_attempts = 100
+        for _ in range(max_attempts):
+            angles = []
+            # We choose angles one by one
+            for _ in range(n_sides - 1):
+                angle = rng.randint(self.config.min_angle, self.config.max_angle)
+                angles.append(float(angle))
+
+            # Check if the sum is strictly less than total_sum
+            if sum(angles) < total_sum:
+                return angles
+
+        # If we fail after max_attempts, raise an error
+        raise ValueError(
+            f"Could not generate valid angles for an {n_sides}-gon "
+            f"with total sum {total_sum} within {max_attempts} attempts."
+        )
+
+# Register the dataset so it can be accessed similarly to the others
+register_dataset("simple_geometry", SimpleGeometryDataset, SimpleGeometryConfig)
diff --git a/tests/test_advanced_geometry.py b/tests/test_advanced_geometry.py
new file mode 100644
index 00000000..cf371d10
--- /dev/null
+++ b/tests/test_advanced_geometry.py
@@ -0,0 +1,88 @@
+import pytest
+
+from reasoning_gym.geometry.advanced_geometry import (
+    AdvancedGeometryDataset,
+    AdvancedGeometryConfig,
+)
+
+def test_advanced_geometry_config_validation():
+    """Test that invalid configs raise appropriate errors."""
+    # min_coord >= max_coord
+    with pytest.raises(AssertionError):
+        config = AdvancedGeometryConfig(min_coord=5, max_coord=5)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = AdvancedGeometryConfig(min_coord=10, max_coord=0)
+        config.validate()
+
+    # size <= 0
+    with pytest.raises(AssertionError):
+        config = AdvancedGeometryConfig(size=0)
+        config.validate()
+
+    # Empty task_types
+    with pytest.raises(AssertionError):
+        config = AdvancedGeometryConfig(task_types=[])
+        config.validate()
+
+
+def test_advanced_geometry_dataset_deterministic():
+    """Test the dataset generates the same items with the same seed."""
+    config = AdvancedGeometryConfig(min_coord=-5, max_coord=5, size=5, seed=42)
+    dataset1 = AdvancedGeometryDataset(config)
+    dataset2 = AdvancedGeometryDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i], (
+            f"Item mismatch at index {i} for same seed. "
+            f"Dataset1: {dataset1[i]} vs Dataset2: {dataset2[i]}"
+        )
+
+
+def test_advanced_geometry_dataset_items():
+    """Test basic properties of generated items."""
+    config = AdvancedGeometryConfig(min_coord=-3, max_coord=3, size=5, seed=123)
+    dataset = AdvancedGeometryDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        # Check structure
+        assert isinstance(item, dict), "Generated item must be a dictionary."
+        assert "question" in item, "Item must contain a 'question' key."
+        assert "answer" in item, "Item must contain an 'answer' key."
+        assert "metadata" in item, "Item must contain a 'metadata' key."
+
+        # Basic metadata checks
+        metadata = item["metadata"]
+        assert "A" in metadata and "B" in metadata and "C" in metadata, (
+            "Metadata should contain coordinates for points A, B, and C."
+        )
+
+        # Check answer format depending on task type
+        # For angle measure tasks, answer should end with '°'
+        if "angle_measure" in item["question"].lower() or "angle at" in item["question"].lower():
+            assert item["answer"].endswith("°"), (
+                f"Expected angle measure in degrees, got {item['answer']}"
+            )
+
+
+def test_advanced_geometry_dataset_iteration():
+    """Test that iteration respects dataset size and is repeatable."""
+    config = AdvancedGeometryConfig(min_coord=-2, max_coord=2, size=3, seed=999)
+    dataset = AdvancedGeometryDataset(config)
+
+    # Test manual iteration
+    items = []
+    for item in dataset:
+        items.append(item)
+    assert len(items) == config.size, "Iterator should yield exactly 'size' items."
+
+    # Test list conversion
+    items_list = list(dataset)
+    assert len(items_list) == config.size, "List conversion should yield exactly 'size' items."
+
+    # Test multiple iterations produce the same results
+    first_items = list(dataset)
+    second_items = list(dataset)
+    assert first_items == second_items, "Multiple iterations should yield the same items."
diff --git a/tests/test_simple_geometry.py b/tests/test_simple_geometry.py
new file mode 100644
index 00000000..4d8702df
--- /dev/null
+++ b/tests/test_simple_geometry.py
@@ -0,0 +1,92 @@
+import pytest
+
+from reasoning_gym.geometry.simple_geometry import (
+    SimpleGeometryDataset,
+    SimpleGeometryConfig,
+)
+
+def test_simple_geometry_config_validation():
+    """Test invalid configs raise appropriate errors."""
+    # min_sides < 3
+    with pytest.raises(AssertionError):
+        config = SimpleGeometryConfig(min_sides=2, max_sides=5)
+        config.validate()
+
+    # max_sides < min_sides
+    with pytest.raises(AssertionError):
+        config = SimpleGeometryConfig(min_sides=4, max_sides=3)
+        config.validate()
+
+    # Invalid angles
+    with pytest.raises(AssertionError):
+        config = SimpleGeometryConfig(min_angle=-10)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleGeometryConfig(min_angle=10, max_angle=5)
+        config.validate()
+
+
+def test_simple_geometry_dataset_deterministic():
+    """Test the dataset generates the same items with the same seed."""
+    config = SimpleGeometryConfig(seed=42, size=5, min_sides=3, max_sides=4)
+    dataset1 = SimpleGeometryDataset(config)
+    dataset2 = SimpleGeometryDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i], (
+            f"Item mismatch at index {i} for same seed. "
+            f"Dataset1: {dataset1[i]} vs Dataset2: {dataset2[i]}"
+        )
+
+
+def test_simple_geometry_dataset_items():
+    """Test basic properties of generated items."""
+    config = SimpleGeometryConfig(
+        min_sides=3, 
+        max_sides=5, 
+        min_angle=10, 
+        max_angle=120, 
+        size=10, 
+        seed=123
+    )
+    dataset = SimpleGeometryDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        # Check structure
+        assert isinstance(item, dict), "Generated item must be a dictionary."
+        assert "question" in item, "Item must contain a 'question' key."
+        assert "answer" in item, "Item must contain an 'answer' key."
+        assert "metadata" in item, "Item must contain a 'metadata' key."
+
+        metadata = item["metadata"]
+        assert "n_sides" in metadata, "Metadata should contain 'n_sides'."
+        assert "missing_angle_rounded" in metadata, (
+            "Metadata should contain the computed 'missing_angle_rounded'."
+        )
+
+        # Check that the missing angle is a valid float or integer
+        missing_angle = float(item["answer"])
+        assert missing_angle > 0, f"Missing angle should be positive, found {missing_angle}"
+
+
+def test_simple_geometry_dataset_iteration():
+    """Test that iteration respects dataset size and is repeatable."""
+    config = SimpleGeometryConfig(min_sides=3, max_sides=4, size=5, seed=42)
+    dataset = SimpleGeometryDataset(config)
+
+    # Test manual iteration
+    items = []
+    for item in dataset:
+        items.append(item)
+    assert len(items) == config.size, "Iterator should yield exactly 'size' items."
+
+    # Test list conversion
+    items_list = list(dataset)
+    assert len(items_list) == config.size, "List conversion should yield exactly 'size' items."
+
+    # Test multiple iterations produce the same results
+    first_items = list(dataset)
+    second_items = list(dataset)
+    assert first_items == second_items, "Multiple iterations should yield the same items."

From fc81bbec0636bc7626c6d5a619ad13dc0c570806 Mon Sep 17 00:00:00 2001
From: Rich Jones <miserlou@gmail.com>
Date: Thu, 30 Jan 2025 20:08:44 +0100
Subject: [PATCH 09/94] game of life via cellpylib

---
 pyproject.toml                      |  1 +
 reasoning_gym/games/__init__.py     |  4 ++
 reasoning_gym/games/game_of_life.py | 96 +++++++++++++++++++++++++++++
 tests/test_game_of_life.py          | 33 ++++++++++
 4 files changed, 134 insertions(+)
 create mode 100644 reasoning_gym/games/game_of_life.py
 create mode 100644 tests/test_game_of_life.py

diff --git a/pyproject.toml b/pyproject.toml
index 0c2ec1ab..3a546f8c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
   "bfi==1.0.4",
+  "cellpylib==2.4.0",
   "sympy>=1.13.1", 
   "magiccube==0.3.0", 
   "pyfiglet==1.0.2"
diff --git a/reasoning_gym/games/__init__.py b/reasoning_gym/games/__init__.py
index cf083ba4..0826dea6 100644
--- a/reasoning_gym/games/__init__.py
+++ b/reasoning_gym/games/__init__.py
@@ -3,12 +3,14 @@ Game tasks for training reasoning capabilities:
 - Board games
 - Puzzle games
 - Strategy games
+- Simulation games
 """
 
 from .countdown import CountdownConfig, CountdownDataset
 from .maze import MazeConfig, MazeDataset
 from .mini_sudoku import MiniSudokuConfig, MiniSudokuDataset
 from .sudoku import SudokuConfig, SudokuDataset
+from .game_of_life import GameOfLifeConfig, GameOfLifeDataset
 
 __all__ = [
     "CountdownConfig",
@@ -19,4 +21,6 @@ __all__ = [
     "SudokuDataset",
     "MazeConfig",
     "MazeDataset",
+    "GameOfLifeConfig",
+    "GameOfLifeDataset",
 ]
diff --git a/reasoning_gym/games/game_of_life.py b/reasoning_gym/games/game_of_life.py
new file mode 100644
index 00000000..cc4dc6a8
--- /dev/null
+++ b/reasoning_gym/games/game_of_life.py
@@ -0,0 +1,96 @@
+from dataclasses import dataclass
+from random import Random
+from typing import List, Optional, Tuple, Dict
+
+import cellpylib as cpl
+
+from ..factory import ProceduralDataset, register_dataset
+
+@dataclass
+class GameOfLifeConfig:
+    """Configuration for sudoku puzzle generation"""
+
+    grid_size_x: int = 20 
+    grid_size_y: int = 20
+    filled_cells: int = 100 # actually a max
+    simulation_steps: int = 1
+    seed: Optional[int] = None
+    size: int = 500
+
+    def validate(self):
+        """Validate configuration parameters"""
+        assert 3 <= self.grid_size_x <= 999, "grid_size_x must be between 0 and 999"
+        assert 3 <= self.grid_size_y <= 999, "grid_size_y must be between 0 and 999"
+        assert self.simulation_steps >= 0, "simulation_steps must be gte 0"
+        assert self.filled_cells <= self.grid_size_x * self.grid_size_y, "filled_cells must fit in x times y"
+
+
+class GameOfLifeConfigDataset(ProceduralDataset):
+    """Generates Game of Life games with configurable parameters"""
+
+    def __init__(self, config: GameOfLifeConfig):
+        self._prompt_templates = ["What will this Game of Life board look like after {simulation_steps} steps of simulation?\n\n{board}"
+        ]
+
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Generate a single GameOfLife task
+
+        Returns:
+            dict with keys:
+                - question: str, the task description
+                - answer: str, a solution string
+                - metadata: dict with generation parameters
+        """
+        rng = Random(self.seed + idx)
+
+        # Make the board
+        board  = cpl.init_simple2d(self.config.grid_size_x, self.config.grid_size_y)
+        board[:, :, :] = 0
+
+        # Add the cells
+        for i in range(0, self.config.filled_cells):
+            rx = rng.randint(0, self.config.grid_size_x - 1)
+            ry = rng.randint(0, self.config.grid_size_y - 1)
+            board[:, rx, ry] = 1
+
+        # Simulate the result to get the answer
+        evolved = cpl.evolve2d(board, timesteps=self.config.simulation_steps + 1, apply_rule=cpl.game_of_life_rule, memoize='recursive')
+
+        board_str = str(board[0])
+        result_str = str(evolved[-1])
+
+        return {
+            "question": rng.choice(self._prompt_templates).format(simulation_steps=self.config.simulation_steps, board=board_str),
+            "answer": result_str,
+            "metadata": {
+                "grid_size_x": self.config.grid_size_x,
+                "grid_size_y": self.config.grid_size_y,
+                "filled_cells": self.config.filled_cells,
+                "simulation_steps": self.config.simulation_steps,
+            },
+        }
+
+    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+        """Determine if the solution provided solves the GoL task.
+
+        The function awards 1.0 for a correct answer.
+
+        Args:
+            answer (Optional[str]): The user's answer.
+            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+
+        Returns:
+            float: The computed score between 0.0 and 1.0.
+        """
+
+        if answer == None:
+            return 0.0
+        if answer.replace('\n', '') != entry['answer'].replace('\n', ''):
+            return 0.01
+        else:
+            return 1.0 # Yay
+
+
+register_dataset("game_of_life", GameOfLifeConfigDataset, GameOfLifeConfig)
diff --git a/tests/test_game_of_life.py b/tests/test_game_of_life.py
new file mode 100644
index 00000000..288a1fe4
--- /dev/null
+++ b/tests/test_game_of_life.py
@@ -0,0 +1,33 @@
+import pytest
+
+from reasoning_gym.games.game_of_life import GameOfLifeConfig, GameOfLifeConfigDataset
+
+def test_game_of_life():
+    """Test basic properties and solution of generated items"""
+
+    # Easy
+    config = GameOfLifeConfig(
+        seed=42, 
+        size=1, 
+        grid_size_x=20,
+        grid_size_y=20,
+        filled_cells=10,
+        simulation_steps=1
+    )
+    dataset = GameOfLifeConfigDataset(config)
+
+    for item in dataset:
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # # Check metadata contains required fields
+        assert "grid_size_x" in item["metadata"]
+        assert "grid_size_y" in item["metadata"]
+        assert "filled_cells" in item["metadata"]
+        assert "simulation_steps" in item["metadata"]
+
+        # # Test the scoring
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+        assert dataset.score_answer(answer=None, entry=item) == 0.0

From ebb88e6c6aa4ffad2e29162e66bd58ca7ea3d293 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Thu, 30 Jan 2025 22:55:04 +0100
Subject: [PATCH 10/94] lint

---
 GALLERY.md                                    | 1143 ++++++++++-------
 pyproject.toml                                |    4 +-
 reasoning_gym/code/__init__.py                |    5 +-
 reasoning_gym/code/bf.py                      |    7 +-
 reasoning_gym/code/contrib/bfit/BF-it.py      |    6 +-
 .../code/contrib/bfit/Compiler/Compiler.py    |   72 +-
 .../contrib/bfit/Compiler/FunctionCompiler.py |  228 +++-
 .../code/contrib/bfit/Compiler/Functions.py   |    6 +-
 .../code/contrib/bfit/Compiler/General.py     |  114 +-
 .../code/contrib/bfit/Compiler/Globals.py     |    5 +-
 .../contrib/bfit/Compiler/Lexical_analyzer.py |  198 +--
 .../bfit/Compiler/LibraryFunctionCompiler.py  |    4 +-
 .../code/contrib/bfit/Compiler/Node.py        |   85 +-
 .../code/contrib/bfit/Compiler/Optimizer.py   |   43 +-
 .../code/contrib/bfit/Compiler/Parser.py      |   10 +-
 .../code/contrib/bfit/Interpreter.py          |   41 +-
 reasoning_gym/code/contrib/bfit/README.md     |    3 +-
 reasoning_gym/cognition/__init__.py           |    4 +-
 reasoning_gym/games/__init__.py               |    2 +-
 reasoning_gym/games/game_of_life.py           |   28 +-
 reasoning_gym/graphs/__init__.py              |    2 +-
 scripts/generate_gallery.py                   |    2 +-
 tests/test_bf.py                              |    3 +-
 tests/test_game_of_life.py                    |   14 +-
 24 files changed, 1215 insertions(+), 814 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index f0a2caba..e161b85d 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -11,6 +11,7 @@ This gallery shows examples from all available datasets using their default conf
 - [family_relationships](#family-relationships)
 - [figlet_font](#figlet-font)
 - [fraction_simplification](#fraction-simplification)
+- [game_of_life](#game-of-life)
 - [gcd](#gcd)
 - [lcm](#lcm)
 - [leg_counting](#leg-counting)
@@ -35,7 +36,7 @@ This gallery shows examples from all available datasets using their default conf
 - [word_sorting](#word-sorting)
 
 ## Dataset Examples
-### base_conversion {base-conversion}
+### base_conversion
 Generates base conversion tasks
 
 Default configuration:
@@ -51,23 +52,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Convert the base-5 number 21e to base-3
-Answer: 21e
-Metadata: {'decimal_value': 542, 'source_base': 5, 'target_base': 3, 'source_repr': '21e', 'target_repr': '21e'}
+Question: Convert the base-15 number 15 to binary
+Answer: 10101
+Metadata: {'decimal_value': 21, 'source_base': 15, 'target_base': 2, 'source_repr': '15', 'target_repr': '10101'}
 
 Example 2:
-Question: Convert the base-6 number f7 to base-3
-Answer: f7
-Metadata: {'decimal_value': 247, 'source_base': 6, 'target_base': 3, 'source_repr': 'f7', 'target_repr': 'f7'}
+Question: Convert the base-15 number de to base-6
+Answer: de
+Metadata: {'decimal_value': 222, 'source_base': 15, 'target_base': 6, 'source_repr': 'de', 'target_repr': 'de'}
 
 Example 3:
-Question: Convert the base-4 number 25d to base-14 (use lowercase letters a-z for digits above 9)
-Answer: 25d
-Metadata: {'decimal_value': 605, 'source_base': 4, 'target_base': 14, 'source_repr': '25d', 'target_repr': '25d'}
+Question: Convert the base-10 number 4e to binary
+Answer: 1001110
+Metadata: {'decimal_value': 78, 'source_base': 10, 'target_base': 2, 'source_repr': '4e', 'target_repr': '1001110'}
 
 ```
 
-### basic_arithmetic {basic-arithmetic}
+### basic_arithmetic
 Dataset that generates basic arithmetic tasks with configurable complexity
 
 Default configuration:
@@ -88,23 +89,23 @@ whitespace = single
 Example tasks:
 ```
 Example 1:
-Question: 7035 / 1005 =
-Answer: 7
-Metadata: {'num_terms': 2, 'num_digits': 4, 'expression': '7035 / 1005'}
+Question: 19 + 61 * -43 / 1 + 89 - 98 =
+Answer: -2613
+Metadata: {'num_terms': 6, 'num_digits': 2, 'expression': '19 + 61 * -43 / 1 + 89 - 98'}
 
 Example 2:
-Question: -743 + 475 + 719 + -155 - -768 =
-Answer: 1064
-Metadata: {'num_terms': 5, 'num_digits': 3, 'expression': '-743 + 475 + 719 + -155 - -768'}
+Question: ( 9240 + -702 ) =
+Answer: 8538
+Metadata: {'num_terms': 2, 'num_digits': 4, 'expression': '( 9240 + -702 )'}
 
 Example 3:
-Question: 898 / 2 + 357 + -664 * -496 =
-Answer: 330150
-Metadata: {'num_terms': 5, 'num_digits': 3, 'expression': '898 / 2 + 357 + -664 * -496'}
+Question: -68 * 12 - 6 / 2 + -60 =
+Answer: -879
+Metadata: {'num_terms': 5, 'num_digits': 2, 'expression': '-68 * 12 - 6 / 2 + -60'}
 
 ```
 
-### caesar_cipher {caesar-cipher}
+### caesar_cipher
 Generates Caesar cipher encryption/decryption tasks
 
 Default configuration:
@@ -121,23 +122,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Decrypt this Caesar cipher text: B EJTDVTTJPO XBT HPJOH PO XIFO IF FOUFSFE
-Answer: A DISCUSSION WAS GOING ON WHEN HE ENTERED
-Metadata: {'rotation': 1, 'cipher_text': 'B EJTDVTTJPO XBT HPJOH PO XIFO IF FOUFSFE', 'clear_text': 'A DISCUSSION WAS GOING ON WHEN HE ENTERED'}
+Question: Decrypt this Caesar cipher text: UVYAO MVY AOL VM IBA AOL ZVBAO MVY AOL SHAPUZ
+Answer: NORTH FOR THE OF BUT THE SOUTH FOR THE LATINS
+Metadata: {'rotation': 7, 'cipher_text': 'UVYAO MVY AOL VM IBA AOL ZVBAO MVY AOL SHAPUZ', 'clear_text': 'NORTH FOR THE OF BUT THE SOUTH FOR THE LATINS'}
 
 Example 2:
-Question: Decrypt this Caesar cipher text: Q IU BQZML YCQBM BQZML LW GWC VWB BPQVS BPIB I JIBP EWCTL ZMNZMAP
-Answer: I AM TIRED QUITE TIRED DO YOU NOT THINK THAT A BATH WOULD REFRESH
-Metadata: {'rotation': 8, 'cipher_text': 'Q IU BQZML YCQBM BQZML LW GWC VWB BPQVS BPIB I JIBP EWCTL ZMNZMAP', 'clear_text': 'I AM TIRED QUITE TIRED DO YOU NOT THINK THAT A BATH WOULD REFRESH'}
+Question: Decrypt this Caesar cipher text: ER MRHITIRHIRX KSZIVRQIRX
+Answer: AN INDEPENDENT GOVERNMENT
+Metadata: {'rotation': 4, 'cipher_text': 'ER MRHITIRHIRX KSZIVRQIRX', 'clear_text': 'AN INDEPENDENT GOVERNMENT'}
 
 Example 3:
-Question: Decrypt this Caesar cipher text: Y IGLE GQ FC
-Answer: A KING IS HE
-Metadata: {'rotation': 24, 'cipher_text': 'Y IGLE GQ FC', 'clear_text': 'A KING IS HE'}
+Question: Decrypt this Caesar cipher text: IYE WKI ECO DRSC OLYYU PYB XOKBVI KXI ZEBZYCO CEMR KC MBOKDSYX YP NOBSFKDSFO ZOBPYBWKXMOC KXN BOCOKBMR
+Answer: YOU MAY USE THIS EBOOK FOR NEARLY ANY PURPOSE SUCH AS CREATION OF DERIVATIVE PERFORMANCES AND RESEARCH
+Metadata: {'rotation': 10, 'cipher_text': 'IYE WKI ECO DRSC OLYYU PYB XOKBVI KXI ZEBZYCO CEMR KC MBOKDSYX YP NOBSFKDSFO ZOBPYBWKXMOC KXN BOCOKBMR', 'clear_text': 'YOU MAY USE THIS EBOOK FOR NEARLY ANY PURPOSE SUCH AS CREATION OF DERIVATIVE PERFORMANCES AND RESEARCH'}
 
 ```
 
-### chain_sum {chain-sum}
+### chain_sum
 Generates simple arithmetic tasks using only + and - operators
 
 Default configuration:
@@ -154,23 +155,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: 47 - 18 - 85 + 54 =
-Answer: -2
-Metadata: {'num_terms': 4, 'num_digits': 2, 'expression': '47 - 18 - 85 + 54'}
+Question: 3 - 6 + 4 =
+Answer: 1
+Metadata: {'num_terms': 3, 'num_digits': 1, 'expression': '3 - 6 + 4'}
 
 Example 2:
-Question: 52 + 23 - 88 + 78 + 92 - 54 =
-Answer: 103
-Metadata: {'num_terms': 6, 'num_digits': 2, 'expression': '52 + 23 - 88 + 78 + 92 - 54'}
+Question: 6516 - 9002 - 5380 - 2663 =
+Answer: -10529
+Metadata: {'num_terms': 4, 'num_digits': 4, 'expression': '6516 - 9002 - 5380 - 2663'}
 
 Example 3:
-Question: 46 + 76 + 75 + 46 + 70 - 88 =
-Answer: 225
-Metadata: {'num_terms': 6, 'num_digits': 2, 'expression': '46 + 76 + 75 + 46 + 70 - 88'}
+Question: 3352 + 3153 - 3475 + 1726 - 8711 - 7863 =
+Answer: -11818
+Metadata: {'num_terms': 6, 'num_digits': 4, 'expression': '3352 + 3153 - 3475 + 1726 - 8711 - 7863'}
 
 ```
 
-### color_cube_rotation {color-cube-rotation}
+### color_cube_rotation
 Generates color cube rotation reasoning tasks
 
 Default configuration:
@@ -185,56 +186,58 @@ Example tasks:
 ```
 Example 1:
 Question: A cube has:
-- a white top side
-- a silver right side
-- a brown front side
-- a violet left side
-- a pink back side
-- a purple bottom side
-
-The cube is rotated so that the side which was before at the left is now at the top.
-
-Then the cube is rotated to bring the bottom side to the top.
-
-Now the cube is rotated to place its back side at the top.
-
-What is now the color of the top side of the cube?
-Answer: brown
-Metadata: {'initial_state': {'top': 'white', 'right': 'silver', 'front': 'brown', 'left': 'violet', 'back': 'pink', 'bottom': 'purple'}, 'rotations': ['left', 'bottom', 'back'], 'target_side': 'top', 'num_rotations': 3}
-
-Example 2:
-Question: A cube has:
-- a violet top side
-- a green right side
-- a white front side
-- a blue left side
-- a gold back side
-- a yellow bottom side
-
-The cube is rotated so that the side which was before at the right is now at the top.
-
-What is now the color of the bottom side of the cube?
-Answer: blue
-Metadata: {'initial_state': {'top': 'violet', 'right': 'green', 'front': 'white', 'left': 'blue', 'back': 'gold', 'bottom': 'yellow'}, 'rotations': ['right'], 'target_side': 'bottom', 'num_rotations': 1}
-
-Example 3:
-Question: A cube has:
-- a magenta top side
-- a green right side
-- a brown front side
-- a yellow left side
+- a red top side
+- a brown right side
+- a cyan front side
+- a gray left side
 - a silver back side
-- a violet bottom side
+- a purple bottom side
 
 The cube is rotated so that the side which was before at the front is now at the top.
 
-What is now the color of the bottom side of the cube?
+Now the cube is rotated to place its right side at the top.
+
+What is now the color of the top side of the cube?
+Answer: brown
+Metadata: {'initial_state': {'top': 'red', 'right': 'brown', 'front': 'cyan', 'left': 'gray', 'back': 'silver', 'bottom': 'purple'}, 'rotations': ['front', 'right'], 'target_side': 'top', 'num_rotations': 2}
+
+Example 2:
+Question: A cube has:
+- a yellow top side
+- a cyan right side
+- a white front side
+- a blue left side
+- a red back side
+- a pink bottom side
+
+The cube is rotated so that the side which was before at the left is now at the top.
+
+Then the cube is rotated to bring the front side to the top.
+
+Next, the front side is rotated to become the top face.
+
+What is now the color of the front side of the cube?
+Answer: red
+Metadata: {'initial_state': {'top': 'yellow', 'right': 'cyan', 'front': 'white', 'left': 'blue', 'back': 'red', 'bottom': 'pink'}, 'rotations': ['left', 'front', 'front'], 'target_side': 'front', 'num_rotations': 3}
+
+Example 3:
+Question: A cube has:
+- a indigo top side
+- a violet right side
+- a silver front side
+- a pink left side
+- a magenta back side
+- a cyan bottom side
+
+The cube is rotated so that the side which was before at the front is now at the top.
+
+What is now the color of the top side of the cube?
 Answer: silver
-Metadata: {'initial_state': {'top': 'magenta', 'right': 'green', 'front': 'brown', 'left': 'yellow', 'back': 'silver', 'bottom': 'violet'}, 'rotations': ['front'], 'target_side': 'bottom', 'num_rotations': 1}
+Metadata: {'initial_state': {'top': 'indigo', 'right': 'violet', 'front': 'silver', 'left': 'pink', 'back': 'magenta', 'bottom': 'cyan'}, 'rotations': ['front'], 'target_side': 'top', 'num_rotations': 1}
 
 ```
 
-### countdown {countdown}
+### countdown
 Generates Countdown Number Game tasks
 
 Default configuration:
@@ -254,26 +257,26 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Using the numbers 4, 15, 90, 49, 15, create an expression that equals 275.
-You can only use each number once.
-Answer: 49*90/15 - 15 - 4
-Metadata: {'numbers': [4, 15, 90, 49, 15], 'target': 275, 'expression': '49*90/15 - 15 - 4'}
+Question: Calculate 421 using the numbers 10, 30, 26, 59.
+Each number may be used at most once.
+Answer: 30*(26 - 10) - 59
+Metadata: {'numbers': [10, 30, 26, 59], 'target': 421, 'expression': '30*(26 - 10) - 59'}
 
 Example 2:
-Question: Calculate 237 using the numbers 32, 56, 64, 23, 3, 100.
+Question: Calculate 229 using the numbers 55, 80, 34, 60.
 Each number may be used at most once.
-Answer: 100*3 - 64 - 23 - 32 + 56
-Metadata: {'numbers': [32, 56, 64, 23, 3, 100], 'target': 237, 'expression': '100*3 - 64 - 23 - 32 + 56'}
+Answer: 80 + 34 + 60 + 55
+Metadata: {'numbers': [55, 80, 34, 60], 'target': 229, 'expression': '80 + 34 + 60 + 55'}
 
 Example 3:
-Question: Find a way to make 241 using some or all of these numbers: 87, 85, 82, 13.
-Each number can only be used once.
-Answer: 85 + 82 - 13 + 87
-Metadata: {'numbers': [87, 85, 82, 13], 'target': 241, 'expression': '85 + 82 - 13 + 87'}
+Question: Calculate 840 using the numbers 41, 18, 32, 45, 84.
+Each number may be used at most once.
+Answer: 84*(41 - 45 + 32 - 18)
+Metadata: {'numbers': [41, 18, 32, 45, 84], 'target': 840, 'expression': '84*(41 - 45 + 32 - 18)'}
 
 ```
 
-### family_relationships {family-relationships}
+### family_relationships
 Generates family relationship reasoning tasks
 
 Default configuration:
@@ -289,29 +292,29 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Charles is married to Jessica. They have a child called Atlas. Atlas is married to Amelia. They have children called Patricia and Nova. David is married to Ava. They have a child called Amelia.
+Question: Jack is married to Elizabeth. They have a child called Oliver. Oliver is married to Abigail. They have a child called Logan. Alexander is married to Mia. They have a child called Abigail.
 
-How is Jessica related to Nova?
-Answer: grandmother
-Metadata: {'person1': 'Jessica', 'person2': 'Nova', 'relationship': 'grandmother', 'family_size': 8}
+What relation is Mia to Abigail?
+Answer: mother
+Metadata: {'person1': 'Mia', 'person2': 'Abigail', 'relationship': 'mother', 'family_size': 7}
 
 Example 2:
-Question: David is married to Charlotte. They have a child called Lucas. Lucas is married to Victoria. They have children called James and Abigail.
+Question: James is married to Sarah. They have a child called Atlas. Atlas is married to Sophie. They have children called Jennifer and Aria.
 
-What is Victoria to Abigail?
-Answer: mother
-Metadata: {'person1': 'Victoria', 'person2': 'Abigail', 'relationship': 'mother', 'family_size': 6}
+What is Aria to Jennifer?
+Answer: sister
+Metadata: {'person1': 'Aria', 'person2': 'Jennifer', 'relationship': 'sister', 'family_size': 6}
 
 Example 3:
-Question: Mason is married to Amelia. They have a child called James. James is married to Grace. They have a child called Abigail.
+Question: Lucas is married to Willow. They have a child called Samuel. Samuel is married to Zoe. They have a child called William. Henry is married to Emma. They have a child called Zoe.
 
-What relation is James to Amelia?
-Answer: son
-Metadata: {'person1': 'James', 'person2': 'Amelia', 'relationship': 'son', 'family_size': 5}
+What is Lucas to Willow?
+Answer: husband
+Metadata: {'person1': 'Lucas', 'person2': 'Willow', 'relationship': 'husband', 'family_size': 7}
 
 ```
 
-### figlet_font {figlet-font}
+### figlet_font
 Generates FigletFont tasks
 
 Default configuration:
@@ -326,44 +329,45 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: What word does this say?
+Question: Please read the following figlet font:
 
- __          _
-(_    |_|   / \   | |   |\|
-__)   | |   \_/   |^|   | |
+  ()     _     _       _    _ __     ()  ,
+  /\    ' )   /       | )  ' )  )    /`-'|
+ /  )    / / /    ,---|/    /  /    /   /
+/__/__  (_(_/      \_/ \_  /  (_   /__-<_
 
-Answer: SHOWN
-Metadata: {'font': 'bigfig', 'space_letters': True}
+
+
+Answer: SWING
+Metadata: {'font': 'slscript', 'space_letters': True}
 
 Example 2:
 Question: What word does this say?
 
-  #####  ### ###     ##    ######    ######
- ## ###  ##   ##   #####   ###  ##  ######
-##       ##   ##   ## ###  ##   ##     ##
-##       #######  ##   ##  ##  ##      ##
-##       ##   ##  #######  #####       ##
-#####    ##   ##  ##  ##    ## ##      ##
- #####    #    #  #   #     ##  ##      #
+     dBBBP     dBBBBBb        dBBBP     dBP dBP    dBBBP
+                    BB
+   dBP          dBP BB      dBP       dBBBBBP    dBBP
+  dBP          dBP  BB     dBP       dBP dBP    dBP
+ dBBBBP       dBBBBBBB    dBBBBP    dBP dBP    dBBBBP
 
 
-Answer: CHART
-Metadata: {'font': 'future_6', 'space_letters': True}
+Answer: CACHE
+Metadata: {'font': 'trek', 'space_letters': True}
 
 Example 3:
 Question: Please read the following figlet font:
 
-.dP"Y8     88  88     888888        db        88""Yb
-`Ybo."     88  88     88__         dPYb       88__dP
-o.`Y8b     888888     88""        dP__Yb      88"Yb
-8bodP'     88  88     888888     dP""""Yb     88  Yb
+.---. .---. .-. .-..-. .-..-.
+ \ \  | |-' | | | .` |  >  /
+`---' `-'   `-' `-'`-'  `-'
 
-Answer: SHEAR
-Metadata: {'font': '4max', 'space_letters': True}
+
+Answer: SPINY
+Metadata: {'font': 'linux', 'space_letters': True}
 
 ```
 
-### fraction_simplification {fraction-simplification}
+### fraction_simplification
 Generates fraction simplification tasks
 
 Default configuration:
@@ -380,23 +384,175 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Simplify the fraction $12054/36848$ to its lowest terms
-Answer: $123/376$
-Metadata: {'numerator': 12054, 'denominator': 36848, 'simplified_numerator': 123, 'simplified_denominator': 376, 'reduction_factor': 98, 'style': 'latex_inline'}
+Question: Simplify the fraction $1380/6180$ to its lowest terms
+Answer: $23/103$
+Metadata: {'numerator': 1380, 'denominator': 6180, 'simplified_numerator': 23, 'simplified_denominator': 103, 'reduction_factor': 60, 'style': 'latex_inline'}
 
 Example 2:
-Question: Simplify the fraction 1218/28275 to its lowest terms
-Answer: 14/325
-Metadata: {'numerator': 1218, 'denominator': 28275, 'simplified_numerator': 14, 'simplified_denominator': 325, 'reduction_factor': 87, 'style': 'plain'}
+Question: Simplify the fraction 15552/49984 to its lowest terms
+Answer: 243/781
+Metadata: {'numerator': 15552, 'denominator': 49984, 'simplified_numerator': 243, 'simplified_denominator': 781, 'reduction_factor': 64, 'style': 'plain'}
 
 Example 3:
-Question: Simplify the fraction 21902/24111 to its lowest terms
-Answer: 466/513
-Metadata: {'numerator': 21902, 'denominator': 24111, 'simplified_numerator': 466, 'simplified_denominator': 513, 'reduction_factor': 47, 'style': 'plain'}
+Question: Simplify the fraction $56100/80500$ to its lowest terms
+Answer: $561/805$
+Metadata: {'numerator': 56100, 'denominator': 80500, 'simplified_numerator': 561, 'simplified_denominator': 805, 'reduction_factor': 100, 'style': 'latex_inline'}
 
 ```
 
-### gcd {gcd}
+### game_of_life
+Generates Game of Life games with configurable parameters
+
+Default configuration:
+```python
+grid_size_x = 20
+grid_size_y = 20
+filled_cells = 100
+simulation_steps = 1
+seed = None
+size = 500
+```
+
+Example tasks:
+```
+Example 1:
+Question: What will this Game of Life board look like after 1 steps of simulation?
+
+[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
+ [0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0]
+ [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1]
+ [0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1]
+ [0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1]
+ [0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0]
+ [0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0]
+ [0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0]
+ [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
+ [0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0]
+ [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0]
+ [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1]
+ [0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0]
+ [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0]
+ [1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0]]
+Answer: [[1 0 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0]
+ [0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1]
+ [0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0]
+ [0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0]
+ [0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0]
+ [0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0]
+ [0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0]
+ [0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0]
+ [0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0]
+ [0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0]
+ [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0]
+ [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0]
+ [0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1]
+ [0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0]
+ [0 0 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 0 0]
+ [0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0]]
+Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
+
+Example 2:
+Question: What will this Game of Life board look like after 1 steps of simulation?
+
+[[0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0]
+ [0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
+ [1 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0]
+ [0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0]
+ [0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1]
+ [0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1]
+ [1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0]
+ [0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0]
+ [1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1]
+ [0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1]
+ [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0]
+ [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1]
+ [1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0]
+ [0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0]
+ [0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0]
+ [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
+ [0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0]
+ [0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0]]
+Answer: [[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0]
+ [0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 1]
+ [0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0]
+ [0 0 0 0 1 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1]
+ [1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1]
+ [0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 1 1]
+ [1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0]
+ [0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0]
+ [0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1]
+ [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
+ [0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1]
+ [1 1 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0]
+ [0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0]
+ [0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
+ [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0]]
+Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
+
+Example 3:
+Question: What will this Game of Life board look like after 1 steps of simulation?
+
+[[1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1]
+ [0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0]
+ [0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0]
+ [0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1]
+ [1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 1 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0]
+ [0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0]
+ [0 0 1 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0]
+ [0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0]
+ [0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1]
+ [0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0]
+ [0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0]
+ [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 0]
+ [0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1]
+ [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
+ [1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0]]
+Answer: [[1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1]
+ [0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0]
+ [0 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0]
+ [1 1 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0]
+ [1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]
+ [0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 0 0]
+ [0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
+ [0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0]
+ [0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0]
+ [0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0]
+ [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0]
+ [0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0]
+ [0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0]]
+Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
+
+```
+
+### gcd
 Generates Greatest Common Divisor (GCD) tasks
 
 Default configuration:
@@ -412,23 +568,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Find the Greatest Common Divisor (GCD) of these numbers: 384, 414
-Answer: 6
-Metadata: {'numbers': [384, 414], 'result': 6}
+Question: Find the Greatest Common Divisor (GCD) of these numbers: 226, 512
+Answer: 2
+Metadata: {'numbers': [226, 512], 'result': 2}
 
 Example 2:
-Question: Find the Greatest Common Divisor (GCD) of these numbers: 298, 803
-Answer: 1
-Metadata: {'numbers': [298, 803], 'result': 1}
+Question: Find the Greatest Common Divisor (GCD) of these numbers: 999, 495
+Answer: 9
+Metadata: {'numbers': [999, 495], 'result': 9}
 
 Example 3:
-Question: Find the Greatest Common Divisor (GCD) of these numbers: 846, 550
-Answer: 2
-Metadata: {'numbers': [846, 550], 'result': 2}
+Question: Find the Greatest Common Divisor (GCD) of these numbers: 999, 719
+Answer: 1
+Metadata: {'numbers': [999, 719], 'result': 1}
 
 ```
 
-### lcm {lcm}
+### lcm
 Generates Least Common Multiple (LCM) tasks
 
 Default configuration:
@@ -444,23 +600,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Find the Least Common Multiple (LCM) of these numbers: 33, 84
-Answer: 924
-Metadata: {'numbers': [33, 84], 'result': 924}
+Question: Find the Least Common Multiple (LCM) of these numbers: 30, 69
+Answer: 690
+Metadata: {'numbers': [30, 69], 'result': 690}
 
 Example 2:
-Question: Find the Least Common Multiple (LCM) of these numbers: 16, 23
-Answer: 368
-Metadata: {'numbers': [16, 23], 'result': 368}
+Question: Find the Least Common Multiple (LCM) of these numbers: 57, 99
+Answer: 1881
+Metadata: {'numbers': [57, 99], 'result': 1881}
 
 Example 3:
-Question: Find the Least Common Multiple (LCM) of these numbers: 66, 88
-Answer: 264
-Metadata: {'numbers': [66, 88], 'result': 264}
+Question: Find the Least Common Multiple (LCM) of these numbers: 3, 24
+Answer: 24
+Metadata: {'numbers': [3, 24], 'result': 24}
 
 ```
 
-### leg_counting {leg-counting}
+### leg_counting
 Generates leg counting arithmetic tasks
 
 Default configuration:
@@ -475,23 +631,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: How many legs are there in total if you have 2 scorpions, 3 sea slugs, 2 cockroachs, 2 fireflys?
-Answer: 40
-Metadata: {'animals': {'scorpion': 2, 'sea slug': 3, 'cockroach': 2, 'firefly': 2}, 'total_legs': 40}
+Question: How many legs are there in total if you have 1 starfish, 3 crabs, 3 chickens, 3 cows, 1 woodlouse?
+Answer: 67
+Metadata: {'animals': {'starfish': 1, 'crab': 3, 'chicken': 3, 'cow': 3, 'woodlouse': 1}, 'total_legs': 67}
 
 Example 2:
-Question: How many legs are there in total if you have 2 shrimps, 2 deers?
-Answer: 28
-Metadata: {'animals': {'shrimp': 2, 'deer': 2}, 'total_legs': 28}
+Question: How many legs are there in total if you have 2 sheeps, 1 butterfly, 1 ant, 3 humans, 2 wasps?
+Answer: 38
+Metadata: {'animals': {'sheep': 2, 'butterfly': 1, 'ant': 1, 'human': 3, 'wasp': 2}, 'total_legs': 38}
 
 Example 3:
-Question: How many legs are there in total if you have 1 beetle, 3 spiders, 1 jellyfish?
-Answer: 30
-Metadata: {'animals': {'beetle': 1, 'spider': 3, 'jellyfish': 1}, 'total_legs': 30}
+Question: How many legs are there in total if you have 3 chickens, 3 cockroachs, 3 woodlouses, 2 elephants, 2 sea slugs?
+Answer: 74
+Metadata: {'animals': {'chicken': 3, 'cockroach': 3, 'woodlouse': 3, 'elephant': 2, 'sea slug': 2}, 'total_legs': 74}
 
 ```
 
-### letter_counting {letter-counting}
+### letter_counting
 Generates letter counting tasks from text spans
 
 Default configuration:
@@ -505,23 +661,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: How many times does the letter "s" appear in the text: "a varied assortment is always in readiness A subscription"?
-Answer: 8
-Metadata: {'span_length': 9, 'target_letter': 's', 'span': ['a', 'varied', 'assortment', 'is', 'always', 'in', 'readiness', 'A', 'subscription']}
+Question: How many times does the letter "r" appear in the text: "You decline All is over then murmured the British agent sadly The"?
+Answer: 4
+Metadata: {'span_length': 12, 'target_letter': 'r', 'span': ['You', 'decline', 'All', 'is', 'over', 'then', 'murmured', 'the', 'British', 'agent', 'sadly', 'The']}
 
 Example 2:
-Question: How many times does the letter "c" appear in the text: "exclaims every one present Yes answers"?
+Question: How many times does the letter "l" appear in the text: "coffined and laid in a tomb Time went on September 25th 2889"?
 Answer: 1
-Metadata: {'span_length': 6, 'target_letter': 'c', 'span': ['exclaims', 'every', 'one', 'present', 'Yes', 'answers']}
+Metadata: {'span_length': 12, 'target_letter': 'l', 'span': ['coffined', 'and', 'laid', 'in', 'a', 'tomb', 'Time', 'went', 'on', 'September', '25th', '2889']}
 
 Example 3:
-Question: How many times does the letter "f" appear in the text: "individual Project Gutenberg electronic work is derived from texts"?
-Answer: 1
-Metadata: {'span_length': 9, 'target_letter': 'f', 'span': ['individual', 'Project', 'Gutenberg', 'electronic', 'work', 'is', 'derived', 'from', 'texts']}
+Question: How many times does the letter "i" appear in the text: "to the works took more time than he had anticipated It was"?
+Answer: 4
+Metadata: {'span_length': 12, 'target_letter': 'i', 'span': ['to', 'the', 'works', 'took', 'more', 'time', 'than', 'he', 'had', 'anticipated', 'It', 'was']}
 
 ```
 
-### letter_jumble {letter-jumble}
+### letter_jumble
 Generates word letter jumbling tasks
 
 Default configuration:
@@ -540,23 +696,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Unscramble these words: Acemira bnleogs ot
-Answer: America belongs to
-Metadata: {'num_words': 3, 'corruption_level': 0.3687132105849005, 'scrambled_words': ['Acemira', 'bnleogs', 'ot'], 'original_words': ['America', 'belongs', 'to']}
+Question: Unscramble these words: moon abotu faec hA trehe s somethnig ni htat driec eht owt nem ta ocne dnA dndeei
+Answer: moon about face Ah there s something in that cried the two men at once And indeed
+Metadata: {'num_words': 17, 'corruption_level': 0.16056171448414203, 'scrambled_words': ['moon', 'abotu', 'faec', 'hA', 'trehe', 's', 'somethnig', 'ni', 'htat', 'driec', 'eht', 'owt', 'nem', 'ta', 'ocne', 'dnA', 'dndeei'], 'original_words': ['moon', 'about', 'face', 'Ah', 'there', 's', 'something', 'in', 'that', 'cried', 'the', 'two', 'men', 'at', 'once', 'And', 'indeed']}
 
 Example 2:
-Question: Unscramble these words: cubssribres ton noly
-Answer: subscribers not only
-Metadata: {'num_words': 3, 'corruption_level': 0.38741746634525664, 'scrambled_words': ['cubssribres', 'ton', 'noly'], 'original_words': ['subscribers', 'not', 'only']}
+Question: Unscramble these words: lla het aosssen eth msea I psrooep ot od toshmeign etrtbe itlsl amrnsTrfo toni aeht a tiooprn fo het
+Answer: all the seasons the same I propose to do something better still Transform into heat a portion of the
+Metadata: {'num_words': 19, 'corruption_level': 0.8984516776838924, 'scrambled_words': ['lla', 'het', 'aosssen', 'eth', 'msea', 'I', 'psrooep', 'ot', 'od', 'toshmeign', 'etrtbe', 'itlsl', 'amrnsTrfo', 'toni', 'aeht', 'a', 'tiooprn', 'fo', 'het'], 'original_words': ['all', 'the', 'seasons', 'the', 'same', 'I', 'propose', 'to', 'do', 'something', 'better', 'still', 'Transform', 'into', 'heat', 'a', 'portion', 'of', 'the']}
 
 Example 3:
-Question: Unscramble these words: yuo peerntd ttha yuo exepct ot cantmafuure a mnhau iegnb uot adn uot Wyh ton rM itSmh cndedaav
-Answer: you pretend that you expect to manufacture a human being out and out Why not Mr Smith advanced
-Metadata: {'num_words': 18, 'corruption_level': 0.5094277166629008, 'scrambled_words': ['yuo', 'peerntd', 'ttha', 'yuo', 'exepct', 'ot', 'cantmafuure', 'a', 'mnhau', 'iegnb', 'uot', 'adn', 'uot', 'Wyh', 'ton', 'rM', 'itSmh', 'cndedaav'], 'original_words': ['you', 'pretend', 'that', 'you', 'expect', 'to', 'manufacture', 'a', 'human', 'being', 'out', 'and', 'out', 'Why', 'not', 'Mr', 'Smith', 'advanced']}
+Question: Unscramble these words: od ubt si ti fo yna sue Waht ew need si csoudl ont iarn oG dais eh addressing
+Answer: do but is it of any use What we need is clouds not rain Go said he addressing
+Metadata: {'num_words': 18, 'corruption_level': 0.21786426698317396, 'scrambled_words': ['od', 'ubt', 'si', 'ti', 'fo', 'yna', 'sue', 'Waht', 'ew', 'need', 'si', 'csoudl', 'ont', 'iarn', 'oG', 'dais', 'eh', 'addressing'], 'original_words': ['do', 'but', 'is', 'it', 'of', 'any', 'use', 'What', 'we', 'need', 'is', 'clouds', 'not', 'rain', 'Go', 'said', 'he', 'addressing']}
 
 ```
 
-### maze {maze}
+### maze
 Generates mazes with guaranteed shortest path distance from start to goal
     within [min_dist, max_dist].
 
@@ -573,63 +729,57 @@ size = 50
 Example tasks:
 ```
 Example 1:
-Question: Navigate from 'a' (start) to ':' (goal):
+Question: Navigate from 'F' (start) to 'S' (goal):
 
-```xxxxxxxxxx
-xxxx?xx:xx
-xxxx??x??x
-xx????x??x
-xxx?x???xx
-x?x?????xx
-x??ax???xx
-x???xxx??x
-x????x?xxx
-xxxxxxxxxx```
-Legend: 'x' = Wall, '?' = Passage
+```DDDDDDD
+D]D]]DD
+DD]DD]D
+DDS]]]D
+D]]D]]D
+D]]]]FD
+DDDDDDD```
+Legend: 'D' = Wall, ']' = Passage
+
+What is the minimum number of steps to reach the goal?
+Answer: 5
+Metadata: {'grid_size': 7, 'grid': ['DDDDDDD', 'D]D]]DD', 'DD]DD]D', 'DDS]]]D', 'D]]D]]D', 'D]]]]FD', 'DDDDDDD'], 'shortest_path_length': 5, 'start': 'F', 'goal': 'S', 'wall': 'D', 'path': ']'}
+
+Example 2:
+Question: Navigate from 'V' (start) to 'S' (goal):
+
+```77777777
+77SUU777
+7U7UUUU7
+77UUU777
+7UU7UUU7
+77U7UUU7
+7UUU7UV7
+77777777```
+Legend: '7' = Wall, 'U' = Passage
 
 What is the minimum number of steps to reach the goal?
 Answer: 9
-Metadata: {'grid_size': 10, 'grid': ['xxxxxxxxxx', 'xxxx?xx:xx', 'xxxx??x??x', 'xx????x??x', 'xxx?x???xx', 'x?x?????xx', 'x??ax???xx', 'x???xxx??x', 'x????x?xxx', 'xxxxxxxxxx'], 'shortest_path_length': 9, 'start': 'a', 'goal': ':', 'wall': 'x', 'path': '?'}
-
-Example 2:
-Question: Navigate from '"' (start) to '}' (goal):
-
-```444444444
-4##4#4##4
-44}444444
-44##4#444
-4#####"44
-4##4####4
-444#####4
-4##4#4444
-444444444```
-Legend: '4' = Wall, '#' = Passage
-
-What is the minimum number of steps to reach the goal?
-Answer: 6
-Metadata: {'grid_size': 9, 'grid': ['444444444', '4##4#4##4', '44}444444', '44##4#444', '4#####"44', '4##4####4', '444#####4', '4##4#4444', '444444444'], 'shortest_path_length': 6, 'start': '"', 'goal': '}', 'wall': '4', 'path': '#'}
+Metadata: {'grid_size': 8, 'grid': ['77777777', '77SUU777', '7U7UUUU7', '77UUU777', '7UU7UUU7', '77U7UUU7', '7UUU7UV7', '77777777'], 'shortest_path_length': 9, 'start': 'V', 'goal': 'S', 'wall': '7', 'path': 'U'}
 
 Example 3:
-Question: Navigate from '(' (start) to '$' (goal):
+Question: Navigate from 'z' (start) to '4' (goal):
 
-```eeeeeeeee
-e(%%%%%ee
-e%%%%%eee
-ee%eee%ee
-e%%%%%$%e
-e%%%%e%ee
-e%%%%%%%e
-ee%%%e%%e
-eeeeeeeee```
-Legend: 'e' = Wall, '%' = Passage
+```$$$$$$$
+$~~~~~$
+$$~$~~$
+$~$~$4$
+$$~~~~$
+$~z~~~$
+$$$$$$$```
+Legend: '$' = Wall, '~' = Passage
 
 What is the minimum number of steps to reach the goal?
-Answer: 8
-Metadata: {'grid_size': 9, 'grid': ['eeeeeeeee', 'e(%%%%%ee', 'e%%%%%eee', 'ee%eee%ee', 'e%%%%%$%e', 'e%%%%e%ee', 'e%%%%%%%e', 'ee%%%e%%e', 'eeeeeeeee'], 'shortest_path_length': 8, 'start': '(', 'goal': '$', 'wall': 'e', 'path': '%'}
+Answer: 5
+Metadata: {'grid_size': 7, 'grid': ['$$$$$$$', '$~~~~~$', '$$~$~~$', '$~$~$4$', '$$~~~~$', '$~z~~~$', '$$$$$$$'], 'shortest_path_length': 5, 'start': 'z', 'goal': '4', 'wall': '$', 'path': '~'}
 
 ```
 
-### mini_sudoku {mini-sudoku}
+### mini_sudoku
 Generates 4x4 sudoku puzzles with configurable difficulty
 
 Default configuration:
@@ -644,43 +794,43 @@ Example tasks:
 ```
 Example 1:
 Question: Solve this 4x4 Mini Sudoku puzzle:
-_ 3 _ 1
-2 1 _ _
-_ _ _ 2
-3 2 _ 4
-Answer: 4 3 2 1
-2 1 4 3
-1 4 3 2
+1 _ _ _
+_ 4 _ _
+_ _ _ 3
+_ _ 1 4
+Answer: 1 3 4 2
+2 4 3 1
+4 1 2 3
 3 2 1 4
-Metadata: {'puzzle': [[0, 3, 0, 1], [2, 1, 0, 0], [0, 0, 0, 2], [3, 2, 0, 4]], 'solution': [[4, 3, 2, 1], [2, 1, 4, 3], [1, 4, 3, 2], [3, 2, 1, 4]], 'num_empty': 8}
+Metadata: {'puzzle': [[1, 0, 0, 0], [0, 4, 0, 0], [0, 0, 0, 3], [0, 0, 1, 4]], 'solution': [[1, 3, 4, 2], [2, 4, 3, 1], [4, 1, 2, 3], [3, 2, 1, 4]], 'num_empty': 11}
 
 Example 2:
 Question: Solve this 4x4 Mini Sudoku puzzle:
-1 _ _ _
-_ _ 1 _
-2 _ _ _
-3 4 _ _
-Answer: 1 2 3 4
-4 3 1 2
-2 1 4 3
-3 4 2 1
-Metadata: {'puzzle': [[1, 0, 0, 0], [0, 0, 1, 0], [2, 0, 0, 0], [3, 4, 0, 0]], 'solution': [[1, 2, 3, 4], [4, 3, 1, 2], [2, 1, 4, 3], [3, 4, 2, 1]], 'num_empty': 11}
+_ _ _ 2
+2 _ _ 4
+_ 4 _ _
+_ 2 4 _
+Answer: 4 3 1 2
+2 1 3 4
+1 4 2 3
+3 2 4 1
+Metadata: {'puzzle': [[0, 0, 0, 2], [2, 0, 0, 4], [0, 4, 0, 0], [0, 2, 4, 0]], 'solution': [[4, 3, 1, 2], [2, 1, 3, 4], [1, 4, 2, 3], [3, 2, 4, 1]], 'num_empty': 10}
 
 Example 3:
 Question: Solve this 4x4 Mini Sudoku puzzle:
-_ 2 4 3
-_ 3 _ _
-2 _ _ _
-_ 1 2 _
-Answer: 1 2 4 3
-4 3 1 2
-2 4 3 1
+4 2 _ _
+3 _ 2 4
+_ _ _ _
+_ 4 3 2
+Answer: 4 2 1 3
 3 1 2 4
-Metadata: {'puzzle': [[0, 2, 4, 3], [0, 3, 0, 0], [2, 0, 0, 0], [0, 1, 2, 0]], 'solution': [[1, 2, 4, 3], [4, 3, 1, 2], [2, 4, 3, 1], [3, 1, 2, 4]], 'num_empty': 9}
+2 3 4 1
+1 4 3 2
+Metadata: {'puzzle': [[4, 2, 0, 0], [3, 0, 2, 4], [0, 0, 0, 0], [0, 4, 3, 2]], 'solution': [[4, 2, 1, 3], [3, 1, 2, 4], [2, 3, 4, 1], [1, 4, 3, 2]], 'num_empty': 8}
 
 ```
 
-### number_filtering {number-filtering}
+### number_filtering
 Generates number filtering tasks
 
 Default configuration:
@@ -698,23 +848,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Remove all numbers larger than 49.350 in this list: ['-96', '58.6', '39', '4.1432']
-Answer: ['-96', '39', '4.1432']
-Metadata: {'original_numbers': ['-96', '58.6', '39', '4.1432'], 'filter_value': '49.350', 'operation': 'remove_larger', 'result': ['-96', '39', '4.1432']}
+Question: Remove all numbers smaller than -78.527 in this list: ['-14.14', '10.92', '-56.57', '-56', '-84.8', '20']
+Answer: ['-14.14', '10.92', '-56.57', '-56', '20']
+Metadata: {'original_numbers': ['-14.14', '10.92', '-56.57', '-56', '-84.8', '20'], 'filter_value': '-78.527', 'operation': 'remove_smaller', 'result': ['-14.14', '10.92', '-56.57', '-56', '20']}
 
 Example 2:
-Question: Remove all numbers larger than -58.8 in this list: ['42.685', '38.4878', '27.3', '29.6', '-41.16', '87.20', '-66.104', '57.848', '10.3373', '-45.7']
-Answer: ['-66.104']
-Metadata: {'original_numbers': ['42.685', '38.4878', '27.3', '29.6', '-41.16', '87.20', '-66.104', '57.848', '10.3373', '-45.7'], 'filter_value': '-58.8', 'operation': 'remove_larger', 'result': ['-66.104']}
+Question: Remove all numbers larger than 19 in this list: ['20', '66', '-22.729', '-21.62', '-6.2198', '4', '34.0', '-43.9360', '98.011', '-1.2024']
+Answer: ['-22.729', '-21.62', '-6.2198', '4', '-43.9360', '-1.2024']
+Metadata: {'original_numbers': ['20', '66', '-22.729', '-21.62', '-6.2198', '4', '34.0', '-43.9360', '98.011', '-1.2024'], 'filter_value': '19', 'operation': 'remove_larger', 'result': ['-22.729', '-21.62', '-6.2198', '4', '-43.9360', '-1.2024']}
 
 Example 3:
-Question: Keep all numbers smaller than -82.5 in this list: ['-27.517', '11.04', '61', '-95.59', '-89.6322', '84.9458', '-19.8']
-Answer: ['-95.59', '-89.6322']
-Metadata: {'original_numbers': ['-27.517', '11.04', '61', '-95.59', '-89.6322', '84.9458', '-19.8'], 'filter_value': '-82.5', 'operation': 'keep_smaller', 'result': ['-95.59', '-89.6322']}
+Question: Keep all numbers smaller than 2.319 in this list: ['99', '-21', '-77.530', '7', '-11', '87.2816', '94.319', '-36', '-25.7766', '30.013']
+Answer: ['-21', '-77.530', '-11', '-36', '-25.7766']
+Metadata: {'original_numbers': ['99', '-21', '-77.530', '7', '-11', '87.2816', '94.319', '-36', '-25.7766', '30.013'], 'filter_value': '2.319', 'operation': 'keep_smaller', 'result': ['-21', '-77.530', '-11', '-36', '-25.7766']}
 
 ```
 
-### number_sequence {number-sequence}
+### number_sequence
 Generates number sequence completion tasks with dynamic pattern generation
 
 Default configuration:
@@ -731,23 +881,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: 7, 3, 1, 0, 0, 0, ?
+Question: 9, 4, 2, 1, 0, 0, 0, ?
 Answer: 0
-Metadata: {'rule': 'halve', 'complexity': 2, 'sequence': [7, 3, 1, 0, 0, 0, 0]}
+Metadata: {'rule': 'halve', 'complexity': 2, 'sequence': [9, 4, 2, 1, 0, 0, 0, 0]}
 
 Example 2:
-Question: -5, -3, -2, -1, ?
-Answer: -1
-Metadata: {'rule': 'halve', 'complexity': 3, 'sequence': [-5, -3, -2, -1, -1]}
+Question: -2, 1, 7, 19, 43, 91, 187, 379, ?
+Answer: 763
+Metadata: {'rule': 'double then add 5', 'complexity': 1, 'sequence': [-2, 1, 7, 19, 43, 91, 187, 379, 763]}
 
 Example 3:
-Question: 5, 5, 10, 15, 25, 40, 65, ?
-Answer: 105
-Metadata: {'rule': 'add previous', 'complexity': 1, 'sequence': [5, 5, 10, 15, 25, 40, 65, 105]}
+Question: 1, 0, 0, 0, 0, 0, 0, ?
+Answer: 0
+Metadata: {'rule': 'halve then multiply by 8', 'complexity': 1, 'sequence': [1, 0, 0, 0, 0, 0, 0, 0]}
 
 ```
 
-### number_sorting {number-sorting}
+### number_sorting
 Generates number sorting tasks
 
 Default configuration:
@@ -765,23 +915,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Sort these numbers in descending order: 34, 4, -49, -52, -19
-Answer: ['34', '4', '-19', '-49', '-52']
-Metadata: {'original_numbers': ['34', '4', '-49', '-52', '-19'], 'direction': 'descending', 'sorted_numbers': ['34', '4', '-19', '-49', '-52']}
+Question: Sort these numbers in ascending order: -6.78, -92.30, 91.23, -77.49, 95.03, 74.19, 70.26, -67.10
+Answer: ['-92.30', '-77.49', '-67.10', '-6.78', '70.26', '74.19', '91.23', '95.03']
+Metadata: {'original_numbers': ['-6.78', '-92.30', '91.23', '-77.49', '95.03', '74.19', '70.26', '-67.10'], 'direction': 'ascending', 'sorted_numbers': ['-92.30', '-77.49', '-67.10', '-6.78', '70.26', '74.19', '91.23', '95.03']}
 
 Example 2:
-Question: Sort these numbers in descending order: -4.44, 91.85, -86.58, -93.98, -92.88, 71.69, 25.88, 57.53, 89.65
-Answer: ['91.85', '89.65', '71.69', '57.53', '25.88', '-4.44', '-86.58', '-92.88', '-93.98']
-Metadata: {'original_numbers': ['-4.44', '91.85', '-86.58', '-93.98', '-92.88', '71.69', '25.88', '57.53', '89.65'], 'direction': 'descending', 'sorted_numbers': ['91.85', '89.65', '71.69', '57.53', '25.88', '-4.44', '-86.58', '-92.88', '-93.98']}
+Question: Sort these numbers in descending order: -10.32, 68.71, -89.59, 57.02, 12.29, -75.18, 49.79, -62.58, -58.82
+Answer: ['68.71', '57.02', '49.79', '12.29', '-10.32', '-58.82', '-62.58', '-75.18', '-89.59']
+Metadata: {'original_numbers': ['-10.32', '68.71', '-89.59', '57.02', '12.29', '-75.18', '49.79', '-62.58', '-58.82'], 'direction': 'descending', 'sorted_numbers': ['68.71', '57.02', '49.79', '12.29', '-10.32', '-58.82', '-62.58', '-75.18', '-89.59']}
 
 Example 3:
-Question: Sort these numbers in descending order: -34.19, -85.95, -6.94, -74.52, 5.10, -18.09, -4.41
-Answer: ['5.10', '-4.41', '-6.94', '-18.09', '-34.19', '-74.52', '-85.95']
-Metadata: {'original_numbers': ['-34.19', '-85.95', '-6.94', '-74.52', '5.10', '-18.09', '-4.41'], 'direction': 'descending', 'sorted_numbers': ['5.10', '-4.41', '-6.94', '-18.09', '-34.19', '-74.52', '-85.95']}
+Question: Sort these numbers in descending order: 10.13, 72.60, 72.13, 14.65, 1.16, -26.82, 55.17, 37.38, 76.73, -82.92
+Answer: ['76.73', '72.60', '72.13', '55.17', '37.38', '14.65', '10.13', '1.16', '-26.82', '-82.92']
+Metadata: {'original_numbers': ['10.13', '72.60', '72.13', '14.65', '1.16', '-26.82', '55.17', '37.38', '76.73', '-82.92'], 'direction': 'descending', 'sorted_numbers': ['76.73', '72.60', '72.13', '55.17', '37.38', '14.65', '10.13', '1.16', '-26.82', '-82.92']}
 
 ```
 
-### polynomial_equations {polynomial-equations}
+### polynomial_equations
 Generates random polynomial equations of degree in [min_degree, max_degree].
     - The polynomial is formed by summing random terms of the form: coeff * x^exponent.
     - Then we solve "polynomial_expr = 0" using Sympy.
@@ -803,23 +953,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Find the real value(s) of q in the equation: -166*q**2 - 83*q = 0
-Answer: [-0.5, 0.0]
-Metadata: {'polynomial_expr': '-166*q**2 - 83*q', 'variable': 'q', 'degree': 2, 'real_solutions': [-0.5, 0.0]}
+Question: Determine the real value(s) of a tha satisfies: -35*a**2 = 0
+Answer: [0.0]
+Metadata: {'polynomial_expr': '-35*a**2', 'variable': 'a', 'degree': 2, 'real_solutions': [0.0]}
 
 Example 2:
-Question: Determine the real value(s) of i tha satisfies: -41*i = 0
-Answer: [0.0]
-Metadata: {'polynomial_expr': '-41*i', 'variable': 'i', 'degree': 1, 'real_solutions': [0.0]}
+Question: Solve for real l: 27*l**2 + 175*l - 1 = 0
+Answer: [-6.487190738158517, 0.005709256677035911]
+Metadata: {'polynomial_expr': '27*l**2 + 175*l - 1', 'variable': 'l', 'degree': 2, 'real_solutions': [-6.487190738158517, 0.005709256677035911]}
 
 Example 3:
-Question: Find the real value(s) of t in the equation: -153*t = 0
-Answer: [0.0]
-Metadata: {'polynomial_expr': '-153*t', 'variable': 't', 'degree': 1, 'real_solutions': [0.0]}
+Question: Find the real value(s) of t in the equation: 94 - 9*t**2 = 0
+Answer: [-3.2317865716108862, 3.2317865716108862]
+Metadata: {'polynomial_expr': '94 - 9*t**2', 'variable': 't', 'degree': 2, 'real_solutions': [-3.2317865716108862, 3.2317865716108862]}
 
 ```
 
-### prime_factorization {prime-factorization}
+### prime_factorization
 Generates prime factorization tasks
 
 Default configuration:
@@ -833,23 +983,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Find the prime factorization of 139. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
-Answer: 139
-Metadata: {'number': 139, 'factors': [139]}
+Question: Find the prime factorization of 973. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
+Answer: 7 × 139
+Metadata: {'number': 973, 'factors': [7, 139]}
 
 Example 2:
-Question: Find the prime factorization of 172. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
-Answer: 2 × 2 × 43
-Metadata: {'number': 172, 'factors': [2, 2, 43]}
+Question: Find the prime factorization of 153. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
+Answer: 3 × 3 × 17
+Metadata: {'number': 153, 'factors': [3, 3, 17]}
 
 Example 3:
-Question: Find the prime factorization of 562. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
-Answer: 2 × 281
-Metadata: {'number': 562, 'factors': [2, 281]}
+Question: Find the prime factorization of 390. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
+Answer: 2 × 3 × 5 × 13
+Metadata: {'number': 390, 'factors': [2, 3, 5, 13]}
 
 ```
 
-### propositional_logic {propositional-logic}
+### propositional_logic
 Generates propositional logic reasoning tasks
 
 Default configuration:
@@ -867,33 +1017,34 @@ Example tasks:
 ```
 Example 1:
 Question: Given:
-1. Q
-2. S
-3. P
+1. (Q → P)
+2. (P → P)
+3. ((P ∨ Q) ↔ (P ↔ Q))
+4. (Q ∨ P)
 What can we conclude?
-Answer: (P ∧ S)
-Metadata: {'premises': ['Q', 'S', 'P'], 'variables': ['P', 'Q', 'R', 'S'], 'complexity': 3}
+Answer: (P ∧ P)
+Metadata: {'premises': ['(Q → P)', '(P → P)', '((P ∨ Q) ↔ (P ↔ Q))', '(Q ∨ P)'], 'variables': ['P', 'Q'], 'complexity': 3}
 
 Example 2:
 Question: Given:
-1. (P ∨ Q)
-2. P
+1. P
+2. ¬(P ∧ P)
+3. Q
 What can we conclude?
-Answer: (P ∨ Q)
-Metadata: {'premises': ['(P ∨ Q)', 'P'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
+Answer: (P ∧ P)
+Metadata: {'premises': ['P', '¬(P ∧ P)', 'Q'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
 
 Example 3:
 Question: Given:
-1. Q
-2. ((Q ↔ P) → (Q → Q))
-3. ((Q → P) → (P ↔ Q))
+1. ¬(R → P)
+2. ¬P
 What can we conclude?
-Answer: (P → Q)
-Metadata: {'premises': ['Q', '((Q ↔ P) → (Q → Q))', '((Q → P) → (P ↔ Q))'], 'variables': ['P', 'Q'], 'complexity': 3}
+Answer: (Q ↔ Q)
+Metadata: {'premises': ['¬(R → P)', '¬P'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
 
 ```
 
-### quantum_lock {quantum-lock}
+### quantum_lock
 Generates QuantumLock tasks
 
 Default configuration:
@@ -910,43 +1061,43 @@ Question: In front of you are some buttons, a light, and a number. The light wil
 You must press the shortest correct sequence of buttons to reach the target value.
 
 Start: 0 (red)
-Target: 36
+Target: 38
 Buttons:
-A: Add 3 (when any)
-B: Multiply 3 (when any)
-C: Multiply 3 (when red)
-Answer: A → B → A → B
-Metadata: {'difficulty': 10, 'solution_path': ['A', 'B', 'A', 'B'], 'target_value': 36, 'buttons': [{'name': 'A', 'type': 'add', 'value': 3, 'active_state': 'any'}, {'name': 'B', 'type': 'multiply', 'value': 3, 'active_state': 'any'}, {'name': 'C', 'type': 'multiply', 'value': 3, 'active_state': 'red'}], 'initial_state': 'red', 'initial_value': 0}
+A: Multiply 2 (when any)
+B: Add 2 (when red)
+C: Multiply 3 (when any)
+Answer: B → A → C → C → B
+Metadata: {'difficulty': 10, 'solution_path': ['B', 'A', 'C', 'C', 'B'], 'target_value': 38, 'buttons': [{'name': 'A', 'type': 'multiply', 'value': 2, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'red'}, {'name': 'C', 'type': 'multiply', 'value': 3, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
 
 Example 2:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
 You must press the shortest correct sequence of buttons to reach the target value.
 
 Start: 0 (red)
-Target: 30
+Target: 42
 Buttons:
-A: Subtract 2 (when red)
-B: Add 3 (when any)
-C: Subtract 3 (when green)
-Answer: B → B → B → B → B → B → B → B → B → B
-Metadata: {'difficulty': 10, 'solution_path': ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'], 'target_value': 30, 'buttons': [{'name': 'A', 'type': 'subtract', 'value': 2, 'active_state': 'red'}, {'name': 'B', 'type': 'add', 'value': 3, 'active_state': 'any'}, {'name': 'C', 'type': 'subtract', 'value': 3, 'active_state': 'green'}], 'initial_state': 'red', 'initial_value': 0}
+A: Multiply 3 (when any)
+B: Add 2 (when any)
+C: Add 3 (when any)
+Answer: B → B → A → B → A
+Metadata: {'difficulty': 10, 'solution_path': ['B', 'B', 'A', 'B', 'A'], 'target_value': 42, 'buttons': [{'name': 'A', 'type': 'multiply', 'value': 3, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'any'}, {'name': 'C', 'type': 'add', 'value': 3, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
 
 Example 3:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
 You must press the shortest correct sequence of buttons to reach the target value.
 
 Start: 0 (red)
-Target: 38
+Target: 35
 Buttons:
-A: Add 2 (when any)
-B: Add 3 (when any)
-C: Subtract 2 (when any)
-Answer: A → B → B → B → B → B → B → B → B → B → B → B → B
-Metadata: {'difficulty': 10, 'solution_path': ['A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'], 'target_value': 38, 'buttons': [{'name': 'A', 'type': 'add', 'value': 2, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 3, 'active_state': 'any'}, {'name': 'C', 'type': 'subtract', 'value': 2, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
+A: Multiply 3 (when red)
+B: Add 2 (when green)
+C: Subtract 3 (when any)
+Answer: A → B → A → C → A → B → A → B
+Metadata: {'difficulty': 10, 'solution_path': ['A', 'B', 'A', 'C', 'A', 'B', 'A', 'B'], 'target_value': 35, 'buttons': [{'name': 'A', 'type': 'multiply', 'value': 3, 'active_state': 'red'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'green'}, {'name': 'C', 'type': 'subtract', 'value': 3, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
 
 ```
 
-### rubiks_cube {rubiks-cube}
+### rubiks_cube
 Generates RubiksCube tasks
 
 Default configuration:
@@ -961,62 +1112,62 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: You see a size 3 Rubik's cube. It is arranged this:
+Question: You are given a 3x3x3 Rubik's cube. It looks like this:
 
-          R  R  R
           Y  Y  Y
-          R  R  R
- W  R  W  G  G  G  Y  O  Y  B  B  B
- W  R  W  G  G  G  Y  O  Y  B  B  B
- B  B  B  W  R  W  G  G  G  Y  O  Y
-          O  W  O
-          O  W  O
-          O  W  O
+          Y  Y  Y
+          Y  Y  Y
+ G  G  G  O  O  O  B  B  B  R  R  R
+ R  R  R  G  G  G  O  O  O  B  B  B
+ R  R  R  G  G  G  O  O  O  B  B  B
+          W  W  W
+          W  W  W
+          W  W  W
 
 
-Please provide a solution to solve this cube.
+Please provide a solution to solve this cube using Singmaster notation.
 Answer: None
-Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "B' F D", 'example_correct_answer': "B F D F' D' F' D B' D' R U' R' L U L' U' R' U R L U' L' U L U' L' U' B' U B U' U' F' U F U R U' R' U' B U' B' U' R' U R U' U' B' U B U L U' L' R U R' U R U U R' U U R U' L' U R' U' L U R U' L' U R' U' L U R' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D R' D' R D R' D' R D U"}
+Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U L L'", 'example_correct_answer': "U'"}
 
 Example 2:
 Question: You see a size 3 Rubik's cube. It is arranged this:
 
-          B  O  G
-          B  Y  G
-          B  Y  G
- Y  Y  Y  O  G  W  O  O  O  Y  B  R
- R  R  R  Y  G  W  O  O  O  Y  B  W
- R  R  R  Y  G  R  W  W  W  O  B  W
-          G  W  B
-          G  W  B
-          G  R  B
+          Y  Y  O
+          Y  Y  O
+          Y  Y  B
+ R  R  R  G  G  Y  O  G  G  W  B  B
+ R  R  Y  O  G  G  W  O  O  B  B  B
+ R  R  Y  O  G  G  W  O  O  B  B  B
+          G  R  R
+          W  W  W
+          W  W  W
 
 
 Please provide a solution to solve this cube.
 Answer: None
-Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': 'B L R', 'example_correct_answer': "R' B' U' L D F' D' U L U' L' U F U' F' U' L' U L U F U' F' U L' U L U F U' F' U' F' U F U R U' R' U' F' U F U R U' R' U F' U F U R U' R' F R U R' U' F' U R U R' U R U U R' L U' R' U L' U' R U L U' R' U L' U' D' R D R' D' R D U R' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D U"}
+Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U F' U'", 'example_correct_answer': "U F U'"}
 
 Example 3:
 Question: You see a size 3 Rubik's cube. It is arranged this:
 
+          R  R  R
+          B  Y  Y
+          O  O  O
+ G  R  Y  G  G  G  W  O  B  W  W  W
+ W  R  Y  G  G  G  W  O  Y  B  B  B
+ W  R  B  Y  Y  Y  G  O  Y  B  B  B
+          R  R  R
+          G  W  W
           O  O  O
-          Y  Y  G
-          Y  Y  G
- G  R  R  G  G  W  O  O  B  Y  Y  Y
- Y  R  R  G  G  W  O  O  W  B  B  B
- B  B  B  Y  R  R  G  G  W  O  O  W
-          R  W  W
-          R  W  W
-          R  B  B
 
 
 Please provide a solution to solve this cube.
 Answer: None
-Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': 'R B D', 'example_correct_answer': "B' F D F' D' R D R' B' D' L' U' L U R U' R' U' L U U L' F U' F' L' U U L U' B' U B U L U' L' L' U L U F U' F' U' F' U F U R U' R' U' U' R' U R U B U' B' U' U' B' U B U L U' L' U F R U R' U' R U R' U' F' U R U R' U R U U R' L U' R' U L' U' R U L U' R' U L' U' R U R' D' R D R' D' R D U R' D' R D R' D' R D U R' D' R D R' D' R D U"}
+Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "L B' F'", 'example_correct_answer': "B L' F U F U' F' U F R U R' U' F' R U R' U R U U R' U' R U R' U R U U R' U' L U' R' U L' U' R U L U' R' U L' U' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D U'"}
 
 ```
 
-### sentence_reordering {sentence-reordering}
+### sentence_reordering
 Generates sentence reordering tasks from text spans
 
 Default configuration:
@@ -1030,23 +1181,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Restore the correct order of words in the following sentence: about We think must it.
-Answer: We must think about it.
+Question: Restore the correct order of words in the following sentence: thing first that Mr. The
+Answer: The first thing that Mr.
 Metadata: {'word_count': 5}
 
 Example 2:
-Question: Restore the correct order of words in the following sentence: 1 through 1.
-Answer: 1 through 1.
-Metadata: {'word_count': 3}
+Question: Restore the correct order of words in the following sentence: shall The to called be the attention of government the matter. Chinese
+Answer: The attention of the the Chinese government shall be called to matter.
+Metadata: {'word_count': 12}
 
 Example 3:
-Question: Restore the correct order of words in the following sentence: lease Smith of great of a has falls obtained Niagara. the
-Answer: Smith has obtained a lease of of the great falls Niagara.
-Metadata: {'word_count': 11}
+Question: Restore the correct order of words in the following sentence: wonderful we are the accumulators. indebted instruments those new for Jackson To
+Answer: To Jackson we are indebted for those wonderful instruments the new accumulators.
+Metadata: {'word_count': 12}
 
 ```
 
-### simple_equations {simple-equations}
+### simple_equations
 Generates simple equations with one variable to solve
 
 Default configuration:
@@ -1063,23 +1214,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Find the value of o in the equation: 84*o - 79 = 4625
-Answer: 56
-Metadata: {'equation': '84*o - 79 = 4625', 'variable': 'o'}
+Question: Solve for j: 69 - 47*j = -4020
+Answer: 87
+Metadata: {'equation': '69 - 47*j = -4020', 'variable': 'j'}
 
 Example 2:
-Question: Find the value of e in the equation: 2068*e = 198528
-Answer: 96
-Metadata: {'equation': '2068*e = 198528', 'variable': 'e'}
+Question: Solve for o: 210000*o + 98 = 840098
+Answer: 4
+Metadata: {'equation': '210000*o + 98 = 840098', 'variable': 'o'}
 
 Example 3:
-Question: Determine the value of g that satisfies: 71*g - 80 = 204
-Answer: 4
-Metadata: {'equation': '71*g - 80 = 204', 'variable': 'g'}
+Question: Find the value of a in the equation: 6930*a = 297990
+Answer: 43
+Metadata: {'equation': '6930*a = 297990', 'variable': 'a'}
 
 ```
 
-### spell_backward {spell-backward}
+### spell_backward
 Generates tasks to spell words backward
 
 Default configuration:
@@ -1092,23 +1243,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Spell this word backward (example: sun -> nus): made
-Answer: edam
-Metadata: {'word': 'made', 'word_len': 4}
+Question: Spell this word backward (example: sun -> nus): only
+Answer: ylno
+Metadata: {'word': 'only', 'word_len': 4}
 
 Example 2:
-Question: Spell this word backward (example: sun -> nus): then
-Answer: neht
-Metadata: {'word': 'then', 'word_len': 4}
+Question: Spell this word backward (example: sun -> nus): from
+Answer: morf
+Metadata: {'word': 'from', 'word_len': 4}
 
 Example 3:
-Question: Spell this word backward (example: sun -> nus): Europe
-Answer: eporuE
-Metadata: {'word': 'Europe', 'word_len': 6}
+Question: Spell this word backward (example: sun -> nus): anxiously
+Answer: ylsuoixna
+Metadata: {'word': 'anxiously', 'word_len': 9}
 
 ```
 
-### sudoku {sudoku}
+### sudoku
 Generates sudoku puzzles with configurable difficulty
 
 Default configuration:
@@ -1123,73 +1274,73 @@ Example tasks:
 ```
 Example 1:
 Question: Solve this Sudoku puzzle:
-_ _ 2 _ _ _ 6 7 _
-7 _ _ _ _ 9 _ _ _
-3 _ _ _ 8 7 4 _ _
-_ 8 4 _ 7 _ 9 _ _
-_ _ _ _ _ _ 3 _ _
-9 _ 3 1 _ _ _ 8 7
-_ 1 8 4 9 _ _ 5 3
-_ _ _ 8 5 1 2 9 4
-4 5 9 _ _ 2 _ _ 6
-Answer: 8 4 2 5 1 3 6 7 9
-7 6 5 2 4 9 1 3 8
-3 9 1 6 8 7 4 2 5
-1 8 4 3 7 5 9 6 2
-5 7 6 9 2 8 3 4 1
-9 2 3 1 6 4 5 8 7
-2 1 8 4 9 6 7 5 3
-6 3 7 8 5 1 2 9 4
-4 5 9 7 3 2 8 1 6
-Metadata: {'puzzle': [[0, 0, 2, 0, 0, 0, 6, 7, 0], [7, 0, 0, 0, 0, 9, 0, 0, 0], [3, 0, 0, 0, 8, 7, 4, 0, 0], [0, 8, 4, 0, 7, 0, 9, 0, 0], [0, 0, 0, 0, 0, 0, 3, 0, 0], [9, 0, 3, 1, 0, 0, 0, 8, 7], [0, 1, 8, 4, 9, 0, 0, 5, 3], [0, 0, 0, 8, 5, 1, 2, 9, 4], [4, 5, 9, 0, 0, 2, 0, 0, 6]], 'solution': [[8, 4, 2, 5, 1, 3, 6, 7, 9], [7, 6, 5, 2, 4, 9, 1, 3, 8], [3, 9, 1, 6, 8, 7, 4, 2, 5], [1, 8, 4, 3, 7, 5, 9, 6, 2], [5, 7, 6, 9, 2, 8, 3, 4, 1], [9, 2, 3, 1, 6, 4, 5, 8, 7], [2, 1, 8, 4, 9, 6, 7, 5, 3], [6, 3, 7, 8, 5, 1, 2, 9, 4], [4, 5, 9, 7, 3, 2, 8, 1, 6]], 'num_empty': 45}
+_ 8 _ 2 _ _ _ _ 3
+_ _ 4 _ 7 _ _ 8 9
+2 5 6 3 _ _ _ 4 7
+_ _ 8 _ 6 _ 9 5 _
+9 _ 2 7 _ 5 _ _ _
+3 6 _ _ 2 9 8 _ _
+_ 4 3 _ 5 2 7 _ _
+_ _ 1 _ _ _ 4 2 8
+6 2 _ 8 4 1 3 9 5
+Answer: 7 8 9 2 1 4 5 6 3
+1 3 4 5 7 6 2 8 9
+2 5 6 3 9 8 1 4 7
+4 7 8 1 6 3 9 5 2
+9 1 2 7 8 5 6 3 4
+3 6 5 4 2 9 8 7 1
+8 4 3 9 5 2 7 1 6
+5 9 1 6 3 7 4 2 8
+6 2 7 8 4 1 3 9 5
+Metadata: {'puzzle': [[0, 8, 0, 2, 0, 0, 0, 0, 3], [0, 0, 4, 0, 7, 0, 0, 8, 9], [2, 5, 6, 3, 0, 0, 0, 4, 7], [0, 0, 8, 0, 6, 0, 9, 5, 0], [9, 0, 2, 7, 0, 5, 0, 0, 0], [3, 6, 0, 0, 2, 9, 8, 0, 0], [0, 4, 3, 0, 5, 2, 7, 0, 0], [0, 0, 1, 0, 0, 0, 4, 2, 8], [6, 2, 0, 8, 4, 1, 3, 9, 5]], 'solution': [[7, 8, 9, 2, 1, 4, 5, 6, 3], [1, 3, 4, 5, 7, 6, 2, 8, 9], [2, 5, 6, 3, 9, 8, 1, 4, 7], [4, 7, 8, 1, 6, 3, 9, 5, 2], [9, 1, 2, 7, 8, 5, 6, 3, 4], [3, 6, 5, 4, 2, 9, 8, 7, 1], [8, 4, 3, 9, 5, 2, 7, 1, 6], [5, 9, 1, 6, 3, 7, 4, 2, 8], [6, 2, 7, 8, 4, 1, 3, 9, 5]], 'num_empty': 38}
 
 Example 2:
 Question: Solve this Sudoku puzzle:
-3 5 _ _ _ _ _ _ _
-_ 1 _ 3 _ 8 5 4 6
-7 _ 8 9 _ _ _ 3 2
-2 3 7 _ 4 _ _ 8 _
-_ _ 1 8 _ 2 3 _ 4
-_ _ 4 7 9 3 6 _ _
-8 6 _ _ _ _ 2 _ _
-_ 2 _ _ 8 7 _ _ _
-_ _ _ 6 2 _ 8 5 _
-Answer: 3 5 6 2 1 4 7 9 8
-9 1 2 3 7 8 5 4 6
-7 4 8 9 6 5 1 3 2
-2 3 7 1 4 6 9 8 5
-6 9 1 8 5 2 3 7 4
-5 8 4 7 9 3 6 2 1
-8 6 5 4 3 9 2 1 7
-1 2 3 5 8 7 4 6 9
-4 7 9 6 2 1 8 5 3
-Metadata: {'puzzle': [[3, 5, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 3, 0, 8, 5, 4, 6], [7, 0, 8, 9, 0, 0, 0, 3, 2], [2, 3, 7, 0, 4, 0, 0, 8, 0], [0, 0, 1, 8, 0, 2, 3, 0, 4], [0, 0, 4, 7, 9, 3, 6, 0, 0], [8, 6, 0, 0, 0, 0, 2, 0, 0], [0, 2, 0, 0, 8, 7, 0, 0, 0], [0, 0, 0, 6, 2, 0, 8, 5, 0]], 'solution': [[3, 5, 6, 2, 1, 4, 7, 9, 8], [9, 1, 2, 3, 7, 8, 5, 4, 6], [7, 4, 8, 9, 6, 5, 1, 3, 2], [2, 3, 7, 1, 4, 6, 9, 8, 5], [6, 9, 1, 8, 5, 2, 3, 7, 4], [5, 8, 4, 7, 9, 3, 6, 2, 1], [8, 6, 5, 4, 3, 9, 2, 1, 7], [1, 2, 3, 5, 8, 7, 4, 6, 9], [4, 7, 9, 6, 2, 1, 8, 5, 3]], 'num_empty': 43}
+5 _ _ _ 3 4 _ 6 _
+_ _ 3 _ _ _ _ _ _
+_ _ 8 5 9 _ _ _ 2
+_ 5 7 6 4 _ _ 8 _
+_ 4 6 _ _ _ _ 5 3
+_ 3 _ _ _ 5 _ _ _
+6 8 1 _ _ 9 _ _ _
+_ 9 5 _ 2 _ _ 4 _
+_ 2 _ _ 8 6 1 9 5
+Answer: 5 7 2 1 3 4 8 6 9
+9 1 3 2 6 8 5 7 4
+4 6 8 5 9 7 3 1 2
+2 5 7 6 4 3 9 8 1
+8 4 6 9 1 2 7 5 3
+1 3 9 8 7 5 4 2 6
+6 8 1 4 5 9 2 3 7
+3 9 5 7 2 1 6 4 8
+7 2 4 3 8 6 1 9 5
+Metadata: {'puzzle': [[5, 0, 0, 0, 3, 4, 0, 6, 0], [0, 0, 3, 0, 0, 0, 0, 0, 0], [0, 0, 8, 5, 9, 0, 0, 0, 2], [0, 5, 7, 6, 4, 0, 0, 8, 0], [0, 4, 6, 0, 0, 0, 0, 5, 3], [0, 3, 0, 0, 0, 5, 0, 0, 0], [6, 8, 1, 0, 0, 9, 0, 0, 0], [0, 9, 5, 0, 2, 0, 0, 4, 0], [0, 2, 0, 0, 8, 6, 1, 9, 5]], 'solution': [[5, 7, 2, 1, 3, 4, 8, 6, 9], [9, 1, 3, 2, 6, 8, 5, 7, 4], [4, 6, 8, 5, 9, 7, 3, 1, 2], [2, 5, 7, 6, 4, 3, 9, 8, 1], [8, 4, 6, 9, 1, 2, 7, 5, 3], [1, 3, 9, 8, 7, 5, 4, 2, 6], [6, 8, 1, 4, 5, 9, 2, 3, 7], [3, 9, 5, 7, 2, 1, 6, 4, 8], [7, 2, 4, 3, 8, 6, 1, 9, 5]], 'num_empty': 47}
 
 Example 3:
 Question: Solve this Sudoku puzzle:
-2 _ 1 4 _ 5 6 _ _
-_ 8 _ 6 _ 1 5 2 9
-_ _ _ _ _ 2 _ 3 _
-1 _ 4 2 _ _ _ _ 5
-_ _ _ _ 4 _ _ 6 _
-_ _ 9 _ _ _ 2 4 _
-8 _ _ 5 1 6 3 _ 7
-9 _ _ 7 _ 3 _ 1 2
-3 _ _ 9 _ 4 _ _ 6
-Answer: 2 9 1 4 3 5 6 7 8
-4 8 3 6 7 1 5 2 9
-7 5 6 8 9 2 1 3 4
-1 3 4 2 6 7 9 8 5
-5 2 8 1 4 9 7 6 3
-6 7 9 3 5 8 2 4 1
-8 4 2 5 1 6 3 9 7
-9 6 5 7 8 3 4 1 2
-3 1 7 9 2 4 8 5 6
-Metadata: {'puzzle': [[2, 0, 1, 4, 0, 5, 6, 0, 0], [0, 8, 0, 6, 0, 1, 5, 2, 9], [0, 0, 0, 0, 0, 2, 0, 3, 0], [1, 0, 4, 2, 0, 0, 0, 0, 5], [0, 0, 0, 0, 4, 0, 0, 6, 0], [0, 0, 9, 0, 0, 0, 2, 4, 0], [8, 0, 0, 5, 1, 6, 3, 0, 7], [9, 0, 0, 7, 0, 3, 0, 1, 2], [3, 0, 0, 9, 0, 4, 0, 0, 6]], 'solution': [[2, 9, 1, 4, 3, 5, 6, 7, 8], [4, 8, 3, 6, 7, 1, 5, 2, 9], [7, 5, 6, 8, 9, 2, 1, 3, 4], [1, 3, 4, 2, 6, 7, 9, 8, 5], [5, 2, 8, 1, 4, 9, 7, 6, 3], [6, 7, 9, 3, 5, 8, 2, 4, 1], [8, 4, 2, 5, 1, 6, 3, 9, 7], [9, 6, 5, 7, 8, 3, 4, 1, 2], [3, 1, 7, 9, 2, 4, 8, 5, 6]], 'num_empty': 44}
+9 8 6 _ _ _ _ _ 3
+4 _ _ _ _ _ _ 6 _
+_ _ 3 6 7 _ _ _ 8
+_ _ 9 _ _ 3 6 _ _
+_ _ _ _ _ _ 7 4 2
+_ _ _ 4 _ _ _ _ _
+_ _ 2 5 _ _ _ 1 _
+_ 3 1 _ 4 6 8 9 7
+7 9 _ 8 _ _ _ _ 6
+Answer: 9 8 6 1 2 4 5 7 3
+4 2 7 3 8 5 1 6 9
+1 5 3 6 7 9 4 2 8
+2 4 9 7 1 3 6 8 5
+3 1 5 9 6 8 7 4 2
+6 7 8 4 5 2 9 3 1
+8 6 2 5 9 7 3 1 4
+5 3 1 2 4 6 8 9 7
+7 9 4 8 3 1 2 5 6
+Metadata: {'puzzle': [[9, 8, 6, 0, 0, 0, 0, 0, 3], [4, 0, 0, 0, 0, 0, 0, 6, 0], [0, 0, 3, 6, 7, 0, 0, 0, 8], [0, 0, 9, 0, 0, 3, 6, 0, 0], [0, 0, 0, 0, 0, 0, 7, 4, 2], [0, 0, 0, 4, 0, 0, 0, 0, 0], [0, 0, 2, 5, 0, 0, 0, 1, 0], [0, 3, 1, 0, 4, 6, 8, 9, 7], [7, 9, 0, 8, 0, 0, 0, 0, 6]], 'solution': [[9, 8, 6, 1, 2, 4, 5, 7, 3], [4, 2, 7, 3, 8, 5, 1, 6, 9], [1, 5, 3, 6, 7, 9, 4, 2, 8], [2, 4, 9, 7, 1, 3, 6, 8, 5], [3, 1, 5, 9, 6, 8, 7, 4, 2], [6, 7, 8, 4, 5, 2, 9, 3, 1], [8, 6, 2, 5, 9, 7, 3, 1, 4], [5, 3, 1, 2, 4, 6, 8, 9, 7], [7, 9, 4, 8, 3, 1, 2, 5, 6]], 'num_empty': 50}
 
 ```
 
-### syllogism {syllogism}
+### syllogism
 Generates syllogism reasoning tasks
 
 Default configuration:
@@ -1209,40 +1360,40 @@ Example tasks:
 ```
 Example 1:
 Question: Consider these statements:
-1. Some programmers are cats
-2. Some ... are not cats are engineers
+1. Some humans are reptiles
+2. Some reptiles are insects
 
 Does it logically follow that:
-No programmers are engineers?
+Some ... are not humans are insects?
 (Answer Yes or No)
-Answer: Yes
-Metadata: {'premise1': 'Some programmers are cats', 'premise2': 'Some ... are not cats are engineers', 'conclusion': 'No programmers are engineers', 'is_valid': True}
+Answer: No
+Metadata: {'premise1': 'Some humans are reptiles', 'premise2': 'Some reptiles are insects', 'conclusion': 'Some ... are not humans are insects', 'is_valid': False}
 
 Example 2:
 Question: Consider these statements:
-1. All parents are cats
-2. Some cats are lawyers
+1. All mortals are teachers
+2. Some teachers are ants
 
 Does it logically follow that:
-Some ... are not parents are lawyers?
+Some ... are not mortals are ants?
 (Answer Yes or No)
 Answer: Yes
-Metadata: {'premise1': 'All parents are cats', 'premise2': 'Some cats are lawyers', 'conclusion': 'Some ... are not parents are lawyers', 'is_valid': True}
+Metadata: {'premise1': 'All mortals are teachers', 'premise2': 'Some teachers are ants', 'conclusion': 'Some ... are not mortals are ants', 'is_valid': True}
 
 Example 3:
 Question: Consider these statements:
-1. No whales are birds
-2. Some birds are teachers
+1. No mortals are whales
+2. No whales are bees
 
 Does it logically follow that:
-All whales are teachers?
+No mortals are bees?
 (Answer Yes or No)
-Answer: Yes
-Metadata: {'premise1': 'No whales are birds', 'premise2': 'Some birds are teachers', 'conclusion': 'All whales are teachers', 'is_valid': True}
+Answer: No
+Metadata: {'premise1': 'No mortals are whales', 'premise2': 'No whales are bees', 'conclusion': 'No mortals are bees', 'is_valid': False}
 
 ```
 
-### word_sequence_reversal {word-sequence-reversal}
+### word_sequence_reversal
 Generates word sequence reversal tasks from text spans
 
 Default configuration:
@@ -1256,23 +1407,23 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: Reverse this list of words: upon, bold, what, of, have
-Answer: have, of, what, bold, upon
-Metadata: {'num_words': 5, 'words': ['upon', 'bold', 'what', 'of', 'have']}
+Question: Reverse this list of words: Africa, harmless, moral
+Answer: moral, harmless, Africa
+Metadata: {'num_words': 3, 'words': ['Africa', 'harmless', 'moral']}
 
 Example 2:
-Question: Reverse this list of words: years, WILL, Gutenberg, Nevertheless
-Answer: Nevertheless, Gutenberg, WILL, years
-Metadata: {'num_words': 4, 'words': ['years', 'WILL', 'Gutenberg', 'Nevertheless']}
+Question: Reverse this list of words: efforts, well, set, these, back, Her, for
+Answer: for, Her, back, these, set, well, efforts
+Metadata: {'num_words': 7, 'words': ['efforts', 'well', 'set', 'these', 'back', 'Her', 'for']}
 
 Example 3:
-Question: Reverse this list of words: or, of, With, no
-Answer: no, With, of, or
-Metadata: {'num_words': 4, 'words': ['or', 'of', 'With', 'no']}
+Question: Reverse this list of words: fellow, compliance, few, which, in, famous, Not
+Answer: Not, famous, in, which, few, compliance, fellow
+Metadata: {'num_words': 7, 'words': ['fellow', 'compliance', 'few', 'which', 'in', 'famous', 'Not']}
 
 ```
 
-### word_sorting {word-sorting}
+### word_sorting
 Generates word sorting tasks
 
 Default configuration:
@@ -1290,20 +1441,20 @@ Example tasks:
 ```
 Example 1:
 Question: Sort these words in descending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-believe, content, How, dedicated, seasons
-Answer: seasons, dedicated, content, believe, How
-Metadata: {'original_words': ['believe', 'content', 'How', 'dedicated', 'seasons'], 'transformed_words': ['believe', 'content', 'How', 'dedicated', 'seasons'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['seasons', 'dedicated', 'content', 'believe', 'How']}
+prepare, provide, speak, surplus, after, unlink, change, 000
+Answer: unlink, surplus, speak, provide, prepare, change, after, 000
+Metadata: {'original_words': ['prepare', 'provide', 'speak', 'surplus', 'after', 'unlink', 'change', '000'], 'transformed_words': ['prepare', 'provide', 'speak', 'surplus', 'after', 'unlink', 'change', '000'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['unlink', 'surplus', 'speak', 'provide', 'prepare', 'change', 'after', '000']}
 
 Example 2:
-Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-owing, acute, included
-Answer: acute, included, owing
-Metadata: {'original_words': ['owing', 'acute', 'included'], 'transformed_words': ['owing', 'acute', 'included'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['acute', 'included', 'owing']}
+Question: Sort these words in descending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
+501, differences, Thus, cupola, longer, remaining, mummy, Paris, DISTRIBUTE
+Answer: remaining, mummy, longer, differences, cupola, Thus, Paris, DISTRIBUTE, 501
+Metadata: {'original_words': ['501', 'differences', 'Thus', 'cupola', 'longer', 'remaining', 'mummy', 'Paris', 'DISTRIBUTE'], 'transformed_words': ['501', 'differences', 'Thus', 'cupola', 'longer', 'remaining', 'mummy', 'Paris', 'DISTRIBUTE'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['remaining', 'mummy', 'longer', 'differences', 'cupola', 'Thus', 'Paris', 'DISTRIBUTE', '501']}
 
 Example 3:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-WARRANTY, tell, territory, Reckon, downloading
-Answer: Reckon, WARRANTY, downloading, tell, territory
-Metadata: {'original_words': ['WARRANTY', 'tell', 'territory', 'Reckon', 'downloading'], 'transformed_words': ['WARRANTY', 'tell', 'territory', 'Reckon', 'downloading'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['Reckon', 'WARRANTY', 'downloading', 'tell', 'territory']}
+discontinue, access, office, luminous, distributing
+Answer: access, discontinue, distributing, luminous, office
+Metadata: {'original_words': ['discontinue', 'access', 'office', 'luminous', 'distributing'], 'transformed_words': ['discontinue', 'access', 'office', 'luminous', 'distributing'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['access', 'discontinue', 'distributing', 'luminous', 'office']}
 
 ```
diff --git a/pyproject.toml b/pyproject.toml
index 3a546f8c..42e66005 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,8 +14,8 @@ requires-python = ">=3.11"
 dependencies = [
   "bfi==1.0.4",
   "cellpylib==2.4.0",
-  "sympy>=1.13.1", 
-  "magiccube==0.3.0", 
+  "sympy>=1.13.1",
+  "magiccube==0.3.0",
   "pyfiglet==1.0.2"
 ]
 classifiers = [
diff --git a/reasoning_gym/code/__init__.py b/reasoning_gym/code/__init__.py
index 680653c6..d250ad6e 100644
--- a/reasoning_gym/code/__init__.py
+++ b/reasoning_gym/code/__init__.py
@@ -7,7 +7,4 @@ Cognition tasks for training reasoning capabilities:
 
 from .bf import BFConfig, BFDataset
 
-__all__ = [
-    "BFConfig",
-    "BFDataset"
-]
+__all__ = ["BFConfig", "BFDataset"]
diff --git a/reasoning_gym/code/bf.py b/reasoning_gym/code/bf.py
index 0e47948c..c2697203 100644
--- a/reasoning_gym/code/bf.py
+++ b/reasoning_gym/code/bf.py
@@ -3,10 +3,10 @@ from random import Random
 from typing import Dict, Optional
 
 import bfi
-from .contrib.bfit.Compiler import Compiler, Minify
 
 from ..data.wordle_words import wordle_words
 from ..factory import ProceduralDataset, register_dataset
+from .contrib.bfit.Compiler import Compiler, Minify
 
 
 @dataclass
@@ -122,10 +122,11 @@ int main() {{
 
         if answer == None:
             return 0.0
-        if answer != entry['answer']:
+        if answer != entry["answer"]:
             return 0.01
         else:
-            return 1.0 # Yay
+            return 1.0  # Yay
+
 
 # Register the dataset
 register_dataset("bf", BFDataset, BFConfig)
diff --git a/reasoning_gym/code/contrib/bfit/BF-it.py b/reasoning_gym/code/contrib/bfit/BF-it.py
index 46545a29..ccdb3fc0 100644
--- a/reasoning_gym/code/contrib/bfit/BF-it.py
+++ b/reasoning_gym/code/contrib/bfit/BF-it.py
@@ -2,9 +2,9 @@
 
 import argparse
 import os
+
 import Interpreter
-from Compiler import Compiler
-from Compiler import Minify
+from Compiler import Compiler, Minify
 
 
 def process_args():
@@ -54,5 +54,5 @@ def compile_file():
         Interpreter.brainfuck(brainfuck_code)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     compile_file()
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py b/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py
index 276fae88..e1f60258 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Compiler.py
@@ -1,12 +1,18 @@
 #!/usr/bin/env python3
-from .Exceptions import BFSyntaxError, BFSemanticError
+from .Exceptions import BFSemanticError, BFSyntaxError
 from .FunctionCompiler import FunctionCompiler
 from .Functions import check_function_exists, get_function_object, insert_function_object
-from .General import is_token_literal, get_literal_token_code, unpack_literal_tokens_to_array_dimensions
-from .Globals import get_global_variables_size, get_variable_size, get_variable_dimensions, insert_global_variable, create_variable_from_definition
+from .General import get_literal_token_code, is_token_literal, unpack_literal_tokens_to_array_dimensions
+from .Globals import (
+    create_variable_from_definition,
+    get_global_variables_size,
+    get_variable_dimensions,
+    get_variable_size,
+    insert_global_variable,
+)
 from .Lexical_analyzer import analyze
-from .Optimizer import optimize
 from .LibraryFunctionCompiler import insert_library_functions
+from .Optimizer import optimize
 from .Parser import Parser
 from .Token import Token
 
@@ -29,20 +35,24 @@ class Compiler:
         # returns function named tuple
 
         if self.parser.current_token().type not in [Token.VOID, Token.INT]:
-            raise BFSemanticError("Function return type can be either void or int, not '%s'" % str(self.parser.current_token()))
+            raise BFSemanticError(
+                "Function return type can be either void or int, not '%s'" % str(self.parser.current_token())
+            )
 
         self.parser.check_next_tokens_are([Token.ID, Token.LPAREN])
 
         # save all tokens of this function
         function_name = self.parser.next_token(next_amount=1).data
-        RPAREN_index = self.parser.find_matching(starting_index=self.parser.current_token_index+2)  # first find RPAREN
+        RPAREN_index = self.parser.find_matching(
+            starting_index=self.parser.current_token_index + 2
+        )  # first find RPAREN
         self.parser.check_next_token_is(Token.LBRACE, starting_index=RPAREN_index)
-        RBRACE_index = self.parser.find_matching(starting_index=RPAREN_index+1)  # then find RBRACE
+        RBRACE_index = self.parser.find_matching(starting_index=RPAREN_index + 1)  # then find RBRACE
 
         # take all tokens between INT and RBRACE and pass them to function object
-        function_tokens = self.parser.tokens[self.parser.current_token_index:RBRACE_index+1]
+        function_tokens = self.parser.tokens[self.parser.current_token_index : RBRACE_index + 1]
         # skip function definition
-        self.parser.advance_to_token_at_index(RBRACE_index+1)
+        self.parser.advance_to_token_at_index(RBRACE_index + 1)
 
         function = FunctionCompiler(function_name, function_tokens)
         return function
@@ -60,12 +70,12 @@ class Compiler:
         # if this is set to True, then the compiler zeros each cell before using it (may generate a lot of unnecessary BF code)
         ZERO_CELLS_BEFORE_USE = False
 
-        code = '[-]' if ZERO_CELLS_BEFORE_USE else ''
+        code = "[-]" if ZERO_CELLS_BEFORE_USE else ""
         if get_variable_size(variable) > 1:  # its an array
             if self.parser.current_token().type == Token.SEMICOLON:
                 # array definition - INT ID (LBRACK NUM RBRACK)+ SEMICOLON
                 self.parser.advance_token()  # skip SEMICOLON
-                code = (code + '>') * get_variable_size(variable)  # advance to after this variable
+                code = (code + ">") * get_variable_size(variable)  # advance to after this variable
                 return code
             elif self.parser.current_token().type == Token.ASSIGN and self.parser.current_token().data == "=":
                 # array definition and initialization - INT ID (LBRACK NUM RBRACK)+ ASSIGN ((LBRACE ... RBRACE)+|STRING) SEMICOLON
@@ -79,25 +89,34 @@ class Compiler:
                 self.parser.advance_token()  # skip SEMICOLON
 
                 array_dimensions = get_variable_dimensions(variable)
-                unpacked_literals_list = unpack_literal_tokens_to_array_dimensions(ID_token, array_dimensions, literal_tokens_list)
+                unpacked_literals_list = unpack_literal_tokens_to_array_dimensions(
+                    ID_token, array_dimensions, literal_tokens_list
+                )
 
                 for literal in unpacked_literals_list:
                     code += get_literal_token_code(literal)  # evaluate this literal and point to next array element
                 return code
             else:
-                raise BFSyntaxError("Unexpected %s in array definition. Expected SEMICOLON (;) or ASSIGN (=)" % self.parser.current_token())
+                raise BFSyntaxError(
+                    "Unexpected %s in array definition. Expected SEMICOLON (;) or ASSIGN (=)"
+                    % self.parser.current_token()
+                )
 
         elif self.parser.current_token().type == Token.SEMICOLON:  # no need to initialize
             self.parser.advance_token()  # skip SEMICOLON
-            code += '>'  # advance to after this variable
+            code += ">"  # advance to after this variable
         else:
             self.parser.check_current_token_is(Token.ASSIGN)
             if self.parser.current_token().data != "=":
-                raise BFSyntaxError("Unexpected %s when initializing global variable. Expected ASSIGN (=)" % self.parser.current_token())
+                raise BFSyntaxError(
+                    "Unexpected %s when initializing global variable. Expected ASSIGN (=)" % self.parser.current_token()
+                )
             self.parser.advance_token()  # skip ASSIGN
 
             if not is_token_literal(self.parser.current_token()):
-                raise BFSemanticError("Unexpected '%s'. expected literal (NUM | CHAR | TRUE | FALSE )" % str(self.parser.current_token()))
+                raise BFSemanticError(
+                    "Unexpected '%s'. expected literal (NUM | CHAR | TRUE | FALSE )" % str(self.parser.current_token())
+                )
 
             code += get_literal_token_code(self.parser.current_token())
 
@@ -113,7 +132,7 @@ class Compiler:
         When encountering global variable definition - create Variable object
         Returns code that initializes global variables and advances the pointer to after them
         """
-        code = ''
+        code = ""
         token = self.parser.current_token()
         while token is not None and token.type in [Token.VOID, Token.INT, Token.SEMICOLON]:
             if token.type == Token.SEMICOLON:  # can have random semicolons ;)
@@ -125,22 +144,31 @@ class Compiler:
             if self.parser.next_token(next_amount=2).type == Token.LPAREN:
                 function = self.create_function_object()
                 insert_function_object(function)
-            elif token.type is Token.INT and self.parser.next_token(next_amount=2).type in [Token.SEMICOLON, Token.ASSIGN, Token.LBRACK]:
+            elif token.type is Token.INT and self.parser.next_token(next_amount=2).type in [
+                Token.SEMICOLON,
+                Token.ASSIGN,
+                Token.LBRACK,
+            ]:
                 code += self.compile_global_variable_definition()
             else:
-                raise BFSyntaxError("Unexpected '%s' after '%s'. Expected '(' (function definition) or one of: '=', ';', '[' (global variable definition)" % (str(self.parser.next_token(next_amount=2)), str(self.parser.next_token())))
+                raise BFSyntaxError(
+                    "Unexpected '%s' after '%s'. Expected '(' (function definition) or one of: '=', ';', '[' (global variable definition)"
+                    % (str(self.parser.next_token(next_amount=2)), str(self.parser.next_token()))
+                )
 
             token = self.parser.current_token()
 
         if self.parser.current_token() is not None:  # we have not reached the last token
-            untouched_tokens = [str(t) for t in self.parser.tokens[self.parser.current_token_index:]]
+            untouched_tokens = [str(t) for t in self.parser.tokens[self.parser.current_token_index :]]
             raise BFSyntaxError("Did not reach the end of the code. Untouched tokens:\n%s" % untouched_tokens)
 
         return code
 
     def compile(self):
         insert_library_functions()
-        code = self.process_global_definitions()  # code that initializes global variables and advances pointer to after them
+        code = (
+            self.process_global_definitions()
+        )  # code that initializes global variables and advances pointer to after them
 
         check_function_exists(Token(Token.ID, 0, 0, "main"), 0)
         code += get_function_object("main").get_code(get_global_variables_size())
@@ -159,7 +187,7 @@ def compile(code, optimize_code=False):
     return brainfuck_code
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     print("This file cannot be directly run")
     print("Please import it and use the 'compile' function")
     print("Which receives a C-like code (string) and returns Brainfuck code (string)")
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py b/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py
index 7eaa9877..1603ac64 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/FunctionCompiler.py
@@ -1,11 +1,28 @@
 from collections import namedtuple
 from functools import reduce
-from .Exceptions import BFSyntaxError, BFSemanticError
+
+from .Exceptions import BFSemanticError, BFSyntaxError
 from .Functions import check_function_exists, get_function_object
-from .General import get_variable_dimensions_from_token, get_move_to_return_value_cell_code, get_print_string_code, get_variable_from_ID_token
-from .General import get_literal_token_value, process_switch_cases, is_token_literal
+from .General import (
+    get_literal_token_value,
+    get_move_to_return_value_cell_code,
+    get_print_string_code,
+    get_variable_dimensions_from_token,
+    get_variable_from_ID_token,
+    is_token_literal,
+    process_switch_cases,
+)
 from .Globals import create_variable_from_definition, get_global_variables, get_variable_size, is_variable_array
-from .Node import NodeToken, NodeTernary, NodeArraySetElement, NodeUnaryPrefix, NodeUnaryPostfix, NodeArrayGetElement, NodeFunctionCall, NodeArrayAssignment
+from .Node import (
+    NodeArrayAssignment,
+    NodeArrayGetElement,
+    NodeArraySetElement,
+    NodeFunctionCall,
+    NodeTernary,
+    NodeToken,
+    NodeUnaryPostfix,
+    NodeUnaryPrefix,
+)
 from .Parser import Parser
 from .Token import Token
 
@@ -83,7 +100,9 @@ class FunctionCompiler:
         # new stack pointer should be at least that size
         assert self.current_stack_pointer() <= current_stack_pointer
         self.return_value_cell = current_stack_pointer
-        self.set_stack_pointer(current_stack_pointer+1)  # make room for return_value cell. next available cell is the next one after it.
+        self.set_stack_pointer(
+            current_stack_pointer + 1
+        )  # make room for return_value cell. next available cell is the next one after it.
         function_code = self.compile_function_scope(self.parameters)
         self.remove_ids_map()  # Global variables
         return function_code
@@ -123,8 +142,12 @@ class FunctionCompiler:
 
             # multiply by next dimensions sizes
             multiply_amount = reduce(lambda x, y: x * y, dimensions[1:])  # size of the following dimensions
-            node_token_multiply_amount = NodeToken(self.ids_map_list, token=Token(Token.NUM, ID_token.line, ID_token.column, data=str(multiply_amount)))
-            index_expression = NodeToken(self.ids_map_list, token=multiply_token, left=first_index_expression, right=node_token_multiply_amount)
+            node_token_multiply_amount = NodeToken(
+                self.ids_map_list, token=Token(Token.NUM, ID_token.line, ID_token.column, data=str(multiply_amount))
+            )
+            index_expression = NodeToken(
+                self.ids_map_list, token=multiply_token, left=first_index_expression, right=node_token_multiply_amount
+            )
 
             # handle next dimensions
             dimension = 1
@@ -132,8 +155,10 @@ class FunctionCompiler:
                 if self.parser.current_token().type != Token.LBRACK:  # too few indexes given...
                     if dimension == 1:
                         return first_index_expression  # allow use of only one dimension for multi-dimensional array
-                    raise BFSemanticError("%s is a %s-dimensional array, but only %s dimension(s) given as index" %
-                                          (str(ID_token), len(dimensions), dimension))
+                    raise BFSemanticError(
+                        "%s is a %s-dimensional array, but only %s dimension(s) given as index"
+                        % (str(ID_token), len(dimensions), dimension)
+                    )
                 self.parser.check_current_token_is(Token.LBRACK)
                 self.parser.advance_token()  # skip LBRACK
                 exp = self.expression()
@@ -143,19 +168,30 @@ class FunctionCompiler:
 
                 # current_dimension_index *= size_of_following_dimensions
                 if dimension + 1 < len(dimensions):  # not last dimension - need to multiply and add
-                    multiply_amount = reduce(lambda x, y: x * y, dimensions[dimension + 1:])  # size of the following dimensions
-                    node_token_multiply_amount = NodeToken(self.ids_map_list, token=Token(Token.NUM, ID_token.line, ID_token.column, data=str(multiply_amount)))
-                    multiply_node = NodeToken(self.ids_map_list, token=multiply_token, left=exp, right=node_token_multiply_amount)
+                    multiply_amount = reduce(
+                        lambda x, y: x * y, dimensions[dimension + 1 :]
+                    )  # size of the following dimensions
+                    node_token_multiply_amount = NodeToken(
+                        self.ids_map_list,
+                        token=Token(Token.NUM, ID_token.line, ID_token.column, data=str(multiply_amount)),
+                    )
+                    multiply_node = NodeToken(
+                        self.ids_map_list, token=multiply_token, left=exp, right=node_token_multiply_amount
+                    )
 
                     # prev_dimensions_index += current_dimension_index
-                    index_expression = NodeToken(self.ids_map_list, token=add_token, left=index_expression, right=multiply_node)
+                    index_expression = NodeToken(
+                        self.ids_map_list, token=add_token, left=index_expression, right=multiply_node
+                    )
                 else:  # last dimension - no need to multiply, just add
                     index_expression = NodeToken(self.ids_map_list, token=add_token, left=index_expression, right=exp)
                 dimension += 1
 
         if self.parser.current_token().type == Token.LBRACK:  # too many indexes given...
-            raise BFSemanticError("%s is a %s-dimensional array. Unexpected %s" %
-                                  (str(ID_token), len(dimensions), self.parser.current_token()))
+            raise BFSemanticError(
+                "%s is a %s-dimensional array. Unexpected %s"
+                % (str(ID_token), len(dimensions), self.parser.current_token())
+            )
         return index_expression
 
     def get_token_after_array_access(self, offset=0):
@@ -193,12 +229,18 @@ class FunctionCompiler:
 
         if self.parser.next_token().type == Token.SEMICOLON:  # INT ID SEMICOLON
             self.parser.advance_token(2)  # skip ID SEMICOLON
-            return ''  # no code is generated here. code was generated for defining this variable when we entered the scope
+            return (
+                ""  # no code is generated here. code was generated for defining this variable when we entered the scope
+            )
 
-        elif self.parser.next_token().type == Token.ASSIGN and self.parser.next_token().data == "=":  # INT ID = EXPRESSION SEMICOLON
+        elif (
+            self.parser.next_token().type == Token.ASSIGN and self.parser.next_token().data == "="
+        ):  # INT ID = EXPRESSION SEMICOLON
             return self.compile_expression_as_statement()  # compile_expression_as_statement skips the SEMICOLON
 
-        elif self.parser.next_token().type == Token.LBRACK:  # INT ID (LBRACK NUM RBRACK)+ (= ARRAY_INITIALIZATION)? SEMICOLON
+        elif (
+            self.parser.next_token().type == Token.LBRACK
+        ):  # INT ID (LBRACK NUM RBRACK)+ (= ARRAY_INITIALIZATION)? SEMICOLON
             # array definition (int arr[2][3]...[];) or array definition and initialization (arr[2][3]...[] = {...};)
             token_id = self.parser.current_token()
             self.parser.advance_token()  # skip ID
@@ -210,7 +252,7 @@ class FunctionCompiler:
                 initialization_node = self.compile_array_assignment(token_id)
                 code = initialization_node.get_code(self.current_stack_pointer()) + "<"  # discard expression value
             else:
-                code = ''  # just array definition
+                code = ""  # just array definition
                 # no code is generated here. code was generated for defining this variable when we entered the scope
             self.parser.check_current_token_is(Token.SEMICOLON)
             self.parser.advance_token()  # skip SEMICOLON
@@ -297,7 +339,9 @@ class FunctionCompiler:
             token = self.tokens[i]
 
             if token.type == Token.INT:
-                if self.tokens[i-2].type != Token.FOR:  # if it is not a definition inside a FOR statement (for (int i = 0...))
+                if (
+                    self.tokens[i - 2].type != Token.FOR
+                ):  # if it is not a definition inside a FOR statement (for (int i = 0...))
                     variable = create_variable_from_definition(self.parser, index=i)
                     self.insert_to_ids_map(variable)
 
@@ -333,7 +377,7 @@ class FunctionCompiler:
         for parameter in parameters:
             self.insert_to_ids_map(parameter)
 
-        code = '>'  # skip return_value_cell
+        code = ">"  # skip return_value_cell
         code += self.insert_scope_variables_into_ids_map()
         # this inserts scope variables AND moves pointer right, with the amount of BOTH parameters and scope variables
 
@@ -377,7 +421,9 @@ class FunctionCompiler:
         if token.type == Token.ID and self.parser.next_token().type == Token.LPAREN:
             return self.function_call()
 
-        if token.type == Token.ID and self.parser.next_token().type == Token.LBRACK:  # array - ID(LBRACK expression RBRACK)+
+        if (
+            token.type == Token.ID and self.parser.next_token().type == Token.LBRACK
+        ):  # array - ID(LBRACK expression RBRACK)+
             index_expression = self.get_array_index_expression()
             return NodeArrayGetElement(self.ids_map_list, token, index_expression)
 
@@ -386,7 +432,10 @@ class FunctionCompiler:
             return NodeToken(self.ids_map_list, token=token)
 
         if token.type != Token.LPAREN:
-            raise BFSyntaxError("Unexpected '%s'. expected literal (NUM | ID | ID(LBRACK expression RBRACK)+ | TRUE | FALSE | function_call | ( expression ))" % str(token))
+            raise BFSyntaxError(
+                "Unexpected '%s'. expected literal (NUM | ID | ID(LBRACK expression RBRACK)+ | TRUE | FALSE | function_call | ( expression ))"
+                % str(token)
+            )
 
         # ( expression )
         self.parser.check_current_token_is(Token.LPAREN)
@@ -417,7 +466,9 @@ class FunctionCompiler:
 
         if token.type in [Token.NOT, Token.BITWISE_NOT, Token.BINOP]:
             if token.type == Token.BINOP and token.data not in ["+", "-"]:
-                    raise BFSyntaxError("Expected either + or - as unary prefix instead of token %s" % self.parser.current_token())
+                raise BFSyntaxError(
+                    "Expected either + or - as unary prefix instead of token %s" % self.parser.current_token()
+                )
             self.parser.advance_token()
             unary_prefix = self.unary_prefix()
 
@@ -618,11 +669,19 @@ class FunctionCompiler:
 
             expression_node = self.expression()
 
-            new_node = NodeToken(self.ids_map_list, left=NodeToken(self.ids_map_list, token=id_token), token=assign_token, right=expression_node)
+            new_node = NodeToken(
+                self.ids_map_list,
+                left=NodeToken(self.ids_map_list, token=id_token),
+                token=assign_token,
+                right=expression_node,
+            )
             return new_node
 
-        elif self.parser.current_token().type == Token.ID and self.parser.next_token().type == Token.LBRACK and \
-                self.get_token_after_array_access().type == Token.ASSIGN:
+        elif (
+            self.parser.current_token().type == Token.ID
+            and self.parser.next_token().type == Token.LBRACK
+            and self.get_token_after_array_access().type == Token.ASSIGN
+        ):
             # ID (LBRACK expression RBRACK)+ ASSIGN value_expression
             id_token = self.parser.current_token()
             index_expression = self.get_array_index_expression()
@@ -744,7 +803,7 @@ class FunctionCompiler:
         if self.parser.current_token().type == Token.SEMICOLON:
             # return;
             self.parser.advance_token()  # skip ;
-            return ''  # nothing to do
+            return ""  # nothing to do
 
         # return exp;
         expression_code = self.compile_expression()
@@ -763,7 +822,12 @@ class FunctionCompiler:
         # this expression can be used as a statement.
         # e.g: x+=5;  or  x++ or ++x;
 
-        assert self.parser.current_token().type in [Token.ID, Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]
+        assert self.parser.current_token().type in [
+            Token.ID,
+            Token.INCREMENT,
+            Token.DECREMENT,
+            Token.UNARY_MULTIPLICATIVE,
+        ]
 
         code = self.compile_expression()
         self.parser.check_current_token_is(Token.SEMICOLON)
@@ -901,7 +965,10 @@ class FunctionCompiler:
         self.increase_stack_pointer()  # use 1 additional temp cell for indicating we need to execute a case
         cases = list()  # list of tuples: (value/"default" (int or string), case_code (string), has_break(bool))
 
-        while self.parser.current_token().type in [Token.CASE, Token.DEFAULT]:  # (default | CASE literal) COLON statement* break;? statements*
+        while self.parser.current_token().type in [
+            Token.CASE,
+            Token.DEFAULT,
+        ]:  # (default | CASE literal) COLON statement* break;? statements*
             if self.parser.current_token().type == Token.CASE:
                 self.parser.advance_token()  # skip CASE
                 constant_value_token = self.parser.current_token()
@@ -922,7 +989,9 @@ class FunctionCompiler:
 
             inner_case_code = ""
             while self.parser.current_token().type not in [Token.CASE, Token.DEFAULT, Token.RBRACE, Token.BREAK]:
-                inner_case_code += self.compile_statement(allow_declaration=False)  # not allowed to declare variables directly inside case
+                inner_case_code += self.compile_statement(
+                    allow_declaration=False
+                )  # not allowed to declare variables directly inside case
 
             has_break = False
             if self.parser.current_token().type == Token.BREAK:  # ignore all statements after break
@@ -934,7 +1003,9 @@ class FunctionCompiler:
             cases.append((value, inner_case_code, has_break))
 
         if self.parser.current_token().type not in [Token.CASE, Token.DEFAULT, Token.RBRACE]:
-            raise BFSyntaxError("Expected case / default / RBRACE (}) instead of token %s" % self.parser.current_token())
+            raise BFSyntaxError(
+                "Expected case / default / RBRACE (}) instead of token %s" % self.parser.current_token()
+            )
         self.parser.check_current_token_is(Token.RBRACE)
         self.parser.advance_token()
         self.decrease_stack_pointer(amount=2)
@@ -943,7 +1014,10 @@ class FunctionCompiler:
 
     def compile_break(self):
         # TODO: Make the break statement in scopes inside switch-case (including if/else), and for/do/while
-        raise NotImplementedError("Break statement found outside of switch case first scope.\nBreak is not currently implemented for while/for/do statements.\nToken is %s" % self.parser.current_token())
+        raise NotImplementedError(
+            "Break statement found outside of switch case first scope.\nBreak is not currently implemented for while/for/do statements.\nToken is %s"
+            % self.parser.current_token()
+        )
 
     def compile_for(self):
         # for (statement expression; expression) inner_scope_code   note: statement contains ;, and inner_scope_code can be scope { }
@@ -951,17 +1025,17 @@ class FunctionCompiler:
         # (the statement cannot contain scope - { and } )
 
         """
-            <for> is a special case of scope
-            the initial code (int i = 0;) is executed INSIDE the scope, but BEFORE the LBRACE
-            so we manually compile the scope instead of using self.compile_scope():
+        <for> is a special case of scope
+        the initial code (int i = 0;) is executed INSIDE the scope, but BEFORE the LBRACE
+        so we manually compile the scope instead of using self.compile_scope():
 
-            we first create an ids map, and in the case that there is a variable definition inside the <for> definition:
-            we manually insert the ID into the ids map, and move the pointer to the right once, to make room for it
-            (this needs to be done before the <for> definition's statement)
-            next, inside the for's scope {}:
-            after calling insert_scope_variables_into_ids_map, we move the pointer to the left once, since it counts the ID we entered manually as well
-            after calling exit_scope, we move the pointer to the right, since it counts the ID we entered manually, and we don't want it to be discarded after every iteration
-            finally, at the end of the <for> loop, we move the pointer once to the left, to discard the variable we defined manually
+        we first create an ids map, and in the case that there is a variable definition inside the <for> definition:
+        we manually insert the ID into the ids map, and move the pointer to the right once, to make room for it
+        (this needs to be done before the <for> definition's statement)
+        next, inside the for's scope {}:
+        after calling insert_scope_variables_into_ids_map, we move the pointer to the left once, since it counts the ID we entered manually as well
+        after calling exit_scope, we move the pointer to the right, since it counts the ID we entered manually, and we don't want it to be discarded after every iteration
+        finally, at the end of the <for> loop, we move the pointer once to the left, to discard the variable we defined manually
         """
 
         self.parser.check_current_tokens_are([Token.FOR, Token.LPAREN])
@@ -969,7 +1043,7 @@ class FunctionCompiler:
 
         manually_inserted_variable_in_for_definition = False
         variable = None
-        code = ''
+        code = ""
 
         # =============== enter FOR scope ===============
         self.add_ids_map()
@@ -987,7 +1061,10 @@ class FunctionCompiler:
                 show_side_effect_warning = self.get_token_after_array_access(offset=1).type != Token.ASSIGN
 
             if show_side_effect_warning:
-                print("[Warning] For loop variable '%s' isn't assigned to anything and may cause side effects" % self.parser.next_token())
+                print(
+                    "[Warning] For loop variable '%s' isn't assigned to anything and may cause side effects"
+                    % self.parser.next_token()
+                )
 
         if self.parser.current_token().type == Token.LBRACE:  # statement is a scope
             raise BFSyntaxError("Unexpected scope inside for loop statement - %s" % self.parser.current_token())
@@ -1042,20 +1119,31 @@ class FunctionCompiler:
         token = self.parser.current_token()
         if token.type == Token.INT:  # INT ID ((= EXPRESSION) | ([NUM])+ (= ARRAY_INITIALIZATION)?)? SEMICOLON
             if not allow_declaration:
-                raise BFSemanticError("Cannot define variable (%s) directly inside case. "
-                                      "Can define inside new scope {} or outside the switch statement" % token)
+                raise BFSemanticError(
+                    "Cannot define variable (%s) directly inside case. "
+                    "Can define inside new scope {} or outside the switch statement" % token
+                )
             return self.compile_variable_declaration()
 
         elif token.type in [Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]:  # ++ID;
             return self.compile_expression_as_statement()
 
         elif token.type == Token.ID:
-            if self.parser.next_token().type in [Token.ASSIGN, Token.LBRACK, Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]:
+            if self.parser.next_token().type in [
+                Token.ASSIGN,
+                Token.LBRACK,
+                Token.INCREMENT,
+                Token.DECREMENT,
+                Token.UNARY_MULTIPLICATIVE,
+            ]:
                 # ID ASSIGN expression; or ID([expression])+ ASSIGN expression; or ID++;
                 return self.compile_expression_as_statement()
             elif self.parser.next_token().type == Token.LPAREN:  # ID(...);  (function call)
                 return self.compile_function_call_statement()
-            raise BFSyntaxError("Unexpected '%s' after '%s'. Expected '=|+=|-=|*=|/=|%%=|<<=|>>=|&=|(|=)|^=' (assignment), '++|--' (modification) or '(' (function call)" % (str(self.parser.next_token()), str(token)))
+            raise BFSyntaxError(
+                "Unexpected '%s' after '%s'. Expected '=|+=|-=|*=|/=|%%=|<<=|>>=|&=|(|=)|^=' (assignment), '++|--' (modification) or '(' (function call)"
+                % (str(self.parser.next_token()), str(token))
+            )
 
         elif token.type == Token.PRINT:
             return self.compile_print_string()
@@ -1097,7 +1185,7 @@ class FunctionCompiler:
     def compile_scope_statements(self):
         tokens = self.tokens
 
-        code = ''
+        code = ""
         while self.parser.current_token() is not None:
             if self.parser.current_token().type == Token.RBRACE:
                 # we reached the end of our scope
@@ -1124,29 +1212,29 @@ class FunctionCompiler:
         # will be inserted into the new scope prior to the scope's compilation
 
         """
-            example layout:
-                int global_var1;
-                int global_var2;
-                int foo(int a, int b) {
-                    int x;
-                    int y;
-                    return 5;
-                }
+        example layout:
+            int global_var1;
+            int global_var2;
+            int foo(int a, int b) {
+                int x;
+                int y;
+                return 5;
+            }
 
-                int main() {
-                    int n;
-                    foo(1, 2);
-                }
+            int main() {
+                int n;
+                foo(1, 2);
+            }
 
-                global_var1 global_var2 main_return_value n foo_return_value a=1 b=2 x y
+            global_var1 global_var2 main_return_value n foo_return_value a=1 b=2 x y
 
-                calling convention:
-                caller responsibility: make room for return_value (and zero its cell), place parameters, point to return_value cell
-                callee responsibility: put return value in return_value cell and point to it (thus "cleaning" parameters)
-                    can assume that there is a zeroed cell at current_stack_pointer (return_value_cell) (therefore ids_map starts at index current_stack_pointer+1)
-                    can assume that the next cells match your parameters
-                    assumes that initially, the pointer points to the first cell (return_value_cell).
-                    therefore begin with '>' * (1 + parameters + scope variables)
+            calling convention:
+            caller responsibility: make room for return_value (and zero its cell), place parameters, point to return_value cell
+            callee responsibility: put return value in return_value cell and point to it (thus "cleaning" parameters)
+                can assume that there is a zeroed cell at current_stack_pointer (return_value_cell) (therefore ids_map starts at index current_stack_pointer+1)
+                can assume that the next cells match your parameters
+                assumes that initially, the pointer points to the first cell (return_value_cell).
+                therefore begin with '>' * (1 + parameters + scope variables)
         """
 
         assert self.parser.current_token().type == Token.LBRACE
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Functions.py b/reasoning_gym/code/contrib/bfit/Compiler/Functions.py
index 837e3339..a4ed4b9a 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Functions.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Functions.py
@@ -1,4 +1,5 @@
 from copy import deepcopy
+
 from .Exceptions import BFSemanticError
 
 functions = dict()  # Global dictionary of function_name --> FunctionCompiler objects
@@ -30,4 +31,7 @@ def check_function_exists(function_token, parameters_amount):
 
     function = functions[function_name]
     if len(function.parameters) != parameters_amount:
-        raise BFSemanticError("Function '%s' has %s parameters (called it with %s parameters)" % (str(function_token), len(function.parameters), parameters_amount))
+        raise BFSemanticError(
+            "Function '%s' has %s parameters (called it with %s parameters)"
+            % (str(function_token), len(function.parameters), parameters_amount)
+        )
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/General.py b/reasoning_gym/code/contrib/bfit/Compiler/General.py
index 2a182b8a..6abab5cf 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/General.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/General.py
@@ -1,7 +1,8 @@
-from .Exceptions import BFSyntaxError, BFSemanticError
-from .Token import Token
 from functools import reduce
 
+from .Exceptions import BFSemanticError, BFSyntaxError
+from .Token import Token
+
 """
 This file holds functions that generate general Brainfuck code
 And general functions that are not dependent on other objects
@@ -126,23 +127,29 @@ def unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, array_d
     if len(array_dimensions) == 0:
         raise BFSemanticError("Tried to initialize array %s with too many nested sub-arrays" % ID_token)
     if len(literal_tokens_list) > array_dimensions[0]:
-        raise BFSemanticError("Tried to initialize array %s dimension %s with too many elements (%s)"
-                              % (ID_token, str(array_dimensions), str(len(literal_tokens_list))))
+        raise BFSemanticError(
+            "Tried to initialize array %s dimension %s with too many elements (%s)"
+            % (ID_token, str(array_dimensions), str(len(literal_tokens_list)))
+        )
 
     result = []
     for element in literal_tokens_list:
         if isinstance(element, list):
             # recursively unpack the list with the sub-dimension of the sub-array
             # E.g if we have arr[3][3][3] and then this call will fill [3][3]=9 elements
-            result.extend(unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, array_dimensions[1:], element))
+            result.extend(
+                unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, array_dimensions[1:], element)
+            )
         else:
             result.append(element)
             if len(array_dimensions) > 1:
                 dimension_size = dimensions_to_size(array_dimensions[1:])  # current size we need to fill
-                result.extend([Token(Token.NUM, 0, 0, "0")] * (dimension_size - 1))  # fill missing elements in this dimension with zeros
+                result.extend(
+                    [Token(Token.NUM, 0, 0, "0")] * (dimension_size - 1)
+                )  # fill missing elements in this dimension with zeros
 
     dimension_size = dimensions_to_size(array_dimensions)  # current size we need to fill
-    result.extend([Token(Token.NUM, 0, 0, "0")] * (dimension_size-len(result)))  # fill the result with zeros
+    result.extend([Token(Token.NUM, 0, 0, "0")] * (dimension_size - len(result)))  # fill the result with zeros
     return result
 
 
@@ -157,13 +164,20 @@ def unpack_literal_tokens_to_array_dimensions(ID_token, array_dimensions, litera
     if all(not isinstance(element, list) for element in literal_tokens_list):
         # special case - if all elements are literals, then we allow assigning them as-is and not care about dimensions
         # E.g if we have arr[3][3][3] = {1,2,3,4} then return [1,2,3,4,0,0,0,0,0]
-        unpacked_literals_list = literal_tokens_list + [Token(Token.NUM, 0, 0, "0")] * (array_size - len(literal_tokens_list))  # fill missing with zeros
+        unpacked_literals_list = literal_tokens_list + [Token(Token.NUM, 0, 0, "0")] * (
+            array_size - len(literal_tokens_list)
+        )  # fill missing with zeros
     else:
-        unpacked_literals_list = unpack_multidimensional_literal_tokens_to_array_dimensions(ID_token, array_dimensions, literal_tokens_list)
+        unpacked_literals_list = unpack_multidimensional_literal_tokens_to_array_dimensions(
+            ID_token, array_dimensions, literal_tokens_list
+        )
 
     if len(unpacked_literals_list) > array_size:
-        raise BFSemanticError("Tried to initialize array %s with incompatible amount of literals."
-                              " (array size is %s and literals size is %s)" % (ID_token, str(array_size), str(len(unpacked_literals_list))))
+        raise BFSemanticError(
+            "Tried to initialize array %s with incompatible amount of literals."
+            " (array size is %s and literals size is %s)"
+            % (ID_token, str(array_size), str(len(unpacked_literals_list)))
+        )
     assert len(unpacked_literals_list) == array_size
     return unpacked_literals_list
 
@@ -208,17 +222,19 @@ def process_switch_cases(expression_code, cases):
     code += "<"  # point to expression
 
     if all_cases_have_break:  # small optimization for evaluating the expression
-        cases = [case for case in cases if case[0] != "default"]  # remove default to be able to sort. it is handled differently
+        cases = [
+            case for case in cases if case[0] != "default"
+        ]  # remove default to be able to sort. it is handled differently
         cases.sort(key=lambda x: x[0], reverse=True)  # Can sort since correct flow is not needed
 
     """
         This loop compares the expression value to each case in the switch-case statement, in reverse order
         It does so by increasing and decreasing expression, and comparing result to 0
-        E.G. if we have 
+        E.G. if we have
             switch(x) {
                 case 2:
                 case 0:
-                case 5: 
+                case 5:
                 case 1:
             }
         x will be put in <expression> cell, then:
@@ -244,7 +260,7 @@ def process_switch_cases(expression_code, cases):
     <need_to_execute=1>
     <compare_with_1>    [
     <compare_with_5>        [
-    <compare_with_0>            [ 
+    <compare_with_0>            [
     <compare_with_2>                [
                                         <default_code> <expression_value=0> <need_to_execute=0>
                                     ]   <if need_to_execute> <code_for_2> <need_to_execute=0>
@@ -487,22 +503,22 @@ def get_bitwise_code(code_logic):
     code += "<<"  # point to a
 
     code += "["  # while a != 0:
-    code +=     "-"  # a -= 1
-    code +=     ">>-"  # c -= 1
-    code +=     "[>+>>+<<<-]>[<+>-]"  # copy c to y (using w)
-    code +=     ">>"  # point to y
-    code +=     ">>+<<"  # bit1 += 1
+    code += "-"  # a -= 1
+    code += ">>-"  # c -= 1
+    code += "[>+>>+<<<-]>[<+>-]"  # copy c to y (using w)
+    code += ">>"  # point to y
+    code += ">>+<<"  # bit1 += 1
 
-    code +=     "-["  # if y != 1:
-    code +=         "<+"  # x += 1
-    code +=         "<<++"  # c += 2 (c was 0)
-    code +=         ">" * 5  # point to bit1
-    code +=         "--"  # bit1 -= 2 (bit1 was 2)
-    code +=         "<<"  # point to y
-    code +=         "+"  # set y to 0
-    code +=     "]"  # end if
+    code += "-["  # if y != 1:
+    code += "<+"  # x += 1
+    code += "<<++"  # c += 2 (c was 0)
+    code += ">" * 5  # point to bit1
+    code += "--"  # bit1 -= 2 (bit1 was 2)
+    code += "<<"  # point to y
+    code += "+"  # set y to 0
+    code += "]"  # end if
 
-    code +=     "<<<<<"  # point to a
+    code += "<<<<<"  # point to a
     code += "]"  # end while
 
     code += ">>>>[<<<<+>>>>-]"  # move x to a (x is a/2)
@@ -510,21 +526,21 @@ def get_bitwise_code(code_logic):
     code += "<"  # point to b
 
     code += "["  # while b != 0:
-    code +=     "-"  # b -= 1
-    code +=     ">-"  # c -= 1
-    code +=     "[>+>>+<<<-]>[<+>-]"  # copy c to y (using w)
-    code +=     ">>"  # point to y
-    code +=     ">+<"  # z += 1
+    code += "-"  # b -= 1
+    code += ">-"  # c -= 1
+    code += "[>+>>+<<<-]>[<+>-]"  # copy c to y (using w)
+    code += ">>"  # point to y
+    code += ">+<"  # z += 1
 
-    code +=     "-["  # if y != 1:
-    code +=         ">--<"  # z -= 2 (z was 2)
-    code +=         "<+"  # x += 1
-    code +=         "<<++"  # c += 2 (c was 0)
-    code +=         ">>>"  # point to y
-    code +=         "+"  # set y to 0
-    code +=     "]"
+    code += "-["  # if y != 1:
+    code += ">--<"  # z -= 2 (z was 2)
+    code += "<+"  # x += 1
+    code += "<<++"  # c += 2 (c was 0)
+    code += ">>>"  # point to y
+    code += "+"  # set y to 0
+    code += "]"
 
-    code +=     "<<<<"  # point to b
+    code += "<<<<"  # point to b
     code += "]"  # end while
 
     # w is a % 2
@@ -658,14 +674,14 @@ def get_unary_prefix_op_code(token, offset_to_variable=None):
         assert token.data in ["+", "-"]
         if token.data == "+":
             # keep value as-is
-            return '>'
+            return ">"
         elif token.data == "-":
             # a temp
-            code = ">[-]" # zero temp
-            code += "<" # point to a
-            code += "[->-<]" # sub a from temp
-            code += ">" # point to temp
-            code += "[<+>-]" # copy temp to a
+            code = ">[-]"  # zero temp
+            code += "<"  # point to a
+            code += "[->-<]"  # sub a from temp
+            code += ">"  # point to temp
+            code += "[<+>-]"  # copy temp to a
             return code
     raise NotImplementedError
 
@@ -1127,7 +1143,6 @@ def get_op_boolean_operator_code(node, current_pointer):
     raise NotImplementedError
 
 
-
 def get_print_string_code(string):
     code = "[-]"  # zero the current cell
     code += ">[-]"  # zero the next cell (will be used for loop counts)
@@ -1200,6 +1215,7 @@ def get_move_left_index_cell_code():
 #     General
 # =================
 
+
 def get_literal_token_value(token):
     # known at compilation time
     assert is_token_literal(token)
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Globals.py b/reasoning_gym/code/contrib/bfit/Compiler/Globals.py
index 5c37c59e..0eaaac80 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Globals.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Globals.py
@@ -1,6 +1,7 @@
 from collections import namedtuple
-from .Token import Token
+
 from .General import dimensions_to_size, get_NUM_token_value
+from .Token import Token
 
 """
 This file holds the program's functions and global variables
@@ -55,7 +56,7 @@ def create_variable_from_definition(parser, index=None, advance_tokens=False):
     if index is None, then assumes we start at the current_token_index
     if advance_tokens is True, then modifies current_token_index accordingly using parser.advance_token()
     """
-    
+
     if index is None:
         index = parser.current_token_index
 
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py b/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
index 1c3e5e0a..091bbcf7 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
@@ -1,6 +1,7 @@
 import re
-from .Token import Token
+
 from .Optimizer import optimize
+from .Token import Token
 
 
 class LexicalErrorException(Exception):
@@ -14,64 +15,59 @@ def analyze(text):
     """
 
     rules = [
-        ('\s+', Token.WHITESPACE),
-        ('void',    Token.VOID),
-        ('int',     Token.INT),
-        ('bool', Token.INT),  # treat bool as int
-        ('char', Token.INT),  # treat char as int
-
-        ('true', Token.TRUE),
-        ('false', Token.FALSE),
-        ('&&', Token.AND),
-        ('\|\|', Token.OR),
-        ('\!', Token.NOT),
-        ('return', Token.RETURN),
-        ('if', Token.IF),
-        ('else', Token.ELSE),
-        ('while', Token.WHILE),
-        ('for', Token.FOR),
-        ('do', Token.DO),
-        ('print', Token.PRINT),
-        ('switch', Token.SWITCH),
-        ('case', Token.CASE),
-        ('default', Token.DEFAULT),
-        ('break', Token.BREAK),
-        ('continue', Token.CONTINUE),  # todo
-        (':', Token.COLON),
-        (';', Token.SEMICOLON),
-        (',', Token.COMMA),
-
-        ('\(', Token.LPAREN),
-        ('\)', Token.RPAREN),
-        ('\{', Token.LBRACE),
-        ('\}', Token.RBRACE),
-        ('\[', Token.LBRACK),
-        ('\]', Token.RBRACK),
-        ('=|\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=', Token.ASSIGN),
-        ('\?', Token.TERNARY),
-
-        ('<=|>=|==|!=|<|>', Token.RELOP),
-        ('\+\+', Token.INCREMENT),
-        ('--', Token.DECREMENT),
-        ('\+|-|\*|/|%', Token.BINOP),
-        ('\*\*|//|%%', Token.UNARY_MULTIPLICATIVE),
-
-        ('<<|>>', Token.BITWISE_SHIFT),
-        ('~', Token.BITWISE_NOT),
-        ('&', Token.BITWISE_AND),
-        ('\|', Token.BITWISE_OR),
-        ('\^', Token.BITWISE_XOR),
-
-        ('([a-zA-Z_][a-zA-Z0-9_]*)',    Token.ID),
-        ('(\d+)',     Token.NUM),
-        ('(0x[A-Fa-f\d]+)',     Token.NUM),  # hexadecimal number
-        ('(0o[0-7]+)',     Token.NUM),  # octal number
-        ('(0b[01]+)',     Token.NUM),  # binary number
-        (r'\"(\\\"|[^"])*"',   Token.STRING),
-        (r'\'(\\\'|(\\)?[^\'])\'', Token.CHAR),
-        ('//.*(\\n|$)', Token.COMMENT),
-        (r'/\*[\s\S]*?\*/', Token.COMMENT),  # multiline comments
-        ('.',       Token.UNIDENTIFIED)
+        (r"\s+", Token.WHITESPACE),
+        ("void", Token.VOID),
+        ("int", Token.INT),
+        ("bool", Token.INT),  # treat bool as int
+        ("char", Token.INT),  # treat char as int
+        ("true", Token.TRUE),
+        ("false", Token.FALSE),
+        ("&&", Token.AND),
+        (r"\|\|", Token.OR),
+        (r"\!", Token.NOT),
+        ("return", Token.RETURN),
+        ("if", Token.IF),
+        ("else", Token.ELSE),
+        ("while", Token.WHILE),
+        ("for", Token.FOR),
+        ("do", Token.DO),
+        ("print", Token.PRINT),
+        ("switch", Token.SWITCH),
+        ("case", Token.CASE),
+        ("default", Token.DEFAULT),
+        ("break", Token.BREAK),
+        ("continue", Token.CONTINUE),  # todo
+        (":", Token.COLON),
+        (";", Token.SEMICOLON),
+        (",", Token.COMMA),
+        (r"\(", Token.LPAREN),
+        (r"\)", Token.RPAREN),
+        (r"\{", Token.LBRACE),
+        (r"\}", Token.RBRACE),
+        (r"\[", Token.LBRACK),
+        (r"\]", Token.RBRACK),
+        (r"=|\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=", Token.ASSIGN),
+        (r"\?", Token.TERNARY),
+        (r"<=|>=|==|!=|<|>", Token.RELOP),
+        (r"\+\+", Token.INCREMENT),
+        ("--", Token.DECREMENT),
+        (r"\+|-|\*|/|%", Token.BINOP),
+        (r"\*\*|//|%%", Token.UNARY_MULTIPLICATIVE),
+        ("<<|>>", Token.BITWISE_SHIFT),
+        ("~", Token.BITWISE_NOT),
+        ("&", Token.BITWISE_AND),
+        (r"\|", Token.BITWISE_OR),
+        (r"\^", Token.BITWISE_XOR),
+        ("([a-zA-Z_][a-zA-Z0-9_]*)", Token.ID),
+        (r"(\d+)", Token.NUM),
+        (r"(0x[A-Fa-f\d]+)", Token.NUM),  # hexadecimal number
+        ("(0o[0-7]+)", Token.NUM),  # octal number
+        ("(0b[01]+)", Token.NUM),  # binary number
+        (r'\"(\\\"|[^"])*"', Token.STRING),
+        (r"\'(\\\'|(\\)?[^\'])\'", Token.CHAR),
+        ("//.*(\\n|$)", Token.COMMENT),
+        (r"/\*[\s\S]*?\*/", Token.COMMENT),  # multiline comments
+        (".", Token.UNIDENTIFIED),
     ]
 
     rules = [(re.compile(r), t) for r, t in rules]
@@ -79,7 +75,7 @@ def analyze(text):
     tokens = []
 
     # create a mapping of [line number] to [offset of that line from the beginning of the text]
-    newline = re.compile('\n')
+    newline = re.compile("\n")
     lines = [0] + [m.end() for m in re.finditer(newline, text)]
 
     i = 0
@@ -99,12 +95,12 @@ def analyze(text):
 
         # calculate line and column
         line, column = None, None
-        for line_idx in range(len(lines)-1):
-            if lines[line_idx] <= longest_match.start() < lines[line_idx+1]:
-                line, column = line_idx+1, (longest_match.start() - lines[line_idx])+1  # humans count from 1 :)
+        for line_idx in range(len(lines) - 1):
+            if lines[line_idx] <= longest_match.start() < lines[line_idx + 1]:
+                line, column = line_idx + 1, (longest_match.start() - lines[line_idx]) + 1  # humans count from 1 :)
                 break
         if not line:
-            line, column = len(lines), (longest_match.start() - lines[-1])+1
+            line, column = len(lines), (longest_match.start() - lines[-1]) + 1
 
         if matched_token in [Token.COMMENT, Token.WHITESPACE]:
             pass  # do nothing
@@ -112,8 +108,18 @@ def analyze(text):
             raise LexicalErrorException("Unidentified Character '%s' (line %s column %s)" % (text[i], line, column))
         elif matched_token in [Token.STRING, Token.CHAR]:
             # remove quotes at beginning and end, un-escape characters
-            tokens.append(Token(matched_token, line, column, longest_match.group()[1:-1].encode("utf8").decode("unicode_escape")))
-        elif matched_token in [Token.NUM, Token.ID, Token.BINOP, Token.RELOP, Token.ASSIGN, Token.UNARY_MULTIPLICATIVE, Token.BITWISE_SHIFT]:
+            tokens.append(
+                Token(matched_token, line, column, longest_match.group()[1:-1].encode("utf8").decode("unicode_escape"))
+            )
+        elif matched_token in [
+            Token.NUM,
+            Token.ID,
+            Token.BINOP,
+            Token.RELOP,
+            Token.ASSIGN,
+            Token.UNARY_MULTIPLICATIVE,
+            Token.BITWISE_SHIFT,
+        ]:
             tokens.append(Token(matched_token, line, column, longest_match.group()))
         else:
             tokens.append(Token(matched_token, line, column))
@@ -128,16 +134,40 @@ def tests():
         text = "my international int ; int; pints; international;"
         res = analyze(text)
 
-        expected = [Token.ID, Token.ID, Token.INT, Token.SEMICOLON, Token.INT, Token.SEMICOLON, Token.ID,
-                    Token.SEMICOLON, Token.ID, Token.SEMICOLON]
+        expected = [
+            Token.ID,
+            Token.ID,
+            Token.INT,
+            Token.SEMICOLON,
+            Token.INT,
+            Token.SEMICOLON,
+            Token.ID,
+            Token.SEMICOLON,
+            Token.ID,
+            Token.SEMICOLON,
+        ]
         assert len(res) == len(expected) and all(res[i].type == expected[i] for i in range(len(res)))
 
     def test2():
         text = "true !||!false falsek  k||y+-a&&x"
         res = analyze(text)
 
-        expected = [Token.TRUE, Token.NOT, Token.OR, Token.NOT, Token.FALSE, Token.ID, Token.ID, Token.OR, Token.ID,
-                    Token.BINOP, Token.BINOP, Token.ID, Token.AND, Token.ID]
+        expected = [
+            Token.TRUE,
+            Token.NOT,
+            Token.OR,
+            Token.NOT,
+            Token.FALSE,
+            Token.ID,
+            Token.ID,
+            Token.OR,
+            Token.ID,
+            Token.BINOP,
+            Token.BINOP,
+            Token.ID,
+            Token.AND,
+            Token.ID,
+        ]
         assert len(res) == len(expected) and all(res[i].type == expected[i] for i in range(len(res)))
 
     def test3():
@@ -166,9 +196,29 @@ def tests():
         # test all arithmetic operations
         text = "(1+2*3/6)+(1%3)*(6-1)"
         tokens = analyze(text)
-        expected = [Token.LPAREN, Token.NUM, Token.BINOP, Token.NUM, Token.BINOP, Token.NUM, Token.BINOP, Token.NUM,
-                    Token.RPAREN, Token.BINOP, Token.LPAREN, Token.NUM, Token.BINOP, Token.NUM, Token.RPAREN,
-                    Token.BINOP, Token.LPAREN, Token.NUM, Token.BINOP, Token.NUM, Token.RPAREN]
+        expected = [
+            Token.LPAREN,
+            Token.NUM,
+            Token.BINOP,
+            Token.NUM,
+            Token.BINOP,
+            Token.NUM,
+            Token.BINOP,
+            Token.NUM,
+            Token.RPAREN,
+            Token.BINOP,
+            Token.LPAREN,
+            Token.NUM,
+            Token.BINOP,
+            Token.NUM,
+            Token.RPAREN,
+            Token.BINOP,
+            Token.LPAREN,
+            Token.NUM,
+            Token.BINOP,
+            Token.NUM,
+            Token.RPAREN,
+        ]
         assert len(tokens) == len(expected) and all(tokens[i].type == expected[i] for i in range(len(tokens)))
         optimize(tokens)
         assert tokens[1].data == "2" and tokens[5].data == "1" and tokens[9].data == "5"
@@ -179,5 +229,5 @@ def tests():
     test3()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     tests()
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py b/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py
index 5b6567d6..feb2497a 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/LibraryFunctionCompiler.py
@@ -48,7 +48,9 @@ def get_readint_code():
     code += ">"  # point to tmp
     code += "[<++++++++++>-]"  # res = tmp * 10, tmp = 0
     code += ">"  # point to input
-    code += "-" * (0x30 - 10)  # convert character to a digit by subtracting 0x30 from it (we already subtracted 10 before)
+    code += "-" * (
+        0x30 - 10
+    )  # convert character to a digit by subtracting 0x30 from it (we already subtracted 10 before)
     code += "[<<+>>-]"  # res += input
     code += "]"  # end if
 
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Node.py b/reasoning_gym/code/contrib/bfit/Compiler/Node.py
index fadeeff8..581ace92 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Node.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Node.py
@@ -1,10 +1,20 @@
 from .Exceptions import BFSemanticError
-from .General import get_copy_from_variable_code, get_copy_to_variable_code
-from .General import get_move_left_index_cell_code, get_move_right_index_cells_code
-from .General import get_offset_to_variable, get_variable_dimensions_from_token
-from .General import get_op_between_literals_code, get_literal_token_code, get_token_ID_code
-from .General import get_unary_prefix_op_code, get_unary_postfix_op_code, is_token_literal
-from .General import unpack_literal_tokens_to_array_dimensions, get_op_boolean_operator_code
+from .General import (
+    get_copy_from_variable_code,
+    get_copy_to_variable_code,
+    get_literal_token_code,
+    get_move_left_index_cell_code,
+    get_move_right_index_cells_code,
+    get_offset_to_variable,
+    get_op_between_literals_code,
+    get_op_boolean_operator_code,
+    get_token_ID_code,
+    get_unary_postfix_op_code,
+    get_unary_prefix_op_code,
+    get_variable_dimensions_from_token,
+    is_token_literal,
+    unpack_literal_tokens_to_array_dimensions,
+)
 from .Token import Token
 
 """
@@ -60,7 +70,14 @@ class NodeToken(Node):
             else:
                 return get_literal_token_code(self.token)
 
-        elif self.token.type in [Token.BINOP, Token.RELOP, Token.BITWISE_SHIFT, Token.BITWISE_AND, Token.BITWISE_OR, Token.BITWISE_XOR]:
+        elif self.token.type in [
+            Token.BINOP,
+            Token.RELOP,
+            Token.BITWISE_SHIFT,
+            Token.BITWISE_AND,
+            Token.BITWISE_OR,
+            Token.BITWISE_XOR,
+        ]:
             code = self.left.get_code(current_pointer)
             code += self.right.get_code(current_pointer + 1)
             code += "<<"  # point to the first operand
@@ -78,7 +95,7 @@ class NodeToken(Node):
         elif self.token.type == Token.ASSIGN:
             assert self.left.token.type == Token.ID
 
-            if self.token.data == '=':
+            if self.token.data == "=":
                 # id = expression
                 code = self.right.get_code(current_pointer)
 
@@ -119,7 +136,7 @@ class NodeTernary(Node):
         code = ">"  # point to bool_evaluate_node_false
         code += "[-]+"  # bool_evaluate_node_false=1
         code += ">"  # point to condition
-        code += self.condition.get_code(current_pointer+2)  # evaluate condition
+        code += self.condition.get_code(current_pointer + 2)  # evaluate condition
         code += "<"  # point to condition
 
         code += "["  # if condition is non-zero
@@ -150,7 +167,14 @@ class NodeUnaryPrefix(Node):
 
     def get_code(self, current_pointer, *args, **kwargs):
         # unary prefix (!x or ++x or ~x or -x)
-        assert self.token_operation.type in [Token.NOT, Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE, Token.BITWISE_NOT, Token.BINOP]
+        assert self.token_operation.type in [
+            Token.NOT,
+            Token.INCREMENT,
+            Token.DECREMENT,
+            Token.UNARY_MULTIPLICATIVE,
+            Token.BITWISE_NOT,
+            Token.BINOP,
+        ]
 
         if self.token_operation.type in [Token.NOT, Token.BITWISE_NOT, Token.BINOP]:
             code = self.node_literal.get_code(current_pointer)
@@ -178,10 +202,15 @@ class NodeUnaryPrefix(Node):
 
             # the token to apply on must be an ID
             if isinstance(self.node_literal, NodeToken) is False:
-                raise BFSemanticError("Prefix operator %s can only be applied to a variable" % str(self.token_operation))
+                raise BFSemanticError(
+                    "Prefix operator %s can only be applied to a variable" % str(self.token_operation)
+                )
 
             if self.node_literal.token.type != Token.ID:
-                raise BFSemanticError("Prefix operator %s cannot be applied to %s, but only to a variable" % (str(self.token_operation), str(self.node_literal.token)))
+                raise BFSemanticError(
+                    "Prefix operator %s cannot be applied to %s, but only to a variable"
+                    % (str(self.token_operation), str(self.node_literal.token))
+                )
 
             offset_to_ID = get_offset_to_variable(self.ids_map_list, self.node_literal.token, current_pointer)
             return get_unary_prefix_op_code(self.token_operation, offset_to_ID)
@@ -218,7 +247,10 @@ class NodeUnaryPostfix(Node):
             raise BFSemanticError("Postfix operator %s can only be applied to a variable" % str(self.token_operation))
 
         if self.node_literal.token.type != Token.ID:
-            raise BFSemanticError("Postfix operator %s cannot be applied to %s, but only to a variable" % (str(self.token_operation), str(self.node_literal.token)))
+            raise BFSemanticError(
+                "Postfix operator %s cannot be applied to %s, but only to a variable"
+                % (str(self.token_operation), str(self.node_literal.token))
+            )
 
         offset_to_ID = get_offset_to_variable(self.ids_map_list, self.node_literal.token, current_pointer)
         return get_unary_postfix_op_code(self.token_operation, offset_to_ID)
@@ -227,27 +259,31 @@ class NodeUnaryPostfix(Node):
 class NodeFunctionCall(Node):
     def __init__(self, ids_map_list, function_to_call, parameters):
         """
-            receives a FunctionCompiler object
-                that implements get_code() which gets a stack pointer and returns code
-            receives a list of parameters - Node objects
-                each one gets a stack pointer and returns code that evaluates the parameter
+        receives a FunctionCompiler object
+            that implements get_code() which gets a stack pointer and returns code
+        receives a list of parameters - Node objects
+            each one gets a stack pointer and returns code that evaluates the parameter
         """
         Node.__init__(self, ids_map_list)
         self.function_to_call = function_to_call
         self.parameters = parameters
 
     def get_code(self, current_pointer, *args, **kwargs):
-        code = '[-]>'  # return_value_cell=0
+        code = "[-]>"  # return_value_cell=0
 
         # evaluate parameters from left to right, and put them on the "stack" in that order
         # after each parameter code, the pointer points to the next available cell (one after the parameter)
         for i, parameter in enumerate(self.parameters):
-            code += parameter.get_code(current_pointer+1+i)  # evaluate each parameter at its cell offset (starting at one after return_value_cell)
+            code += parameter.get_code(
+                current_pointer + 1 + i
+            )  # evaluate each parameter at its cell offset (starting at one after return_value_cell)
 
         # at this point we point to one after the last parameter
         code += "<" * len(self.parameters)  # point back to first parameter
         code += "<"  # point to return_value_cell
-        code += self.function_to_call.get_code(current_stack_pointer=current_pointer)  # after this we point to return value cell
+        code += self.function_to_call.get_code(
+            current_stack_pointer=current_pointer
+        )  # after this we point to return value cell
         code += ">"  # point to next available cell (one after return value)
         return code
 
@@ -377,9 +413,10 @@ class NodeArraySetElement(NodeArrayElement):
 
 class NodeArrayAssignment(Node):
     """
-        Used for array assignment
-        E.g arr = = { 1, 2, 3... }
+    Used for array assignment
+    E.g arr = = { 1, 2, 3... }
     """
+
     def __init__(self, ids_map_list, token_id, literal_tokens_list):
         Node.__init__(self, ids_map_list)
         self.token_id = token_id
@@ -387,7 +424,9 @@ class NodeArrayAssignment(Node):
 
     def get_code(self, current_pointer, *args, **kwargs):
         array_dimensions = get_variable_dimensions_from_token(self.ids_map_list, self.token_id)
-        unpacked_literals_list = unpack_literal_tokens_to_array_dimensions(self.token_id, array_dimensions, self.literal_tokens_list)
+        unpacked_literals_list = unpack_literal_tokens_to_array_dimensions(
+            self.token_id, array_dimensions, self.literal_tokens_list
+        )
 
         offset = get_offset_to_variable(self.ids_map_list, self.token_id, current_pointer)
         code = "<" * offset  # point to first array element
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py b/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py
index c2bc5413..992bfe49 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Optimizer.py
@@ -15,9 +15,13 @@ def optimize_once(tokens):
         # optimize arithmetic operations. E.g replace 1+2 with 3
 
         # need to be careful not to optimize (1+2*3) to (3*3)
-        if tokens[start_index+1].data in ["*", "/", "%"] or (start_index+3 >= len(tokens)) or (tokens[start_index+3].data not in ["*", "/", "%"]):
-            num1, num2 = get_NUM_token_value(tokens[start_index]), get_NUM_token_value(tokens[start_index+2])
-            op = tokens[start_index+1].data
+        if (
+            tokens[start_index + 1].data in ["*", "/", "%"]
+            or (start_index + 3 >= len(tokens))
+            or (tokens[start_index + 3].data not in ["*", "/", "%"])
+        ):
+            num1, num2 = get_NUM_token_value(tokens[start_index]), get_NUM_token_value(tokens[start_index + 2])
+            op = tokens[start_index + 1].data
             if op == "+":
                 val = num1 + num2
             elif op == "-":
@@ -38,8 +42,13 @@ def optimize_once(tokens):
                 raise NotImplementedError(op)
 
             # remove the 3 old tokens and replace them with new one
-            new_token = Token(Token.NUM, tokens[start_index].line, tokens[start_index].column, data=str(val),
-                              original_tokens=tokens[start_index:start_index+3])
+            new_token = Token(
+                Token.NUM,
+                tokens[start_index].line,
+                tokens[start_index].column,
+                data=str(val),
+                original_tokens=tokens[start_index : start_index + 3],
+            )
 
             for _ in range(3):
                 tokens.pop(start_index)
@@ -52,16 +61,24 @@ def optimize_once(tokens):
         # replace printint(50) with print("50")
         # since printing strings compiles into less Brainfuck code than printing ints
         if tokens[start_index].data == "printint":
-            tokens[start_index] = Token(Token.PRINT, tokens[start_index].line, tokens[start_index].column, original_tokens=[tokens[start_index]])
-            tokens[start_index+2] = Token(Token.STRING, tokens[start_index].line, tokens[start_index].column,
-                                          data=str(tokens[start_index+2].data), original_tokens=[tokens[start_index+2]])
+            tokens[start_index] = Token(
+                Token.PRINT, tokens[start_index].line, tokens[start_index].column, original_tokens=[tokens[start_index]]
+            )
+            tokens[start_index + 2] = Token(
+                Token.STRING,
+                tokens[start_index].line,
+                tokens[start_index].column,
+                data=str(tokens[start_index + 2].data),
+                original_tokens=[tokens[start_index + 2]],
+            )
             return True
 
         return False
 
-    rules = [([Token.NUM, Token.BINOP, Token.NUM], optimize_binop),  # arithmetic operations
-             ([Token.ID, Token.LPAREN, Token.NUM, Token.RPAREN], optimize_printint),  # printint(50) to print("50")
-             ]
+    rules = [
+        ([Token.NUM, Token.BINOP, Token.NUM], optimize_binop),  # arithmetic operations
+        ([Token.ID, Token.LPAREN, Token.NUM, Token.RPAREN], optimize_printint),  # printint(50) to print("50")
+    ]
 
     # try to match one of the rules to the tokens in a "sliding window" style
     i = 0
@@ -69,7 +86,7 @@ def optimize_once(tokens):
         optimized = False
         for tokens_sequence, optimization_function in rules:
             if i + len(tokens_sequence) <= len(tokens):
-                if all(tokens_sequence[n] == tokens[i+n].type for n in range(len(tokens_sequence))):
+                if all(tokens_sequence[n] == tokens[i + n].type for n in range(len(tokens_sequence))):
                     if optimization_function(tokens, i):
                         optimized = True
         if optimized:
@@ -82,7 +99,7 @@ def optimize(tokens):
     prev_tokens = [token.type for token in tokens]
     while True:
         optimize_once(tokens)
-        print(".", end='')
+        print(".", end="")
         current_tokens = [token.type for token in tokens]
         if current_tokens == prev_tokens:
             break
diff --git a/reasoning_gym/code/contrib/bfit/Compiler/Parser.py b/reasoning_gym/code/contrib/bfit/Compiler/Parser.py
index a658e04a..900ae41d 100644
--- a/reasoning_gym/code/contrib/bfit/Compiler/Parser.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Parser.py
@@ -1,12 +1,13 @@
-from .Exceptions import BFSyntaxError, BFSemanticError
-from .Token import Token
+from .Exceptions import BFSemanticError, BFSyntaxError
 from .General import is_token_literal
+from .Token import Token
 
 
 class Parser:
     """
     Used to easily iterate tokens
     """
+
     def __init__(self, tokens):
         self.tokens = tokens
         self.current_token_index = 0
@@ -80,7 +81,10 @@ class Parser:
             raise BFSyntaxError("Expected %s after %s" % (str(tokens_list), str(self.tokens[starting_index])))
         for i in range(0, len(tokens_list)):
             if self.tokens[starting_index + 1 + i].type != tokens_list[i]:
-                raise BFSyntaxError("Expected %s after %s" % (str(tokens_list[i]), [str(t) for t in self.tokens[starting_index: starting_index+1+i]]))
+                raise BFSyntaxError(
+                    "Expected %s after %s"
+                    % (str(tokens_list[i]), [str(t) for t in self.tokens[starting_index : starting_index + 1 + i]])
+                )
 
     def check_next_token_is(self, token, starting_index=None):
         self.check_next_tokens_are([token], starting_index=starting_index)
diff --git a/reasoning_gym/code/contrib/bfit/Interpreter.py b/reasoning_gym/code/contrib/bfit/Interpreter.py
index 02e0520f..9281772b 100644
--- a/reasoning_gym/code/contrib/bfit/Interpreter.py
+++ b/reasoning_gym/code/contrib/bfit/Interpreter.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
-import sys
 import argparse
+import sys
 
 
 def create_jumps_dictionary(program):
@@ -9,9 +9,9 @@ def create_jumps_dictionary(program):
     res = dict()
 
     for index, command in enumerate(program):
-        if command == '[':
+        if command == "[":
             lbraces.append(index)
-        elif command == ']':
+        elif command == "]":
             if len(lbraces) == 0:
                 raise SyntaxError("Brainfuck: mismatched parentheses (at index: %s)" % index)
 
@@ -35,26 +35,26 @@ def brainfuck(program, bits=8):
     while instruction_pointer < len(program):
         command = program[instruction_pointer]
 
-        if command == '>':
+        if command == ">":
             data_pointer += 1
-        elif command == '<':
+        elif command == "<":
             data_pointer -= 1
-        elif command == '+':
-            data[data_pointer] = (data.get(data_pointer, 0) + 1)
-            if data[data_pointer] == 2 ** bits:
+        elif command == "+":
+            data[data_pointer] = data.get(data_pointer, 0) + 1
+            if data[data_pointer] == 2**bits:
                 data[data_pointer] = 0
-        elif command == '-':
-            data[data_pointer] = (data.get(data_pointer, 0) - 1)
+        elif command == "-":
+            data[data_pointer] = data.get(data_pointer, 0) - 1
             if data[data_pointer] == -1:
-                data[data_pointer] = 2 ** bits - 1
-        elif command == ',':
+                data[data_pointer] = 2**bits - 1
+        elif command == ",":
             data[data_pointer] = ord(sys.stdin.read(1)) % 256
-        elif command == '.':
-            print(chr(data.get(data_pointer, 0)), end='', flush=True)
-        elif command == '[':
+        elif command == ".":
+            print(chr(data.get(data_pointer, 0)), end="", flush=True)
+        elif command == "[":
             if data.get(data_pointer, 0) == 0:
                 instruction_pointer = jumps[instruction_pointer]
-        elif command == ']':
+        elif command == "]":
             if data.get(data_pointer, 0) != 0:
                 instruction_pointer = jumps[instruction_pointer]
         else:  # everything else is comment
@@ -63,16 +63,19 @@ def brainfuck(program, bits=8):
         instruction_pointer += 1
 
     if data_pointer != 0:
-        print("WARNING (interpreter) - at the end of the execution the data pointer is %s instead of 0 (possibly a compiler issue)" % str(data_pointer))
+        print(
+            "WARNING (interpreter) - at the end of the execution the data pointer is %s instead of 0 (possibly a compiler issue)"
+            % str(data_pointer)
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("filepath")
     parser.add_argument("--bits", "-b", "--interpreter-bits", type=int, default=8, help="Amount of bits each cell uses")
 
     args = parser.parse_args()
-    with open(args.filepath, 'r') as f:
+    with open(args.filepath, "r") as f:
         code = f.read()
 
     brainfuck(code, args.bits)
diff --git a/reasoning_gym/code/contrib/bfit/README.md b/reasoning_gym/code/contrib/bfit/README.md
index 4ad60e98..2b503c05 100644
--- a/reasoning_gym/code/contrib/bfit/README.md
+++ b/reasoning_gym/code/contrib/bfit/README.md
@@ -57,7 +57,7 @@ int main()
 $ ./BF-it.py helloworld.code
 Compiling file 'helloworld.code'...
 Compiled successfully to 'helloworld.bf'
-$ cat helloworld.bf 
+$ cat helloworld.bf
 >[-]>[-]<>++++++++[-<+++++++++>]<.>++++[-<+++++++>]
 <+.+++++++..+++.>++++++[-<------------->]<-.>+++++[
 -<+++++++++++>]<.>++++[-<++++++>]<.+++.------.-----
@@ -98,4 +98,3 @@ If you found a bug, or have an idea for a feature, open an issue
 * https://introcs.cs.princeton.edu/java/11precedence/ for operator precedence
 * https://logomakr.com/ for creating a logo
 * https://www.youtube.com/ for setting the mood
-
diff --git a/reasoning_gym/cognition/__init__.py b/reasoning_gym/cognition/__init__.py
index e1f01947..fddd97b1 100644
--- a/reasoning_gym/cognition/__init__.py
+++ b/reasoning_gym/cognition/__init__.py
@@ -7,9 +7,9 @@ Cognition tasks for training reasoning capabilities:
 """
 
 from .color_cube_rotation import ColorCubeRotationConfig, ColorCubeRotationDataset
+from .figlet_fonts import FigletFontConfig, FigletFontDataset
 from .number_sequences import NumberSequenceConfig, NumberSequenceDataset
 from .rubiks_cube import RubiksCubeConfig, RubiksCubeDataset
-from .figlet_fonts import FigletFontConfig, FigletFontDataset
 
 __all__ = [
     "NumberSequenceConfig",
@@ -19,5 +19,5 @@ __all__ = [
     "RubiksCubeConfig",
     "RubiksCubeDataset",
     "FigletFontConfig",
-    "FigletFontDataset"
+    "FigletFontDataset",
 ]
diff --git a/reasoning_gym/games/__init__.py b/reasoning_gym/games/__init__.py
index 0826dea6..a801c6e4 100644
--- a/reasoning_gym/games/__init__.py
+++ b/reasoning_gym/games/__init__.py
@@ -7,10 +7,10 @@ Game tasks for training reasoning capabilities:
 """
 
 from .countdown import CountdownConfig, CountdownDataset
+from .game_of_life import GameOfLifeConfig, GameOfLifeDataset
 from .maze import MazeConfig, MazeDataset
 from .mini_sudoku import MiniSudokuConfig, MiniSudokuDataset
 from .sudoku import SudokuConfig, SudokuDataset
-from .game_of_life import GameOfLifeConfig, GameOfLifeDataset
 
 __all__ = [
     "CountdownConfig",
diff --git a/reasoning_gym/games/game_of_life.py b/reasoning_gym/games/game_of_life.py
index cc4dc6a8..c8cdc0d1 100644
--- a/reasoning_gym/games/game_of_life.py
+++ b/reasoning_gym/games/game_of_life.py
@@ -1,18 +1,19 @@
 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional, Tuple, Dict
+from typing import Dict, List, Optional, Tuple
 
 import cellpylib as cpl
 
 from ..factory import ProceduralDataset, register_dataset
 
+
 @dataclass
 class GameOfLifeConfig:
     """Configuration for sudoku puzzle generation"""
 
-    grid_size_x: int = 20 
+    grid_size_x: int = 20
     grid_size_y: int = 20
-    filled_cells: int = 100 # actually a max
+    filled_cells: int = 100  # actually a max
     simulation_steps: int = 1
     seed: Optional[int] = None
     size: int = 500
@@ -25,11 +26,12 @@ class GameOfLifeConfig:
         assert self.filled_cells <= self.grid_size_x * self.grid_size_y, "filled_cells must fit in x times y"
 
 
-class GameOfLifeConfigDataset(ProceduralDataset):
+class GameOfLifeDataset(ProceduralDataset):
     """Generates Game of Life games with configurable parameters"""
 
     def __init__(self, config: GameOfLifeConfig):
-        self._prompt_templates = ["What will this Game of Life board look like after {simulation_steps} steps of simulation?\n\n{board}"
+        self._prompt_templates = [
+            "What will this Game of Life board look like after {simulation_steps} steps of simulation?\n\n{board}"
         ]
 
         super().__init__(config=config, seed=config.seed, size=config.size)
@@ -46,7 +48,7 @@ class GameOfLifeConfigDataset(ProceduralDataset):
         rng = Random(self.seed + idx)
 
         # Make the board
-        board  = cpl.init_simple2d(self.config.grid_size_x, self.config.grid_size_y)
+        board = cpl.init_simple2d(self.config.grid_size_x, self.config.grid_size_y)
         board[:, :, :] = 0
 
         # Add the cells
@@ -56,13 +58,17 @@ class GameOfLifeConfigDataset(ProceduralDataset):
             board[:, rx, ry] = 1
 
         # Simulate the result to get the answer
-        evolved = cpl.evolve2d(board, timesteps=self.config.simulation_steps + 1, apply_rule=cpl.game_of_life_rule, memoize='recursive')
+        evolved = cpl.evolve2d(
+            board, timesteps=self.config.simulation_steps + 1, apply_rule=cpl.game_of_life_rule, memoize="recursive"
+        )
 
         board_str = str(board[0])
         result_str = str(evolved[-1])
 
         return {
-            "question": rng.choice(self._prompt_templates).format(simulation_steps=self.config.simulation_steps, board=board_str),
+            "question": rng.choice(self._prompt_templates).format(
+                simulation_steps=self.config.simulation_steps, board=board_str
+            ),
             "answer": result_str,
             "metadata": {
                 "grid_size_x": self.config.grid_size_x,
@@ -87,10 +93,10 @@ class GameOfLifeConfigDataset(ProceduralDataset):
 
         if answer == None:
             return 0.0
-        if answer.replace('\n', '') != entry['answer'].replace('\n', ''):
+        if answer.replace("\n", "") != entry["answer"].replace("\n", ""):
             return 0.01
         else:
-            return 1.0 # Yay
+            return 1.0  # Yay
 
 
-register_dataset("game_of_life", GameOfLifeConfigDataset, GameOfLifeConfig)
+register_dataset("game_of_life", GameOfLifeDataset, GameOfLifeConfig)
diff --git a/reasoning_gym/graphs/__init__.py b/reasoning_gym/graphs/__init__.py
index 409e954f..6bbe7d67 100644
--- a/reasoning_gym/graphs/__init__.py
+++ b/reasoning_gym/graphs/__init__.py
@@ -3,7 +3,7 @@ from .quantum_lock import QuantumLockConfig, QuantumLockDataset
 
 __all__ = [
     "FamilyRelationshipsConfig",
-    "FamilyRelationshipsDataset", 
+    "FamilyRelationshipsDataset",
     "QuantumLockConfig",
     "QuantumLockDataset",
 ]
diff --git a/scripts/generate_gallery.py b/scripts/generate_gallery.py
index b9cf4630..06d841d4 100755
--- a/scripts/generate_gallery.py
+++ b/scripts/generate_gallery.py
@@ -32,7 +32,7 @@ def generate_gallery() -> str:
 
         # Add dataset header with anchor
         anchor = name.replace("_", "-").lower()
-        content.append(f"### {name} {{{anchor}}}\n")
+        content.append(f"### {name}\n")
 
         # Get dataset class docstring if available
         if dataset.__class__.__doc__:
diff --git a/tests/test_bf.py b/tests/test_bf.py
index cefac4c7..86d2619a 100644
--- a/tests/test_bf.py
+++ b/tests/test_bf.py
@@ -2,6 +2,7 @@ import pytest
 
 from reasoning_gym.code.bf import BFConfig, BFDataset
 
+
 def test_bf():
     """Test basic properties and solution of generated items"""
 
@@ -34,4 +35,4 @@ def test_bf():
     config = BFConfig(seed=44, size=20, difficulty=3)
     dataset = BFDataset(config)
     for item in dataset:
-        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
\ No newline at end of file
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
diff --git a/tests/test_game_of_life.py b/tests/test_game_of_life.py
index 288a1fe4..df0f133d 100644
--- a/tests/test_game_of_life.py
+++ b/tests/test_game_of_life.py
@@ -1,20 +1,14 @@
 import pytest
 
-from reasoning_gym.games.game_of_life import GameOfLifeConfig, GameOfLifeConfigDataset
+from reasoning_gym.games.game_of_life import GameOfLifeConfig, GameOfLifeDataset
+
 
 def test_game_of_life():
     """Test basic properties and solution of generated items"""
 
     # Easy
-    config = GameOfLifeConfig(
-        seed=42, 
-        size=1, 
-        grid_size_x=20,
-        grid_size_y=20,
-        filled_cells=10,
-        simulation_steps=1
-    )
-    dataset = GameOfLifeConfigDataset(config)
+    config = GameOfLifeConfig(seed=42, size=1, grid_size_x=20, grid_size_y=20, filled_cells=10, simulation_steps=1)
+    dataset = GameOfLifeDataset(config)
 
     for item in dataset:
         assert isinstance(item, dict)

From ffaf3c365382e2fb3c4d537cb5463438a60a2657 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Thu, 30 Jan 2025 23:06:22 +0100
Subject: [PATCH 11/94] remove dataset examples from README.md (now in linked
 GALLERY.md)

---
 .pre-commit-config.yaml     |    1 +
 GALLERY.md                  | 1312 ++++++++++++++++++-----------------
 README.md                   |  292 --------
 scripts/generate_gallery.py |   10 +-
 4 files changed, 687 insertions(+), 928 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 72c22f89..49987591 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,3 +18,4 @@ repos:
     hooks:
     -   id: isort
         name: isort (python)
+exclude: GALLERY.md
diff --git a/GALLERY.md b/GALLERY.md
index e161b85d..8fb699c7 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -2,38 +2,39 @@
 This gallery shows examples from all available datasets using their default configurations.
 
 ## Available Datasets
-- [base_conversion](#base-conversion)
-- [basic_arithmetic](#basic-arithmetic)
-- [caesar_cipher](#caesar-cipher)
-- [chain_sum](#chain-sum)
-- [color_cube_rotation](#color-cube-rotation)
+- [base_conversion](#base_conversion)
+- [basic_arithmetic](#basic_arithmetic)
+- [bf](#bf)
+- [caesar_cipher](#caesar_cipher)
+- [chain_sum](#chain_sum)
+- [color_cube_rotation](#color_cube_rotation)
 - [countdown](#countdown)
-- [family_relationships](#family-relationships)
-- [figlet_font](#figlet-font)
-- [fraction_simplification](#fraction-simplification)
-- [game_of_life](#game-of-life)
+- [family_relationships](#family_relationships)
+- [figlet_font](#figlet_font)
+- [fraction_simplification](#fraction_simplification)
+- [game_of_life](#game_of_life)
 - [gcd](#gcd)
 - [lcm](#lcm)
-- [leg_counting](#leg-counting)
-- [letter_counting](#letter-counting)
-- [letter_jumble](#letter-jumble)
+- [leg_counting](#leg_counting)
+- [letter_counting](#letter_counting)
+- [letter_jumble](#letter_jumble)
 - [maze](#maze)
-- [mini_sudoku](#mini-sudoku)
-- [number_filtering](#number-filtering)
-- [number_sequence](#number-sequence)
-- [number_sorting](#number-sorting)
-- [polynomial_equations](#polynomial-equations)
-- [prime_factorization](#prime-factorization)
-- [propositional_logic](#propositional-logic)
-- [quantum_lock](#quantum-lock)
-- [rubiks_cube](#rubiks-cube)
-- [sentence_reordering](#sentence-reordering)
-- [simple_equations](#simple-equations)
-- [spell_backward](#spell-backward)
+- [mini_sudoku](#mini_sudoku)
+- [number_filtering](#number_filtering)
+- [number_sequence](#number_sequence)
+- [number_sorting](#number_sorting)
+- [polynomial_equations](#polynomial_equations)
+- [prime_factorization](#prime_factorization)
+- [propositional_logic](#propositional_logic)
+- [quantum_lock](#quantum_lock)
+- [rubiks_cube](#rubiks_cube)
+- [sentence_reordering](#sentence_reordering)
+- [simple_equations](#simple_equations)
+- [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
 - [syllogism](#syllogism)
-- [word_sequence_reversal](#word-sequence-reversal)
-- [word_sorting](#word-sorting)
+- [word_sequence_reversal](#word_sequence_reversal)
+- [word_sorting](#word_sorting)
 
 ## Dataset Examples
 ### base_conversion
@@ -45,26 +46,26 @@ min_base = 2
 max_base = 16
 min_value = 0
 max_value = 1000
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Convert the base-15 number 15 to binary
-Answer: 10101
-Metadata: {'decimal_value': 21, 'source_base': 15, 'target_base': 2, 'source_repr': '15', 'target_repr': '10101'}
+Question: Convert the base-3 number 28e to binary
+Answer: 1010001110
+Metadata: {'decimal_value': 654, 'source_base': 3, 'target_base': 2, 'source_repr': '28e', 'target_repr': '1010001110'}
 
 Example 2:
-Question: Convert the base-15 number de to base-6
-Answer: de
-Metadata: {'decimal_value': 222, 'source_base': 15, 'target_base': 6, 'source_repr': 'de', 'target_repr': 'de'}
+Question: Convert the base-6 number 27 to base-13 (use lowercase letters a-z for digits above 9)
+Answer: 27
+Metadata: {'decimal_value': 39, 'source_base': 6, 'target_base': 13, 'source_repr': '27', 'target_repr': '27'}
 
 Example 3:
-Question: Convert the base-10 number 4e to binary
-Answer: 1001110
-Metadata: {'decimal_value': 78, 'source_base': 10, 'target_base': 2, 'source_repr': '4e', 'target_repr': '1001110'}
+Question: Convert the base-10 number 1a2 to base-13 (use lowercase letters a-z for digits above 9)
+Answer: 1a2
+Metadata: {'decimal_value': 418, 'source_base': 10, 'target_base': 13, 'source_repr': '1a2', 'target_repr': '1a2'}
 
 ```
 
@@ -80,7 +81,7 @@ max_digits = 4
 operators = ('+', '-', '*', '/')
 allow_parentheses = True
 allow_negation = True
-seed = None
+seed = 42
 size = 500
 format_style = simple
 whitespace = single
@@ -89,19 +90,54 @@ whitespace = single
 Example tasks:
 ```
 Example 1:
-Question: 19 + 61 * -43 / 1 + 89 - 98 =
-Answer: -2613
-Metadata: {'num_terms': 6, 'num_digits': 2, 'expression': '19 + 61 * -43 / 1 + 89 - 98'}
+Question: -5 * -6 =
+Answer: 30
+Metadata: {'num_terms': 2, 'num_digits': 1, 'expression': '-5 * -6'}
 
 Example 2:
-Question: ( 9240 + -702 ) =
-Answer: 8538
-Metadata: {'num_terms': 2, 'num_digits': 4, 'expression': '( 9240 + -702 )'}
+Question: 965 / 5 =
+Answer: 193
+Metadata: {'num_terms': 2, 'num_digits': 3, 'expression': '965 / 5'}
 
 Example 3:
-Question: -68 * 12 - 6 / 2 + -60 =
-Answer: -879
-Metadata: {'num_terms': 5, 'num_digits': 2, 'expression': '-68 * 12 - 6 / 2 + -60'}
+Question: 0 + -2 + -4 * 0 * 3 =
+Answer: -2
+Metadata: {'num_terms': 5, 'num_digits': 1, 'expression': '0 + -2 + -4 * 0 * 3'}
+
+```
+
+### bf
+Generates BF tasks
+
+Default configuration:
+```python
+seed = 42
+size = 500
+difficulty = 1
+```
+
+Example tasks:
+```
+Example 1:
+Question: This is a BF (Brainf*ck) computer program. What is the output? 
+
+>[-]>[-]<>++++++++++[<+++++++++++>-]<+.-.+++++.--------------.+++++++++++++++.<
+Answer: onset
+Metadata: {'bfit_code': '\nint main() {\n    print("onset");\n}\n', 'bf_program': '>[-]>[-]<>++++++++++[<+++++++++++>-]<+.-.+++++.--------------.+++++++++++++++.<'}
+
+Example 2:
+Question: This is a BF (Brainf*ck) computer program. What is the output? 
+
+>[-]>[-]<>++++++++[<++++++++++++++>-]<.-----------.+++++++++++++.---------------.+++++.<
+Answer: perch
+Metadata: {'bfit_code': '\nint main() {\n    print("perch");\n}\n', 'bf_program': '>[-]>[-]<>++++++++[<++++++++++++++>-]<.-----------.+++++++++++++.---------------.+++++.<'}
+
+Example 3:
+Question: This is a BF (Brainf*ck) computer program. What is the output? 
+
+>[-]>[-]<>+++++++++[<+++++++++++++>-]<.-------.----------.+.+++++++++++++.<
+Answer: under
+Metadata: {'bfit_code': '\nint main() {\n    print("under");\n}\n', 'bf_program': '>[-]>[-]<>+++++++++[<+++++++++++++>-]<.-------.----------.+.+++++++++++++.<'}
 
 ```
 
@@ -115,26 +151,26 @@ min_words = 3
 max_words = 20
 min_rotation = 1
 max_rotation = 25
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Decrypt this Caesar cipher text: UVYAO MVY AOL VM IBA AOL ZVBAO MVY AOL SHAPUZ
-Answer: NORTH FOR THE OF BUT THE SOUTH FOR THE LATINS
-Metadata: {'rotation': 7, 'cipher_text': 'UVYAO MVY AOL VM IBA AOL ZVBAO MVY AOL SHAPUZ', 'clear_text': 'NORTH FOR THE OF BUT THE SOUTH FOR THE LATINS'}
+Question: Decrypt this Caesar cipher text: JNJUBUF ZPVS BTTPDJBUF XIPN J XBT DPNQMJNFOUJOH B NPNFOU BHP
+Answer: IMITATE YOUR ASSOCIATE WHOM I WAS COMPLIMENTING A MOMENT AGO
+Metadata: {'rotation': 1, 'cipher_text': 'JNJUBUF ZPVS BTTPDJBUF XIPN J XBT DPNQMJNFOUJOH B NPNFOU BHP', 'clear_text': 'IMITATE YOUR ASSOCIATE WHOM I WAS COMPLIMENTING A MOMENT AGO'}
 
 Example 2:
-Question: Decrypt this Caesar cipher text: ER MRHITIRHIRX KSZIVRQIRX
-Answer: AN INDEPENDENT GOVERNMENT
-Metadata: {'rotation': 4, 'cipher_text': 'ER MRHITIRHIRX KSZIVRQIRX', 'clear_text': 'AN INDEPENDENT GOVERNMENT'}
+Question: Decrypt this Caesar cipher text: PBSDJ XKZYVOYX CWSDR LYEQRD SD PYB K WOBO KXN YBSQSXKDON DOVOZRYXSM TYEBXKVSCW
+Answer: FRITZ NAPOLEON SMITH BOUGHT IT FOR A MERE AND ORIGINATED TELEPHONIC JOURNALISM
+Metadata: {'rotation': 10, 'cipher_text': 'PBSDJ XKZYVOYX CWSDR LYEQRD SD PYB K WOBO KXN YBSQSXKDON DOVOZRYXSM TYEBXKVSCW', 'clear_text': 'FRITZ NAPOLEON SMITH BOUGHT IT FOR A MERE AND ORIGINATED TELEPHONIC JOURNALISM'}
 
 Example 3:
-Question: Decrypt this Caesar cipher text: IYE WKI ECO DRSC OLYYU PYB XOKBVI KXI ZEBZYCO CEMR KC MBOKDSYX YP NOBSFKDSFO ZOBPYBWKXMOC KXN BOCOKBMR
-Answer: YOU MAY USE THIS EBOOK FOR NEARLY ANY PURPOSE SUCH AS CREATION OF DERIVATIVE PERFORMANCES AND RESEARCH
-Metadata: {'rotation': 10, 'cipher_text': 'IYE WKI ECO DRSC OLYYU PYB XOKBVI KXI ZEBZYCO CEMR KC MBOKDSYX YP NOBSFKDSFO ZOBPYBWKXMOC KXN BOCOKBMR', 'clear_text': 'YOU MAY USE THIS EBOOK FOR NEARLY ANY PURPOSE SUCH AS CREATION OF DERIVATIVE PERFORMANCES AND RESEARCH'}
+Question: Decrypt this Caesar cipher text: ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV DVEUVU
+Answer: IF YOUR STOMACH IS OUT OF IT MUST BE MENDED
+Metadata: {'rotation': 17, 'cipher_text': 'ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV DVEUVU', 'clear_text': 'IF YOUR STOMACH IS OUT OF IT MUST BE MENDED'}
 
 ```
 
@@ -148,26 +184,26 @@ max_terms = 6
 min_digits = 1
 max_digits = 4
 allow_negation = False
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: 3 - 6 + 4 =
-Answer: 1
-Metadata: {'num_terms': 3, 'num_digits': 1, 'expression': '3 - 6 + 4'}
+Question: 4 + 3 =
+Answer: 7
+Metadata: {'num_terms': 2, 'num_digits': 1, 'expression': '4 + 3'}
 
 Example 2:
-Question: 6516 - 9002 - 5380 - 2663 =
-Answer: -10529
-Metadata: {'num_terms': 4, 'num_digits': 4, 'expression': '6516 - 9002 - 5380 - 2663'}
+Question: 812 + 880 =
+Answer: 1692
+Metadata: {'num_terms': 2, 'num_digits': 3, 'expression': '812 + 880'}
 
 Example 3:
-Question: 3352 + 3153 - 3475 + 1726 - 8711 - 7863 =
-Answer: -11818
-Metadata: {'num_terms': 6, 'num_digits': 4, 'expression': '3352 + 3153 - 3475 + 1726 - 8711 - 7863'}
+Question: 2 + 6 + 3 + 4 + 0 =
+Answer: 15
+Metadata: {'num_terms': 5, 'num_digits': 1, 'expression': '2 + 6 + 3 + 4 + 0'}
 
 ```
 
@@ -178,7 +214,7 @@ Default configuration:
 ```python
 min_rotations = 1
 max_rotations = 3
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -186,54 +222,56 @@ Example tasks:
 ```
 Example 1:
 Question: A cube has:
-- a red top side
-- a brown right side
-- a cyan front side
-- a gray left side
-- a silver back side
-- a purple bottom side
+- a pink top side
+- a gray right side
+- a orange front side
+- a purple left side
+- a indigo back side
+- a cyan bottom side
 
-The cube is rotated so that the side which was before at the front is now at the top.
+The cube is rotated so that the side which was before at the bottom is now at the top.
 
-Now the cube is rotated to place its right side at the top.
-
-What is now the color of the top side of the cube?
-Answer: brown
-Metadata: {'initial_state': {'top': 'red', 'right': 'brown', 'front': 'cyan', 'left': 'gray', 'back': 'silver', 'bottom': 'purple'}, 'rotations': ['front', 'right'], 'target_side': 'top', 'num_rotations': 2}
+What is now the color of the back side of the cube?
+Answer: orange
+Metadata: {'initial_state': {'top': 'pink', 'right': 'gray', 'front': 'orange', 'left': 'purple', 'back': 'indigo', 'bottom': 'cyan'}, 'rotations': ['bottom'], 'target_side': 'back', 'num_rotations': 1}
 
 Example 2:
 Question: A cube has:
-- a yellow top side
-- a cyan right side
-- a white front side
-- a blue left side
-- a red back side
-- a pink bottom side
+- a gray top side
+- a brown right side
+- a silver front side
+- a red left side
+- a purple back side
+- a yellow bottom side
 
 The cube is rotated so that the side which was before at the left is now at the top.
 
-Then the cube is rotated to bring the front side to the top.
+Next, the bottom side is rotated to become the top face.
 
-Next, the front side is rotated to become the top face.
+After that the cube is turned to make the bottom face the top.
 
-What is now the color of the front side of the cube?
-Answer: red
-Metadata: {'initial_state': {'top': 'yellow', 'right': 'cyan', 'front': 'white', 'left': 'blue', 'back': 'red', 'bottom': 'pink'}, 'rotations': ['left', 'front', 'front'], 'target_side': 'front', 'num_rotations': 3}
+What is now the color of the left side of the cube?
+Answer: yellow
+Metadata: {'initial_state': {'top': 'gray', 'right': 'brown', 'front': 'silver', 'left': 'red', 'back': 'purple', 'bottom': 'yellow'}, 'rotations': ['left', 'bottom', 'bottom'], 'target_side': 'left', 'num_rotations': 3}
 
 Example 3:
 Question: A cube has:
-- a indigo top side
-- a violet right side
-- a silver front side
+- a orange top side
+- a cyan right side
+- a violet front side
 - a pink left side
-- a magenta back side
-- a cyan bottom side
+- a gray back side
+- a gold bottom side
 
-The cube is rotated so that the side which was before at the front is now at the top.
+The cube is rotated so that the side which was before at the left is now at the top.
 
-What is now the color of the top side of the cube?
-Answer: silver
-Metadata: {'initial_state': {'top': 'indigo', 'right': 'violet', 'front': 'silver', 'left': 'pink', 'back': 'magenta', 'bottom': 'cyan'}, 'rotations': ['front'], 'target_side': 'top', 'num_rotations': 1}
+Now the cube is rotated to place its back side at the top.
+
+Now the cube is rotated to place its bottom side at the top.
+
+What is now the color of the left side of the cube?
+Answer: gold
+Metadata: {'initial_state': {'top': 'orange', 'right': 'cyan', 'front': 'violet', 'left': 'pink', 'back': 'gray', 'bottom': 'gold'}, 'rotations': ['left', 'back', 'bottom'], 'target_side': 'left', 'num_rotations': 3}
 
 ```
 
@@ -250,29 +288,29 @@ min_target = 100
 max_target = 999
 operators = ('+', '-', '*', '/')
 shuffle = True
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Calculate 421 using the numbers 10, 30, 26, 59.
+Question: Calculate 139 using the numbers 36, 29, 95, 32, 4, 15.
 Each number may be used at most once.
-Answer: 30*(26 - 10) - 59
-Metadata: {'numbers': [10, 30, 26, 59], 'target': 421, 'expression': '30*(26 - 10) - 59'}
+Answer: 15 - 4 + 95 + 36 - 32 + 29
+Metadata: {'numbers': [36, 29, 95, 32, 4, 15], 'target': 139, 'expression': '15 - 4 + 95 + 36 - 32 + 29'}
 
 Example 2:
-Question: Calculate 229 using the numbers 55, 80, 34, 60.
-Each number may be used at most once.
-Answer: 80 + 34 + 60 + 55
-Metadata: {'numbers': [55, 80, 34, 60], 'target': 229, 'expression': '80 + 34 + 60 + 55'}
+Question: Using the numbers 74, 48, 56, 66, create an expression that equals 132.
+You can only use each number once.
+Answer: 66 - 56 + 74 + 48
+Metadata: {'numbers': [74, 48, 56, 66], 'target': 132, 'expression': '66 - 56 + 74 + 48'}
 
 Example 3:
-Question: Calculate 840 using the numbers 41, 18, 32, 45, 84.
-Each number may be used at most once.
-Answer: 84*(41 - 45 + 32 - 18)
-Metadata: {'numbers': [41, 18, 32, 45, 84], 'target': 840, 'expression': '84*(41 - 45 + 32 - 18)'}
+Question: Using the numbers 5, 41, 38, 81, 14, create an expression that equals 450.
+You can only use each number once.
+Answer: 41*14 - 81 - 38 - 5
+Metadata: {'numbers': [5, 41, 38, 81, 14], 'target': 450, 'expression': '41*14 - 81 - 38 - 5'}
 
 ```
 
@@ -285,32 +323,32 @@ min_family_size = 4
 max_family_size = 8
 male_names = ['James', 'John', 'Robert', 'Michael', 'William', 'David', 'Richard', 'Joseph', 'Thomas', 'Charles', 'Peter', 'Daniel', 'Matthew', 'Christopher', 'Andrew', 'George', 'Edward', 'Benjamin', 'Henry', 'Samuel', 'Alexander', 'Oliver', 'Jack', 'Harry', 'Jacob', 'Noah', 'Ethan', 'Lucas', 'Mason', 'Logan', 'Sebastian', 'Theodore', 'Owen', 'Liam', 'Aiden', 'Kai', 'Jayden', 'Zion', 'Phoenix', 'Atlas', 'Axel', 'Ryder', 'Finn']
 female_names = ['Mary', 'Patricia', 'Jennifer', 'Linda', 'Elizabeth', 'Barbara', 'Susan', 'Jessica', 'Sarah', 'Karen', 'Emma', 'Lisa', 'Anna', 'Margaret', 'Victoria', 'Charlotte', 'Sophia', 'Isabella', 'Olivia', 'Ava', 'Mia', 'Emily', 'Abigail', 'Amelia', 'Eleanor', 'Grace', 'Alice', 'Lucy', 'Chloe', 'Sophie', 'Lily', 'Hannah', 'Zoe', 'Luna', 'Nova', 'Aria', 'Willow', 'Aurora', 'Sage', 'River', 'Winter', 'Sky', 'Rain']
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Jack is married to Elizabeth. They have a child called Oliver. Oliver is married to Abigail. They have a child called Logan. Alexander is married to Mia. They have a child called Abigail.
+Question: John is married to Isabella. They have a child called Edward. Edward is married to Victoria.
 
-What relation is Mia to Abigail?
+What is Isabella to Edward?
 Answer: mother
-Metadata: {'person1': 'Mia', 'person2': 'Abigail', 'relationship': 'mother', 'family_size': 7}
+Metadata: {'person1': 'Isabella', 'person2': 'Edward', 'relationship': 'mother', 'family_size': 4}
 
 Example 2:
-Question: James is married to Sarah. They have a child called Atlas. Atlas is married to Sophie. They have children called Jennifer and Aria.
+Question: Henry is married to Karen. They have a child called Sebastian. Sebastian is married to Eleanor.
 
-What is Aria to Jennifer?
-Answer: sister
-Metadata: {'person1': 'Aria', 'person2': 'Jennifer', 'relationship': 'sister', 'family_size': 6}
+What relation is Henry to Karen?
+Answer: husband
+Metadata: {'person1': 'Henry', 'person2': 'Karen', 'relationship': 'husband', 'family_size': 4}
 
 Example 3:
-Question: Lucas is married to Willow. They have a child called Samuel. Samuel is married to Zoe. They have a child called William. Henry is married to Emma. They have a child called Zoe.
+Question: Liam is married to Nova. They have a child called Noah. Noah is married to Charlotte. They have a child called Patricia. Joseph is married to Lisa. They have a child called Charlotte.
 
-What is Lucas to Willow?
-Answer: husband
-Metadata: {'person1': 'Lucas', 'person2': 'Willow', 'relationship': 'husband', 'family_size': 7}
+What is Liam to Noah?
+Answer: father
+Metadata: {'person1': 'Liam', 'person2': 'Noah', 'relationship': 'father', 'family_size': 7}
 
 ```
 
@@ -322,7 +360,7 @@ Default configuration:
 static_word = None
 static_font = None
 space_letters = True
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -331,39 +369,50 @@ Example tasks:
 Example 1:
 Question: Please read the following figlet font:
 
-  ()     _     _       _    _ __     ()  ,
-  /\    ' )   /       | )  ' )  )    /`-'|
- /  )    / / /    ,---|/    /  /    /   /
-/__/__  (_(_/      \_/ \_  /  (_   /__-<_
+  sSSSs        d s  b        sss.      d sss        sss sssss 
+ S     S       S  S S      d           S                S     
+S       S      S   SS      Y           S                S     
+S       S      S    S        ss.       S sSSs           S     
+S       S      S    S           b      S                S     
+ S     S       S    S           P      S                S     
+  "sss"        P    P      ` ss'       P sSSss          P     
+                                                              
 
-
-
-Answer: SWING
-Metadata: {'font': 'slscript', 'space_letters': True}
+Answer: ONSET
+Metadata: {'font': 'amc_tubes', 'space_letters': True}
 
 Example 2:
 Question: What word does this say?
 
-     dBBBP     dBBBBBb        dBBBP     dBP dBP    dBBBP
-                    BB
-   dBP          dBP BB      dBP       dBBBBBP    dBBP
-  dBP          dBP  BB     dBP       dBP dBP    dBP
- dBBBBP       dBBBBBBB    dBBBBP    dBP dBP    dBBBBP
+######   ######   ######     ####   ##    ## 
+ ##  ##   ##  ##   ##  ##   ##  ##   ##  ##  
+ ##  ##   ##       ##  ##  ##   ##   ##  ##  
+ #####    ####     #####   ##        ######  
+ ##       ##       ## ##   ##   ##   ##  ##  
+ ##       ##  ##   ## ##    ##  ##   ##  ##  
+####     ######   ### ###    ####   ##    ## 
+                                             
 
-
-Answer: CACHE
-Metadata: {'font': 'trek', 'space_letters': True}
+Answer: PERCH
+Metadata: {'font': 'demo_2__', 'space_letters': True}
 
 Example 3:
-Question: Please read the following figlet font:
+Question: What word does this say?
 
-.---. .---. .-. .-..-. .-..-.
- \ \  | |-' | | | .` |  >  /
-`---' `-'   `-' `-'`-'  `-'
+                                              
+                                              
+                                              
+### ###   ### ###   #####    ######   #####   
+ ## ##     ##  #     ## ##    ##  #    ## ##  
+ ## ##     ### #     ## ##    ####     ## ##  
+ ## ##     #####     ## ##    ##       ####   
+ ## ##     ## ##     ## ##    ## ##    ## ##  
+  ###     ### ##    #####    ######   #### ## 
+                                              
+                                              
 
-
-Answer: SPINY
-Metadata: {'font': 'linux', 'space_letters': True}
+Answer: UNDER
+Metadata: {'font': 'xcourb', 'space_letters': True}
 
 ```
 
@@ -377,26 +426,26 @@ max_value = 1000
 min_factor = 1
 max_factor = 100
 styles = ('plain', 'latex_inline', 'latex_frac', 'latex_dfrac')
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Simplify the fraction $1380/6180$ to its lowest terms
-Answer: $23/103$
-Metadata: {'numerator': 1380, 'denominator': 6180, 'simplified_numerator': 23, 'simplified_denominator': 103, 'reduction_factor': 60, 'style': 'latex_inline'}
+Question: Simplify the fraction $\frac{92}{524}$ to its lowest terms
+Answer: $\frac{23}{131}$
+Metadata: {'numerator': 92, 'denominator': 524, 'simplified_numerator': 23, 'simplified_denominator': 131, 'reduction_factor': 4, 'style': 'latex_frac'}
 
 Example 2:
-Question: Simplify the fraction 15552/49984 to its lowest terms
-Answer: 243/781
-Metadata: {'numerator': 15552, 'denominator': 49984, 'simplified_numerator': 243, 'simplified_denominator': 781, 'reduction_factor': 64, 'style': 'plain'}
+Question: Simplify the fraction $3600/26370$ to its lowest terms
+Answer: $40/293$
+Metadata: {'numerator': 3600, 'denominator': 26370, 'simplified_numerator': 40, 'simplified_denominator': 293, 'reduction_factor': 90, 'style': 'latex_inline'}
 
 Example 3:
-Question: Simplify the fraction $56100/80500$ to its lowest terms
-Answer: $561/805$
-Metadata: {'numerator': 56100, 'denominator': 80500, 'simplified_numerator': 561, 'simplified_denominator': 805, 'reduction_factor': 100, 'style': 'latex_inline'}
+Question: Simplify the fraction 29330/37310 to its lowest terms
+Answer: 419/533
+Metadata: {'numerator': 29330, 'denominator': 37310, 'simplified_numerator': 419, 'simplified_denominator': 533, 'reduction_factor': 70, 'style': 'plain'}
 
 ```
 
@@ -409,7 +458,7 @@ grid_size_x = 20
 grid_size_y = 20
 filled_cells = 100
 simulation_steps = 1
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -418,136 +467,136 @@ Example tasks:
 Example 1:
 Question: What will this Game of Life board look like after 1 steps of simulation?
 
-[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
- [0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0]
- [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1]
- [0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1]
- [0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1]
- [0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0]
- [0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0]
- [0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0]
- [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
- [0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0]
- [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0]
- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0]
- [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1]
- [0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0]
- [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0]
- [1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0]]
-Answer: [[1 0 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0]
- [0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1]
- [0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0]
- [0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0]
- [0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0]
- [0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0]
- [0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0]
- [0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0]
- [0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0]
- [0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0]
- [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0]
- [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0]
- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0]
- [0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1]
- [0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0]
- [0 0 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 0 0]
- [0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0]]
+[[0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]
+ [0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0]
+ [1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0]
+ [0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0]
+ [0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0]
+ [0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0]
+ [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]
+ [1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0]
+ [0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1]
+ [1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1]
+ [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]
+ [0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1]
+ [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0]
+ [1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0]
+ [1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1]
+ [0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0]
+ [0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0]]
+Answer: [[0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0]
+ [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0]
+ [0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0]
+ [0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0]
+ [0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 1 1 0]
+ [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
+ [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
+ [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
+ [0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1]
+ [1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0]
+ [0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1]
+ [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]
+ [1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1]
+ [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1]
+ [1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
+ [1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1]
+ [0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0]
+ [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
 Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
 
 Example 2:
 Question: What will this Game of Life board look like after 1 steps of simulation?
 
-[[0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0]
- [0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
- [1 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0]
- [0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0]
- [0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1]
- [0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1]
- [1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0]
- [0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0]
- [1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0]
- [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1]
- [0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1]
- [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0]
- [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1]
- [1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0]
- [0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0]
- [0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0]
- [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
- [0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0]
- [0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0]]
-Answer: [[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0]
- [0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 1]
- [0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0]
- [0 0 0 0 1 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1]
- [1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1]
- [0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 1 1]
- [1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0]
- [0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0]
- [0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1]
- [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1]
- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
- [0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1]
- [1 1 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0]
- [0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0]
- [0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
- [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0]]
+[[1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0]
+ [0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0]
+ [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0]
+ [0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 1]
+ [0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
+ [0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0]
+ [1 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0]
+ [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0]
+ [0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0]
+ [0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1]
+ [0 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1]
+ [0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1]
+ [0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1]
+ [0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]
+ [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]
+ [0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1]
+ [0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
+ [0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0]]
+Answer: [[0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1]
+ [0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 0]
+ [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1]
+ [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
+ [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
+ [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
+ [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0]
+ [0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0]
+ [1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1]
+ [1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0]
+ [1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
+ [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1]
+ [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
+ [0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]
 Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
 
 Example 3:
 Question: What will this Game of Life board look like after 1 steps of simulation?
 
-[[1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1]
- [0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0]
- [0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0]
- [0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1]
- [1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 1 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0]
- [0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0]
- [0 0 1 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0]
- [0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0]
- [0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1]
- [0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0]
- [0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0]
- [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 0]
- [0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1]
- [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
- [1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0]]
-Answer: [[1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1]
- [0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0]
- [0 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0]
- [1 1 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0]
- [1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]
- [0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 0 0]
- [0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
- [0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0]
- [0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0]
- [0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0]
- [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0]
- [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
+[[0 0 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1]
+ [0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 1]
+ [0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0]
+ [0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0]
+ [0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0]
+ [0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0]
+ [0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1]
+ [0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0]
+ [1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1]
+ [0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0]
+ [0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
+ [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
+ [0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0]
+ [0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0]
+ [0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0]
+ [1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0]
+ [0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
+ [0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0]]
+Answer: [[1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1]
+ [0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 1]
  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0]
- [0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0]
- [0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
- [0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0]]
+ [0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0]
+ [0 0 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0]
+ [0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
+ [0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0]
+ [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0]
+ [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1]
+ [0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0]
+ [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1]
+ [0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
+ [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0]
+ [0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0]
+ [0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
+ [0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0]
+ [0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0]
+ [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0]
+ [0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0]
+ [0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0]]
 Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
 
 ```
@@ -561,26 +610,26 @@ min_numbers = 2
 max_numbers = 2
 min_value = 1
 max_value = 1000
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Find the Greatest Common Divisor (GCD) of these numbers: 226, 512
+Question: Find the Greatest Common Divisor (GCD) of these numbers: 26, 760
 Answer: 2
-Metadata: {'numbers': [226, 512], 'result': 2}
+Metadata: {'numbers': [26, 760], 'result': 2}
 
 Example 2:
-Question: Find the Greatest Common Divisor (GCD) of these numbers: 999, 495
-Answer: 9
-Metadata: {'numbers': [999, 495], 'result': 9}
+Question: Find the Greatest Common Divisor (GCD) of these numbers: 688, 716
+Answer: 4
+Metadata: {'numbers': [688, 716], 'result': 4}
 
 Example 3:
-Question: Find the Greatest Common Divisor (GCD) of these numbers: 999, 719
-Answer: 1
-Metadata: {'numbers': [999, 719], 'result': 1}
+Question: Find the Greatest Common Divisor (GCD) of these numbers: 297, 30
+Answer: 3
+Metadata: {'numbers': [297, 30], 'result': 3}
 
 ```
 
@@ -593,26 +642,26 @@ min_numbers = 2
 max_numbers = 2
 min_value = 1
 max_value = 100
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Find the Least Common Multiple (LCM) of these numbers: 30, 69
-Answer: 690
-Metadata: {'numbers': [30, 69], 'result': 690}
+Question: Find the Least Common Multiple (LCM) of these numbers: 95, 14
+Answer: 1330
+Metadata: {'numbers': [95, 14], 'result': 1330}
 
 Example 2:
-Question: Find the Least Common Multiple (LCM) of these numbers: 57, 99
-Answer: 1881
-Metadata: {'numbers': [57, 99], 'result': 1881}
+Question: Find the Least Common Multiple (LCM) of these numbers: 60, 48
+Answer: 240
+Metadata: {'numbers': [60, 48], 'result': 240}
 
 Example 3:
-Question: Find the Least Common Multiple (LCM) of these numbers: 3, 24
-Answer: 24
-Metadata: {'numbers': [3, 24], 'result': 24}
+Question: Find the Least Common Multiple (LCM) of these numbers: 38, 4
+Answer: 76
+Metadata: {'numbers': [38, 4], 'result': 76}
 
 ```
 
@@ -624,26 +673,26 @@ Default configuration:
 min_animals = 2
 max_animals = 5
 max_instances = 3
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: How many legs are there in total if you have 1 starfish, 3 crabs, 3 chickens, 3 cows, 1 woodlouse?
-Answer: 67
-Metadata: {'animals': {'starfish': 1, 'crab': 3, 'chicken': 3, 'cow': 3, 'woodlouse': 1}, 'total_legs': 67}
+Question: How many legs are there in total if you have 1 sea slug, 1 deer?
+Answer: 4
+Metadata: {'animals': {'sea slug': 1, 'deer': 1}, 'total_legs': 4}
 
 Example 2:
-Question: How many legs are there in total if you have 2 sheeps, 1 butterfly, 1 ant, 3 humans, 2 wasps?
-Answer: 38
-Metadata: {'animals': {'sheep': 2, 'butterfly': 1, 'ant': 1, 'human': 3, 'wasp': 2}, 'total_legs': 38}
+Question: How many legs are there in total if you have 2 sheeps, 2 dogs?
+Answer: 16
+Metadata: {'animals': {'sheep': 2, 'dog': 2}, 'total_legs': 16}
 
 Example 3:
-Question: How many legs are there in total if you have 3 chickens, 3 cockroachs, 3 woodlouses, 2 elephants, 2 sea slugs?
-Answer: 74
-Metadata: {'animals': {'chicken': 3, 'cockroach': 3, 'woodlouse': 3, 'elephant': 2, 'sea slug': 2}, 'total_legs': 74}
+Question: How many legs are there in total if you have 1 crab, 2 lobsters, 1 human, 1 cow, 1 bee?
+Answer: 42
+Metadata: {'animals': {'crab': 1, 'lobster': 2, 'human': 1, 'cow': 1, 'bee': 1}, 'total_legs': 42}
 
 ```
 
@@ -654,26 +703,26 @@ Default configuration:
 ```python
 min_words = 5
 max_words = 15
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: How many times does the letter "r" appear in the text: "You decline All is over then murmured the British agent sadly The"?
-Answer: 4
-Metadata: {'span_length': 12, 'target_letter': 'r', 'span': ['You', 'decline', 'All', 'is', 'over', 'then', 'murmured', 'the', 'British', 'agent', 'sadly', 'The']}
+Question: How many times does the letter "w" appear in the text: "bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed"?
+Answer: 1
+Metadata: {'span_length': 15, 'target_letter': 'w', 'span': ['bed', 'and', 'enters', 'his', 'mechanical', 'dresser', 'Two', 'minutes', 'later', 'the', 'machine', 'deposited', 'him', 'all', 'dressed']}
 
 Example 2:
-Question: How many times does the letter "l" appear in the text: "coffined and laid in a tomb Time went on September 25th 2889"?
+Question: How many times does the letter "p" appear in the text: "it into a watering place"?
 Answer: 1
-Metadata: {'span_length': 12, 'target_letter': 'l', 'span': ['coffined', 'and', 'laid', 'in', 'a', 'tomb', 'Time', 'went', 'on', 'September', '25th', '2889']}
+Metadata: {'span_length': 5, 'target_letter': 'p', 'span': ['it', 'into', 'a', 'watering', 'place']}
 
 Example 3:
-Question: How many times does the letter "i" appear in the text: "to the works took more time than he had anticipated It was"?
-Answer: 4
-Metadata: {'span_length': 12, 'target_letter': 'i', 'span': ['to', 'the', 'works', 'took', 'more', 'time', 'than', 'he', 'had', 'anticipated', 'It', 'was']}
+Question: How many times does the letter "t" appear in the text: "readable form accessible by the widest array of equipment including outdated"?
+Answer: 5
+Metadata: {'span_length': 11, 'target_letter': 't', 'span': ['readable', 'form', 'accessible', 'by', 'the', 'widest', 'array', 'of', 'equipment', 'including', 'outdated']}
 
 ```
 
@@ -689,26 +738,26 @@ max_words = 20
 min_corruption_level = 0.1
 max_corruption_level = 0.9
 consecutive_words = True
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Unscramble these words: moon abotu faec hA trehe s somethnig ni htat driec eht owt nem ta ocne dnA dndeei
-Answer: moon about face Ah there s something in that cried the two men at once And indeed
-Metadata: {'num_words': 17, 'corruption_level': 0.16056171448414203, 'scrambled_words': ['moon', 'abotu', 'faec', 'hA', 'trehe', 's', 'somethnig', 'ni', 'htat', 'driec', 'eht', 'owt', 'nem', 'ta', 'ocne', 'dnA', 'dndeei'], 'original_words': ['moon', 'about', 'face', 'Ah', 'there', 's', 'something', 'in', 'that', 'cried', 'the', 'two', 'men', 'at', 'once', 'And', 'indeed']}
+Question: Unscramble these words: ew hsall eb ebla ot puodrce
+Answer: we shall be able to produce
+Metadata: {'num_words': 6, 'corruption_level': 0.12000860417813355, 'scrambled_words': ['ew', 'hsall', 'eb', 'ebla', 'ot', 'puodrce'], 'original_words': ['we', 'shall', 'be', 'able', 'to', 'produce']}
 
 Example 2:
-Question: Unscramble these words: lla het aosssen eth msea I psrooep ot od toshmeign etrtbe itlsl amrnsTrfo toni aeht a tiooprn fo het
-Answer: all the seasons the same I propose to do something better still Transform into heat a portion of the
-Metadata: {'num_words': 19, 'corruption_level': 0.8984516776838924, 'scrambled_words': ['lla', 'het', 'aosssen', 'eth', 'msea', 'I', 'psrooep', 'ot', 'od', 'toshmeign', 'etrtbe', 'itlsl', 'amrnsTrfo', 'toni', 'aeht', 'a', 'tiooprn', 'fo', 'het'], 'original_words': ['all', 'the', 'seasons', 'the', 'same', 'I', 'propose', 'to', 'do', 'something', 'better', 'still', 'Transform', 'into', 'heat', 'a', 'portion', 'of', 'the']}
+Question: Unscramble these words: ni oiurnalmsj Well Cahs
+Answer: in journalism Well Cash
+Metadata: {'num_words': 4, 'corruption_level': 0.3288673442377109, 'scrambled_words': ['ni', 'oiurnalmsj', 'Well', 'Cahs'], 'original_words': ['in', 'journalism', 'Well', 'Cash']}
 
 Example 3:
-Question: Unscramble these words: od ubt si ti fo yna sue Waht ew need si csoudl ont iarn oG dais eh addressing
-Answer: do but is it of any use What we need is clouds not rain Go said he addressing
-Metadata: {'num_words': 18, 'corruption_level': 0.21786426698317396, 'scrambled_words': ['od', 'ubt', 'si', 'ti', 'fo', 'yna', 'sue', 'Waht', 'ew', 'need', 'si', 'csoudl', 'ont', 'iarn', 'oG', 'dais', 'eh', 'addressing'], 'original_words': ['do', 'but', 'is', 'it', 'of', 'any', 'use', 'What', 'we', 'need', 'is', 'clouds', 'not', 'rain', 'Go', 'said', 'he', 'addressing']}
+Question: Unscramble these words: dear rchAdbali keep no nSice yrstyedae atnhks ot oyu rheet si a gain fo sucrbbisesr rM
+Answer: dear Archibald keep on Since yesterday thanks to you there is a gain of subscribers Mr
+Metadata: {'num_words': 16, 'corruption_level': 0.516016391169858, 'scrambled_words': ['dear', 'rchAdbali', 'keep', 'no', 'nSice', 'yrstyedae', 'atnhks', 'ot', 'oyu', 'rheet', 'si', 'a', 'gain', 'fo', 'sucrbbisesr', 'rM'], 'original_words': ['dear', 'Archibald', 'keep', 'on', 'Since', 'yesterday', 'thanks', 'to', 'you', 'there', 'is', 'a', 'gain', 'of', 'subscribers', 'Mr']}
 
 ```
 
@@ -722,60 +771,61 @@ min_dist = 5
 max_dist = 10
 min_grid_size = 5
 max_grid_size = 10
-seed = None
+seed = 42
 size = 50
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Navigate from 'F' (start) to 'S' (goal):
+Question: Navigate from '3' (start) to 'z' (goal):
 
-```DDDDDDD
-D]D]]DD
-DD]DD]D
-DDS]]]D
-D]]D]]D
-D]]]]FD
-DDDDDDD```
-Legend: 'D' = Wall, ']' = Passage
+```>>>>>>>>>
+>eeee>e>>
+>ee>>>>>>
+>eeeeee>>
+>e>ee>>e>
+>>ez>3e>>
+>eee>e>e>
+>eeeee>e>
+>>>>>>>>>```
+Legend: '>' = Wall, 'e' = Passage
 
 What is the minimum number of steps to reach the goal?
-Answer: 5
-Metadata: {'grid_size': 7, 'grid': ['DDDDDDD', 'D]D]]DD', 'DD]DD]D', 'DDS]]]D', 'D]]D]]D', 'D]]]]FD', 'DDDDDDD'], 'shortest_path_length': 5, 'start': 'F', 'goal': 'S', 'wall': 'D', 'path': ']'}
+Answer: 6
+Metadata: {'grid_size': 9, 'grid': ['>>>>>>>>>', '>eeee>e>>', '>ee>>>>>>', '>eeeeee>>', '>e>ee>>e>', '>>ez>3e>>', '>eee>e>e>', '>eeeee>e>', '>>>>>>>>>'], 'shortest_path_length': 6, 'start': '3', 'goal': 'z', 'wall': '>', 'path': 'e'}
 
 Example 2:
-Question: Navigate from 'V' (start) to 'S' (goal):
+Question: Navigate from '`' (start) to 'i' (goal):
 
-```77777777
-77SUU777
-7U7UUUU7
-77UUU777
-7UU7UUU7
-77U7UUU7
-7UUU7UV7
-77777777```
-Legend: '7' = Wall, 'U' = Passage
+```4444444
+4AAAAi4
+4A4A4A4
+4A4AA44
+44AAAA4
+44A`444
+4444444```
+Legend: '4' = Wall, 'A' = Passage
 
 What is the minimum number of steps to reach the goal?
-Answer: 9
-Metadata: {'grid_size': 8, 'grid': ['77777777', '77SUU777', '7U7UUUU7', '77UUU777', '7UU7UUU7', '77U7UUU7', '7UUU7UV7', '77777777'], 'shortest_path_length': 9, 'start': 'V', 'goal': 'S', 'wall': '7', 'path': 'U'}
+Answer: 6
+Metadata: {'grid_size': 7, 'grid': ['4444444', '4AAAAi4', '4A4A4A4', '4A4AA44', '44AAAA4', '44A`444', '4444444'], 'shortest_path_length': 6, 'start': '`', 'goal': 'i', 'wall': '4', 'path': 'A'}
 
 Example 3:
-Question: Navigate from 'z' (start) to '4' (goal):
+Question: Navigate from '(' (start) to '`' (goal):
 
-```$$$$$$$
-$~~~~~$
-$$~$~~$
-$~$~$4$
-$$~~~~$
-$~z~~~$
-$$$$$$$```
-Legend: '$' = Wall, '~' = Passage
+```QQQQQQQ
+QQ%%%%Q
+QQ`%Q%Q
+Q%%Q%%Q
+Q%%%Q%Q
+Q%QQ%(Q
+QQQQQQQ```
+Legend: 'Q' = Wall, '%' = Passage
 
 What is the minimum number of steps to reach the goal?
-Answer: 5
-Metadata: {'grid_size': 7, 'grid': ['$$$$$$$', '$~~~~~$', '$$~$~~$', '$~$~$4$', '$$~~~~$', '$~z~~~$', '$$$$$$$'], 'shortest_path_length': 5, 'start': 'z', 'goal': '4', 'wall': '$', 'path': '~'}
+Answer: 8
+Metadata: {'grid_size': 7, 'grid': ['QQQQQQQ', 'QQ%%%%Q', 'QQ`%Q%Q', 'Q%%Q%%Q', 'Q%%%Q%Q', 'Q%QQ%(Q', 'QQQQQQQ'], 'shortest_path_length': 8, 'start': '(', 'goal': '`', 'wall': 'Q', 'path': '%'}
 
 ```
 
@@ -786,7 +836,7 @@ Default configuration:
 ```python
 min_empty = 8
 max_empty = 12
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -794,39 +844,39 @@ Example tasks:
 ```
 Example 1:
 Question: Solve this 4x4 Mini Sudoku puzzle:
-1 _ _ _
-_ 4 _ _
-_ _ _ 3
-_ _ 1 4
-Answer: 1 3 4 2
-2 4 3 1
-4 1 2 3
-3 2 1 4
-Metadata: {'puzzle': [[1, 0, 0, 0], [0, 4, 0, 0], [0, 0, 0, 3], [0, 0, 1, 4]], 'solution': [[1, 3, 4, 2], [2, 4, 3, 1], [4, 1, 2, 3], [3, 2, 1, 4]], 'num_empty': 11}
+_ _ _ _
+_ _ _ _
+_ 1 3 _
+_ 4 _ 1
+Answer: 4 2 1 3
+1 3 4 2
+2 1 3 4
+3 4 2 1
+Metadata: {'puzzle': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 3, 0], [0, 4, 0, 1]], 'solution': [[4, 2, 1, 3], [1, 3, 4, 2], [2, 1, 3, 4], [3, 4, 2, 1]], 'num_empty': 12}
 
 Example 2:
 Question: Solve this 4x4 Mini Sudoku puzzle:
-_ _ _ 2
-2 _ _ 4
-_ 4 _ _
-_ 2 4 _
-Answer: 4 3 1 2
-2 1 3 4
-1 4 2 3
-3 2 4 1
-Metadata: {'puzzle': [[0, 0, 0, 2], [2, 0, 0, 4], [0, 4, 0, 0], [0, 2, 4, 0]], 'solution': [[4, 3, 1, 2], [2, 1, 3, 4], [1, 4, 2, 3], [3, 2, 4, 1]], 'num_empty': 10}
+3 _ _ _
+_ _ 4 _
+4 2 _ _
+_ _ _ 4
+Answer: 3 4 1 2
+2 1 4 3
+4 2 3 1
+1 3 2 4
+Metadata: {'puzzle': [[3, 0, 0, 0], [0, 0, 4, 0], [4, 2, 0, 0], [0, 0, 0, 4]], 'solution': [[3, 4, 1, 2], [2, 1, 4, 3], [4, 2, 3, 1], [1, 3, 2, 4]], 'num_empty': 11}
 
 Example 3:
 Question: Solve this 4x4 Mini Sudoku puzzle:
-4 2 _ _
-3 _ 2 4
 _ _ _ _
-_ 4 3 2
-Answer: 4 2 1 3
+1 3 4 _
 3 1 2 4
-2 3 4 1
-1 4 3 2
-Metadata: {'puzzle': [[4, 2, 0, 0], [3, 0, 2, 4], [0, 0, 0, 0], [0, 4, 3, 2]], 'solution': [[4, 2, 1, 3], [3, 1, 2, 4], [2, 3, 4, 1], [1, 4, 3, 2]], 'num_empty': 8}
+4 _ _ _
+Answer: 2 4 1 3
+1 3 4 2
+3 1 2 4
+4 2 3 1
+Metadata: {'puzzle': [[0, 0, 0, 0], [1, 3, 4, 0], [3, 1, 2, 4], [4, 0, 0, 0]], 'solution': [[2, 4, 1, 3], [1, 3, 4, 2], [3, 1, 2, 4], [4, 2, 3, 1]], 'num_empty': 8}
 
 ```
 
@@ -841,26 +891,26 @@ min_decimals = 0
 max_decimals = 4
 min_value = -100.0
 max_value = 100.0
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Remove all numbers smaller than -78.527 in this list: ['-14.14', '10.92', '-56.57', '-56', '-84.8', '20']
-Answer: ['-14.14', '10.92', '-56.57', '-56', '20']
-Metadata: {'original_numbers': ['-14.14', '10.92', '-56.57', '-56', '-84.8', '20'], 'filter_value': '-78.527', 'operation': 'remove_smaller', 'result': ['-14.14', '10.92', '-56.57', '-56', '20']}
+Question: Keep all numbers larger than -90 in this list: ['-95.00', '-51.0', '47.2942', '-82.612']
+Answer: ['-51.0', '47.2942', '-82.612']
+Metadata: {'original_numbers': ['-95.00', '-51.0', '47.2942', '-82.612'], 'filter_value': '-90', 'operation': 'keep_larger', 'result': ['-51.0', '47.2942', '-82.612']}
 
 Example 2:
-Question: Remove all numbers larger than 19 in this list: ['20', '66', '-22.729', '-21.62', '-6.2198', '4', '34.0', '-43.9360', '98.011', '-1.2024']
-Answer: ['-22.729', '-21.62', '-6.2198', '4', '-43.9360', '-1.2024']
-Metadata: {'original_numbers': ['20', '66', '-22.729', '-21.62', '-6.2198', '4', '34.0', '-43.9360', '98.011', '-1.2024'], 'filter_value': '19', 'operation': 'remove_larger', 'result': ['-22.729', '-21.62', '-6.2198', '4', '-43.9360', '-1.2024']}
+Question: Remove all numbers larger than 18.236 in this list: ['-42.8', '91.88', '34']
+Answer: ['-42.8']
+Metadata: {'original_numbers': ['-42.8', '91.88', '34'], 'filter_value': '18.236', 'operation': 'remove_larger', 'result': ['-42.8']}
 
 Example 3:
-Question: Keep all numbers smaller than 2.319 in this list: ['99', '-21', '-77.530', '7', '-11', '87.2816', '94.319', '-36', '-25.7766', '30.013']
-Answer: ['-21', '-77.530', '-11', '-36', '-25.7766']
-Metadata: {'original_numbers': ['99', '-21', '-77.530', '7', '-11', '87.2816', '94.319', '-36', '-25.7766', '30.013'], 'filter_value': '2.319', 'operation': 'keep_smaller', 'result': ['-21', '-77.530', '-11', '-36', '-25.7766']}
+Question: Keep all numbers larger than 19.8962 in this list: ['4', '-64.7', '-42.1', '-77', '-79.9640', '37.76', '38.702', '18.20', '-28.34']
+Answer: ['37.76', '38.702']
+Metadata: {'original_numbers': ['4', '-64.7', '-42.1', '-77', '-79.9640', '37.76', '38.702', '18.20', '-28.34'], 'filter_value': '19.8962', 'operation': 'keep_larger', 'result': ['37.76', '38.702']}
 
 ```
 
@@ -874,26 +924,26 @@ max_terms = 8
 min_value = -100
 max_value = 100
 max_complexity = 3
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: 9, 4, 2, 1, 0, 0, 0, ?
-Answer: 0
-Metadata: {'rule': 'halve', 'complexity': 2, 'sequence': [9, 4, 2, 1, 0, 0, 0, 0]}
+Question: 3, 6, 12, 24, 48, 96, 192, 384, ?
+Answer: 768
+Metadata: {'rule': 'double', 'complexity': 3, 'sequence': [3, 6, 12, 24, 48, 96, 192, 384, 768]}
 
 Example 2:
-Question: -2, 1, 7, 19, 43, 91, 187, 379, ?
-Answer: 763
-Metadata: {'rule': 'double then add 5', 'complexity': 1, 'sequence': [-2, 1, 7, 19, 43, 91, 187, 379, 763]}
+Question: 8, 14, 20, 26, 32, 38, 44, ?
+Answer: 50
+Metadata: {'rule': 'add 6', 'complexity': 1, 'sequence': [8, 14, 20, 26, 32, 38, 44, 50]}
 
 Example 3:
-Question: 1, 0, 0, 0, 0, 0, 0, ?
+Question: 8, 4, 2, 1, 0, 0, 0, ?
 Answer: 0
-Metadata: {'rule': 'halve then multiply by 8', 'complexity': 1, 'sequence': [1, 0, 0, 0, 0, 0, 0, 0]}
+Metadata: {'rule': 'halve', 'complexity': 2, 'sequence': [8, 4, 2, 1, 0, 0, 0, 0]}
 
 ```
 
@@ -908,26 +958,26 @@ min_decimals = 0
 max_decimals = 2
 min_value = -100.0
 max_value = 100.0
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Sort these numbers in ascending order: -6.78, -92.30, 91.23, -77.49, 95.03, 74.19, 70.26, -67.10
-Answer: ['-92.30', '-77.49', '-67.10', '-6.78', '70.26', '74.19', '91.23', '95.03']
-Metadata: {'original_numbers': ['-6.78', '-92.30', '91.23', '-77.49', '95.03', '74.19', '70.26', '-67.10'], 'direction': 'ascending', 'sorted_numbers': ['-92.30', '-77.49', '-67.10', '-6.78', '70.26', '74.19', '91.23', '95.03']}
+Question: Sort these numbers in ascending order: 48, -51, -72, -80
+Answer: ['-80', '-72', '-51', '48']
+Metadata: {'original_numbers': ['48', '-51', '-72', '-80'], 'direction': 'ascending', 'sorted_numbers': ['-80', '-72', '-51', '48']}
 
 Example 2:
-Question: Sort these numbers in descending order: -10.32, 68.71, -89.59, 57.02, 12.29, -75.18, 49.79, -62.58, -58.82
-Answer: ['68.71', '57.02', '49.79', '12.29', '-10.32', '-58.82', '-62.58', '-75.18', '-89.59']
-Metadata: {'original_numbers': ['-10.32', '68.71', '-89.59', '57.02', '12.29', '-75.18', '49.79', '-62.58', '-58.82'], 'direction': 'descending', 'sorted_numbers': ['68.71', '57.02', '49.79', '12.29', '-10.32', '-58.82', '-62.58', '-75.18', '-89.59']}
+Question: Sort these numbers in ascending order: 39.2, -71.2, -7.5
+Answer: ['-71.2', '-7.5', '39.2']
+Metadata: {'original_numbers': ['39.2', '-71.2', '-7.5'], 'direction': 'ascending', 'sorted_numbers': ['-71.2', '-7.5', '39.2']}
 
 Example 3:
-Question: Sort these numbers in descending order: 10.13, 72.60, 72.13, 14.65, 1.16, -26.82, 55.17, 37.38, 76.73, -82.92
-Answer: ['76.73', '72.60', '72.13', '55.17', '37.38', '14.65', '10.13', '1.16', '-26.82', '-82.92']
-Metadata: {'original_numbers': ['10.13', '72.60', '72.13', '14.65', '1.16', '-26.82', '55.17', '37.38', '76.73', '-82.92'], 'direction': 'descending', 'sorted_numbers': ['76.73', '72.60', '72.13', '55.17', '37.38', '14.65', '10.13', '1.16', '-26.82', '-82.92']}
+Question: Sort these numbers in descending order: 8.39, 72.41, -64.67, -54.97, -94.18, -76.67, -98.24, -68.66, 2.74
+Answer: ['72.41', '8.39', '2.74', '-54.97', '-64.67', '-68.66', '-76.67', '-94.18', '-98.24']
+Metadata: {'original_numbers': ['8.39', '72.41', '-64.67', '-54.97', '-94.18', '-76.67', '-98.24', '-68.66', '2.74'], 'direction': 'descending', 'sorted_numbers': ['72.41', '8.39', '2.74', '-54.97', '-64.67', '-68.66', '-76.67', '-94.18', '-98.24']}
 
 ```
 
@@ -946,26 +996,26 @@ max_value = 100
 min_degree = 1
 max_degree = 3
 operators = ('+', '-')
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Determine the real value(s) of a tha satisfies: -35*a**2 = 0
+Question: Find the real value(s) of u in the equation: -127*u = 0
 Answer: [0.0]
-Metadata: {'polynomial_expr': '-35*a**2', 'variable': 'a', 'degree': 2, 'real_solutions': [0.0]}
+Metadata: {'polynomial_expr': '-127*u', 'variable': 'u', 'degree': 1, 'real_solutions': [0.0]}
 
 Example 2:
-Question: Solve for real l: 27*l**2 + 175*l - 1 = 0
-Answer: [-6.487190738158517, 0.005709256677035911]
-Metadata: {'polynomial_expr': '27*l**2 + 175*l - 1', 'variable': 'l', 'degree': 2, 'real_solutions': [-6.487190738158517, 0.005709256677035911]}
+Question: Determine the real value(s) of b that satisfies: 86*b**2 - 2*b - 13 = 0
+Answer: [-0.3773425275273891, 0.4005983414808775]
+Metadata: {'polynomial_expr': '86*b**2 - 2*b - 13', 'variable': 'b', 'degree': 2, 'real_solutions': [-0.3773425275273891, 0.4005983414808775]}
 
 Example 3:
-Question: Find the real value(s) of t in the equation: 94 - 9*t**2 = 0
-Answer: [-3.2317865716108862, 3.2317865716108862]
-Metadata: {'polynomial_expr': '94 - 9*t**2', 'variable': 't', 'degree': 2, 'real_solutions': [-3.2317865716108862, 3.2317865716108862]}
+Question: Determine the real value(s) of n that satisfies: 71*n**3 - 2*n - 29 = 0
+Answer: [0.7546129960163634]
+Metadata: {'polynomial_expr': '71*n**3 - 2*n - 29', 'variable': 'n', 'degree': 3, 'real_solutions': [0.7546129960163634]}
 
 ```
 
@@ -976,26 +1026,26 @@ Default configuration:
 ```python
 min_value = 2
 max_value = 1000
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Find the prime factorization of 973. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
-Answer: 7 × 139
-Metadata: {'number': 973, 'factors': [7, 139]}
+Question: Find the prime factorization of 656. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
+Answer: 2 × 2 × 2 × 2 × 41
+Metadata: {'number': 656, 'factors': [2, 2, 2, 2, 41]}
 
 Example 2:
-Question: Find the prime factorization of 153. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
-Answer: 3 × 3 × 17
-Metadata: {'number': 153, 'factors': [3, 3, 17]}
+Question: Find the prime factorization of 41. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
+Answer: 41
+Metadata: {'number': 41, 'factors': [41]}
 
 Example 3:
-Question: Find the prime factorization of 390. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
-Answer: 2 × 3 × 5 × 13
-Metadata: {'number': 390, 'factors': [2, 3, 5, 13]}
+Question: Find the prime factorization of 420. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
+Answer: 2 × 2 × 3 × 5 × 7
+Metadata: {'number': 420, 'factors': [2, 2, 3, 5, 7]}
 
 ```
 
@@ -1009,7 +1059,7 @@ max_vars = 4
 min_statements = 2
 max_statements = 4
 max_complexity = 3
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -1017,30 +1067,30 @@ Example tasks:
 ```
 Example 1:
 Question: Given:
-1. (Q → P)
-2. (P → P)
-3. ((P ∨ Q) ↔ (P ↔ Q))
-4. (Q ∨ P)
+1. R
+2. Q
 What can we conclude?
-Answer: (P ∧ P)
-Metadata: {'premises': ['(Q → P)', '(P → P)', '((P ∨ Q) ↔ (P ↔ Q))', '(Q ∨ P)'], 'variables': ['P', 'Q'], 'complexity': 3}
+Answer: (P ∨ Q)
+Metadata: {'premises': ['R', 'Q'], 'variables': ['P', 'Q', 'R', 'S'], 'complexity': 3}
 
 Example 2:
 Question: Given:
-1. P
-2. ¬(P ∧ P)
-3. Q
+1. ((Q → P) ∨ (Q → P))
+2. ((Q ↔ Q) → (P → P))
+3. P
 What can we conclude?
-Answer: (P ∧ P)
-Metadata: {'premises': ['P', '¬(P ∧ P)', 'Q'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
+Answer: (P → P)
+Metadata: {'premises': ['((Q → P) ∨ (Q → P))', '((Q ↔ Q) → (P → P))', 'P'], 'variables': ['P', 'Q'], 'complexity': 3}
 
 Example 3:
 Question: Given:
-1. ¬(R → P)
-2. ¬P
+1. ((Q ∨ P) ∧ ¬P)
+2. P
+3. ((P ∧ R) ∧ ¬R)
+4. ((Q ↔ R) → ¬Q)
 What can we conclude?
-Answer: (Q ↔ Q)
-Metadata: {'premises': ['¬(R → P)', '¬P'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
+Answer: (Q ∧ Q)
+Metadata: {'premises': ['((Q ∨ P) ∧ ¬P)', 'P', '((P ∧ R) ∧ ¬R)', '((Q ↔ R) → ¬Q)'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
 
 ```
 
@@ -1050,7 +1100,7 @@ Generates QuantumLock tasks
 Default configuration:
 ```python
 difficulty = 10
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -1061,39 +1111,39 @@ Question: In front of you are some buttons, a light, and a number. The light wil
 You must press the shortest correct sequence of buttons to reach the target value.
 
 Start: 0 (red)
-Target: 38
+Target: 46
 Buttons:
-A: Multiply 2 (when any)
-B: Add 2 (when red)
-C: Multiply 3 (when any)
-Answer: B → A → C → C → B
-Metadata: {'difficulty': 10, 'solution_path': ['B', 'A', 'C', 'C', 'B'], 'target_value': 38, 'buttons': [{'name': 'A', 'type': 'multiply', 'value': 2, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'red'}, {'name': 'C', 'type': 'multiply', 'value': 3, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
+A: Add 3 (when any)
+B: Add 2 (when any)
+C: Multiply 2 (when any)
+Answer: A → B → C → C → A → C
+Metadata: {'difficulty': 10, 'solution_path': ['A', 'B', 'C', 'C', 'A', 'C'], 'target_value': 46, 'buttons': [{'name': 'A', 'type': 'add', 'value': 3, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'any'}, {'name': 'C', 'type': 'multiply', 'value': 2, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
 
 Example 2:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
 You must press the shortest correct sequence of buttons to reach the target value.
 
 Start: 0 (red)
-Target: 42
+Target: 30
 Buttons:
-A: Multiply 3 (when any)
-B: Add 2 (when any)
-C: Add 3 (when any)
-Answer: B → B → A → B → A
-Metadata: {'difficulty': 10, 'solution_path': ['B', 'B', 'A', 'B', 'A'], 'target_value': 42, 'buttons': [{'name': 'A', 'type': 'multiply', 'value': 3, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'any'}, {'name': 'C', 'type': 'add', 'value': 3, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
+A: Add 2 (when green)
+B: Subtract 3 (when red)
+C: Multiply 2 (when red)
+Answer: C → A → C → A → C → A → C → A
+Metadata: {'difficulty': 10, 'solution_path': ['C', 'A', 'C', 'A', 'C', 'A', 'C', 'A'], 'target_value': 30, 'buttons': [{'name': 'A', 'type': 'add', 'value': 2, 'active_state': 'green'}, {'name': 'B', 'type': 'subtract', 'value': 3, 'active_state': 'red'}, {'name': 'C', 'type': 'multiply', 'value': 2, 'active_state': 'red'}], 'initial_state': 'red', 'initial_value': 0}
 
 Example 3:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
 You must press the shortest correct sequence of buttons to reach the target value.
 
 Start: 0 (red)
-Target: 35
+Target: 45
 Buttons:
-A: Multiply 3 (when red)
-B: Add 2 (when green)
-C: Subtract 3 (when any)
-Answer: A → B → A → C → A → B → A → B
-Metadata: {'difficulty': 10, 'solution_path': ['A', 'B', 'A', 'C', 'A', 'B', 'A', 'B'], 'target_value': 35, 'buttons': [{'name': 'A', 'type': 'multiply', 'value': 3, 'active_state': 'red'}, {'name': 'B', 'type': 'add', 'value': 2, 'active_state': 'green'}, {'name': 'C', 'type': 'subtract', 'value': 3, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
+A: Subtract 2 (when any)
+B: Add 3 (when any)
+C: Add 2 (when any)
+Answer: B → B → B → B → B → B → B → B → B → B → B → B → B → B → B
+Metadata: {'difficulty': 10, 'solution_path': ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'], 'target_value': 45, 'buttons': [{'name': 'A', 'type': 'subtract', 'value': 2, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 3, 'active_state': 'any'}, {'name': 'C', 'type': 'add', 'value': 2, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
 
 ```
 
@@ -1105,7 +1155,7 @@ Default configuration:
 scramble_steps = 3
 cube_size = 3
 remove_ansi = True
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -1114,56 +1164,56 @@ Example tasks:
 Example 1:
 Question: You are given a 3x3x3 Rubik's cube. It looks like this:
 
-          Y  Y  Y
-          Y  Y  Y
-          Y  Y  Y
- G  G  G  O  O  O  B  B  B  R  R  R
- R  R  R  G  G  G  O  O  O  B  B  B
- R  R  R  G  G  G  O  O  O  B  B  B
-          W  W  W
-          W  W  W
-          W  W  W
-
+          G  Y  G                   
+          G  Y  G                   
+          G  R  G                   
+ W  W  W  O  G  O  Y  Y  Y  R  B  R 
+ R  R  R  W  G  W  O  O  O  Y  B  Y 
+ R  R  R  W  G  W  O  O  O  Y  B  Y 
+          B  O  B                   
+          B  W  B                   
+          B  W  B                   
+ 
 
 Please provide a solution to solve this cube using Singmaster notation.
 Answer: None
-Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U L L'", 'example_correct_answer': "U'"}
+Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "F L' R", 'example_correct_answer': "L F' U' R D B' D' U R U' R' U B U' B' U' R' U R U B U' B' U R' U R U B U' B' U' B' U B U L U' L' U' B' U B U L U' L' U B' U B U L U' L' F R U R' U' F' U' R U R' U R U U R' F U' B' U F' U' B R' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D U R' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D U"}
 
 Example 2:
-Question: You see a size 3 Rubik's cube. It is arranged this:
+Question: You are given a 3x3x3 Rubik's cube. It looks like this:
 
-          Y  Y  O
-          Y  Y  O
-          Y  Y  B
- R  R  R  G  G  Y  O  G  G  W  B  B
- R  R  Y  O  G  G  W  O  O  B  B  B
- R  R  Y  O  G  G  W  O  O  B  B  B
-          G  R  R
-          W  W  W
-          W  W  W
+          Y  Y  R                   
+          Y  Y  R                   
+          G  G  R                   
+ B  B  Y  R  R  B  W  W  W  G  O  O 
+ R  R  W  G  G  G  Y  O  O  B  B  Y 
+ R  R  W  G  G  G  Y  O  O  B  B  Y 
+          O  O  O                   
+          B  W  W                   
+          B  W  W                   
+ 
 
-
-Please provide a solution to solve this cube.
+Please provide a solution to solve this cube using Singmaster notation.
 Answer: None
-Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U F' U'", 'example_correct_answer': "U F U'"}
+Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "L' F U'", 'example_correct_answer': "U' D' B D L' U' F D R' D' U' R U' R' F' U U F U F U' F' U' L' U L U F U' F' U L' U L U F U' F' R U' R' U' F' U F R' U R U B U' B' U' U' B' U B U L U' L' F R U R' U' R U R' U' F' U R U R' U R U U R' U' R U R' U R U U R' U' R U' L' U R' U' L U F U' B' U F' U' B R' D' R D R' D' R D U U R' D' R D R' D' R D U R' D' R D R' D' R D U"}
 
 Example 3:
-Question: You see a size 3 Rubik's cube. It is arranged this:
+Question: You are given a 3x3x3 Rubik's cube. It looks like this:
 
-          R  R  R
-          B  Y  Y
-          O  O  O
- G  R  Y  G  G  G  W  O  B  W  W  W
- W  R  Y  G  G  G  W  O  Y  B  B  B
- W  R  B  Y  Y  Y  G  O  Y  B  B  B
-          R  R  R
-          G  W  W
-          O  O  O
+          Y  Y  W                   
+          Y  Y  W                   
+          Y  Y  W                   
+ G  G  G  O  O  B  O  O  O  G  R  R 
+ R  R  R  G  G  B  O  O  O  G  B  B 
+ R  R  R  G  G  R  B  B  B  O  B  B 
+          W  W  Y                   
+          W  W  Y                   
+          W  W  Y                   
+ 
 
-
-Please provide a solution to solve this cube.
+Please provide a solution to solve this cube using Singmaster notation.
 Answer: None
-Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "L B' F'", 'example_correct_answer': "B L' F U F U' F' U F R U R' U' F' R U R' U R U U R' U' R U R' U R U U R' U' L U' R' U L' U' R U L U' R' U L' U' D' R D R' D' R D R' D' R D R' D' R D U R' D' R D R' D' R D U'"}
+Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U R' R'", 'example_correct_answer': "R R U'"}
 
 ```
 
@@ -1174,26 +1224,26 @@ Default configuration:
 ```python
 min_words_in_sentence = 3
 max_words_in_sentence = 20
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Restore the correct order of words in the following sentence: thing first that Mr. The
-Answer: The first thing that Mr.
-Metadata: {'word_count': 5}
+Question: Restore the correct order of words in the following sentence: wish could get I sleep. "I some
+Answer: "I wish I could get some sleep.
+Metadata: {'word_count': 7}
 
 Example 2:
-Question: Restore the correct order of words in the following sentence: shall The to called be the attention of government the matter. Chinese
-Answer: The attention of the the Chinese government shall be called to matter.
-Metadata: {'word_count': 12}
+Question: Restore the correct order of words in the following sentence: the high level name. itself its unable it maintain at was of to Unfortunately,
+Answer: Unfortunately, it was unable to maintain itself at the high level of its name.
+Metadata: {'word_count': 14}
 
 Example 3:
-Question: Restore the correct order of words in the following sentence: wonderful we are the accumulators. indebted instruments those new for Jackson To
-Answer: To Jackson we are indebted for those wonderful instruments the new accumulators.
-Metadata: {'word_count': 12}
+Question: Restore the correct order of words in the following sentence: developed by For the unutilized. energy falls ages went the
+Answer: For ages the the energy developed by falls went unutilized.
+Metadata: {'word_count': 10}
 
 ```
 
@@ -1207,26 +1257,26 @@ max_terms = 4
 min_value = 1
 max_value = 100
 operators = ('+', '-', '*')
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Solve for j: 69 - 47*j = -4020
-Answer: 87
-Metadata: {'equation': '69 - 47*j = -4020', 'variable': 'j'}
+Question: Determine the value of u that satisfies: 32*u + 4 = 580
+Answer: 18
+Metadata: {'equation': '32*u + 4 = 580', 'variable': 'u'}
 
 Example 2:
-Question: Solve for o: 210000*o + 98 = 840098
-Answer: 4
-Metadata: {'equation': '210000*o + 98 = 840098', 'variable': 'o'}
+Question: Solve for b: 82080*b = 1067040
+Answer: 13
+Metadata: {'equation': '82080*b = 1067040', 'variable': 'b'}
 
 Example 3:
-Question: Find the value of a in the equation: 6930*a = 297990
-Answer: 43
-Metadata: {'equation': '6930*a = 297990', 'variable': 'a'}
+Question: Determine the value of n that satisfies: 29*n - 5 = 430
+Answer: 15
+Metadata: {'equation': '29*n - 5 = 430', 'variable': 'n'}
 
 ```
 
@@ -1236,26 +1286,26 @@ Generates tasks to spell words backward
 Default configuration:
 ```python
 min_word_len = 3
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Spell this word backward (example: sun -> nus): only
-Answer: ylno
-Metadata: {'word': 'only', 'word_len': 4}
+Question: Spell this word backward (example: sun -> nus): Project
+Answer: tcejorP
+Metadata: {'word': 'Project', 'word_len': 7}
 
 Example 2:
-Question: Spell this word backward (example: sun -> nus): from
-Answer: morf
-Metadata: {'word': 'from', 'word_len': 4}
+Question: Spell this word backward (example: sun -> nus): Would
+Answer: dluoW
+Metadata: {'word': 'Would', 'word_len': 5}
 
 Example 3:
-Question: Spell this word backward (example: sun -> nus): anxiously
-Answer: ylsuoixna
-Metadata: {'word': 'anxiously', 'word_len': 9}
+Question: Spell this word backward (example: sun -> nus): One
+Answer: enO
+Metadata: {'word': 'One', 'word_len': 3}
 
 ```
 
@@ -1266,7 +1316,7 @@ Default configuration:
 ```python
 min_empty = 30
 max_empty = 50
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -1274,69 +1324,69 @@ Example tasks:
 ```
 Example 1:
 Question: Solve this Sudoku puzzle:
-_ 8 _ 2 _ _ _ _ 3
-_ _ 4 _ 7 _ _ 8 9
-2 5 6 3 _ _ _ 4 7
-_ _ 8 _ 6 _ 9 5 _
-9 _ 2 7 _ 5 _ _ _
-3 6 _ _ 2 9 8 _ _
-_ 4 3 _ 5 2 7 _ _
-_ _ 1 _ _ _ 4 2 8
-6 2 _ 8 4 1 3 9 5
-Answer: 7 8 9 2 1 4 5 6 3
-1 3 4 5 7 6 2 8 9
-2 5 6 3 9 8 1 4 7
-4 7 8 1 6 3 9 5 2
-9 1 2 7 8 5 6 3 4
-3 6 5 4 2 9 8 7 1
-8 4 3 9 5 2 7 1 6
-5 9 1 6 3 7 4 2 8
-6 2 7 8 4 1 3 9 5
-Metadata: {'puzzle': [[0, 8, 0, 2, 0, 0, 0, 0, 3], [0, 0, 4, 0, 7, 0, 0, 8, 9], [2, 5, 6, 3, 0, 0, 0, 4, 7], [0, 0, 8, 0, 6, 0, 9, 5, 0], [9, 0, 2, 7, 0, 5, 0, 0, 0], [3, 6, 0, 0, 2, 9, 8, 0, 0], [0, 4, 3, 0, 5, 2, 7, 0, 0], [0, 0, 1, 0, 0, 0, 4, 2, 8], [6, 2, 0, 8, 4, 1, 3, 9, 5]], 'solution': [[7, 8, 9, 2, 1, 4, 5, 6, 3], [1, 3, 4, 5, 7, 6, 2, 8, 9], [2, 5, 6, 3, 9, 8, 1, 4, 7], [4, 7, 8, 1, 6, 3, 9, 5, 2], [9, 1, 2, 7, 8, 5, 6, 3, 4], [3, 6, 5, 4, 2, 9, 8, 7, 1], [8, 4, 3, 9, 5, 2, 7, 1, 6], [5, 9, 1, 6, 3, 7, 4, 2, 8], [6, 2, 7, 8, 4, 1, 3, 9, 5]], 'num_empty': 38}
+4 _ _ _ 5 2 _ 3 _
+_ _ 3 4 6 _ _ _ _
+6 1 2 _ _ 8 4 _ _
+1 _ _ _ _ _ 7 9 5
+3 _ _ 7 1 _ _ 2 6
+7 _ _ 5 _ _ _ _ 3
+2 _ _ _ 7 5 _ _ _
+_ 3 _ _ 4 1 _ _ _
+_ _ _ 2 8 _ _ _ 4
+Answer: 4 7 8 1 5 2 6 3 9
+5 9 3 4 6 7 2 8 1
+6 1 2 3 9 8 4 5 7
+1 2 4 8 3 6 7 9 5
+3 5 9 7 1 4 8 2 6
+7 8 6 5 2 9 1 4 3
+2 4 1 9 7 5 3 6 8
+8 3 5 6 4 1 9 7 2
+9 6 7 2 8 3 5 1 4
+Metadata: {'puzzle': [[4, 0, 0, 0, 5, 2, 0, 3, 0], [0, 0, 3, 4, 6, 0, 0, 0, 0], [6, 1, 2, 0, 0, 8, 4, 0, 0], [1, 0, 0, 0, 0, 0, 7, 9, 5], [3, 0, 0, 7, 1, 0, 0, 2, 6], [7, 0, 0, 5, 0, 0, 0, 0, 3], [2, 0, 0, 0, 7, 5, 0, 0, 0], [0, 3, 0, 0, 4, 1, 0, 0, 0], [0, 0, 0, 2, 8, 0, 0, 0, 4]], 'solution': [[4, 7, 8, 1, 5, 2, 6, 3, 9], [5, 9, 3, 4, 6, 7, 2, 8, 1], [6, 1, 2, 3, 9, 8, 4, 5, 7], [1, 2, 4, 8, 3, 6, 7, 9, 5], [3, 5, 9, 7, 1, 4, 8, 2, 6], [7, 8, 6, 5, 2, 9, 1, 4, 3], [2, 4, 1, 9, 7, 5, 3, 6, 8], [8, 3, 5, 6, 4, 1, 9, 7, 2], [9, 6, 7, 2, 8, 3, 5, 1, 4]], 'num_empty': 48}
 
 Example 2:
 Question: Solve this Sudoku puzzle:
-5 _ _ _ 3 4 _ 6 _
-_ _ 3 _ _ _ _ _ _
-_ _ 8 5 9 _ _ _ 2
-_ 5 7 6 4 _ _ 8 _
-_ 4 6 _ _ _ _ 5 3
-_ 3 _ _ _ 5 _ _ _
-6 8 1 _ _ 9 _ _ _
-_ 9 5 _ 2 _ _ 4 _
-_ 2 _ _ 8 6 1 9 5
-Answer: 5 7 2 1 3 4 8 6 9
-9 1 3 2 6 8 5 7 4
-4 6 8 5 9 7 3 1 2
-2 5 7 6 4 3 9 8 1
-8 4 6 9 1 2 7 5 3
-1 3 9 8 7 5 4 2 6
-6 8 1 4 5 9 2 3 7
-3 9 5 7 2 1 6 4 8
-7 2 4 3 8 6 1 9 5
-Metadata: {'puzzle': [[5, 0, 0, 0, 3, 4, 0, 6, 0], [0, 0, 3, 0, 0, 0, 0, 0, 0], [0, 0, 8, 5, 9, 0, 0, 0, 2], [0, 5, 7, 6, 4, 0, 0, 8, 0], [0, 4, 6, 0, 0, 0, 0, 5, 3], [0, 3, 0, 0, 0, 5, 0, 0, 0], [6, 8, 1, 0, 0, 9, 0, 0, 0], [0, 9, 5, 0, 2, 0, 0, 4, 0], [0, 2, 0, 0, 8, 6, 1, 9, 5]], 'solution': [[5, 7, 2, 1, 3, 4, 8, 6, 9], [9, 1, 3, 2, 6, 8, 5, 7, 4], [4, 6, 8, 5, 9, 7, 3, 1, 2], [2, 5, 7, 6, 4, 3, 9, 8, 1], [8, 4, 6, 9, 1, 2, 7, 5, 3], [1, 3, 9, 8, 7, 5, 4, 2, 6], [6, 8, 1, 4, 5, 9, 2, 3, 7], [3, 9, 5, 7, 2, 1, 6, 4, 8], [7, 2, 4, 3, 8, 6, 1, 9, 5]], 'num_empty': 47}
+_ _ _ 1 3 2 6 4 5
+_ 4 _ 7 _ _ _ 9 1
+_ _ 1 8 _ 9 _ _ _
+_ 8 9 _ _ _ 7 5 4
+_ 3 _ 4 _ 1 9 8 _
+4 6 _ 5 9 _ 1 2 3
+5 _ 4 9 1 7 3 _ _
+9 7 6 _ 8 4 5 1 _
+8 _ 3 _ _ _ 4 7 _
+Answer: 7 9 8 1 3 2 6 4 5
+3 4 2 7 5 6 8 9 1
+6 5 1 8 4 9 2 3 7
+1 8 9 6 2 3 7 5 4
+2 3 5 4 7 1 9 8 6
+4 6 7 5 9 8 1 2 3
+5 2 4 9 1 7 3 6 8
+9 7 6 3 8 4 5 1 2
+8 1 3 2 6 5 4 7 9
+Metadata: {'puzzle': [[0, 0, 0, 1, 3, 2, 6, 4, 5], [0, 4, 0, 7, 0, 0, 0, 9, 1], [0, 0, 1, 8, 0, 9, 0, 0, 0], [0, 8, 9, 0, 0, 0, 7, 5, 4], [0, 3, 0, 4, 0, 1, 9, 8, 0], [4, 6, 0, 5, 9, 0, 1, 2, 3], [5, 0, 4, 9, 1, 7, 3, 0, 0], [9, 7, 6, 0, 8, 4, 5, 1, 0], [8, 0, 3, 0, 0, 0, 4, 7, 0]], 'solution': [[7, 9, 8, 1, 3, 2, 6, 4, 5], [3, 4, 2, 7, 5, 6, 8, 9, 1], [6, 5, 1, 8, 4, 9, 2, 3, 7], [1, 8, 9, 6, 2, 3, 7, 5, 4], [2, 3, 5, 4, 7, 1, 9, 8, 6], [4, 6, 7, 5, 9, 8, 1, 2, 3], [5, 2, 4, 9, 1, 7, 3, 6, 8], [9, 7, 6, 3, 8, 4, 5, 1, 2], [8, 1, 3, 2, 6, 5, 4, 7, 9]], 'num_empty': 34}
 
 Example 3:
 Question: Solve this Sudoku puzzle:
-9 8 6 _ _ _ _ _ 3
-4 _ _ _ _ _ _ 6 _
-_ _ 3 6 7 _ _ _ 8
-_ _ 9 _ _ 3 6 _ _
-_ _ _ _ _ _ 7 4 2
-_ _ _ 4 _ _ _ _ _
-_ _ 2 5 _ _ _ 1 _
-_ 3 1 _ 4 6 8 9 7
-7 9 _ 8 _ _ _ _ 6
-Answer: 9 8 6 1 2 4 5 7 3
-4 2 7 3 8 5 1 6 9
-1 5 3 6 7 9 4 2 8
-2 4 9 7 1 3 6 8 5
-3 1 5 9 6 8 7 4 2
-6 7 8 4 5 2 9 3 1
-8 6 2 5 9 7 3 1 4
-5 3 1 2 4 6 8 9 7
-7 9 4 8 3 1 2 5 6
-Metadata: {'puzzle': [[9, 8, 6, 0, 0, 0, 0, 0, 3], [4, 0, 0, 0, 0, 0, 0, 6, 0], [0, 0, 3, 6, 7, 0, 0, 0, 8], [0, 0, 9, 0, 0, 3, 6, 0, 0], [0, 0, 0, 0, 0, 0, 7, 4, 2], [0, 0, 0, 4, 0, 0, 0, 0, 0], [0, 0, 2, 5, 0, 0, 0, 1, 0], [0, 3, 1, 0, 4, 6, 8, 9, 7], [7, 9, 0, 8, 0, 0, 0, 0, 6]], 'solution': [[9, 8, 6, 1, 2, 4, 5, 7, 3], [4, 2, 7, 3, 8, 5, 1, 6, 9], [1, 5, 3, 6, 7, 9, 4, 2, 8], [2, 4, 9, 7, 1, 3, 6, 8, 5], [3, 1, 5, 9, 6, 8, 7, 4, 2], [6, 7, 8, 4, 5, 2, 9, 3, 1], [8, 6, 2, 5, 9, 7, 3, 1, 4], [5, 3, 1, 2, 4, 6, 8, 9, 7], [7, 9, 4, 8, 3, 1, 2, 5, 6]], 'num_empty': 50}
+_ _ 1 2 3 _ _ _ 9
+3 _ _ 1 8 5 6 7 2
+_ _ _ 4 9 6 1 _ _
+1 _ 5 7 _ _ 9 2 _
+_ 4 _ _ 5 9 7 1 6
+9 _ 6 _ 1 _ 4 5 3
+_ _ 3 9 7 _ 2 8 4
+_ _ 2 6 4 _ _ 9 1
+_ 1 _ 5 2 8 3 _ _
+Answer: 5 6 1 2 3 7 8 4 9
+3 9 4 1 8 5 6 7 2
+8 2 7 4 9 6 1 3 5
+1 3 5 7 6 4 9 2 8
+2 4 8 3 5 9 7 1 6
+9 7 6 8 1 2 4 5 3
+6 5 3 9 7 1 2 8 4
+7 8 2 6 4 3 5 9 1
+4 1 9 5 2 8 3 6 7
+Metadata: {'puzzle': [[0, 0, 1, 2, 3, 0, 0, 0, 9], [3, 0, 0, 1, 8, 5, 6, 7, 2], [0, 0, 0, 4, 9, 6, 1, 0, 0], [1, 0, 5, 7, 0, 0, 9, 2, 0], [0, 4, 0, 0, 5, 9, 7, 1, 6], [9, 0, 6, 0, 1, 0, 4, 5, 3], [0, 0, 3, 9, 7, 0, 2, 8, 4], [0, 0, 2, 6, 4, 0, 0, 9, 1], [0, 1, 0, 5, 2, 8, 3, 0, 0]], 'solution': [[5, 6, 1, 2, 3, 7, 8, 4, 9], [3, 9, 4, 1, 8, 5, 6, 7, 2], [8, 2, 7, 4, 9, 6, 1, 3, 5], [1, 3, 5, 7, 6, 4, 9, 2, 8], [2, 4, 8, 3, 5, 9, 7, 1, 6], [9, 7, 6, 8, 1, 2, 4, 5, 3], [6, 5, 3, 9, 7, 1, 2, 8, 4], [7, 8, 2, 6, 4, 3, 5, 9, 1], [4, 1, 9, 5, 2, 8, 3, 6, 7]], 'num_empty': 33}
 
 ```
 
@@ -1352,7 +1402,7 @@ allow_some = True
 allow_some_not = True
 include_invalid = True
 invalid_ratio = 0.3
-seed = None
+seed = 42
 size = 500
 ```
 
@@ -1360,36 +1410,36 @@ Example tasks:
 ```
 Example 1:
 Question: Consider these statements:
-1. Some humans are reptiles
-2. Some reptiles are insects
+1. No students are humans
+2. No humans are chefs
 
 Does it logically follow that:
-Some ... are not humans are insects?
+No students are chefs?
 (Answer Yes or No)
-Answer: No
-Metadata: {'premise1': 'Some humans are reptiles', 'premise2': 'Some reptiles are insects', 'conclusion': 'Some ... are not humans are insects', 'is_valid': False}
+Answer: Yes
+Metadata: {'premise1': 'No students are humans', 'premise2': 'No humans are chefs', 'conclusion': 'No students are chefs', 'is_valid': True}
 
 Example 2:
 Question: Consider these statements:
-1. All mortals are teachers
-2. Some teachers are ants
+1. Some ... are not children are animals
+2. Some animals are doctors
 
 Does it logically follow that:
-Some ... are not mortals are ants?
+All children are doctors?
 (Answer Yes or No)
 Answer: Yes
-Metadata: {'premise1': 'All mortals are teachers', 'premise2': 'Some teachers are ants', 'conclusion': 'Some ... are not mortals are ants', 'is_valid': True}
+Metadata: {'premise1': 'Some ... are not children are animals', 'premise2': 'Some animals are doctors', 'conclusion': 'All children are doctors', 'is_valid': True}
 
 Example 3:
 Question: Consider these statements:
-1. No mortals are whales
-2. No whales are bees
+1. All butterflies are tigers
+2. No tigers are whales
 
 Does it logically follow that:
-No mortals are bees?
+Some ... are not butterflies are whales?
 (Answer Yes or No)
 Answer: No
-Metadata: {'premise1': 'No mortals are whales', 'premise2': 'No whales are bees', 'conclusion': 'No mortals are bees', 'is_valid': False}
+Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are whales', 'conclusion': 'Some ... are not butterflies are whales', 'is_valid': False}
 
 ```
 
@@ -1400,26 +1450,26 @@ Default configuration:
 ```python
 min_words = 3
 max_words = 8
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Reverse this list of words: Africa, harmless, moral
-Answer: moral, harmless, Africa
-Metadata: {'num_words': 3, 'words': ['Africa', 'harmless', 'moral']}
+Question: Reverse this list of words: bed, if, problem, but, Well, an, transmission, nutritive
+Answer: nutritive, transmission, an, Well, but, problem, if, bed
+Metadata: {'num_words': 8, 'words': ['bed', 'if', 'problem', 'but', 'Well', 'an', 'transmission', 'nutritive']}
 
 Example 2:
-Question: Reverse this list of words: efforts, well, set, these, back, Her, for
-Answer: for, Her, back, these, set, well, efforts
-Metadata: {'num_words': 7, 'words': ['efforts', 'well', 'set', 'these', 'back', 'Her', 'for']}
+Question: Reverse this list of words: it, pleasure, Gutenberg
+Answer: Gutenberg, pleasure, it
+Metadata: {'num_words': 3, 'words': ['it', 'pleasure', 'Gutenberg']}
 
 Example 3:
-Question: Reverse this list of words: fellow, compliance, few, which, in, famous, Not
-Answer: Not, famous, in, which, few, compliance, fellow
-Metadata: {'num_words': 7, 'words': ['fellow', 'compliance', 'few', 'which', 'in', 'famous', 'Not']}
+Question: Reverse this list of words: readable, to, he, that, to, possession
+Answer: possession, to, that, he, to, readable
+Metadata: {'num_words': 6, 'words': ['readable', 'to', 'he', 'that', 'to', 'possession']}
 
 ```
 
@@ -1433,28 +1483,30 @@ max_words = 10
 min_word_length = 3
 max_word_length = 12
 transformation = original
-seed = None
+seed = 42
 size = 500
 ```
 
 Example tasks:
 ```
 Example 1:
-Question: Sort these words in descending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-prepare, provide, speak, surplus, after, unlink, change, 000
-Answer: unlink, surplus, speak, provide, prepare, change, after, 000
-Metadata: {'original_words': ['prepare', 'provide', 'speak', 'surplus', 'after', 'unlink', 'change', '000'], 'transformed_words': ['prepare', 'provide', 'speak', 'surplus', 'after', 'unlink', 'change', '000'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['unlink', 'surplus', 'speak', 'provide', 'prepare', 'change', 'after', '000']}
+Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
+due, ever, many, generations
+Answer: due, ever, generations, many
+Metadata: {'original_words': ['due', 'ever', 'many', 'generations'], 'transformed_words': ['due', 'ever', 'many', 'generations'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['due', 'ever', 'generations', 'many']}
 
 Example 2:
 Question: Sort these words in descending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-501, differences, Thus, cupola, longer, remaining, mummy, Paris, DISTRIBUTE
-Answer: remaining, mummy, longer, differences, cupola, Thus, Paris, DISTRIBUTE, 501
-Metadata: {'original_words': ['501', 'differences', 'Thus', 'cupola', 'longer', 'remaining', 'mummy', 'Paris', 'DISTRIBUTE'], 'transformed_words': ['501', 'differences', 'Thus', 'cupola', 'longer', 'remaining', 'mummy', 'Paris', 'DISTRIBUTE'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['remaining', 'mummy', 'longer', 'differences', 'cupola', 'Thus', 'Paris', 'DISTRIBUTE', '501']}
+change, 250, young
+Answer: young, change, 250
+Metadata: {'original_words': ['change', '250', 'young'], 'transformed_words': ['change', '250', 'young'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['young', 'change', '250']}
 
 Example 3:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-discontinue, access, office, luminous, distributing
-Answer: access, discontinue, distributing, luminous, office
-Metadata: {'original_words': ['discontinue', 'access', 'office', 'luminous', 'distributing'], 'transformed_words': ['discontinue', 'access', 'office', 'luminous', 'distributing'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['access', 'discontinue', 'distributing', 'luminous', 'office']}
+industry, elementary, traverse, stepped, meals, rub, resultant, etheric, irritation
+Answer: elementary, etheric, industry, irritation, meals, resultant, rub, stepped, traverse
+Metadata: {'original_words': ['industry', 'elementary', 'traverse', 'stepped', 'meals', 'rub', 'resultant', 'etheric', 'irritation'], 'transformed_words': ['industry', 'elementary', 'traverse', 'stepped', 'meals', 'rub', 'resultant', 'etheric', 'irritation'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['elementary', 'etheric', 'industry', 'irritation', 'meals', 'resultant', 'rub', 'stepped', 'traverse']}
 
 ```
+
+
diff --git a/README.md b/README.md
index 011eb7cb..a40255f7 100644
--- a/README.md
+++ b/README.md
@@ -123,298 +123,6 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 - `MazeDataset`: Generate a maze with a start and a goal
 - `CountdownDataset`: Generate number game tasks where numbers and operators must be combined to reach a target value
 
-## Available Generators
-
-<details>
-<summary>
-<h4><dl><dd>PolynomialEquations</dd></dl></h4>
-<smaller>Generate polynomial equations with configurable complexity:</smaller>
-</summary>
-
-```python
-from reasoning_gym.algebra import PolynomialEquationsConfig, PolynomialEquationsConfig
-
-config = PolynomialEquationsConfig(
-    min_terms=3,
-    max_terms=4,
-    min_degree=4,
-    max_degree=4,
-    min_value=1,
-    max_value=5,
-    size=3,
-    seed=123,
-)
-
-dataset = PolynomialEquationsDataset(config)
-for item in dataset:
-    print(item)
-```
-
-Example output:
-
-```
-{'question': 'Find the real value(s) of b in the equation: b**4 - b**3 - 5*b**2 = 0', 'answer': '[-1.79128784747792, 0.0, 2.79128784747792]', 'metadata': {'polynomial_expr': 'b**4 - b**3 - 5*b**2', 'variable': 'b', 'degree': 4, 'real_solutions': [-1.79128784747792, 0.0, 2.79128784747792]}}
-{'question': 'Solve the polynomial equation for real i:\n3*i**4 + 4*i**3 - 1 = 0', 'answer': '[]', 'metadata': {'polynomial_expr': '3*i**4 + 4*i**3 - 1', 'variable': 'i', 'degree': 4, 'real_solutions': []}}
-{'question': 'Solve the polynomial equation for real h:\n7*h**4 - 2*h**2 + h = 0', 'answer': '[-0.6998793469266564, 0.0]', 'metadata': {'polynomial_expr': '7*h**4 - 2*h**2 + h', 'variable': 'h', 'degree': 4, 'real_solutions': [-0.6998793469266564, 0.0]}}
-```
-
-</details>
-
-<details>
-<summary>
-<h4><dl><dd>Basic Arithmetic</dd></dl></h4>
-<smaller>Generate arithmetic problems with configurable complexity:</smaller>
-</summary>
-
-```python
-from reasoning_gym.arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig
-
-config = BasicArithmeticDatasetConfig(
-    min_terms=2,        # Minimum number of terms in expression
-    max_terms=4,        # Maximum number of terms
-    min_digits=1,       # Minimum digits per number
-    max_digits=2,       # Maximum digits per number
-    allow_parentheses=True,  # Include nested expressions
-    size=5,            # Number of problems to generate
-    seed=42            # For reproducibility
-)
-
-dataset = BasicArithmeticDataset(config)
-for item in dataset:
-    print(item)
-```
-
-Example output:
-
-```
-{'question': '-1 + -5   * 8 + -8 =', 'answer': '-49', 'metadata': {'num_terms': 4, 'num_digits': 1, 'expression': '-1 + -5   * 8 + -8'}}
-{'question': '19 - 17 =', 'answer': '2', 'metadata': {'num_terms': 2, 'num_digits': 2, 'expression': '19 - 17'}}
-{'question': '3 + -6 * -9 =', 'answer': '57', 'metadata': {'num_terms': 3, 'num_digits': 1, 'expression': '3 + -6 * -9'}}
-{'question': '-22 - -94 + -97 =', 'answer': '-25', 'metadata': {'num_terms': 3, 'num_digits': 2, 'expression': '-22 - -94 + -97'}}
-{'question': '51 * 63 =', 'answer': '3213', 'metadata': {'num_terms': 2, 'num_digits': 2, 'expression': '51 * 63'}}
-```
-
-</details>
-
-<details>
-<summary>
-<h4><dl><dd>Chain Sum</dd></dl></h4>
-<smaller>Generate addition/subtraction problems with configurable complexity:</smaller>
-</summary>
-
-```python
-from reasoning_gym.arithmetic import ChainSum, ChainSumConfig
-
-config = ChainSumConfig(
-    min_terms=2,        # Minimum numbers to add/subtract
-    max_terms=6,        # Maximum numbers
-    min_digits=1,       # Minimum digits per number
-    max_digits=4,       # Maximum digits per number
-    allow_negation=True, # Allow negative numbers
-    size=5,             # Number of problems
-    seed=42             # For reproducibility
-)
-
-dataset = ChainSum(config)
-for item in dataset:
-    print(item)
-```
-
-Example data:
-
-```
-{
-    "question": "426 + 562 =",
-    "answer": "988",
-    "metadata": { "num_terms": 2, "num_digits": 3, "expression": "426 + 562" },
-}
-{
-    "question": "426 + 562 =",
-    "answer": "988",
-    "metadata": { "num_terms": 2, "num_digits": 3, "expression": "426 + 562" }
-}
-```
-
-</details>
-
-<details>
-<summary>
-<h4><dl><dd>Sequence Completion</dd></dl></h4>
-<smaller>Generate number sequence completion tasks with dynamic pattern generation:</smaller>
-</summary>
-
-```python
-from reasoning_gym.cognition import NumberSequenceDataset, NumberSequenceConfig
-
-config = NumberSequenceConfig(
-    min_terms=4,        # Minimum visible terms
-    max_terms=8,        # Maximum visible terms
-    min_value=-100,     # Minimum allowed number
-    max_value=100,      # Maximum allowed number
-    max_complexity=3,   # Maximum operations to combine
-    size=5,            # Number of sequences
-    seed=42            # For reproducibility
-)
-
-dataset = NumberSequenceDataset(config)
-for item in dataset:
-    print(item)
-```
-
-Example data:
-
-```
-{
-    "question": "3, 6, 12, 24, 48, 96, 192, 384, ?",
-    "answer": "768",
-    "metadata": {"rule": "double", "complexity": 3, "sequence": [3, 6, 12, 24, 48, 96, 192, 384, 768]},
-}
-{
-    "question": "8, 14, 20, 26, 32, 38, 44, ?",
-    "answer": "50",
-    "metadata": {"rule": "add 6", "complexity": 1, "sequence": [8, 14, 20, 26, 32, 38, 44, 50]},
-}
-```
-
-</details>
-
-<details>
-<summary>
-<h4><dl><dd>Color Cube Rotation</dd></dl></h4>
-<smaller>Generate 3D spatial reasoning tasks with cube rotations and color tracking:</smaller>
-</summary>
-
-```python
-from reasoning_gym.cognition import ColorCubeRotationDataset, ColorCubeRotationConfig
-
-config = ColorCubeRotationConfig(
-    min_rotations=1,     # Minimum number of rotations
-    max_rotations=3,     # Maximum number of rotations
-    size=5,             # Number of problems to generate
-    seed=42             # For reproducibility
-)
-
-dataset = ColorCubeRotationDataset(config)
-for item in dataset:
-    print(item)
-```
-
-Example data:
-
-```
-{
-    "question": "A cube has:\n- a red top side\n- a blue right side\n- a green front side\n- a yellow left side\n- a white back side\n- an orange bottom side\n\nThe cube is rotated so that the side which was before at the front is now at the top.\nThe cube is rotated so that the side which was before at the right is now at the top.\n\nWhat is now the color of the bottom side of the cube?",
-    "answer": "yellow",
-    "metadata": {
-        "initial_state": {"top": "red", "right": "blue", "front": "green", "left": "yellow", "back": "white", "bottom": "orange"},
-        "rotations": ["front", "right"],
-        "target_side": "bottom",
-        "num_rotations": 2
-    }
-}
-```
-
-</details>
-
-<details>
-<summary>
-<h4><dl><dd>Propositional Logic</dd></dl></h4>
-<smaller>Generate logical reasoning tasks with configurable complexity:</smaller>
-</summary>
-
-```python
-from reasoning_gym.logic import PropositionalLogicDataset, PropositionalLogicConfig
-
-config = PropositionalLogicConfig(
-    min_vars=2,         # Minimum number of variables
-    max_vars=4,         # Maximum number of variables
-    min_statements=2,   # Minimum number of given statements
-    max_statements=4,   # Maximum number of statements
-    max_complexity=3,   # Maximum operator depth
-    size=5,            # Number of problems to generate
-    seed=42            # For reproducibility
-)
-
-dataset = PropositionalLogicDataset(config)
-for item in dataset:
-    print(item)
-```
-
-Example data:
-
-```
-{
-    "question": "Given:\n1. R\n2. Q\nWhat can we conclude?",
-    "answer": "(P ∨ Q)",
-    "metadata": {"premises": ["R", "Q"], "variables": ["P", "Q", "R", "S"], "complexity": 3},
-}
-{
-    "question": "Given:\n1. ((Q → P) ∨ (Q → P))\n2. ((Q ↔ Q) → (P → P))\n3. P\nWhat can we conclude?",
-    "answer": "(P → P)",
-    "metadata": {
-        "premises": ["((Q → P) ∨ (Q → P))", "((Q ↔ Q) → (P → P))", "P"],
-        "variables": ["P", "Q"],
-        "complexity": 3,
-    },
-}
-```
-
-</details>
-
-<details>
-<summary>
-<h4><dl><dd>Maze</dd></dl></h4>
-<smaller>Generate a maze with configurable difficulty:</smaller>
-</summary>
-
-```python
-from reasoning_gym.games import MazeConfig, MazeDataset
-
-config = MazeConfig(
-    min_dist=3,
-    max_dist=5,
-    min_grid_size=5,
-    max_grid_size=5,
-    size=2,
-    seed=4,
-)
-
-dataset = MazeDataset(config)
-
-for item in dataset:
-    print()
-    print(item["question"])
-    print(item)
-```
-
-Example data:
-
-```
-Navigate from 'd' (start) to '}' (goal):
-
-uuuuu
-uCCdu
-uCCCu
-uu}Cu
-uuuuu
-Legend: 'u' = Wall, 'C' = Path
-
-{'question': "Navigate from 'd' (start) to '}' (goal):\n\nuuuuu\nuCCdu\nuCCCu\nuu}Cu\nuuuuu\nLegend: 'u' = Wall, 'C' = Path\n", 'answer': '3', 'metadata': {'grid_size': 5, 'grid': ['uuuuu', 'uCCdu', 'uCCCu', 'uu}Cu', 'uuuuu'], 'shortest_path_length': 3, 'start': 'd', 'goal': '}', 'wall': 'u', 'path': 'C'}}
-
-Navigate from 'J' (start) to '_' (goal):
-
-<<<<<
-<<J<<
-<www<
-<<w_<
-<<<<<
-Legend: '<' = Wall, 'w' = Path
-
-{'question': "Navigate from 'J' (start) to '_' (goal):\n\n<<<<<\n<<J<<\n<www<\n<<w_<\n<<<<<\nLegend: '<' = Wall, 'w' = Path\n", 'answer': '3', 'metadata': {'grid_size': 5, 'grid': ['<<<<<', '<<J<<', '<www<', '<<w_<', '<<<<<'], 'shortest_path_length': 3, 'start': 'J', 'goal': '_', 'wall': '<', 'path': 'w'}}
-```
-
-</details>
-
 ## Future Generator Ideas
 
 - More complex math tasks (algebra, geometry)
diff --git a/scripts/generate_gallery.py b/scripts/generate_gallery.py
index 06d841d4..3eef4ff5 100755
--- a/scripts/generate_gallery.py
+++ b/scripts/generate_gallery.py
@@ -1,12 +1,10 @@
 #!/usr/bin/env python3
 """Generate a markdown gallery of all available datasets with examples"""
 
-import os
 import textwrap
 from pathlib import Path
 
-import reasoning_gym.cognition.figlet_fonts
-import reasoning_gym.cognition.rubiks_cube
+import reasoning_gym.code.bf
 from reasoning_gym.factory import DATASETS, create_dataset
 
 
@@ -21,17 +19,16 @@ def generate_gallery() -> str:
     content.append("## Available Datasets\n")
     for name in sorted(DATASETS.keys()):
         # Create anchor link
-        anchor = name.replace("_", "-").lower()
+        anchor = name.replace(" ", "-").lower()
         content.append(f"- [{name}](#{anchor})\n")
     content.append("\n")
 
     # Add examples for each dataset
     content.append("## Dataset Examples\n")
     for name in sorted(DATASETS.keys()):
-        dataset = create_dataset(name)
+        dataset = create_dataset(name, seed=42)
 
         # Add dataset header with anchor
-        anchor = name.replace("_", "-").lower()
         content.append(f"### {name}\n")
 
         # Get dataset class docstring if available
@@ -76,6 +73,7 @@ def main():
 
     with open(gallery_path, "w") as f:
         f.write(gallery_content)
+        f.write("\n")
 
     print(f"Generated gallery at {gallery_path}")
 

From 5ae329becd8d46b407b7a52a8a8233d981340802 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Thu, 30 Jan 2025 23:14:32 +0100
Subject: [PATCH 12/94] lint

---
 GALLERY.md                                | 67 ++++++++++++----
 examples/generate_word_ladder_examples.py | 91 ++++++++++-----------
 reasoning_gym/algorithmic/__init__.py     |  2 +-
 reasoning_gym/algorithmic/word_ladder.py  | 97 ++++++++++++-----------
 reasoning_gym/data/words.csv              |  2 +-
 tests/test_word_ladder.py                 | 13 +--
 6 files changed, 148 insertions(+), 124 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 8fb699c7..9780d605 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -33,6 +33,7 @@ This gallery shows examples from all available datasets using their default conf
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
 - [syllogism](#syllogism)
+- [word_ladder](#word_ladder)
 - [word_sequence_reversal](#word_sequence_reversal)
 - [word_sorting](#word_sorting)
 
@@ -710,19 +711,19 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: How many times does the letter "w" appear in the text: "bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed"?
-Answer: 1
-Metadata: {'span_length': 15, 'target_letter': 'w', 'span': ['bed', 'and', 'enters', 'his', 'mechanical', 'dresser', 'Two', 'minutes', 'later', 'the', 'machine', 'deposited', 'him', 'all', 'dressed']}
+Question: How many times does the letter "o" appear in the text: "bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed"?
+Answer: 2
+Metadata: {'span_length': 15, 'target_letter': 'o', 'span': ['bed', 'and', 'enters', 'his', 'mechanical', 'dresser', 'Two', 'minutes', 'later', 'the', 'machine', 'deposited', 'him', 'all', 'dressed']}
 
 Example 2:
-Question: How many times does the letter "p" appear in the text: "it into a watering place"?
+Question: How many times does the letter "c" appear in the text: "it into a watering place"?
 Answer: 1
-Metadata: {'span_length': 5, 'target_letter': 'p', 'span': ['it', 'into', 'a', 'watering', 'place']}
+Metadata: {'span_length': 5, 'target_letter': 'c', 'span': ['it', 'into', 'a', 'watering', 'place']}
 
 Example 3:
-Question: How many times does the letter "t" appear in the text: "readable form accessible by the widest array of equipment including outdated"?
-Answer: 5
-Metadata: {'span_length': 11, 'target_letter': 't', 'span': ['readable', 'form', 'accessible', 'by', 'the', 'widest', 'array', 'of', 'equipment', 'including', 'outdated']}
+Question: How many times does the letter "o" appear in the text: "readable form accessible by the widest array of equipment including outdated"?
+Answer: 3
+Metadata: {'span_length': 11, 'target_letter': 'o', 'span': ['readable', 'form', 'accessible', 'by', 'the', 'widest', 'array', 'of', 'equipment', 'including', 'outdated']}
 
 ```
 
@@ -1443,6 +1444,38 @@ Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are
 
 ```
 
+### word_ladder
+Generates word ladder transformation tasks
+
+Default configuration:
+```python
+min_word_length = 3
+max_word_length = 5
+min_chain_length = -1
+max_chain_length = -1
+seed = 42
+size = 500
+```
+
+Example tasks:
+```
+Example 1:
+Question: Transform the word 'CEILS' into 'ANIGH' by changing one letter at a time. Each step must create a valid English word (including plurals) and keep the same word length. Show the sequence of words needed.
+Answer: CEILS,TEILS,TEINS,THINS,THIGS,THIGH,AHIGH,ANIGH
+Metadata: {'start_word': 'CEILS', 'end_word': 'ANIGH', 'word_length': 5, 'chain_length': 8}
+
+Example 2:
+Question: Transform the word 'KAW' into 'EFS' by changing one letter at a time. Each step must create a valid English word (including plurals) and keep the same word length. Show the sequence of words needed.
+Answer: KAW,KAS,EAS,EFS
+Metadata: {'start_word': 'KAW', 'end_word': 'EFS', 'word_length': 3, 'chain_length': 4}
+
+Example 3:
+Question: Transform the word 'SAUT' into 'SKER' by changing one letter at a time. Each step must create a valid English word (including plurals) and keep the same word length. Show the sequence of words needed.
+Answer: SAUT,SHUT,SHET,SKET,SKER
+Metadata: {'start_word': 'SAUT', 'end_word': 'SKER', 'word_length': 4, 'chain_length': 5}
+
+```
+
 ### word_sequence_reversal
 Generates word sequence reversal tasks from text spans
 
@@ -1491,21 +1524,21 @@ Example tasks:
 ```
 Example 1:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-due, ever, many, generations
-Answer: due, ever, generations, many
-Metadata: {'original_words': ['due', 'ever', 'many', 'generations'], 'transformed_words': ['due', 'ever', 'many', 'generations'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['due', 'ever', 'generations', 'many']}
+Wolcott, keep, reaching, times
+Answer: Wolcott, keep, reaching, times
+Metadata: {'original_words': ['Wolcott', 'keep', 'reaching', 'times'], 'transformed_words': ['Wolcott', 'keep', 'reaching', 'times'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['Wolcott', 'keep', 'reaching', 'times']}
 
 Example 2:
 Question: Sort these words in descending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-change, 250, young
-Answer: young, change, 250
-Metadata: {'original_words': ['change', '250', 'young'], 'transformed_words': ['change', '250', 'young'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['young', 'change', '250']}
+took, critical, condense
+Answer: took, critical, condense
+Metadata: {'original_words': ['took', 'critical', 'condense'], 'transformed_words': ['took', 'critical', 'condense'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['took', 'critical', 'condense']}
 
 Example 3:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-industry, elementary, traverse, stepped, meals, rub, resultant, etheric, irritation
-Answer: elementary, etheric, industry, irritation, meals, resultant, rub, stepped, traverse
-Metadata: {'original_words': ['industry', 'elementary', 'traverse', 'stepped', 'meals', 'rub', 'resultant', 'etheric', 'irritation'], 'transformed_words': ['industry', 'elementary', 'traverse', 'stepped', 'meals', 'rub', 'resultant', 'etheric', 'irritation'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['elementary', 'etheric', 'industry', 'irritation', 'meals', 'resultant', 'rub', 'stepped', 'traverse']}
+apartment, yellow, Just, pleasure, collapse, different, purchasers, taking, opening
+Answer: Just, apartment, collapse, different, opening, pleasure, purchasers, taking, yellow
+Metadata: {'original_words': ['apartment', 'yellow', 'Just', 'pleasure', 'collapse', 'different', 'purchasers', 'taking', 'opening'], 'transformed_words': ['apartment', 'yellow', 'Just', 'pleasure', 'collapse', 'different', 'purchasers', 'taking', 'opening'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['Just', 'apartment', 'collapse', 'different', 'opening', 'pleasure', 'purchasers', 'taking', 'yellow']}
 
 ```
 
diff --git a/examples/generate_word_ladder_examples.py b/examples/generate_word_ladder_examples.py
index b5d147c8..72901b66 100644
--- a/examples/generate_word_ladder_examples.py
+++ b/examples/generate_word_ladder_examples.py
@@ -1,48 +1,50 @@
 # generates dataset of word ladder examples, and then generates simulated chain of thought reasoning for each example
 
-import reasoning_gym
-from openai import OpenAI
 import os
 
+from openai import OpenAI
+
+import reasoning_gym
+
 # Configuration for the dataset
 config = {
-    'dataset_name': 'word_ladder',
-    'dataset_config': {
-        'min_word_length': 5,
-        'max_word_length': 5,
-        'min_chain_length':3, # set to -1 for shortest possible path, increase to generate more examples
-        'max_chain_length':5,
-        'size': 1,  # Generate a small dataset for demonstration
-    }
+    "dataset_name": "word_ladder",
+    "dataset_config": {
+        "min_word_length": 5,
+        "max_word_length": 5,
+        "min_chain_length": 3,  # set to -1 for shortest possible path, increase to generate more examples
+        "max_chain_length": 5,
+        "size": 1,  # Generate a small dataset for demonstration
+    },
 }
 
-system_prompt = """Word Ladder puzzles involve transforming a start word into an end word. 
-You are allowed to change only one letter a time and you must keep the number of letters constant. 
-Each time you change one letter the word in the chain must be forming one that's valid in English. 
-Plurals are allowed, but not proper nouns. 
+system_prompt = """Word Ladder puzzles involve transforming a start word into an end word.
+You are allowed to change only one letter at a time and you must keep the number of letters constant.
+Each time you change one letter the word in the chain must be forming one that's valid in English.
+Plurals are allowed, but not proper nouns.
 Given a start and an end word, generate a detailed step-by-step chain of thought reasoning of the transformation process.
 You will be given the word ladder question, as well as the correct solution path. So you don't need to solve the problem, you have the solution.
-Your task is to provide a perfectly simulated chain of thought reasoning exactly in the style of the example below, 
-including considering multiple possibilities, validating words, and showing the final path. 
+Your task is to provide a perfectly simulated chain of thought reasoning exactly in the style of the example below,
+including considering multiple possibilities, validating words, and showing the final path.
 Use casual, verbose thinking with markdown formatting.
-Example chain of thought reasoning: 
+Example chain of thought reasoning:
 {sample_CoT}
 """
 
 # sample CoT for demonstration, extracted from real inference data from deepseek r1
-sample_CoT = """ 
-Word Ladder puzzles involve transforming a start word into an end word. 
-You are allowed to change only one letter a time and you must keep the number of letters constant. 
-Each time you change one letter the word in the chain must be forming one that's valid in English. 
-Plurals are allowed, but not proper nouns. 
+sample_CoT = """
+Word Ladder puzzles involve transforming a start word into an end word.
+You are allowed to change only one letter at a time and you must keep the number of letters constant.
+Each time you change one letter the word in the chain must be forming one that's valid in English.
+Plurals are allowed, but not proper nouns.
 Given a start and an end word, generate a detailed step-by-step chain of thought reasoning of the transformation process.
 You will be given the word ladder question, as well as the correct solution path. So you don't need to solve the problem, you have the solution already, which should be your 'cheat sheet' to refer to as you go along.
-Your task is to provide a perfectly simulated chain of thought reasoning exactly in the style of the example below, 
-including considering multiple possibilities, validating words, and showing the final path. 
+Your task is to provide a perfectly simulated chain of thought reasoning exactly in the style of the example below,
+including considering multiple possibilities, validating words, and showing the final path.
 Use casual, verbose thinking with markdown formatting.
 VERY IMPORTANT ***NEVER MENTION YOU HAVE BEEN PROVIDED WITH THE ANSWER, YOU NEED TO WRITE ONLY AS IF YOU'RE SOLVING THE QUESTION STEP BY STEP ***
 
-Example chain of thought reasoning: 
+Example chain of thought reasoning:
 <think>
 Okay, so I need to figure out how to transform the word "CARK" into "ZIPS" by changing one letter at a time, making sure each intermediate step is a valid English word. Let me start by breaking down the problem.
 
@@ -177,44 +179,37 @@ Yes, that's four steps, each changing one letter, all words are valid. So that s
 """
 
 client = OpenAI(
-  base_url="https://openrouter.ai/api/v1", # openrouter used in this example
-  api_key=os.environ["OPENROUTER_API_KEY"], # set your key in environment variable
+    base_url="https://openrouter.ai/api/v1",  # openrouter used in this example
+    api_key=os.environ["OPENROUTER_API_KEY"],  # set your key in environment variable
 )
 
 
 def generate_cot(question: str, answer: str) -> str:
     """Generate chain of thought reasoning for word ladder"""
-    prompt = f"""The question is {question}. The correct solution is {answer}. 
+    prompt = f"""The question is {question}. The correct solution is {answer}.
     Provide the verbose chain of thought reasoning to transform the start word into the end word exactly in the style and length required."""
-    
+
     completion = client.chat.completions.create(
-        model="microsoft/phi-4", # choose model
-        messages=[
-            {
-                "role": "system",
-                "content": system_prompt
-            },
-            {
-                "role": "user",
-                "content": prompt
-            }
-        ],
+        model="microsoft/phi-4",  # choose model
+        messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
         temperature=0.6,
-        max_tokens=10000
+        max_tokens=10000,
     )
     return completion.choices[0].message.content
+
+
 # Create the word ladder dataset
-dataset = reasoning_gym.create_dataset(config['dataset_name'], **config['dataset_config'])
+dataset = reasoning_gym.create_dataset(config["dataset_name"], **config["dataset_config"])
 print(f"Generated {len(dataset)} examples, moving on to generate CoT reasoning...")
 # Generate and print examples with CoT
 for item in dataset:
     # Generate CoT reasoning demo
 
-    item['reasoning'] = generate_cot(item['question'],item['answer'])
-    
+    item["reasoning"] = generate_cot(item["question"], item["answer"])
+
     print("\n--- Example ---")
-    print("Question:", item['question'])
-    print("Answer:", item['answer'])
+    print("Question:", item["question"])
+    print("Answer:", item["answer"])
     print("\nChain of Thought:")
-    print(item['reasoning'])
-    print("\nMetadata:", item['metadata']) 
\ No newline at end of file
+    print(item["reasoning"])
+    print("\nMetadata:", item["metadata"])
diff --git a/reasoning_gym/algorithmic/__init__.py b/reasoning_gym/algorithmic/__init__.py
index 5345eaec..1b509970 100644
--- a/reasoning_gym/algorithmic/__init__.py
+++ b/reasoning_gym/algorithmic/__init__.py
@@ -14,9 +14,9 @@ from .number_filtering import NumberFilteringConfig, NumberFilteringDataset
 from .number_sorting import NumberSortingConfig, NumberSortingDataset
 from .sentence_reordering import SentenceReorderingConfig, SentenceReorderingDataset
 from .spell_backward import SpellBackwardConfig, SpellBackwardDataset
+from .word_ladder import WordLadderConfig, WordLadderDataset
 from .word_sequence_reversal import WordSequenceReversalConfig, WordSequenceReversalDataset
 from .word_sorting import TextTransformation, WordSortingConfig, WordSortingDataset
-from .word_ladder import WordLadderConfig, WordLadderDataset
 
 __all__ = [
     "SpellBackwardConfig",
diff --git a/reasoning_gym/algorithmic/word_ladder.py b/reasoning_gym/algorithmic/word_ladder.py
index 8f7390d2..acdfb02e 100644
--- a/reasoning_gym/algorithmic/word_ladder.py
+++ b/reasoning_gym/algorithmic/word_ladder.py
@@ -1,46 +1,51 @@
 """Word ladder task generator"""
 
+from collections import deque
 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional, Set, Dict, Tuple
-from collections import deque
+from typing import Dict, List, Optional, Set, Tuple
+
 from reasoning_gym.data import read_data_file
 
 from ..factory import ProceduralDataset, register_dataset
 
+
 @dataclass
 class WordLadderConfig:
     """Configuration for word ladder task generation"""
-    
-    min_word_length: int = 3       # Minimum word length
-    max_word_length: int = 5       # Maximum word length
-    min_chain_length: int = -1     # Set to -1 for shortest path or a minimum of 3
-    max_chain_length: int = -1     # Set to -1 for shortest path or a max 
+
+    min_word_length: int = 3  # Minimum word length
+    max_word_length: int = 5  # Maximum word length
+    min_chain_length: int = -1  # Set to -1 for shortest path or a minimum of 3
+    max_chain_length: int = -1  # Set to -1 for shortest path, or a maximum chain length >= 3
     seed: Optional[int] = None
-    size: int = 500                # Virtual dataset size
+    size: int = 500  # Virtual dataset size
 
     def validate(self) -> None:
         """Validate configuration parameters"""
         assert self.min_word_length > 2, "min_word_length must be 3"
         assert self.max_word_length >= self.min_word_length, "max_word_length must be >= min_word_length"
         assert self.max_word_length <= 5, "max_word_length must be 5"
-        
+
         # Modified validation logic
         if self.min_chain_length == -1:
             if self.max_chain_length != -1:
-                assert self.max_chain_length >= 3, "When min_chain_length=-1 (shortest path), max_chain_length must be -1 or >=3"
+                assert (
+                    self.max_chain_length >= 3
+                ), "When min_chain_length=-1 (shortest path), max_chain_length must be -1 or >=3"
         elif self.max_chain_length == -1:
             raise AssertionError("max_chain_length cannot be -1 unless min_chain_length is also -1")
         else:
             assert self.min_chain_length >= 3, "min_chain_length must be 3 or -1"
             assert self.max_chain_length >= self.min_chain_length, "max_chain_length must be >= min_chain_length"
 
+
 class WordLadderDataset(ProceduralDataset):
     """Generates word ladder transformation tasks"""
 
     def __init__(self, config: WordLadderConfig):
         super().__init__(config=config, seed=config.seed, size=config.size)
-        
+
         # Load words from CSV file
         self.word_sets = self._load_words_from_csv()
 
@@ -48,36 +53,37 @@ class WordLadderDataset(ProceduralDataset):
         """Load words from CSV file organized by length"""
         import csv
         from io import StringIO
+
         word_sets = {}
-        
+
         try:
             # Get CSV content as string
             csv_content = read_data_file("words.csv")
-            
+
             # Use StringIO to create a file-like object from the string
             csv_file = StringIO(csv_content)
             reader = csv.DictReader(csv_file)
-            
+
             for row in reader:
                 # Process each word length column
                 for length in range(3, 6):
-                    col_name = f'{length}_letter'
-                    word = row.get(col_name, '')
-                    
+                    col_name = f"{length}_letter"
+                    word = row.get(col_name, "")
+
                     if not word:  # Skip empty entries
                         continue
-                        
+
                     if self.config.min_word_length <= length <= self.config.max_word_length:
                         word_sets.setdefault(length, set()).add(word.upper())
-                        
+
         except Exception as e:
             raise RuntimeError(f"Error processing words.csv content: {e}") from e
-        
+
         # Validate we have words for each length
         for length in range(self.config.min_word_length, self.config.max_word_length + 1):
             if length not in word_sets or not word_sets[length]:
                 raise ValueError(f"No valid words found for length {length}")
-                
+
         return word_sets
 
     def _differs_by_one(self, word1: str, word2: str) -> bool:
@@ -96,16 +102,16 @@ class WordLadderDataset(ProceduralDataset):
         """Find path between start and end words that meets length requirements"""
         if start == end:
             return [start]
-        
+
         # First find shortest path length
         shortest_path = self._bfs_shortest_path(start, end, word_set)
         if not shortest_path:
             return None
-            
+
         min_length = self.config.min_chain_length
         if len(shortest_path) > min_length:
             return shortest_path  # Shortest path is already longer than required
-            
+
         # Now look for longer paths using DFS with depth constraint
         return self._dfs_with_depth(start, end, word_set, min_length)
 
@@ -113,12 +119,12 @@ class WordLadderDataset(ProceduralDataset):
         """BFS implementation to find shortest path"""
         queue = deque([(start, [start])])
         visited = {start}
-        
+
         while queue:
             current, path = queue.popleft()
             if current == end:
                 return path
-                
+
             for neighbor in self._get_neighbors(current, word_set):
                 if neighbor not in visited:
                     visited.add(neighbor)
@@ -128,62 +134,62 @@ class WordLadderDataset(ProceduralDataset):
     def _dfs_with_depth(self, start: str, end: str, word_set: Set[str], target_length: int) -> Optional[List[str]]:
         """DFS implementation looking for paths of exact length"""
         stack = [(start, [start], set([start]))]
-        
+
         while stack:
             current, path, visited = stack.pop()
-            
+
             if len(path) == target_length:
                 if current == end:
                     return path
                 continue
-                
+
             if len(path) > target_length:
                 continue
-                
+
             # Explore neighbors in random order to find different paths
             neighbors = list(self._get_neighbors(current, word_set))
             Random().shuffle(neighbors)
-            
+
             for neighbor in neighbors:
                 if neighbor not in visited:
                     new_visited = set(visited)
                     new_visited.add(neighbor)
                     stack.append((neighbor, path + [neighbor], new_visited))
-                    
+
         return None
 
     def _get_neighbors(self, word: str, word_set: Set[str]) -> Set[str]:
         """Get all valid neighbors that differ by one letter"""
         neighbors = set()
         word_chars = list(word)
-        
+
         for i in range(len(word_chars)):
             original = word_chars[i]
-            for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
+            for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                 if c == original:
                     continue
                 word_chars[i] = c
-                new_word = ''.join(word_chars)
+                new_word = "".join(word_chars)
                 if new_word in word_set:
                     neighbors.add(new_word)
             word_chars[i] = original
-            
+
         return neighbors
 
     def _generate_word_pair(self, rng: Random, length: int) -> Tuple[str, str, List[str]]:
         """Generate valid start/end words with solution path"""
         word_set = self.word_sets[length]
         max_attempts = 500
-        
+
         for _ in range(max_attempts):
             start, end = rng.sample(sorted(word_set), 2)
             path = self._find_path(start, end, word_set)
             if path and (
-                (self.config.min_chain_length == -1 and self.config.max_chain_length == -1) or
-                (self.config.min_chain_length <= len(path) <= self.config.max_chain_length)
+                (self.config.min_chain_length == -1 and self.config.max_chain_length == -1)
+                or (self.config.min_chain_length <= len(path) <= self.config.max_chain_length)
             ):
                 return start, end, path
-        
+
         raise RuntimeError(f"Failed to find valid pair for length {length} after {max_attempts} attempts")
 
     def __getitem__(self, idx: int) -> dict:
@@ -191,17 +197,12 @@ class WordLadderDataset(ProceduralDataset):
         rng = Random(self.seed + idx)
         length = rng.randint(self.config.min_word_length, self.config.max_word_length)
         start, end, path = self._generate_word_pair(rng, length)
-        
+
         return {
             "question": f"Transform the word '{start}' into '{end}' by changing one letter at a time. Each step must create a valid English word (including plurals) and keep the same word length. Show the sequence of words needed.",
             "answer": ",".join(path),
-            "metadata": {
-                "start_word": start,
-                "end_word": end,
-                "word_length": length,
-                "chain_length": len(path)
-            }
+            "metadata": {"start_word": start, "end_word": end, "word_length": length, "chain_length": len(path)},
         }
 
 
-register_dataset("word_ladder", WordLadderDataset, WordLadderConfig)
\ No newline at end of file
+register_dataset("word_ladder", WordLadderDataset, WordLadderConfig)
diff --git a/reasoning_gym/data/words.csv b/reasoning_gym/data/words.csv
index 36723b21..7d2962ca 100644
--- a/reasoning_gym/data/words.csv
+++ b/reasoning_gym/data/words.csv
@@ -12960,4 +12960,4 @@ ZZZ,ELLS,BOSKY
 ,,ZYGAL
 ,,ZYGON
 ,,ZYMES
-,,ZYMIC
\ No newline at end of file
+,,ZYMIC
diff --git a/tests/test_word_ladder.py b/tests/test_word_ladder.py
index 5a5ce299..4161ffd4 100644
--- a/tests/test_word_ladder.py
+++ b/tests/test_word_ladder.py
@@ -44,12 +44,7 @@ def test_word_ladder_dataset_deterministic():
 def test_word_ladder_dataset_items():
     """Test basic properties of generated items"""
     config = WordLadderConfig(
-        min_word_length=3,
-        max_word_length=5,
-        min_chain_length=3,
-        max_chain_length=5,
-        size=10,
-        seed=42
+        min_word_length=3, max_word_length=5, min_chain_length=3, max_chain_length=5, size=10, seed=42
     )
     dataset = WordLadderDataset(config)
 
@@ -76,7 +71,7 @@ def test_word_ladder_dataset_items():
 
         # Verify solution chain from answer
         solution_chain = item["answer"].split(",")
-        
+
         # Handle chain length validation based on whether it's shortest path (-1) or specified length
         if metadata["chain_length"] == -1:
             # For shortest path, just ensure it's a valid path (we can't predict exact length)
@@ -85,7 +80,7 @@ def test_word_ladder_dataset_items():
             # For specified length, ensure it matches config constraints
             assert config.min_chain_length <= len(solution_chain) <= config.max_chain_length
             assert len(solution_chain) == metadata["chain_length"]
-        
+
         assert solution_chain[0] == metadata["start_word"]
         assert solution_chain[-1] == metadata["end_word"]
         assert all(len(word) == word_length for word in solution_chain)
@@ -144,4 +139,4 @@ def test_word_ladder_find_path():
 
 
 if __name__ == "__main__":
-    pytest.main([__file__]) 
\ No newline at end of file
+    pytest.main([__file__])

From c6634fd53809b6cf57dd8ab34905c4e62c5a02cc Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Thu, 30 Jan 2025 23:19:58 +0100
Subject: [PATCH 13/94] bump version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 42e66005..aaa6d3ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "reasoning_gym"
-version = "0.1.1"
+version = "0.1.2"
 authors = [
   { name = "Open-Thought community", email = "andreas.koepf@xamla.com" },
 ]

From a577f7cdf68cdf2f6151279a8c6344c871463d34 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Thu, 30 Jan 2025 23:33:43 +0100
Subject: [PATCH 14/94] use sorted() for repeatable generation outputs (e.g.
 GALLERY.md)

---
 GALLERY.md                                   | 46 +++++++++++---------
 reasoning_gym/algorithmic/letter_counting.py |  2 +-
 reasoning_gym/algorithmic/word_sorting.py    |  2 +-
 reasoning_gym/games/maze.py                  |  4 +-
 4 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 9780d605..f6f266b9 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -711,19 +711,19 @@ size = 500
 Example tasks:
 ```
 Example 1:
-Question: How many times does the letter "o" appear in the text: "bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed"?
-Answer: 2
-Metadata: {'span_length': 15, 'target_letter': 'o', 'span': ['bed', 'and', 'enters', 'his', 'mechanical', 'dresser', 'Two', 'minutes', 'later', 'the', 'machine', 'deposited', 'him', 'all', 'dressed']}
+Question: How many times does the letter "a" appear in the text: "bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed"?
+Answer: 6
+Metadata: {'span_length': 15, 'target_letter': 'a', 'span': ['bed', 'and', 'enters', 'his', 'mechanical', 'dresser', 'Two', 'minutes', 'later', 'the', 'machine', 'deposited', 'him', 'all', 'dressed']}
 
 Example 2:
-Question: How many times does the letter "c" appear in the text: "it into a watering place"?
+Question: How many times does the letter "w" appear in the text: "it into a watering place"?
 Answer: 1
-Metadata: {'span_length': 5, 'target_letter': 'c', 'span': ['it', 'into', 'a', 'watering', 'place']}
+Metadata: {'span_length': 5, 'target_letter': 'w', 'span': ['it', 'into', 'a', 'watering', 'place']}
 
 Example 3:
-Question: How many times does the letter "o" appear in the text: "readable form accessible by the widest array of equipment including outdated"?
-Answer: 3
-Metadata: {'span_length': 11, 'target_letter': 'o', 'span': ['readable', 'form', 'accessible', 'by', 'the', 'widest', 'array', 'of', 'equipment', 'including', 'outdated']}
+Question: How many times does the letter "t" appear in the text: "readable form accessible by the widest array of equipment including outdated"?
+Answer: 5
+Metadata: {'span_length': 11, 'target_letter': 't', 'span': ['readable', 'form', 'accessible', 'by', 'the', 'widest', 'array', 'of', 'equipment', 'including', 'outdated']}
 
 ```
 
@@ -781,7 +781,8 @@ Example tasks:
 Example 1:
 Question: Navigate from '3' (start) to 'z' (goal):
 
-```>>>>>>>>>
+```
+>>>>>>>>>
 >eeee>e>>
 >ee>>>>>>
 >eeeeee>>
@@ -790,6 +791,7 @@ Question: Navigate from '3' (start) to 'z' (goal):
 >eee>e>e>
 >eeeee>e>
 >>>>>>>>>```
+
 Legend: '>' = Wall, 'e' = Passage
 
 What is the minimum number of steps to reach the goal?
@@ -799,13 +801,15 @@ Metadata: {'grid_size': 9, 'grid': ['>>>>>>>>>', '>eeee>e>>', '>ee>>>>>>', '>eee
 Example 2:
 Question: Navigate from '`' (start) to 'i' (goal):
 
-```4444444
+```
+4444444
 4AAAAi4
 4A4A4A4
 4A4AA44
 44AAAA4
 44A`444
 4444444```
+
 Legend: '4' = Wall, 'A' = Passage
 
 What is the minimum number of steps to reach the goal?
@@ -815,13 +819,15 @@ Metadata: {'grid_size': 7, 'grid': ['4444444', '4AAAAi4', '4A4A4A4', '4A4AA44',
 Example 3:
 Question: Navigate from '(' (start) to '`' (goal):
 
-```QQQQQQQ
+```
+QQQQQQQ
 QQ%%%%Q
 QQ`%Q%Q
 Q%%Q%%Q
 Q%%%Q%Q
 Q%QQ%(Q
 QQQQQQQ```
+
 Legend: 'Q' = Wall, '%' = Passage
 
 What is the minimum number of steps to reach the goal?
@@ -1524,21 +1530,21 @@ Example tasks:
 ```
 Example 1:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-Wolcott, keep, reaching, times
-Answer: Wolcott, keep, reaching, times
-Metadata: {'original_words': ['Wolcott', 'keep', 'reaching', 'times'], 'transformed_words': ['Wolcott', 'keep', 'reaching', 'times'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['Wolcott', 'keep', 'reaching', 'times']}
+DIRECT, given, exclaims, dreaming
+Answer: DIRECT, dreaming, exclaims, given
+Metadata: {'original_words': ['DIRECT', 'given', 'exclaims', 'dreaming'], 'transformed_words': ['DIRECT', 'given', 'exclaims', 'dreaming'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['DIRECT', 'dreaming', 'exclaims', 'given']}
 
 Example 2:
 Question: Sort these words in descending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-took, critical, condense
-Answer: took, critical, condense
-Metadata: {'original_words': ['took', 'critical', 'condense'], 'transformed_words': ['took', 'critical', 'condense'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['took', 'critical', 'condense']}
+heat, begun, sometimes
+Answer: sometimes, heat, begun
+Metadata: {'original_words': ['heat', 'begun', 'sometimes'], 'transformed_words': ['heat', 'begun', 'sometimes'], 'direction': 'descending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['sometimes', 'heat', 'begun']}
 
 Example 3:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
-apartment, yellow, Just, pleasure, collapse, different, purchasers, taking, opening
-Answer: Just, apartment, collapse, different, opening, pleasure, purchasers, taking, yellow
-Metadata: {'original_words': ['apartment', 'yellow', 'Just', 'pleasure', 'collapse', 'different', 'purchasers', 'taking', 'opening'], 'transformed_words': ['apartment', 'yellow', 'Just', 'pleasure', 'collapse', 'different', 'purchasers', 'taking', 'opening'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['Just', 'apartment', 'collapse', 'different', 'opening', 'pleasure', 'purchasers', 'taking', 'yellow']}
+violates, yes, already, completing, pages, duty, his, EXPRESS, duly
+Answer: EXPRESS, already, completing, duly, duty, his, pages, violates, yes
+Metadata: {'original_words': ['violates', 'yes', 'already', 'completing', 'pages', 'duty', 'his', 'EXPRESS', 'duly'], 'transformed_words': ['violates', 'yes', 'already', 'completing', 'pages', 'duty', 'his', 'EXPRESS', 'duly'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['EXPRESS', 'already', 'completing', 'duly', 'duty', 'his', 'pages', 'violates', 'yes']}
 
 ```
 
diff --git a/reasoning_gym/algorithmic/letter_counting.py b/reasoning_gym/algorithmic/letter_counting.py
index 1ef33148..8f2590dd 100644
--- a/reasoning_gym/algorithmic/letter_counting.py
+++ b/reasoning_gym/algorithmic/letter_counting.py
@@ -51,7 +51,7 @@ class LetterCountingDataset(ProceduralDataset):
             letters = {"a"}  # Fallback if span has no letters
 
         # Select random letter that appears in the span
-        target_letter = rng.choice(list(letters))
+        target_letter = rng.choice(sorted(letters))
 
         # Count occurrences
         count = sum(word.lower().count(target_letter) for word in span)
diff --git a/reasoning_gym/algorithmic/word_sorting.py b/reasoning_gym/algorithmic/word_sorting.py
index b573fee1..8ac683b6 100644
--- a/reasoning_gym/algorithmic/word_sorting.py
+++ b/reasoning_gym/algorithmic/word_sorting.py
@@ -49,7 +49,7 @@ class WordSortingDataset(ProceduralDataset):
         # Load and preprocess text
         text = read_data_file("in_the_year_2889.txt")
         # Extract unique words within length constraints
-        self.words = list(
+        self.words = sorted(
             set(
                 word
                 for word in re.findall(r"\b\w+\b", text)
diff --git a/reasoning_gym/games/maze.py b/reasoning_gym/games/maze.py
index 2c8cd9bd..b2c6a777 100644
--- a/reasoning_gym/games/maze.py
+++ b/reasoning_gym/games/maze.py
@@ -90,9 +90,9 @@ class MazeDataset(ProceduralDataset):
                 # Maze is good, build the question
                 question_str = (
                     f"Navigate from '{self.start_char}' (start) to '{self.goal_char}' (goal):\n\n"
-                    + "```"
+                    + "```\n"
                     + self._maze_to_str(maze_grid)
-                    + "```"
+                    + "```\n"
                     + "\nLegend: "
                     + f"'{self.wall_char}' = Wall, '{self.path_char}' = Passage\n\n"
                     + "What is the minimum number of steps to reach the goal?"

From cb8d37291231a0b5b9371d3dc57190d60839bc15 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Thu, 30 Jan 2025 23:58:43 +0100
Subject: [PATCH 15/94] ci: Add GitHub workflow to enforce pre-commit checks on
 main branch

---
 .github/workflows/pre-commit.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 .github/workflows/pre-commit.yml

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 00000000..284bf8ed
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,15 @@
+name: Pre-commit
+
+on:
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+    - uses: pre-commit/action@v3.0.0

From 3e01b7dccf37281838d9542b8af10850398efd54 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 00:02:10 +0100
Subject: [PATCH 16/94] ci: Add PR comment and failure handling for pre-commit
 checks

---
 .github/workflows/pre-commit.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 284bf8ed..b31c4dc8 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -13,3 +13,19 @@ jobs:
       with:
         python-version: '3.11'
     - uses: pre-commit/action@v3.0.0
+      id: precommit
+      continue-on-error: true
+    - name: Comment on PR if pre-commit failed
+      if: steps.precommit.outcome == 'failure'
+      uses: actions/github-script@v7
+      with:
+        script: |
+          github.rest.issues.createComment({
+            issue_number: context.issue.number,
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            body: '❌ Pre-commit checks failed. Please run `pre-commit run --all-files` locally and fix the issues.'
+          })
+    - name: Exit with pre-commit status
+      if: steps.precommit.outcome == 'failure'
+      run: exit 1

From 0f362920d5f611196eb58a6cd908f7ba820e6f22 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 00:10:12 +0100
Subject: [PATCH 17/94] ci: Add GitHub Actions workflow for running tests

---
 .github/workflows/tests.yml | 30 ++++++++++++++++++++++++++++++
 pyproject.toml              | 12 ++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 .github/workflows/tests.yml

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..9e97239d
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,30 @@
+name: Tests
+
+on:
+  pull_request:
+    branches: [ main ]
+  push:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11", "3.12"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ".[test]"
+    
+    - name: Run tests
+      run: |
+        pytest
diff --git a/pyproject.toml b/pyproject.toml
index aaa6d3ae..80ad4865 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,18 @@ dependencies = [
   "magiccube==0.3.0",
   "pyfiglet==1.0.2"
 ]
+
+[project.optional-dependencies]
+test = [
+    "pytest>=7.0.0",
+    "pytest-cov>=4.0.0",
+]
+
+[tool.pytest.ini_options]
+addopts = "-ra -q --cov=reasoning_gym"
+testpaths = [
+    "tests",
+]
 classifiers = [
   "Programming Language :: Python :: 3",
   "License :: OSI Approved :: Apache Software License",

From b0e70008ea51d710da00faf03ed6d7f62fe51215 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 00:27:55 +0100
Subject: [PATCH 18/94] fix: Escape markdown code blocks with quadruple
 backticks in gallery generation

---
 GALLERY.md                  | 284 ++++++++++++++++++------------------
 reasoning_gym/games/maze.py |   2 +-
 scripts/generate_gallery.py |   4 +-
 3 files changed, 145 insertions(+), 145 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index f6f266b9..f52bf50b 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -42,17 +42,17 @@ This gallery shows examples from all available datasets using their default conf
 Generates base conversion tasks
 
 Default configuration:
-```python
+````python
 min_base = 2
 max_base = 16
 min_value = 0
 max_value = 1000
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Convert the base-3 number 28e to binary
 Answer: 1010001110
@@ -68,13 +68,13 @@ Question: Convert the base-10 number 1a2 to base-13 (use lowercase letters a-z f
 Answer: 1a2
 Metadata: {'decimal_value': 418, 'source_base': 10, 'target_base': 13, 'source_repr': '1a2', 'target_repr': '1a2'}
 
-```
+````
 
 ### basic_arithmetic
 Dataset that generates basic arithmetic tasks with configurable complexity
 
 Default configuration:
-```python
+````python
 min_terms = 2
 max_terms = 6
 min_digits = 1
@@ -86,10 +86,10 @@ seed = 42
 size = 500
 format_style = simple
 whitespace = single
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: -5 * -6 =
 Answer: 30
@@ -105,20 +105,20 @@ Question: 0 + -2 + -4 * 0 * 3 =
 Answer: -2
 Metadata: {'num_terms': 5, 'num_digits': 1, 'expression': '0 + -2 + -4 * 0 * 3'}
 
-```
+````
 
 ### bf
 Generates BF tasks
 
 Default configuration:
-```python
+````python
 seed = 42
 size = 500
 difficulty = 1
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: This is a BF (Brainf*ck) computer program. What is the output? 
 
@@ -140,13 +140,13 @@ Question: This is a BF (Brainf*ck) computer program. What is the output?
 Answer: under
 Metadata: {'bfit_code': '\nint main() {\n    print("under");\n}\n', 'bf_program': '>[-]>[-]<>+++++++++[<+++++++++++++>-]<.-------.----------.+.+++++++++++++.<'}
 
-```
+````
 
 ### caesar_cipher
 Generates Caesar cipher encryption/decryption tasks
 
 Default configuration:
-```python
+````python
 delimiter = .
 min_words = 3
 max_words = 20
@@ -154,10 +154,10 @@ min_rotation = 1
 max_rotation = 25
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Decrypt this Caesar cipher text: JNJUBUF ZPVS BTTPDJBUF XIPN J XBT DPNQMJNFOUJOH B NPNFOU BHP
 Answer: IMITATE YOUR ASSOCIATE WHOM I WAS COMPLIMENTING A MOMENT AGO
@@ -173,13 +173,13 @@ Question: Decrypt this Caesar cipher text: ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV
 Answer: IF YOUR STOMACH IS OUT OF IT MUST BE MENDED
 Metadata: {'rotation': 17, 'cipher_text': 'ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV DVEUVU', 'clear_text': 'IF YOUR STOMACH IS OUT OF IT MUST BE MENDED'}
 
-```
+````
 
 ### chain_sum
 Generates simple arithmetic tasks using only + and - operators
 
 Default configuration:
-```python
+````python
 min_terms = 2
 max_terms = 6
 min_digits = 1
@@ -187,10 +187,10 @@ max_digits = 4
 allow_negation = False
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: 4 + 3 =
 Answer: 7
@@ -206,21 +206,21 @@ Question: 2 + 6 + 3 + 4 + 0 =
 Answer: 15
 Metadata: {'num_terms': 5, 'num_digits': 1, 'expression': '2 + 6 + 3 + 4 + 0'}
 
-```
+````
 
 ### color_cube_rotation
 Generates color cube rotation reasoning tasks
 
 Default configuration:
-```python
+````python
 min_rotations = 1
 max_rotations = 3
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: A cube has:
 - a pink top side
@@ -274,13 +274,13 @@ What is now the color of the left side of the cube?
 Answer: gold
 Metadata: {'initial_state': {'top': 'orange', 'right': 'cyan', 'front': 'violet', 'left': 'pink', 'back': 'gray', 'bottom': 'gold'}, 'rotations': ['left', 'back', 'bottom'], 'target_side': 'left', 'num_rotations': 3}
 
-```
+````
 
 ### countdown
 Generates Countdown Number Game tasks
 
 Default configuration:
-```python
+````python
 min_numbers = 4
 max_numbers = 6
 min_value = 1
@@ -291,10 +291,10 @@ operators = ('+', '-', '*', '/')
 shuffle = True
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Calculate 139 using the numbers 36, 29, 95, 32, 4, 15.
 Each number may be used at most once.
@@ -313,23 +313,23 @@ You can only use each number once.
 Answer: 41*14 - 81 - 38 - 5
 Metadata: {'numbers': [5, 41, 38, 81, 14], 'target': 450, 'expression': '41*14 - 81 - 38 - 5'}
 
-```
+````
 
 ### family_relationships
 Generates family relationship reasoning tasks
 
 Default configuration:
-```python
+````python
 min_family_size = 4
 max_family_size = 8
 male_names = ['James', 'John', 'Robert', 'Michael', 'William', 'David', 'Richard', 'Joseph', 'Thomas', 'Charles', 'Peter', 'Daniel', 'Matthew', 'Christopher', 'Andrew', 'George', 'Edward', 'Benjamin', 'Henry', 'Samuel', 'Alexander', 'Oliver', 'Jack', 'Harry', 'Jacob', 'Noah', 'Ethan', 'Lucas', 'Mason', 'Logan', 'Sebastian', 'Theodore', 'Owen', 'Liam', 'Aiden', 'Kai', 'Jayden', 'Zion', 'Phoenix', 'Atlas', 'Axel', 'Ryder', 'Finn']
 female_names = ['Mary', 'Patricia', 'Jennifer', 'Linda', 'Elizabeth', 'Barbara', 'Susan', 'Jessica', 'Sarah', 'Karen', 'Emma', 'Lisa', 'Anna', 'Margaret', 'Victoria', 'Charlotte', 'Sophia', 'Isabella', 'Olivia', 'Ava', 'Mia', 'Emily', 'Abigail', 'Amelia', 'Eleanor', 'Grace', 'Alice', 'Lucy', 'Chloe', 'Sophie', 'Lily', 'Hannah', 'Zoe', 'Luna', 'Nova', 'Aria', 'Willow', 'Aurora', 'Sage', 'River', 'Winter', 'Sky', 'Rain']
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: John is married to Isabella. They have a child called Edward. Edward is married to Victoria.
 
@@ -351,22 +351,22 @@ What is Liam to Noah?
 Answer: father
 Metadata: {'person1': 'Liam', 'person2': 'Noah', 'relationship': 'father', 'family_size': 7}
 
-```
+````
 
 ### figlet_font
 Generates FigletFont tasks
 
 Default configuration:
-```python
+````python
 static_word = None
 static_font = None
 space_letters = True
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Please read the following figlet font:
 
@@ -415,13 +415,13 @@ Question: What word does this say?
 Answer: UNDER
 Metadata: {'font': 'xcourb', 'space_letters': True}
 
-```
+````
 
 ### fraction_simplification
 Generates fraction simplification tasks
 
 Default configuration:
-```python
+````python
 min_value = 1
 max_value = 1000
 min_factor = 1
@@ -429,10 +429,10 @@ max_factor = 100
 styles = ('plain', 'latex_inline', 'latex_frac', 'latex_dfrac')
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Simplify the fraction $\frac{92}{524}$ to its lowest terms
 Answer: $\frac{23}{131}$
@@ -448,23 +448,23 @@ Question: Simplify the fraction 29330/37310 to its lowest terms
 Answer: 419/533
 Metadata: {'numerator': 29330, 'denominator': 37310, 'simplified_numerator': 419, 'simplified_denominator': 533, 'reduction_factor': 70, 'style': 'plain'}
 
-```
+````
 
 ### game_of_life
 Generates Game of Life games with configurable parameters
 
 Default configuration:
-```python
+````python
 grid_size_x = 20
 grid_size_y = 20
 filled_cells = 100
 simulation_steps = 1
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: What will this Game of Life board look like after 1 steps of simulation?
 
@@ -600,23 +600,23 @@ Answer: [[1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1]
  [0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0]]
 Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulation_steps': 1}
 
-```
+````
 
 ### gcd
 Generates Greatest Common Divisor (GCD) tasks
 
 Default configuration:
-```python
+````python
 min_numbers = 2
 max_numbers = 2
 min_value = 1
 max_value = 1000
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Find the Greatest Common Divisor (GCD) of these numbers: 26, 760
 Answer: 2
@@ -632,23 +632,23 @@ Question: Find the Greatest Common Divisor (GCD) of these numbers: 297, 30
 Answer: 3
 Metadata: {'numbers': [297, 30], 'result': 3}
 
-```
+````
 
 ### lcm
 Generates Least Common Multiple (LCM) tasks
 
 Default configuration:
-```python
+````python
 min_numbers = 2
 max_numbers = 2
 min_value = 1
 max_value = 100
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Find the Least Common Multiple (LCM) of these numbers: 95, 14
 Answer: 1330
@@ -664,22 +664,22 @@ Question: Find the Least Common Multiple (LCM) of these numbers: 38, 4
 Answer: 76
 Metadata: {'numbers': [38, 4], 'result': 76}
 
-```
+````
 
 ### leg_counting
 Generates leg counting arithmetic tasks
 
 Default configuration:
-```python
+````python
 min_animals = 2
 max_animals = 5
 max_instances = 3
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: How many legs are there in total if you have 1 sea slug, 1 deer?
 Answer: 4
@@ -695,21 +695,21 @@ Question: How many legs are there in total if you have 1 crab, 2 lobsters, 1 hum
 Answer: 42
 Metadata: {'animals': {'crab': 1, 'lobster': 2, 'human': 1, 'cow': 1, 'bee': 1}, 'total_legs': 42}
 
-```
+````
 
 ### letter_counting
 Generates letter counting tasks from text spans
 
 Default configuration:
-```python
+````python
 min_words = 5
 max_words = 15
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: How many times does the letter "a" appear in the text: "bed and enters his mechanical dresser Two minutes later the machine deposited him all dressed"?
 Answer: 6
@@ -725,13 +725,13 @@ Question: How many times does the letter "t" appear in the text: "readable form
 Answer: 5
 Metadata: {'span_length': 11, 'target_letter': 't', 'span': ['readable', 'form', 'accessible', 'by', 'the', 'widest', 'array', 'of', 'equipment', 'including', 'outdated']}
 
-```
+````
 
 ### letter_jumble
 Generates word letter jumbling tasks
 
 Default configuration:
-```python
+````python
 min_word_len = 1
 max_word_len = 64
 min_words = 3
@@ -741,10 +741,10 @@ max_corruption_level = 0.9
 consecutive_words = True
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Unscramble these words: ew hsall eb ebla ot puodrce
 Answer: we shall be able to produce
@@ -760,24 +760,24 @@ Question: Unscramble these words: dear rchAdbali keep no nSice yrstyedae atnhks
 Answer: dear Archibald keep on Since yesterday thanks to you there is a gain of subscribers Mr
 Metadata: {'num_words': 16, 'corruption_level': 0.516016391169858, 'scrambled_words': ['dear', 'rchAdbali', 'keep', 'no', 'nSice', 'yrstyedae', 'atnhks', 'ot', 'oyu', 'rheet', 'si', 'a', 'gain', 'fo', 'sucrbbisesr', 'rM'], 'original_words': ['dear', 'Archibald', 'keep', 'on', 'Since', 'yesterday', 'thanks', 'to', 'you', 'there', 'is', 'a', 'gain', 'of', 'subscribers', 'Mr']}
 
-```
+````
 
 ### maze
 Generates mazes with guaranteed shortest path distance from start to goal
     within [min_dist, max_dist].
 
 Default configuration:
-```python
+````python
 min_dist = 5
 max_dist = 10
 min_grid_size = 5
 max_grid_size = 10
 seed = 42
 size = 50
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Navigate from '3' (start) to 'z' (goal):
 
@@ -790,8 +790,8 @@ Question: Navigate from '3' (start) to 'z' (goal):
 >>ez>3e>>
 >eee>e>e>
 >eeeee>e>
->>>>>>>>>```
-
+>>>>>>>>>
+```
 Legend: '>' = Wall, 'e' = Passage
 
 What is the minimum number of steps to reach the goal?
@@ -808,8 +808,8 @@ Question: Navigate from '`' (start) to 'i' (goal):
 4A4AA44
 44AAAA4
 44A`444
-4444444```
-
+4444444
+```
 Legend: '4' = Wall, 'A' = Passage
 
 What is the minimum number of steps to reach the goal?
@@ -826,29 +826,29 @@ QQ`%Q%Q
 Q%%Q%%Q
 Q%%%Q%Q
 Q%QQ%(Q
-QQQQQQQ```
-
+QQQQQQQ
+```
 Legend: 'Q' = Wall, '%' = Passage
 
 What is the minimum number of steps to reach the goal?
 Answer: 8
 Metadata: {'grid_size': 7, 'grid': ['QQQQQQQ', 'QQ%%%%Q', 'QQ`%Q%Q', 'Q%%Q%%Q', 'Q%%%Q%Q', 'Q%QQ%(Q', 'QQQQQQQ'], 'shortest_path_length': 8, 'start': '(', 'goal': '`', 'wall': 'Q', 'path': '%'}
 
-```
+````
 
 ### mini_sudoku
 Generates 4x4 sudoku puzzles with configurable difficulty
 
 Default configuration:
-```python
+````python
 min_empty = 8
 max_empty = 12
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Solve this 4x4 Mini Sudoku puzzle:
 _ _ _ _
@@ -885,13 +885,13 @@ Answer: 2 4 1 3
 4 2 3 1
 Metadata: {'puzzle': [[0, 0, 0, 0], [1, 3, 4, 0], [3, 1, 2, 4], [4, 0, 0, 0]], 'solution': [[2, 4, 1, 3], [1, 3, 4, 2], [3, 1, 2, 4], [4, 2, 3, 1]], 'num_empty': 8}
 
-```
+````
 
 ### number_filtering
 Generates number filtering tasks
 
 Default configuration:
-```python
+````python
 min_numbers = 3
 max_numbers = 10
 min_decimals = 0
@@ -900,10 +900,10 @@ min_value = -100.0
 max_value = 100.0
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Keep all numbers larger than -90 in this list: ['-95.00', '-51.0', '47.2942', '-82.612']
 Answer: ['-51.0', '47.2942', '-82.612']
@@ -919,13 +919,13 @@ Question: Keep all numbers larger than 19.8962 in this list: ['4', '-64.7', '-42
 Answer: ['37.76', '38.702']
 Metadata: {'original_numbers': ['4', '-64.7', '-42.1', '-77', '-79.9640', '37.76', '38.702', '18.20', '-28.34'], 'filter_value': '19.8962', 'operation': 'keep_larger', 'result': ['37.76', '38.702']}
 
-```
+````
 
 ### number_sequence
 Generates number sequence completion tasks with dynamic pattern generation
 
 Default configuration:
-```python
+````python
 min_terms = 4
 max_terms = 8
 min_value = -100
@@ -933,10 +933,10 @@ max_value = 100
 max_complexity = 3
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: 3, 6, 12, 24, 48, 96, 192, 384, ?
 Answer: 768
@@ -952,13 +952,13 @@ Question: 8, 4, 2, 1, 0, 0, 0, ?
 Answer: 0
 Metadata: {'rule': 'halve', 'complexity': 2, 'sequence': [8, 4, 2, 1, 0, 0, 0, 0]}
 
-```
+````
 
 ### number_sorting
 Generates number sorting tasks
 
 Default configuration:
-```python
+````python
 min_numbers = 3
 max_numbers = 10
 min_decimals = 0
@@ -967,10 +967,10 @@ min_value = -100.0
 max_value = 100.0
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Sort these numbers in ascending order: 48, -51, -72, -80
 Answer: ['-80', '-72', '-51', '48']
@@ -986,7 +986,7 @@ Question: Sort these numbers in descending order: 8.39, 72.41, -64.67, -54.97, -
 Answer: ['72.41', '8.39', '2.74', '-54.97', '-64.67', '-68.66', '-76.67', '-94.18', '-98.24']
 Metadata: {'original_numbers': ['8.39', '72.41', '-64.67', '-54.97', '-94.18', '-76.67', '-98.24', '-68.66', '2.74'], 'direction': 'descending', 'sorted_numbers': ['72.41', '8.39', '2.74', '-54.97', '-64.67', '-68.66', '-76.67', '-94.18', '-98.24']}
 
-```
+````
 
 ### polynomial_equations
 Generates random polynomial equations of degree in [min_degree, max_degree].
@@ -995,7 +995,7 @@ Generates random polynomial equations of degree in [min_degree, max_degree].
     - The solution may be real or complex; we filter real solutions by default for simplicity.
 
 Default configuration:
-```python
+````python
 min_terms = 2
 max_terms = 4
 min_value = 1
@@ -1005,10 +1005,10 @@ max_degree = 3
 operators = ('+', '-')
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Find the real value(s) of u in the equation: -127*u = 0
 Answer: [0.0]
@@ -1024,21 +1024,21 @@ Question: Determine the real value(s) of n tha satisfies: 71*n**3 - 2*n - 29 = 0
 Answer: [0.7546129960163634]
 Metadata: {'polynomial_expr': '71*n**3 - 2*n - 29', 'variable': 'n', 'degree': 3, 'real_solutions': [0.7546129960163634]}
 
-```
+````
 
 ### prime_factorization
 Generates prime factorization tasks
 
 Default configuration:
-```python
+````python
 min_value = 2
 max_value = 1000
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Find the prime factorization of 656. Write the factors separated by × (Example: for 12 the answer would be: 2 × 2 × 3)
 Answer: 2 × 2 × 2 × 2 × 41
@@ -1054,13 +1054,13 @@ Question: Find the prime factorization of 420. Write the factors separated by ×
 Answer: 2 × 2 × 3 × 5 × 7
 Metadata: {'number': 420, 'factors': [2, 2, 3, 5, 7]}
 
-```
+````
 
 ### propositional_logic
 Generates propositional logic reasoning tasks
 
 Default configuration:
-```python
+````python
 min_vars = 2
 max_vars = 4
 min_statements = 2
@@ -1068,10 +1068,10 @@ max_statements = 4
 max_complexity = 3
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Given:
 1. R
@@ -1099,20 +1099,20 @@ What can we conclude?
 Answer: (Q ∧ Q)
 Metadata: {'premises': ['((Q ∨ P) ∧ ¬P)', 'P', '((P ∧ R) ∧ ¬R)', '((Q ↔ R) → ¬Q)'], 'variables': ['P', 'Q', 'R'], 'complexity': 3}
 
-```
+````
 
 ### quantum_lock
 Generates QuantumLock tasks
 
 Default configuration:
-```python
+````python
 difficulty = 10
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
 You must press the shortest correct sequence of buttons to reach the target value.
@@ -1152,22 +1152,22 @@ C: Add 2 (when any)
 Answer: B → B → B → B → B → B → B → B → B → B → B → B → B → B → B
 Metadata: {'difficulty': 10, 'solution_path': ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'], 'target_value': 45, 'buttons': [{'name': 'A', 'type': 'subtract', 'value': 2, 'active_state': 'any'}, {'name': 'B', 'type': 'add', 'value': 3, 'active_state': 'any'}, {'name': 'C', 'type': 'add', 'value': 2, 'active_state': 'any'}], 'initial_state': 'red', 'initial_value': 0}
 
-```
+````
 
 ### rubiks_cube
 Generates RubiksCube tasks
 
 Default configuration:
-```python
+````python
 scramble_steps = 3
 cube_size = 3
 remove_ansi = True
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: You are given a 3x3x3 Rubik's cube. It looks like this:
 
@@ -1222,21 +1222,21 @@ Please provide a solution to solve this cube using Singmaster notation.
 Answer: None
 Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U R' R'", 'example_correct_answer': "R R U'"}
 
-```
+````
 
 ### sentence_reordering
 Generates sentence reordering tasks from text spans
 
 Default configuration:
-```python
+````python
 min_words_in_sentence = 3
 max_words_in_sentence = 20
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Restore the correct order of words in the following sentence: wish could get I sleep. "I some
 Answer: "I wish I could get some sleep.
@@ -1252,13 +1252,13 @@ Question: Restore the correct order of words in the following sentence: develope
 Answer: For ages the the energy developed by falls went unutilized.
 Metadata: {'word_count': 10}
 
-```
+````
 
 ### simple_equations
 Generates simple equations with one variable to solve
 
 Default configuration:
-```python
+````python
 min_terms = 2
 max_terms = 4
 min_value = 1
@@ -1266,10 +1266,10 @@ max_value = 100
 operators = ('+', '-', '*')
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Determine the value of u that satisfies: 32*u + 4 = 580
 Answer: 18
@@ -1285,20 +1285,20 @@ Question: Determine the value of n that satisfies: 29*n - 5 = 430
 Answer: 15
 Metadata: {'equation': '29*n - 5 = 430', 'variable': 'n'}
 
-```
+````
 
 ### spell_backward
 Generates tasks to spell words backward
 
 Default configuration:
-```python
+````python
 min_word_len = 3
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Spell this word backward (example: sun -> nus): Project
 Answer: tcejorP
@@ -1314,21 +1314,21 @@ Question: Spell this word backward (example: sun -> nus): One
 Answer: enO
 Metadata: {'word': 'One', 'word_len': 3}
 
-```
+````
 
 ### sudoku
 Generates sudoku puzzles with configurable difficulty
 
 Default configuration:
-```python
+````python
 min_empty = 30
 max_empty = 50
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Solve this Sudoku puzzle:
 4 _ _ _ 5 2 _ 3 _
@@ -1395,13 +1395,13 @@ Answer: 5 6 1 2 3 7 8 4 9
 4 1 9 5 2 8 3 6 7
 Metadata: {'puzzle': [[0, 0, 1, 2, 3, 0, 0, 0, 9], [3, 0, 0, 1, 8, 5, 6, 7, 2], [0, 0, 0, 4, 9, 6, 1, 0, 0], [1, 0, 5, 7, 0, 0, 9, 2, 0], [0, 4, 0, 0, 5, 9, 7, 1, 6], [9, 0, 6, 0, 1, 0, 4, 5, 3], [0, 0, 3, 9, 7, 0, 2, 8, 4], [0, 0, 2, 6, 4, 0, 0, 9, 1], [0, 1, 0, 5, 2, 8, 3, 0, 0]], 'solution': [[5, 6, 1, 2, 3, 7, 8, 4, 9], [3, 9, 4, 1, 8, 5, 6, 7, 2], [8, 2, 7, 4, 9, 6, 1, 3, 5], [1, 3, 5, 7, 6, 4, 9, 2, 8], [2, 4, 8, 3, 5, 9, 7, 1, 6], [9, 7, 6, 8, 1, 2, 4, 5, 3], [6, 5, 3, 9, 7, 1, 2, 8, 4], [7, 8, 2, 6, 4, 3, 5, 9, 1], [4, 1, 9, 5, 2, 8, 3, 6, 7]], 'num_empty': 33}
 
-```
+````
 
 ### syllogism
 Generates syllogism reasoning tasks
 
 Default configuration:
-```python
+````python
 terms = None
 allow_all = True
 allow_no = True
@@ -1411,10 +1411,10 @@ include_invalid = True
 invalid_ratio = 0.3
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Consider these statements:
 1. No students are humans
@@ -1448,23 +1448,23 @@ Some ... are not butterflies are whales?
 Answer: No
 Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are whales', 'conclusion': 'Some ... are not butterflies are whales', 'is_valid': False}
 
-```
+````
 
 ### word_ladder
 Generates word ladder transformation tasks
 
 Default configuration:
-```python
+````python
 min_word_length = 3
 max_word_length = 5
 min_chain_length = -1
 max_chain_length = -1
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Transform the word 'CEILS' into 'ANIGH' by changing one letter at a time. Each step must create a valid English word (including plurals) and keep the same word length. Show the sequence of words needed.
 Answer: CEILS,TEILS,TEINS,THINS,THIGS,THIGH,AHIGH,ANIGH
@@ -1480,21 +1480,21 @@ Question: Transform the word 'SAUT' into 'SKER' by changing one letter at a time
 Answer: SAUT,SHUT,SHET,SKET,SKER
 Metadata: {'start_word': 'SAUT', 'end_word': 'SKER', 'word_length': 4, 'chain_length': 5}
 
-```
+````
 
 ### word_sequence_reversal
 Generates word sequence reversal tasks from text spans
 
 Default configuration:
-```python
+````python
 min_words = 3
 max_words = 8
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Reverse this list of words: bed, if, problem, but, Well, an, transmission, nutritive
 Answer: nutritive, transmission, an, Well, but, problem, if, bed
@@ -1510,13 +1510,13 @@ Question: Reverse this list of words: readable, to, he, that, to, possession
 Answer: possession, to, that, he, to, readable
 Metadata: {'num_words': 6, 'words': ['readable', 'to', 'he', 'that', 'to', 'possession']}
 
-```
+````
 
 ### word_sorting
 Generates word sorting tasks
 
 Default configuration:
-```python
+````python
 min_words = 3
 max_words = 10
 min_word_length = 3
@@ -1524,10 +1524,10 @@ max_word_length = 12
 transformation = original
 seed = 42
 size = 500
-```
+````
 
 Example tasks:
-```
+````
 Example 1:
 Question: Sort these words in ascending order (using ASCII/Unicode ordering) and return them as a comma-separated list:
 DIRECT, given, exclaims, dreaming
@@ -1546,6 +1546,6 @@ violates, yes, already, completing, pages, duty, his, EXPRESS, duly
 Answer: EXPRESS, already, completing, duly, duty, his, pages, violates, yes
 Metadata: {'original_words': ['violates', 'yes', 'already', 'completing', 'pages', 'duty', 'his', 'EXPRESS', 'duly'], 'transformed_words': ['violates', 'yes', 'already', 'completing', 'pages', 'duty', 'his', 'EXPRESS', 'duly'], 'direction': 'ascending', 'transformation': <TextTransformation.ORIGINAL: 'original'>, 'sorted_words': ['EXPRESS', 'already', 'completing', 'duly', 'duty', 'his', 'pages', 'violates', 'yes']}
 
-```
+````
 
 
diff --git a/reasoning_gym/games/maze.py b/reasoning_gym/games/maze.py
index b2c6a777..cc110f42 100644
--- a/reasoning_gym/games/maze.py
+++ b/reasoning_gym/games/maze.py
@@ -92,7 +92,7 @@ class MazeDataset(ProceduralDataset):
                     f"Navigate from '{self.start_char}' (start) to '{self.goal_char}' (goal):\n\n"
                     + "```\n"
                     + self._maze_to_str(maze_grid)
-                    + "```\n"
+                    + "\n```"
                     + "\nLegend: "
                     + f"'{self.wall_char}' = Wall, '{self.path_char}' = Passage\n\n"
                     + "What is the minimum number of steps to reach the goal?"
diff --git a/scripts/generate_gallery.py b/scripts/generate_gallery.py
index 3eef4ff5..08f482e4 100755
--- a/scripts/generate_gallery.py
+++ b/scripts/generate_gallery.py
@@ -46,7 +46,7 @@ def generate_gallery() -> str:
 
         # Show examples
         content.append("Example tasks:\n")
-        content.append("```\n")
+        content.append("````\n")
         for i, item in enumerate(dataset):
             if i >= 3:
                 break
@@ -56,7 +56,7 @@ def generate_gallery() -> str:
             if item.get("metadata"):
                 content.append(f"Metadata: {item['metadata']}\n")
             content.append("\n")
-        content.append("```\n\n")
+        content.append("````\n\n")
 
     return "".join(content)
 

From 131e0d8f19ca9d1329a924240c8131777ac0e717 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-172-31-2-175.eu-north-1.compute.internal>
Date: Fri, 31 Jan 2025 06:42:25 +0000
Subject: [PATCH 19/94] added countdown score answer impl

---
 reasoning_gym/games/countdown.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/reasoning_gym/games/countdown.py b/reasoning_gym/games/countdown.py
index 4721844d..87d1a793 100644
--- a/reasoning_gym/games/countdown.py
+++ b/reasoning_gym/games/countdown.py
@@ -1,9 +1,10 @@
 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Any
 
 import sympy
 from sympy import Symbol, symbols
+from sympy.parsing.sympy_parser import parse_expr
 
 from ..factory import ProceduralDataset, register_dataset
 
@@ -157,6 +158,23 @@ class CountdownDataset(ProceduralDataset):
                 continue
 
         raise ValueError(f"Failed to generate valid expression after {max_attempts} attempts")
+    
+    def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
+        """Score an answer expression against metadata["target"].
+
+        Returns 1.0 if it evaluates to the target, 0.05 if it parses to another value,
+        0.01 if it is empty or cannot be evaluated, and 0.0 if answer is None.
+        """
+        if answer is None:
+            return 0.0
+        try:
+            solved = int(parse_expr(answer)) == metadata["target"]
+        except Exception:  # was a bare except, which also swallowed KeyboardInterrupt/SystemExit
+            return 0.01
+        if solved:
+            return 1.0
+        # A non-blank expression that evaluates to the wrong value still earns partial credit.
+        return 0.05 if answer.strip() else 0.01
 
 
 # Register the dataset

From 4fea3c3378f90e2298c56a3b85ae8c8b87dec626 Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Fri, 31 Jan 2025 06:46:18 +0000
Subject: [PATCH 20/94] added testing of score answer method

---
 tests/test_countdown.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_countdown.py b/tests/test_countdown.py
index e426caf2..04bf1a8b 100644
--- a/tests/test_countdown.py
+++ b/tests/test_countdown.py
@@ -64,6 +64,14 @@ def test_countdown_game_items():
 
         # Verify expression evaluates correctly
         expr = item["metadata"]["expression"]
+        
+        #check score
+        assert dataset.score_answer(answer=expr, metadata=item["metadata"]) == 1.0  #correct answer
+        assert dataset.score_answer(answer="45+2", metadata=item["metadata"]) == 0.05  #wrong answer but an attempt
+        assert dataset.score_answer(answer="a wrong solution", metadata=item["metadata"]) == 0.01  #wrong answer but incorrectly formatted
+        assert dataset.score_answer(answer="", metadata=item["metadata"]) == 0.01  #wrong answer but empty string
+        assert dataset.score_answer(answer=None, metadata=item["metadata"]) == 0.0  #no answer
+        
         try:
             result = eval(expr)  # Safe here since we control expression generation
             assert result == item["metadata"]["target"]

From 4bf99d3e0bd1f1eee98ce0388828153eb7afdc03 Mon Sep 17 00:00:00 2001
From: Joe Norton <16323+joenorton@users.noreply.github.com>
Date: Thu, 30 Jan 2025 23:16:06 -0800
Subject: [PATCH 21/94] adds Tower of Hanoi

Creates the game module and its test file, and updates games/__init__.py to export the Tower of Hanoi classes.
---
 reasoning_gym/games/__init__.py       |   3 +
 reasoning_gym/games/tower_of_hanoi.py | 364 ++++++++++++++++++++++++++
 tests/test_tower_of_hanoi.py          | 231 ++++++++++++++++
 3 files changed, 598 insertions(+)
 create mode 100644 reasoning_gym/games/tower_of_hanoi.py
 create mode 100644 tests/test_tower_of_hanoi.py

diff --git a/reasoning_gym/games/__init__.py b/reasoning_gym/games/__init__.py
index a801c6e4..9174089e 100644
--- a/reasoning_gym/games/__init__.py
+++ b/reasoning_gym/games/__init__.py
@@ -11,6 +11,7 @@ from .game_of_life import GameOfLifeConfig, GameOfLifeDataset
 from .maze import MazeConfig, MazeDataset
 from .mini_sudoku import MiniSudokuConfig, MiniSudokuDataset
 from .sudoku import SudokuConfig, SudokuDataset
+from .tower_of_hanoi import HanoiConfig, HanoiDataset
 
 __all__ = [
     "CountdownConfig",
@@ -23,4 +24,6 @@ __all__ = [
     "MazeDataset",
     "GameOfLifeConfig",
     "GameOfLifeDataset",
+    "HanoiConfig",
+    "HanoiDataset"
 ]
diff --git a/reasoning_gym/games/tower_of_hanoi.py b/reasoning_gym/games/tower_of_hanoi.py
new file mode 100644
index 00000000..081b3bc6
--- /dev/null
+++ b/reasoning_gym/games/tower_of_hanoi.py
@@ -0,0 +1,364 @@
+# reasoning_gym/games/tower_of_hanoi.py
+
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Tuple
+import math
+import random
+import re
+
+from ..factory import ProceduralDataset, register_dataset
+
+@dataclass
+class HanoiConfig:
+    """
+    Configuration for the Tower of Hanoi task.
+
+    - min_disks: Minimum number of disks in the puzzle.
+    - max_disks: Maximum number of disks in the puzzle.
+    - min_pegs: Minimum number of pegs (minimum 3).
+    - max_pegs: Maximum number of pegs.
+    - size: Number of problem instances in the dataset.
+    - seed: Optional seed for reproducibility.
+    - visualize: Whether to include a visualization of the initial state.
+    """
+    
+    min_disks: int = 3
+    max_disks: int = 7
+    min_pegs: int = 3
+    max_pegs: int = 4
+    size: int = 50
+    seed: Optional[int] = None
+    visualize: bool = False  # New parameter
+    
+    def validate(self) -> None:
+        """Validate configuration parameters."""
+        assert self.min_disks >= 1, "min_disks must be at least 1"
+        assert self.max_disks >= self.min_disks, "max_disks must be >= min_disks"
+        assert self.min_pegs >= 3, "min_pegs must be at least 3"
+        assert self.max_pegs >= self.min_pegs, "max_pegs must be >= min_pegs"
+
+class MoveGenerator:
+    """
+    Helper class to generate valid move sequences for Tower of Hanoi using the Frame-Stewart algorithm.
+    It maintains the current state of all pegs to ensure move validity.
+    """
+    
+    def __init__(self, num_disks: int, pegs: List[int], start: int, target: int):
+        self.num_disks = num_disks
+        self.pegs = pegs
+        self.start = start
+        self.target = target
+        self.auxiliary_pegs = [peg for peg in pegs if peg not in (start, target)]
+        self.pegs_state: Dict[int, List[int]] = {peg: [] for peg in pegs}
+        for disk in range(num_disks, 0, -1):  # Largest disk at the bottom
+            self.pegs_state[start].append(disk)
+        self.moves: List[str] = []
+        self.memo: Dict[Tuple[int, int], int] = {}  # Memoization for T(n, k)
+    
+    def generate_moves(self) -> List[str]:
+        self.move(n=self.num_disks, source=self.start, target=self.target, auxiliary_pegs=self.auxiliary_pegs)
+        return self.moves
+    
+    def move(self, n: int, source: int, target: int, auxiliary_pegs: List[int]):
+        if n == 0:
+            return
+        if n == 1:
+            self._move_disk(source, target)
+            return
+        
+        k = len(auxiliary_pegs) + 2  # Total number of pegs including source and target
+        
+        if k < 3:
+            raise ValueError("At least 3 pegs are required.")
+        
+        if k == 3:
+            # Classic Tower of Hanoi solution
+            aux = auxiliary_pegs[0]
+            self.move(n - 1, source, aux, [target])
+            self._move_disk(source, target)
+            self.move(n - 1, aux, target, [source])
+            return
+        
+        # For k > 3, apply Frame-Stewart algorithm
+        # Find m that minimizes 2*T(m, k) + T(n - m, k - 1)
+        min_moves = math.inf
+        best_m = 1
+        for m in range(1, n):
+            moves_m = self._compute_T(m, k)
+            moves_n_minus_m = self._compute_T(n - m, k - 1)
+            total_moves = 2 * moves_m + moves_n_minus_m
+            if total_moves < min_moves:
+                min_moves = total_moves
+                best_m = m
+        
+        # Select a temporary peg to hold m disks
+        temp_peg = auxiliary_pegs[0]
+        new_auxiliary = [peg for peg in auxiliary_pegs if peg != temp_peg]
+        
+        # Step 1: Move top m disks to temp_peg using all pegs
+        self.move(n=best_m, source=source, target=temp_peg, auxiliary_pegs=auxiliary_pegs[1:] + [target])
+        
+        # Step 2: Move remaining n - m disks to target using k - 1 pegs
+        self.move(n=n - best_m, source=source, target=target, auxiliary_pegs=new_auxiliary)
+        
+        # Step 3: Move m disks from temp_peg to target using all pegs
+        self.move(n=best_m, source=temp_peg, target=target, auxiliary_pegs=auxiliary_pegs[1:] + [source])
+    
+    def _move_disk(self, from_peg: int, to_peg: int):
+        if not self.pegs_state[from_peg]:
+            raise ValueError(f"No disks to move from Peg {from_peg}.")
+        disk = self.pegs_state[from_peg][-1]
+        self.pegs_state[from_peg].pop()
+        self.pegs_state[to_peg].append(disk)
+        self.moves.append(f"Move disk {disk} from Peg {from_peg} to Peg {to_peg}")
+    
+    def _compute_T(self, n: int, k: int) -> int:
+        """
+        Compute the minimal number of moves (T(n, k)) required to move n disks using k pegs.
+        Utilizes memoization to store previously computed results.
+        """
+        if n == 0:
+            return 0
+        if n == 1:
+            return 1
+        if k == 3:
+            return 2 ** n - 1
+        if (n, k) in self.memo:
+            return self.memo[(n, k)]
+        
+        min_moves = math.inf
+        for m in range(1, n):
+            moves = 2 * self._compute_T(m, k) + self._compute_T(n - m, k - 1)
+            if moves < min_moves:
+                min_moves = moves
+        self.memo[(n, k)] = min_moves
+        return min_moves
+
+class HanoiDataset(ProceduralDataset):
+    """
+    Generates Tower of Hanoi problems with solutions.
+    Supports variable number of pegs using the optimized Frame-Stewart algorithm with Peg State Tracking.
+    """
+    
+    def __init__(self, config: HanoiConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self.min_pegs = config.min_pegs
+        self.max_pegs = config.max_pegs
+        self.min_disks = config.min_disks
+        self.max_disks = config.max_disks
+        self.visualize = config.visualize  # Initialize the visualize attribute
+        
+    def __getitem__(self, idx: int) -> dict:
+        """
+        Generate a Tower of Hanoi problem instance.
+        
+        Returns:
+            dict with:
+            - "question": Text describing the problem setup.
+            - "answer": List of moves to solve the puzzle.
+            - "metadata": Configuration and solution details.
+            - "initial_state": (Optional) ASCII visualization of the initial pegs.
+            - "states": (Optional) List of ASCII visualizations after each move.
+        """
+        rng = random.Random(self.seed + idx if self.seed is not None else None)
+        
+        # Randomly select number of disks and pegs within the specified ranges
+        num_disks = rng.randint(self.min_disks, self.max_disks)
+        num_pegs = rng.randint(self.min_pegs, self.max_pegs)
+        
+        # Assign unique peg identifiers (e.g., integers starting from 1)
+        pegs = list(range(1, num_pegs + 1))
+        
+        """ #Debug: Print current instance configuration
+        print(f"\n--- Generating Instance {idx} ---")
+        print(f"Number of Disks: {num_disks}")
+        print(f"Number of Pegs: {num_pegs}")
+        print(f"Pegs: {pegs}")
+        """
+        
+        # Randomly select start and target pegs
+        start_peg, target_peg = rng.sample(pegs, 2)
+        
+        # Auxiliary pegs are the remaining pegs
+        auxiliary_pegs = [peg for peg in pegs if peg not in (start_peg, target_peg)]
+        
+        """ # Debug: Print start, target, and auxiliary pegs
+        print(f"Start Peg: {start_peg}")
+        print(f"Target Peg: {target_peg}")
+        print(f"Auxiliary Pegs: {auxiliary_pegs}")
+        """
+        
+        # Initialize the MoveGenerator and generate moves
+        move_gen = MoveGenerator(num_disks, pegs, start_peg, target_peg)
+        try:
+            solution = move_gen.generate_moves()
+        except ValueError as ve:
+            # print(f"Error during move generation: {ve}")
+            raise ve
+        
+        """ # Debug: Print the solution moves
+        print(f"Solution Length: {len(solution)}")
+        print("Solution Moves:")
+        for move_num, move in enumerate(solution, start=1):
+            print(f"  Move {move_num}: {move}")
+        """
+        
+        # Initialize pegs_state: all disks start on the start peg
+        pegs_state = {peg: [] for peg in pegs}
+        for disk in range(num_disks, 0, -1):  # Largest disk at the bottom
+            pegs_state[start_peg].append(disk)
+        
+        # Generate initial state visualization if requested
+        initial_state_str = None
+        if self.visualize:
+            initial_state_str = self._visualize_state(pegs_state)
+        
+        # Apply moves to track state changes
+        states = []
+        if self.visualize:
+            states.append(initial_state_str)  # Initial state
+            for move in solution:
+                # Parse the move string using regex
+                try:
+                    disk, from_peg, to_peg = self._parse_move(move)
+                except ValueError as ve:
+                    # print(f"Error parsing move: {ve}")
+                    raise ve
+                
+                # Validate the move
+                if not self._validate_move(pegs_state, move):
+                    #print(f"Invalid move detected: {move}")
+                    #print(f"Current Pegs State: {pegs_state}")
+                    raise ValueError(f"Invalid move detected: {move}")
+                
+                # Move the disk
+                pegs_state[from_peg].pop()
+                pegs_state[to_peg].append(disk)
+                
+                # Visualize the new state
+                new_state_str = self._visualize_state(pegs_state)
+                states.append(new_state_str)
+        
+        # Peg labels
+        peg_labels = {peg: f"Peg {peg}" for peg in pegs}
+        
+        question_str = (
+            f"Solve the Tower of Hanoi problem with {num_disks} disks and {num_pegs} pegs.\n"
+            f"Move all disks from {peg_labels[start_peg]} to {peg_labels[target_peg]} following the rules:\n"
+            "- Only one disk can be moved at a time.\n"
+            "- A larger disk cannot be placed on top of a smaller disk.\n"
+            "- All disks must be on a peg at all times.\n"
+            "Provide the sequence of moves."
+        )
+        
+        result = {
+            "question": question_str,
+            "answer": solution,
+            "metadata": {
+                "num_disks": num_disks,
+                "num_pegs": num_pegs,
+                "start_peg": start_peg,
+                "target_peg": target_peg,
+                "auxiliary_pegs": auxiliary_pegs,
+                "solution_length": len(solution),
+            },
+        }
+        
+        if self.visualize:
+            result["initial_state"] = initial_state_str
+            result["states"] = states  # List of all states including initial and after each move
+        
+        return result
+    
+    def _visualize_state(self, pegs_state: Dict[int, List[int]]) -> str:
+        """
+        Create an ASCII visualization of the current state of the pegs.
+        Adapts to variable number of pegs.
+        
+        Args:
+            pegs_state (dict): Dictionary mapping peg numbers to lists of disks.
+        
+        Returns:
+            str: ASCII art representing the pegs and disks.
+        """
+        # Determine the number of levels based on the maximum number of disks on any peg
+        max_height = max(len(disks) for disks in pegs_state.values())
+        pegs = sorted(pegs_state.keys())
+        
+        visualization = ""
+        for level in range(max_height, 0, -1):
+            for peg in pegs:
+                if len(pegs_state[peg]) >= level:
+                    disk_size = pegs_state[peg][level - 1]
+                    disk_str = f"[{'*' * disk_size}]"
+                else:
+                    disk_str = "[ ]"
+                visualization += disk_str.center(7)  # Adjust spacing as needed
+            visualization += "\n"
+        
+        # Add the base and peg numbers
+        visualization += "-" * (7 * len(pegs)) + "\n"
+        for peg in pegs:
+            peg_label = f"P{peg}".center(7)
+            visualization += peg_label
+        visualization += "\n"
+        
+        return visualization
+    
+    def _validate_move(self, pegs_state: Dict[int, List[int]], move: str) -> bool:
+        """
+        Validate that a move adheres to the Tower of Hanoi rules.
+        
+        Args:
+            pegs_state (dict): Current state of the pegs.
+            move (str): Move instruction, e.g., "Move disk 2 from Peg 1 to Peg 3".
+        
+        Returns:
+            bool: True if the move is valid, False otherwise.
+        """
+        try:
+            parts = move.split()
+            if len(parts) != 9:
+                # print(f"Unexpected move format: '{move}'")
+                return False
+            disk = int(parts[2])
+            from_peg = int(parts[5])
+            to_peg = int(parts[8])
+            
+            # Check if the disk to move is the top disk on the from_peg
+            if not pegs_state[from_peg] or pegs_state[from_peg][-1] != disk:
+                # print(f"Disk {disk} is not on top of Peg {from_peg}. Current state: {pegs_state[from_peg]}")
+                return False
+            
+            # Check if placing the disk on the to_peg violates size constraints
+            if pegs_state[to_peg] and pegs_state[to_peg][-1] < disk:
+                # print(f"Cannot place disk {disk} on top of smaller disk {pegs_state[to_peg][-1]} on Peg {to_peg}.")
+                return False
+            
+            return True
+        except Exception as e:
+            print(f"Error validating move '{move}': {e}")
+            return False
+    
+    def _parse_move(self, move: str) -> Tuple[int, int, int]:
+        """
+        Parse a move string and extract disk number, from peg, and to peg.
+        
+        Args:
+            move (str): Move instruction, e.g., "Move disk 2 from Peg 1 to Peg 3".
+        
+        Returns:
+            tuple: (disk, from_peg, to_peg)
+        """
+        pattern = r"Move disk (\d+) from Peg (\d+) to Peg (\d+)"
+        match = re.match(pattern, move)
+        if not match:
+            raise ValueError(f"Unexpected move format: '{move}'")
+        
+        disk = int(match.group(1))
+        from_peg = int(match.group(2))
+        to_peg = int(match.group(3))
+        return disk, from_peg, to_peg
+
+# Register the dataset
+register_dataset("tower_of_hanoi", HanoiDataset, HanoiConfig)
diff --git a/tests/test_tower_of_hanoi.py b/tests/test_tower_of_hanoi.py
new file mode 100644
index 00000000..a3a89023
--- /dev/null
+++ b/tests/test_tower_of_hanoi.py
@@ -0,0 +1,231 @@
+"""Tests for Tower of Hanoi puzzle generation"""
+
+import pytest
+import re
+
+from reasoning_gym.games.tower_of_hanoi import HanoiConfig, HanoiDataset
+
+def test_toh_config_validation():
+    """Test that invalid configurations raise appropriate errors."""
+    # Test negative number of disks
+    with pytest.raises(AssertionError):
+        config = HanoiConfig(min_disks=0)  # At least 1 disk required
+        config.validate()
+    
+    # Test max_disks less than min_disks
+    with pytest.raises(AssertionError):
+        config = HanoiConfig(min_disks=5, max_disks=3)
+        config.validate()
+    
+    # Test min_pegs less than 3
+    with pytest.raises(AssertionError):
+        config = HanoiConfig(min_pegs=2)
+        config.validate()
+    
+    # Test max_pegs less than min_pegs
+    with pytest.raises(AssertionError):
+        config = HanoiConfig(min_pegs=3, max_pegs=2)
+        config.validate()
+    
+    # Test invalid move configurations if any (assuming such validations exist)
+    # Add more tests based on the actual validation logic in HanoiConfig
+
+def test_toh_dataset_deterministic():
+    """Test that dataset generates the same items with the same seed."""
+    config = HanoiConfig(seed=42, size=10)
+    dataset1 = HanoiDataset(config)
+    dataset2 = HanoiDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i], f"Mismatch found in instance {i} with seed 42."
+
+def test_toh_dataset_items():
+    """Test basic properties of generated items."""
+    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
+    dataset = HanoiDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        
+        # Check item structure
+        assert isinstance(item, dict), f"Item {i} is not a dictionary."
+        assert "question" in item, f"Item {i} missing 'question' key."
+        assert "answer" in item, f"Item {i} missing 'answer' key."
+        assert "metadata" in item, f"Item {i} missing 'metadata' key."
+        
+        # Check metadata
+        metadata = item["metadata"]
+        assert "num_disks" in metadata, f"Item {i} metadata missing 'num_disks'."
+        assert "num_pegs" in metadata, f"Item {i} metadata missing 'num_pegs'."
+        assert "start_peg" in metadata, f"Item {i} metadata missing 'start_peg'."
+        assert "target_peg" in metadata, f"Item {i} metadata missing 'target_peg'."
+        assert "auxiliary_pegs" in metadata, f"Item {i} metadata missing 'auxiliary_pegs'."
+        assert "solution_length" in metadata, f"Item {i} metadata missing 'solution_length'."
+        
+        num_disks = metadata["num_disks"]
+        num_pegs = metadata["num_pegs"]
+        start_peg = metadata["start_peg"]
+        target_peg = metadata["target_peg"]
+        auxiliary_pegs = metadata["auxiliary_pegs"]
+        solution_length = metadata["solution_length"]
+        
+        # Verify peg counts
+        assert num_pegs == len(metadata["auxiliary_pegs"]) + 2, (
+            f"Item {i} has inconsistent peg counts."
+        )
+        
+        # Verify solution_length consistency
+        assert solution_length == len(item["answer"]), (
+            f"Item {i} metadata 'solution_length' does not match actual number of moves."
+        )
+        
+        # Optional: Additional checks like verifying that start and target pegs are distinct
+        assert start_peg != target_peg, f"Item {i} has identical start and target pegs."
+
+def test_toh_move_validity():
+    """Test that all moves in each problem instance are valid according to Tower of Hanoi rules."""
+    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
+    dataset = HanoiDataset(config)
+
+    for idx, instance in enumerate(dataset):
+        num_disks = instance['metadata']['num_disks']
+        num_pegs = instance['metadata']['num_pegs']
+        start_peg = instance['metadata']['start_peg']
+        target_peg = instance['metadata']['target_peg']
+        auxiliary_pegs = instance['metadata']['auxiliary_pegs']
+        pegs = list(range(1, num_pegs + 1))
+        
+        # Initialize pegs_state: all disks start on the start peg
+        pegs_state = {peg: [] for peg in pegs}
+        for disk in range(num_disks, 0, -1):
+            pegs_state[start_peg].append(disk)
+        
+        # Iterate over each move and validate
+        for move_num, move in enumerate(instance['answer'], start=1):
+            disk, from_peg, to_peg = parse_move(move)
+            
+            # Check that from_peg exists
+            assert from_peg in pegs, (
+                f"Move {move_num} in Instance {idx} references non-existent from_peg {from_peg}."
+            )
+            
+            # Check that to_peg exists
+            assert to_peg in pegs, (
+                f"Move {move_num} in Instance {idx} references non-existent to_peg {to_peg}."
+            )
+            
+            # Check that from_peg is not empty
+            assert pegs_state[from_peg], (
+                f"Move {move_num} in Instance {idx} attempts to move from an empty Peg {from_peg}."
+            )
+            
+            # Check that the disk to move is on top of from_peg
+            top_disk = pegs_state[from_peg][-1]
+            assert disk == top_disk, (
+                f"Move {move_num} in Instance {idx} attempts to move disk {disk} "
+                f"which is not on top of Peg {from_peg} (top disk: {top_disk})."
+            )
+            
+            # Check that moving disk to to_peg does not violate size constraints
+            if pegs_state[to_peg]:
+                top_to_disk = pegs_state[to_peg][-1]
+                assert top_to_disk > disk, (
+                    f"Move {move_num} in Instance {idx} attempts to place disk {disk} "
+                    f"on top of smaller disk {top_to_disk} on Peg {to_peg}."
+                )
+            
+            # Perform the move
+            pegs_state[from_peg].pop()
+            pegs_state[to_peg].append(disk)
+
+def test_toh_final_state_correct():
+    """Test that the final state of each problem instance has all disks on the target peg in correct order."""
+    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
+    dataset = HanoiDataset(config)
+
+    for idx, instance in enumerate(dataset):
+        num_disks = instance['metadata']['num_disks']
+        num_pegs = instance['metadata']['num_pegs']
+        start_peg = instance['metadata']['start_peg']
+        target_peg = instance['metadata']['target_peg']
+        auxiliary_pegs = instance['metadata']['auxiliary_pegs']
+        pegs = list(range(1, num_pegs + 1))
+        
+        # Initialize pegs_state: all disks start on the start peg
+        pegs_state = {peg: [] for peg in pegs}
+        for disk in range(num_disks, 0, -1):
+            pegs_state[start_peg].append(disk)
+        
+        # Perform all moves
+        for move in instance['answer']:
+            disk, from_peg, to_peg = parse_move(move)
+            pegs_state[from_peg].pop()
+            pegs_state[to_peg].append(disk)
+        
+        # After all moves, all disks should be on target peg in descending order
+        final_pegs = pegs_state[target_peg]
+        assert len(final_pegs) == num_disks, (
+            f"Instance {idx} does not have all disks on the target Peg {target_peg}."
+        )
+        
+        # Verify that disks are in correct order on target peg
+        expected_final = list(range(num_disks, 0, -1))
+        assert final_pegs == expected_final, (
+            f"Instance {idx} has disks on Peg {target_peg} in incorrect order."
+        )
+        
+        # Ensure all other pegs are empty
+        for peg in pegs:
+            if peg != target_peg:
+                assert len(pegs_state[peg]) == 0, (
+                    f"Instance {idx} has disks remaining on Peg {peg}, which should be empty."
+                )
+
+def test_toh_dataset_iteration():
+    """Test that iteration respects dataset size and multiple iterations yield the same items."""
+    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=5, seed=42)
+    dataset = HanoiDataset(config)
+
+    # Test dataset size
+    assert len(dataset) == config.size, (
+        f"Dataset size mismatch: expected {config.size}, got {len(dataset)}."
+    )
+    
+    # Collect items
+    items = list(dataset)
+    
+    # Test multiple iterations yield the same items
+    assert items == list(dataset), "Multiple iterations over the dataset do not yield the same items."
+
+def parse_move(move_str: str) -> tuple:
+    """Parse a move string and extract disk number, from peg, and to peg.
+
+    Args:
+        move_str (str): Move instruction, e.g., "Move disk 2 from Peg 1 to Peg 3".
+
+    Returns:
+        tuple: (disk, from_peg, to_peg)
+    """
+    pattern = r"Move disk (\d+) from Peg (\d+) to Peg (\d+)"
+    match = re.match(pattern, move_str)
+    assert match is not None, f"Move string '{move_str}' does not match the expected format."
+    disk = int(match.group(1))
+    from_peg = int(match.group(2))
+    to_peg = int(match.group(3))
+    return disk, from_peg, to_peg
+
+def is_valid_final_state(pegs_state: dict, target_peg: int, num_disks: int) -> bool:
+    """Verify that all disks are on the target peg in descending order.
+
+    Args:
+        pegs_state (dict): Current state of the pegs.
+        target_peg (int): The target peg number.
+        num_disks (int): Total number of disks.
+
+    Returns:
+        bool: True if valid, False otherwise.
+    """
+    target_stack = pegs_state[target_peg]
+    if len(target_stack) != num_disks:
+        return False
+    return target_stack == list(range(num_disks, 0, -1))

From 37375f08a95a41cd9e804b185cb1522a66041b37 Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Fri, 31 Jan 2025 07:19:55 +0000
Subject: [PATCH 22/94] added linting checks

---
 reasoning_gym/games/countdown.py |  6 +++---
 tests/test_countdown.py          | 18 ++++++++++--------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/reasoning_gym/games/countdown.py b/reasoning_gym/games/countdown.py
index 87d1a793..38a60c4f 100644
--- a/reasoning_gym/games/countdown.py
+++ b/reasoning_gym/games/countdown.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional, Tuple, Dict, Any
+from typing import Any, Dict, List, Optional, Tuple
 
 import sympy
 from sympy import Symbol, symbols
@@ -158,7 +158,7 @@ class CountdownDataset(ProceduralDataset):
                 continue
 
         raise ValueError(f"Failed to generate valid expression after {max_attempts} attempts")
-    
+
     def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
         """Determine if the solution provided solves the problem"""
         reward = 0.0
@@ -168,7 +168,7 @@ class CountdownDataset(ProceduralDataset):
                 solved = user_answer == metadata["target"]
                 if solved:
                     reward = 1.0
-                elif (len(answer.strip()) > 0): # encourage partial solutions
+                elif len(answer.strip()) > 0:  # encourage partial solutions
                     reward = 0.05
                 else:
                     reward = 0.01
diff --git a/tests/test_countdown.py b/tests/test_countdown.py
index 04bf1a8b..e78a69ab 100644
--- a/tests/test_countdown.py
+++ b/tests/test_countdown.py
@@ -64,14 +64,16 @@ def test_countdown_game_items():
 
         # Verify expression evaluates correctly
         expr = item["metadata"]["expression"]
-        
-        #check score
-        assert dataset.score_answer(answer=expr, metadata=item["metadata"]) == 1.0  #correct answer
-        assert dataset.score_answer(answer="45+2", metadata=item["metadata"]) == 0.05  #wrong answer but an attempt
-        assert dataset.score_answer(answer="a wrong solution", metadata=item["metadata"]) == 0.01  #wrong answer but incorrectly formatted
-        assert dataset.score_answer(answer="", metadata=item["metadata"]) == 0.01  #wrong answer but empty string
-        assert dataset.score_answer(answer=None, metadata=item["metadata"]) == 0.0  #no answer
-        
+
+        # check score
+        assert dataset.score_answer(answer=expr, metadata=item["metadata"]) == 1.0  # correct answer
+        assert dataset.score_answer(answer="45+2", metadata=item["metadata"]) == 0.05  # wrong answer but an attempt
+        assert (
+            dataset.score_answer(answer="a wrong solution", metadata=item["metadata"]) == 0.01
+        )  # wrong answer but incorrectly formatted
+        assert dataset.score_answer(answer="", metadata=item["metadata"]) == 0.01  # wrong answer but empty string
+        assert dataset.score_answer(answer=None, metadata=item["metadata"]) == 0.0  # no answer
+
         try:
             result = eval(expr)  # Safe here since we control expression generation
             assert result == item["metadata"]["target"]

From b61bb23620da967e8e76d35e324d055e5f859d83 Mon Sep 17 00:00:00 2001
From: Joe Norton <16323+joenorton@users.noreply.github.com>
Date: Fri, 31 Jan 2025 00:05:33 -0800
Subject: [PATCH 23/94] linter

---
 .github/workflows/tests.yml           |   4 +-
 reasoning_gym/games/__init__.py       |   2 +-
 reasoning_gym/games/tower_of_hanoi.py | 122 +++++++++++++------------
 tests/test_tower_of_hanoi.py          | 127 +++++++++++++-------------
 4 files changed, 128 insertions(+), 127 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9e97239d..50b64d5d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -19,12 +19,12 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
-    
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install ".[test]"
-    
+
     - name: Run tests
       run: |
         pytest
diff --git a/reasoning_gym/games/__init__.py b/reasoning_gym/games/__init__.py
index 9174089e..6a6df59f 100644
--- a/reasoning_gym/games/__init__.py
+++ b/reasoning_gym/games/__init__.py
@@ -25,5 +25,5 @@ __all__ = [
     "GameOfLifeConfig",
     "GameOfLifeDataset",
     "HanoiConfig",
-    "HanoiDataset"
+    "HanoiDataset",
 ]
diff --git a/reasoning_gym/games/tower_of_hanoi.py b/reasoning_gym/games/tower_of_hanoi.py
index 081b3bc6..3f878b60 100644
--- a/reasoning_gym/games/tower_of_hanoi.py
+++ b/reasoning_gym/games/tower_of_hanoi.py
@@ -1,13 +1,14 @@
 # reasoning_gym/games/tower_of_hanoi.py
 
-from dataclasses import dataclass
-from typing import List, Optional, Dict, Tuple
 import math
 import random
 import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
 
 from ..factory import ProceduralDataset, register_dataset
 
+
 @dataclass
 class HanoiConfig:
     """
@@ -21,7 +22,7 @@ class HanoiConfig:
     - seed: Optional seed for reproducibility.
     - visualize: Whether to include a visualization of the initial state.
     """
-    
+
     min_disks: int = 3
     max_disks: int = 7
     min_pegs: int = 3
@@ -29,7 +30,7 @@ class HanoiConfig:
     size: int = 50
     seed: Optional[int] = None
     visualize: bool = False  # New parameter
-    
+
     def validate(self) -> None:
         """Validate configuration parameters."""
         assert self.min_disks >= 1, "min_disks must be at least 1"
@@ -37,12 +38,13 @@ class HanoiConfig:
         assert self.min_pegs >= 3, "min_pegs must be at least 3"
         assert self.max_pegs >= self.min_pegs, "max_pegs must be >= min_pegs"
 
+
 class MoveGenerator:
     """
     Helper class to generate valid move sequences for Tower of Hanoi using the Frame-Stewart algorithm.
     It maintains the current state of all pegs to ensure move validity.
     """
-    
+
     def __init__(self, num_disks: int, pegs: List[int], start: int, target: int):
         self.num_disks = num_disks
         self.pegs = pegs
@@ -54,23 +56,23 @@ class MoveGenerator:
             self.pegs_state[start].append(disk)
         self.moves: List[str] = []
         self.memo: Dict[Tuple[int, int], int] = {}  # Memoization for T(n, k)
-    
+
     def generate_moves(self) -> List[str]:
         self.move(n=self.num_disks, source=self.start, target=self.target, auxiliary_pegs=self.auxiliary_pegs)
         return self.moves
-    
+
     def move(self, n: int, source: int, target: int, auxiliary_pegs: List[int]):
         if n == 0:
             return
         if n == 1:
             self._move_disk(source, target)
             return
-        
+
         k = len(auxiliary_pegs) + 2  # Total number of pegs including source and target
-        
+
         if k < 3:
             raise ValueError("At least 3 pegs are required.")
-        
+
         if k == 3:
             # Classic Tower of Hanoi solution
             aux = auxiliary_pegs[0]
@@ -78,7 +80,7 @@ class MoveGenerator:
             self._move_disk(source, target)
             self.move(n - 1, aux, target, [source])
             return
-        
+
         # For k > 3, apply Frame-Stewart algorithm
         # Find m that minimizes 2*T(m, k) + T(n - m, k - 1)
         min_moves = math.inf
@@ -90,20 +92,20 @@ class MoveGenerator:
             if total_moves < min_moves:
                 min_moves = total_moves
                 best_m = m
-        
+
         # Select a temporary peg to hold m disks
         temp_peg = auxiliary_pegs[0]
         new_auxiliary = [peg for peg in auxiliary_pegs if peg != temp_peg]
-        
+
         # Step 1: Move top m disks to temp_peg using all pegs
         self.move(n=best_m, source=source, target=temp_peg, auxiliary_pegs=auxiliary_pegs[1:] + [target])
-        
+
         # Step 2: Move remaining n - m disks to target using k - 1 pegs
         self.move(n=n - best_m, source=source, target=target, auxiliary_pegs=new_auxiliary)
-        
+
         # Step 3: Move m disks from temp_peg to target using all pegs
         self.move(n=best_m, source=temp_peg, target=target, auxiliary_pegs=auxiliary_pegs[1:] + [source])
-    
+
     def _move_disk(self, from_peg: int, to_peg: int):
         if not self.pegs_state[from_peg]:
             raise ValueError(f"No disks to move from Peg {from_peg}.")
@@ -111,7 +113,7 @@ class MoveGenerator:
         self.pegs_state[from_peg].pop()
         self.pegs_state[to_peg].append(disk)
         self.moves.append(f"Move disk {disk} from Peg {from_peg} to Peg {to_peg}")
-    
+
     def _compute_T(self, n: int, k: int) -> int:
         """
         Compute the minimal number of moves (T(n, k)) required to move n disks using k pegs.
@@ -122,10 +124,10 @@ class MoveGenerator:
         if n == 1:
             return 1
         if k == 3:
-            return 2 ** n - 1
+            return 2**n - 1
         if (n, k) in self.memo:
             return self.memo[(n, k)]
-        
+
         min_moves = math.inf
         for m in range(1, n):
             moves = 2 * self._compute_T(m, k) + self._compute_T(n - m, k - 1)
@@ -134,12 +136,13 @@ class MoveGenerator:
         self.memo[(n, k)] = min_moves
         return min_moves
 
+
 class HanoiDataset(ProceduralDataset):
     """
     Generates Tower of Hanoi problems with solutions.
     Supports variable number of pegs using the optimized Frame-Stewart algorithm with Peg State Tracking.
     """
-    
+
     def __init__(self, config: HanoiConfig):
         super().__init__(config=config, seed=config.seed, size=config.size)
         self.min_pegs = config.min_pegs
@@ -147,11 +150,11 @@ class HanoiDataset(ProceduralDataset):
         self.min_disks = config.min_disks
         self.max_disks = config.max_disks
         self.visualize = config.visualize  # Initialize the visualize attribute
-        
+
     def __getitem__(self, idx: int) -> dict:
         """
         Generate a Tower of Hanoi problem instance.
-        
+
         Returns:
             dict with:
             - "question": Text describing the problem setup.
@@ -161,33 +164,33 @@ class HanoiDataset(ProceduralDataset):
             - "states": (Optional) List of ASCII visualizations after each move.
         """
         rng = random.Random(self.seed + idx if self.seed is not None else None)
-        
+
         # Randomly select number of disks and pegs within the specified ranges
         num_disks = rng.randint(self.min_disks, self.max_disks)
         num_pegs = rng.randint(self.min_pegs, self.max_pegs)
-        
+
         # Assign unique peg identifiers (e.g., integers starting from 1)
         pegs = list(range(1, num_pegs + 1))
-        
+
         """ #Debug: Print current instance configuration
         print(f"\n--- Generating Instance {idx} ---")
         print(f"Number of Disks: {num_disks}")
         print(f"Number of Pegs: {num_pegs}")
         print(f"Pegs: {pegs}")
         """
-        
+
         # Randomly select start and target pegs
         start_peg, target_peg = rng.sample(pegs, 2)
-        
+
         # Auxiliary pegs are the remaining pegs
         auxiliary_pegs = [peg for peg in pegs if peg not in (start_peg, target_peg)]
-        
+
         """ # Debug: Print start, target, and auxiliary pegs
         print(f"Start Peg: {start_peg}")
         print(f"Target Peg: {target_peg}")
         print(f"Auxiliary Pegs: {auxiliary_pegs}")
         """
-        
+
         # Initialize the MoveGenerator and generate moves
         move_gen = MoveGenerator(num_disks, pegs, start_peg, target_peg)
         try:
@@ -195,24 +198,24 @@ class HanoiDataset(ProceduralDataset):
         except ValueError as ve:
             # print(f"Error during move generation: {ve}")
             raise ve
-        
+
         """ # Debug: Print the solution moves
         print(f"Solution Length: {len(solution)}")
         print("Solution Moves:")
         for move_num, move in enumerate(solution, start=1):
             print(f"  Move {move_num}: {move}")
         """
-        
+
         # Initialize pegs_state: all disks start on the start peg
         pegs_state = {peg: [] for peg in pegs}
         for disk in range(num_disks, 0, -1):  # Largest disk at the bottom
             pegs_state[start_peg].append(disk)
-        
+
         # Generate initial state visualization if requested
         initial_state_str = None
         if self.visualize:
             initial_state_str = self._visualize_state(pegs_state)
-        
+
         # Apply moves to track state changes
         states = []
         if self.visualize:
@@ -224,24 +227,24 @@ class HanoiDataset(ProceduralDataset):
                 except ValueError as ve:
                     # print(f"Error parsing move: {ve}")
                     raise ve
-                
+
                 # Validate the move
                 if not self._validate_move(pegs_state, move):
-                    #print(f"Invalid move detected: {move}")
-                    #print(f"Current Pegs State: {pegs_state}")
+                    # print(f"Invalid move detected: {move}")
+                    # print(f"Current Pegs State: {pegs_state}")
                     raise ValueError(f"Invalid move detected: {move}")
-                
+
                 # Move the disk
                 pegs_state[from_peg].pop()
                 pegs_state[to_peg].append(disk)
-                
+
                 # Visualize the new state
                 new_state_str = self._visualize_state(pegs_state)
                 states.append(new_state_str)
-        
+
         # Peg labels
         peg_labels = {peg: f"Peg {peg}" for peg in pegs}
-        
+
         question_str = (
             f"Solve the Tower of Hanoi problem with {num_disks} disks and {num_pegs} pegs.\n"
             f"Move all disks from {peg_labels[start_peg]} to {peg_labels[target_peg]} following the rules:\n"
@@ -250,7 +253,7 @@ class HanoiDataset(ProceduralDataset):
             "- All disks must be on a peg at all times.\n"
             "Provide the sequence of moves."
         )
-        
+
         result = {
             "question": question_str,
             "answer": solution,
@@ -263,28 +266,28 @@ class HanoiDataset(ProceduralDataset):
                 "solution_length": len(solution),
             },
         }
-        
+
         if self.visualize:
             result["initial_state"] = initial_state_str
             result["states"] = states  # List of all states including initial and after each move
-        
+
         return result
-    
+
     def _visualize_state(self, pegs_state: Dict[int, List[int]]) -> str:
         """
         Create an ASCII visualization of the current state of the pegs.
         Adapts to variable number of pegs.
-        
+
         Args:
             pegs_state (dict): Dictionary mapping peg numbers to lists of disks.
-        
+
         Returns:
             str: ASCII art representing the pegs and disks.
         """
         # Determine the number of levels based on the maximum number of disks on any peg
         max_height = max(len(disks) for disks in pegs_state.values())
         pegs = sorted(pegs_state.keys())
-        
+
         visualization = ""
         for level in range(max_height, 0, -1):
             for peg in pegs:
@@ -295,24 +298,24 @@ class HanoiDataset(ProceduralDataset):
                     disk_str = "[ ]"
                 visualization += disk_str.center(7)  # Adjust spacing as needed
             visualization += "\n"
-        
+
         # Add the base and peg numbers
         visualization += "-" * (7 * len(pegs)) + "\n"
         for peg in pegs:
             peg_label = f"P{peg}".center(7)
             visualization += peg_label
         visualization += "\n"
-        
+
         return visualization
-    
+
     def _validate_move(self, pegs_state: Dict[int, List[int]], move: str) -> bool:
         """
         Validate that a move adheres to the Tower of Hanoi rules.
-        
+
         Args:
             pegs_state (dict): Current state of the pegs.
             move (str): Move instruction, e.g., "Move disk 2 from Peg 1 to Peg 3".
-        
+
         Returns:
             bool: True if the move is valid, False otherwise.
         """
@@ -324,29 +327,29 @@ class HanoiDataset(ProceduralDataset):
             disk = int(parts[2])
             from_peg = int(parts[5])
             to_peg = int(parts[8])
-            
+
             # Check if the disk to move is the top disk on the from_peg
             if not pegs_state[from_peg] or pegs_state[from_peg][-1] != disk:
                 # print(f"Disk {disk} is not on top of Peg {from_peg}. Current state: {pegs_state[from_peg]}")
                 return False
-            
+
             # Check if placing the disk on the to_peg violates size constraints
             if pegs_state[to_peg] and pegs_state[to_peg][-1] < disk:
                 # print(f"Cannot place disk {disk} on top of smaller disk {pegs_state[to_peg][-1]} on Peg {to_peg}.")
                 return False
-            
+
             return True
         except Exception as e:
             print(f"Error validating move '{move}': {e}")
             return False
-    
+
     def _parse_move(self, move: str) -> Tuple[int, int, int]:
         """
         Parse a move string and extract disk number, from peg, and to peg.
-        
+
         Args:
             move (str): Move instruction, e.g., "Move disk 2 from Peg 1 to Peg 3".
-        
+
         Returns:
             tuple: (disk, from_peg, to_peg)
         """
@@ -354,11 +357,12 @@ class HanoiDataset(ProceduralDataset):
         match = re.match(pattern, move)
         if not match:
             raise ValueError(f"Unexpected move format: '{move}'")
-        
+
         disk = int(match.group(1))
         from_peg = int(match.group(2))
         to_peg = int(match.group(3))
         return disk, from_peg, to_peg
 
+
 # Register the dataset
 register_dataset("tower_of_hanoi", HanoiDataset, HanoiConfig)
diff --git a/tests/test_tower_of_hanoi.py b/tests/test_tower_of_hanoi.py
index a3a89023..a4228bc3 100644
--- a/tests/test_tower_of_hanoi.py
+++ b/tests/test_tower_of_hanoi.py
@@ -1,35 +1,38 @@
 """Tests for Tower of Hanoi puzzle generation"""
 
-import pytest
 import re
 
+import pytest
+
 from reasoning_gym.games.tower_of_hanoi import HanoiConfig, HanoiDataset
 
+
 def test_toh_config_validation():
     """Test that invalid configurations raise appropriate errors."""
     # Test negative number of disks
     with pytest.raises(AssertionError):
         config = HanoiConfig(min_disks=0)  # At least 1 disk required
         config.validate()
-    
+
     # Test max_disks less than min_disks
     with pytest.raises(AssertionError):
         config = HanoiConfig(min_disks=5, max_disks=3)
         config.validate()
-    
+
     # Test min_pegs less than 3
     with pytest.raises(AssertionError):
         config = HanoiConfig(min_pegs=2)
         config.validate()
-    
+
     # Test max_pegs less than min_pegs
     with pytest.raises(AssertionError):
         config = HanoiConfig(min_pegs=3, max_pegs=2)
         config.validate()
-    
+
     # Test invalid move configurations if any (assuming such validations exist)
     # Add more tests based on the actual validation logic in HanoiConfig
 
+
 def test_toh_dataset_deterministic():
     """Test that dataset generates the same items with the same seed."""
     config = HanoiConfig(seed=42, size=10)
@@ -39,6 +42,7 @@ def test_toh_dataset_deterministic():
     for i in range(len(dataset1)):
         assert dataset1[i] == dataset2[i], f"Mismatch found in instance {i} with seed 42."
 
+
 def test_toh_dataset_items():
     """Test basic properties of generated items."""
     config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
@@ -46,13 +50,13 @@ def test_toh_dataset_items():
 
     for i in range(len(dataset)):
         item = dataset[i]
-        
+
         # Check item structure
         assert isinstance(item, dict), f"Item {i} is not a dictionary."
         assert "question" in item, f"Item {i} missing 'question' key."
         assert "answer" in item, f"Item {i} missing 'answer' key."
         assert "metadata" in item, f"Item {i} missing 'metadata' key."
-        
+
         # Check metadata
         metadata = item["metadata"]
         assert "num_disks" in metadata, f"Item {i} metadata missing 'num_disks'."
@@ -61,71 +65,66 @@ def test_toh_dataset_items():
         assert "target_peg" in metadata, f"Item {i} metadata missing 'target_peg'."
         assert "auxiliary_pegs" in metadata, f"Item {i} metadata missing 'auxiliary_pegs'."
         assert "solution_length" in metadata, f"Item {i} metadata missing 'solution_length'."
-        
+
         num_disks = metadata["num_disks"]
         num_pegs = metadata["num_pegs"]
         start_peg = metadata["start_peg"]
         target_peg = metadata["target_peg"]
         auxiliary_pegs = metadata["auxiliary_pegs"]
         solution_length = metadata["solution_length"]
-        
+
         # Verify peg counts
-        assert num_pegs == len(metadata["auxiliary_pegs"]) + 2, (
-            f"Item {i} has inconsistent peg counts."
-        )
-        
+        assert num_pegs == len(metadata["auxiliary_pegs"]) + 2, f"Item {i} has inconsistent peg counts."
+
         # Verify solution_length consistency
-        assert solution_length == len(item["answer"]), (
-            f"Item {i} metadata 'solution_length' does not match actual number of moves."
-        )
-        
+        assert solution_length == len(
+            item["answer"]
+        ), f"Item {i} metadata 'solution_length' does not match actual number of moves."
+
         # Optional: Additional checks like verifying that start and target pegs are distinct
         assert start_peg != target_peg, f"Item {i} has identical start and target pegs."
 
+
 def test_toh_move_validity():
     """Test that all moves in each problem instance are valid according to Tower of Hanoi rules."""
     config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
     dataset = HanoiDataset(config)
 
     for idx, instance in enumerate(dataset):
-        num_disks = instance['metadata']['num_disks']
-        num_pegs = instance['metadata']['num_pegs']
-        start_peg = instance['metadata']['start_peg']
-        target_peg = instance['metadata']['target_peg']
-        auxiliary_pegs = instance['metadata']['auxiliary_pegs']
+        num_disks = instance["metadata"]["num_disks"]
+        num_pegs = instance["metadata"]["num_pegs"]
+        start_peg = instance["metadata"]["start_peg"]
+        target_peg = instance["metadata"]["target_peg"]
+        auxiliary_pegs = instance["metadata"]["auxiliary_pegs"]
         pegs = list(range(1, num_pegs + 1))
-        
+
         # Initialize pegs_state: all disks start on the start peg
         pegs_state = {peg: [] for peg in pegs}
         for disk in range(num_disks, 0, -1):
             pegs_state[start_peg].append(disk)
-        
+
         # Iterate over each move and validate
-        for move_num, move in enumerate(instance['answer'], start=1):
+        for move_num, move in enumerate(instance["answer"], start=1):
             disk, from_peg, to_peg = parse_move(move)
-            
+
             # Check that from_peg exists
-            assert from_peg in pegs, (
-                f"Move {move_num} in Instance {idx} references non-existent from_peg {from_peg}."
-            )
-            
+            assert from_peg in pegs, f"Move {move_num} in Instance {idx} references non-existent from_peg {from_peg}."
+
             # Check that to_peg exists
-            assert to_peg in pegs, (
-                f"Move {move_num} in Instance {idx} references non-existent to_peg {to_peg}."
-            )
-            
+            assert to_peg in pegs, f"Move {move_num} in Instance {idx} references non-existent to_peg {to_peg}."
+
             # Check that from_peg is not empty
-            assert pegs_state[from_peg], (
-                f"Move {move_num} in Instance {idx} attempts to move from an empty Peg {from_peg}."
-            )
-            
+            assert pegs_state[
+                from_peg
+            ], f"Move {move_num} in Instance {idx} attempts to move from an empty Peg {from_peg}."
+
             # Check that the disk to move is on top of from_peg
             top_disk = pegs_state[from_peg][-1]
             assert disk == top_disk, (
                 f"Move {move_num} in Instance {idx} attempts to move disk {disk} "
                 f"which is not on top of Peg {from_peg} (top disk: {top_disk})."
             )
-            
+
             # Check that moving disk to to_peg does not violate size constraints
             if pegs_state[to_peg]:
                 top_to_disk = pegs_state[to_peg][-1]
@@ -133,53 +132,51 @@ def test_toh_move_validity():
                     f"Move {move_num} in Instance {idx} attempts to place disk {disk} "
                     f"on top of smaller disk {top_to_disk} on Peg {to_peg}."
                 )
-            
+
             # Perform the move
             pegs_state[from_peg].pop()
             pegs_state[to_peg].append(disk)
 
+
 def test_toh_final_state_correct():
     """Test that the final state of each problem instance has all disks on the target peg in correct order."""
     config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=10, seed=42)
     dataset = HanoiDataset(config)
 
     for idx, instance in enumerate(dataset):
-        num_disks = instance['metadata']['num_disks']
-        num_pegs = instance['metadata']['num_pegs']
-        start_peg = instance['metadata']['start_peg']
-        target_peg = instance['metadata']['target_peg']
-        auxiliary_pegs = instance['metadata']['auxiliary_pegs']
+        num_disks = instance["metadata"]["num_disks"]
+        num_pegs = instance["metadata"]["num_pegs"]
+        start_peg = instance["metadata"]["start_peg"]
+        target_peg = instance["metadata"]["target_peg"]
+        auxiliary_pegs = instance["metadata"]["auxiliary_pegs"]
         pegs = list(range(1, num_pegs + 1))
-        
+
         # Initialize pegs_state: all disks start on the start peg
         pegs_state = {peg: [] for peg in pegs}
         for disk in range(num_disks, 0, -1):
             pegs_state[start_peg].append(disk)
-        
+
         # Perform all moves
-        for move in instance['answer']:
+        for move in instance["answer"]:
             disk, from_peg, to_peg = parse_move(move)
             pegs_state[from_peg].pop()
             pegs_state[to_peg].append(disk)
-        
+
         # After all moves, all disks should be on target peg in descending order
         final_pegs = pegs_state[target_peg]
-        assert len(final_pegs) == num_disks, (
-            f"Instance {idx} does not have all disks on the target Peg {target_peg}."
-        )
-        
+        assert len(final_pegs) == num_disks, f"Instance {idx} does not have all disks on the target Peg {target_peg}."
+
         # Verify that disks are in correct order on target peg
         expected_final = list(range(num_disks, 0, -1))
-        assert final_pegs == expected_final, (
-            f"Instance {idx} has disks on Peg {target_peg} in incorrect order."
-        )
-        
+        assert final_pegs == expected_final, f"Instance {idx} has disks on Peg {target_peg} in incorrect order."
+
         # Ensure all other pegs are empty
         for peg in pegs:
             if peg != target_peg:
-                assert len(pegs_state[peg]) == 0, (
-                    f"Instance {idx} has disks remaining on Peg {peg}, which should be empty."
-                )
+                assert (
+                    len(pegs_state[peg]) == 0
+                ), f"Instance {idx} has disks remaining on Peg {peg}, which should be empty."
+
 
 def test_toh_dataset_iteration():
     """Test that iteration respects dataset size and multiple iterations yield the same items."""
@@ -187,16 +184,15 @@ def test_toh_dataset_iteration():
     dataset = HanoiDataset(config)
 
     # Test dataset size
-    assert len(dataset) == config.size, (
-        f"Dataset size mismatch: expected {config.size}, got {len(dataset)}."
-    )
-    
+    assert len(dataset) == config.size, f"Dataset size mismatch: expected {config.size}, got {len(dataset)}."
+
     # Collect items
     items = list(dataset)
-    
+
     # Test multiple iterations yield the same items
     assert items == list(dataset), "Multiple iterations over the dataset do not yield the same items."
 
+
 def parse_move(move_str: str) -> tuple:
     """Parse a move string and extract disk number, from peg, and to peg.
 
@@ -214,6 +210,7 @@ def parse_move(move_str: str) -> tuple:
     to_peg = int(match.group(3))
     return disk, from_peg, to_peg
 
+
 def is_valid_final_state(pegs_state: dict, target_peg: int, num_disks: int) -> bool:
     """Verify that all disks are on the target peg in descending order.
 

From 865a1c0a4b09014eaecd5c6b121160d5de330f77 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 09:20:21 +0100
Subject: [PATCH 24/94] fix test workflow formatting :')

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9e97239d..50b64d5d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -19,12 +19,12 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
-    
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install ".[test]"
-    
+
     - name: Run tests
       run: |
         pytest

From d9332cdef244d99b09e7774770ce1870b1c16e6d Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 09:27:26 +0100
Subject: [PATCH 25/94] add pytest-cov dev dependency

---
 requirements-dev.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 8b1c25a4..b96fc1c2 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,5 @@
 pytest>=8.3.4
+pytest-cov>=6.0.0
 black>=24.10.0
 isort>=5.13.2
 flake8>=7.1.1

From 19c491aaf8ad2ea227145929a9114777d1f61457 Mon Sep 17 00:00:00 2001
From: Joe Norton <16323+joenorton@users.noreply.github.com>
Date: Fri, 31 Jan 2025 01:14:45 -0800
Subject: [PATCH 26/94] add example text

---
 reasoning_gym/games/tower_of_hanoi.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/reasoning_gym/games/tower_of_hanoi.py b/reasoning_gym/games/tower_of_hanoi.py
index 3f878b60..df902300 100644
--- a/reasoning_gym/games/tower_of_hanoi.py
+++ b/reasoning_gym/games/tower_of_hanoi.py
@@ -251,6 +251,11 @@ class HanoiDataset(ProceduralDataset):
             "- Only one disk can be moved at a time.\n"
             "- A larger disk cannot be placed on top of a smaller disk.\n"
             "- All disks must be on a peg at all times.\n"
+            "Example:\n"
+            "Move disk 1 from Peg 1 to Peg 3\n"
+            "Move disk 2 from Peg 1 to Peg 2\n"
+            "Move disk 1 from Peg 3 to Peg 2\n"
+            "\n"
             "Provide the sequence of moves."
         )
 

From 443a244f830fe80b0603ae2b8a10fb51279eb04d Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 11:21:41 +0100
Subject: [PATCH 27/94] update discord channel name -> #reasoning-gym

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ff4fe17b..78efe0f3 100644
--- a/README.md
+++ b/README.md
@@ -134,4 +134,4 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 
 ## Call for Contributions
 
-If you have ideas for additional procedural dataset generators please create an issue here or contact us in the `#arc-agi-2` channel of the [GPU-Mode discord server](https://discord.gg/gpumode).
+If you have ideas for additional procedural dataset generators please create an issue here or contact us in the `#reasoning-gym` channel of the [GPU-Mode discord server](https://discord.gg/gpumode).

From e1be047d67794a80ab947f7dc8cf7425bc0ab476 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 11:24:04 +0100
Subject: [PATCH 28/94] update dataset gallery

---
 GALLERY.md | 201 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 133 insertions(+), 68 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index f52bf50b..3d787535 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -33,6 +33,7 @@ This gallery shows examples from all available datasets using their default conf
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
 - [syllogism](#syllogism)
+- [tower_of_hanoi](#tower_of_hanoi)
 - [word_ladder](#word_ladder)
 - [word_sequence_reversal](#word_sequence_reversal)
 - [word_sorting](#word_sorting)
@@ -42,14 +43,14 @@ This gallery shows examples from all available datasets using their default conf
 Generates base conversion tasks
 
 Default configuration:
-````python
+```python
 min_base = 2
 max_base = 16
 min_value = 0
 max_value = 1000
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -74,7 +75,7 @@ Metadata: {'decimal_value': 418, 'source_base': 10, 'target_base': 13, 'source_r
 Dataset that generates basic arithmetic tasks with configurable complexity
 
 Default configuration:
-````python
+```python
 min_terms = 2
 max_terms = 6
 min_digits = 1
@@ -86,7 +87,7 @@ seed = 42
 size = 500
 format_style = simple
 whitespace = single
-````
+```
 
 Example tasks:
 ````
@@ -111,11 +112,11 @@ Metadata: {'num_terms': 5, 'num_digits': 1, 'expression': '0 + -2 + -4 * 0 * 3'}
 Generates BF tasks
 
 Default configuration:
-````python
+```python
 seed = 42
 size = 500
 difficulty = 1
-````
+```
 
 Example tasks:
 ````
@@ -146,7 +147,7 @@ Metadata: {'bfit_code': '\nint main() {\n    print("under");\n}\n', 'bf_program'
 Generates Caesar cipher encryption/decryption tasks
 
 Default configuration:
-````python
+```python
 delimiter = .
 min_words = 3
 max_words = 20
@@ -154,7 +155,7 @@ min_rotation = 1
 max_rotation = 25
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -179,7 +180,7 @@ Metadata: {'rotation': 17, 'cipher_text': 'ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV
 Generates simple arithmetic tasks using only + and - operators
 
 Default configuration:
-````python
+```python
 min_terms = 2
 max_terms = 6
 min_digits = 1
@@ -187,7 +188,7 @@ max_digits = 4
 allow_negation = False
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -212,12 +213,12 @@ Metadata: {'num_terms': 5, 'num_digits': 1, 'expression': '2 + 6 + 3 + 4 + 0'}
 Generates color cube rotation reasoning tasks
 
 Default configuration:
-````python
+```python
 min_rotations = 1
 max_rotations = 3
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -280,7 +281,7 @@ Metadata: {'initial_state': {'top': 'orange', 'right': 'cyan', 'front': 'violet'
 Generates Countdown Number Game tasks
 
 Default configuration:
-````python
+```python
 min_numbers = 4
 max_numbers = 6
 min_value = 1
@@ -291,7 +292,7 @@ operators = ('+', '-', '*', '/')
 shuffle = True
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -319,14 +320,14 @@ Metadata: {'numbers': [5, 41, 38, 81, 14], 'target': 450, 'expression': '41*14 -
 Generates family relationship reasoning tasks
 
 Default configuration:
-````python
+```python
 min_family_size = 4
 max_family_size = 8
 male_names = ['James', 'John', 'Robert', 'Michael', 'William', 'David', 'Richard', 'Joseph', 'Thomas', 'Charles', 'Peter', 'Daniel', 'Matthew', 'Christopher', 'Andrew', 'George', 'Edward', 'Benjamin', 'Henry', 'Samuel', 'Alexander', 'Oliver', 'Jack', 'Harry', 'Jacob', 'Noah', 'Ethan', 'Lucas', 'Mason', 'Logan', 'Sebastian', 'Theodore', 'Owen', 'Liam', 'Aiden', 'Kai', 'Jayden', 'Zion', 'Phoenix', 'Atlas', 'Axel', 'Ryder', 'Finn']
 female_names = ['Mary', 'Patricia', 'Jennifer', 'Linda', 'Elizabeth', 'Barbara', 'Susan', 'Jessica', 'Sarah', 'Karen', 'Emma', 'Lisa', 'Anna', 'Margaret', 'Victoria', 'Charlotte', 'Sophia', 'Isabella', 'Olivia', 'Ava', 'Mia', 'Emily', 'Abigail', 'Amelia', 'Eleanor', 'Grace', 'Alice', 'Lucy', 'Chloe', 'Sophie', 'Lily', 'Hannah', 'Zoe', 'Luna', 'Nova', 'Aria', 'Willow', 'Aurora', 'Sage', 'River', 'Winter', 'Sky', 'Rain']
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -357,13 +358,13 @@ Metadata: {'person1': 'Liam', 'person2': 'Noah', 'relationship': 'father', 'fami
 Generates FigletFont tasks
 
 Default configuration:
-````python
+```python
 static_word = None
 static_font = None
 space_letters = True
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -421,7 +422,7 @@ Metadata: {'font': 'xcourb', 'space_letters': True}
 Generates fraction simplification tasks
 
 Default configuration:
-````python
+```python
 min_value = 1
 max_value = 1000
 min_factor = 1
@@ -429,7 +430,7 @@ max_factor = 100
 styles = ('plain', 'latex_inline', 'latex_frac', 'latex_dfrac')
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -454,14 +455,14 @@ Metadata: {'numerator': 29330, 'denominator': 37310, 'simplified_numerator': 419
 Generates Game of Life games with configurable parameters
 
 Default configuration:
-````python
+```python
 grid_size_x = 20
 grid_size_y = 20
 filled_cells = 100
 simulation_steps = 1
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -606,14 +607,14 @@ Metadata: {'grid_size_x': 20, 'grid_size_y': 20, 'filled_cells': 100, 'simulatio
 Generates Greatest Common Divisor (GCD) tasks
 
 Default configuration:
-````python
+```python
 min_numbers = 2
 max_numbers = 2
 min_value = 1
 max_value = 1000
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -638,14 +639,14 @@ Metadata: {'numbers': [297, 30], 'result': 3}
 Generates Least Common Multiple (LCM) tasks
 
 Default configuration:
-````python
+```python
 min_numbers = 2
 max_numbers = 2
 min_value = 1
 max_value = 100
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -670,13 +671,13 @@ Metadata: {'numbers': [38, 4], 'result': 76}
 Generates leg counting arithmetic tasks
 
 Default configuration:
-````python
+```python
 min_animals = 2
 max_animals = 5
 max_instances = 3
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -701,12 +702,12 @@ Metadata: {'animals': {'crab': 1, 'lobster': 2, 'human': 1, 'cow': 1, 'bee': 1},
 Generates letter counting tasks from text spans
 
 Default configuration:
-````python
+```python
 min_words = 5
 max_words = 15
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -731,7 +732,7 @@ Metadata: {'span_length': 11, 'target_letter': 't', 'span': ['readable', 'form',
 Generates word letter jumbling tasks
 
 Default configuration:
-````python
+```python
 min_word_len = 1
 max_word_len = 64
 min_words = 3
@@ -741,7 +742,7 @@ max_corruption_level = 0.9
 consecutive_words = True
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -767,14 +768,14 @@ Generates mazes with guaranteed shortest path distance from start to goal
     within [min_dist, max_dist].
 
 Default configuration:
-````python
+```python
 min_dist = 5
 max_dist = 10
 min_grid_size = 5
 max_grid_size = 10
 seed = 42
 size = 50
-````
+```
 
 Example tasks:
 ````
@@ -840,12 +841,12 @@ Metadata: {'grid_size': 7, 'grid': ['QQQQQQQ', 'QQ%%%%Q', 'QQ`%Q%Q', 'Q%%Q%%Q',
 Generates 4x4 sudoku puzzles with configurable difficulty
 
 Default configuration:
-````python
+```python
 min_empty = 8
 max_empty = 12
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -891,7 +892,7 @@ Metadata: {'puzzle': [[0, 0, 0, 0], [1, 3, 4, 0], [3, 1, 2, 4], [4, 0, 0, 0]], '
 Generates number filtering tasks
 
 Default configuration:
-````python
+```python
 min_numbers = 3
 max_numbers = 10
 min_decimals = 0
@@ -900,7 +901,7 @@ min_value = -100.0
 max_value = 100.0
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -925,7 +926,7 @@ Metadata: {'original_numbers': ['4', '-64.7', '-42.1', '-77', '-79.9640', '37.76
 Generates number sequence completion tasks with dynamic pattern generation
 
 Default configuration:
-````python
+```python
 min_terms = 4
 max_terms = 8
 min_value = -100
@@ -933,7 +934,7 @@ max_value = 100
 max_complexity = 3
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -958,7 +959,7 @@ Metadata: {'rule': 'halve', 'complexity': 2, 'sequence': [8, 4, 2, 1, 0, 0, 0, 0
 Generates number sorting tasks
 
 Default configuration:
-````python
+```python
 min_numbers = 3
 max_numbers = 10
 min_decimals = 0
@@ -967,7 +968,7 @@ min_value = -100.0
 max_value = 100.0
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -995,7 +996,7 @@ Generates random polynomial equations of degree in [min_degree, max_degree].
     - The solution may be real or complex; we filter real solutions by default for simplicity.
 
 Default configuration:
-````python
+```python
 min_terms = 2
 max_terms = 4
 min_value = 1
@@ -1005,7 +1006,7 @@ max_degree = 3
 operators = ('+', '-')
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1030,12 +1031,12 @@ Metadata: {'polynomial_expr': '71*n**3 - 2*n - 29', 'variable': 'n', 'degree': 3
 Generates prime factorization tasks
 
 Default configuration:
-````python
+```python
 min_value = 2
 max_value = 1000
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1060,7 +1061,7 @@ Metadata: {'number': 420, 'factors': [2, 2, 3, 5, 7]}
 Generates propositional logic reasoning tasks
 
 Default configuration:
-````python
+```python
 min_vars = 2
 max_vars = 4
 min_statements = 2
@@ -1068,7 +1069,7 @@ max_statements = 4
 max_complexity = 3
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1105,11 +1106,11 @@ Metadata: {'premises': ['((Q ∨ P) ∧ ¬P)', 'P', '((P ∧ R) ∧ ¬R)', '((Q
 Generates QuantumLock tasks
 
 Default configuration:
-````python
+```python
 difficulty = 10
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1158,13 +1159,13 @@ Metadata: {'difficulty': 10, 'solution_path': ['B', 'B', 'B', 'B', 'B', 'B', 'B'
 Generates RubiksCube tasks
 
 Default configuration:
-````python
+```python
 scramble_steps = 3
 cube_size = 3
 remove_ansi = True
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1228,12 +1229,12 @@ Metadata: {'cube_size': 3, 'scramble_steps': 3, 'scramble_moves': "U R' R'", 'ex
 Generates sentence reordering tasks from text spans
 
 Default configuration:
-````python
+```python
 min_words_in_sentence = 3
 max_words_in_sentence = 20
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1258,7 +1259,7 @@ Metadata: {'word_count': 10}
 Generates simple equations with one variable to solve
 
 Default configuration:
-````python
+```python
 min_terms = 2
 max_terms = 4
 min_value = 1
@@ -1266,7 +1267,7 @@ max_value = 100
 operators = ('+', '-', '*')
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1291,11 +1292,11 @@ Metadata: {'equation': '29*n - 5 = 430', 'variable': 'n'}
 Generates tasks to spell words backward
 
 Default configuration:
-````python
+```python
 min_word_len = 3
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1320,12 +1321,12 @@ Metadata: {'word': 'One', 'word_len': 3}
 Generates sudoku puzzles with configurable difficulty
 
 Default configuration:
-````python
+```python
 min_empty = 30
 max_empty = 50
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1401,7 +1402,7 @@ Metadata: {'puzzle': [[0, 0, 1, 2, 3, 0, 0, 0, 9], [3, 0, 0, 1, 8, 5, 6, 7, 2],
 Generates syllogism reasoning tasks
 
 Default configuration:
-````python
+```python
 terms = None
 allow_all = True
 allow_no = True
@@ -1411,7 +1412,7 @@ include_invalid = True
 invalid_ratio = 0.3
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1450,18 +1451,82 @@ Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are
 
 ````
 
+### tower_of_hanoi
+Generates Tower of Hanoi problems with solutions.
+    Supports variable number of pegs using the optimized Frame-Stewart algorithm with Peg State Tracking.
+
+Default configuration:
+```python
+min_disks = 3
+max_disks = 7
+min_pegs = 3
+max_pegs = 4
+size = 50
+seed = 42
+visualize = False
+```
+
+Example tasks:
+````
+Example 1:
+Question: Solve the Tower of Hanoi problem with 3 disks and 3 pegs.
+Move all disks from Peg 3 to Peg 2 following the rules:
+- Only one disk can be moved at a time.
+- A larger disk cannot be placed on top of a smaller disk.
+- All disks must be on a peg at all times.
+Example:
+Move disk 1 from Peg 1 to Peg 3
+Move disk 2 from Peg 1 to Peg 2
+Move disk 1 from Peg 3 to Peg 2
+
+Provide the sequence of moves.
+Answer: ['Move disk 1 from Peg 3 to Peg 2', 'Move disk 2 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 3 from Peg 3 to Peg 2', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2']
+Metadata: {'num_disks': 3, 'num_pegs': 3, 'start_peg': 3, 'target_peg': 2, 'auxiliary_pegs': [1], 'solution_length': 7}
+
+Example 2:
+Question: Solve the Tower of Hanoi problem with 3 disks and 4 pegs.
+Move all disks from Peg 2 to Peg 4 following the rules:
+- Only one disk can be moved at a time.
+- A larger disk cannot be placed on top of a smaller disk.
+- All disks must be on a peg at all times.
+Example:
+Move disk 1 from Peg 1 to Peg 3
+Move disk 2 from Peg 1 to Peg 2
+Move disk 1 from Peg 3 to Peg 2
+
+Provide the sequence of moves.
+Answer: ['Move disk 1 from Peg 2 to Peg 1', 'Move disk 2 from Peg 2 to Peg 3', 'Move disk 3 from Peg 2 to Peg 4', 'Move disk 2 from Peg 3 to Peg 4', 'Move disk 1 from Peg 1 to Peg 4']
+Metadata: {'num_disks': 3, 'num_pegs': 4, 'start_peg': 2, 'target_peg': 4, 'auxiliary_pegs': [1, 3], 'solution_length': 5}
+
+Example 3:
+Question: Solve the Tower of Hanoi problem with 6 disks and 3 pegs.
+Move all disks from Peg 1 to Peg 2 following the rules:
+- Only one disk can be moved at a time.
+- A larger disk cannot be placed on top of a smaller disk.
+- All disks must be on a peg at all times.
+Example:
+Move disk 1 from Peg 1 to Peg 3
+Move disk 2 from Peg 1 to Peg 2
+Move disk 1 from Peg 3 to Peg 2
+
+Provide the sequence of moves.
+Answer: ['Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 3 from Peg 1 to Peg 3', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 2 from Peg 2 to Peg 3', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 4 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 2 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 3 from Peg 3 to Peg 2', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 5 from Peg 1 to Peg 3', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 2 from Peg 2 to Peg 3', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 3 from Peg 2 to Peg 1', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 2 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 4 from Peg 2 to Peg 3', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 3 from Peg 1 to Peg 3', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 2 from Peg 2 to Peg 3', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 6 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 2 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 3 from Peg 3 to Peg 2', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 4 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 2 from Peg 2 to Peg 3', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 3 from Peg 2 to Peg 1', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 2 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 5 from Peg 3 to Peg 2', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 3 from Peg 1 to Peg 3', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 2 from Peg 2 to Peg 3', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 4 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2', 'Move disk 2 from Peg 3 to Peg 1', 'Move disk 1 from Peg 2 to Peg 1', 'Move disk 3 from Peg 3 to Peg 2', 'Move disk 1 from Peg 1 to Peg 3', 'Move disk 2 from Peg 1 to Peg 2', 'Move disk 1 from Peg 3 to Peg 2']
+Metadata: {'num_disks': 6, 'num_pegs': 3, 'start_peg': 1, 'target_peg': 2, 'auxiliary_pegs': [3], 'solution_length': 63}
+
+````
+
 ### word_ladder
 Generates word ladder transformation tasks
 
 Default configuration:
-````python
+```python
 min_word_length = 3
 max_word_length = 5
 min_chain_length = -1
 max_chain_length = -1
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1486,12 +1551,12 @@ Metadata: {'start_word': 'SAUT', 'end_word': 'SKER', 'word_length': 4, 'chain_le
 Generates word sequence reversal tasks from text spans
 
 Default configuration:
-````python
+```python
 min_words = 3
 max_words = 8
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````
@@ -1516,7 +1581,7 @@ Metadata: {'num_words': 6, 'words': ['readable', 'to', 'he', 'that', 'to', 'poss
 Generates word sorting tasks
 
 Default configuration:
-````python
+```python
 min_words = 3
 max_words = 10
 min_word_length = 3
@@ -1524,7 +1589,7 @@ max_word_length = 12
 transformation = original
 seed = 42
 size = 500
-````
+```
 
 Example tasks:
 ````

From 7d911a8c25a5a46fd06441cb62034943128e80f5 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 12:09:32 +0100
Subject: [PATCH 29/94] fix: Improve base conversion logic for non-standard
 bases

---
 reasoning_gym/algorithmic/base_conversion.py | 32 ++++++++++++++++----
 tests/test_base_conversion.py                | 26 ++++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/reasoning_gym/algorithmic/base_conversion.py b/reasoning_gym/algorithmic/base_conversion.py
index eb0978bd..c1f62654 100644
--- a/reasoning_gym/algorithmic/base_conversion.py
+++ b/reasoning_gym/algorithmic/base_conversion.py
@@ -60,14 +60,34 @@ class BaseConversionDataset(ProceduralDataset):
         value, source_base, target_base = self._generate_conversion(rng)
 
         # Convert decimal to source base representation
-        source_repr = format(value, f"x" if source_base == 16 else f"b" if source_base == 2 else "").strip()
-        if source_base not in (2, 16):
-            source_repr = format(value, f"{source_base}x").lower().strip()
+        if source_base == 16:
+            source_repr = format(value, 'x')
+        elif source_base == 2:
+            source_repr = format(value, 'b')
+        else:
+            # Manual conversion for other bases
+            n = value
+            digits = []
+            while n:
+                digits.append(int(n % source_base))
+                n //= source_base
+            source_repr = ''.join(str(d) if d < 10 else chr(ord('a') + d - 10) 
+                                for d in reversed(digits) or [0])
 
         # Convert decimal to target base for answer
-        target_repr = format(value, f"x" if target_base == 16 else f"b" if target_base == 2 else "").strip()
-        if target_base not in (2, 16):
-            target_repr = format(value, f"{target_base}x").lower().strip()
+        if target_base == 16:
+            target_repr = format(value, 'x')
+        elif target_base == 2:
+            target_repr = format(value, 'b')
+        else:
+            # Manual conversion for other bases
+            n = value
+            digits = []
+            while n:
+                digits.append(int(n % target_base))
+                n //= target_base
+            target_repr = ''.join(str(d) if d < 10 else chr(ord('a') + d - 10) 
+                                for d in reversed(digits) or [0])
 
         source_name = self._format_base_name(source_base)
         target_name = self._format_base_name(target_base)
diff --git a/tests/test_base_conversion.py b/tests/test_base_conversion.py
index 7c8edf1e..dced77c4 100644
--- a/tests/test_base_conversion.py
+++ b/tests/test_base_conversion.py
@@ -83,6 +83,32 @@ def test_base_conversion_dataset_iteration():
     assert items == list(dataset)
 
 
+def test_base_conversion_validity():
+    """Test that generated numbers are valid for their bases"""
+    config = BaseConversionConfig(
+        min_base=2,
+        max_base=36,
+        min_value=0,
+        max_value=1000,
+        size=100,
+        seed=42
+    )
+    dataset = BaseConversionDataset(config)
+
+    def is_valid_for_base(num_str: str, base: int) -> bool:
+        valid_chars = "0123456789abcdefghijklmnopqrstuvwxyz"[:base]
+        return all(c in valid_chars for c in num_str.lower())
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert is_valid_for_base(item["metadata"]["source_repr"], 
+                               item["metadata"]["source_base"]), \
+            f"Invalid source number {item['metadata']['source_repr']} for base {item['metadata']['source_base']}"
+        assert is_valid_for_base(item["metadata"]["target_repr"], 
+                               item["metadata"]["target_base"]), \
+            f"Invalid target number {item['metadata']['target_repr']} for base {item['metadata']['target_base']}"
+
+
 def test_base_conversion_special_bases():
     """Test conversion between special bases (binary, hex)"""
     config = BaseConversionConfig(

From 355bfcd9ebf72e9f745b33cc21ae8f621ec3b538 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 12:10:09 +0100
Subject: [PATCH 30/94] fix: Correct base conversion test logic for
 non-standard bases

---
 tests/test_base_conversion.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/test_base_conversion.py b/tests/test_base_conversion.py
index dced77c4..2f12eaa9 100644
--- a/tests/test_base_conversion.py
+++ b/tests/test_base_conversion.py
@@ -65,9 +65,21 @@ def test_base_conversion_dataset_items():
         # Verify conversion correctness
         decimal_value = item["metadata"]["decimal_value"]
         target_base = item["metadata"]["target_base"]
-        expected = format(decimal_value, "x" if target_base == 16 else "b" if target_base == 2 else "").strip()
-        if target_base not in (2, 16):
-            expected = format(decimal_value, f"{target_base}x").lower().strip()
+            
+        # Use same conversion logic as implementation
+        if target_base == 16:
+            expected = format(decimal_value, 'x')
+        elif target_base == 2:
+            expected = format(decimal_value, 'b')
+        else:
+            # Manual conversion for other bases
+            n = decimal_value
+            digits = []
+            while n:
+                digits.append(int(n % target_base))
+                n //= target_base
+            expected = ''.join(str(d) if d < 10 else chr(ord('a') + d - 10)
+                             for d in reversed(digits) or [0])
         assert item["answer"] == expected
 
 

From 39263aa324606d3f97ad084ee262088edd5e2540 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 12:11:44 +0100
Subject: [PATCH 31/94] update base_conversion examples after fix

---
 GALLERY.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 3d787535..efc7c643 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -55,19 +55,19 @@ size = 500
 Example tasks:
 ````
 Example 1:
-Question: Convert the base-3 number 28e to binary
+Question: Convert the base-3 number 220020 to binary
 Answer: 1010001110
-Metadata: {'decimal_value': 654, 'source_base': 3, 'target_base': 2, 'source_repr': '28e', 'target_repr': '1010001110'}
+Metadata: {'decimal_value': 654, 'source_base': 3, 'target_base': 2, 'source_repr': '220020', 'target_repr': '1010001110'}
 
 Example 2:
-Question: Convert the base-6 number 27 to base-13 (use lowercase letters a-z for digits above 9)
-Answer: 27
-Metadata: {'decimal_value': 39, 'source_base': 6, 'target_base': 13, 'source_repr': '27', 'target_repr': '27'}
+Question: Convert the base-6 number 103 to base-13 (use lowercase letters a-z for digits above 9)
+Answer: 30
+Metadata: {'decimal_value': 39, 'source_base': 6, 'target_base': 13, 'source_repr': '103', 'target_repr': '30'}
 
 Example 3:
-Question: Convert the base-10 number 1a2 to base-13 (use lowercase letters a-z for digits above 9)
-Answer: 1a2
-Metadata: {'decimal_value': 418, 'source_base': 10, 'target_base': 13, 'source_repr': '1a2', 'target_repr': '1a2'}
+Question: Convert the base-10 number 418 to base-13 (use lowercase letters a-z for digits above 9)
+Answer: 262
+Metadata: {'decimal_value': 418, 'source_base': 10, 'target_base': 13, 'source_repr': '418', 'target_repr': '262'}
 
 ````
 

From 69fb127b4e5e7e0755f6952e0b4d35717f119c29 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 12:13:41 +0100
Subject: [PATCH 32/94] fix: Move project metadata to correct section in
 pyproject.toml

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 80ad4865..f6ad4e0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,8 @@ addopts = "-ra -q --cov=reasoning_gym"
 testpaths = [
     "tests",
 ]
+
+[project]
 classifiers = [
   "Programming Language :: Python :: 3",
   "License :: OSI Approved :: Apache Software License",

From 4ac41bd1749ab8d11e287d8a900ab61c34f2b1e5 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 12:15:27 +0100
Subject: [PATCH 33/94] fix: Remove duplicate project section and reorganize
 pyproject.toml

---
 pyproject.toml | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f6ad4e0c..40f3a1c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,20 +18,6 @@ dependencies = [
   "magiccube==0.3.0",
   "pyfiglet==1.0.2"
 ]
-
-[project.optional-dependencies]
-test = [
-    "pytest>=7.0.0",
-    "pytest-cov>=4.0.0",
-]
-
-[tool.pytest.ini_options]
-addopts = "-ra -q --cov=reasoning_gym"
-testpaths = [
-    "tests",
-]
-
-[project]
 classifiers = [
   "Programming Language :: Python :: 3",
   "License :: OSI Approved :: Apache Software License",
@@ -40,6 +26,12 @@ classifiers = [
 license = "Apache-2.0"
 license-files = ["LICENSE*"]
 
+[project.optional-dependencies]
+test = [
+    "pytest>=7.0.0",
+    "pytest-cov>=4.0.0",
+]
+
 [project.urls]
 "Homepage" = "https://github.com/open-thought/reasoning-gym"
 "Bug Tracker" = "https://github.com/open-thought/reasoning-gym/issues"
@@ -59,3 +51,9 @@ include = '\.pyi?$'
 profile = "black"
 multi_line_output = 3
 line_length = 120
+
+[tool.pytest.ini_options]
+addopts = "-ra -q --cov=reasoning_gym"
+testpaths = [
+    "tests",
+]

From b49167c61c9d9d1aa200bebdcedd319b62388b72 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 12:16:08 +0100
Subject: [PATCH 34/94] lint

---
 reasoning_gym/algorithmic/base_conversion.py | 14 ++++-----
 tests/test_base_conversion.py                | 30 +++++++-------------
 2 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/reasoning_gym/algorithmic/base_conversion.py b/reasoning_gym/algorithmic/base_conversion.py
index c1f62654..afa6200a 100644
--- a/reasoning_gym/algorithmic/base_conversion.py
+++ b/reasoning_gym/algorithmic/base_conversion.py
@@ -61,9 +61,9 @@ class BaseConversionDataset(ProceduralDataset):
 
         # Convert decimal to source base representation
         if source_base == 16:
-            source_repr = format(value, 'x')
+            source_repr = format(value, "x")
         elif source_base == 2:
-            source_repr = format(value, 'b')
+            source_repr = format(value, "b")
         else:
             # Manual conversion for other bases
             n = value
@@ -71,14 +71,13 @@ class BaseConversionDataset(ProceduralDataset):
             while n:
                 digits.append(int(n % source_base))
                 n //= source_base
-            source_repr = ''.join(str(d) if d < 10 else chr(ord('a') + d - 10) 
-                                for d in reversed(digits) or [0])
+            source_repr = "".join(str(d) if d < 10 else chr(ord("a") + d - 10) for d in reversed(digits) or [0])
 
         # Convert decimal to target base for answer
         if target_base == 16:
-            target_repr = format(value, 'x')
+            target_repr = format(value, "x")
         elif target_base == 2:
-            target_repr = format(value, 'b')
+            target_repr = format(value, "b")
         else:
             # Manual conversion for other bases
             n = value
@@ -86,8 +85,7 @@ class BaseConversionDataset(ProceduralDataset):
             while n:
                 digits.append(int(n % target_base))
                 n //= target_base
-            target_repr = ''.join(str(d) if d < 10 else chr(ord('a') + d - 10) 
-                                for d in reversed(digits) or [0])
+            target_repr = "".join(str(d) if d < 10 else chr(ord("a") + d - 10) for d in reversed(digits) or [0])
 
         source_name = self._format_base_name(source_base)
         target_name = self._format_base_name(target_base)
diff --git a/tests/test_base_conversion.py b/tests/test_base_conversion.py
index 2f12eaa9..8017d74a 100644
--- a/tests/test_base_conversion.py
+++ b/tests/test_base_conversion.py
@@ -65,12 +65,12 @@ def test_base_conversion_dataset_items():
         # Verify conversion correctness
         decimal_value = item["metadata"]["decimal_value"]
         target_base = item["metadata"]["target_base"]
-            
+
         # Use same conversion logic as implementation
         if target_base == 16:
-            expected = format(decimal_value, 'x')
+            expected = format(decimal_value, "x")
         elif target_base == 2:
-            expected = format(decimal_value, 'b')
+            expected = format(decimal_value, "b")
         else:
             # Manual conversion for other bases
             n = decimal_value
@@ -78,8 +78,7 @@ def test_base_conversion_dataset_items():
             while n:
                 digits.append(int(n % target_base))
                 n //= target_base
-            expected = ''.join(str(d) if d < 10 else chr(ord('a') + d - 10)
-                             for d in reversed(digits) or [0])
+            expected = "".join(str(d) if d < 10 else chr(ord("a") + d - 10) for d in reversed(digits) or [0])
         assert item["answer"] == expected
 
 
@@ -97,14 +96,7 @@ def test_base_conversion_dataset_iteration():
 
 def test_base_conversion_validity():
     """Test that generated numbers are valid for their bases"""
-    config = BaseConversionConfig(
-        min_base=2,
-        max_base=36,
-        min_value=0,
-        max_value=1000,
-        size=100,
-        seed=42
-    )
+    config = BaseConversionConfig(min_base=2, max_base=36, min_value=0, max_value=1000, size=100, seed=42)
     dataset = BaseConversionDataset(config)
 
     def is_valid_for_base(num_str: str, base: int) -> bool:
@@ -113,12 +105,12 @@ def test_base_conversion_validity():
 
     for i in range(len(dataset)):
         item = dataset[i]
-        assert is_valid_for_base(item["metadata"]["source_repr"], 
-                               item["metadata"]["source_base"]), \
-            f"Invalid source number {item['metadata']['source_repr']} for base {item['metadata']['source_base']}"
-        assert is_valid_for_base(item["metadata"]["target_repr"], 
-                               item["metadata"]["target_base"]), \
-            f"Invalid target number {item['metadata']['target_repr']} for base {item['metadata']['target_base']}"
+        assert is_valid_for_base(
+            item["metadata"]["source_repr"], item["metadata"]["source_base"]
+        ), f"Invalid source number {item['metadata']['source_repr']} for base {item['metadata']['source_base']}"
+        assert is_valid_for_base(
+            item["metadata"]["target_repr"], item["metadata"]["target_base"]
+        ), f"Invalid target number {item['metadata']['target_repr']} for base {item['metadata']['target_base']}"
 
 
 def test_base_conversion_special_bases():

From 1af455b42455db826d80d5a9789ad72e59ec1bd1 Mon Sep 17 00:00:00 2001
From: Schmeitzke <cjf.schmeitz@student.maastrichtuniversity.nl>
Date: Fri, 31 Jan 2025 14:14:10 +0100
Subject: [PATCH 35/94] Bug fix for segment object

---
 reasoning_gym/geometry/advanced_geometry.py | 70 ++++++++++++++-------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
index f8221614..ca098b55 100644
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from typing import Optional, List
 
 import sympy
-from sympy.geometry import Point, Triangle
+from sympy.geometry import Point, Triangle, Segment
 
 from ..factory import ProceduralDataset, register_dataset
 
@@ -92,20 +92,24 @@ class AdvancedGeometryDataset(ProceduralDataset):
         """
         max_attempts = 100
         for _ in range(max_attempts):
-            xA = rng.randint(self.config.min_coord, self.config.max_coord)
-            yA = rng.randint(self.config.min_coord, self.config.max_coord)
-            xB = rng.randint(self.config.min_coord, self.config.max_coord)
-            yB = rng.randint(self.config.min_coord, self.config.max_coord)
-            xC = rng.randint(self.config.min_coord, self.config.max_coord)
-            yC = rng.randint(self.config.min_coord, self.config.max_coord)
+            # Generate points with integer coordinates
+            points = []
+            for _ in range(3):
+                x = rng.randint(self.config.min_coord, self.config.max_coord)
+                y = rng.randint(self.config.min_coord, self.config.max_coord)
+                points.append(Point(x, y))
+            
+            A, B, C = points
+            
+            # Calculate signed area to check for non-degeneracy
+            # Using the formula: 1/2 * |x1(y2 - y3) + x2(y3 - y1) + x3(y1 - y2)|
+            area = abs(
+                A.x * (B.y - C.y) + 
+                B.x * (C.y - A.y) + 
+                C.x * (A.y - B.y)
+            ) / 2
 
-            A = Point(xA, yA)
-            B = Point(xB, yB)
-            C = Point(xC, yC)
-            tri = Triangle(A, B, C)
-
-            # Check that the triangle is non-degenerate (area != 0)
-            if tri.area != 0:
+            if area > 0:
                 return A, B, C
 
         raise ValueError(f"Failed to generate a non-degenerate triangle after {max_attempts} attempts.")
@@ -114,12 +118,21 @@ class AdvancedGeometryDataset(ProceduralDataset):
         """
         Build a question about finding the orthocenter of triangle ABC.
         """
-        tri = Triangle(A, B, C)
-        # Sympy can give altitudes or direct concurrency point
-        ortho = tri.orthocenter
-        # Format the answer
-        # The orthocenter may have rational coordinates, so let's convert to float or simplified fraction
-        # We'll store both numeric approximations and exact forms in metadata
+        # Create line segments for the sides
+        AB = Segment(A, B)
+        BC = Segment(B, C)
+        CA = Segment(C, A)
+        
+        # Calculate altitudes
+        # Get perpendicular lines from each vertex to the opposite side
+        alt_A = A.perpendicular_line(BC)
+        alt_B = B.perpendicular_line(CA)
+        alt_C = C.perpendicular_line(AB)
+        
+        # Find orthocenter (intersection of any two altitudes)
+        ortho = alt_A.intersection(alt_B)[0]
+        
+        # Format coordinates
         x_ortho_approx = float(ortho.x.evalf())
         y_ortho_approx = float(ortho.y.evalf())
 
@@ -144,10 +157,19 @@ class AdvancedGeometryDataset(ProceduralDataset):
         """
         Build a question about finding the incircle radius of triangle ABC.
         """
-        tri = Triangle(A, B, C)
-        incircle = tri.incircle()
-        # incircle is a Circle object; radius is incircle.radius
-        radius = incircle.radius
+        # Calculate side lengths
+        a = B.distance(C)
+        b = C.distance(A)
+        c = A.distance(B)
+        
+        # Semi-perimeter
+        s = (a + b + c) / 2
+        
+        # Area using Heron's formula
+        area = sympy.sqrt(s * (s - a) * (s - b) * (s - c))
+        
+        # Radius of incircle = Area / Semi-perimeter
+        radius = area / s
 
         # Convert to float for final answer
         radius_approx = float(radius.evalf())

From 3578884c42e7bcc1b18eb8f0f25a3fb220cec839 Mon Sep 17 00:00:00 2001
From: Schmeitzke <cjf.schmeitz@student.maastrichtuniversity.nl>
Date: Fri, 31 Jan 2025 14:20:55 +0100
Subject: [PATCH 36/94] Bug fix for absent method in sympy

---
 reasoning_gym/geometry/advanced_geometry.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
index ca098b55..3beae559 100644
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -118,30 +118,24 @@ class AdvancedGeometryDataset(ProceduralDataset):
         """
         Build a question about finding the orthocenter of triangle ABC.
         """
-        # Create line segments for the sides
-        AB = Segment(A, B)
-        BC = Segment(B, C)
-        CA = Segment(C, A)
+        # Convert segments to lines
+        BC_line = sympy.Line(B, C)
+        CA_line = sympy.Line(C, A)
         
-        # Calculate altitudes
-        # Get perpendicular lines from each vertex to the opposite side
-        alt_A = A.perpendicular_line(BC)
-        alt_B = B.perpendicular_line(CA)
-        alt_C = C.perpendicular_line(AB)
+        # Calculate altitudes by creating lines perpendicular from each vertex
+        alt_A = BC_line.perpendicular_line(A)
+        alt_B = CA_line.perpendicular_line(B)
         
-        # Find orthocenter (intersection of any two altitudes)
+        # Find orthocenter (intersection of any two altitudes, e.g. alt_A and alt_B)
         ortho = alt_A.intersection(alt_B)[0]
         
-        # Format coordinates
         x_ortho_approx = float(ortho.x.evalf())
         y_ortho_approx = float(ortho.y.evalf())
 
-        # Choose a prompt
         question_template = rng.choice(self._prompt_templates["orthocenter"])
         question = question_template.format(
             A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
         )
-        # Round to e.g. 3 decimals or keep a string representation
         answer_str = f"({x_ortho_approx:.3f}, {y_ortho_approx:.3f})"
 
         metadata = {
@@ -153,6 +147,7 @@ class AdvancedGeometryDataset(ProceduralDataset):
         }
         return question, answer_str, metadata
 
+
     def _build_incircle_radius_task(self, rng: random.Random, A: Point, B: Point, C: Point):
         """
         Build a question about finding the incircle radius of triangle ABC.

From 15a0bd6f0f60e99b98f4ad155f5b9489c82198b4 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Fri, 31 Jan 2025 23:39:21 +0100
Subject: [PATCH 37/94] give pre-commit job write permission to issues

---
 .github/workflows/pre-commit.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index b31c4dc8..a5ff0cca 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -7,6 +7,9 @@ on:
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      contents: read
     steps:
     - uses: actions/checkout@v4
     - uses: actions/setup-python@v4

From 7eb146634013156e49c0f241b6a33254f58842dd Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 02:10:48 +0100
Subject: [PATCH 38/94] Add time interval dataset class

---
 README.md                                  |   1 +
 pyproject.toml                             |   2 +-
 reasoning_gym/arithmetic/__init__.py       |   4 +
 reasoning_gym/arithmetic/time_intervals.py | 327 +++++++++++++++++++++
 tests/test_time_intervals.py               | 113 +++++++
 5 files changed, 446 insertions(+), 1 deletion(-)
 create mode 100644 reasoning_gym/arithmetic/time_intervals.py
 create mode 100644 tests/test_time_intervals.py

diff --git a/README.md b/README.md
index 78efe0f3..d5126451 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 - `LCMDataset`: Generate Least Common Multiple problems with configurable number of integers
 - `LegCountingDataset`: Generate animal leg counting word problems with various animals
 - `PrimeFactorizationDataset`: Generate prime factorization tasks with configurable number ranges
+- `TimeIntervalsDataset`: Generate time interval calculation tasks with various formats (time, date, datetime) and complexities
 
 ### <small>Algorithmic Tasks</small>
 
diff --git a/pyproject.toml b/pyproject.toml
index 40f3a1c2..afa19831 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,7 @@ multi_line_output = 3
 line_length = 120
 
 [tool.pytest.ini_options]
-addopts = "-ra -q --cov=reasoning_gym"
+addopts = "-ra -q"
 testpaths = [
     "tests",
 ]
diff --git a/reasoning_gym/arithmetic/__init__.py b/reasoning_gym/arithmetic/__init__.py
index 12a6ee89..e6f95451 100644
--- a/reasoning_gym/arithmetic/__init__.py
+++ b/reasoning_gym/arithmetic/__init__.py
@@ -4,6 +4,7 @@ Arithmetic tasks for training reasoning capabilities:
 - Chain sums
 - Word problems
 - Leg counting
+- Time intervals
 """
 
 from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig
@@ -13,6 +14,7 @@ from .gcd import GCDConfig, GCDDataset
 from .lcm import LCMConfig, LCMDataset
 from .leg_counting import LegCountingConfig, LegCountingDataset
 from .prime_factorization import PrimeFactorizationConfig, PrimeFactorizationDataset
+from .time_intervals import TimeIntervalsConfig, TimeIntervalsDataset
 
 __all__ = [
     "BasicArithmeticDataset",
@@ -30,4 +32,6 @@ __all__ = [
     "LegCountingDataset",
     "PrimeFactorizationConfig",
     "PrimeFactorizationDataset",
+    "TimeIntervalsConfig",
+    "TimeIntervalsDataset",
 ]
diff --git a/reasoning_gym/arithmetic/time_intervals.py b/reasoning_gym/arithmetic/time_intervals.py
new file mode 100644
index 00000000..ed9c47da
--- /dev/null
+++ b/reasoning_gym/arithmetic/time_intervals.py
@@ -0,0 +1,327 @@
+import random
+import zoneinfo
+from dataclasses import dataclass
+from datetime import date, datetime, time, timedelta
+from typing import List, Optional
+
+import pytz
+from dateutil import parser
+
+from ..factory import ProceduralDataset, register_dataset
+
+
+@dataclass
+class TimeIntervalsConfig:
+    """Configuration for time interval calculation tasks"""
+
+    min_time: time = time.min
+    max_time: time = time.max
+    max_time_difference_seconds: int = 24 * 60 * 60
+    min_date: date = date(1900, 1, 1)
+    max_date: date = date(3000, 1, 1)
+    max_date_difference_days: int = 100
+    task_types: List[str] = None
+    seed: Optional[int] = None
+    size: int = 500
+
+    def __post_init__(self):
+        if self.task_types is None:
+            self.task_types = ["time", "time_seconds", "time_ms", "date", "datetime", "datetime_tz"]
+
+    def validate(self) -> None:
+        """Validate configuration parameters"""
+        assert self.size > 0, "size must be positive"
+        assert self.max_time_difference_seconds > 0, "max_time_difference_seconds must be positive"
+        assert self.max_date_difference_days > 0, "max_date_difference_days must be positive"
+        assert self.min_date < self.max_date, "min_date must be before max_date"
+
+
+class TimeIntervalsDataset(ProceduralDataset):
+    """Generates time interval calculation tasks with various formats and complexities"""
+
+    TEMPLATES = [
+        "What is the duration between {start} and {end}? Please answer in {format}.",
+        "Calculate the time difference between {start} and {end}. Express the result in {format}.",
+        "How much time elapsed from {start} to {end}? Give your answer in {format}.",
+        "A meeting started at {start} and ended at {end}. How long was the meeting? Answer in {format}.",
+        "A system operation started at {start} and completed at {end}. What was the operation duration? Answer in {format}.",
+        "A database query started at {start} and ended at {end}. How long did the query take? Answer in {format}.",
+        "A flight departed at {start} and arrived at {end}. How long was the flight? Answer in {format}.",
+        "A video call started at {start} and ended at {end}. How long was the call? Answer in {format}.",
+        "A system backup started at {start} and completed at {end}. What was the total backup duration? Answer in {format}.",
+        "A conference call began at {start} and ended at {end}. How long was the conference? Answer in {format}.",
+    ]
+
+    TIME_FORMATS = [
+        "%H:%M",
+        "%H:%M:%S",
+        "%H:%M:%S.%f",
+    ]
+
+    DATE_FORMATS = [
+        "%Y-%m-%d",
+        "%B %d, %Y",
+        "%m/%d/%Y",
+        "%A, %B %d, %Y",  # e.g. Monday, January 15, 2024
+        "%a %b %d %Y",  # e.g. Mon Jan 15 2024
+        "%d %B %Y",  # e.g. 15 January 2024
+        "%Y-%m-%d (%A)",  # e.g. 2024-01-15 (Monday)
+    ]
+
+    DATETIME_FORMATS = [
+        "%Y-%m-%d %H:%M",
+        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%d %H:%M %z",  # For UTC offset format
+        "%Y-%m-%d %H:%M:%S %z",  # For UTC offset with seconds
+        "%A, %B %d, %Y at %H:%M",  # e.g. Monday, January 15, 2024 at 14:30
+        "%a %b %d %Y %H:%M:%S",  # e.g. Mon Jan 15 2024 14:30:45
+        "%d %B %Y, %H:%M",  # e.g. 15 January 2024, 14:30
+        "%d %B %Y, %H:%M %z",  # e.g. 15 January 2024, 14:30 +0000
+        "%Y-%m-%d (%A) %H:%M:%S %z",  # e.g. 2024-01-15 (Monday) 14:30:45 +0000
+    ]
+
+    def __init__(self, config: TimeIntervalsConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self.timezones = list(pytz.common_timezones)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Generate a single time interval calculation task"""
+        item_rng = random.Random(self.seed + idx)
+
+        # Randomly choose task type from config
+        task_type = item_rng.choice(self.config.task_types)
+
+        start_time, end_time, format_str, expected_format = self._generate_times(item_rng, task_type)
+
+        template = item_rng.choice(self.TEMPLATES)
+        question = template.format(start=start_time, end=end_time, format=expected_format)
+
+        # Calculate the actual difference
+        if isinstance(start_time, str):
+            # Drop only the parenthesized weekday token, e.g. "(Monday)"; keep any time / UTC offset after it
+            start_time = " ".join(tok for tok in start_time.split(" ") if not tok.startswith("("))
+            end_time = " ".join(tok for tok in end_time.split(" ") if not tok.startswith("("))
+            # Parse with UTC offset handling
+            start_dt = parser.parse(start_time)
+            end_dt = parser.parse(end_time)
+        else:
+            start_dt = start_time
+            end_dt = end_time
+
+        difference = end_dt - start_dt
+
+        # Format the answer according to expected_format
+        if expected_format == "HH:MM":
+            total_seconds = difference.total_seconds()
+            answer = f"{int(total_seconds // 3600):02d}:{int((total_seconds % 3600) // 60):02d}"
+        elif expected_format == "HH:MM:SS":
+            total_seconds = difference.total_seconds()
+            answer = f"{int(total_seconds // 3600):02d}:{int((total_seconds % 3600) // 60):02d}:{int(total_seconds % 60):02d}"
+        elif expected_format == "HH:MM:SS.mmm":
+            total_seconds = difference.total_seconds()
+            ms = difference.microseconds // 1000  # exact integer; float math on total_seconds() can drop 1 ms
+            answer = f"{int(total_seconds // 3600):02d}:{int((total_seconds % 3600) // 60):02d}:{int(total_seconds % 60):02d}.{ms:03d}"
+        elif expected_format == "D days":
+            answer = f"{difference.days} days"
+        else:  # "D days, HH:MM" or "D days, HH:MM:SS"
+            days = difference.days
+            hours = difference.seconds // 3600
+            minutes = (difference.seconds % 3600) // 60
+            seconds = difference.seconds % 60
+            if expected_format == "D days, HH:MM:SS":
+                answer = f"{days} days, {hours:02d}:{minutes:02d}:{seconds:02d}"
+            else:  # "D days, HH:MM"
+                answer = f"{days} days, {hours:02d}:{minutes:02d}"
+
+        return {
+            "question": question,
+            "answer": answer,
+            "metadata": {
+                "task_type": task_type,
+                "start_time": start_dt,
+                "end_time": end_dt,
+                "format": format_str,
+                "expected_format": expected_format,
+            },
+        }
+
+    def _generate_times(self, rng: random.Random, task_type: str):
+        """Return (start string, end string, strftime format, expected answer format) for the given task type"""
+        if task_type.startswith("time"):
+            if task_type == "time_ms":
+                format_str = self.TIME_FORMATS[2]  # Get milliseconds format
+                expected_format = "HH:MM:SS.mmm"
+            else:
+                format_str = next(f for f in self.TIME_FORMATS if f.count(":") == (2 if "seconds" in task_type else 1))
+                expected_format = "HH:MM:SS" if "seconds" in task_type else "HH:MM"
+
+            # Generate random start time
+            start_hour = rng.randint(0, 23)
+            start_minute = rng.randint(0, 59)
+            start_second = rng.randint(0, 59)
+            base = datetime.combine(date.today(), time(start_hour, start_minute, start_second))
+
+            # Seconds left until 23:59:59 (last second of the day) so the end time never wraps past midnight
+            seconds_left_today = (23 - start_hour) * 3600 + (59 - start_minute) * 60 + (59 - start_second)
+            # Use the minimum of config max and the seconds left in the day
+            max_seconds = min(self.config.max_time_difference_seconds, seconds_left_today)
+            diff_seconds = rng.randint(1, max_seconds) if max_seconds > 0 else 0
+
+            if task_type == "time_ms":
+                # Add microseconds for millisecond precision
+                base = base.replace(microsecond=rng.randint(0, 999) * 1000)
+                end_time = base + timedelta(seconds=diff_seconds, microseconds=rng.randint(0, 999) * 1000)
+                end_time = min(end_time, base.replace(hour=23, minute=59, second=59, microsecond=999000))  # same day
+                start_time = base.strftime(format_str)[:-3]  # Keep 3 fractional digits (milliseconds)
+                end_time = end_time.strftime(format_str)[:-3]  # Keep 3 fractional digits (milliseconds)
+            else:
+                start_time = base.strftime(format_str)
+                end_time = (base + timedelta(seconds=diff_seconds)).strftime(format_str)
+
+        elif task_type == "date":
+            format_str = rng.choice(self.DATE_FORMATS)
+            expected_format = "D days"  # Always return number of days for date tasks
+
+            # Generate random start date within configured range, leaving room for end date
+            max_date_difference_days = min(
+                self.config.max_date_difference_days, (self.config.max_date - self.config.min_date).days
+            )
+            max_start_days = (self.config.max_date - self.config.min_date).days - max_date_difference_days
+            start_days = rng.randint(0, max(0, max_start_days - 1))  # max() avoids an empty randint range
+            start_date = self.config.min_date + timedelta(days=start_days)
+
+            # Ensure non-negative difference between dates
+            diff_days = rng.randint(0, max_date_difference_days)
+            end_date = start_date + timedelta(days=diff_days)
+
+            start_time = start_date.strftime(format_str)
+            end_time = end_date.strftime(format_str)
+
+        else:  # datetime or datetime_tz
+            format_str = rng.choice(self.DATETIME_FORMATS)
+            # Choose between HH:MM and HH:MM:SS format for datetime answers
+            expected_format = rng.choice(["D days, HH:MM", "D days, HH:MM:SS"])
+
+            # Generate random start datetime
+            days_range = (self.config.max_date - self.config.min_date).days
+            start_days = rng.randint(0, days_range)
+            start_hour = rng.randint(0, 23)
+            start_minute = rng.randint(0, 59)
+            start_second = rng.randint(0, 59)
+
+            # Generate random time differences first
+            diff_days = rng.randint(0, self.config.max_date_difference_days)
+            diff_seconds = rng.randint(1, self.config.max_time_difference_seconds)
+
+            if "%z" in format_str:
+                # Use simpler timezone format with offset
+                base = datetime.combine(
+                    self.config.min_date + timedelta(days=start_days), time(start_hour, start_minute, start_second)
+                )
+                # Generate timezone offsets
+                start_offset = rng.randint(-12, 12)
+                end_offset = rng.randint(-12, 12)
+
+                # Apply start timezone
+                base = base.replace(tzinfo=pytz.FixedOffset(start_offset * 60))
+                start_format = format_str.replace("%z", "%+05d" % (start_offset * 100))
+
+                # Calculate end time and convert it (same instant) to the end timezone
+                end_dt = base + timedelta(days=diff_days, seconds=diff_seconds)
+                end_dt = end_dt.astimezone(pytz.FixedOffset(end_offset * 60))
+                end_format = format_str.replace("%z", "%+05d" % (end_offset * 100))
+
+                # Format times with their respective timezone offsets
+                start_time = base.strftime(start_format).rstrip()
+                end_time = end_dt.strftime(end_format).rstrip()
+            else:
+                base = datetime.combine(
+                    self.config.min_date + timedelta(days=start_days), time(start_hour, start_minute, start_second)
+                )
+                # For non-timezone aware times, both use same format
+                start_time = base.strftime(format_str).rstrip()
+                end_time = (base + timedelta(days=diff_days, seconds=diff_seconds)).strftime(format_str).rstrip()
+
+        return start_time, end_time, format_str, expected_format
+
+    def score_answer(self, answer: Optional[str], entry: dict) -> float:
+        """Score an answer based on how close it is to the expected duration
+
+        Returns a score between 0 and 1, with partial credit for answers that are
+        close to correct in the appropriate units/format
+        """
+        if not answer:
+            return 0.0
+
+        expected = entry["answer"]
+        task_type = entry["metadata"]["task_type"]
+
+        try:
+            if task_type == "date":
+                # Parse "X days" format
+                try:
+                    actual = int(answer.strip().split()[0])  # Get number before "days"
+                    expected = int(expected.strip().split()[0])
+                    if actual == expected:
+                        return 1.0
+                    # Partial credit based on how close the day count is
+                    max_diff = self.config.max_date_difference_days
+                    diff = abs(actual - expected)
+                    return max(0.0, 1.0 - (diff / max_diff))
+                except (ValueError, IndexError):
+                    return 0.0
+
+            elif task_type.startswith("time"):
+                # Parse times into total seconds for comparison
+                def parse_time(t):
+                    parts = t.strip().split(":")
+                    seconds = int(parts[0]) * 3600 + int(parts[1]) * 60
+                    if len(parts) > 2:
+                        if "." in parts[2]:  # Has milliseconds
+                            s, ms = parts[2].split(".")
+                            seconds += int(s) + int(ms) / 1000
+                        else:
+                            seconds += int(parts[2])
+                    return seconds
+
+                actual_seconds = parse_time(answer)
+                expected_seconds = parse_time(expected)
+
+                if actual_seconds == expected_seconds:
+                    return 1.0
+
+                # Partial credit based on how close the times are
+                max_diff = self.config.max_time_difference_seconds
+                diff = abs(actual_seconds - expected_seconds)
+                return max(0.0, 1.0 - (diff / max_diff))
+
+            else:  # datetime or datetime_tz
+                # Parse the complex format "X days, HH:MM" or "X days, HH:MM:SS"
+                def parse_datetime(t):
+                    days = int(t.split(" days,")[0])
+                    time_part = t.split(",")[1].strip()
+                    parts = time_part.split(":")
+                    seconds = int(parts[0]) * 3600 + int(parts[1]) * 60
+                    if len(parts) > 2:
+                        seconds += int(parts[2])
+                    return days * 86400 + seconds
+
+                actual_seconds = parse_datetime(answer)
+                expected_seconds = parse_datetime(expected)
+
+                if actual_seconds == expected_seconds:
+                    return 1.0
+
+                # Partial credit based on total time difference
+                max_diff = self.config.max_date_difference_days * 86400
+                diff = abs(actual_seconds - expected_seconds)
+                return max(0.0, 1.0 - (diff / max_diff))
+
+        except (ValueError, IndexError):
+            return 0.0  # Invalid format
+
+        return 0.0
+
+
+# Register the dataset
+register_dataset("time_intervals", TimeIntervalsDataset, TimeIntervalsConfig)
diff --git a/tests/test_time_intervals.py b/tests/test_time_intervals.py
new file mode 100644
index 00000000..4e95f778
--- /dev/null
+++ b/tests/test_time_intervals.py
@@ -0,0 +1,113 @@
+from datetime import date, datetime
+
+import pytest
+
+from reasoning_gym.arithmetic import TimeIntervalsConfig, TimeIntervalsDataset
+
+
+def test_time_intervals_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = TimeIntervalsConfig(size=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = TimeIntervalsConfig(max_time_difference_seconds=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = TimeIntervalsConfig(max_date_difference_days=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = TimeIntervalsConfig(min_date=date(2024, 1, 1), max_date=date(2023, 1, 1))
+        config.validate()
+
+
+def test_time_intervals_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = TimeIntervalsConfig(seed=42, size=10)
+    dataset1 = TimeIntervalsDataset(config)
+    dataset2 = TimeIntervalsDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_time_intervals_items():
+    """Test basic properties of generated items"""
+    config = TimeIntervalsConfig(
+        size=100,
+        seed=42,
+        max_time_difference_seconds=3600,  # 1 hour max
+        max_date_difference_days=10,
+    )
+    dataset = TimeIntervalsDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+        assert "task_type" in item["metadata"]
+        assert "start_time" in item["metadata"]
+        assert "end_time" in item["metadata"]
+
+
+def test_time_intervals_scoring():
+    """Test the answer scoring functionality"""
+    config = TimeIntervalsConfig(seed=42)
+    dataset = TimeIntervalsDataset(config)
+
+    # Generate a sample item
+    item = dataset[0]
+
+    # Test exact match
+    assert dataset.score_answer(item["answer"], item) == 1.0
+
+    # Test empty/None answers
+    assert dataset.score_answer(None, item) == 0.0
+    assert dataset.score_answer("", item) == 0.0
+
+    # Test invalid format
+    assert dataset.score_answer("invalid", item) == 0.0
+
+    # Test close but not exact answers
+    task_type = item["metadata"]["task_type"]
+    if task_type == "date":
+        expected = int(item["answer"].split()[0])  # answer has the form "N days"
+        # Test answer off by 1 day
+        score = dataset.score_answer(str(expected + 1), item)
+        assert 0 < score < 1
+    elif task_type.startswith("time"):
+        # Test answer off by a few minutes
+        if ":" in item["answer"]:
+            parts = item["answer"].split(":")
+            hours = int(parts[0])
+            minutes = (int(parts[1]) + 5) % 60  # Add 5 minutes
+            modified = f"{hours:02d}:{minutes:02d}"
+            if len(parts) > 2:
+                modified += ":" + parts[2]
+            score = dataset.score_answer(modified, item)
+            assert 0 < score < 1
+
+
+def test_time_format_patterns():
+    """Test that generated times match expected formats"""
+    config = TimeIntervalsConfig(seed=42, size=500)
+    dataset = TimeIntervalsDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+
+        start_dt = item["metadata"]["start_time"]
+        end_dt = item["metadata"]["end_time"]
+
+        # Verify both are datetime objects
+        assert isinstance(start_dt, datetime)
+        assert isinstance(end_dt, datetime)
+
+        # Verify end is after start
+        assert end_dt >= start_dt, item["question"]
+        assert dataset.score_answer(item["answer"], item) == 1.0

From 62af53bb7a0e1ba6a1fd9cb73d597e3dd3bc1739 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 02:11:57 +0100
Subject: [PATCH 39/94] add time delta to gallery

---
 GALLERY.md | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/GALLERY.md b/GALLERY.md
index efc7c643..5d593ee9 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -33,6 +33,7 @@ This gallery shows examples from all available datasets using their default conf
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
 - [syllogism](#syllogism)
+- [time_intervals](#time_intervals)
 - [tower_of_hanoi](#tower_of_hanoi)
 - [word_ladder](#word_ladder)
 - [word_sequence_reversal](#word_sequence_reversal)
@@ -1451,6 +1452,41 @@ Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are
 
 ````
 
+### time_intervals
+Generates time interval calculation tasks with various formats and complexities
+
+Default configuration:
+```python
+min_time = 00:00:00
+max_time = 23:59:59.999999
+max_time_difference_seconds = 86400
+min_date = 1900-01-01
+max_date = 3000-01-01
+max_date_difference_days = 100
+task_types = ['time', 'time_seconds', 'time_ms', 'date', 'datetime', 'datetime_tz']
+seed = 42
+size = 500
+```
+
+Example tasks:
+````
+Example 1:
+Question: A system backup started at 2964-06-17 08:15:14 and completed at 2964-07-04 11:59:09. What was the total backup duration? Answer in D days, HH:MM.
+Answer: 17 days, 03:43
+Metadata: {'task_type': 'datetime_tz', 'start_time': datetime.datetime(2964, 6, 17, 8, 15, 14), 'end_time': datetime.datetime(2964, 7, 4, 11, 59, 9), 'format': '%Y-%m-%d %H:%M:%S', 'expected_format': 'D days, HH:MM'}
+
+Example 2:
+Question: A video call started at 09:44 and ended at 12:22. How long was the call? Answer in HH:MM.
+Answer: 02:38
+Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 1, 9, 44), 'end_time': datetime.datetime(2025, 2, 1, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'}
+
+Example 3:
+Question: Calculate the time difference between Sat Dec 22 2677 and Thu Mar 21 2678. Express the result in D days.
+Answer: 89 days
+Metadata: {'task_type': 'date', 'start_time': datetime.datetime(2677, 12, 22, 0, 0), 'end_time': datetime.datetime(2678, 3, 21, 0, 0), 'format': '%a %b %d %Y', 'expected_format': 'D days'}
+
+````
+
 ### tower_of_hanoi
 Generates Tower of Hanoi problems with solutions.
     Supports variable number of pegs using the optimized Frame-Stewart algorithm with Peg State Tracking.

From 9eabc01e23beb6e526abad21ab149b457f171fab Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 02:15:45 +0100
Subject: [PATCH 40/94] deps: Add pytz dependency to pyproject.toml

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index afa19831..52d04319 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,8 @@ dependencies = [
   "cellpylib==2.4.0",
   "sympy>=1.13.1",
   "magiccube==0.3.0",
-  "pyfiglet==1.0.2"
+  "pyfiglet==1.0.2",
+  "pytz>=2024.1"
 ]
 classifiers = [
   "Programming Language :: Python :: 3",

From 808e47031d6208b4e3337cafb4de49967be6fd2a Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 02:28:06 +0100
Subject: [PATCH 41/94] chore: Update GitHub workflow permissions for
 consistent access

---
 .github/workflows/pre-commit.yml | 3 ++-
 .github/workflows/tests.yml      | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index a5ff0cca..4aec0f07 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -8,8 +8,9 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     permissions:
-      issues: write
       contents: read
+      issues: write
+      pull-requests: write
     steps:
     - uses: actions/checkout@v4
     - uses: actions/setup-python@v4
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 50b64d5d..157c993d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -9,6 +9,10 @@ on:
 jobs:
   test:
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      issues: write
+      pull-requests: write
     strategy:
       matrix:
         python-version: ["3.11", "3.12"]

From 0f6f58cae262e27d3093facd4502993e427bb52c Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 02:36:20 +0100
Subject: [PATCH 42/94] add pull-request: write permission for pre-commit
 checks

---
 .github/workflows/pre-commit.yml           | 10 +++++-----
 reasoning_gym/arithmetic/time_intervals.py |  2 --
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 4aec0f07..8bf7ae71 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -1,4 +1,8 @@
-name: Pre-commit
+name:  Pre-commit Checks
+
+permissions:
+  contents: read
+  pull-requests: write
 
 on:
   pull_request:
@@ -7,10 +11,6 @@ on:
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      issues: write
-      pull-requests: write
     steps:
     - uses: actions/checkout@v4
     - uses: actions/setup-python@v4
diff --git a/reasoning_gym/arithmetic/time_intervals.py b/reasoning_gym/arithmetic/time_intervals.py
index ed9c47da..abc177ca 100644
--- a/reasoning_gym/arithmetic/time_intervals.py
+++ b/reasoning_gym/arithmetic/time_intervals.py
@@ -1,5 +1,4 @@
 import random
-import zoneinfo
 from dataclasses import dataclass
 from datetime import date, datetime, time, timedelta
 from typing import List, Optional
@@ -82,7 +81,6 @@ class TimeIntervalsDataset(ProceduralDataset):
 
     def __init__(self, config: TimeIntervalsConfig):
         super().__init__(config=config, seed=config.seed, size=config.size)
-        self.timezones = list(pytz.common_timezones)
 
     def __getitem__(self, idx: int) -> dict:
         """Generate a single time interval calculation task"""

From 91cf5abe9c5a1ad4abd81784d47eac17540415b2 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 17:01:11 +0100
Subject: [PATCH 43/94] lint

---
 reasoning_gym/geometry/__init__.py          |  2 +-
 reasoning_gym/geometry/advanced_geometry.py | 44 ++++++++-------------
 reasoning_gym/geometry/simple_geometry.py   | 16 ++++----
 tests/test_advanced_geometry.py             | 19 ++++-----
 tests/test_simple_geometry.py               | 22 +++--------
 5 files changed, 38 insertions(+), 65 deletions(-)

diff --git a/reasoning_gym/geometry/__init__.py b/reasoning_gym/geometry/__init__.py
index d6539df1..6e4e2d1a 100644
--- a/reasoning_gym/geometry/__init__.py
+++ b/reasoning_gym/geometry/__init__.py
@@ -1,5 +1,5 @@
-from .simple_geometry import SimpleGeometryConfig, SimpleGeometryDataset
 from .advanced_geometry import AdvancedGeometryConfig, AdvancedGeometryDataset
+from .simple_geometry import SimpleGeometryConfig, SimpleGeometryDataset
 
 __all__ = [
     "SimpleGeometryConfig",
diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
index 3beae559..3d88a348 100644
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -1,9 +1,9 @@
 import random
 from dataclasses import dataclass
-from typing import Optional, List
+from typing import List, Optional
 
 import sympy
-from sympy.geometry import Point, Triangle, Segment
+from sympy.geometry import Point, Segment, Triangle
 
 from ..factory import ProceduralDataset, register_dataset
 
@@ -13,9 +13,10 @@ class AdvancedGeometryConfig:
     """
     Configuration for generating advanced geometry tasks.
     """
+
     min_coord: int = -10  # Minimum x/y coordinate
-    max_coord: int = 10   # Maximum x/y coordinate
-    size: int = 50        # Number of problems to generate
+    max_coord: int = 10  # Maximum x/y coordinate
+    size: int = 50  # Number of problems to generate
     seed: Optional[int] = None
 
     # Probability or list of tasks we want to generate
@@ -98,16 +99,12 @@ class AdvancedGeometryDataset(ProceduralDataset):
                 x = rng.randint(self.config.min_coord, self.config.max_coord)
                 y = rng.randint(self.config.min_coord, self.config.max_coord)
                 points.append(Point(x, y))
-            
+
             A, B, C = points
-            
+
             # Calculate signed area to check for non-degeneracy
             # Using the formula: 1/2 * |x1(y2 - y3) + x2(y3 - y1) + x3(y1 - y2)|
-            area = abs(
-                A.x * (B.y - C.y) + 
-                B.x * (C.y - A.y) + 
-                C.x * (A.y - B.y)
-            ) / 2
+            area = abs(A.x * (B.y - C.y) + B.x * (C.y - A.y) + C.x * (A.y - B.y)) / 2
 
             if area > 0:
                 return A, B, C
@@ -121,21 +118,19 @@ class AdvancedGeometryDataset(ProceduralDataset):
         # Convert segments to lines
         BC_line = sympy.Line(B, C)
         CA_line = sympy.Line(C, A)
-        
+
         # Calculate altitudes by creating lines perpendicular from each vertex
         alt_A = BC_line.perpendicular_line(A)
         alt_B = CA_line.perpendicular_line(B)
-        
+
         # Find orthocenter (intersection of any two altitudes, e.g. alt_A and alt_B)
         ortho = alt_A.intersection(alt_B)[0]
-        
+
         x_ortho_approx = float(ortho.x.evalf())
         y_ortho_approx = float(ortho.y.evalf())
 
         question_template = rng.choice(self._prompt_templates["orthocenter"])
-        question = question_template.format(
-            A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
-        )
+        question = question_template.format(A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y))
         answer_str = f"({x_ortho_approx:.3f}, {y_ortho_approx:.3f})"
 
         metadata = {
@@ -147,7 +142,6 @@ class AdvancedGeometryDataset(ProceduralDataset):
         }
         return question, answer_str, metadata
 
-
     def _build_incircle_radius_task(self, rng: random.Random, A: Point, B: Point, C: Point):
         """
         Build a question about finding the incircle radius of triangle ABC.
@@ -156,13 +150,13 @@ class AdvancedGeometryDataset(ProceduralDataset):
         a = B.distance(C)
         b = C.distance(A)
         c = A.distance(B)
-        
+
         # Semi-perimeter
         s = (a + b + c) / 2
-        
+
         # Area using Heron's formula
         area = sympy.sqrt(s * (s - a) * (s - b) * (s - c))
-        
+
         # Radius of incircle = Area / Semi-perimeter
         radius = area / s
 
@@ -170,9 +164,7 @@ class AdvancedGeometryDataset(ProceduralDataset):
         radius_approx = float(radius.evalf())
 
         question_template = rng.choice(self._prompt_templates["incircle_radius"])
-        question = question_template.format(
-            A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
-        )
+        question = question_template.format(A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y))
         answer_str = f"{radius_approx:.3f}"
 
         metadata = {
@@ -211,9 +203,7 @@ class AdvancedGeometryDataset(ProceduralDataset):
             angle_deg = float(angle_rad.evalf() * 180 / sympy.pi)
 
         question_template = rng.choice(self._prompt_templates["angle_measure"])
-        question = question_template.format(
-            A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y)
-        )
+        question = question_template.format(A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y))
 
         answer_str = f"{angle_deg:.2f}°"
         metadata = {
diff --git a/reasoning_gym/geometry/simple_geometry.py b/reasoning_gym/geometry/simple_geometry.py
index 3714abe8..d04912d7 100644
--- a/reasoning_gym/geometry/simple_geometry.py
+++ b/reasoning_gym/geometry/simple_geometry.py
@@ -4,6 +4,7 @@ from typing import Optional
 
 from ..factory import ProceduralDataset, register_dataset
 
+
 @dataclass
 class SimpleGeometryConfig:
     """
@@ -12,12 +13,12 @@ class SimpleGeometryConfig:
     for the first (N-1) sides, and asks the solver to find the last angle.
     """
 
-    min_sides: int = 3          # Minimum number of sides (e.g. triangle)
-    max_sides: int = 6          # Maximum number of sides (e.g. hexagon)
-    min_angle: int = 10         # Minimum angle (in degrees) for each of the first (N-1) angles
-    max_angle: int = 170        # Maximum angle (in degrees) for each of the first (N-1) angles
+    min_sides: int = 3  # Minimum number of sides (e.g. triangle)
+    max_sides: int = 6  # Maximum number of sides (e.g. hexagon)
+    min_angle: int = 10  # Minimum angle (in degrees) for each of the first (N-1) angles
+    max_angle: int = 170  # Maximum angle (in degrees) for each of the first (N-1) angles
     seed: Optional[int] = None  # Random seed
-    size: int = 100             # Number of geometry tasks to generate
+    size: int = 100  # Number of geometry tasks to generate
 
     def validate(self) -> None:
         """
@@ -85,9 +86,7 @@ class SimpleGeometryDataset(ProceduralDataset):
         # Build the question string
         angle_list_str = ", ".join(f"{a:.1f}°" for a in known_angles)
         prompt = rng.choice(self._prompt_templates).format(
-            n_sides=n_sides,
-            n_minus_1=n_sides - 1,
-            angle_list=angle_list_str
+            n_sides=n_sides, n_minus_1=n_sides - 1, angle_list=angle_list_str
         )
 
         # Round the missing angle to one decimal place or integer if it is very close to an integer
@@ -136,5 +135,6 @@ class SimpleGeometryDataset(ProceduralDataset):
             f"with total sum {total_sum} within {max_attempts} attempts."
         )
 
+
 # Register the dataset so it can be accessed similarly to the others
 register_dataset("simple_geometry", SimpleGeometryDataset, SimpleGeometryConfig)
diff --git a/tests/test_advanced_geometry.py b/tests/test_advanced_geometry.py
index cf371d10..9eec1b36 100644
--- a/tests/test_advanced_geometry.py
+++ b/tests/test_advanced_geometry.py
@@ -1,9 +1,7 @@
 import pytest
 
-from reasoning_gym.geometry.advanced_geometry import (
-    AdvancedGeometryDataset,
-    AdvancedGeometryConfig,
-)
+from reasoning_gym.geometry.advanced_geometry import AdvancedGeometryConfig, AdvancedGeometryDataset
+
 
 def test_advanced_geometry_config_validation():
     """Test that invalid configs raise appropriate errors."""
@@ -35,8 +33,7 @@ def test_advanced_geometry_dataset_deterministic():
 
     for i in range(len(dataset1)):
         assert dataset1[i] == dataset2[i], (
-            f"Item mismatch at index {i} for same seed. "
-            f"Dataset1: {dataset1[i]} vs Dataset2: {dataset2[i]}"
+            f"Item mismatch at index {i} for same seed. " f"Dataset1: {dataset1[i]} vs Dataset2: {dataset2[i]}"
         )
 
 
@@ -55,16 +52,14 @@ def test_advanced_geometry_dataset_items():
 
         # Basic metadata checks
         metadata = item["metadata"]
-        assert "A" in metadata and "B" in metadata and "C" in metadata, (
-            "Metadata should contain coordinates for points A, B, and C."
-        )
+        assert (
+            "A" in metadata and "B" in metadata and "C" in metadata
+        ), "Metadata should contain coordinates for points A, B, and C."
 
         # Check answer format depending on task type
         # For angle measure tasks, answer should end with '°'
         if "angle_measure" in item["question"].lower() or "angle at" in item["question"].lower():
-            assert item["answer"].endswith("°"), (
-                f"Expected angle measure in degrees, got {item['answer']}"
-            )
+            assert item["answer"].endswith("°"), f"Expected angle measure in degrees, got {item['answer']}"
 
 
 def test_advanced_geometry_dataset_iteration():
diff --git a/tests/test_simple_geometry.py b/tests/test_simple_geometry.py
index 4d8702df..804cf15a 100644
--- a/tests/test_simple_geometry.py
+++ b/tests/test_simple_geometry.py
@@ -1,9 +1,7 @@
 import pytest
 
-from reasoning_gym.geometry.simple_geometry import (
-    SimpleGeometryDataset,
-    SimpleGeometryConfig,
-)
+from reasoning_gym.geometry.simple_geometry import SimpleGeometryConfig, SimpleGeometryDataset
+
 
 def test_simple_geometry_config_validation():
     """Test invalid configs raise appropriate errors."""
@@ -35,21 +33,13 @@ def test_simple_geometry_dataset_deterministic():
 
     for i in range(len(dataset1)):
         assert dataset1[i] == dataset2[i], (
-            f"Item mismatch at index {i} for same seed. "
-            f"Dataset1: {dataset1[i]} vs Dataset2: {dataset2[i]}"
+            f"Item mismatch at index {i} for same seed. " f"Dataset1: {dataset1[i]} vs Dataset2: {dataset2[i]}"
         )
 
 
 def test_simple_geometry_dataset_items():
     """Test basic properties of generated items."""
-    config = SimpleGeometryConfig(
-        min_sides=3, 
-        max_sides=5, 
-        min_angle=10, 
-        max_angle=120, 
-        size=10, 
-        seed=123
-    )
+    config = SimpleGeometryConfig(min_sides=3, max_sides=5, min_angle=10, max_angle=120, size=10, seed=123)
     dataset = SimpleGeometryDataset(config)
 
     for i in range(len(dataset)):
@@ -62,9 +52,7 @@ def test_simple_geometry_dataset_items():
 
         metadata = item["metadata"]
         assert "n_sides" in metadata, "Metadata should contain 'n_sides'."
-        assert "missing_angle_rounded" in metadata, (
-            "Metadata should contain the computed 'missing_angle_rounded'."
-        )
+        assert "missing_angle_rounded" in metadata, "Metadata should contain the computed 'missing_angle_rounded'."
 
         # Check that the missing angle is a valid float or integer
         missing_angle = float(item["answer"])

From 27750467487930e42a3fe4a51955a89339aacece Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 17:04:18 +0100
Subject: [PATCH 44/94] update dataset gallery

---
 GALLERY.md                | 70 +++++++++++++++++++++++++++++++++++++++
 reasoning_gym/__init__.py |  8 ++---
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 5d593ee9..0bc5154e 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -2,6 +2,7 @@
 This gallery shows examples from all available datasets using their default configurations.
 
 ## Available Datasets
+- [advanced_geometry](#advanced_geometry)
 - [base_conversion](#base_conversion)
 - [basic_arithmetic](#basic_arithmetic)
 - [bf](#bf)
@@ -30,6 +31,7 @@ This gallery shows examples from all available datasets using their default conf
 - [rubiks_cube](#rubiks_cube)
 - [sentence_reordering](#sentence_reordering)
 - [simple_equations](#simple_equations)
+- [simple_geometry](#simple_geometry)
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
 - [syllogism](#syllogism)
@@ -40,6 +42,37 @@ This gallery shows examples from all available datasets using their default conf
 - [word_sorting](#word_sorting)
 
 ## Dataset Examples
+### advanced_geometry
+A dataset for advanced geometry tasks using coordinate geometry.
+
+Default configuration:
+```python
+min_coord = -10
+max_coord = 10
+size = 50
+seed = 42
+task_types = ['orthocenter', 'incircle_radius', 'angle_measure']
+```
+
+Example tasks:
+````
+Example 1:
+Question: In triangle ABC with coordinates A=(-7, -10), B=(-2, -3), and C=(-3, -6), find the measure (in degrees) of angle ABC.
+Answer: 17.10°
+Metadata: {'A': (-7, -10), 'B': (-2, -3), 'C': (-3, -6), 'angle_ABC_degrees': 17.10272896905237}
+
+Example 2:
+Question: For triangle with vertices A=(-1, -6), B=(4, 1), and C=(-7, 4), determine the orthocenter (intersection of altitudes).
+Answer: (0.304, -1.217)
+Metadata: {'A': (-1, -6), 'B': (4, 1), 'C': (-7, 4), 'orthocenter_exact': ('7/23', '-28/23'), 'orthocenter_approx': (0.30434782608695654, -1.2173913043478262)}
+
+Example 3:
+Question: Find the incircle radius of triangle ABC whose vertices are A=(6, 7), B=(-7, -5), and C=(2, -3).
+Answer: 2.176
+Metadata: {'A': (6, 7), 'B': (-7, -5), 'C': (2, -3), 'incircle_radius_exact': 'sqrt(-sqrt(29) + sqrt(85)/2 + sqrt(313)/2)*sqrt(-sqrt(313)/2 + sqrt(85)/2 + sqrt(29))*sqrt(-sqrt(85)/2 + sqrt(29) + sqrt(313)/2)/sqrt(sqrt(85)/2 + sqrt(29) + sqrt(313)/2)', 'incircle_radius_approx': 2.176123777286009}
+
+````
+
 ### base_conversion
 Generates base conversion tasks
 
@@ -1289,6 +1322,43 @@ Metadata: {'equation': '29*n - 5 = 430', 'variable': 'n'}
 
 ````
 
+### simple_geometry
+A dataset for simple polygon angle-finding tasks.
+    We randomly choose the number of sides N within [min_sides, max_sides].
+    We then generate (N-1) random angles (in degrees), ensuring their sum is
+    strictly less than the total sum for an (N)-sided convex polygon (which is 180*(N-2)).
+    The question asks for the missing angle; the answer is computed by subtracting the
+    sum of known angles from 180*(N-2).
+
+Default configuration:
+```python
+min_sides = 3
+max_sides = 6
+min_angle = 10
+max_angle = 170
+seed = 42
+size = 100
+```
+
+Example tasks:
+````
+Example 1:
+Question: Given a convex polygon with 3 sides, its first 2 interior angles are: 16.0°, 80.0°. What is the measure of the remaining interior angle (in degrees)?
+Answer: 84
+Metadata: {'n_sides': 3, 'known_angles': [16.0, 80.0], 'sum_of_known_angles': 96.0, 'missing_angle_raw': 84.0, 'missing_angle_rounded': 84, 'total_interior_sum': 180}
+
+Example 2:
+Question: A convex polygon has 3 sides. The measures of the first 2 interior angles are: 83.0°, 46.0°. Find the measure of the last interior angle.
+Answer: 51
+Metadata: {'n_sides': 3, 'known_angles': [83.0, 46.0], 'sum_of_known_angles': 129.0, 'missing_angle_raw': 51.0, 'missing_angle_rounded': 51, 'total_interior_sum': 180}
+
+Example 3:
+Question: Given a convex polygon with 6 sides, its first 5 interior angles are: 143.0°, 148.0°, 39.0°, 55.0°, 107.0°. What is the measure of the remaining interior angle (in degrees)?
+Answer: 228
+Metadata: {'n_sides': 6, 'known_angles': [143.0, 148.0, 39.0, 55.0, 107.0], 'sum_of_known_angles': 492.0, 'missing_angle_raw': 228.0, 'missing_angle_rounded': 228, 'total_interior_sum': 720}
+
+````
+
 ### spell_backward
 Generates tasks to spell words backward
 
diff --git a/reasoning_gym/__init__.py b/reasoning_gym/__init__.py
index b25eb134..af0c228e 100644
--- a/reasoning_gym/__init__.py
+++ b/reasoning_gym/__init__.py
@@ -2,18 +2,18 @@
 Reasoning Gym - A library of procedural dataset generators for training reasoning models
 """
 
-from . import algebra, algorithmic, arithmetic, cognition, data, games, graphs, logic
+from . import algebra, algorithmic, arithmetic, cognition, data, games, geometry, graphs, logic
 from .factory import create_dataset, register_dataset
 
 __version__ = "0.1.1"
 __all__ = [
-    "arithmetic",
-    "algorithmic",
     "algebra",
+    "algorithmic",
+    "arithmetic",
     "cognition",
     "data",
     "games",
-    "graphs",
+    "geometry" "graphs",
     "logic",
     "create_dataset",
     "register_dataset",

From bb620a106677bc14daf6f5e0b3bfa501709f86e7 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@aleph-alpha.com>
Date: Sat, 1 Feb 2025 17:06:03 +0100
Subject: [PATCH 45/94] fix main __init__.py

---
 reasoning_gym/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/reasoning_gym/__init__.py b/reasoning_gym/__init__.py
index af0c228e..49068294 100644
--- a/reasoning_gym/__init__.py
+++ b/reasoning_gym/__init__.py
@@ -13,7 +13,8 @@ __all__ = [
     "cognition",
     "data",
     "games",
-    "geometry" "graphs",
+    "geometry",
+    "graphs",
     "logic",
     "create_dataset",
     "register_dataset",

From 86525f6401c99fc241aa4484ce5448b1654c83e6 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 1 Feb 2025 11:37:50 -0800
Subject: [PATCH 46/94] [aiw] basic version of alice-in-wonderland procedural
 dataset

---
 reasoning_gym/logic/aiw.py | 183 +++++++++++++++++++++++++++++++++++++
 tests/test_aiw.py          | 107 ++++++++++++++++++++++
 2 files changed, 290 insertions(+)
 create mode 100644 reasoning_gym/logic/aiw.py
 create mode 100644 tests/test_aiw.py

diff --git a/reasoning_gym/logic/aiw.py b/reasoning_gym/logic/aiw.py
new file mode 100644
index 00000000..bee47a89
--- /dev/null
+++ b/reasoning_gym/logic/aiw.py
@@ -0,0 +1,183 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+from enum import Enum
+from random import Random
+from string import Template
+
+from ..factory import ProceduralDataset, register_dataset
+
+
+class TaskType(Enum):
+    """Defines the type of task for the Alice in Wonderland dataset."""
+    SIBLINGS = "siblings"
+    FRIENDS = "friends"
+
+
+class OutputFormat(Enum):
+    """Defines the output format for the generated questions."""
+    PLAIN = "plain"
+    RESTRICTED = "restricted"
+    THINKING = "thinking"
+
+
+@dataclass
+class AliceInWonderlandConfig:
+    """Configuration options for the Alice in Wonderland dataset.
+
+    Attributes:
+        male_names (List[str]): List of male names to use in questions.
+        female_names (List[str]): List of female names to use in questions. Must include 'Alice'.
+        task_types (List[TaskType]): List of task types to include in dataset.
+        output_formats (List[OutputFormat]): List of output formats to include in dataset.
+        seed (Optional[int]): Seed for random number generation.
+        size (int): Number of samples in the dataset.
+        max_entities (int): Max number of siblings/friends in questions.
+    """
+    male_names: List[str] = field(
+        default_factory=lambda: [
+            "James", "John", "Robert", "Michael", "William", "David",
+            "Richard", "Joseph", "Thomas", "Charles", "Bob"
+        ]
+    )
+    female_names: List[str] = field(
+        default_factory=lambda: [
+            "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth",
+            "Barbara", "Susan", "Jessica", "Sarah", "Margaret", "Alice"
+        ]
+    )
+    task_types: List[TaskType] = field(
+        default_factory=lambda: [TaskType.SIBLINGS, TaskType.FRIENDS]
+    )
+    output_formats: List[OutputFormat] = field(
+        default_factory=lambda: [
+            OutputFormat.PLAIN,
+            OutputFormat.RESTRICTED,
+            OutputFormat.THINKING,
+        ]
+    )
+    seed: Optional[int] = None
+    size: int = 10
+    max_entities: int = 6  # Added max_entities
+
+    def validate(self) -> None:
+        """Validates the configuration parameters."""
+        assert len(self.male_names) > 0, "must provide male names"
+        assert len(self.female_names) > 0, "must provide female names"
+        assert "Alice" in self.female_names, "'Alice' must be in female names"
+        assert len(self.task_types) > 0, "must provide at least one task type"
+        assert len(
+            self.output_formats) > 0, "must provide at least one output format"
+        assert self.max_entities > 0, "max_entities must be positive"
+
+
+class AliceInWonderlandDataset(ProceduralDataset):
+    """
+     A procedural dataset inspired by the "Alice in Wonderland" paper.
+
+     The dataset is inspired by the following paper:
+        @inproceedings{nezhurina2024alice,
+        title={Alice in Wonderland: Simple Tasks Reveal Severe Generalization and
+               Basic Reasoning Deficits in State-Of-the-Art Large Language Models},
+        author={Marianna Nezhurina and Lucia Cipolina-Kun and Mehdi Cherti and
+               Jenia Jitsev},
+        booktitle={NeurIPS 2024 Workshop on Scientific Methods for Understanding
+                   Deep Learning},
+        year={2024},
+        url={https://openreview.net/forum?id=Mkl7dzjYiW}
+        }
+
+    """
+    def __init__(self, config: AliceInWonderlandConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self.templates = {
+            TaskType.SIBLINGS: [
+                Template(
+                    "$female_name has $num_brothers brothers and she also has "
+                    "$num_sisters sisters. How many sisters does "
+                    "$female_name's brother have?"
+                ),
+                Template(
+                    "$female_name has $num_sisters sisters and she also has "
+                    "$num_brothers brothers. How many sisters does "
+                    "$male_name's brother have?"
+                ),
+            ],
+            TaskType.FRIENDS: [
+                Template(
+                    "$female_name has $num_male male friends and she also has "
+                    "$num_female female friends. They all are friends with each "
+                    "other and have no other friends aside. How many female "
+                    "friends does $male_name, a male friend of $female_name, "
+                    "have?"
+                )
+            ],
+        }
+
+        self.format_templates = {
+            OutputFormat.PLAIN: Template("$question"),
+            OutputFormat.RESTRICTED: Template(
+                "$question To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT "
+                'following format that contains final answer: "### Answer:"'
+            ),
+            OutputFormat.THINKING: Template(
+                "$question Before providing answer to this problem, think "
+                "carefully step by step and double check the path to the "
+                'correct solution for any mistakes. Provide then the final '
+                'answer in following form: "### Answer:"'
+            ),
+        }
+
+    def _get_aiw(self, rng: Random) -> dict:
+        """Generates a single Alice in Wonderland question.
+
+        Args:
+            rng (Random): Random number generator.
+
+        Returns:
+            dict: A dictionary containing the generated question, the right answer
+                and a description of the example.
+        """
+        task_type = rng.choice(self.config.task_types)
+        output_format = rng.choice(self.config.output_formats)
+        female_name = rng.choice(self.config.female_names)
+        male_name = rng.choice(self.config.male_names)
+
+        if task_type == TaskType.SIBLINGS:
+            num_brothers = rng.randint(1, self.config.max_entities)
+            num_sisters = rng.randint(1, self.config.max_entities)
+            answer = num_sisters + 1
+            template = rng.choice(self.templates[TaskType.SIBLINGS])
+            question = template.substitute(
+                female_name=female_name,
+                male_name=male_name,
+                num_brothers=num_brothers,
+                num_sisters=num_sisters,
+            )
+        elif task_type == TaskType.FRIENDS:
+            num_male = rng.randint(1, self.config.max_entities)
+            num_female = rng.randint(1, self.config.max_entities)
+            answer = num_female + 1
+            template = rng.choice(self.templates[TaskType.FRIENDS])
+            question = template.substitute(
+                female_name=female_name,
+                male_name=male_name,
+                num_male=num_male,
+                num_female=num_female,
+            )
+
+        formatted_question = self.format_templates[output_format].substitute(
+            question=question
+        )
+
+        return {
+            "prompt": formatted_question,
+            "right_answer": str(answer),
+            "description": f"{task_type.value} variation, {output_format.value} format",
+        }
+
+    def __getitem__(self, idx: int) -> dict:
+        rng = Random(self.seed + idx)
+        return self._get_aiw(rng)
+
+
+register_dataset("aiw", AliceInWonderlandDataset, AliceInWonderlandConfig)
\ No newline at end of file
diff --git a/tests/test_aiw.py b/tests/test_aiw.py
new file mode 100644
index 00000000..5a0fbaf5
--- /dev/null
+++ b/tests/test_aiw.py
@@ -0,0 +1,107 @@
+import pytest
+
+from reasoning_gym.logic.aiw import AliceInWonderlandConfig, AliceInWonderlandDataset, TaskType, OutputFormat
+
+def test_aiw_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = AliceInWonderlandConfig(male_names=[])  # Empty male names
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = AliceInWonderlandConfig(female_names=[])  # Empty female names
+        config.validate()
+    
+    with pytest.raises(AssertionError):
+        config = AliceInWonderlandConfig(female_names=["Mary", "Jane"])  # No Alice
+        config.validate()
+    
+    with pytest.raises(AssertionError):
+        config = AliceInWonderlandConfig(task_types=[])  # No task types
+        config.validate()
+    
+    with pytest.raises(AssertionError):
+        config = AliceInWonderlandConfig(output_formats=[])  # No output formats
+        config.validate()
+
+def test_aiw_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = AliceInWonderlandConfig(seed=42, size=10)
+    dataset1 = AliceInWonderlandDataset(config)
+    dataset2 = AliceInWonderlandDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+def test_aiw_items():
+    """Test basic properties of generated items"""
+    config = AliceInWonderlandConfig(size=50, seed=42)
+    dataset = AliceInWonderlandDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert "prompt" in item
+        assert "right_answer" in item
+        assert "description" in item
+        
+        # Verify answer is numeric and positive
+        answer = int(item["right_answer"])
+        assert answer > 0
+        
+        # Verify question contains at least one female name
+        female_names = config.female_names
+        assert any(name in item["prompt"] for name in female_names)
+
+        # Verify question format
+        if TaskType.SIBLINGS.value in item["description"]:
+            assert any(phrase in item["prompt"] for phrase in ["brothers", "sisters"])
+        elif TaskType.FRIENDS.value in item["description"]:
+            assert "friends" in item["prompt"]
+            
+        # Verify output format
+        if OutputFormat.RESTRICTED.value in item["description"]:
+            assert "DO NOT OUTPUT ANY TEXT EXCEPT" in item["prompt"]
+        elif OutputFormat.THINKING.value in item["description"]:
+            assert "think carefully step by step" in item["prompt"]
+
+def test_aiw_iteration():
+    """Test that iteration works correctly"""
+    config = AliceInWonderlandConfig(size=5, seed=42)
+    dataset = AliceInWonderlandDataset(config)
+
+    # Test manual iteration
+    items = []
+    for item in dataset:
+        items.append(item)
+    assert len(items) == config.size
+
+    # Test list conversion
+    items = list(dataset)
+    assert len(items) == config.size
+
+    # Test multiple iterations yield same results
+    first_items = list(dataset)
+    second_items = list(dataset)
+    assert first_items == second_items
+
+def test_aiw_random_ranges():
+    """Test that generated numbers stay within expected ranges"""
+    config = AliceInWonderlandConfig(size=30, seed=42, max_entities=12)
+    dataset = AliceInWonderlandDataset(config)
+
+    for item in dataset:
+        prompt = item["prompt"]
+        numbers = [int(n) for n in prompt.split() if n.isdigit()]
+        
+        # Check all numbers are in reasonable range (1-6 as per implementation)
+        assert all(1 <= n <= 12 for n in numbers), f"Numbers out of range: {numbers}"
+
+def test_output_format_is_correct():
+    """Test that the output format adheres to the user input"""
+    config = AliceInWonderlandConfig(size=30, seed=42, output_formats=[OutputFormat.THINKING])
+    dataset = AliceInWonderlandDataset(config)
+
+    for item in dataset:
+        prompt = item["prompt"]
+        assert "think carefully step by step" in item["prompt"]

From 57a1b5c3538054bac7def5639c5fc07c496797f0 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 1 Feb 2025 12:04:44 -0800
Subject: [PATCH 47/94] [aiw] add colleague variation

---
 reasoning_gym/logic/aiw.py | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/reasoning_gym/logic/aiw.py b/reasoning_gym/logic/aiw.py
index bee47a89..816b5ae3 100644
--- a/reasoning_gym/logic/aiw.py
+++ b/reasoning_gym/logic/aiw.py
@@ -11,6 +11,7 @@ class TaskType(Enum):
     """Defines the type of task for the Alice in Wonderland dataset."""
     SIBLINGS = "siblings"
     FRIENDS = "friends"
+    COLLEAGUES = "colleagues"  # Added colleagues task
 
 
 class OutputFormat(Enum):
@@ -31,7 +32,7 @@ class AliceInWonderlandConfig:
         output_formats (List[OutputFormat]): List of output formats to include in dataset.
         seed (Optional[int]): Seed for random number generation.
         size (int): Number of samples in the dataset.
-        max_entities (int): Max number of siblings/friends in questions.
+        max_entities (int): Max number of siblings/friends/colleagues in questions.
     """
     male_names: List[str] = field(
         default_factory=lambda: [
@@ -46,7 +47,7 @@ class AliceInWonderlandConfig:
         ]
     )
     task_types: List[TaskType] = field(
-        default_factory=lambda: [TaskType.SIBLINGS, TaskType.FRIENDS]
+        default_factory=lambda: [TaskType.SIBLINGS, TaskType.FRIENDS, TaskType.COLLEAGUES]  # Added Colleagues
     )
     output_formats: List[OutputFormat] = field(
         default_factory=lambda: [
@@ -111,6 +112,20 @@ class AliceInWonderlandDataset(ProceduralDataset):
                     "have?"
                 )
             ],
+            TaskType.COLLEAGUES: [  # New colleagues templates
+                Template(
+                    "$female_name has $num_male_colleagues_alice_circle male colleagues and she also has "
+                    "$num_female_colleagues_alice_circle female colleagues. These are all colleagues that $female_name has. "
+                    "All these mentioned persons around $female_name are colleagues of each other. "
+                    "$male_name has $num_male_colleagues_bob_circle male colleagues "
+					"and $num_female_colleagues_bob_circle female colleagues in total. "
+                    "All these mentioned persons around $male_name are colleagues of each other. "
+					"The people in the circle around $male_name do not have "
+					"other colleagues aside - with the only exception of Matilda. "
+                    "She is colleague of $male_name and she is also colleague of $female_name, "
+					"being part of $female_name's circle. How many female colleagues does Matilda have?"
+                ),
+            ],
         }
 
         self.format_templates = {
@@ -145,6 +160,7 @@ class AliceInWonderlandDataset(ProceduralDataset):
         if task_type == TaskType.SIBLINGS:
             num_brothers = rng.randint(1, self.config.max_entities)
             num_sisters = rng.randint(1, self.config.max_entities)
+
             answer = num_sisters + 1
             template = rng.choice(self.templates[TaskType.SIBLINGS])
             question = template.substitute(
@@ -156,6 +172,7 @@ class AliceInWonderlandDataset(ProceduralDataset):
         elif task_type == TaskType.FRIENDS:
             num_male = rng.randint(1, self.config.max_entities)
             num_female = rng.randint(1, self.config.max_entities)
+
             answer = num_female + 1
             template = rng.choice(self.templates[TaskType.FRIENDS])
             question = template.substitute(
@@ -164,6 +181,22 @@ class AliceInWonderlandDataset(ProceduralDataset):
                 num_male=num_male,
                 num_female=num_female,
             )
+        elif task_type == TaskType.COLLEAGUES:
+            num_male_colleagues_alice_circle = rng.randint(1, self.config.max_entities)
+            num_female_colleagues_alice_circle = rng.randint(1, self.config.max_entities)
+            num_male_colleagues_bob_circle = rng.randint(1, self.config.max_entities)
+            num_female_colleagues_bob_circle = rng.randint(1, self.config.max_entities)
+
+            answer = num_female_colleagues_alice_circle + 1
+            template = rng.choice(self.templates[TaskType.COLLEAGUES])
+            question = template.substitute(
+                female_name=female_name,
+                male_name=male_name,
+                num_male_colleagues_alice_circle=num_male_colleagues_alice_circle,
+                num_female_colleagues_alice_circle=num_female_colleagues_alice_circle,
+                num_male_colleagues_bob_circle=num_male_colleagues_bob_circle,
+                num_female_colleagues_bob_circle=num_female_colleagues_bob_circle
+            )
 
         formatted_question = self.format_templates[output_format].substitute(
             question=question

From e671b97ab4d2bfc18b5f50d4a87a224ef7717ff5 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sat, 1 Feb 2025 21:19:27 +0000
Subject: [PATCH 48/94] first bits of veRL example

---
 examples/veRL/.gitignore                |   3 +
 examples/veRL/config/ppo_trainer.yaml   | 167 +++++++++++
 examples/veRL/launch_on_4gpu.sh         |   9 +
 examples/veRL/main_ppo_custom_reward.py | 353 ++++++++++++++++++++++++
 examples/veRL/train.sh                  |  30 ++
 5 files changed, 562 insertions(+)
 create mode 100644 examples/veRL/.gitignore
 create mode 100644 examples/veRL/config/ppo_trainer.yaml
 create mode 100755 examples/veRL/launch_on_4gpu.sh
 create mode 100644 examples/veRL/main_ppo_custom_reward.py
 create mode 100755 examples/veRL/train.sh

diff --git a/examples/veRL/.gitignore b/examples/veRL/.gitignore
new file mode 100644
index 00000000..c54a47c0
--- /dev/null
+++ b/examples/veRL/.gitignore
@@ -0,0 +1,3 @@
+outputs/
+wandb/
+verl_output.log
diff --git a/examples/veRL/config/ppo_trainer.yaml b/examples/veRL/config/ppo_trainer.yaml
new file mode 100644
index 00000000..b294a7cb
--- /dev/null
+++ b/examples/veRL/config/ppo_trainer.yaml
@@ -0,0 +1,167 @@
+data:
+  tokenizer: null
+  train_files: ~/data/rlhf/gsm8k/train.parquet
+  val_files: ~/data/rlhf/gsm8k/test.parquet
+  prompt_key: prompt
+  max_prompt_length: 512
+  max_response_length: 512
+  train_batch_size: 1024
+  val_batch_size: 1312
+  return_raw_input_ids: False  # This should be set to true when the tokenizer between policy and rm differs
+  return_raw_chat: False
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+  actor:
+    strategy: fsdp  # This is for backward-compatibility
+    ppo_mini_batch_size: 256
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: null
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: False # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+      min_lr_ratio: null   # only useful for warmup with cosine
+      warmup_style: constant  # select from constant/cosine
+      total_training_steps: -1  # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      grad_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length}  # not use for opensource
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.5
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 8192
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    # number of responses (i.e. num sample times)
+    n: 1 # > 1 for grpo
+
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+    min_lr_ratio: null   # only useful for warmup with cosine
+    warmup_style: constant  # select from constant/cosine
+    total_training_steps: -1  # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      grad_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}  # set this to null if the chat template is identical
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
+  micro_batch_size_per_gpu: null # set a number
+  max_length: null
+  ulysses_sequence_parallel_size: 1 # sp size
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: gae
+  kl_penalty: kl  # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+
+trainer:
+  total_epochs: 30
+  total_training_steps: null
+  project_name: verl_examples
+  experiment_name: gsm8k
+  logger: [ 'console', 'wandb' ]
+  nnodes: 1
+  n_gpus_per_node: 8
+  save_freq: -1
+  test_freq: -1
+  critic_warmup: 0
+  default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
diff --git a/examples/veRL/launch_on_4gpu.sh b/examples/veRL/launch_on_4gpu.sh
new file mode 100755
index 00000000..0a51f68c
--- /dev/null
+++ b/examples/veRL/launch_on_4gpu.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+export N_GPUS=4
+export BASE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ROLLOUT_TP_SIZE=2
+export EXPERIMENT_NAME=chain_sum_llama
+export VLLM_ATTENTION_BACKEND=XFORMERS
+
+bash ./train.sh
diff --git a/examples/veRL/main_ppo_custom_reward.py b/examples/veRL/main_ppo_custom_reward.py
new file mode 100644
index 00000000..ef28ecf5
--- /dev/null
+++ b/examples/veRL/main_ppo_custom_reward.py
@@ -0,0 +1,353 @@
+# This example is a modified version of:
+# https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/verl/trainer/main_ppo.py
+
+
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
+"""
+
+from typing import Optional
+from omegaconf import OmegaConf, open_dict
+import reasoning_gym
+from reasoning_gym.utils import extract_answer
+
+import reasoning_gym.utils
+from verl import DataProto
+import torch
+from torch.utils.data import Dataset, DataLoader
+from transformers import PreTrainedTokenizer
+
+import ray
+import hydra
+
+
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl.utils.model import compute_position_id_with_mask
+from verl.utils.dataset.rl_dataset import collate_fn
+import verl.utils.torch_functional as verl_F
+
+
+class RewardManager:
+    """The reward manager."""
+
+    def __init__(self, tokenizer, num_examine, compute_score) -> None:
+        self.tokenizer = tokenizer
+        self.num_examine = num_examine  # the number of batches of decoded responses to print to the console
+        self.compute_score = compute_score
+
+    def __call__(self, data: DataProto):
+        """We will expand this function gradually based on the available datasets"""
+
+        # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn
+        if "rm_scores" in data.batch.keys():
+            return data.batch["rm_scores"]
+
+        reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32)
+
+        already_print_data_sources = {}
+
+        for i in range(len(data)):
+            data_item = data[i]  # DataProtoItem
+
+            prompt_ids = data_item.batch["prompts"]
+
+            prompt_length = prompt_ids.shape[-1]
+
+            valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum()
+            valid_prompt_ids = prompt_ids[-valid_prompt_length:]
+
+            response_ids = data_item.batch["responses"]
+            valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum()
+            valid_response_ids = response_ids[:valid_response_length]
+
+            # decode
+            sequences = torch.cat((valid_prompt_ids, valid_response_ids))
+            sequences_str = self.tokenizer.decode(sequences)
+
+            data_source = data_item.non_tensor_batch["data_source"]
+            ground_truth = data_item.non_tensor_batch["answer"]
+            index = data_item.non_tensor_batch["index"]
+
+            score = self.compute_score(
+                data_source=data_source,
+                solution_str=sequences_str,
+                ground_truth=ground_truth,
+                index=index,
+            )
+            reward_tensor[i, valid_response_length - 1] = score
+
+            if data_source not in already_print_data_sources:
+                already_print_data_sources[data_source] = 0
+
+            if already_print_data_sources[data_source] < self.num_examine:
+                already_print_data_sources[data_source] += 1
+                print(sequences_str)
+
+        return reward_tensor
+
+
+@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None)
+def main(config):
+    if not ray.is_initialized():
+        # this is for local ray cluster
+        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
+
+    ray.get(main_task.remote(config))
+
+
+class ReasoningGymDataset(Dataset):
+    def __init__(
+        self,
+        dataset_name: str,
+        tokenizer: PreTrainedTokenizer,
+        seed: int,
+        size: int,
+        developer_prompt: Optional[str] = None,
+        developer_role: str = "system",
+        max_prompt_length: int = 2048,
+        truncation: str = "error",  ##  ['left', 'right', 'error']
+        return_raw_chat: bool = False,
+    ):
+        self.tokenizer = tokenizer
+        self.dataset_name = dataset_name
+        self.data = reasoning_gym.create_dataset(dataset_name, seed=seed, size=size)
+        self.developer_prompt = developer_prompt
+        self.developer_role = developer_role
+        self.max_prompt_length = max_prompt_length
+        self.truncation = truncation
+        self.return_raw_chat = return_raw_chat
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def __getitem__(self, index):
+        """Return entry `index` extended with the tokenized chat prompt, its data source and its index."""
+        row_dict = self.data[index].copy()
+        q = row_dict["question"]
+        chat = []
+        if self.developer_prompt is not None:
+            chat.append({"role": self.developer_role, "content": self.developer_prompt})
+        chat.append({"role": "user", "content": q})
+
+        prompt = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+
+        input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(
+            prompt=prompt,
+            tokenizer=self.tokenizer,
+            max_length=self.max_prompt_length,
+            pad_token_id=self.tokenizer.pad_token_id,
+            left_pad=True,
+            truncation=self.truncation,
+        )
+
+        position_ids = compute_position_id_with_mask(attention_mask)
+
+        row_dict["data_source"] = "reasoning_gym/" + self.dataset_name
+        row_dict["input_ids"] = input_ids[0]
+        row_dict["attention_mask"] = attention_mask[0]
+        row_dict["position_ids"] = position_ids[0]
+
+        # optionally expose the untemplated chat messages; `chat` is a plain list, so it has no `.tolist()`
+        if self.return_raw_chat:
+            row_dict["raw_prompt"] = chat
+
+        # carry the dataset index along with the row; the reward code reads it back
+        # from the batch to look up the source entry when scoring the response
+        row_dict["index"] = index
+
+        return row_dict
+
+
+class RayPPOTrainerCustom(RayPPOTrainer):
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict,
+        resource_pool_manager,
+        ray_worker_group_cls,
+        reward_fn=None,
+        val_reward_fn=None,
+        dataset_name: str = "chain_sum",
+        dataset_size: int = 10000,
+    ):
+        self.dataset_name = dataset_name
+        self.dataset_size = dataset_size
+
+        developer_prompt = reasoning_gym.utils.SYSTEM_PROMPTS["DeepSeekZero"]
+        self.train_dataset = ReasoningGymDataset(
+            dataset_name=self.dataset_name,
+            tokenizer=tokenizer,
+            seed=1,
+            size=self.dataset_size,
+            developer_prompt=developer_prompt,
+        )
+
+        self.val_dataset = ReasoningGymDataset(
+            dataset_name=self.dataset_name,
+            tokenizer=tokenizer,
+            seed=2,
+            size=self.dataset_size,
+            developer_prompt=developer_prompt,
+        )
+
+        reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0, compute_score=self._compute_score)
+
+        super().__init__(
+            config,
+            tokenizer,
+            role_worker_mapping,
+            resource_pool_manager,
+            ray_worker_group_cls,
+            reward_fn,
+            val_reward_fn,
+        )
+
+    def _compute_score(self, data_source, solution_str, ground_truth, index) -> float:
+        print("Solution:", solution_str, ground_truth, index, data_source)
+        found_answer = extract_answer(solution_str, tag_name="answer")
+        entry = self.train_dataset.data[index]
+        return self.train_dataset.data.score_answer(found_answer, entry=entry)
+
+    def _create_dataloader(self):
+        self.train_dataloader = DataLoader(
+            dataset=self.train_dataset,
+            batch_size=self.config.data.train_batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=collate_fn,
+        )
+
+        self.val_dataloader = DataLoader(
+            dataset=self.val_dataset,
+            batch_size=len(self.val_dataset),
+            shuffle=True,
+            drop_last=True,
+            collate_fn=collate_fn,
+        )
+
+        assert len(self.train_dataloader) >= 1
+        assert len(self.val_dataloader) >= 1
+
+        print(f"Size of train dataloader: {len(self.train_dataloader)}")
+        print(f"Size of val dataloader: {len(self.val_dataloader)}")
+
+        # inject total_training_steps to actor/critic optim_config. This is hacky.
+        total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+
+        if self.config.trainer.total_training_steps is not None:
+            total_training_steps = self.config.trainer.total_training_steps
+
+        self.total_training_steps = total_training_steps
+        print(f"Total training steps: {self.total_training_steps}")
+
+        OmegaConf.set_struct(self.config, True)
+        with open_dict(self.config):
+            self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
+            self.config.critic.optim.total_training_steps = total_training_steps
+
+
+@ray.remote
+def main_task(config, compute_score=None):
+    from verl.utils.fs import copy_local_path_from_hdfs
+    from transformers import AutoTokenizer
+
+    # print initial config
+    from pprint import pprint
+    from omegaconf import OmegaConf
+
+    pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
+    OmegaConf.resolve(config)
+
+    # download the checkpoint from hdfs
+    local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
+
+    # instantiate tokenizer
+    from verl.utils import hf_tokenizer
+
+    tokenizer = hf_tokenizer(local_path)
+
+    # define worker classes
+    if config.actor_rollout_ref.actor.strategy == "fsdp":
+        assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+        from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
+        from verl.single_controller.ray import RayWorkerGroup
+
+        ray_worker_group_cls = RayWorkerGroup
+
+    elif config.actor_rollout_ref.actor.strategy == "megatron":
+        assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+        from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
+        from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+
+        ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+    else:
+        raise NotImplementedError
+
+    from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+    role_worker_mapping = {
+        Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
+        Role.Critic: ray.remote(CriticWorker),
+        Role.RefPolicy: ray.remote(ActorRolloutRefWorker),
+    }
+
+    global_pool_id = "global_pool"
+    resource_pool_spec = {
+        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+    }
+    mapping = {
+        Role.ActorRollout: global_pool_id,
+        Role.Critic: global_pool_id,
+        Role.RefPolicy: global_pool_id,
+    }
+
+    # we should adopt a multi-source reward function here
+    # - for rule-based rm, we directly call a reward score
+    # - for model-based rm, we call a model
+    # - for code related prompt, we send to a sandbox if there are test cases
+    # - finally, we combine all the rewards together
+    # - The reward type depends on the tag of the data
+    if config.reward_model.enable:
+        if config.reward_model.strategy == "fsdp":
+            from verl.workers.fsdp_workers import RewardModelWorker
+        elif config.reward_model.strategy == "megatron":
+            from verl.workers.megatron_workers import RewardModelWorker
+        else:
+            raise NotImplementedError
+        role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+        mapping[Role.RewardModel] = global_pool_id
+
+    # Note that we always use function-based RM for validation
+    val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1, compute_score=compute_score)
+
+    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+    trainer = RayPPOTrainerCustom(
+        config=config,
+        tokenizer=tokenizer,
+        role_worker_mapping=role_worker_mapping,
+        resource_pool_manager=resource_pool_manager,
+        ray_worker_group_cls=ray_worker_group_cls,
+        val_reward_fn=val_reward_fn,
+    )
+    trainer.init_workers()
+    trainer.fit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/veRL/train.sh b/examples/veRL/train.sh
new file mode 100755
index 00000000..92ed0b84
--- /dev/null
+++ b/examples/veRL/train.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+python3 -u main_ppo_custom_reward.py \
+data.train_files=$DATA_DIR/train.parquet \
+data.val_files=$DATA_DIR/test.parquet \
+data.train_batch_size=256 \
+data.val_batch_size=1312 \
+data.max_prompt_length=256 \
+data.max_response_length=1024 \
+actor_rollout_ref.model.path=$BASE_MODEL \
+actor_rollout_ref.actor.optim.lr=1e-6 \
+actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+actor_rollout_ref.actor.ppo_micro_batch_size=8 \
+actor_rollout_ref.rollout.log_prob_micro_batch_size=8 \
+actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \
+actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+actor_rollout_ref.ref.log_prob_micro_batch_size=4 \
+critic.optim.lr=1e-5 \
+critic.model.path=$BASE_MODEL \
+critic.ppo_micro_batch_size=8 \
+algorithm.kl_ctrl.kl_coef=0.001 \
+trainer.logger=['wandb'] \
++trainer.val_before_train=False \
+trainer.default_hdfs_dir=null \
+trainer.n_gpus_per_node=$N_GPUS \
+trainer.nnodes=1 \
+trainer.save_freq=100 \
+trainer.test_freq=100 \
+trainer.project_name=verl_chain_sum \
+trainer.experiment_name=$EXPERIMENT_NAME \
+trainer.total_epochs=15 2>&1 | tee verl_output.log

From 3f24df31dcaf681a44d75fc74b1a50a3a2384933 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sat, 1 Feb 2025 21:27:33 +0000
Subject: [PATCH 49/94] add deps for veRL experiment in README

---
 examples/veRL/README.md                 | 19 +++++++++++++
 examples/veRL/main_ppo_custom_reward.py | 38 ++++++++++++-------------
 2 files changed, 37 insertions(+), 20 deletions(-)
 create mode 100644 examples/veRL/README.md

diff --git a/examples/veRL/README.md b/examples/veRL/README.md
new file mode 100644
index 00000000..e2ec6e50
--- /dev/null
+++ b/examples/veRL/README.md
@@ -0,0 +1,19 @@
+### env setup
+
+```
+conda create --name verl python=3.12 -y
+conda activate verl
+
+pip install flash-attn --no-build-isolation
+pip install vllm==0.7.0 ray wandb
+```
+
+### clone and install veRL
+
+tested with verl HEAD a65c9157bc0b85b64cd753de19f94e80a11bd871
+
+```
+git clone https://github.com/volcengine/verl.git
+cd verl
+pip install -e .
+```
diff --git a/examples/veRL/main_ppo_custom_reward.py b/examples/veRL/main_ppo_custom_reward.py
index ef28ecf5..db2d67d7 100644
--- a/examples/veRL/main_ppo_custom_reward.py
+++ b/examples/veRL/main_ppo_custom_reward.py
@@ -20,24 +20,22 @@ Note that we don't combine the main with ray_trainer as ray_trainer is used by o
 """
 
 from typing import Optional
-from omegaconf import OmegaConf, open_dict
-import reasoning_gym
-from reasoning_gym.utils import extract_answer
 
-import reasoning_gym.utils
-from verl import DataProto
-import torch
-from torch.utils.data import Dataset, DataLoader
-from transformers import PreTrainedTokenizer
-
-import ray
 import hydra
-
-
-from verl.trainer.ppo.ray_trainer import RayPPOTrainer
-from verl.utils.model import compute_position_id_with_mask
-from verl.utils.dataset.rl_dataset import collate_fn
+import ray
+import torch
 import verl.utils.torch_functional as verl_F
+from omegaconf import OmegaConf, open_dict
+from torch.utils.data import DataLoader, Dataset
+from transformers import PreTrainedTokenizer
+from verl import DataProto
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+from verl.utils.dataset.rl_dataset import collate_fn
+from verl.utils.model import compute_position_id_with_mask
+
+import reasoning_gym
+import reasoning_gym.utils
+from reasoning_gym.utils import extract_answer
 
 
 class RewardManager:
@@ -262,12 +260,12 @@ class RayPPOTrainerCustom(RayPPOTrainer):
 
 @ray.remote
 def main_task(config, compute_score=None):
-    from verl.utils.fs import copy_local_path_from_hdfs
-    from transformers import AutoTokenizer
-
     # print initial config
     from pprint import pprint
+
     from omegaconf import OmegaConf
+    from transformers import AutoTokenizer
+    from verl.utils.fs import copy_local_path_from_hdfs
 
     pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
     OmegaConf.resolve(config)
@@ -283,15 +281,15 @@ def main_task(config, compute_score=None):
     # define worker classes
     if config.actor_rollout_ref.actor.strategy == "fsdp":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-        from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
         from verl.single_controller.ray import RayWorkerGroup
+        from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
 
         ray_worker_group_cls = RayWorkerGroup
 
     elif config.actor_rollout_ref.actor.strategy == "megatron":
         assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
-        from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
         from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+        from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
 
         ray_worker_group_cls = NVMegatronRayWorkerGroup
 

From 8202f234be14a955a6fc0d8da8d88a0de9d846d5 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sat, 1 Feb 2025 23:56:11 +0000
Subject: [PATCH 50/94] reduce veRL example size

---
 examples/veRL/main_ppo_custom_reward.py | 180 ++++++++----------------
 1 file changed, 57 insertions(+), 123 deletions(-)

diff --git a/examples/veRL/main_ppo_custom_reward.py b/examples/veRL/main_ppo_custom_reward.py
index db2d67d7..2addb8e9 100644
--- a/examples/veRL/main_ppo_custom_reward.py
+++ b/examples/veRL/main_ppo_custom_reward.py
@@ -1,24 +1,5 @@
-# This example is a modified version of:
+# This example is an adapted version of Bytedance's code:
 # https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/verl/trainer/main_ppo.py
-
-
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
-"""
-
 from typing import Optional
 
 import hydra
@@ -38,79 +19,11 @@ import reasoning_gym.utils
 from reasoning_gym.utils import extract_answer
 
 
-class RewardManager:
-    """The reward manager."""
-
-    def __init__(self, tokenizer, num_examine, compute_score) -> None:
-        self.tokenizer = tokenizer
-        self.num_examine = num_examine  # the number of batches of decoded responses to print to the console
-        self.compute_score = compute_score
-
-    def __call__(self, data: DataProto):
-        """We will expand this function gradually based on the available datasets"""
-
-        # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn
-        if "rm_scores" in data.batch.keys():
-            return data.batch["rm_scores"]
-
-        reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32)
-
-        already_print_data_sources = {}
-
-        for i in range(len(data)):
-            data_item = data[i]  # DataProtoItem
-
-            prompt_ids = data_item.batch["prompts"]
-
-            prompt_length = prompt_ids.shape[-1]
-
-            valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum()
-            valid_prompt_ids = prompt_ids[-valid_prompt_length:]
-
-            response_ids = data_item.batch["responses"]
-            valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum()
-            valid_response_ids = response_ids[:valid_response_length]
-
-            # decode
-            sequences = torch.cat((valid_prompt_ids, valid_response_ids))
-            sequences_str = self.tokenizer.decode(sequences)
-
-            data_source = data_item.non_tensor_batch["data_source"]
-            ground_truth = data_item.non_tensor_batch["answer"]
-            index = data_item.non_tensor_batch["index"]
-
-            score = self.compute_score(
-                data_source=data_source,
-                solution_str=sequences_str,
-                ground_truth=ground_truth,
-                index=index,
-            )
-            reward_tensor[i, valid_response_length - 1] = score
-
-            if data_source not in already_print_data_sources:
-                already_print_data_sources[data_source] = 0
-
-            if already_print_data_sources[data_source] < self.num_examine:
-                already_print_data_sources[data_source] += 1
-                print(sequences_str)
-
-        return reward_tensor
-
-
-@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None)
-def main(config):
-    if not ray.is_initialized():
-        # this is for local ray cluster
-        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
-
-    ray.get(main_task.remote(config))
-
-
 class ReasoningGymDataset(Dataset):
     def __init__(
         self,
-        dataset_name: str,
         tokenizer: PreTrainedTokenizer,
+        dataset_name: str,
         seed: int,
         size: int,
         developer_prompt: Optional[str] = None,
@@ -177,8 +90,6 @@ class RayPPOTrainerCustom(RayPPOTrainer):
         role_worker_mapping: dict,
         resource_pool_manager,
         ray_worker_group_cls,
-        reward_fn=None,
-        val_reward_fn=None,
         dataset_name: str = "chain_sum",
         dataset_size: int = 10000,
     ):
@@ -187,22 +98,23 @@ class RayPPOTrainerCustom(RayPPOTrainer):
 
         developer_prompt = reasoning_gym.utils.SYSTEM_PROMPTS["DeepSeekZero"]
         self.train_dataset = ReasoningGymDataset(
-            dataset_name=self.dataset_name,
             tokenizer=tokenizer,
+            dataset_name=self.dataset_name,
             seed=1,
             size=self.dataset_size,
             developer_prompt=developer_prompt,
         )
 
         self.val_dataset = ReasoningGymDataset(
-            dataset_name=self.dataset_name,
             tokenizer=tokenizer,
+            dataset_name=self.dataset_name,
             seed=2,
             size=self.dataset_size,
             developer_prompt=developer_prompt,
         )
 
-        reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0, compute_score=self._compute_score)
+        train_reward_fn = lambda data: self._score_output(data, num_examine=0)
+        val_reward_fn = lambda data: self._score_output(data, num_examine=1)
 
         super().__init__(
             config,
@@ -210,15 +122,51 @@ class RayPPOTrainerCustom(RayPPOTrainer):
             role_worker_mapping,
             resource_pool_manager,
             ray_worker_group_cls,
-            reward_fn,
+            train_reward_fn,
             val_reward_fn,
         )
 
-    def _compute_score(self, data_source, solution_str, ground_truth, index) -> float:
-        print("Solution:", solution_str, ground_truth, index, data_source)
+    def _score_output(self, data: DataProto, num_examine: int = 0) -> torch.Tensor:
+        reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32)
+
+        num_printed = 0
+        for i in range(len(data)):
+            data_item = data[i]  # DataProtoItem
+
+            prompt_ids = data_item.batch["prompts"]  # tokenized prompts
+            prompt_length = prompt_ids.shape[-1]
+
+            valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum()
+            valid_prompt_ids = prompt_ids[-valid_prompt_length:]
+
+            response_ids = data_item.batch["responses"]
+            valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum()
+            valid_response_ids = response_ids[:valid_response_length]
+
+            # decode
+            sequences = torch.cat((valid_prompt_ids, valid_response_ids))
+            sequences_str = self.tokenizer.decode(sequences)
+
+            index = data_item.non_tensor_batch["index"]
+
+            score = self._compute_score(
+                solution_str=sequences_str,
+                index=index,
+            )
+            reward_tensor[i, valid_response_length - 1] = score
+
+            if num_printed < num_examine:
+                print(f"reward={score}, seq={sequences_str}")
+                num_printed += 1
+
+        return reward_tensor
+
+    def _compute_score(self, solution_str: str, index: int) -> float:
         found_answer = extract_answer(solution_str, tag_name="answer")
         entry = self.train_dataset.data[index]
-        return self.train_dataset.data.score_answer(found_answer, entry=entry)
+        reward = self.train_dataset.data.score_answer(found_answer, entry=entry)
+        # print(f"found answer={found_answer}; reward: {reward};")
+        return reward
 
     def _create_dataloader(self):
         self.train_dataloader = DataLoader(
@@ -259,12 +207,11 @@ class RayPPOTrainerCustom(RayPPOTrainer):
 
 
 @ray.remote
-def main_task(config, compute_score=None):
+def main_task(config):
     # print initial config
     from pprint import pprint
 
-    from omegaconf import OmegaConf
-    from transformers import AutoTokenizer
+    from verl.utils import hf_tokenizer
     from verl.utils.fs import copy_local_path_from_hdfs
 
     pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
@@ -274,8 +221,6 @@ def main_task(config, compute_score=None):
     local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
 
     # instantiate tokenizer
-    from verl.utils import hf_tokenizer
-
     tokenizer = hf_tokenizer(local_path)
 
     # define worker classes
@@ -314,25 +259,6 @@ def main_task(config, compute_score=None):
         Role.RefPolicy: global_pool_id,
     }
 
-    # we should adopt a multi-source reward function here
-    # - for rule-based rm, we directly call a reward score
-    # - for model-based rm, we call a model
-    # - for code related prompt, we send to a sandbox if there are test cases
-    # - finally, we combine all the rewards together
-    # - The reward type depends on the tag of the data
-    if config.reward_model.enable:
-        if config.reward_model.strategy == "fsdp":
-            from verl.workers.fsdp_workers import RewardModelWorker
-        elif config.reward_model.strategy == "megatron":
-            from verl.workers.megatron_workers import RewardModelWorker
-        else:
-            raise NotImplementedError
-        role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
-        mapping[Role.RewardModel] = global_pool_id
-
-    # Note that we always use function-based RM for validation
-    val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1, compute_score=compute_score)
-
     resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
 
     trainer = RayPPOTrainerCustom(
@@ -341,11 +267,19 @@ def main_task(config, compute_score=None):
         role_worker_mapping=role_worker_mapping,
         resource_pool_manager=resource_pool_manager,
         ray_worker_group_cls=ray_worker_group_cls,
-        val_reward_fn=val_reward_fn,
     )
     trainer.init_workers()
     trainer.fit()
 
 
+@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None)
+def main(config):
+    if not ray.is_initialized():
+        # this is for local ray cluster
+        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
+
+    ray.get(main_task.remote(config))
+
+
 if __name__ == "__main__":
     main()

From 3d42e84807cc525cc1b21ffb857a69981b6aabc0 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 1 Feb 2025 16:30:05 -0800
Subject: [PATCH 51/94] [aiw] remove output_formats style and change return
 type to a standard format

---
 reasoning_gym/logic/aiw.py | 47 +++++++++++---------------
 tests/test_aiw.py          | 68 +++++++++++++++++---------------------
 2 files changed, 50 insertions(+), 65 deletions(-)

diff --git a/reasoning_gym/logic/aiw.py b/reasoning_gym/logic/aiw.py
index 816b5ae3..2ce3a13b 100644
--- a/reasoning_gym/logic/aiw.py
+++ b/reasoning_gym/logic/aiw.py
@@ -29,7 +29,6 @@ class AliceInWonderlandConfig:
         male_names (List[str]): List of male names to use in questions.
         female_names (List[str]): List of female names to use in questions. Must include 'Alice'.
         task_types (List[TaskType]): List of task types to include in dataset.
-        output_formats (List[OutputFormat]): List of output formats to include in dataset.
         seed (Optional[int]): Seed for random number generation.
         size (int): Number of samples in the dataset.
         max_entities (int): Max number of siblings/friends/colleagues in questions.
@@ -47,14 +46,8 @@ class AliceInWonderlandConfig:
         ]
     )
     task_types: List[TaskType] = field(
-        default_factory=lambda: [TaskType.SIBLINGS, TaskType.FRIENDS, TaskType.COLLEAGUES]  # Added Colleagues
-    )
-    output_formats: List[OutputFormat] = field(
         default_factory=lambda: [
-            OutputFormat.PLAIN,
-            OutputFormat.RESTRICTED,
-            OutputFormat.THINKING,
-        ]
+            TaskType.SIBLINGS, TaskType.FRIENDS, TaskType.COLLEAGUES]  # Added Colleagues
     )
     seed: Optional[int] = None
     size: int = 10
@@ -66,8 +59,6 @@ class AliceInWonderlandConfig:
         assert len(self.female_names) > 0, "must provide female names"
         assert "Alice" in self.female_names, "'Alice' must be in female names"
         assert len(self.task_types) > 0, "must provide at least one task type"
-        assert len(
-            self.output_formats) > 0, "must provide at least one output format"
         assert self.max_entities > 0, "max_entities must be positive"
 
 
@@ -88,6 +79,7 @@ class AliceInWonderlandDataset(ProceduralDataset):
         }
 
     """
+
     def __init__(self, config: AliceInWonderlandConfig):
         super().__init__(config=config, seed=config.seed, size=config.size)
         self.templates = {
@@ -118,12 +110,12 @@ class AliceInWonderlandDataset(ProceduralDataset):
                     "$num_female_colleagues_alice_circle female colleagues. These are all colleagues that $female_name has. "
                     "All these mentioned persons around $female_name are colleagues of each other. "
                     "$male_name has $num_male_colleagues_bob_circle male colleagues "
-					"and $num_female_colleagues_bob_circle female colleagues in total. "
+                    "and $num_female_colleagues_bob_circle female colleagues in total. "
                     "All these mentioned persons around $male_name are colleagues of each other. "
-					"The people in the circle around $male_name do not have "
-					"other colleagues aside - with the only exception of Matilda. "
+                    "The people in the circle around $male_name do not have "
+                    "other colleagues aside - with the only exception of Matilda. "
                     "She is colleague of $male_name and she is also colleague of $female_name, "
-					"being part of $female_name's circle. How many female colleagues does Matilda have?"
+                    "being part of $female_name's circle. How many female colleagues does Matilda have?"
                 ),
             ],
         }
@@ -153,7 +145,6 @@ class AliceInWonderlandDataset(ProceduralDataset):
                 and a description of the example.
         """
         task_type = rng.choice(self.config.task_types)
-        output_format = rng.choice(self.config.output_formats)
         female_name = rng.choice(self.config.female_names)
         male_name = rng.choice(self.config.male_names)
 
@@ -182,10 +173,14 @@ class AliceInWonderlandDataset(ProceduralDataset):
                 num_female=num_female,
             )
         elif task_type == TaskType.COLLEAGUES:
-            num_male_colleagues_alice_circle = rng.randint(1, self.config.max_entities)
-            num_female_colleagues_alice_circle = rng.randint(1, self.config.max_entities)
-            num_male_colleagues_bob_circle = rng.randint(1, self.config.max_entities)
-            num_female_colleagues_bob_circle = rng.randint(1, self.config.max_entities)
+            num_male_colleagues_alice_circle = rng.randint(
+                1, self.config.max_entities)
+            num_female_colleagues_alice_circle = rng.randint(
+                1, self.config.max_entities)
+            num_male_colleagues_bob_circle = rng.randint(
+                1, self.config.max_entities)
+            num_female_colleagues_bob_circle = rng.randint(
+                1, self.config.max_entities)
 
             answer = num_female_colleagues_alice_circle + 1
             template = rng.choice(self.templates[TaskType.COLLEAGUES])
@@ -198,14 +193,12 @@ class AliceInWonderlandDataset(ProceduralDataset):
                 num_female_colleagues_bob_circle=num_female_colleagues_bob_circle
             )
 
-        formatted_question = self.format_templates[output_format].substitute(
-            question=question
-        )
-
         return {
-            "prompt": formatted_question,
-            "right_answer": str(answer),
-            "description": f"{task_type.value} variation, {output_format.value} format",
+            "question": question,
+            "answer": answer,
+            "metadata": {
+                "task_type": task_type.value
+            }
         }
 
     def __getitem__(self, idx: int) -> dict:
@@ -213,4 +206,4 @@ class AliceInWonderlandDataset(ProceduralDataset):
         return self._get_aiw(rng)
 
 
-register_dataset("aiw", AliceInWonderlandDataset, AliceInWonderlandConfig)
\ No newline at end of file
+register_dataset("aiw", AliceInWonderlandDataset, AliceInWonderlandConfig)
diff --git a/tests/test_aiw.py b/tests/test_aiw.py
index 5a0fbaf5..279fcc2c 100644
--- a/tests/test_aiw.py
+++ b/tests/test_aiw.py
@@ -2,6 +2,7 @@ import pytest
 
 from reasoning_gym.logic.aiw import AliceInWonderlandConfig, AliceInWonderlandDataset, TaskType, OutputFormat
 
+
 def test_aiw_config_validation():
     """Test that invalid configs raise appropriate errors"""
     with pytest.raises(AssertionError):
@@ -11,18 +12,16 @@ def test_aiw_config_validation():
     with pytest.raises(AssertionError):
         config = AliceInWonderlandConfig(female_names=[])  # Empty female names
         config.validate()
-    
+
     with pytest.raises(AssertionError):
-        config = AliceInWonderlandConfig(female_names=["Mary", "Jane"])  # No Alice
+        config = AliceInWonderlandConfig(
+            female_names=["Mary", "Jane"])  # No Alice
         config.validate()
-    
+
     with pytest.raises(AssertionError):
         config = AliceInWonderlandConfig(task_types=[])  # No task types
         config.validate()
-    
-    with pytest.raises(AssertionError):
-        config = AliceInWonderlandConfig(output_formats=[])  # No output formats
-        config.validate()
+
 
 def test_aiw_deterministic():
     """Test that dataset generates same items with same seed"""
@@ -33,6 +32,7 @@ def test_aiw_deterministic():
     for i in range(len(dataset1)):
         assert dataset1[i] == dataset2[i]
 
+
 def test_aiw_items():
     """Test basic properties of generated items"""
     config = AliceInWonderlandConfig(size=50, seed=42)
@@ -41,29 +41,28 @@ def test_aiw_items():
     for i in range(len(dataset)):
         item = dataset[i]
         assert isinstance(item, dict)
-        assert "prompt" in item
-        assert "right_answer" in item
-        assert "description" in item
-        
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
         # Verify answer is numeric and positive
-        answer = int(item["right_answer"])
+        answer = int(item["answer"])
         assert answer > 0
-        
+
         # Verify question contains at least one female name
         female_names = config.female_names
-        assert any(name in item["prompt"] for name in female_names)
+        assert any(name in item["question"] for name in female_names)
+
+        # Verify question task type characteristics
+        task_type = item["metadata"]["task_type"]
+        if task_type == TaskType.SIBLINGS.value:
+            assert any(phrase in item["question"]
+                       for phrase in ["brothers", "sisters"])
+        elif task_type == TaskType.FRIENDS.value:
+            assert "friends" in item["question"]
+        elif task_type == TaskType.COLLEAGUES:
+            assert "colleagues" in item["question"]
 
-        # Verify question format
-        if TaskType.SIBLINGS.value in item["description"]:
-            assert any(phrase in item["prompt"] for phrase in ["brothers", "sisters"])
-        elif TaskType.FRIENDS.value in item["description"]:
-            assert "friends" in item["prompt"]
-            
-        # Verify output format
-        if OutputFormat.RESTRICTED.value in item["description"]:
-            assert "DO NOT OUTPUT ANY TEXT EXCEPT" in item["prompt"]
-        elif OutputFormat.THINKING.value in item["description"]:
-            assert "think carefully step by step" in item["prompt"]
 
 def test_aiw_iteration():
     """Test that iteration works correctly"""
@@ -85,23 +84,16 @@ def test_aiw_iteration():
     second_items = list(dataset)
     assert first_items == second_items
 
+
 def test_aiw_random_ranges():
     """Test that generated numbers stay within expected ranges"""
     config = AliceInWonderlandConfig(size=30, seed=42, max_entities=12)
     dataset = AliceInWonderlandDataset(config)
 
     for item in dataset:
-        prompt = item["prompt"]
-        numbers = [int(n) for n in prompt.split() if n.isdigit()]
-        
+        question = item["question"]
+        numbers = [int(n) for n in question.split() if n.isdigit()]
+
         # Check all numbers are in reasonable range (1-6 as per implementation)
-        assert all(1 <= n <= 12 for n in numbers), f"Numbers out of range: {numbers}"
-
-def test_output_format_is_correct():
-    """Test that the output format adheres to the user input"""
-    config = AliceInWonderlandConfig(size=30, seed=42, output_formats=[OutputFormat.THINKING])
-    dataset = AliceInWonderlandDataset(config)
-
-    for item in dataset:
-        prompt = item["prompt"]
-        assert "think carefully step by step" in item["prompt"]
+        assert all(
+            1 <= n <= 12 for n in numbers), f"Numbers out of range: {numbers}"

From f8696d6d22fcbfa9362cd0fcc97b22af52de1d25 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 1 Feb 2025 16:31:45 -0800
Subject: [PATCH 52/94] [aiw] remove output format enum

---
 reasoning_gym/logic/aiw.py | 7 -------
 tests/test_aiw.py          | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/reasoning_gym/logic/aiw.py b/reasoning_gym/logic/aiw.py
index 2ce3a13b..ebeb50a7 100644
--- a/reasoning_gym/logic/aiw.py
+++ b/reasoning_gym/logic/aiw.py
@@ -14,13 +14,6 @@ class TaskType(Enum):
     COLLEAGUES = "colleagues"  # Added colleagues task
 
 
-class OutputFormat(Enum):
-    """Defines the output format for the generated questions."""
-    PLAIN = "plain"
-    RESTRICTED = "restricted"
-    THINKING = "thinking"
-
-
 @dataclass
 class AliceInWonderlandConfig:
     """Configuration options for the Alice in Wonderland dataset.
diff --git a/tests/test_aiw.py b/tests/test_aiw.py
index 279fcc2c..bf556cbe 100644
--- a/tests/test_aiw.py
+++ b/tests/test_aiw.py
@@ -1,6 +1,6 @@
 import pytest
 
-from reasoning_gym.logic.aiw import AliceInWonderlandConfig, AliceInWonderlandDataset, TaskType, OutputFormat
+from reasoning_gym.logic.aiw import AliceInWonderlandConfig, AliceInWonderlandDataset, TaskType
 
 
 def test_aiw_config_validation():

From 5f6d6153696c3b213ef287e6f4e0a226d3439480 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sat, 1 Feb 2025 16:33:08 -0800
Subject: [PATCH 53/94] [aiw] remove output format template

---
 reasoning_gym/logic/aiw.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/reasoning_gym/logic/aiw.py b/reasoning_gym/logic/aiw.py
index ebeb50a7..7a27b6d0 100644
--- a/reasoning_gym/logic/aiw.py
+++ b/reasoning_gym/logic/aiw.py
@@ -113,20 +113,6 @@ class AliceInWonderlandDataset(ProceduralDataset):
             ],
         }
 
-        self.format_templates = {
-            OutputFormat.PLAIN: Template("$question"),
-            OutputFormat.RESTRICTED: Template(
-                "$question To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT "
-                'following format that contains final answer: "### Answer:"'
-            ),
-            OutputFormat.THINKING: Template(
-                "$question Before providing answer to this problem, think "
-                "carefully step by step and double check the path to the "
-                'correct solution for any mistakes. Provide then the final '
-                'answer in following form: "### Answer:"'
-            ),
-        }
-
     def _get_aiw(self, rng: Random) -> dict:
         """Generates a single Alice in Wonderland question.
 

From 4e9fc4baadf383f822aef90d5e87732983059de7 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@aleph-alpha.com>
Date: Sun, 2 Feb 2025 09:41:23 +0100
Subject: [PATCH 54/94] refactor: Use field default_factory
 TimeIntervalsConfig, AdvancedGeometryConfig

---
 reasoning_gym/arithmetic/time_intervals.py   |  10 +-
 reasoning_gym/geometry/advanced_geometry.py  |  17 +--
 reasoning_gym/graphs/family_relationships.py | 126 +++----------------
 3 files changed, 29 insertions(+), 124 deletions(-)

diff --git a/reasoning_gym/arithmetic/time_intervals.py b/reasoning_gym/arithmetic/time_intervals.py
index abc177ca..1b296d02 100644
--- a/reasoning_gym/arithmetic/time_intervals.py
+++ b/reasoning_gym/arithmetic/time_intervals.py
@@ -1,5 +1,5 @@
 import random
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import date, datetime, time, timedelta
 from typing import List, Optional
 
@@ -19,14 +19,12 @@ class TimeIntervalsConfig:
     min_date: date = date(1900, 1, 1)
     max_date: date = date(3000, 1, 1)
     max_date_difference_days: int = 100
-    task_types: List[str] = None
+    task_types: List[str] = field(
+        default_factory=lambda: ["time", "time_seconds", "time_ms", "date", "datetime", "datetime_tz"]
+    )
     seed: Optional[int] = None
     size: int = 500
 
-    def __post_init__(self):
-        if self.task_types is None:
-            self.task_types = ["time", "time_seconds", "time_ms", "date", "datetime", "datetime_tz"]
-
     def validate(self) -> None:
         """Validate configuration parameters"""
         assert self.size > 0, "size must be positive"
diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
index 3d88a348..6a34d1db 100644
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -1,5 +1,5 @@
 import random
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List, Optional
 
 import sympy
@@ -21,16 +21,11 @@ class AdvancedGeometryConfig:
 
     # Probability or list of tasks we want to generate
     # For demonstration, we have three categories:
-    task_types: List[str] = None
-
-    def __post_init__(self):
-        if self.task_types is None:
-            # Default set of advanced tasks
-            self.task_types = [
-                "orthocenter",
-                "incircle_radius",
-                "angle_measure",
-            ]
+    task_types: List[str] = field(default_factory=lambda: [
+        "orthocenter",
+        "incircle_radius",
+        "angle_measure",
+    ])
 
     def validate(self):
         assert self.min_coord < self.max_coord, "min_coord must be < max_coord."
diff --git a/reasoning_gym/graphs/family_relationships.py b/reasoning_gym/graphs/family_relationships.py
index 6ba042a8..ba1809af 100644
--- a/reasoning_gym/graphs/family_relationships.py
+++ b/reasoning_gym/graphs/family_relationships.py
@@ -1,5 +1,5 @@
 import random
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import StrEnum
 from itertools import count
 from typing import List, Optional, Set, Tuple
@@ -37,12 +37,8 @@ class Person:
     gender: Gender
     id: int
     spouse: Optional["Person"] = None
-    parents: List["Person"] = None
-    children: List["Person"] = None
-
-    def __post_init__(self):
-        self.parents = self.parents or []
-        self.children = self.children or []
+    parents: List["Person"] = field(default_factory=list)
+    children: List["Person"] = field(default_factory=list)
 
     def __hash__(self):
         return self.id
@@ -69,109 +65,25 @@ class FamilyRelationshipsConfig:
 
     min_family_size: int = 4
     max_family_size: int = 8
-    male_names: List[str] = None
-    female_names: List[str] = None
+    male_names: List[str] = field(default_factory=lambda: [
+        "James", "John", "Robert", "Michael", "William", "David", "Richard", "Joseph",
+        "Thomas", "Charles", "Peter", "Daniel", "Matthew", "Christopher", "Andrew",
+        "George", "Edward", "Benjamin", "Henry", "Samuel", "Alexander", "Oliver",
+        "Jack", "Harry", "Jacob", "Noah", "Ethan", "Lucas", "Mason", "Logan",
+        "Sebastian", "Theodore", "Owen", "Liam", "Aiden", "Kai", "Jayden", "Zion",
+        "Phoenix", "Atlas", "Axel", "Ryder", "Finn"
+    ])
+    female_names: List[str] = field(default_factory=lambda: [
+        "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Barbara", "Susan",
+        "Jessica", "Sarah", "Karen", "Emma", "Lisa", "Anna", "Margaret", "Victoria",
+        "Charlotte", "Sophia", "Isabella", "Olivia", "Ava", "Mia", "Emily",
+        "Abigail", "Amelia", "Eleanor", "Grace", "Alice", "Lucy", "Chloe",
+        "Sophie", "Lily", "Hannah", "Zoe", "Luna", "Nova", "Aria", "Willow",
+        "Aurora", "Sage", "River", "Winter", "Sky", "Rain"
+    ])
     seed: Optional[int] = None
     size: int = 500
 
-    def __post_init__(self):
-        # Default name lists if none provided
-        default_male_names = [
-            "James",
-            "John",
-            "Robert",
-            "Michael",
-            "William",
-            "David",
-            "Richard",
-            "Joseph",
-            "Thomas",
-            "Charles",
-            "Peter",
-            "Daniel",
-            "Matthew",
-            "Christopher",
-            "Andrew",
-            "George",
-            "Edward",
-            "Benjamin",
-            "Henry",
-            "Samuel",
-            "Alexander",
-            "Oliver",
-            "Jack",
-            "Harry",
-            "Jacob",
-            "Noah",
-            "Ethan",
-            "Lucas",
-            "Mason",
-            "Logan",
-            "Sebastian",
-            "Theodore",
-            "Owen",
-            "Liam",
-            "Aiden",
-            "Kai",
-            "Jayden",
-            "Zion",
-            "Phoenix",
-            "Atlas",
-            "Axel",
-            "Ryder",
-            "Finn",
-        ]
-        default_female_names = [
-            "Mary",
-            "Patricia",
-            "Jennifer",
-            "Linda",
-            "Elizabeth",
-            "Barbara",
-            "Susan",
-            "Jessica",
-            "Sarah",
-            "Karen",
-            "Emma",
-            "Lisa",
-            "Anna",
-            "Margaret",
-            "Victoria",
-            "Charlotte",
-            "Sophia",
-            "Isabella",
-            "Olivia",
-            "Ava",
-            "Mia",
-            "Emily",
-            "Abigail",
-            "Amelia",
-            "Eleanor",
-            "Grace",
-            "Alice",
-            "Lucy",
-            "Chloe",
-            "Sophie",
-            "Lily",
-            "Hannah",
-            "Zoe",
-            "Luna",
-            "Nova",
-            "Aria",
-            "Willow",
-            "Aurora",
-            "Sage",
-            "River",
-            "Winter",
-            "Sky",
-            "Rain",
-        ]
-
-        if self.male_names is None:
-            self.male_names = default_male_names
-        if self.female_names is None:
-            self.female_names = default_female_names
-
     def validate(self) -> None:
         """Validate configuration parameters"""
         assert self.min_family_size >= 3, "min_family_size must be at least 3"

From f396d3df609193f5e8b9f0ba49aa24ff611be707 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 10:04:18 +0100
Subject: [PATCH 55/94] post merge lint

---
 GALLERY.md                                   |  47 +++++++-
 reasoning_gym/geometry/advanced_geometry.py  |  12 +-
 reasoning_gym/graphs/family_relationships.py | 110 ++++++++++++++++---
 reasoning_gym/logic/__init__.py              |   3 +
 reasoning_gym/logic/aiw.py                   |  79 +++++++------
 tests/test_aiw.py                            |   9 +-
 6 files changed, 197 insertions(+), 63 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 0bc5154e..8baa0db4 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -3,6 +3,7 @@ This gallery shows examples from all available datasets using their default conf
 
 ## Available Datasets
 - [advanced_geometry](#advanced_geometry)
+- [aiw](#aiw)
 - [base_conversion](#base_conversion)
 - [basic_arithmetic](#basic_arithmetic)
 - [bf](#bf)
@@ -73,6 +74,50 @@ Metadata: {'A': (6, 7), 'B': (-7, -5), 'C': (2, -3), 'incircle_radius_exact': 's
 
 ````
 
+### aiw
+A procedural dataset inspired by the "Alice in Wonderland" paper.
+
+    The dataset is inspired by the following paper:
+       @inproceedings{nezhurina2024alice,
+       title={Alice in Wonderland: Simple Tasks Reveal Severe Generalization and
+              Basic Reasoning Deficits in State-Of-the-Art Large Language Models},
+       author={Marianna Nezhurina and Lucia Cipolina-Kun and Mehdi Cherti and
+              Jenia Jitsev},
+       booktitle={NeurIPS 2024 Workshop on Scientific Methods for Understanding
+                  Deep Learning},
+       year={2024},
+       url={https://openreview.net/forum?id=Mkl7dzjYiW}
+       }
+
+Default configuration:
+```python
+male_names = ['James', 'John', 'Robert', 'Michael', 'William', 'David', 'Richard', 'Joseph', 'Thomas', 'Charles', 'Bob']
+female_names = ['Mary', 'Patricia', 'Jennifer', 'Linda', 'Elizabeth', 'Barbara', 'Susan', 'Jessica', 'Sarah', 'Margaret', 'Alice']
+task_types = [<TaskType.SIBLINGS: 'siblings'>, <TaskType.FRIENDS: 'friends'>, <TaskType.COLLEAGUES: 'colleagues'>]
+seed = 42
+size = 10
+max_entities = 6
+```
+
+Example tasks:
+````
+Example 1:
+Question: Patricia has 6 male colleagues and she also has 3 female colleagues. These are all colleagues that Patricia has. All these mentioned persons around Patricia are colleagues of each other. James has 2 male colleagues and 2 female colleagues in total. All these mentioned persons around James are colleagues of each other. The people in the circle around James do not have other colleagues aside - with the only exception of Matilda. She is colleague of James and she is also colleague of Patricia, being part of Patricia's circle. How many female colleagues does Matilda have?
+Answer: 4
+Metadata: {'task_type': 'colleagues'}
+
+Example 2:
+Question: Elizabeth has 4 brothers and she also has 3 sisters. How many sisters does Elizabeth's brother have?
+Answer: 4
+Metadata: {'task_type': 'siblings'}
+
+Example 3:
+Question: Sarah has 6 male friends and she also has 1 female friends. They all are friends with each other and have no other friends aside. How many female friends does Thomas, a male friend of Sarah, have?
+Answer: 2
+Metadata: {'task_type': 'friends'}
+
+````
+
 ### base_conversion
 Generates base conversion tasks
 
@@ -1548,7 +1593,7 @@ Metadata: {'task_type': 'datetime_tz', 'start_time': datetime.datetime(2964, 6,
 Example 2:
 Question: A video call started at 09:44 and ended at 12:22. How long was the call? Answer in HH:MM.
 Answer: 02:38
-Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 1, 9, 44), 'end_time': datetime.datetime(2025, 2, 1, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'}
+Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 2, 9, 44), 'end_time': datetime.datetime(2025, 2, 2, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'}
 
 Example 3:
 Question: Calculate the time difference between Sat Dec 22 2677 and Thu Mar 21 2678. Express the result in D days.
diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
index 6a34d1db..ac8797b9 100644
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -21,11 +21,13 @@ class AdvancedGeometryConfig:
 
     # Probability or list of tasks we want to generate
     # For demonstration, we have three categories:
-    task_types: List[str] = field(default_factory=lambda: [
-        "orthocenter",
-        "incircle_radius",
-        "angle_measure",
-    ])
+    task_types: List[str] = field(
+        default_factory=lambda: [
+            "orthocenter",
+            "incircle_radius",
+            "angle_measure",
+        ]
+    )
 
     def validate(self):
         assert self.min_coord < self.max_coord, "min_coord must be < max_coord."
diff --git a/reasoning_gym/graphs/family_relationships.py b/reasoning_gym/graphs/family_relationships.py
index ba1809af..ee278b33 100644
--- a/reasoning_gym/graphs/family_relationships.py
+++ b/reasoning_gym/graphs/family_relationships.py
@@ -65,22 +65,100 @@ class FamilyRelationshipsConfig:
 
     min_family_size: int = 4
     max_family_size: int = 8
-    male_names: List[str] = field(default_factory=lambda: [
-        "James", "John", "Robert", "Michael", "William", "David", "Richard", "Joseph",
-        "Thomas", "Charles", "Peter", "Daniel", "Matthew", "Christopher", "Andrew",
-        "George", "Edward", "Benjamin", "Henry", "Samuel", "Alexander", "Oliver",
-        "Jack", "Harry", "Jacob", "Noah", "Ethan", "Lucas", "Mason", "Logan",
-        "Sebastian", "Theodore", "Owen", "Liam", "Aiden", "Kai", "Jayden", "Zion",
-        "Phoenix", "Atlas", "Axel", "Ryder", "Finn"
-    ])
-    female_names: List[str] = field(default_factory=lambda: [
-        "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Barbara", "Susan",
-        "Jessica", "Sarah", "Karen", "Emma", "Lisa", "Anna", "Margaret", "Victoria",
-        "Charlotte", "Sophia", "Isabella", "Olivia", "Ava", "Mia", "Emily",
-        "Abigail", "Amelia", "Eleanor", "Grace", "Alice", "Lucy", "Chloe",
-        "Sophie", "Lily", "Hannah", "Zoe", "Luna", "Nova", "Aria", "Willow",
-        "Aurora", "Sage", "River", "Winter", "Sky", "Rain"
-    ])
+    male_names: List[str] = field(
+        default_factory=lambda: [
+            "James",
+            "John",
+            "Robert",
+            "Michael",
+            "William",
+            "David",
+            "Richard",
+            "Joseph",
+            "Thomas",
+            "Charles",
+            "Peter",
+            "Daniel",
+            "Matthew",
+            "Christopher",
+            "Andrew",
+            "George",
+            "Edward",
+            "Benjamin",
+            "Henry",
+            "Samuel",
+            "Alexander",
+            "Oliver",
+            "Jack",
+            "Harry",
+            "Jacob",
+            "Noah",
+            "Ethan",
+            "Lucas",
+            "Mason",
+            "Logan",
+            "Sebastian",
+            "Theodore",
+            "Owen",
+            "Liam",
+            "Aiden",
+            "Kai",
+            "Jayden",
+            "Zion",
+            "Phoenix",
+            "Atlas",
+            "Axel",
+            "Ryder",
+            "Finn",
+        ]
+    )
+    female_names: List[str] = field(
+        default_factory=lambda: [
+            "Mary",
+            "Patricia",
+            "Jennifer",
+            "Linda",
+            "Elizabeth",
+            "Barbara",
+            "Susan",
+            "Jessica",
+            "Sarah",
+            "Karen",
+            "Emma",
+            "Lisa",
+            "Anna",
+            "Margaret",
+            "Victoria",
+            "Charlotte",
+            "Sophia",
+            "Isabella",
+            "Olivia",
+            "Ava",
+            "Mia",
+            "Emily",
+            "Abigail",
+            "Amelia",
+            "Eleanor",
+            "Grace",
+            "Alice",
+            "Lucy",
+            "Chloe",
+            "Sophie",
+            "Lily",
+            "Hannah",
+            "Zoe",
+            "Luna",
+            "Nova",
+            "Aria",
+            "Willow",
+            "Aurora",
+            "Sage",
+            "River",
+            "Winter",
+            "Sky",
+            "Rain",
+        ]
+    )
     seed: Optional[int] = None
     size: int = 500
 
diff --git a/reasoning_gym/logic/__init__.py b/reasoning_gym/logic/__init__.py
index c2c07625..38307647 100644
--- a/reasoning_gym/logic/__init__.py
+++ b/reasoning_gym/logic/__init__.py
@@ -6,10 +6,13 @@ Logic tasks for training reasoning capabilities:
 - Syllogisms
 """
 
+from .aiw import AliceInWonderlandConfig, AliceInWonderlandDataset
 from .propositional_logic import PropositionalLogicConfig, PropositionalLogicDataset
 from .syllogisms import SyllogismConfig, SyllogismDataset, Term
 
 __all__ = [
+    "AliceInWonderlandConfig",
+    "AliceInWonderlandDataset",
     "PropositionalLogicConfig",
     "PropositionalLogicDataset",
     "SyllogismConfig",
diff --git a/reasoning_gym/logic/aiw.py b/reasoning_gym/logic/aiw.py
index 7a27b6d0..0c864cc4 100644
--- a/reasoning_gym/logic/aiw.py
+++ b/reasoning_gym/logic/aiw.py
@@ -1,14 +1,15 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
 from enum import Enum
 from random import Random
 from string import Template
+from typing import List, Optional
 
 from ..factory import ProceduralDataset, register_dataset
 
 
 class TaskType(Enum):
     """Defines the type of task for the Alice in Wonderland dataset."""
+
     SIBLINGS = "siblings"
     FRIENDS = "friends"
     COLLEAGUES = "colleagues"  # Added colleagues task
@@ -26,21 +27,39 @@ class AliceInWonderlandConfig:
         size (int): Number of samples in the dataset.
         max_entities (int): Max number of siblings/friends/colleagues in questions.
     """
+
     male_names: List[str] = field(
         default_factory=lambda: [
-            "James", "John", "Robert", "Michael", "William", "David",
-            "Richard", "Joseph", "Thomas", "Charles", "Bob"
+            "James",
+            "John",
+            "Robert",
+            "Michael",
+            "William",
+            "David",
+            "Richard",
+            "Joseph",
+            "Thomas",
+            "Charles",
+            "Bob",
         ]
     )
     female_names: List[str] = field(
         default_factory=lambda: [
-            "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth",
-            "Barbara", "Susan", "Jessica", "Sarah", "Margaret", "Alice"
+            "Mary",
+            "Patricia",
+            "Jennifer",
+            "Linda",
+            "Elizabeth",
+            "Barbara",
+            "Susan",
+            "Jessica",
+            "Sarah",
+            "Margaret",
+            "Alice",
         ]
     )
     task_types: List[TaskType] = field(
-        default_factory=lambda: [
-            TaskType.SIBLINGS, TaskType.FRIENDS, TaskType.COLLEAGUES]  # Added Colleagues
+        default_factory=lambda: [TaskType.SIBLINGS, TaskType.FRIENDS, TaskType.COLLEAGUES]  # Added Colleagues
     )
     seed: Optional[int] = None
     size: int = 10
@@ -57,19 +76,19 @@ class AliceInWonderlandConfig:
 
 class AliceInWonderlandDataset(ProceduralDataset):
     """
-     A procedural dataset inspired by the "Alice in Wonderland" paper.
+    A procedural dataset inspired by the "Alice in Wonderland" paper.
 
-     The dataset is inspired by the following paper:
-        @inproceedings{nezhurina2024alice,
-        title={Alice in Wonderland: Simple Tasks Reveal Severe Generalization and
-               Basic Reasoning Deficits in State-Of-the-Art Large Language Models},
-        author={Marianna Nezhurina and Lucia Cipolina-Kun and Mehdi Cherti and
-               Jenia Jitsev},
-        booktitle={NeurIPS 2024 Workshop on Scientific Methods for Understanding
-                   Deep Learning},
-        year={2024},
-        url={https://openreview.net/forum?id=Mkl7dzjYiW}
-        }
+    The dataset is inspired by the following paper:
+       @inproceedings{nezhurina2024alice,
+       title={Alice in Wonderland: Simple Tasks Reveal Severe Generalization and
+              Basic Reasoning Deficits in State-Of-the-Art Large Language Models},
+       author={Marianna Nezhurina and Lucia Cipolina-Kun and Mehdi Cherti and
+              Jenia Jitsev},
+       booktitle={NeurIPS 2024 Workshop on Scientific Methods for Understanding
+                  Deep Learning},
+       year={2024},
+       url={https://openreview.net/forum?id=Mkl7dzjYiW}
+       }
 
     """
 
@@ -152,14 +171,10 @@ class AliceInWonderlandDataset(ProceduralDataset):
                 num_female=num_female,
             )
         elif task_type == TaskType.COLLEAGUES:
-            num_male_colleagues_alice_circle = rng.randint(
-                1, self.config.max_entities)
-            num_female_colleagues_alice_circle = rng.randint(
-                1, self.config.max_entities)
-            num_male_colleagues_bob_circle = rng.randint(
-                1, self.config.max_entities)
-            num_female_colleagues_bob_circle = rng.randint(
-                1, self.config.max_entities)
+            num_male_colleagues_alice_circle = rng.randint(1, self.config.max_entities)
+            num_female_colleagues_alice_circle = rng.randint(1, self.config.max_entities)
+            num_male_colleagues_bob_circle = rng.randint(1, self.config.max_entities)
+            num_female_colleagues_bob_circle = rng.randint(1, self.config.max_entities)
 
             answer = num_female_colleagues_alice_circle + 1
             template = rng.choice(self.templates[TaskType.COLLEAGUES])
@@ -169,16 +184,10 @@ class AliceInWonderlandDataset(ProceduralDataset):
                 num_male_colleagues_alice_circle=num_male_colleagues_alice_circle,
                 num_female_colleagues_alice_circle=num_female_colleagues_alice_circle,
                 num_male_colleagues_bob_circle=num_male_colleagues_bob_circle,
-                num_female_colleagues_bob_circle=num_female_colleagues_bob_circle
+                num_female_colleagues_bob_circle=num_female_colleagues_bob_circle,
             )
 
-        return {
-            "question": question,
-            "answer": answer,
-            "metadata": {
-                "task_type": task_type.value
-            }
-        }
+        return {"question": question, "answer": answer, "metadata": {"task_type": task_type.value}}
 
     def __getitem__(self, idx: int) -> dict:
         rng = Random(self.seed + idx)
diff --git a/tests/test_aiw.py b/tests/test_aiw.py
index bf556cbe..5a2fb454 100644
--- a/tests/test_aiw.py
+++ b/tests/test_aiw.py
@@ -14,8 +14,7 @@ def test_aiw_config_validation():
         config.validate()
 
     with pytest.raises(AssertionError):
-        config = AliceInWonderlandConfig(
-            female_names=["Mary", "Jane"])  # No Alice
+        config = AliceInWonderlandConfig(female_names=["Mary", "Jane"])  # No Alice
         config.validate()
 
     with pytest.raises(AssertionError):
@@ -56,8 +55,7 @@ def test_aiw_items():
         # Verify question task type characteristics
         task_type = item["metadata"]["task_type"]
         if task_type == TaskType.SIBLINGS.value:
-            assert any(phrase in item["question"]
-                       for phrase in ["brothers", "sisters"])
+            assert any(phrase in item["question"] for phrase in ["brothers", "sisters"])
         elif task_type == TaskType.FRIENDS.value:
             assert "friends" in item["question"]
         elif task_type == TaskType.COLLEAGUES:
@@ -95,5 +93,4 @@ def test_aiw_random_ranges():
         numbers = [int(n) for n in question.split() if n.isdigit()]
 
         # Check all numbers are in reasonable range (1-6 as per implementation)
-        assert all(
-            1 <= n <= 12 for n in numbers), f"Numbers out of range: {numbers}"
+        assert all(1 <= n <= 12 for n in numbers), f"Numbers out of range: {numbers}"

From 943651c15b083be858ca16668a92328c6b05a26e Mon Sep 17 00:00:00 2001
From: benjamrio <benjamrio@yahoo.fr>
Date: Sun, 2 Feb 2025 13:25:29 +0100
Subject: [PATCH 56/94] added calendar-arithmetic tasks

---
 GALLERY.md                                    |  26 +
 README.md                                     |   1 +
 reasoning_gym/arithmetic/__init__.py          |   5 +
 .../arithmetic/calendar_arithmetic.py         | 490 ++++++++++++++++++
 tests/test_calendar_arithmetic.py             | 196 +++++++
 5 files changed, 718 insertions(+)
 create mode 100644 reasoning_gym/arithmetic/calendar_arithmetic.py
 create mode 100644 tests/test_calendar_arithmetic.py

diff --git a/GALLERY.md b/GALLERY.md
index 8baa0db4..15aec181 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -255,6 +255,32 @@ Metadata: {'rotation': 17, 'cipher_text': 'ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV
 
 ````
 
+### calendar_arithmetic
+Generates various calendar arithmetic tasks
+Default configuration:
+```python
+year = 2024
+tasks = None
+seed = 42
+size = 500
+```
+
+Example tasks:
+```
+Example 1:
+Question: How many business days (Monday-Friday) are there from Tuesday, December 17, 2024 to Sunday, December 29, 2024 (inclusive of both dates)? Give the count numerically.
+Answer: 9
+Metadata: {'task': 'count_business_days', 'start_date': '2024-12-17', 'end_date': '2024-12-29'}
+Example 2:
+Question: Given that January 1 fell on a Sunday, which weekday occurs on August 17? State the weekday (Monday through Sunday).
+Answer: Friday
+Metadata: {'task': 'first_day_of_year', 'year': 2024, 'first_day': 'Sunday', 'target_date': '2024-08-17', 'delta_days': 229}
+Example 3:
+Question: In August 2024, if an event recurs on the first Tuesday, what is the date (day of the month) of the event? Answer with a number. Answer with -1 if the ordinal does not exist in the month.
+Answer: 6
+Metadata: {'task': 'recurring_event_day', 'year': 2024, 'month': 8, 'ordinal': 'first', 'weekday': 'Tuesday'}
+```
+
 ### chain_sum
 Generates simple arithmetic tasks using only + and - operators
 
diff --git a/README.md b/README.md
index d5126451..f05ce2b3 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 ### <small>Arithmetic Tasks</small>
 
 - `BasicArithmeticDataset`: Generate arithmetic expressions with configurable complexity and operators (+, -, \*, /)
+- `CalendarArithmeticDataset`: Generate arithmetic problems around calendar navigation logic
 - `ChainSum`: Generate addition/subtraction chains with configurable length and digit counts
 - `FractionSimplificationDataset`: Generate fraction simplification tasks with configurable complexity
 - `GCDDataset`: Generate Greatest Common Divisor problems with configurable number of integers
diff --git a/reasoning_gym/arithmetic/__init__.py b/reasoning_gym/arithmetic/__init__.py
index e6f95451..18089e6d 100644
--- a/reasoning_gym/arithmetic/__init__.py
+++ b/reasoning_gym/arithmetic/__init__.py
@@ -15,6 +15,7 @@ from .lcm import LCMConfig, LCMDataset
 from .leg_counting import LegCountingConfig, LegCountingDataset
 from .prime_factorization import PrimeFactorizationConfig, PrimeFactorizationDataset
 from .time_intervals import TimeIntervalsConfig, TimeIntervalsDataset
+from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset, CalendarTask, Weekday
 
 __all__ = [
     "BasicArithmeticDataset",
@@ -22,6 +23,10 @@ __all__ = [
     "basic_arithmetic_dataset",
     "ChainSum",
     "ChainSumConfig",
+    "CalendarArithmeticConfig",
+    "CalendarArithmeticDataset",
+    "Weekday",
+    "CalendarTask",
     "FractionSimplificationConfig",
     "FractionSimplificationDataset",
     "GCDConfig",
diff --git a/reasoning_gym/arithmetic/calendar_arithmetic.py b/reasoning_gym/arithmetic/calendar_arithmetic.py
new file mode 100644
index 00000000..61976d8d
--- /dev/null
+++ b/reasoning_gym/arithmetic/calendar_arithmetic.py
@@ -0,0 +1,490 @@
+from dataclasses import dataclass
+from datetime import date, timedelta
+import calendar
+import random
+import math
+from typing import Optional, Tuple, List, Dict, Any
+from enum import Enum, auto
+
+from ..factory import ProceduralDataset, register_dataset
+
+
+class Weekday(Enum):
+    MONDAY = auto()
+    TUESDAY = auto()
+    WEDNESDAY = auto()
+    THURSDAY = auto()
+    FRIDAY = auto()
+    SATURDAY = auto()
+    SUNDAY = auto()
+
+    @classmethod
+    def from_date(cls, d: date) -> "Weekday":
+        return list(cls)[d.weekday()]
+
+    @classmethod
+    def random(cls, rng: random.Random) -> "Weekday":
+        return list(cls)[rng.randint(0, 6)]
+
+    @classmethod
+    def __getitem__(cls, idx) -> "Weekday":
+        return list(cls)[idx]
+
+    @property
+    def index(self) -> int:
+        return self.value - 1
+
+    def __str__(self) -> str:
+        return self.name.capitalize()
+
+
+class CalendarTask(Enum):
+    WEEKDAY_OFFSET = "weekday_offset"
+    WEEKDAY_OF_DATE = "weekday_of_date"
+    WEEKDAY_OF_DATE_FROM_FIRST_DATE = "weekday_of_date_from_first_day"
+    RECURRING_EVENT_CALCULATIONS = "recurring_event_day"
+    COUNT_DAYS = "count_days"
+    COUNT_BUSINESS_DAYS = "count_business_days"
+    IS_LEAP_YEAR = "is_leap_year"
+
+
+@dataclass
+class CalendarArithmeticConfig:
+    year: int = 2022
+    tasks: Optional[List[str]] = None
+    offset_upper_bound: int = 100
+    leap_year_range: int = 200
+    seed: Optional[int] = 42
+    size: int = 500
+
+    def __post_init__(self):
+        if self.tasks is None:
+            self.tasks = [task.value for task in CalendarTask]
+        else:
+            self.tasks = [task.lower() for task in self.tasks]
+            valid_tasks = {task.value for task in CalendarTask}
+            invalid_tasks = set(self.tasks) - valid_tasks
+            if invalid_tasks:
+                valid_task_list = ", ".join(sorted(valid_tasks))
+                raise ValueError(
+                    f"Invalid tasks: {', '.join(sorted(invalid_tasks))}. " f"Valid tasks are: {valid_task_list}"
+                )
+
+    def validate(self) -> None:
+        """Validate the configuration parameters."""
+        if not isinstance(self.year, int) or self.year <= 0:
+            raise ValueError(f"year must be a positive integer, got {self.year}")
+
+        if self.seed is not None and not isinstance(self.seed, int):
+            raise ValueError(f"seed must be an integer or None, got {type(self.seed)}")
+
+        if not isinstance(self.size, int) or self.size <= 0:
+            raise ValueError(f"size must be a positive integer, got {self.size}")
+
+
+class CalendarArithmeticDataset(ProceduralDataset):
+    DAY_QUESTION_TEMPLATES = [
+        "Answer with the weekday's name (e.g., Monday, Tuesday, etc.).",
+        "Provide the full name of the weekday.",
+        "State the weekday (Monday through Sunday).",
+        "Give the weekday name in full.",
+        "Reply with just the weekday name.",
+        "Write out the full weekday name.",
+        "Respond with the weekday (Monday-Sunday).",
+        "Answer using the complete weekday name.",
+        "Name the day of the week in full.",
+    ]
+
+    COUNT_QUESTION_TEMPLATES = [
+        "Answer with a number.",
+        "Provide the count as a number.",
+        "Respond with just the number.",
+        "Write the total number.",
+        "Give the count numerically.",
+        "State the amount as a number.",
+        "Reply with the numerical value.",
+        "Express your answer as a number.",
+    ]
+
+    def __init__(self, config: CalendarArithmeticConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+        self.task_handlers = {
+            CalendarTask.WEEKDAY_OFFSET.value: self._weekday_offset,
+            CalendarTask.WEEKDAY_OF_DATE.value: self._weekday_of_date,
+            CalendarTask.WEEKDAY_OF_DATE_FROM_FIRST_DATE.value: self._weekday_of_date_from_first_day,
+            CalendarTask.RECURRING_EVENT_CALCULATIONS.value: self._recurring_event_day,
+            CalendarTask.COUNT_DAYS.value: self._count_days,
+            CalendarTask.COUNT_BUSINESS_DAYS.value: self._count_business_days,
+            CalendarTask.IS_LEAP_YEAR.value: self._is_leap_year,
+        }
+
+        self.tasks = [self.task_handlers[task] for task in self.config.tasks]
+
+    def __getitem__(self, idx: int) -> dict:
+        item_rng = random.Random(self.seed + idx)
+        task = item_rng.choice(self.tasks)
+        question, answer, metadata = task(item_rng)
+        return {
+            "question": question,
+            "answer": str(answer),
+            "metadata": metadata,
+        }
+
+    def _weekday_offset(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        Task: Given a starting date and a day offset (which may be positive or negative),
+        ask what day of the week it will be.
+        Examples:
+         - "If today is Wednesday, March 13, 2024, what day of the week will it be in 10 days? Answer with the weekday's name."
+         - "If today is Wednesday, March 13, 2024, what day of the week was it 10 days ago? Answer with the weekday's name."
+        """
+        year = self.config.year
+        start_date = self._random_date_for_year(rng, year)
+        offset = rng.randint(1, self.config.offset_upper_bound)
+        sign = rng.choice([-1, 1])
+        offset_days = sign * offset
+        target_date = start_date + timedelta(days=offset_days)
+        target_weekday = target_date.strftime("%A")
+
+        date_str = f"{start_date.strftime('%A')}, {start_date.strftime('%B')} {start_date.day}, {start_date.year}"
+        if offset_days >= 0:
+            templates = [
+                f"If today is {date_str}, what day of the week will it be in {offset_days} days? ",
+                f"Starting from {date_str}, which weekday falls after a {offset_days}-day jump? ",
+                f"Count forward {offset_days} days from {date_str} - what's the weekday? ",
+            ]
+        else:
+            templates = [
+                f"If today is {date_str}, what day of the week was it {abs(offset_days)} days ago? ",
+                f"Starting from {date_str}, which weekday was it {abs(offset_days)} days before? ",
+                f"Count backward {abs(offset_days)} days from {date_str} - what's the weekday? ",
+            ]
+
+        question = rng.choice(templates) + rng.choice(self.DAY_QUESTION_TEMPLATES)
+        metadata = {
+            "task": CalendarTask.WEEKDAY_OFFSET.value,
+            "start_date": start_date.isoformat(),
+            "offset_days": offset_days,
+            "target_date": target_date.isoformat(),
+        }
+        return question, target_weekday, metadata
+
+    def _weekday_of_date(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        task: Ask what day of the week a given date was.
+        example:
+          "What day of the week was January 15, 2024?
+           Answer with the weekday's name."
+        """
+        year = self.config.year
+        target_date = self._random_date_for_year(rng, year)
+        answer_weekday = target_date.strftime("%A")
+        templates = [
+            f"What day of the week was {target_date.strftime('%B')} {target_date.day}, {year}?",
+            f"On which weekday did {target_date.strftime('%B')} {target_date.day}, {year} fall?",
+            f"Name the day of the week for {target_date.strftime('%m/%d/%Y')}.",
+        ]
+
+        question = f"{rng.choice(templates)} {rng.choice(self.DAY_QUESTION_TEMPLATES)}"
+        metadata = {
+            "task": CalendarTask.WEEKDAY_OF_DATE.value,
+            "target_date": target_date.isoformat(),
+        }
+        return question, answer_weekday, metadata
+
+    def _weekday_of_date_from_first_day(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        task: Given a hypothetical weekday for January 1, ask what weekday a later date in the year falls on.
+        example:
+         "If the first day of the year was a Monday, what day of the week will December 31 be?
+          Answer with the weekday's name."
+        """
+        year = self.config.year
+        first_day = Weekday.random(rng)
+        first_day_index = first_day.index
+        # Ensure target date is not January 1.
+        year_start = date(year, 1, 1)
+        year_end = date(year, 12, 31)
+        max_delta = timedelta(days=self.config.offset_upper_bound)
+        max_date = min(year_start + max_delta, year_end)
+        while True:
+            target_date = self._random_date_between(rng, year_start, max_date)
+            if target_date != date(year, 1, 1):
+                break
+        delta_days = (target_date - date(year, 1, 1)).days
+        answer_index = (first_day_index + delta_days) % 7
+        answer_weekday = Weekday(answer_index + 1)
+
+        templates = [
+            f"If the first day of the year was a {first_day}, what day of the week will "
+            f"{target_date.strftime('%B')} {target_date.day} be? ",
+            f"Given that January 1 fell on a {first_day}, which weekday occurs on "
+            f"{target_date.strftime('%B')} {target_date.day}? ",
+            f"In a year where {first_day} is January 1st, name the weekday of "
+            f"{target_date.strftime('%B')} {target_date.day}. ",
+        ]
+
+        question = rng.choice(templates) + rng.choice(self.DAY_QUESTION_TEMPLATES)
+        metadata = {
+            "task": CalendarTask.WEEKDAY_OF_DATE_FROM_FIRST_DATE.value,
+            "year": year,
+            "first_day": str(first_day),
+            "target_date": target_date.isoformat(),
+            "delta_days": delta_days,
+        }
+        return question, answer_weekday, metadata
+
+    def _recurring_event_day(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        task: For a recurring event defined by an ordinal weekday pattern in a month,
+        ask on which day of the month the event occurs.
+        example:
+         "If a meeting is scheduled on the second Tuesday of May 2024, on which day does it fall?
+          Answer with a number."
+        """
+        year = self.config.year
+        month = rng.randint(1, 12)
+        ordinals = ["first", "second", "third", "fourth", "last"]
+        ordinal = rng.choice(ordinals)
+        weekday = Weekday.random(rng)
+        month_name = calendar.month_name[month]
+        _, last_day = calendar.monthrange(year, month)
+
+        if ordinal != "last":
+            ordinal_number = {"first": 1, "second": 2, "third": 3, "fourth": 4}[ordinal]
+            count = 0
+            event_day = None
+            for day in range(1, last_day + 1):
+                d = date(year, month, day)
+                if d.strftime("%A") == str(weekday):
+                    count += 1
+                    if count == ordinal_number:
+                        event_day = day
+                        break
+            if event_day is None:
+                # Defensive fallback: every month has at least 28 days, so first..fourth always exist.
+                event_day = -1
+        else:
+            event_day = None
+            for day in range(last_day, 0, -1):
+                d = date(year, month, day)
+                if d.strftime("%A") == str(weekday):
+                    event_day = day
+                    break
+            if event_day is None:
+                event_day = -1
+
+        templates = [
+            f"If a meeting is scheduled on the {ordinal} {weekday} of {month_name} {year}, on which day of the month does it occur? ",
+            f"In {month_name} {year}, if an event recurs on the {ordinal} {weekday}, what is the date (day of the month) of the event? ",
+            f"Determine the day of the month for the {ordinal} {weekday} in {month_name} {year}. ",
+        ]
+        question = (
+            rng.choice(templates)
+            + rng.choice(self.COUNT_QUESTION_TEMPLATES)
+            + " Answer with -1 if the ordinal does not exist in the month."
+        )
+        metadata = {
+            "task": CalendarTask.RECURRING_EVENT_CALCULATIONS.value,
+            "year": year,
+            "month": month,
+            "ordinal": ordinal,
+            "weekday": str(weekday),
+        }
+        return question, str(event_day), metadata
+
+    def _count_days(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        task: Ask how many times a given weekday occurs in a specified range.
+        example:
+           "How many Mondays are there from Friday, March 1, 2024 to Friday, March 15, 2024 (inclusive of both dates)?
+           Answer with a number."
+        """
+        year = self.config.year
+        year_start = date(year, 1, 1)
+        year_end = date(year, 12, 31)
+        start_date = self._random_date_between(rng, year_start, year_end)
+        max_delta = timedelta(days=self.config.offset_upper_bound)
+        end_date = self._random_date_between(rng, start_date, min(year_end, start_date + max_delta))
+        weekday = Weekday.random(rng)
+
+        def count_weekday_between(d1: date, d2: date, weekday: str) -> int:
+            days = (d2 - d1).days + 1
+            return sum(1 for i in range(days) if (d1 + timedelta(days=i)).strftime("%A") == weekday)
+
+        count = count_weekday_between(start_date, end_date, str(weekday))
+
+        templates = [
+            f"How many {weekday}s are there from {start_date.strftime('%A, %B')} {start_date.day}, {year} to "
+            f"{end_date.strftime('%A, %B')} {end_date.day}, {year} (inclusive of both dates)? ",
+            f"Count the occurrences of {weekday} from {start_date.strftime('%A, %B')} {start_date.day} "
+            f"to {end_date.strftime('%A, %B')} {end_date.day}, {year} (including both start and end dates). ",
+            f"Between {start_date.strftime('%A, %B')} {start_date.day}, {year} and "
+            f"{end_date.strftime('%A, %B')} {end_date.day}, {year} "
+            f"(counting both dates), how many times does {weekday} occur? ",
+        ]
+
+        question = rng.choice(templates) + rng.choice(self.COUNT_QUESTION_TEMPLATES)
+        metadata = {
+            "task": CalendarTask.COUNT_DAYS.value,
+            "year": year,
+            "start_date": start_date.isoformat(),
+            "end_date": end_date.isoformat(),
+        }
+        return question, str(count), metadata
+
+    def _count_business_days(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        task: Count the business days (Monday-Friday) in an inclusive date range inside the configured year.
+        example:
+          "How many business days (Monday-Friday) are there from Friday, March 1, 2024 to
+           Friday, March 15, 2024 (inclusive of both dates)? Answer with a number."
+        """
+        year = self.config.year
+        year_start = date(year, 1, 1)
+        year_end = date(year, 12, 31)
+        start_date = self._random_date_between(rng, year_start, year_end)
+        max_delta = timedelta(days=self.config.offset_upper_bound)
+        # Clamp to year_end (as _count_days does): the templates print `year` for both dates.
+        end_date = self._random_date_between(rng, start_date, min(year_end, start_date + max_delta))
+
+        def business_days_between(d1: date, d2: date) -> int:
+            # Each full week holds exactly five business days; only the remainder is checked per day.
+            days = (d2 - d1).days + 1
+            weeks, remainder = divmod(days, 7)
+            count = weeks * 5
+            start_weekday = d1.weekday()
+            for i in range(remainder):
+                if (start_weekday + i) % 7 < 5:
+                    count += 1
+            return count
+
+        count = business_days_between(start_date, end_date)
+
+        templates = [
+            f"How many business days (Monday-Friday) are there from "
+            f"{start_date.strftime('%A, %B')} {start_date.day}, {year} to "
+            f"{end_date.strftime('%A, %B')} {end_date.day}, {year} "
+            f"(inclusive of both dates)? ",
+            f"Count the weekdays (excluding weekends) from "
+            f"{start_date.strftime('%A, %B')} {start_date.day} to "
+            f"{end_date.strftime('%A, %B')} {end_date.day}, {year} "
+            f"(including both start and end dates). ",
+            f"Between {start_date.strftime('%A, %B')} {start_date.day}, {year} and "
+            f"{end_date.strftime('%A, %B')} {end_date.day}, {year} "
+            f"(counting both dates), what's the total count of business days "
+            f"(Monday through Friday)? ",
+        ]
+
+        question = rng.choice(templates) + rng.choice(self.COUNT_QUESTION_TEMPLATES)
+        metadata = {
+            "task": CalendarTask.COUNT_BUSINESS_DAYS.value,
+            "start_date": start_date.isoformat(),
+            "end_date": end_date.isoformat(),
+        }
+        return question, str(count), metadata
+
+    def _is_leap_year(self, rng: random.Random) -> Tuple[str, str, dict]:
+        """
+        task: Given a year, determine whether it is a leap year.
+        example:
+         "Is 2024 a leap year? Answer with Yes or No."
+        """
+        semirange = self.config.leap_year_range // 2
+        year = rng.randint(self.config.year - semirange, self.config.year + semirange)
+        is_leap = calendar.isleap(year)
+        answer = "Yes" if is_leap else "No"
+        templates = [
+            f"Determine if the year {year} is a leap year. ",
+            f"Is {year} a leap year? ",
+            f"Tell me whether {year} is a leap year. ",
+        ]
+        question = rng.choice(templates) + "Answer with Yes or No."
+        metadata = {
+            "task": CalendarTask.IS_LEAP_YEAR.value,
+            "year": year,
+            "is_leap": is_leap,
+        }
+        return question, answer, metadata
+
+    def _random_date_for_year(self, rng: random.Random, year: int) -> date:
+        """Return a random date within the given year."""
+        month = rng.randint(1, 12)
+        _, last_day = calendar.monthrange(year, month)
+        day = rng.randint(1, last_day)
+        return date(year, month, day)
+
+    def _random_date_between(self, rng: random.Random, start_date: date, end_date: date) -> date:
+        """
+        Return a random date between start_date and end_date (inclusive).
+        Raises ValueError if start_date > end_date.
+        """
+        if start_date > end_date:
+            raise ValueError("start_date must be <= end_date")
+        delta = (end_date - start_date).days
+        random_days = rng.randint(0, delta)
+        return start_date + timedelta(days=random_days)
+
+    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+        # we suppose the answer is the last occurrence of the expected answer type
+        if answer is None:
+            return 0.0
+
+        oracle_answer = entry["answer"]
+        task = entry["metadata"]["task"]
+
+        if task in {
+            CalendarTask.WEEKDAY_OFFSET.value,
+            CalendarTask.WEEKDAY_OF_DATE_FROM_FIRST_DATE.value,
+            CalendarTask.WEEKDAY_OF_DATE.value,
+        }:
+            if not answer:
+                return 0.0
+
+            answer = answer.strip()
+            oracle_answer = oracle_answer
+            weekdays = {d.name.title() for d in Weekday}
+
+            if answer == oracle_answer:
+                return 1.0
+
+            if answer in weekdays:
+                return 0.1
+
+            if answer.title() in weekdays:
+                return 0.05
+
+            if answer.title() not in weekdays:
+                return 0.0
+
+            return 0.0
+
+        # denser reward for numerical tasks
+        elif task in {
+            CalendarTask.COUNT_BUSINESS_DAYS.value,
+            CalendarTask.COUNT_DAYS.value,
+            CalendarTask.RECURRING_EVENT_CALCULATIONS.value,
+        }:
+            try:
+                ans_num = int(answer.strip())
+                oracle_num = int(oracle_answer.strip())
+
+                if oracle_num == 0:
+                    return 1.0 if ans_num == 0 else 0.0
+
+                relative_error = abs(ans_num - oracle_num) / oracle_num
+                return max(0.0, math.exp(-5 * relative_error))
+
+            except (ValueError, AttributeError):
+                return 0.0
+
+        elif task == CalendarTask.IS_LEAP_YEAR.value:
+            if answer.strip().lower() == oracle_answer.lower():
+                return 1.0
+            return 0.0
+
+        return 0.0
+
+
+register_dataset("calendar_arithmetic", CalendarArithmeticDataset, CalendarArithmeticConfig)
diff --git a/tests/test_calendar_arithmetic.py b/tests/test_calendar_arithmetic.py
new file mode 100644
index 00000000..eca89ebb
--- /dev/null
+++ b/tests/test_calendar_arithmetic.py
@@ -0,0 +1,196 @@
+from datetime import date
+import pytest
+import math
+import calendar
+from reasoning_gym.arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset
+
+WEEKDAYS = [
+    "Monday",
+    "Tuesday",
+    "Wednesday",
+    "Thursday",
+    "Friday",
+    "Saturday",
+    "Sunday",
+]
+
+WEEKDAY_TASKS = {
+    "weekday_offset",
+    "weekday_of_date_from_first_day",
+    "weekday_of_date",
+}
+NUMERIC_TASKS = {
+    "count_days",
+    "count_business_days",
+}
+DAY_TASKS = {"recurring_event_day"}
+BOOLEAN_TASKS = {"is_leap_year"}
+CALENDAR_TASKS = WEEKDAY_TASKS | NUMERIC_TASKS | DAY_TASKS | BOOLEAN_TASKS
+
+
+def test_calendar_config_validation():
+    """Test that invalid CalendarArithmeticConfig parameters raise appropriate errors."""
+    with pytest.raises(ValueError):
+        config = CalendarArithmeticConfig(year=0)
+        config.validate()
+
+    with pytest.raises(ValueError):
+        config = CalendarArithmeticConfig(size=0)
+        config.validate()
+
+    with pytest.raises(ValueError):
+        config = CalendarArithmeticConfig(seed="not_an_int")
+        config.validate()
+
+    with pytest.raises(ValueError):
+        config = CalendarArithmeticConfig(tasks=["invalid_task"])
+
+
+def test_calendar_deterministic():
+    """Test that a dataset with a fixed seed produces the same items."""
+    config = CalendarArithmeticConfig(year=2024, seed=42, size=10)
+    ds1 = CalendarArithmeticDataset(config)
+    ds2 = CalendarArithmeticDataset(config)
+
+    for i in range(len(ds1)):
+        assert ds1[i] == ds2[i]
+
+
+def test_calendar_item_structure():
+    """Test that dataset items have the correct structure and fields."""
+    config = CalendarArithmeticConfig(year=2024, seed=42, size=50)
+    dataset = CalendarArithmeticDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert all(key in item for key in ["question", "answer", "metadata"])
+
+        assert isinstance(item["question"], str) and len(item["question"]) > 0
+        assert isinstance(item["answer"], str) and len(item["answer"]) > 0
+        assert "task" in item["metadata"]
+        assert item["metadata"]["task"] in CALENDAR_TASKS
+
+
+def test_calendar_answer_format():
+    """Test that answers have the correct format based on task type."""
+    config = CalendarArithmeticConfig(year=2024, seed=42, size=100)
+    dataset = CalendarArithmeticDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        task = item["metadata"]["task"]
+        answer = item["answer"]
+
+        if task in WEEKDAY_TASKS:
+            assert answer in WEEKDAYS
+
+        elif task in NUMERIC_TASKS:
+            try:
+                num = int(answer)
+                assert num >= 0, f"task {task} produced a negative count: {num}"
+            except ValueError:
+                pytest.fail(f"task {task} produced a non-integer answer: {answer}")
+
+        elif task in BOOLEAN_TASKS:
+            assert answer in ["Yes", "No"]
+
+        elif task in DAY_TASKS:
+            try:
+                num = int(answer)
+                year = item["metadata"]["year"]
+                month = item["metadata"]["month"]
+                _, last_day = calendar.monthrange(year, month)
+                assert 1 <= num <= last_day
+            except ValueError:
+                pytest.fail(f"task {task} produced a day outside expected range (1-{last_day}): {answer}")
+
+
+def test_scoring_function():
+    """Test scoring function for different answer types."""
+    config = CalendarArithmeticConfig(year=2024, seed=42, size=1)
+    dataset = CalendarArithmeticDataset(config)
+
+    weekday_item = {"answer": "Monday", "metadata": {"task": "weekday_offset"}}
+
+    assert dataset.score_answer("Monday", weekday_item) == 1.0
+    assert dataset.score_answer("Tuesday", weekday_item) == 0.1
+    assert dataset.score_answer("It is Monday", weekday_item) == 0.0
+    assert dataset.score_answer("no weekday here", weekday_item) == 0.0
+    assert dataset.score_answer(None, weekday_item) == 0.0
+
+    numeric_item = {"answer": "10", "metadata": {"task": "count_business_days"}}
+    assert dataset.score_answer("10", numeric_item) == 1.0
+    assert dataset.score_answer("15", numeric_item) == pytest.approx(math.exp(-5 * 0.5))
+    assert dataset.score_answer("no number", numeric_item) == 0.0
+    assert dataset.score_answer(None, numeric_item) == 0.0
+
+    boolean_item = {"answer": "Yes", "metadata": {"task": "is_leap_year"}}
+    assert dataset.score_answer("Yes", boolean_item) == 1.0
+    assert dataset.score_answer("yes", boolean_item) == 1.0
+    assert dataset.score_answer("nyes", boolean_item) == 0.0
+    assert dataset.score_answer(None, boolean_item) == 0.0
+
+
+def test_calendar_date_consistency():
+    """Test that dates in metadata are consistent with config year."""
+    config = CalendarArithmeticConfig(year=2024, seed=42, size=50)
+    dataset = CalendarArithmeticDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        task = item["metadata"]["task"]
+
+        if task == "weekday_offset":
+            start_date = date.fromisoformat(item["metadata"]["start_date"])
+            assert start_date.year == config.year
+
+        elif task in {"weekday_of_date_from_first_day", "weekday_of_date"}:
+            target_date = date.fromisoformat(item["metadata"]["target_date"])
+            assert target_date.year == config.year
+
+        elif task in {"count_business_days", "count_days"}:
+            start_date = date.fromisoformat(item["metadata"]["start_date"])
+            end_date = date.fromisoformat(item["metadata"]["end_date"])
+            assert start_date.year == config.year
+            assert end_date.year == config.year
+
+        elif task == "recurring_event_day":
+            meta_year = item["metadata"]["year"]
+            month = item["metadata"]["month"]
+            answer = int(item["answer"])
+            assert meta_year == config.year
+            assert 1 <= month <= 12
+            if answer != -1:
+                _, last_day = calendar.monthrange(meta_year, month)
+                assert 1 <= answer <= last_day
+
+        elif task == "is_leap_year":
+            year = item["metadata"]["year"]
+            assert config.year - 200 <= year <= config.year + 200
+            is_leap_metadata = item["metadata"]["is_leap"]
+            computed_is_leap = calendar.isleap(year)
+            assert is_leap_metadata == computed_is_leap
+
+
+def test_calendar_iteration():
+    """Test that dataset iteration works correctly and is deterministic."""
+    config = CalendarArithmeticConfig(year=2024, seed=42, size=5)
+    dataset = CalendarArithmeticDataset(config)
+
+    items = [item for item in dataset]
+    assert len(items) == config.size
+
+    first_iter = list(dataset)
+    second_iter = list(dataset)
+    assert first_iter == second_iter
+
+
+def test_task_case_sensitivity():
+    """Test that task names are case-insensitive."""
+    tasks = ["WEEKDAY_OFFSET", "Count_Business_Days"]
+    config = CalendarArithmeticConfig(tasks=tasks, size=10)
+    dataset = CalendarArithmeticDataset(config)
+
+    for item in dataset:
+        assert item["metadata"]["task"] in [t.lower() for t in tasks]

From 8b0f634f4c75c0560b00fbbc0b0ec59e762a9bf2 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 15:24:39 +0100
Subject: [PATCH 57/94] post merge formatting

---
 GALLERY.md                                    | 33 +++++++++++--------
 reasoning_gym/arithmetic/__init__.py          |  2 +-
 .../arithmetic/calendar_arithmetic.py         |  8 ++---
 tests/test_calendar_arithmetic.py             |  8 +++--
 4 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 15aec181..77127cd0 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -8,6 +8,7 @@ This gallery shows examples from all available datasets using their default conf
 - [basic_arithmetic](#basic_arithmetic)
 - [bf](#bf)
 - [caesar_cipher](#caesar_cipher)
+- [calendar_arithmetic](#calendar_arithmetic)
 - [chain_sum](#chain_sum)
 - [color_cube_rotation](#color_cube_rotation)
 - [countdown](#countdown)
@@ -256,30 +257,34 @@ Metadata: {'rotation': 17, 'cipher_text': 'ZW PFLI JKFDRTY ZJ FLK FW ZK DLJK SV
 ````
 
 ### calendar_arithmetic
-Generates various calendar arithmetic tasks
 Default configuration:
 ```python
-year = 2024
-tasks = None
+year = 2022
+tasks = ['weekday_offset', 'weekday_of_date', 'weekday_of_date_from_first_day', 'recurring_event_day', 'count_days', 'count_business_days', 'is_leap_year']
+offset_upper_bound = 100
+leap_year_range = 200
 seed = 42
 size = 500
 ```
 
 Example tasks:
-```
+````
 Example 1:
-Question: How many business days (Monday-Friday) are there from Tuesday, December 17, 2024 to Sunday, December 29, 2024 (inclusive of both dates)? Give the count numerically.
-Answer: 9
-Metadata: {'task': 'count_business_days', 'start_date': '2024-12-17', 'end_date': '2024-12-29'}
+Question: Between Sunday, February 27, 2022 and Wednesday, March 2, 2022 (counting both dates), what's the total count of business days (Monday through Friday)? Give the count numerically.
+Answer: 3
+Metadata: {'task': 'count_business_days', 'start_date': '2022-02-27', 'end_date': '2022-03-02'}
+
 Example 2:
-Question: Given that January 1 fell on a Sunday, which weekday occurs on August 17? State the weekday (Monday through Sunday).
-Answer: Friday
-Metadata: {'task': 'first_day_of_year', 'year': 2024, 'first_day': 'Sunday', 'target_date': '2024-08-17', 'delta_days': 229}
+Question: Starting from Monday, May 23, 2022, which weekday was it 98 days before? Write out the full weekday name.
+Answer: Monday
+Metadata: {'task': 'weekday_offset', 'start_date': '2022-05-23', 'offset_days': -98, 'target_date': '2022-02-14'}
+
 Example 3:
-Question: In August 2024, if an event recurs on the first Tuesday, what is the date (day of the month) of the event? Answer with a number. Answer with -1 if the ordinal does not exist in the month.
-Answer: 6
-Metadata: {'task': 'recurring_event_day', 'year': 2024, 'month': 8, 'ordinal': 'first', 'weekday': 'Tuesday'}
-```
+Question: If a meeting is scheduled on the last Saturday of September 2022, on which day of the month does it occur? Respond with just the number. Answer with -1 if the ordinal does not exist in the month.
+Answer: 24
+Metadata: {'task': 'recurring_event_day', 'year': 2022, 'month': 9, 'ordinal': 'last', 'weekday': 'Saturday'}
+
+````
 
 ### chain_sum
 Generates simple arithmetic tasks using only + and - operators
diff --git a/reasoning_gym/arithmetic/__init__.py b/reasoning_gym/arithmetic/__init__.py
index 18089e6d..9e1a5bc2 100644
--- a/reasoning_gym/arithmetic/__init__.py
+++ b/reasoning_gym/arithmetic/__init__.py
@@ -8,6 +8,7 @@ Arithmetic tasks for training reasoning capabilities:
 """
 
 from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig
+from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset
 from .chain_sum import ChainSum, ChainSumConfig
 from .fraction_simplification import FractionSimplificationConfig, FractionSimplificationDataset
 from .gcd import GCDConfig, GCDDataset
@@ -15,7 +16,6 @@ from .lcm import LCMConfig, LCMDataset
 from .leg_counting import LegCountingConfig, LegCountingDataset
 from .prime_factorization import PrimeFactorizationConfig, PrimeFactorizationDataset
 from .time_intervals import TimeIntervalsConfig, TimeIntervalsDataset
-from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset
 
 __all__ = [
     "BasicArithmeticDataset",
diff --git a/reasoning_gym/arithmetic/calendar_arithmetic.py b/reasoning_gym/arithmetic/calendar_arithmetic.py
index 61976d8d..78c42df8 100644
--- a/reasoning_gym/arithmetic/calendar_arithmetic.py
+++ b/reasoning_gym/arithmetic/calendar_arithmetic.py
@@ -1,10 +1,10 @@
+import calendar
+import math
+import random
 from dataclasses import dataclass
 from datetime import date, timedelta
-import calendar
-import random
-import math
-from typing import Optional, Tuple, List, Dict, Any
 from enum import Enum, auto
+from typing import Any, Dict, List, Optional, Tuple
 
 from ..factory import ProceduralDataset, register_dataset
 
diff --git a/tests/test_calendar_arithmetic.py b/tests/test_calendar_arithmetic.py
index eca89ebb..87f84781 100644
--- a/tests/test_calendar_arithmetic.py
+++ b/tests/test_calendar_arithmetic.py
@@ -1,7 +1,9 @@
-from datetime import date
-import pytest
-import math
 import calendar
+import math
+from datetime import date
+
+import pytest
+
 from reasoning_gym.arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset
 
 WEEKDAYS = [

From 8528e39764a7756e9541178d29bb46c5fcdd9323 Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Sun, 2 Feb 2025 15:27:08 +0000
Subject: [PATCH 58/94] added intermediate integration dataset generator

---
 .../algebra/intermediate_integration.py       | 234 ++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 reasoning_gym/algebra/intermediate_integration.py

diff --git a/reasoning_gym/algebra/intermediate_integration.py b/reasoning_gym/algebra/intermediate_integration.py
new file mode 100644
index 00000000..df0388a8
--- /dev/null
+++ b/reasoning_gym/algebra/intermediate_integration.py
@@ -0,0 +1,234 @@
+import random
+from dataclasses import dataclass
+from fractions import Fraction
+from typing import List, Optional
+
+import sympy
+
+from ..factory import ProceduralDataset, register_dataset
+
+
+@dataclass
+class IntermediateIntegrationConfig:
+    problem_types: tuple = ("substitution", "by_parts")
+    substitution_types: tuple = (
+        "linear",  # (ax + b)^n
+        "trigonometric",  # sin(x)**2 * cos(x)
+        "exponential",  # 2x * e^(x**2)
+        "radical",  # 3 * (3x + 2)^(1/2)
+    )
+
+    # Integration by parts problem categories
+    by_parts_types: tuple = (
+        "polynomial_exp_trig",  # e.g. x^2*e^x
+        "log_inverse_trig",  # e.g. ln(x) or arctan(x)
+        "cyclic",  # e.g. e^x*sinx requiring cyclic integration
+        "repeated_parts",  # Requires multiple integration by parts
+    )
+    seed: Optional[int] = None
+    size: int = 500
+
+    linear_lower_bound: int = 1  # coefficient of linear expression
+    linear_upper_bound: int = 10
+    min_linear_degree: int = 2  # degree of linear expression in substitution problem
+    max_linear_degree: int = 4
+    outer_constant_min: int = 1  # multiplicative constant to multiply integrand by
+    outer_constant_max: int = 3
+    min_poly_degree: int = 1  # degree of polynomial in by parts problem
+    max_poly_degree: int = 3
+    symbols: tuple = ("x", "X")
+    operators: tuple = (
+        "+",
+        "-",
+    )
+
+    def validate(self) -> None:
+        """Validate the configuration parameters of the integral problem"""
+        assert self.size > 0, "size must be positive"
+        assert self.linear_lower_bound > 0, "linear_lower_bound must be positive"
+        assert self.linear_upper_bound >= self.linear_lower_bound, "linear_upper_bound must be >= linear_lower_bound"
+        assert self.min_linear_degree > 0, "min_linear_degree must be positive"
+        assert self.max_linear_degree >= self.min_linear_degree, "max_linear_degree must be >= min_linear_degree"
+        assert self.outer_constant_min > 0, "outer_constant_min must be positive"
+        assert self.outer_constant_max >= self.outer_constant_min, "outer_constant_max must be >= outer_constant_min"
+        assert self.min_poly_degree > 0, "min_poly_degree must be positive"
+        assert self.max_poly_degree >= self.min_poly_degree, "max_poly_degree must be >= min_poly_degree"
+        assert all(op in ("+", "-") for op in self.operators), "invalid operator specified"
+        assert all(symbols in ("x", "X") for symbols in self.symbols), "invalid symbol specified"
+        assert all(t in ("substitution", "by_parts") for t in self.problem_types), "invalid problem type"
+        assert all(
+            t in ("linear", "trigonometric", "exponential", "radical") for t in self.substitution_types
+        ), "invalid substitution type"
+        assert all(
+            t in ("polynomial_exp_trig", "log_inverse_trig", "cyclic", "repeated_parts") for t in self.by_parts_types
+        ), "invalid by_parts type"
+
+
+class IntermediateIntegrationDataset(ProceduralDataset):
+    """Generates intermediate integration problem - either
+    by substitution or by parts"""
+
+    """Add multiplicative constant"""
+
+    def __init__(self, config: IntermediateIntegrationConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self.prompt_template = [
+            "Find the indefinite integral: ∫ {integrand} dx",
+            "Calculate the antiderivative: ∫ {integrand} dx",
+            "Evaluate the indefinite integral: ∫ {integrand} dx",
+        ]
+
+    def _get_outer_constant(self, rng: random.Random) -> int:
+        """Helper to generate signed outer constant from config"""
+        value = rng.randint(self.config.outer_constant_min, self.config.outer_constant_max)
+        return -value if rng.choice(self.config.operators) == "-" else value
+
+    def _generate_linear_substitution_problem(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate a linear substitution problem with outer constant"""
+        a = rng.randint(self.config.linear_lower_bound, self.config.linear_upper_bound)
+        b = rng.randint(self.config.linear_lower_bound, self.config.linear_upper_bound)
+
+        linear_function = a * x + (b if rng.choice(self.config.operators) == "+" else -b)
+        degree = rng.randint(self.config.min_linear_degree, self.config.max_linear_degree)
+
+        return self._get_outer_constant(rng) * linear_function**degree
+
+    def _generate_exponential_substitution(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate exponential substitution problem with outer constant"""
+        exponent_type = rng.choice(["linear", "quadratic"])
+
+        # Generate terms with signs
+        num_terms = 2 if exponent_type == "linear" else 3
+        terms = [
+            (-1 if rng.choice(self.config.operators) == "-" else 1)
+            * rng.randint(self.config.linear_lower_bound, self.config.linear_upper_bound)
+            for _ in range(num_terms)
+        ]
+
+        if exponent_type == "linear":
+            u = terms[0] * x + terms[1]
+            du_dx = terms[0]
+        else:  # Quadratic
+            u = terms[0] * x**2 + terms[1] * x + terms[2]
+            du_dx = 2 * terms[0] * x + terms[1]
+
+        return self._get_outer_constant(rng) * du_dx * sympy.exp(u)
+
+    def _generate_radical_substitution(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate radical substitution problem with outer constant"""
+
+        # Generate linear expression under radical: ax + b with possible negative coefficients
+        a = (-1 if rng.choice(self.config.operators) == "-" else 1) * rng.randint(
+            self.config.linear_lower_bound, self.config.linear_upper_bound
+        )
+        b = (-1 if rng.choice(self.config.operators) == "-" else 1) * rng.randint(
+            self.config.linear_lower_bound, self.config.linear_upper_bound
+        )
+
+        u = a * x + b
+        derivative = a  # du/dx
+
+        integrand = derivative * sympy.sqrt(u)
+        return self._get_outer_constant(rng) * integrand
+
+    def _generate_trigonometric_substitution(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate trigonometric substitution with outer constant"""
+        trig_func = rng.choice(["sin", "cos"])
+
+        # Generate signed coefficients
+        a = (-1 if rng.choice(self.config.operators) == "-" else 1) * rng.randint(
+            self.config.linear_lower_bound, self.config.linear_upper_bound
+        )
+        b = (-1 if rng.choice(self.config.operators) == "-" else 1) * rng.randint(
+            self.config.linear_lower_bound, self.config.linear_upper_bound
+        )
+
+        inner = a * x + b
+        power = rng.randint(1, 4)
+        if trig_func == "sin":
+            integrand = a * sympy.cos(inner) * sympy.sin(inner) ** power
+        else:
+            integrand = -a * sympy.sin(inner) * sympy.cos(inner) ** power
+        return self._get_outer_constant(rng) * integrand
+
+    def _generate_polynomial_exp_trig(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate polynomial × exponential/trigonometric integrand"""
+        poly_degree = rng.randint(self.config.min_poly_degree, self.config.max_poly_degree)
+
+        func_type = rng.choice(["exp", "sin", "cos"])
+        if func_type == "exp":
+            transcendental = sympy.exp(x)
+        else:
+            coefficient = rng.randint(1, 3)
+            transcendental = sympy.Function(func_type)(coefficient * x)
+
+        polynomial = x**poly_degree
+        integrand = polynomial * transcendental
+        return self._get_outer_constant(rng) * integrand
+
+    def _generate_log_inverse_trig(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate logarithmic or inverse trigonometric integrand"""
+        func_type = rng.choice(["log", "asin", "atan"])
+
+        if func_type == "log":
+            log_arg = x if rng.random() < 0.8 else x ** rng.randint(2, 3)
+            func = sympy.ln(log_arg)
+        else:
+            coefficient = rng.randint(1, 3)
+            func = sympy.Function(func_type)(coefficient * x)
+
+        return self._get_outer_constant(rng) * func
+
+    def _generate_cyclic_integral(self, rng: random.Random, x: sympy.Symbol) -> sympy.Expr:
+        """Generate cyclic integral (e.g., e^x * sinx)"""
+        func_pair = rng.choice(
+            [(sympy.exp(x), sympy.sin(x)), (sympy.exp(x), sympy.cos(x)), (sympy.sin(x), sympy.cos(x))]
+        )
+        integrand = func_pair[0] * func_pair[1]
+        return self._get_outer_constant(rng) * integrand
+
+    def _generate_repeated_parts(self, rng: random.Random, x: sympy.Symbol):
+        """Generate problem requiring multiple integration by parts"""
+        poly_degree = rng.randint(3, self.config.max_poly_degree)
+        transcendental = rng.choice([sympy.sin(x), sympy.cos(x), sympy.exp(x)])
+        integrand = x**poly_degree * transcendental
+        return self._get_outer_constant(rng) * integrand
+
+    def __getitem__(self, index: int):
+        """Generate either substitution or by-parts problem"""
+        rng = random.Random(self.seed + index)
+        problem_type = rng.choice(self.config.problem_types)
+        x = sympy.Symbol(rng.choice(self.config.symbols))
+
+        if problem_type == "substitution":
+            substitution_type = rng.choice(self.config.substitution_types)
+            if substitution_type == "linear":
+                integrand = self._generate_linear_substitution_problem(rng, x)
+            elif substitution_type == "trigonometric":
+                integrand = self._generate_trigonometric_substitution(rng, x)
+            elif substitution_type == "exponential":
+                integrand = self._generate_exponential_substitution(rng, x)
+            elif substitution_type == "radical":
+                integrand = self._generate_radical_substitution(rng, x)
+        else:
+            parts_type = rng.choice(self.config.by_parts_types)
+            if parts_type == "polynomial_exp_trig":
+                integrand = self._generate_polynomial_exp_trig(rng, x)
+            elif parts_type == "log_inverse_trig":
+                integrand = self._generate_log_inverse_trig(rng, x)
+            elif parts_type == "cyclic":
+                integrand = self._generate_cyclic_integral(rng, x)
+            elif parts_type == "repeated_parts":
+                integrand = self._generate_repeated_parts(rng, x)
+
+        answer = sympy.integrate(integrand, x)
+        return {
+            "question": rng.choice(self.prompt_template).format(integrand=integrand),
+            "answer": str(answer) + " + C",
+            "metadata": {
+                "integrand": str(integrand),
+                "problem_type": problem_type,
+                "variable": str(x),
+                "type": substitution_type if problem_type == "substitution" else parts_type,
+            },
+        }

From 0eb0247ebde7c1f9c8f81fb6451bb0513571abec Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Sun, 2 Feb 2025 15:28:52 +0000
Subject: [PATCH 59/94] added register dataset to script

---
 reasoning_gym/algebra/intermediate_integration.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/reasoning_gym/algebra/intermediate_integration.py b/reasoning_gym/algebra/intermediate_integration.py
index df0388a8..9e5c9528 100644
--- a/reasoning_gym/algebra/intermediate_integration.py
+++ b/reasoning_gym/algebra/intermediate_integration.py
@@ -1,7 +1,6 @@
 import random
 from dataclasses import dataclass
-from fractions import Fraction
-from typing import List, Optional
+from typing import Optional
 
 import sympy
 
@@ -232,3 +231,6 @@ class IntermediateIntegrationDataset(ProceduralDataset):
                 "type": substitution_type if problem_type == "substitution" else parts_type,
             },
         }
+
+
+register_dataset("intermediate_integration", IntermediateIntegrationDataset, IntermediateIntegrationConfig)

From 420a44bd79444a0e7cf4fc5931ce6564c42fdd9b Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Sun, 2 Feb 2025 15:29:24 +0000
Subject: [PATCH 60/94] added impl of simple integration dataset generator

---
 reasoning_gym/algebra/simple_integration.py | 80 +++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 reasoning_gym/algebra/simple_integration.py

diff --git a/reasoning_gym/algebra/simple_integration.py b/reasoning_gym/algebra/simple_integration.py
new file mode 100644
index 00000000..e524e3ef
--- /dev/null
+++ b/reasoning_gym/algebra/simple_integration.py
@@ -0,0 +1,80 @@
+import random
+from dataclasses import dataclass
+from fractions import Fraction
+from typing import Optional
+
+import sympy
+
+from ..factory import ProceduralDataset, register_dataset
+
+
+@dataclass
+class SimpleIntegrationConfig:
+    min_terms: int = 2  # fewest terms in a generated polynomial (0 is accepted by validate)
+    max_terms: int = 5  # most terms in a generated polynomial
+    min_degree: int = 1  # lowest exponent a term may get (validate allows down to -10)
+    max_degree: int = 10  # highest exponent a term may get
+    min_bounds: int = 1  # lower bound (inclusive) of the coefficient numerator
+    max_bounds: int = 10  # upper bound (inclusive) of the coefficient numerator
+    operators: tuple = ("+", "-")  # signs a term may be given; only "+" and "-" are valid
+    symbols: tuple = ("x", "X")  # variable names the generator picks from
+    seed: Optional[int] = None
+    size: int = 500
+
+    def validate(self) -> None:
+        """Validate the configuration parameters of the integral problem; raises AssertionError when one is invalid"""
+        assert self.min_bounds > 0, "min_bounds must be positive"
+        assert self.max_bounds >= self.min_bounds, "max_bounds must be >= min_bounds"
+        assert self.min_terms >= 0, "min_terms must be non-negative"
+        assert self.max_terms >= self.min_terms, "max_terms must be >= min_terms"
+        assert self.min_degree >= -10, "min_degree must be >= -10"
+        assert self.max_degree >= self.min_degree, "max_degree must be >= min_degree"
+        assert all(op in ("+", "-") for op in self.operators), "invalid operator specified"
+
+
+class SimpleIntegrationDataset(ProceduralDataset):
+    """Generates single-variable integration problems by differentiating a randomly generated polynomial"""
+
+    def __init__(self, config: SimpleIntegrationConfig):
+        self._prompt_templates = [
+            "Find the indefinite integral: ∫ {integrand} d{variable}",
+            "Calculate the antiderivative: ∫ {integrand} d{variable}",
+            "Evaluate the indefinite integral: ∫ {integrand} d{variable}",
+        ]
+        super().__init__(config=config, seed=config.seed, size=config.size)
+
+    def _generate_coefficient(self, rng: random.Random) -> Fraction:
+        """Return a coefficient with numerator in [min_bounds, max_bounds]: whole half the time, else over a denominator in 2..10"""
+        if rng.choice([True, False]):  # 50% chance for integer
+            return Fraction(rng.randint(self.config.min_bounds, self.config.max_bounds), 1)
+        denominator = rng.randint(2, 10)
+        return Fraction(rng.randint(self.config.min_bounds, self.config.max_bounds), denominator)
+
+    def _generate_polynomial(self, rng: random.Random) -> tuple[sympy.Symbol, sympy.Expr]:
+        """Return (symbol, expression) where the expression is a sum of randomly signed coefficient * symbol**degree terms"""
+        terms = []
+        x = sympy.Symbol(rng.choice(self.config.symbols))
+
+        for _ in range(rng.randint(self.config.min_terms, self.config.max_terms)):
+            coefficient = self._generate_coefficient(rng)
+            degree = rng.randint(self.config.min_degree, self.config.max_degree)
+            operator = rng.choice(self.config.operators)
+            term = coefficient * x**degree
+            if operator == "-":
+                term = -term
+            terms.append(term)
+        return x, sum(terms)
+
+    def __getitem__(self, idx: int) -> dict:
+        rng = random.Random(self.seed + idx)  # per-item RNG: the same seed and idx always give the same item
+        symbol, polynomial = self._generate_polynomial(rng)
+        derivative = sympy.diff(polynomial, symbol)  # the question asks to integrate this back to the polynomial
+
+        return {
+            "question": rng.choice(self._prompt_templates).format(integrand=derivative, variable=symbol),
+            "answer": str(polynomial) + " + C",
+            "metadata": {"integrand": str(derivative), "variable": str(symbol), "antiderivative": str(polynomial)},
+        }
+
+
+register_dataset("simple_integration", SimpleIntegrationDataset, SimpleIntegrationConfig)

From 76faad9dcfb412ea1f1488a856573101d10d57b1 Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Sun, 2 Feb 2025 15:30:01 +0000
Subject: [PATCH 61/94] created test script for intermediate integration
 dataset generator

---
 tests/test_intermediate_integration.py | 109 +++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 tests/test_intermediate_integration.py

diff --git a/tests/test_intermediate_integration.py b/tests/test_intermediate_integration.py
new file mode 100644
index 00000000..fc35f387
--- /dev/null
+++ b/tests/test_intermediate_integration.py
@@ -0,0 +1,109 @@
+"""Tests for intermediate integration task generation"""
+
+import pytest
+import sympy
+from sympy.parsing.sympy_parser import parse_expr
+
+from reasoning_gym.algebra.intermediate_integration import IntermediateIntegrationConfig, IntermediateIntegrationDataset
+
+
+def test_intermediate_integration_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(problem_types=["invalid_problem_type"])
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(substitution_types=["invalid_substitution_type"])
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(by_parts_types=["invalid_by_parts_type"])
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(linear_lower_bound=2, linear_upper_bound=1)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(linear_lower_bound=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(min_linear_degree=5, max_linear_degree=1)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(min_linear_degree=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(outer_constant_min=5, outer_constant_max=1)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(outer_constant_min=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(min_poly_degree=5, max_poly_degree=1)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(min_poly_degree=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(symbols=("x", "y"))
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = IntermediateIntegrationConfig(operators=("+", "-", "*", "/"))
+        config.validate()
+
+
+def test_intermediate_integration_dataset_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = IntermediateIntegrationConfig(seed=42, size=10)
+    dataset1 = IntermediateIntegrationDataset(config)
+    dataset2 = IntermediateIntegrationDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_intermediate_integration_dataset_items():
+    """Test that dataset items are valid"""
+    config = IntermediateIntegrationConfig(seed=42, size=10)
+    dataset = IntermediateIntegrationDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        assert "integrand" in item["metadata"]
+        assert "problem_type" in item["metadata"]
+        assert "variable" in item["metadata"]
+        assert "type" in item["metadata"]
+
+        # verify answer is mathematical expression
+        answer = item["answer"]
+        answer = answer.replace(" + C", "")
+        assert isinstance(parse_expr(answer), sympy.Expr)
+
+
+def test_solution_verification():
+    """Test for solution verification of each answer"""
+    config = IntermediateIntegrationConfig(seed=42, size=10)
+    dataset = IntermediateIntegrationDataset(config)
+
+    for item in dataset:
+        integrand = parse_expr(item["metadata"]["integrand"])
+        variable = sympy.Symbol(item["metadata"]["variable"])
+        answer = parse_expr(item["answer"].replace(" + C", ""))
+
+        # Verify that the derivative of the answer equals the integrand
+        assert sympy.simplify(sympy.diff(answer, variable) - integrand) == 0

From 40e53b8bca11ec176fdc1ee27be689d6b53aaba0 Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Sun, 2 Feb 2025 15:30:22 +0000
Subject: [PATCH 62/94] added implementation of simple integration dataset

---
 tests/test_simple_integration.py | 85 ++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 tests/test_simple_integration.py

diff --git a/tests/test_simple_integration.py b/tests/test_simple_integration.py
new file mode 100644
index 00000000..8d64cc25
--- /dev/null
+++ b/tests/test_simple_integration.py
@@ -0,0 +1,85 @@
+import random
+from fractions import Fraction
+
+import pytest
+import sympy
+from sympy.parsing.sympy_parser import parse_expr
+
+from reasoning_gym.algebra.simple_integration import SimpleIntegrationConfig, SimpleIntegrationDataset
+
+
+def test_simple_integration_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(min_bounds=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(max_bounds=5, min_bounds=10)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(min_terms=-1)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(max_terms=2, min_terms=5)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(min_degree=-11)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(max_degree=3, min_degree=5)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = SimpleIntegrationConfig(operators=("+", "-", "*"))
+        config.validate()
+
+
+def test_simple_integration_dataset_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = SimpleIntegrationConfig(seed=42, size=10)
+    dataset1 = SimpleIntegrationDataset(config)
+    dataset2 = SimpleIntegrationDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_simple_integration_dataset_items():
+    """Test that dataset items are valid"""
+    config = SimpleIntegrationConfig(seed=42, size=10)
+    dataset = SimpleIntegrationDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        assert "integrand" in item["metadata"]
+        assert "variable" in item["metadata"]
+        assert "antiderivative" in item["metadata"]
+
+        # Verify answer is a mathematical expression
+        answer = item["answer"]
+        answer = answer.replace(" + C", "")
+        assert isinstance(parse_expr(answer), sympy.Expr)
+
+
+def test_simple_integration_solution_verification():
+    """Test for solution verification of each answer"""
+    config = SimpleIntegrationConfig(seed=42, size=10)
+    dataset = SimpleIntegrationDataset(config)
+
+    for item in dataset:
+        integrand = parse_expr(item["metadata"]["integrand"])
+        variable = sympy.Symbol(item["metadata"]["variable"])
+        answer = parse_expr(item["answer"].replace(" + C", ""))
+
+        # Verify that the derivative of the answer equals the integrand
+        assert sympy.simplify(sympy.diff(answer, variable) - integrand) == 0

From 166e3d5f0dcb2d02b385150277b30ae2e7f70d8e Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:33:01 +0100
Subject: [PATCH 63/94] feat: Add arc_1d.py module for one-dimensional abstract
 reasoning challenges

---
 reasoning_gym/cognition/arc_1d.py | 55 +++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 reasoning_gym/cognition/arc_1d.py

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
new file mode 100644
index 00000000..ed183df8
--- /dev/null
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -0,0 +1,55 @@
+from random import Random
+from typing import Optional, Dict, List
+
+
+def gen_field(size: int, color: int = 0) -> List[int]:
+    """Generate a field of given size filled with specified color (default 0)."""
+    return [color] * size
+
+
+def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
+    """Write a block into a field at given position."""
+    result = field.copy()
+    for i, color in enumerate(block):
+        result[pos + i] = color
+    return result
+
+
+def task_move_n_pix(size: int, move_pix: int, solid: bool, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """
+    Generate a task where a block is moved to the right by move_pix pixels.
+
+    Args:
+        size: Size of the field
+        move_pix: Number of pixels to move the block
+        solid: If True, block is single color; if False, block has random colors
+        rng: Random number generator
+
+    Returns:
+        Dictionary with 'input' and 'output' fields containing the puzzle,
+        or None if valid puzzle cannot be generated
+    """
+    # Validate size constraints
+    if size <= move_pix + 1:
+        return None
+
+    # Generate block size and position
+    block_size = rng.randint(1, size - move_pix - 1)
+    block_pos = rng.randint(0, size - block_size - move_pix)
+
+    # Generate the block
+    if solid:
+        # For solid blocks, use single random color (1-9)
+        color = rng.randint(1, 9)
+        block = [color] * block_size
+    else:
+        # For non-solid blocks, each position gets random color (1-9)
+        block = [rng.randint(1, 9) for _ in range(block_size)]
+
+    # Create input field with block at initial position
+    question = write_block(block_pos, block, gen_field(size))
+
+    # Create output field with block moved right by move_pix
+    answer = write_block(block_pos + move_pix, block, gen_field(size))
+
+    return {"input": question, "output": answer}

From 4c22fca7ed2fb1ae4a755412a8dc9e6c0e0b5a32 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:33:02 +0100
Subject: [PATCH 64/94] feat: Add new 1D task generation functions to arc_1d.py

---
 reasoning_gym/cognition/arc_1d.py | 214 +++++++++++++++++++++++++++---
 1 file changed, 193 insertions(+), 21 deletions(-)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index ed183df8..433b4856 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -16,40 +16,212 @@ def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
 
 
 def task_move_n_pix(size: int, move_pix: int, solid: bool, rng: Random) -> Optional[Dict[str, List[int]]]:
-    """
-    Generate a task where a block is moved to the right by move_pix pixels.
-
-    Args:
-        size: Size of the field
-        move_pix: Number of pixels to move the block
-        solid: If True, block is single color; if False, block has random colors
-        rng: Random number generator
-
-    Returns:
-        Dictionary with 'input' and 'output' fields containing the puzzle,
-        or None if valid puzzle cannot be generated
-    """
-    # Validate size constraints
+    """Generate a task where a block is moved to the right by move_pix pixels."""
     if size <= move_pix + 1:
         return None
 
-    # Generate block size and position
     block_size = rng.randint(1, size - move_pix - 1)
     block_pos = rng.randint(0, size - block_size - move_pix)
 
-    # Generate the block
     if solid:
-        # For solid blocks, use single random color (1-9)
         color = rng.randint(1, 9)
         block = [color] * block_size
     else:
-        # For non-solid blocks, each position gets random color (1-9)
         block = [rng.randint(1, 9) for _ in range(block_size)]
 
-    # Create input field with block at initial position
     question = write_block(block_pos, block, gen_field(size))
-
-    # Create output field with block moved right by move_pix
     answer = write_block(block_pos + move_pix, block, gen_field(size))
 
     return {"input": question, "output": answer}
+
+def task_move_n_pix_wrapped(size: int, move_pix: int, solid: bool, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block is shifted right by move_pix cells, wrapping around the end of the field."""
+    block_size = rng.randint(1, size)
+    block_pos = rng.randint(0, size)  # inclusive upper bound: pos == size wraps to index 0 via the modulo below
+    
+    if solid:
+        color = rng.randint(1, 9)
+        block = [color] * block_size
+    else:
+        block = [rng.randint(1, 9) for _ in range(block_size)]
+
+    question = gen_field(size)
+    answer = gen_field(size)
+    
+    for i, color in enumerate(block):
+        question[(block_pos + i) % size] = color
+        answer[(block_pos + move_pix + i) % size] = color
+
+    return {"input": question, "output": answer}
+
+def task_gravity(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where all non-zero cells slide to the left end of the field, keeping their relative order."""
+    density = 0.5  # probability that a cell is non-zero
+    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
+    
+    non_zero = [x for x in question if x != 0]
+    answer = non_zero + [0] * (size - len(non_zero))
+    
+    return {"input": question, "output": answer}
+
+def task_gravity_counting(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the non-zero cells are counted and the count is shown as that many 1s at the left end."""
+    density = 0.5  # probability that a cell is non-zero
+    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
+    
+    count = sum(1 for x in question if x != 0)
+    answer = [1] * count + [0] * (size - count)
+    
+    return {"input": question, "output": answer}
+
+def task_gravity_antigravity(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where every 2 gathers at the left end and every 1 gathers at the right end of the field."""
+    density = 0.5  # probability that a cell is non-zero (then colored 1 or 2)
+    question = [rng.randint(1, 2) if rng.random() < density else 0 for _ in range(size)]
+    
+    color1 = [x for x in question if x == 1]
+    color2 = [x for x in question if x == 2]
+    answer = [2] * len(color2) + [0] * (size - len(color1) - len(color2)) + [1] * len(color1)
+    
+    return {"input": question, "output": answer}
+
+def task_block_touch_dot(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block slides toward a dot until it is adjacent to it; None if it fits on neither side."""
+    dot_color = 1
+    block_color = rng.randint(2, 9)
+    
+    block_size = rng.randint(1, size)
+    dot_pos = rng.randint(0, size - 1)  # randint is inclusive, so size itself would index past the field
+    
+    can_place_left = dot_pos >= block_size
+    can_place_right = dot_pos + block_size < size
+    
+    if not (can_place_left or can_place_right):
+        return None
+        
+    if can_place_left and can_place_right:
+        side = rng.choice(["left", "right"])
+    elif can_place_left:
+        side = "left"
+    else:
+        side = "right"
+        
+    if side == "left":
+        q_block_pos = rng.randint(0, dot_pos - block_size)
+        a_block_pos = dot_pos - block_size  # block's last cell ends up directly left of the dot
+    else:
+        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
+        a_block_pos = dot_pos + 1  # block's first cell ends up directly right of the dot
+        
+    question = gen_field(size)
+    question[dot_pos] = dot_color
+    question = write_block(q_block_pos, [block_color] * block_size, question)
+    
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    answer = write_block(a_block_pos, [block_color] * block_size, answer)
+    
+    return {"input": question, "output": answer}
+
+def task_block_touch_dot_n_pix(size: int, move_pix: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block moves up to move_pix cells toward a dot, stopping once adjacent; None if it fits on neither side."""
+    dot_color = 2
+    block_color = rng.randint(3, 9)
+    
+    block_size = rng.randint(1, size)
+    dot_pos = rng.randint(0, size - 1)  # randint is inclusive, so size itself would index past the field
+    
+    can_place_left = dot_pos >= block_size
+    can_place_right = dot_pos + block_size < size
+    
+    if not (can_place_left or can_place_right):
+        return None
+        
+    if can_place_left and can_place_right:
+        side = rng.choice(["left", "right"])
+    elif can_place_left:
+        side = "left"
+    else:
+        side = "right"
+        
+    if side == "left":
+        q_block_pos = rng.randint(0, dot_pos - block_size)
+        distance = (dot_pos - block_size) - q_block_pos  # free cells between block and dot
+        move = min(distance, move_pix)  # never move past the cell adjacent to the dot
+        a_block_pos = q_block_pos + move
+    else:
+        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
+        distance = q_block_pos - (dot_pos + 1)  # free cells between dot and block
+        move = min(distance, move_pix)  # never move past the cell adjacent to the dot
+        a_block_pos = q_block_pos - move
+        
+    question = gen_field(size)
+    question[dot_pos] = dot_color
+    question = write_block(q_block_pos, [block_color] * block_size, question)
+    
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    answer = write_block(a_block_pos, [block_color] * block_size, answer)
+    
+    return {"input": question, "output": answer}
+
+def task_block_scale_to_dot(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block grows from its far end until it touches a dot; None if it fits on neither side."""
+    dot_color = 2
+    block_color = rng.randint(3, 9)
+    
+    block_size = rng.randint(1, size)
+    dot_pos = rng.randint(0, size - 1)  # randint is inclusive, so size itself would index past the field
+    
+    can_place_left = dot_pos >= block_size
+    can_place_right = dot_pos + block_size < size
+    
+    if not (can_place_left or can_place_right):
+        return None
+        
+    if can_place_left and can_place_right:
+        side = rng.choice(["left", "right"])
+    elif can_place_left:
+        side = "left"
+    else:
+        side = "right"
+        
+    if side == "left":
+        q_block_pos = rng.randint(0, dot_pos - block_size)
+        new_size = dot_pos - q_block_pos + 1  # span from the block's left end up to and including dot_pos
+        a_block_pos = q_block_pos
+    else:
+        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
+        new_size = (q_block_pos + block_size) - dot_pos  # span from dot_pos up to the block's right end
+        a_block_pos = dot_pos
+        
+    question = gen_field(size)
+    question[dot_pos] = dot_color
+    question = write_block(q_block_pos, [block_color] * block_size, question)
+    
+    answer = gen_field(size)
+    answer = write_block(a_block_pos, [block_color] * new_size, answer)
+    answer[dot_pos] = dot_color  # drawn last: the stretched span includes dot_pos and would otherwise erase the dot
+    
+    return {"input": question, "output": answer}
+
+def task_two_points_and_fill(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the gap between two cells of the same color is filled with that color; None if both picks coincide."""
+    color = rng.randint(1, 9)
+    
+    pos1 = rng.randint(0, size - 1)
+    pos2 = rng.randint(0, size - 1)
+    if pos1 == pos2:
+        return None
+        
+    pos1, pos2 = min(pos1, pos2), max(pos1, pos2)  # order the end points so pos1 < pos2
+    
+    question = gen_field(size)
+    question[pos1] = color
+    question[pos2] = color
+    
+    answer = question.copy()
+    for i in range(pos1, pos2 + 1):
+        answer[i] = color
+    
+    return {"input": question, "output": answer}

From 9dac01fda7421ad928e5ddc6277eb24b3101aff0 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:34:52 +0100
Subject: [PATCH 65/94] feat: Add new 1D ARC task generation functions

---
 reasoning_gym/cognition/arc_1d.py | 215 ++++++++++++++++++++++++++++++
 1 file changed, 215 insertions(+)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index 433b4856..bdbf9253 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -225,3 +225,218 @@ def task_two_points_and_fill(size: int, rng: Random) -> Optional[Dict[str, List[
         answer[i] = color
     
     return {"input": question, "output": answer}
+
+def task_reflect_block_with_border_pixel(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a solid block with one differently colored end cell is mirrored in place; None if infeasible."""
+    if size < 2:  # the block needs two cells; randint(2, size) would raise ValueError on an empty range
+        return None
+    block_size = rng.randint(2, size)
+        
+    c1 = rng.randint(1, 9)
+    c2 = rng.randint(1, 9)
+    if c1 == c2:  # the border cell must be distinguishable from the body
+        return None
+        
+    side = "left" if rng.random() < 0.5 else "right"
+    pos = rng.randint(0, size - block_size)
+    
+    block = [c1] * block_size
+    if side == "left":
+        block[0] = c2
+    else:
+        block[block_size - 1] = c2
+        
+    question = write_block(pos, block, gen_field(size))
+    reversed_block = block[::-1]  # Reverse the block
+    answer = write_block(pos, reversed_block, gen_field(size))
+    
+    return {"input": question, "output": answer}
+
+def task_reflect_block_with_border_pixel_random(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a randomly colored block with a replaced end cell is mirrored in place; None if infeasible."""
+    if size < 2:  # the block needs two cells; randint(2, size) would raise ValueError on an empty range
+        return None
+    block_size = rng.randint(2, size)
+        
+    side = "left" if rng.random() < 0.5 else "right"
+    pos = rng.randint(0, size - block_size)
+    
+    block = [rng.randint(1, 9) for _ in range(block_size)]
+    border_color = rng.randint(1, 9)
+    
+    if side == "left":
+        if block[0] == border_color:  # reject when the border color equals the cell it replaces
+            return None
+        block[0] = border_color
+    else:
+        if block[block_size - 1] == border_color:  # reject when the border color equals the cell it replaces
+            return None
+        block[block_size - 1] = border_color
+        
+    question = write_block(pos, block, gen_field(size))
+    reversed_block = block[::-1]  # Reverse the block
+    answer = write_block(pos, reversed_block, gen_field(size))
+    
+    return {"input": question, "output": answer}
+
+def task_reflect_block_around_dot(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a solid block is mirrored to the other side of a dot; None if block and dot overlap or the mirror image leaves the field."""
+    dot_color = 2
+    
+    dot_pos = rng.randint(0, size)  # inclusive: dot_pos == size can be drawn but is always rejected by the bounds check below
+    block_size = rng.randint(1, size)
+    block_pos = rng.randint(0, size - block_size)
+    block_end = block_pos + block_size - 1
+    
+    # Check if block is strictly to left or right of dot
+    strictly_left = block_end < dot_pos
+    strictly_right = block_pos > dot_pos
+    
+    if not (strictly_left or strictly_right):
+        return None
+        
+    block_color = rng.randint(3, 9)  # Different from dot color
+    block = [block_color] * block_size
+    
+    # Calculate reflection bounds
+    min_reflect = 2 * dot_pos - block_end
+    max_reflect = 2 * dot_pos - block_pos
+    if min_reflect < 0 or max_reflect >= size:
+        return None
+        
+    question = gen_field(size)
+    question = write_block(block_pos, block, question)
+    question[dot_pos] = dot_color
+    
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    for i in range(block_size):
+        reflect_idx = 2 * dot_pos - (block_pos + i)  # mirror image of cell block_pos + i about dot_pos
+        answer[reflect_idx] = block[i]
+    
+    return {"input": question, "output": answer}
+
+def task_block_and_noise_remove(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where isolated same-colored noise cells around a block are erased; None if the noise cannot be placed."""
+    if size < 2:  # the block needs two cells; randint(2, size) would raise ValueError on an empty range
+        return None
+    block_size = rng.randint(2, size)
+        
+    block_pos = rng.randint(0, size - block_size)
+    color = rng.randint(1, 9)
+    
+    # Create field with block
+    field = gen_field(size)
+    for i in range(block_size):
+        field[block_pos + i] = color
+        
+    # Track forbidden positions for noise: the block itself plus one cell on each side
+    forbidden = [False] * size
+    for i in range(block_pos, block_pos + block_size):
+        forbidden[i] = True
+    if block_pos > 0:
+        forbidden[block_pos - 1] = True
+    if block_pos + block_size < size:
+        forbidden[block_pos + block_size] = True
+        
+    # Add noise: single cells in the block's color that touch neither the block nor each other
+    noise_count = rng.randint(1, 3)
+    noise_positions = []
+    
+    for _ in range(noise_count):
+        allowed = [i for i in range(size) if not forbidden[i]]
+        if not allowed:
+            break
+        noise_pos = rng.choice(allowed)
+        noise_positions.append(noise_pos)
+        field[noise_pos] = color
+        forbidden[noise_pos] = True
+        if noise_pos > 0:
+            forbidden[noise_pos - 1] = True
+        if noise_pos + 1 < size:
+            forbidden[noise_pos + 1] = True
+            
+    if len(noise_positions) < noise_count:  # ran out of free cells before placing all noise
+        return None
+        
+    question = field
+    answer = field.copy()
+    for pos in noise_positions:
+        answer[pos] = 0
+        
+    return {"input": question, "output": answer}
+
+def task_block_and_noise_remove_inside(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where off-color cells inside a block of at least 6 cells are repainted in the block color; None if size <= 6."""
+    if size <= 6:
+        return None
+        
+    block_size = rng.randint(6, size)
+    if block_size > size:  # never true: randint(6, size) cannot exceed size
+        return None
+        
+    block_pos = rng.randint(0, size - block_size)
+    color = rng.randint(1, 9)
+    
+    # Create field with block
+    field = gen_field(size)
+    for i in range(block_size):
+        field[block_pos + i] = color
+        
+    # Add noise inside block
+    max_noise = max(1, (block_size // 2) - 1)  # keeps noise cells fewer than half of the block
+    noise_count = rng.randint(1, max_noise)
+    
+    positions = list(range(block_size))
+    rng.shuffle(positions)
+    noise_positions = positions[:noise_count]  # distinct offsets within the block
+    
+    for offset in noise_positions:
+        pos = block_pos + offset
+        noise_color = rng.randint(1, 9)
+        while noise_color == color:  # redraw until the noise differs from the block color
+            noise_color = rng.randint(1, 9)
+        field[pos] = noise_color
+        
+    question = field
+    answer = field.copy()
+    for offset in noise_positions:
+        answer[block_pos + offset] = color
+        
+    return {"input": question, "output": answer}
+
+def task_copy_block_to_dots(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a 3- or 5-cell block at the left edge is copied, centered, onto each same-colored dot; None if no dot fits."""
+    block_size = 3 if rng.random() < 0.5 else 5
+    if block_size >= size:
+        return None
+        
+    color = rng.randint(1, 9)
+    block = [color] * block_size
+    
+    # Generate dots with minimum distance to prevent overlap
+    min_gap = block_size
+    dot_positions = []
+    pos = block_size + block_size//2 + 1  # first dot position whose centered copy leaves one empty cell after the template block
+    
+    while pos <= size - block_size:
+        if rng.random() < 0.5:  # Control dot density
+            dot_positions.append(pos)
+            pos += min_gap
+        pos += 1
+        
+    if not dot_positions:
+        return None
+        
+    question = gen_field(size)
+    question = write_block(0, block, question)
+    for pos in dot_positions:
+        question[pos] = color
+        
+    answer = gen_field(size)
+    answer = write_block(0, block, answer)
+    for pos in dot_positions:
+        block_start = pos - block_size//2
+        answer = write_block(block_start, block, answer)
+        
+    return {"input": question, "output": answer}

From dc11f88c0bc119c0a5703bc09e921cdb4e1b2df4 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:36:19 +0100
Subject: [PATCH 66/94] feat: Add new 1D ARC task generation functions for
 block manipulation

---
 reasoning_gym/cognition/arc_1d.py | 208 ++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index bdbf9253..24d30455 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -440,3 +440,211 @@ def task_copy_block_to_dots(size: int, rng: Random) -> Optional[Dict[str, List[i
         answer = write_block(block_start, block, answer)
         
     return {"input": question, "output": answer}
+
+def task_copy_block_to_dots_colors(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where each dot grows into a block-sized copy in the dot's own color; returns None if the field is too small or no dot was drawn."""
+    block_size = 3 if rng.random() < 0.5 else 5  # odd widths, so a copy has a well-defined center
+    if block_size >= size:
+        return None
+        
+    block_color = rng.randint(1, 9)
+    block = [block_color] * block_size
+    
+    # Pick dot centers left to right, spaced so that the copies placed around them cannot overlap
+    min_gap = block_size
+    dot_positions = []
+    dot_colors = []  # dot_colors[i] is the color of the dot at dot_positions[i]
+    pos = block_size + block_size//2 + 1
+    
+    while pos < size - block_size:
+        if rng.random() < 0.5:
+            dot_color = rng.randint(1, 9)  # drawn independently, so it may coincide with block_color
+            dot_positions.append(pos)
+            dot_colors.append(dot_color)
+            pos += min_gap  # with the += 1 below, the next center is at least block_size + 1 cells away
+        pos += 1
+        
+    if not dot_positions:
+        return None
+        
+    question = gen_field(size)  # NOTE(review): gen_field/write_block live outside this hunk; presumably "empty field" and "write block at start index, return field" - confirm
+    question = write_block(0, block, question)
+    for i, pos in enumerate(dot_positions):
+        question[pos] = dot_colors[i]
+        
+    answer = gen_field(size)
+    answer = write_block(0, block, answer)
+    for i, pos in enumerate(dot_positions):
+        block_start = pos - block_size//2
+        colored_block = [dot_colors[i]] * block_size  # same width as the template, but in the dot's color
+        answer = write_block(block_start, colored_block, answer)
+        
+    return {"input": question, "output": answer}
+
+def task_paint_biggest_block(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the single largest block is repainted with color 1; returns None without at least two blocks or when the largest size is tied."""
+    target_color = 1
+    initial_color = rng.randint(2, 9)  # drawn from 2..9, so it never equals target_color
+    
+    # Scatter same-colored blocks of width 2..6 from left to right
+    question = gen_field(size)
+    blocks = []  # (start index, width) of every block written
+    pos = 0
+    
+    while pos < size:
+        if rng.random() < 0.4 and size - pos >= 2:
+            block_size = rng.randint(2, min(size - pos, 6))
+            blocks.append((pos, block_size))
+            for i in range(block_size):
+                question[pos + i] = initial_color
+            pos += block_size + 1  # skip one cell so consecutive blocks stay separated
+        else:
+            pos += 1
+            
+    if len(blocks) < 2:
+        return None
+        
+    # Find biggest block
+    biggest_pos, biggest_size = max(blocks, key=lambda x: x[1])
+    
+    # Reject ties: the task needs exactly one block of maximal width
+    biggest_count = sum(1 for _, size in blocks if size == biggest_size)  # `size` here is local to the generator expression
+    if biggest_count > 1:
+        return None
+        
+    answer = question.copy()
+    for i in range(biggest_size):
+        answer[biggest_pos + i] = target_color
+        
+    return {"input": question, "output": answer}
+
+def task_sort_blocks_by_size(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where same-colored blocks are re-packed from index 0 in ascending size order, one empty cell apart; returns None with fewer than two blocks."""
+    color = rng.randint(1, 9)
+    blocks = []  # (start index, width) pairs
+    pos = 0
+    
+    # Generate random blocks (width 1..6) separated by random gaps of 1..4 cells
+    while pos < size:
+        if rng.random() < 0.4 and size - pos >= 2:
+            block_size = rng.randint(1, min(size - pos, 6))
+            blocks.append((pos, block_size))
+            pos += block_size + rng.randint(1, 4)  # Random gaps
+        else:
+            pos += 1
+            
+    if len(blocks) < 2:
+        return None
+        
+    # Create input field
+    question = gen_field(size)
+    for pos, block_size in blocks:
+        for i in range(block_size):
+            question[pos + i] = color
+            
+    # Sort blocks by size (only the widths are used from here on)
+    blocks.sort(key=lambda x: x[1])
+    
+    # Defensive check: blocks were generated inside the field with gaps >= 1, so this is not expected to trigger
+    total_space = sum(size for _, size in blocks) + len(blocks) - 1  # `size` here is local to the generator expression
+    if total_space > size:
+        return None
+        
+    # Create answer field with sorted blocks
+    answer = gen_field(size)
+    current_pos = 0
+    
+    for _, block_size in blocks:
+        for i in range(block_size):
+            answer[current_pos + i] = color
+        current_pos += block_size + 1  # One pixel gap
+        
+    return {"input": question, "output": answer}
+
+def task_sort_complete_sequence(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where blocks of sizes 1..max_size, given in shuffled order, are rearranged into ascending order; returns None if max_size < 2."""
+    # Find the largest max_size such that blocks 1..max_size, each budgeted one trailing gap cell, fit into size
+    max_size = 1
+    total_space = 0
+    while total_space + max_size + 1 <= size:
+        total_space += max_size + 1
+        max_size += 1
+    max_size -= 1  # the loop exits one past the last size that fit
+    
+    if max_size < 2:
+        return None
+        
+    color = rng.randint(1, 9)
+    
+    # Create sequence of all sizes from 1 to max_size
+    blocks = list(range(1, max_size + 1))
+    rng.shuffle(blocks)  # NOTE: the shuffle may leave the order sorted, in which case input == output
+    
+    # Create input field with shuffled blocks
+    question = gen_field(size)
+    pos = 0
+    for block_size in blocks:
+        for i in range(block_size):
+            question[pos + i] = color
+        pos += block_size + 1  # block plus one gap cell
+        
+    # Create answer field with sorted blocks
+    answer = gen_field(size)
+    pos = 0
+    for block_size in range(1, max_size + 1):
+        for i in range(block_size):
+            answer[pos + i] = color
+        pos += block_size + 1  # block plus one gap cell
+        
+    return {"input": question, "output": answer}
+
+def task_recolor_blocks_by_size(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where two color-3 blocks of different sizes are recolored: the larger becomes 1, the smaller 2; returns None if they do not fit."""
+    # Generate two different random sizes
+    size1 = rng.randint(2, 8)
+    size2 = rng.randint(2, 8)
+    while size2 == size1:  # redraw until the sizes differ, so "larger" is never ambiguous
+        size2 = rng.randint(2, 8)
+        
+    # Ensure both blocks fit with at least 1 gap
+    if size1 + size2 + 1 > size:
+        return None
+        
+    # Place block 1 so that a gap and block 2 still fit to its right, then block 2 anywhere in the remaining room
+    pos1 = rng.randint(0, size - (size1 + size2 + 1))
+    pos2 = rng.randint(pos1 + size1 + 1, size - size2)
+    
+    # Create input field with both blocks color 3
+    question = gen_field(size)
+    for i in range(size1):
+        question[pos1 + i] = 3
+    for i in range(size2):
+        question[pos2 + i] = 3
+        
+    # Create answer field with recolored blocks: larger block -> 1, smaller block -> 2
+    answer = question.copy()
+    if size1 > size2:
+        for i in range(size1):
+            answer[pos1 + i] = 1
+        for i in range(size2):
+            answer[pos2 + i] = 2
+    else:
+        for i in range(size1):
+            answer[pos1 + i] = 2
+        for i in range(size2):
+            answer[pos2 + i] = 1
+            
+    return {"input": question, "output": answer}
+
+def task_gravity_one_step(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where non-zero pixels shift one cell to the left when the cell to their left is empty."""
+    question = [rng.randint(1, 9) if rng.random() < 0.5 else 0 for _ in range(size)]  # each cell is colored with probability 0.5
+    answer = question.copy()
+    
+    # Single left-to-right pass on answer: a cell vacated earlier in the pass counts as empty, so a run of pixels shifts as a whole
+    for i in range(1, size):
+        if answer[i] != 0 and answer[i-1] == 0:
+            answer[i-1] = answer[i]
+            answer[i] = 0
+            
+    return {"input": question, "output": answer}

From 3714e6c5ff3d24e0e4ebfb892d80397b329171c0 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:37:14 +0100
Subject: [PATCH 67/94] feat: Add five new 1D ARC task generation functions

---
 reasoning_gym/cognition/arc_1d.py | 181 ++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index 24d30455..b4c1f01e 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -648,3 +648,184 @@ def task_gravity_one_step(size: int, rng: Random) -> Optional[Dict[str, List[int
             answer[i] = 0
             
     return {"input": question, "output": answer}
+
+def task_move_block_by_own_size(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block moves right by its own size; returns None if size < 2 (no room for a block and its shifted copy)."""
+    if size < 2:  # size//2 would be 0 and rng.randint(1, 0) raises ValueError
+        return None
+    block_size = rng.randint(1, size//2)  # Ensure space for movement
+    pos = rng.randint(0, size - block_size * 2)  # Space for block and movement
+    color = rng.randint(1, 9)
+    
+    block = [color] * block_size
+    question = write_block(pos, block, gen_field(size))
+    answer = write_block(pos + block_size, block, gen_field(size))  # same block, start shifted by block_size
+    
+    return {"input": question, "output": answer}
+
+def task_change_to_five(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where every non-zero pixel is recolored to 5 and zeros stay untouched."""
+    density = 0.5  # probability that a cell holds a colored (1..9) pixel
+    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
+    answer = [5 if x != 0 else 0 for x in question]
+    
+    return {"input": question, "output": answer}
+
+def task_recolor_blocks_from_palette(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where color-5 blocks are recolored, in order, from a palette at the start of the field; returns None if no block fits."""
+    # Generate start positions for blocks of one shared size, at least one cell apart
+    block_size = rng.randint(2, 4)
+    blocks = []
+    pos = 0
+    
+    while pos + block_size <= size:
+        if rng.random() < 0.4:
+            blocks.append(pos)
+            pos += block_size + 1
+        else:
+            pos += 1
+            
+    # Ensure we have space for palette
+    while blocks and blocks[-1] + block_size + len(blocks) + 1 >= size:
+        blocks.pop()
+    # Only 9 distinct colors exist: cap the block count so the palette draw below always terminates
+    del blocks[9:]
+        
+    if not blocks:
+        return None
+        
+    # Shift blocks right to make room for palette (one entry per block plus one separator cell)
+    palette_size = len(blocks)
+    blocks = [pos + palette_size + 1 for pos in blocks]
+    
+    # Generate color palette: one distinct color per block, by rejection sampling
+    colors = []
+    while len(colors) < palette_size:
+        color = rng.randint(1, 9)
+        if color not in colors:
+            colors.append(color)
+                
+    # Create question with color palette and blocks
+    question = gen_field(size)
+    
+    # Place color palette at start
+    for i, color in enumerate(colors):
+        question[i] = color
+        
+    # Place blocks of color 5
+    for block_pos in blocks:
+        for i in range(block_size):
+            question[block_pos + i] = 5
+            
+    # Create answer with recolored blocks: the k-th block takes the k-th palette color
+    answer = question.copy()
+    for block_idx, block_pos in enumerate(blocks):
+        color = colors[block_idx]
+        for i in range(block_size):
+            answer[block_pos + i] = color
+            
+    return {"input": question, "output": answer}
+
+def task_duplicate_block_from_seeds(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where seed pixels beside a block grow into repeated block-sized copies running to the border; returns None if no valid layout was drawn."""
+    block_size = rng.randint(2, 4)
+    if size <= 3 + block_size:  # need two cells left of the block and room to its right
+        return None
+    
+    # Position block with space for seeds
+    block_pos = rng.randint(2, size - block_size - 1)
+    
+    # Decide seed placement
+    left_seed = rng.random() < 0.5
+    right_seed = rng.random() < 0.5
+    if not (left_seed or right_seed):
+        return None
+    
+    # Create input
+    question = gen_field(size)
+    
+    # Place main block
+    for i in range(block_size):
+        question[block_pos + i] = 1
+    
+    # Place seeds with gaps (one empty cell between block and seed)
+    seeds = []
+    if left_seed:
+        color = rng.randint(1, 9)
+        question[block_pos - 2] = color
+        seeds.append(("left", block_pos - 2, color))
+    if right_seed:
+        if block_pos + block_size + 1 >= size:
+            return None  # block sits too far right for a seed one gap away (previously an IndexError)
+        color = rng.randint(1, 9)
+        question[block_pos + block_size + 1] = color
+        seeds.append(("right", block_pos + block_size + 1, color))
+    
+    # Create answer with duplicated blocks
+    answer = question.copy()
+    
+    for side, seed_pos, color in seeds:
+        if side == "left":
+            # For left seed, blocks end at seed and repeat leftwards, clipped at index 0
+            end_pos = seed_pos
+            while end_pos >= 0:
+                start_pos = end_pos - block_size + 1
+                for pos in range(max(0, start_pos), end_pos + 1):
+                    answer[pos] = color
+                if start_pos < 1:
+                    break
+                end_pos = start_pos - 2  # -1 for gap
+        else:  # side == "right"
+            # For right seed, blocks start at seed and repeat rightwards, clipped at the border
+            start_pos = seed_pos
+            while start_pos < size:
+                for offset in range(min(block_size, size - start_pos)):
+                    answer[start_pos + offset] = color
+                if start_pos + block_size + 1 >= size:
+                    break
+                start_pos = start_pos + block_size + 1  # +1 for gap
+    
+    return {"input": question, "output": answer}
+
+def task_fill_from_pixel(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a seed pixel touching one end of a block floods the cells on its side out to the field border; returns None if the block does not fit."""
+    block_size = rng.randint(3, 6)
+    if block_size >= size - 2:
+        return None
+    
+    # Position block with at least one free cell on either side for the seed
+    block_pos = rng.randint(1, size - block_size - 1)
+    
+    # Create input
+    question = gen_field(size)
+    
+    # Place main block
+    block_color = rng.randint(1, 9)
+    for i in range(block_size):
+        question[block_pos + i] = block_color
+    
+    # Pick a seed color different from the block, then the side the seed sits on
+    seed_color = rng.randint(1, 9)
+    while seed_color == block_color:
+        seed_color = rng.randint(1, 9)
+        
+    is_left = rng.random() < 0.5
+    
+    if is_left:
+        question[block_pos - 1] = seed_color  # seed directly left of the block
+    else:
+        question[block_pos + block_size] = seed_color  # seed directly right of the block
+    
+    # Create answer with fill
+    answer = question.copy()
+    
+    if is_left:
+        # Fill from seed to left border
+        for i in range(block_pos):
+            answer[i] = seed_color
+    else:
+        # Fill from seed to right border
+        for i in range(block_pos + block_size, size):
+            answer[i] = seed_color
+    
+    return {"input": question, "output": answer}

From da16467ca7d1671969909a594e0adaf776ef2375 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:38:14 +0100
Subject: [PATCH 68/94] feat: Add five new 1D ARC task generation functions

---
 reasoning_gym/cognition/arc_1d.py | 202 ++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index b4c1f01e..4d3733fa 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -829,3 +829,205 @@ def task_fill_from_pixel(size: int, rng: Random) -> Optional[Dict[str, List[int]
             answer[i] = seed_color
     
     return {"input": question, "output": answer}
+
+def task_mark_size_two_blocks(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where every block of exactly two pixels gets a color-3 marker directly before and after it (where those cells exist); returns None with fewer than two blocks."""
+    blocks = []  # (start index, width) pairs
+    pos = 0
+    
+    # Generate blocks of width 1..3, leaving room between them for markers
+    while pos < size:
+        if rng.random() < 0.4:
+            block_size = rng.randint(1, 3)
+            # Check if we have space for block and potential markers
+            needed_space = block_size + (2 if block_size == 2 else 0)
+            if pos + needed_space < size:
+                blocks.append((pos, block_size))
+                pos += block_size + 2  # with the unconditional += 1 below, the next block starts at least 3 cells after this one ends
+            
+        pos += 1
+    
+    if len(blocks) < 2:
+        return None
+    
+    # Verify gaps between blocks (including markers); given the spacing above this is not expected to fail
+    valid = True
+    for i in range(len(blocks)-1):
+        pos1, size1 = blocks[i]
+        pos2, _ = blocks[i+1]
+        needed_gap = 3 if size1 == 2 else 2
+        if pos2 - (pos1 + size1) < needed_gap:
+            valid = False
+            break
+    if not valid:
+        return None
+    
+    # Create input with blocks
+    question = gen_field(size)
+    for pos, block_size in blocks:
+        # Place block
+        for i in range(block_size):
+            question[pos + i] = 1
+    
+    # Create answer with markers
+    answer = question.copy()
+    for pos, block_size in blocks:
+        if block_size == 2:
+            # Add markers for size 2 blocks; a block at index 0 gets no left marker
+            if pos > 0:
+                answer[pos - 1] = 3
+            if pos + block_size < size:
+                answer[pos + block_size] = 3
+    
+    return {"input": question, "output": answer}
+
+def task_fill_until_collision(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where each pixel's color spreads toward the side marker (color 5) until it meets the neighbouring pixel or the marker; returns None if size < 4."""
+    # At least 4 positions for meaningful puzzle
+    if size < 4:
+        return None
+    
+    is_left = rng.random() < 0.5
+    question = gen_field(size)
+    
+    # Place the side marker
+    if is_left:
+        question[0] = 5
+    else:
+        question[size - 1] = 5
+    
+    # Place 2-4 random pixels; only size - 1 cells are free, so cap the count or the unique-position draw below never ends for size == 4
+    num_pixels = rng.randint(2, min(4, size - 1))
+    positions = []
+    
+    if is_left:
+        # Skip first position
+        for _ in range(num_pixels):
+            while True:
+                pos = rng.randint(1, size-1)
+                if pos not in positions:
+                    positions.append(pos)
+                    break
+    else:
+        # Skip last position
+        for _ in range(num_pixels):
+            while True:
+                pos = rng.randint(0, size-2)
+                if pos not in positions:
+                    positions.append(pos)
+                    break
+    
+    # Color random pixels
+    for pos in positions:
+        question[pos] = rng.randint(1, 9)
+    
+    positions.sort()
+    
+    # Create answer
+    answer = question.copy()
+    
+    if is_left:
+        # Each pixel spreads left, toward the marker, up to the previous pixel
+        prev_pos = 0  # Start from marker
+        for pos in positions:
+            color = question[pos]
+            # Fill from previous position to current
+            for i in range(prev_pos + 1, pos):
+                answer[i] = color
+            prev_pos = pos
+    else:
+        # Each pixel spreads right, toward the marker, up to the next pixel
+        prev_pos = size-1  # Start from marker
+        for pos in reversed(positions):
+            color = question[pos]
+            # Fill from current position to previous
+            for i in range(pos+1, prev_pos):
+                answer[i] = color
+            prev_pos = pos
+    
+    return {"input": question, "output": answer}
+
+def task_repeat_pattern_full(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a pattern shown twice at the start is continued across the whole field, truncating the last repetition; returns None unless two repetitions leave at least one free cell."""
+    # Generate initial pattern (2..5 cells, colors drawn independently)
+    pattern_size = rng.randint(2, 5)
+    pattern = [rng.randint(1, 9) for _ in range(pattern_size)]
+    
+    # Calculate total size needed for 2 repetitions
+    double_size = pattern_size * 2
+    if double_size >= size:
+        return None
+    
+    # Create input with 2 repetitions
+    question = gen_field(size)
+    for i in range(pattern_size):
+        question[i] = pattern[i]
+        question[i + pattern_size] = pattern[i]
+    
+    # Create answer with maximum repetitions
+    answer = gen_field(size)
+    pos = 0
+    while pos + pattern_size <= size:
+        for i in range(pattern_size):
+            answer[pos + i] = pattern[i]
+        pos += pattern_size
+    
+    # Partial last repetition: leftover cells take the first elements of the pattern
+    for i in range(pos, size):
+        answer[i] = pattern[i - pos]
+    
+    return {"input": question, "output": answer}
+
+def task_gravity_weighted_colors(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where all pixels are packed to the left, every color-2 pixel (heavier) before every color-1 pixel."""
+    # Generate random field with only colors 1 and 2 (each cell is colored with probability 0.5)
+    question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
+    
+    # Count colors
+    count_1 = sum(1 for x in question if x == 1)
+    count_2 = sum(1 for x in question if x == 2)
+    
+    # Create answer with sorted colors
+    answer = gen_field(size)
+    
+    # Place heavier color 2 first
+    for i in range(count_2):
+        answer[i] = 2
+    
+    # Then place color 1 directly after the 2s
+    for i in range(count_1):
+        answer[count_2 + i] = 1
+    
+    return {"input": question, "output": answer}
+
+def task_color_left_half_blocks(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the left half (rounded down) of each color-2 block is recolored to 8; returns None with fewer than two blocks."""
+    pos = 0
+    question = gen_field(size)
+    blocks = []  # (start index, width) pairs
+    
+    # Generate blocks (width 2..8) with gap 1
+    while pos < size:
+        if rng.random() < 0.4:
+            block_size = rng.randint(2, 8)
+            if pos + block_size >= size:
+                break  # stop generating as soon as a drawn block would reach the right border
+            
+            blocks.append((pos, block_size))
+            for i in range(block_size):
+                question[pos + i] = 2
+            pos += block_size + 1  # block size + gap
+        else:
+            pos += 1
+    
+    if len(blocks) < 2:
+        return None
+    
+    # Create answer with half-colored blocks
+    answer = question.copy()
+    for pos, block_size in blocks:
+        half_size = block_size // 2  # odd widths: the middle cell keeps color 2
+        for i in range(half_size):
+            answer[pos + i] = 8
+    
+    return {"input": question, "output": answer}

From f0ab9ec0d473b681fa5d79549dad120f47d17679 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:40:39 +0100
Subject: [PATCH 69/94] test: Add comprehensive unittest for arc_1d task
 functions

---
 tests/test_arc_1d.py | 75 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 tests/test_arc_1d.py

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
new file mode 100644
index 00000000..23a32ce3
--- /dev/null
+++ b/tests/test_arc_1d.py
@@ -0,0 +1,75 @@
+import random
+import pytest
+from reasoning_gym.cognition.arc_1d import (
+    task_move_n_pix, task_move_n_pix_wrapped, task_gravity, task_gravity_counting,
+    task_gravity_antigravity, task_block_touch_dot, task_block_touch_dot_n_pix,
+    task_block_scale_to_dot, task_two_points_and_fill, task_reflect_block_with_border_pixel,
+    task_reflect_block_with_border_pixel_random, task_reflect_block_around_dot,
+    task_block_and_noise_remove, task_block_and_noise_remove_inside,
+    task_copy_block_to_dots, task_copy_block_to_dots_colors, task_paint_biggest_block,
+    task_sort_blocks_by_size, task_sort_complete_sequence, task_recolor_blocks_by_size,
+    task_gravity_one_step, task_move_block_by_own_size, task_change_to_five,
+    task_recolor_blocks_from_palette, task_duplicate_block_from_seeds,
+    task_fill_from_pixel, task_mark_size_two_blocks, task_fill_until_collision,
+    task_repeat_pattern_full, task_gravity_weighted_colors, task_color_left_half_blocks
+)
+
+def test_all_arc_1d_tasks():
+    """Smoke-test every ARC 1D task generator: each must yield a well-formed task of the requested size within 10 attempts and never raise."""
+    rng = random.Random(42)  # Fixed seed for reproducibility
+    size = 20  # Reasonable size for testing
+    
+    # Test all task functions
+    tasks = [
+        (task_move_n_pix, {"move_pix": 2, "solid": True}),
+        (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
+        (task_gravity, {}),
+        (task_gravity_counting, {}),
+        (task_gravity_antigravity, {}),
+        (task_block_touch_dot, {}),
+        (task_block_touch_dot_n_pix, {"move_pix": 2}),
+        (task_block_scale_to_dot, {}),
+        (task_two_points_and_fill, {}),
+        (task_reflect_block_with_border_pixel, {}),
+        (task_reflect_block_with_border_pixel_random, {}),
+        (task_reflect_block_around_dot, {}),
+        (task_block_and_noise_remove, {}),
+        (task_block_and_noise_remove_inside, {}),
+        (task_copy_block_to_dots, {}),
+        (task_copy_block_to_dots_colors, {}),
+        (task_paint_biggest_block, {}),
+        (task_sort_blocks_by_size, {}),
+        (task_sort_complete_sequence, {}),
+        (task_recolor_blocks_by_size, {}),
+        (task_gravity_one_step, {}),
+        (task_move_block_by_own_size, {}),
+        (task_change_to_five, {}),
+        (task_recolor_blocks_from_palette, {}),
+        (task_duplicate_block_from_seeds, {}),
+        (task_fill_from_pixel, {}),
+        (task_mark_size_two_blocks, {}),
+        (task_fill_until_collision, {}),
+        (task_repeat_pattern_full, {}),
+        (task_gravity_weighted_colors, {}),
+        (task_color_left_half_blocks, {})
+    ]
+
+    for task_func, kwargs in tasks:
+        # Generators may return None when a random layout does not work out, so retry on the shared rng
+        success = False
+        for _ in range(10):  # Try up to 10 times
+            try:
+                result = task_func(size, rng, **kwargs)
+                if result is not None:
+                    success = True
+                    # Basic structure checks: a dict with equally sized "input" and "output" fields
+                    assert isinstance(result, dict)
+                    assert "input" in result
+                    assert "output" in result
+                    assert len(result["input"]) == size
+                    assert len(result["output"]) == size
+                    break
+            except Exception as e:  # also catches AssertionError from the checks above, reported with the task name
+                pytest.fail(f"Task {task_func.__name__} failed with error: {str(e)}")
+        
+        assert success, f"Task {task_func.__name__} always returned None in 10 attempts"

From d56e8c3a035f34e8e05ccc25de14a400043944d9 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:42:21 +0100
Subject: [PATCH 70/94] fix: Remove redundant parameters in ARC 1D task test
 suite

---
 tests/test_arc_1d.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 23a32ce3..9a7ee083 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -21,13 +21,13 @@ def test_all_arc_1d_tasks():
     
     # Test all task functions
     tasks = [
-        (task_move_n_pix, {"move_pix": 2, "solid": True}),
-        (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
+        (task_move_n_pix, {"solid": True}),
+        (task_move_n_pix_wrapped, {"solid": True}),
         (task_gravity, {}),
         (task_gravity_counting, {}),
         (task_gravity_antigravity, {}),
         (task_block_touch_dot, {}),
-        (task_block_touch_dot_n_pix, {"move_pix": 2}),
+        (task_block_touch_dot_n_pix, {}),
         (task_block_scale_to_dot, {}),
         (task_two_points_and_fill, {}),
         (task_reflect_block_with_border_pixel, {}),

From 2d3012d5ae81cf8f935a9c5f1a145165266ee74a Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:42:46 +0100
Subject: [PATCH 71/94] fix: Update test_arc_1d.py to handle task function
 argument order

---
 tests/test_arc_1d.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 9a7ee083..b0f759b1 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -20,14 +20,17 @@ def test_all_arc_1d_tasks():
     size = 20  # Reasonable size for testing
     
     # Test all task functions
+    # Fixed move_pix value for testing
+    move_pix = 2
+    
     tasks = [
-        (task_move_n_pix, {"solid": True}),
-        (task_move_n_pix_wrapped, {"solid": True}),
+        (lambda s, r, **k: task_move_n_pix(s, move_pix, **k, r=r), {"solid": True}),
+        (lambda s, r, **k: task_move_n_pix_wrapped(s, move_pix, **k, r=r), {"solid": True}),
         (task_gravity, {}),
         (task_gravity_counting, {}),
         (task_gravity_antigravity, {}),
         (task_block_touch_dot, {}),
-        (task_block_touch_dot_n_pix, {}),
+        (lambda s, r, **k: task_block_touch_dot_n_pix(s, move_pix, r), {}),
         (task_block_scale_to_dot, {}),
         (task_two_points_and_fill, {}),
         (task_reflect_block_with_border_pixel, {}),

From 28c30c69d11cd866f30afcf30e71b6128a2d1823 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 16:43:25 +0100
Subject: [PATCH 72/94] fix: Correct argument passing in ARC 1D task test
 lambda functions

---
 tests/test_arc_1d.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index b0f759b1..78479a7f 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -24,8 +24,8 @@ def test_all_arc_1d_tasks():
     move_pix = 2
     
     tasks = [
-        (lambda s, r, **k: task_move_n_pix(s, move_pix, **k, r=r), {"solid": True}),
-        (lambda s, r, **k: task_move_n_pix_wrapped(s, move_pix, **k, r=r), {"solid": True}),
+        (lambda s, r, **k: task_move_n_pix(s, move_pix, k["solid"], r), {"solid": True}),
+        (lambda s, r, **k: task_move_n_pix_wrapped(s, move_pix, k["solid"], r), {"solid": True}),
         (task_gravity, {}),
         (task_gravity_counting, {}),
         (task_gravity_antigravity, {}),

From c74b600085b427f6ee3cabdc02d68c898338ce1d Mon Sep 17 00:00:00 2001
From: Zafir Stojanovski <zaf.stojano@gmail.com>
Date: Sun, 2 Feb 2025 16:47:21 +0100
Subject: [PATCH 73/94] n queens

---
 GALLERY.md                      | 125 +++++++++++++++++++++++++
 README.md                       |   1 +
 reasoning_gym/games/__init__.py |   2 +
 reasoning_gym/games/n_queens.py | 158 ++++++++++++++++++++++++++++++++
 tests/test_n_queens.py          | 122 ++++++++++++++++++++++++
 5 files changed, 408 insertions(+)
 create mode 100644 reasoning_gym/games/n_queens.py
 create mode 100644 tests/test_n_queens.py

diff --git a/GALLERY.md b/GALLERY.md
index 8baa0db4..57d0edd4 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -35,6 +35,7 @@ This gallery shows examples from all available datasets using their default conf
 - [simple_geometry](#simple_geometry)
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
+- [n_queens](#n_queens)
 - [syllogism](#syllogism)
 - [time_intervals](#time_intervals)
 - [tower_of_hanoi](#tower_of_hanoi)
@@ -1514,6 +1515,130 @@ Metadata: {'puzzle': [[0, 0, 1, 2, 3, 0, 0, 0, 9], [3, 0, 0, 1, 8, 5, 6, 7, 2],
 
 ````
 
+
+### n_queens
+
+Generates N-Queens puzzles with configurable board size and number of starting queens
+
+Default configuration:
+```python
+n = 8
+min_remove = 1
+max_remove = 7
+size = 500
+```
+
+Example tasks:
+````
+Example 1
+Question: Solve this N Queens puzzle:
+_ _ _ _ _ _ Q _
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ Q _ _ _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+The board size is 8x8 and your job is to place 1 queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+
+Answer 1:
+_ _ _ _ _ _ Q _
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ Q _ _ _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 1}
+
+Example 2
+Question: Solve this N Queens puzzle:
+_ Q _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ Q _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ _ _ Q _ _ _
+
+The board size is 8x8 and your job is to place 3 queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+
+Answer 1:
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+_ _ _ _ _ Q _ _
+_ _ _ _ _ _ _ Q
+_ _ Q _ _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ _ _ Q _ _ _
+
+Metadata: {'puzzle': [['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']], 'solution': [[['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']]], 'num_removed': 3}
+
+Example 3
+Question: Solve this N Queens puzzle:
+_ _ _ _ _ _ _ _
+_ Q _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+The board size is 8x8 and your job is to place 5 queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+
+Answer 1:
+_ _ _ _ Q _ _ _
+_ Q _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+Q _ _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+Answer 2:
+_ _ _ _ _ _ Q _
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ Q _ _ _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+Answer 3:
+_ _ _ _ _ _ _ Q
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ _ _ Q _ _ _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 5}
+````
+
+
 ### syllogism
 Generates syllogism reasoning tasks
 
diff --git a/README.md b/README.md
index d5126451..636f52e5 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 - `MiniSudokuDataset`: Generate 4x4 Mini Sudoku puzzles with configurable difficulty
 - `MazeDataset`: Generate a maze with a start and a goal
 - `CountdownDataset`: Generate number game tasks where numbers and operators must be combined to reach a target value
+- `NQueensDataset`: Generate N-Queens puzzles with configurable board size and number of starting queens
 
 ## Future Generator Ideas
 
diff --git a/reasoning_gym/games/__init__.py b/reasoning_gym/games/__init__.py
index 6a6df59f..8e4e32d6 100644
--- a/reasoning_gym/games/__init__.py
+++ b/reasoning_gym/games/__init__.py
@@ -10,6 +10,7 @@ from .countdown import CountdownConfig, CountdownDataset
 from .game_of_life import GameOfLifeConfig, GameOfLifeDataset
 from .maze import MazeConfig, MazeDataset
 from .mini_sudoku import MiniSudokuConfig, MiniSudokuDataset
+from .n_queens import NQueensDataset
 from .sudoku import SudokuConfig, SudokuDataset
 from .tower_of_hanoi import HanoiConfig, HanoiDataset
 
@@ -26,4 +27,5 @@ __all__ = [
     "GameOfLifeDataset",
     "HanoiConfig",
     "HanoiDataset",
+    "NQueensDataset",
 ]
diff --git a/reasoning_gym/games/n_queens.py b/reasoning_gym/games/n_queens.py
new file mode 100644
index 00000000..ca767170
--- /dev/null
+++ b/reasoning_gym/games/n_queens.py
@@ -0,0 +1,158 @@
+"""N Queens puzzle generator
+
+A generalization of the 8-queens puzzle to any board size.
+https://en.wikipedia.org/wiki/Eight_queens_puzzle
+"""
+
+from copy import deepcopy
+from dataclasses import dataclass
+from random import Random
+from typing import Dict, List, Optional
+
+from ..factory import ProceduralDataset, register_dataset
+
+MIN_BOARD_SIZE = 4
+MAX_BOARD_SIZE = 15
+
+QUESTION_TEMPLATE = """Solve this N Queens puzzle:
+{puzzle}
+
+The board size is {n}x{n} and your job is to place {num_removed} queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+"""
+
+
+@dataclass
+class NQueensConfig:
+    """Configuration for N Queens puzzle generation; validate() asserts the bounds noted below"""
+
+    n: int = 8  # Board side length; validate() requires MIN_BOARD_SIZE <= n <= MAX_BOARD_SIZE
+    min_remove: int = 1  # Fewest queens removed from a solved board; validate() requires >= 1
+    max_remove: int = 7  # Most queens removed from a solved board; validate() requires <= n, so lower it when n < 7
+
+    size: int = 500  # Virtual dataset size (number of items the dataset reports and iterates over)
+    seed: Optional[int] = None  # Forwarded to ProceduralDataset as the dataset seed
+
+    def validate(self):
+        """Check the parameter bounds; raises AssertionError when one of them is violated"""
+        assert MIN_BOARD_SIZE <= self.n <= MAX_BOARD_SIZE, f"n must be between {MIN_BOARD_SIZE} and {MAX_BOARD_SIZE}"
+        assert 1 <= self.min_remove <= self.max_remove, "min_remove must be between 1 and max_remove"
+        assert self.min_remove <= self.max_remove <= self.n, "max_remove must be between min_remove and n"
+
+
+class NQueensDataset(ProceduralDataset):
+    """Generates N Queens puzzles with configurable difficulty"""
+
+    def __init__(self, config: NQueensConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self._solutions = self._get_all_solutions(config.n)  # every solved board for size n, enumerated once
+
+    def __len__(self) -> int:
+        return self.config.size  # the virtual dataset size taken from the config
+
+    def __iter__(self):
+        self._current_idx = 0  # restart the cursor that __next__ advances; the dataset is its own iterator
+        return self
+
+    def __next__(self):
+        if self._current_idx >= self.config.size:  # cursor has reached the virtual dataset size
+            raise StopIteration
+        item = self[self._current_idx]  # items are produced on demand by __getitem__
+        self._current_idx += 1
+        return item
+
+    def _get_all_solutions(self, n: int) -> List[List[List[str]]]:
+        """Enumerate every placement of n mutually non-attacking queens on an n x n board.
+
+        Rows are filled top to bottom; each board is a list of rows of "Q" / "_" cells.
+        """
+        used_cols = set()
+        used_sums = set()  # diagonals identified by row + col
+        used_diffs = set()  # diagonals identified by row - col
+        grid = [["_"] * n for _ in range(n)]
+        found = []
+
+        def place(r: int):
+            if r == n:
+                found.append(deepcopy(grid))
+                return
+
+            for c in range(n):
+                total, delta = r + c, r - c
+                if not (c in used_cols or total in used_sums or delta in used_diffs):
+                    used_cols.add(c)
+                    used_sums.add(total)
+                    used_diffs.add(delta)
+                    grid[r][c] = "Q"
+                    place(r + 1)
+                    grid[r][c] = "_"
+                    used_diffs.remove(delta)
+                    used_sums.remove(total)
+                    used_cols.remove(c)
+
+        place(0)
+        return found
+
+    def _create_puzzle(self, solved_board: List[List[str]], num_removed: int, rng: Random) -> List[List[str]]:
+        """Return a copy of solved_board with num_removed randomly chosen queens blanked out"""
+        board = deepcopy(solved_board)
+        side = len(board)
+        queen_cells = [(r, c) for r in range(side) for c in range(side) if board[r][c] == "Q"]
+        rng.shuffle(queen_cells)
+        for k in range(num_removed):
+            board[queen_cells[k][0]][queen_cells[k][1]] = "_"
+        return board
+
+    def _board_to_string(self, board: List[List[str]]) -> str:
+        """Render the board as text: cells joined by single spaces, one row per line"""
+        return "\n".join(map(" ".join, board))
+
+    def _string_to_board(self, board_str: str) -> List[List[str]]:
+        """Parse the text form back into rows of cells: one row per line, cells split on whitespace"""
+        return [line.split() for line in board_str.strip().split("\n")]
+
+    def _is_tractable_solution(self, puzzle: List[List[str]], solution: List[List[str]]) -> bool:
+        """Return True when every queen already placed in puzzle is also a queen in solution"""
+        side = len(puzzle)
+        return all(
+            solution[r][c] == "Q"
+            for r in range(side) for c in range(side) if puzzle[r][c] == "Q"
+        )
+
+    def __getitem__(self, idx: int) -> dict:
+        """Build the puzzle for index idx from an RNG seeded with self.seed + idx"""
+        rng = Random(self.seed + idx)
+
+        # Draw a solved board, then decide how many of its queens to hide
+        full_board = rng.choice(self._solutions)
+        num_removed = rng.randint(self.config.min_remove, self.config.max_remove)
+        puzzle = self._create_puzzle(full_board, num_removed, rng)
+
+        # Every solved board that keeps the remaining queens in place is an accepted answer
+        matching = [cand for cand in self._solutions if self._is_tractable_solution(puzzle, cand)]
+
+        question = QUESTION_TEMPLATE.format(
+            puzzle=self._board_to_string(puzzle), n=len(puzzle), num_removed=num_removed
+        )
+
+        return {
+            "question": question,
+            "answer": {self._board_to_string(cand) for cand in matching},
+            "metadata": {"puzzle": puzzle, "solution": matching, "num_removed": num_removed},
+        }
+
+    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+        """Score a submitted board string against the accepted solutions in entry["answer"].
+
+        Returns 1.0 on a match, 0.01 for any other non-None answer, and 0.0 for None.
+        """
+        accepted = entry["answer"]
+        if answer is None:
+            return 0.0
+        return 1.0 if answer in accepted else 0.01
+
+
+register_dataset("n_queens", NQueensDataset, NQueensConfig)
diff --git a/tests/test_n_queens.py b/tests/test_n_queens.py
new file mode 100644
index 00000000..8476864a
--- /dev/null
+++ b/tests/test_n_queens.py
@@ -0,0 +1,122 @@
+"""Tests for N Queens puzzle generation"""
+
+import pytest
+
+from reasoning_gym.games.n_queens import NQueensConfig, NQueensDataset
+
+
+def test_nqueens_config_validation():
+    """Invalid board sizes and removal ranges must fail NQueensConfig.validate()"""
+    with pytest.raises(AssertionError):
+        # Negative not allowed
+        NQueensConfig(n=-1).validate()
+
+    with pytest.raises(AssertionError):
+        # Zero not allowed
+        NQueensConfig(n=0).validate()
+
+    with pytest.raises(AssertionError):
+        # max < min
+        NQueensConfig(n=5, min_remove=5, max_remove=4).validate()
+
+    with pytest.raises(AssertionError):
+        # n < max
+        NQueensConfig(n=5, min_remove=3, max_remove=6).validate()
+
+
+def test_nqueens_dataset_deterministic():
+    """Two datasets built from the same seeded config must produce equal items"""
+    config = NQueensConfig(seed=42, size=10)
+    first = NQueensDataset(config)
+    second = NQueensDataset(config)
+
+    for idx in range(len(first)):
+        assert first[idx] == second[idx]
+
+
+def test_nqueens_dataset_items():
+    """Check item structure, board dimensions, removed-queen count and solution validity"""
+    config = NQueensConfig(n=8, min_remove=1, max_remove=7, size=10, seed=42)
+    dataset = NQueensDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        # Check item structure
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # Check metadata
+        assert "puzzle" in item["metadata"]
+        assert "solution" in item["metadata"]
+        assert "num_removed" in item["metadata"]
+
+        puzzle = item["metadata"]["puzzle"]
+        solution = item["metadata"]["solution"]
+        num_removed = item["metadata"]["num_removed"]
+
+        # Verify board dimensions
+        assert len(puzzle) == 8
+        assert all(len(row) == 8 for row in puzzle)
+        for board in solution:
+            assert len(board) == 8
+            assert all(len(row) == 8 for row in board)
+
+        # Verify removed queen count (a solved board holds len(puzzle) queens)
+        removed_count = len(puzzle) - sum(1 for row in puzzle for cell in row if cell == "Q")
+        assert config.min_remove <= removed_count <= config.max_remove
+        assert removed_count == num_removed
+
+        # Verify solution validity
+        for board in solution:
+            assert is_valid_solution(board)
+
+            # Verify puzzle matches solution where filled (r, c so the dataset index i is not shadowed)
+            for r in range(8):
+                for c in range(8):
+                    if puzzle[r][c] == "Q":
+                        assert board[r][c] == "Q"
+
+
+def test_nqueens_dataset_iteration():
+    """Iterating the dataset yields config.size items, and a second pass repeats them"""
+    config = NQueensConfig(size=5, seed=42)
+    dataset = NQueensDataset(config)
+
+    first_pass = list(dataset)
+    assert len(first_pass) == config.size
+
+    # A fresh iteration must reproduce the same items
+    assert first_pass == list(dataset)
+
+
+def test_nqueens_board_generation():
+    """Every accepted solution of a 10x10 dataset must be a valid queen placement"""
+    dataset = NQueensDataset(NQueensConfig(n=10, size=5, seed=42))
+
+    for idx in range(len(dataset)):
+        accepted = dataset[idx]["metadata"]["solution"]
+        # each board listed as an accepted answer is checked on its own
+        for board in accepted:
+            assert is_valid_solution(board)
+
+
+def is_valid_solution(board: list[list[str]]) -> bool:
+    """Return True iff board holds exactly len(board) queens and no two share a row, column or diagonal"""
+    size = len(board)
+    seen_lines = (set(), set(), set(), set())  # rows, columns, r + c diagonals, r - c diagonals
+    queen_total = 0
+
+    for r in range(size):
+        for c in range(size):
+            if board[r][c] != "Q":
+                continue
+            queen_total += 1
+            keys = (r, c, r + c, r - c)
+            if any(key in seen for key, seen in zip(keys, seen_lines)):
+                return False
+            for key, seen in zip(keys, seen_lines):
+                seen.add(key)
+
+    return queen_total == size

From 1912c571f93bba754cd0c81ce77c7077572b8479 Mon Sep 17 00:00:00 2001
From: Zafir Stojanovski <zaf.stojano@gmail.com>
Date: Sun, 2 Feb 2025 16:52:36 +0100
Subject: [PATCH 74/94] cap N at 12

---
 reasoning_gym/games/n_queens.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reasoning_gym/games/n_queens.py b/reasoning_gym/games/n_queens.py
index ca767170..0af85a0d 100644
--- a/reasoning_gym/games/n_queens.py
+++ b/reasoning_gym/games/n_queens.py
@@ -12,7 +12,7 @@ from typing import Dict, List, Optional
 from ..factory import ProceduralDataset, register_dataset
 
 MIN_BOARD_SIZE = 4
-MAX_BOARD_SIZE = 15
+MAX_BOARD_SIZE = 12
 
 QUESTION_TEMPLATE = """Solve this N Queens puzzle:
 {puzzle}

From 56ded2c299a9de9935e41194a56a1e6505f329f8 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 17:23:02 +0100
Subject: [PATCH 75/94] feat: Improve syllogism sentence formatting for natural
 language

---
 reasoning_gym/logic/syllogisms.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/reasoning_gym/logic/syllogisms.py b/reasoning_gym/logic/syllogisms.py
index 0af9d1b2..a5bbb219 100644
--- a/reasoning_gym/logic/syllogisms.py
+++ b/reasoning_gym/logic/syllogisms.py
@@ -206,6 +206,13 @@ class SyllogismDataset(ProceduralDataset):
 
         return False
 
+    def _format_quantifier_statement(self, quantifier: Quantifier, subject: Term, predicate: Term) -> str:
+        """Return the statement as text; SOME_NOT is phrased as 'Some X are not Y'"""
+        if quantifier != Quantifier.SOME_NOT:
+            # all other quantifiers are rendered from their enum value
+            return f"{quantifier.value} {subject.plural} are {predicate.plural}"
+        return f"Some {subject.plural} are not {predicate.plural}"
+
     def _generate_syllogism(self, rng: Random) -> dict:
         """Generate a single syllogism problem"""
         # Select three different terms
@@ -226,9 +233,9 @@ class SyllogismDataset(ProceduralDataset):
                 conclusion = (rng.choice(quantifiers), terms[0], terms[2])
 
         # Format the syllogism as text
-        premise1_text = f"{premise1[0].value} {premise1[1].plural} are {premise1[2].plural}"
-        premise2_text = f"{premise2[0].value} {premise2[1].plural} are {premise2[2].plural}"
-        conclusion_text = f"{conclusion[0].value} {conclusion[1].plural} are {conclusion[2].plural}"
+        premise1_text = self._format_quantifier_statement(premise1[0], premise1[1], premise1[2])
+        premise2_text = self._format_quantifier_statement(premise2[0], premise2[1], premise2[2])
+        conclusion_text = self._format_quantifier_statement(conclusion[0], conclusion[1], conclusion[2])
 
         question = (
             f"Consider these statements:\n"

From 5dd4c0e831db5da271a0258810ab8020109be336 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 17:25:37 +0100
Subject: [PATCH 76/94] change parameter order for basic arc tasks

---
 reasoning_gym/cognition/arc_1d.py | 470 ++++++++++++++++--------------
 tests/test_arc_1d.py              |  61 ++--
 2 files changed, 292 insertions(+), 239 deletions(-)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index 4d3733fa..d9bb2f82 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -1,5 +1,5 @@
 from random import Random
-from typing import Optional, Dict, List
+from typing import Dict, List, Optional
 
 
 def gen_field(size: int, color: int = 0) -> List[int]:
@@ -15,7 +15,7 @@ def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
     return result
 
 
-def task_move_n_pix(size: int, move_pix: int, solid: bool, rng: Random) -> Optional[Dict[str, List[int]]]:
+def task_move_n_pix(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block is moved to the right by move_pix pixels."""
     if size <= move_pix + 1:
         return None
@@ -34,11 +34,12 @@ def task_move_n_pix(size: int, move_pix: int, solid: bool, rng: Random) -> Optio
 
     return {"input": question, "output": answer}
 
-def task_move_n_pix_wrapped(size: int, move_pix: int, solid: bool, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_move_n_pix_wrapped(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block is moved to the right by move_pix pixels with wrapping."""
     block_size = rng.randint(1, size)
     block_pos = rng.randint(0, size)
-    
+
     if solid:
         color = rng.randint(1, 9)
         block = [color] * block_size
@@ -47,103 +48,108 @@ def task_move_n_pix_wrapped(size: int, move_pix: int, solid: bool, rng: Random)
 
     question = gen_field(size)
     answer = gen_field(size)
-    
+
     for i, color in enumerate(block):
         question[(block_pos + i) % size] = color
         answer[(block_pos + move_pix + i) % size] = color
 
     return {"input": question, "output": answer}
 
-def task_gravity(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_gravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where all non-zero elements are attracted to the left."""
     density = 0.5
     question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
-    
+
     non_zero = [x for x in question if x != 0]
     answer = non_zero + [0] * (size - len(non_zero))
-    
+
     return {"input": question, "output": answer}
 
-def task_gravity_counting(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_gravity_counting(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where non-zero elements are counted and represented as a sequence of 1s."""
     density = 0.5
     question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
-    
+
     count = sum(1 for x in question if x != 0)
     answer = [1] * count + [0] * (size - count)
-    
+
     return {"input": question, "output": answer}
 
-def task_gravity_antigravity(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_gravity_antigravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where color 1 moves right and color 2 moves left."""
     density = 0.5
     question = [rng.randint(1, 2) if rng.random() < density else 0 for _ in range(size)]
-    
+
     color1 = [x for x in question if x == 1]
     color2 = [x for x in question if x == 2]
     answer = [2] * len(color2) + [0] * (size - len(color1) - len(color2)) + [1] * len(color1)
-    
+
     return {"input": question, "output": answer}
 
-def task_block_touch_dot(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_block_touch_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block moves to touch (but not cover) a dot."""
     dot_color = 1
     block_color = rng.randint(2, 9)
-    
+
     block_size = rng.randint(1, size)
     dot_pos = rng.randint(0, size)
-    
+
     can_place_left = dot_pos >= block_size
     can_place_right = dot_pos + block_size < size
-    
+
     if not (can_place_left or can_place_right):
         return None
-        
+
     if can_place_left and can_place_right:
         side = rng.choice(["left", "right"])
     elif can_place_left:
         side = "left"
     else:
         side = "right"
-        
+
     if side == "left":
         q_block_pos = rng.randint(0, dot_pos - block_size)
         a_block_pos = dot_pos - block_size
     else:
         q_block_pos = rng.randint(dot_pos + 1, size - block_size)
         a_block_pos = dot_pos + 1
-        
+
     question = gen_field(size)
     question[dot_pos] = dot_color
     question = write_block(q_block_pos, [block_color] * block_size, question)
-    
+
     answer = gen_field(size)
     answer[dot_pos] = dot_color
     answer = write_block(a_block_pos, [block_color] * block_size, answer)
-    
+
     return {"input": question, "output": answer}
 
-def task_block_touch_dot_n_pix(size: int, move_pix: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_block_touch_dot_n_pix(rng: Random, size: int, move_pix: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block moves move_pix pixels toward a dot."""
     dot_color = 2
     block_color = rng.randint(3, 9)
-    
+
     block_size = rng.randint(1, size)
     dot_pos = rng.randint(0, size)
-    
+
     can_place_left = dot_pos >= block_size
     can_place_right = dot_pos + block_size < size
-    
+
     if not (can_place_left or can_place_right):
         return None
-        
+
     if can_place_left and can_place_right:
         side = rng.choice(["left", "right"])
     elif can_place_left:
         side = "left"
     else:
         side = "right"
-        
+
     if side == "left":
         q_block_pos = rng.randint(0, dot_pos - block_size)
         distance = (dot_pos - block_size) - q_block_pos
@@ -154,38 +160,39 @@ def task_block_touch_dot_n_pix(size: int, move_pix: int, rng: Random) -> Optiona
         distance = q_block_pos - (dot_pos + 1)
         move = min(distance, move_pix)
         a_block_pos = q_block_pos - move
-        
+
     question = gen_field(size)
     question[dot_pos] = dot_color
     question = write_block(q_block_pos, [block_color] * block_size, question)
-    
+
     answer = gen_field(size)
     answer[dot_pos] = dot_color
     answer = write_block(a_block_pos, [block_color] * block_size, answer)
-    
+
     return {"input": question, "output": answer}
 
-def task_block_scale_to_dot(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_block_scale_to_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block scales to touch a dot (keeping one end fixed)."""
     dot_color = 2
     block_color = rng.randint(3, 9)
-    
+
     block_size = rng.randint(1, size)
     dot_pos = rng.randint(0, size)
-    
+
     can_place_left = dot_pos >= block_size
     can_place_right = dot_pos + block_size < size
-    
+
     if not (can_place_left or can_place_right):
         return None
-        
+
     if can_place_left and can_place_right:
         side = rng.choice(["left", "right"])
     elif can_place_left:
         side = "left"
     else:
         side = "right"
-        
+
     if side == "left":
         q_block_pos = rng.randint(0, dot_pos - block_size)
         new_size = dot_pos - q_block_pos + 1
@@ -194,76 +201,79 @@ def task_block_scale_to_dot(size: int, rng: Random) -> Optional[Dict[str, List[i
         q_block_pos = rng.randint(dot_pos + 1, size - block_size)
         new_size = (q_block_pos + block_size) - dot_pos
         a_block_pos = dot_pos
-        
+
     question = gen_field(size)
     question[dot_pos] = dot_color
     question = write_block(q_block_pos, [block_color] * block_size, question)
-    
+
     answer = gen_field(size)
     answer[dot_pos] = dot_color
     answer = write_block(a_block_pos, [block_color] * new_size, answer)
-    
+
     return {"input": question, "output": answer}
 
-def task_two_points_and_fill(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_two_points_and_fill(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where space between two points of same color is filled with that color."""
     color = rng.randint(1, 9)
-    
+
     pos1 = rng.randint(0, size - 1)
     pos2 = rng.randint(0, size - 1)
     if pos1 == pos2:
         return None
-        
+
     pos1, pos2 = min(pos1, pos2), max(pos1, pos2)
-    
+
     question = gen_field(size)
     question[pos1] = color
     question[pos2] = color
-    
+
     answer = question.copy()
     for i in range(pos1, pos2 + 1):
         answer[i] = color
-    
+
     return {"input": question, "output": answer}
 
-def task_reflect_block_with_border_pixel(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_reflect_block_with_border_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block with a border pixel is reflected."""
     block_size = rng.randint(2, size)
     if block_size > size:
         return None
-        
+
     c1 = rng.randint(1, 9)
     c2 = rng.randint(1, 9)
     if c1 == c2:
         return None
-        
+
     side = "left" if rng.random() < 0.5 else "right"
     pos = rng.randint(0, size - block_size)
-    
+
     block = [c1] * block_size
     if side == "left":
         block[0] = c2
     else:
         block[block_size - 1] = c2
-        
+
     question = write_block(pos, block, gen_field(size))
     reversed_block = block[::-1]  # Reverse the block
     answer = write_block(pos, reversed_block, gen_field(size))
-    
+
     return {"input": question, "output": answer}
 
-def task_reflect_block_with_border_pixel_random(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_reflect_block_with_border_pixel_random(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a random-colored block with a border pixel is reflected."""
     block_size = rng.randint(2, size)
     if block_size > size:
         return None
-        
+
     side = "left" if rng.random() < 0.5 else "right"
     pos = rng.randint(0, size - block_size)
-    
+
     block = [rng.randint(1, 9) for _ in range(block_size)]
     border_color = rng.randint(1, 9)
-    
+
     if side == "left":
         if block[0] == border_color:
             return None
@@ -272,64 +282,66 @@ def task_reflect_block_with_border_pixel_random(size: int, rng: Random) -> Optio
         if block[block_size - 1] == border_color:
             return None
         block[block_size - 1] = border_color
-        
+
     question = write_block(pos, block, gen_field(size))
     reversed_block = block[::-1]  # Reverse the block
     answer = write_block(pos, reversed_block, gen_field(size))
-    
+
     return {"input": question, "output": answer}
 
-def task_reflect_block_around_dot(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_reflect_block_around_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block is reflected around a dot."""
     dot_color = 2
-    
+
     dot_pos = rng.randint(0, size)
     block_size = rng.randint(1, size)
     block_pos = rng.randint(0, size - block_size)
     block_end = block_pos + block_size - 1
-    
+
     # Check if block is strictly to left or right of dot
     strictly_left = block_end < dot_pos
     strictly_right = block_pos > dot_pos
-    
+
     if not (strictly_left or strictly_right):
         return None
-        
+
     block_color = rng.randint(3, 9)  # Different from dot color
     block = [block_color] * block_size
-    
+
     # Calculate reflection bounds
     min_reflect = 2 * dot_pos - block_end
     max_reflect = 2 * dot_pos - block_pos
     if min_reflect < 0 or max_reflect >= size:
         return None
-        
+
     question = gen_field(size)
     question = write_block(block_pos, block, question)
     question[dot_pos] = dot_color
-    
+
     answer = gen_field(size)
     answer[dot_pos] = dot_color
     for i in range(block_size):
         reflect_idx = 2 * dot_pos - (block_pos + i)
         answer[reflect_idx] = block[i]
-    
+
     return {"input": question, "output": answer}
 
-def task_block_and_noise_remove(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_block_and_noise_remove(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where noise around a block needs to be removed."""
     block_size = rng.randint(2, size)
     if block_size > size:
         return None
-        
+
     block_pos = rng.randint(0, size - block_size)
     color = rng.randint(1, 9)
-    
+
     # Create field with block
     field = gen_field(size)
     for i in range(block_size):
         field[block_pos + i] = color
-        
+
     # Track forbidden positions for noise
     forbidden = [False] * size
     for i in range(block_pos, block_pos + block_size):
@@ -338,11 +350,11 @@ def task_block_and_noise_remove(size: int, rng: Random) -> Optional[Dict[str, Li
         forbidden[block_pos - 1] = True
     if block_pos + block_size < size:
         forbidden[block_pos + block_size] = True
-        
+
     # Add noise
     noise_count = rng.randint(1, 3)
     noise_positions = []
-    
+
     for _ in range(noise_count):
         allowed = [i for i in range(size) if not forbidden[i]]
         if not allowed:
@@ -355,107 +367,110 @@ def task_block_and_noise_remove(size: int, rng: Random) -> Optional[Dict[str, Li
             forbidden[noise_pos - 1] = True
         if noise_pos + 1 < size:
             forbidden[noise_pos + 1] = True
-            
+
     if len(noise_positions) < noise_count:
         return None
-        
+
     question = field
     answer = field.copy()
     for pos in noise_positions:
         answer[pos] = 0
-        
+
     return {"input": question, "output": answer}
 
-def task_block_and_noise_remove_inside(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_block_and_noise_remove_inside(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where noise inside a block needs to be removed."""
     if size <= 6:
         return None
-        
+
     block_size = rng.randint(6, size)
     if block_size > size:
         return None
-        
+
     block_pos = rng.randint(0, size - block_size)
     color = rng.randint(1, 9)
-    
+
     # Create field with block
     field = gen_field(size)
     for i in range(block_size):
         field[block_pos + i] = color
-        
+
     # Add noise inside block
     max_noise = max(1, (block_size // 2) - 1)
     noise_count = rng.randint(1, max_noise)
-    
+
     positions = list(range(block_size))
     rng.shuffle(positions)
     noise_positions = positions[:noise_count]
-    
+
     for offset in noise_positions:
         pos = block_pos + offset
         noise_color = rng.randint(1, 9)
         while noise_color == color:
             noise_color = rng.randint(1, 9)
         field[pos] = noise_color
-        
+
     question = field
     answer = field.copy()
     for offset in noise_positions:
         answer[block_pos + offset] = color
-        
+
     return {"input": question, "output": answer}
 
-def task_copy_block_to_dots(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_copy_block_to_dots(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block pattern is copied to dot positions."""
     block_size = 3 if rng.random() < 0.5 else 5
     if block_size >= size:
         return None
-        
+
     color = rng.randint(1, 9)
     block = [color] * block_size
-    
+
     # Generate dots with minimum distance to prevent overlap
     min_gap = block_size
     dot_positions = []
-    pos = block_size + block_size//2 + 1
-    
+    pos = block_size + block_size // 2 + 1
+
     while pos <= size - block_size:
         if rng.random() < 0.5:  # Control dot density
             dot_positions.append(pos)
             pos += min_gap
         pos += 1
-        
+
     if not dot_positions:
         return None
-        
+
     question = gen_field(size)
     question = write_block(0, block, question)
     for pos in dot_positions:
         question[pos] = color
-        
+
     answer = gen_field(size)
     answer = write_block(0, block, answer)
     for pos in dot_positions:
-        block_start = pos - block_size//2
+        block_start = pos - block_size // 2
         answer = write_block(block_start, block, answer)
-        
+
     return {"input": question, "output": answer}
 
-def task_copy_block_to_dots_colors(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_copy_block_to_dots_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block pattern is copied to dot positions with matching colors."""
     block_size = 3 if rng.random() < 0.5 else 5
     if block_size >= size:
         return None
-        
+
     block_color = rng.randint(1, 9)
     block = [block_color] * block_size
-    
+
     # Generate dots with minimum distance to prevent overlap
     min_gap = block_size
     dot_positions = []
     dot_colors = []
-    pos = block_size + block_size//2 + 1
-    
+    pos = block_size + block_size // 2 + 1
+
     while pos < size - block_size:
         if rng.random() < 0.5:
             dot_color = rng.randint(1, 9)
@@ -463,34 +478,35 @@ def task_copy_block_to_dots_colors(size: int, rng: Random) -> Optional[Dict[str,
             dot_colors.append(dot_color)
             pos += min_gap
         pos += 1
-        
+
     if not dot_positions:
         return None
-        
+
     question = gen_field(size)
     question = write_block(0, block, question)
     for i, pos in enumerate(dot_positions):
         question[pos] = dot_colors[i]
-        
+
     answer = gen_field(size)
     answer = write_block(0, block, answer)
     for i, pos in enumerate(dot_positions):
-        block_start = pos - block_size//2
+        block_start = pos - block_size // 2
         colored_block = [dot_colors[i]] * block_size
         answer = write_block(block_start, colored_block, answer)
-        
+
     return {"input": question, "output": answer}
 
-def task_paint_biggest_block(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_paint_biggest_block(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where the largest block is painted a different color."""
     target_color = 1
     initial_color = rng.randint(2, 9)
-    
+
     # Generate random blocks
     question = gen_field(size)
     blocks = []
     pos = 0
-    
+
     while pos < size:
         if rng.random() < 0.4 and size - pos >= 2:
             block_size = rng.randint(2, min(size - pos, 6))
@@ -500,30 +516,31 @@ def task_paint_biggest_block(size: int, rng: Random) -> Optional[Dict[str, List[
             pos += block_size + 1
         else:
             pos += 1
-            
+
     if len(blocks) < 2:
         return None
-        
+
     # Find biggest block
     biggest_pos, biggest_size = max(blocks, key=lambda x: x[1])
-    
+
     # Check if there are multiple blocks of the same size
     biggest_count = sum(1 for _, size in blocks if size == biggest_size)
     if biggest_count > 1:
         return None
-        
+
     answer = question.copy()
     for i in range(biggest_size):
         answer[biggest_pos + i] = target_color
-        
+
     return {"input": question, "output": answer}
 
-def task_sort_blocks_by_size(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_sort_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where blocks are sorted by size with 1 pixel gaps."""
     color = rng.randint(1, 9)
     blocks = []
     pos = 0
-    
+
     # Generate random blocks with random sizes
     while pos < size:
         if rng.random() < 0.4 and size - pos >= 2:
@@ -532,36 +549,37 @@ def task_sort_blocks_by_size(size: int, rng: Random) -> Optional[Dict[str, List[
             pos += block_size + rng.randint(1, 4)  # Random gaps
         else:
             pos += 1
-            
+
     if len(blocks) < 2:
         return None
-        
+
     # Create input field
     question = gen_field(size)
     for pos, block_size in blocks:
         for i in range(block_size):
             question[pos + i] = color
-            
+
     # Sort blocks by size
     blocks.sort(key=lambda x: x[1])
-    
+
     # Check if sorted blocks fit with gaps
     total_space = sum(size for _, size in blocks) + len(blocks) - 1
     if total_space > size:
         return None
-        
+
     # Create answer field with sorted blocks
     answer = gen_field(size)
     current_pos = 0
-    
+
     for _, block_size in blocks:
         for i in range(block_size):
             answer[current_pos + i] = color
         current_pos += block_size + 1  # One pixel gap
-        
+
     return {"input": question, "output": answer}
 
-def task_sort_complete_sequence(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_sort_complete_sequence(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a complete sequence of block sizes is sorted."""
     # Calculate max possible block size given total array size
     max_size = 1
@@ -570,16 +588,16 @@ def task_sort_complete_sequence(size: int, rng: Random) -> Optional[Dict[str, Li
         total_space += max_size + 1
         max_size += 1
     max_size -= 1
-    
+
     if max_size < 2:
         return None
-        
+
     color = rng.randint(1, 9)
-    
+
     # Create sequence of all sizes from 1 to max_size
     blocks = list(range(1, max_size + 1))
     rng.shuffle(blocks)
-    
+
     # Create input field with shuffled blocks
     question = gen_field(size)
     pos = 0
@@ -587,7 +605,7 @@ def task_sort_complete_sequence(size: int, rng: Random) -> Optional[Dict[str, Li
         for i in range(block_size):
             question[pos + i] = color
         pos += block_size + 1
-        
+
     # Create answer field with sorted blocks
     answer = gen_field(size)
     pos = 0
@@ -595,32 +613,33 @@ def task_sort_complete_sequence(size: int, rng: Random) -> Optional[Dict[str, Li
         for i in range(block_size):
             answer[pos + i] = color
         pos += block_size + 1
-        
+
     return {"input": question, "output": answer}
 
-def task_recolor_blocks_by_size(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_recolor_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where two blocks are recolored based on their size."""
     # Generate two different random sizes
     size1 = rng.randint(2, 8)
     size2 = rng.randint(2, 8)
     while size2 == size1:
         size2 = rng.randint(2, 8)
-        
+
     # Ensure both blocks fit with at least 1 gap
     if size1 + size2 + 1 > size:
         return None
-        
+
     # Place blocks with gap
     pos1 = rng.randint(0, size - (size1 + size2 + 1))
     pos2 = rng.randint(pos1 + size1 + 1, size - size2)
-    
+
     # Create input field with both blocks color 3
     question = gen_field(size)
     for i in range(size1):
         question[pos1 + i] = 3
     for i in range(size2):
         question[pos2 + i] = 3
-        
+
     # Create answer field with recolored blocks
     answer = question.copy()
     if size1 > size2:
@@ -633,69 +652,73 @@ def task_recolor_blocks_by_size(size: int, rng: Random) -> Optional[Dict[str, Li
             answer[pos1 + i] = 2
         for i in range(size2):
             answer[pos2 + i] = 1
-            
+
     return {"input": question, "output": answer}
 
-def task_gravity_one_step(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_gravity_one_step(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where non-zero elements move one step left if possible."""
     question = [rng.randint(1, 9) if rng.random() < 0.5 else 0 for _ in range(size)]
     answer = question.copy()
-    
+
     # Move each non-zero pixel one step left if possible
     for i in range(1, size):
-        if answer[i] != 0 and answer[i-1] == 0:
-            answer[i-1] = answer[i]
+        if answer[i] != 0 and answer[i - 1] == 0:
+            answer[i - 1] = answer[i]
             answer[i] = 0
-            
+
     return {"input": question, "output": answer}
 
-def task_move_block_by_own_size(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_move_block_by_own_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block moves right by its own size."""
-    block_size = rng.randint(1, size//2)  # Ensure space for movement
+    block_size = rng.randint(1, size // 2)  # Ensure space for movement
     pos = rng.randint(0, size - block_size * 2)  # Space for block and movement
     color = rng.randint(1, 9)
-    
+
     question = gen_field(size)
     block = [color] * block_size
     question = write_block(pos, block, question)
-    
+
     answer = write_block(pos + block_size, block, gen_field(size))
-    
+
     return {"input": question, "output": answer}
 
-def task_change_to_five(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_change_to_five(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where all non-zero colors change to 5."""
     density = 0.5
     question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
     answer = [5 if x != 0 else 0 for x in question]
-    
+
     return {"input": question, "output": answer}
 
-def task_recolor_blocks_from_palette(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_recolor_blocks_from_palette(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where blocks are recolored using a color palette."""
     # Generate blocks of same size
     block_size = rng.randint(2, 4)
     blocks = []
     pos = 0
-    
+
     while pos + block_size <= size:
         if rng.random() < 0.4:
             blocks.append(pos)
             pos += block_size + 1
         else:
             pos += 1
-            
+
     # Ensure we have space for palette
     while blocks and blocks[-1] + block_size + len(blocks) + 1 >= size:
         blocks.pop()
-        
+
     if not blocks:
         return None
-        
+
     # Shift blocks right to make room for palette
     palette_size = len(blocks)
     blocks = [pos + palette_size + 1 for pos in blocks]
-    
+
     # Generate color palette
     colors = []
     for _ in range(len(blocks)):
@@ -704,52 +727,53 @@ def task_recolor_blocks_from_palette(size: int, rng: Random) -> Optional[Dict[st
             if color not in colors:
                 colors.append(color)
                 break
-                
+
     # Create question with color palette and blocks
     question = gen_field(size)
-    
+
     # Place color palette at start
     for i, color in enumerate(colors):
         question[i] = color
-        
+
     # Place blocks of color 5
     for block_pos in blocks:
         for i in range(block_size):
             question[block_pos + i] = 5
-            
+
     # Create answer with recolored blocks
     answer = question.copy()
     for block_idx, block_pos in enumerate(blocks):
         color = colors[block_idx]
         for i in range(block_size):
             answer[block_pos + i] = color
-            
+
     return {"input": question, "output": answer}
 
-def task_duplicate_block_from_seeds(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_duplicate_block_from_seeds(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a block is duplicated from seed pixels."""
     block_size = rng.randint(2, 4)
     if block_size + 1 >= size:
         return None
     if size <= 3 + block_size:
         return None
-    
+
     # Position block with space for seeds
     block_pos = rng.randint(2, size - block_size - 1)
-    
+
     # Decide seed placement
     left_seed = rng.random() < 0.5
     right_seed = rng.random() < 0.5
     if not (left_seed or right_seed):
         return None
-    
+
     # Create input
     question = gen_field(size)
-    
+
     # Place main block
     for i in range(block_size):
         question[block_pos + i] = 1
-    
+
     # Place seeds with gaps
     seeds = []
     if left_seed:
@@ -760,10 +784,10 @@ def task_duplicate_block_from_seeds(size: int, rng: Random) -> Optional[Dict[str
         color = rng.randint(1, 9)
         question[block_pos + block_size + 1] = color
         seeds.append(("right", block_pos + block_size + 1, color))
-    
+
     # Create answer with duplicated blocks
     answer = question.copy()
-    
+
     for side, seed_pos, color in seeds:
         if side == "left":
             # For left seed, blocks end at seed
@@ -784,41 +808,42 @@ def task_duplicate_block_from_seeds(size: int, rng: Random) -> Optional[Dict[str
                 if start_pos + block_size + 1 >= size:
                     break
                 start_pos = start_pos + block_size + 1  # +1 for gap
-    
+
     return {"input": question, "output": answer}
 
-def task_fill_from_pixel(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_fill_from_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a pixel fills in one direction until hitting another pixel."""
     block_size = rng.randint(3, 6)
     if block_size >= size - 2:
         return None
-    
+
     # Position block with space for seed
     block_pos = rng.randint(1, size - block_size - 1)
-    
+
     # Create input
     question = gen_field(size)
-    
+
     # Place main block
     block_color = rng.randint(1, 9)
     for i in range(block_size):
         question[block_pos + i] = block_color
-    
+
     # Place seed pixel and determine fill direction
     seed_color = rng.randint(1, 9)
     while seed_color == block_color:
         seed_color = rng.randint(1, 9)
-        
+
     is_left = rng.random() < 0.5
-    
+
     if is_left:
         question[block_pos - 1] = seed_color
     else:
         question[block_pos + block_size] = seed_color
-    
+
     # Create answer with fill
     answer = question.copy()
-    
+
     if is_left:
         # Fill from seed to left border
         for i in range(block_pos):
@@ -827,14 +852,15 @@ def task_fill_from_pixel(size: int, rng: Random) -> Optional[Dict[str, List[int]
         # Fill from seed to right border
         for i in range(block_pos + block_size, size):
             answer[i] = seed_color
-    
+
     return {"input": question, "output": answer}
 
-def task_mark_size_two_blocks(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_mark_size_two_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where size-2 blocks are marked with surrounding pixels."""
     blocks = []
     pos = 0
-    
+
     # Generate blocks with minimum gap of 2
     while pos < size:
         if rng.random() < 0.4:
@@ -844,31 +870,31 @@ def task_mark_size_two_blocks(size: int, rng: Random) -> Optional[Dict[str, List
             if pos + needed_space < size:
                 blocks.append((pos, block_size))
                 pos += block_size + 2  # Minimum gap of 2
-            
+
         pos += 1
-    
+
     if len(blocks) < 2:
         return None
-    
+
     # Verify gaps between blocks (including markers)
     valid = True
-    for i in range(len(blocks)-1):
+    for i in range(len(blocks) - 1):
         pos1, size1 = blocks[i]
-        pos2, _ = blocks[i+1]
+        pos2, _ = blocks[i + 1]
         needed_gap = 3 if size1 == 2 else 2
         if pos2 - (pos1 + size1) < needed_gap:
             valid = False
             break
     if not valid:
         return None
-    
+
     # Create input with blocks
     question = gen_field(size)
     for pos, block_size in blocks:
         # Place block
         for i in range(block_size):
             question[pos + i] = 1
-    
+
     # Create answer with markers
     answer = question.copy()
     for pos, block_size in blocks:
@@ -878,33 +904,34 @@ def task_mark_size_two_blocks(size: int, rng: Random) -> Optional[Dict[str, List
                 answer[pos - 1] = 3
             if pos + block_size < size:
                 answer[pos + block_size] = 3
-    
+
     return {"input": question, "output": answer}
 
-def task_fill_until_collision(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_fill_until_collision(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where pixels fill empty space until collision."""
     # At least 4 positions for meaningful puzzle
     if size < 4:
         return None
-    
+
     is_left = rng.random() < 0.5
     question = gen_field(size)
-    
+
     # Place the side marker
     if is_left:
         question[0] = 5
     else:
         question[size - 1] = 5
-    
+
     # Place 2-4 random pixels
     num_pixels = rng.randint(2, 4)
     positions = []
-    
+
     if is_left:
         # Skip first position
         for _ in range(num_pixels):
             while True:
-                pos = rng.randint(1, size-1)
+                pos = rng.randint(1, size - 1)
                 if pos not in positions:
                     positions.append(pos)
                     break
@@ -912,20 +939,20 @@ def task_fill_until_collision(size: int, rng: Random) -> Optional[Dict[str, List
         # Skip last position
         for _ in range(num_pixels):
             while True:
-                pos = rng.randint(0, size-2)
+                pos = rng.randint(0, size - 2)
                 if pos not in positions:
                     positions.append(pos)
                     break
-    
+
     # Color random pixels
     for pos in positions:
         question[pos] = rng.randint(1, 9)
-    
+
     positions.sort()
-    
+
     # Create answer
     answer = question.copy()
-    
+
     if is_left:
         # Fill right from each pixel
         prev_pos = 0  # Start from marker
@@ -937,33 +964,34 @@ def task_fill_until_collision(size: int, rng: Random) -> Optional[Dict[str, List
             prev_pos = pos
     else:
         # Fill left from each pixel
-        prev_pos = size-1  # Start from marker
+        prev_pos = size - 1  # Start from marker
         for pos in reversed(positions):
             color = question[pos]
             # Fill from current position to previous
-            for i in range(pos+1, prev_pos):
+            for i in range(pos + 1, prev_pos):
                 answer[i] = color
             prev_pos = pos
-    
+
     return {"input": question, "output": answer}
 
-def task_repeat_pattern_full(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_repeat_pattern_full(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where a pattern is repeated to fill the space."""
     # Generate initial pattern
     pattern_size = rng.randint(2, 5)
     pattern = [rng.randint(1, 9) for _ in range(pattern_size)]
-    
+
     # Calculate total size needed for 2 repetitions
     double_size = pattern_size * 2
     if double_size >= size:
         return None
-    
+
     # Create input with 2 repetitions
     question = gen_field(size)
     for i in range(pattern_size):
         question[i] = pattern[i]
         question[i + pattern_size] = pattern[i]
-    
+
     # Create answer with maximum repetitions
     answer = gen_field(size)
     pos = 0
@@ -971,63 +999,65 @@ def task_repeat_pattern_full(size: int, rng: Random) -> Optional[Dict[str, List[
         for i in range(pattern_size):
             answer[pos + i] = pattern[i]
         pos += pattern_size
-    
+
     # Fill remaining space (if any) with pattern elements
     for i in range(pos, size):
         answer[i] = pattern[i - pos]
-    
+
     return {"input": question, "output": answer}
 
-def task_gravity_weighted_colors(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where color 2 is heavier than color 1 in gravity."""
     # Generate random field with only colors 1 and 2
     question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
-    
+
     # Count colors
     count_1 = sum(1 for x in question if x == 1)
     count_2 = sum(1 for x in question if x == 2)
-    
+
     # Create answer with sorted colors
     answer = gen_field(size)
-    
+
     # Place heavier color 2 first
     for i in range(count_2):
         answer[i] = 2
-    
+
     # Then place color 1
     for i in range(count_1):
         answer[count_2 + i] = 1
-    
+
     return {"input": question, "output": answer}
 
-def task_color_left_half_blocks(size: int, rng: Random) -> Optional[Dict[str, List[int]]]:
+
+def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where left half of blocks are colored differently."""
     pos = 0
     question = gen_field(size)
     blocks = []
-    
+
     # Generate blocks with gap 1
     while pos < size:
         if rng.random() < 0.4:
             block_size = rng.randint(2, 8)
             if pos + block_size >= size:
                 break
-            
+
             blocks.append((pos, block_size))
             for i in range(block_size):
                 question[pos + i] = 2
             pos += block_size + 1  # block size + gap
         else:
             pos += 1
-    
+
     if len(blocks) < 2:
         return None
-    
+
     # Create answer with half-colored blocks
     answer = question.copy()
     for pos, block_size in blocks:
         half_size = block_size // 2
         for i in range(half_size):
             answer[pos + i] = 8
-    
+
     return {"input": question, "output": answer}
diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 78479a7f..f724037f 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -1,36 +1,59 @@
 import random
+
 import pytest
+
 from reasoning_gym.cognition.arc_1d import (
-    task_move_n_pix, task_move_n_pix_wrapped, task_gravity, task_gravity_counting,
-    task_gravity_antigravity, task_block_touch_dot, task_block_touch_dot_n_pix,
-    task_block_scale_to_dot, task_two_points_and_fill, task_reflect_block_with_border_pixel,
-    task_reflect_block_with_border_pixel_random, task_reflect_block_around_dot,
-    task_block_and_noise_remove, task_block_and_noise_remove_inside,
-    task_copy_block_to_dots, task_copy_block_to_dots_colors, task_paint_biggest_block,
-    task_sort_blocks_by_size, task_sort_complete_sequence, task_recolor_blocks_by_size,
-    task_gravity_one_step, task_move_block_by_own_size, task_change_to_five,
-    task_recolor_blocks_from_palette, task_duplicate_block_from_seeds,
-    task_fill_from_pixel, task_mark_size_two_blocks, task_fill_until_collision,
-    task_repeat_pattern_full, task_gravity_weighted_colors, task_color_left_half_blocks
+    task_block_and_noise_remove,
+    task_block_and_noise_remove_inside,
+    task_block_scale_to_dot,
+    task_block_touch_dot,
+    task_block_touch_dot_n_pix,
+    task_change_to_five,
+    task_color_left_half_blocks,
+    task_copy_block_to_dots,
+    task_copy_block_to_dots_colors,
+    task_duplicate_block_from_seeds,
+    task_fill_from_pixel,
+    task_fill_until_collision,
+    task_gravity,
+    task_gravity_antigravity,
+    task_gravity_counting,
+    task_gravity_one_step,
+    task_gravity_weighted_colors,
+    task_mark_size_two_blocks,
+    task_move_block_by_own_size,
+    task_move_n_pix,
+    task_move_n_pix_wrapped,
+    task_paint_biggest_block,
+    task_recolor_blocks_by_size,
+    task_recolor_blocks_from_palette,
+    task_reflect_block_around_dot,
+    task_reflect_block_with_border_pixel,
+    task_reflect_block_with_border_pixel_random,
+    task_repeat_pattern_full,
+    task_sort_blocks_by_size,
+    task_sort_complete_sequence,
+    task_two_points_and_fill,
 )
 
+
 def test_all_arc_1d_tasks():
     """Test that all ARC 1D task functions can be executed without exceptions."""
     rng = random.Random(42)  # Fixed seed for reproducibility
     size = 20  # Reasonable size for testing
-    
+
     # Test all task functions
     # Fixed move_pix value for testing
     move_pix = 2
-    
+
     tasks = [
-        (lambda s, r, **k: task_move_n_pix(s, move_pix, k["solid"], r), {"solid": True}),
-        (lambda s, r, **k: task_move_n_pix_wrapped(s, move_pix, k["solid"], r), {"solid": True}),
+        (task_move_n_pix, {"move_pix": move_pix, "solid": True}),
+        (task_move_n_pix_wrapped, {"move_pix": move_pix, "solid": True}),
         (task_gravity, {}),
         (task_gravity_counting, {}),
         (task_gravity_antigravity, {}),
         (task_block_touch_dot, {}),
-        (lambda s, r, **k: task_block_touch_dot_n_pix(s, move_pix, r), {}),
+        (task_block_touch_dot_n_pix, {"move_pix": move_pix}),
         (task_block_scale_to_dot, {}),
         (task_two_points_and_fill, {}),
         (task_reflect_block_with_border_pixel, {}),
@@ -54,7 +77,7 @@ def test_all_arc_1d_tasks():
         (task_fill_until_collision, {}),
         (task_repeat_pattern_full, {}),
         (task_gravity_weighted_colors, {}),
-        (task_color_left_half_blocks, {})
+        (task_color_left_half_blocks, {}),
     ]
 
     for task_func, kwargs in tasks:
@@ -62,7 +85,7 @@ def test_all_arc_1d_tasks():
         success = False
         for _ in range(10):  # Try up to 10 times
             try:
-                result = task_func(size, rng, **kwargs)
+                result = task_func(rng, size, **kwargs)
                 if result is not None:
                     success = True
                     # Basic structure checks
@@ -74,5 +97,5 @@ def test_all_arc_1d_tasks():
                     break
             except Exception as e:
                 pytest.fail(f"Task {task_func.__name__} failed with error: {str(e)}")
-        
+
         assert success, f"Task {task_func.__name__} always returned None in 10 attempts"

From 519999ff89494da789e7c2517f3d7eddd7f50ed6 Mon Sep 17 00:00:00 2001
From: rishabhranawat <rishabhranawat12345@gmail.com>
Date: Sun, 2 Feb 2025 08:26:05 -0800
Subject: [PATCH 77/94] Update dataset list w/ some missing logic datasets

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d5126451..180331af 100644
--- a/README.md
+++ b/README.md
@@ -112,7 +112,8 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
 ### <small>Logic Tasks</small>
 
 - `PropositionalLogicDataset`: Generate propositional logic reasoning problems
-
+- `SyllogismDataset`: Generates a [syllogism](https://en.wikipedia.org/wiki/Syllogism) reasoning dataset
+- `AliceInWonderlandDataset`: Generates [AIW](https://openreview.net/forum?id=Mkl7dzjYiW) (Alice In Wonderland) problems with a few variations
 ### <small>Graph Tasks</small>
 
 - `FamilyRelationshipsDataset`: Generate family relationship reasoning tasks with family trees

From 1df952001ea4244c637fcb908ec9c9df470fd923 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 17:28:01 +0100
Subject: [PATCH 78/94] update gallery SyllogismDataset

---
 GALLERY.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 77127cd0..f99fe13b 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -1576,14 +1576,14 @@ Metadata: {'premise1': 'No students are humans', 'premise2': 'No humans are chef
 
 Example 2:
 Question: Consider these statements:
-1. Some ... are not children are animals
+1. Some children are not animals
 2. Some animals are doctors
 
 Does it logically follow that:
 All children are doctors?
 (Answer Yes or No)
 Answer: Yes
-Metadata: {'premise1': 'Some ... are not children are animals', 'premise2': 'Some animals are doctors', 'conclusion': 'All children are doctors', 'is_valid': True}
+Metadata: {'premise1': 'Some children are not animals', 'premise2': 'Some animals are doctors', 'conclusion': 'All children are doctors', 'is_valid': True}
 
 Example 3:
 Question: Consider these statements:
@@ -1591,10 +1591,10 @@ Question: Consider these statements:
 2. No tigers are whales
 
 Does it logically follow that:
-Some ... are not butterflies are whales?
+Some butterflies are not whales?
 (Answer Yes or No)
 Answer: No
-Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are whales', 'conclusion': 'Some ... are not butterflies are whales', 'is_valid': False}
+Metadata: {'premise1': 'All butterflies are tigers', 'premise2': 'No tigers are whales', 'conclusion': 'Some butterflies are not whales', 'is_valid': False}
 
 ````
 

From b0d21cf6643617ae453d77108db38d5155f4c370 Mon Sep 17 00:00:00 2001
From: joesharratt1229 <joesharratt1229@gmail.com>
Date: Sun, 2 Feb 2025 17:18:56 +0000
Subject: [PATCH 79/94] added score_answer implementation and tests

---
 .../algebra/intermediate_integration.py       | 31 +++++++++-
 reasoning_gym/algebra/simple_integration.py   | 32 +++++++++-
 tests/test_intermediate_integration.py        | 53 ++++++++++++++---
 tests/test_simple_integration.py              | 58 ++++++++++++++-----
 4 files changed, 148 insertions(+), 26 deletions(-)

diff --git a/reasoning_gym/algebra/intermediate_integration.py b/reasoning_gym/algebra/intermediate_integration.py
index 9e5c9528..5d0b139c 100644
--- a/reasoning_gym/algebra/intermediate_integration.py
+++ b/reasoning_gym/algebra/intermediate_integration.py
@@ -1,6 +1,6 @@
 import random
 from dataclasses import dataclass
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import sympy
 
@@ -221,16 +221,43 @@ class IntermediateIntegrationDataset(ProceduralDataset):
                 integrand = self._generate_repeated_parts(rng, x)
 
         answer = sympy.integrate(integrand, x)
+        answer_str = str(answer) + " + C"
+
         return {
             "question": rng.choice(self.prompt_template).format(integrand=integrand),
-            "answer": str(answer) + " + C",
+            "answer": answer_str,
             "metadata": {
                 "integrand": str(integrand),
                 "problem_type": problem_type,
                 "variable": str(x),
                 "type": substitution_type if problem_type == "substitution" else parts_type,
+                "expected_answer_expression": answer,
             },
         }
 
+    def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
+        """Return 1.0 if d/dx(answer) equals the integrand, 0.05 if not, 0.01 on any error, 0.0 for None"""
+        reward = 0.0
+        if answer is not None:
+            try:
+                var = metadata["variable"]
+                x = sympy.Symbol(var)
+                # 'C' is a Symbol so it vanishes under diff. NOTE(review): parse_expr eval()s untrusted text
+                user_expr = sympy.parse_expr(answer, local_dict={var: x, "C": sympy.Symbol("C")})
+                # Differentiate the submitted antiderivative w.r.t. the problem's variable
+                derivative = sympy.diff(user_expr, x)
+                integrand = sympy.parse_expr(metadata["integrand"], local_dict={var: x})
+
+                # Correct only when the difference simplifies to exactly zero
+                if sympy.simplify(derivative - integrand) == 0:
+                    reward = 1.0
+                elif answer.strip():
+                    reward = 0.05
+                else:
+                    reward = 0.01
+            except Exception:  # not bare: KeyboardInterrupt/SystemExit must propagate
+                reward = 0.01
+        return reward
+
 
 register_dataset("intermediate_integration", IntermediateIntegrationDataset, IntermediateIntegrationConfig)
diff --git a/reasoning_gym/algebra/simple_integration.py b/reasoning_gym/algebra/simple_integration.py
index e524e3ef..1da32004 100644
--- a/reasoning_gym/algebra/simple_integration.py
+++ b/reasoning_gym/algebra/simple_integration.py
@@ -1,7 +1,7 @@
 import random
 from dataclasses import dataclass
 from fractions import Fraction
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import sympy
 
@@ -73,8 +73,36 @@ class SimpleIntegrationDataset(ProceduralDataset):
         return {
             "question": rng.choice(self._prompt_templates).format(integrand=derivative),
             "answer": str(polynomial) + " + C",
-            "metadata": {"integrand": str(derivative), "variable": str(symbol), "antiderivative": str(polynomial)},
+            "metadata": {
+                "integrand": str(derivative),
+                "variable": str(symbol),
+                "expected_answer_expression": polynomial,
+            },
         }
 
+    def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
+        """Return 1.0 if d/dx(answer) equals the integrand, 0.05 if not, 0.01 on any error, 0.0 for None"""
+        reward = 0.0
+        if answer is not None:
+            try:
+                var = metadata["variable"]
+                x = sympy.Symbol(var)
+                # 'C' is a Symbol so it vanishes under diff. NOTE(review): parse_expr eval()s untrusted text
+                user_expr = sympy.parse_expr(answer, local_dict={var: x, "C": sympy.Symbol("C")})
+                # Differentiate the submitted antiderivative w.r.t. the problem's variable
+                derivative = sympy.diff(user_expr, x)
+                integrand = sympy.parse_expr(metadata["integrand"], local_dict={var: x})
+
+                # Correct only when the difference simplifies to exactly zero
+                if sympy.simplify(derivative - integrand) == 0:
+                    reward = 1.0
+                elif answer.strip():
+                    reward = 0.05
+                else:
+                    reward = 0.01
+            except Exception:  # not bare: KeyboardInterrupt/SystemExit must propagate
+                reward = 0.01
+        return reward
+
 
 register_dataset("simple_integration", SimpleIntegrationDataset, SimpleIntegrationConfig)
diff --git a/tests/test_intermediate_integration.py b/tests/test_intermediate_integration.py
index fc35f387..df62ea76 100644
--- a/tests/test_intermediate_integration.py
+++ b/tests/test_intermediate_integration.py
@@ -95,15 +95,50 @@ def test_intermediate_integration_dataset_items():
         assert isinstance(parse_expr(answer), sympy.Expr)
 
 
-def test_solution_verification():
-    """Test for solution verification of each answer"""
-    config = IntermediateIntegrationConfig(seed=42, size=10)
+def test_verify_answer():
+    config = IntermediateIntegrationConfig(seed=42)
     dataset = IntermediateIntegrationDataset(config)
+    for i in range(len(dataset)):
+        item = dataset[i]
+        score = dataset.score_answer(item["answer"], item["metadata"])
+        assert score == 1.0
 
-    for item in dataset:
-        integrand = parse_expr(item["metadata"]["integrand"])
-        variable = sympy.Symbol(item["metadata"]["variable"])
-        answer = parse_expr(item["answer"].replace(" + C", ""))
 
-        # Verify that the derivative of the answer equals the integrand
-        assert sympy.simplify(sympy.diff(answer, variable) - integrand) == 0
+def test_score_answer_cases():
+    """Test various answer scoring scenarios"""
+    config = IntermediateIntegrationConfig(seed=42)
+    dataset = IntermediateIntegrationDataset(config)
+    # Score tiers exercised below: 1.0 = derivative matches the integrand,
+    # 0.05 = parses but is wrong, 0.01 = fails to parse (malformed or empty)
+
+    # Test cases: (answer, metadata, expected_score)
+    test_cases = [
+        # Correct answers
+        ("x**2 + C", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("X**3 - 5*X + C", {"variable": "X", "integrand": "3*X**2 - 5"}, 1.0),
+        ("sin(x) + C", {"variable": "x", "integrand": "cos(x)"}, 1.0),
+        # Correct without explicit constant
+        ("x**2", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("log(x)", {"variable": "x", "integrand": "1/x"}, 1.0),
+        # Incorrect but properly formatted
+        ("x**3 + C", {"variable": "x", "integrand": "2*x"}, 0.05),
+        ("cos(X)", {"variable": "X", "integrand": "sin(X)"}, 0.05),
+        # Malformed expressions
+        ("x**2 +", {"variable": "x", "integrand": "2*x"}, 0.01),
+        ("sin(x", {"variable": "x", "integrand": "cos(x)"}, 0.01),
+        # Empty answer
+        ("", {"variable": "x", "integrand": "2*x"}, 0.01),
+        # Case sensitivity
+        ("x**2 + C", {"variable": "X", "integrand": "2*X"}, 0.05),
+        ("X**2 + C", {"variable": "x", "integrand": "2*x"}, 0.05),
+        # Alternative constant notation
+        ("x**2 + K", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("sin(x) + D", {"variable": "x", "integrand": "cos(x)"}, 1.0),
+        # Simplification required
+        ("x**2 + C + 5 - 5", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("(x**3)/3 - 2*x + C", {"variable": "x", "integrand": "x**2 - 2"}, 1.0),
+    ]
+
+    for answer, metadata, expected in test_cases:
+        score = dataset.score_answer(answer, metadata)
+        assert score == expected, f"Failed case: {answer} | Expected {expected}, got {score}"
diff --git a/tests/test_simple_integration.py b/tests/test_simple_integration.py
index 8d64cc25..0de8ab36 100644
--- a/tests/test_simple_integration.py
+++ b/tests/test_simple_integration.py
@@ -1,6 +1,3 @@
-import random
-from fractions import Fraction
-
 import pytest
 import sympy
 from sympy.parsing.sympy_parser import parse_expr
@@ -63,7 +60,7 @@ def test_simple_integration_dataset_items():
 
         assert "integrand" in item["metadata"]
         assert "variable" in item["metadata"]
-        assert "antiderivative" in item["metadata"]
+        assert "expected_answer_expression" in item["metadata"]
 
         # Verify answer is a mathematical expression
         answer = item["answer"]
@@ -71,15 +68,50 @@ def test_simple_integration_dataset_items():
         assert isinstance(parse_expr(answer), sympy.Expr)
 
 
-def test_simple_integration_solution_verification():
-    """Test for solution verification of each answer"""
-    config = SimpleIntegrationConfig(seed=42, size=10)
+def test_verify_answer():
+    config = SimpleIntegrationConfig(seed=42)
     dataset = SimpleIntegrationDataset(config)
+    for i in range(len(dataset)):
+        item = dataset[i]
+        score = dataset.score_answer(item["answer"], item["metadata"])
+        assert score == 1.0
 
-    for item in dataset:
-        integrand = parse_expr(item["metadata"]["integrand"])
-        variable = sympy.Symbol(item["metadata"]["variable"])
-        answer = parse_expr(item["answer"].replace(" + C", ""))
 
-        # Verify that the derivative of the answer equals the integrand
-        assert sympy.simplify(sympy.diff(answer, variable) - integrand) == 0
+def test_score_answer_cases():
+    """Test various answer scoring scenarios"""
+    config = SimpleIntegrationConfig(seed=42)
+    dataset = SimpleIntegrationDataset(config)
+    # Score tiers exercised below: 1.0 = derivative matches the integrand,
+    # 0.05 = parses but is wrong, 0.01 = fails to parse (malformed or empty)
+
+    # Test cases: (answer, metadata, expected_score)
+    test_cases = [
+        # Correct answers
+        ("x**2 + C", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("X**3 - 5*X + C", {"variable": "X", "integrand": "3*X**2 - 5"}, 1.0),
+        ("sin(x) + C", {"variable": "x", "integrand": "cos(x)"}, 1.0),
+        # Correct without explicit constant
+        ("x**2", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("log(x)", {"variable": "x", "integrand": "1/x"}, 1.0),
+        # Incorrect but properly formatted
+        ("x**3 + C", {"variable": "x", "integrand": "2*x"}, 0.05),
+        ("cos(X)", {"variable": "X", "integrand": "sin(X)"}, 0.05),
+        # Malformed expressions
+        ("x**2 +", {"variable": "x", "integrand": "2*x"}, 0.01),
+        ("sin(x", {"variable": "x", "integrand": "cos(x)"}, 0.01),
+        # Empty answer
+        ("", {"variable": "x", "integrand": "2*x"}, 0.01),
+        # Case sensitivity
+        ("x**2 + C", {"variable": "X", "integrand": "2*X"}, 0.05),
+        ("X**2 + C", {"variable": "x", "integrand": "2*x"}, 0.05),
+        # Alternative constant notation
+        ("x**2 + K", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("sin(x) + D", {"variable": "x", "integrand": "cos(x)"}, 1.0),
+        # Simplification required
+        ("x**2 + C + 5 - 5", {"variable": "x", "integrand": "2*x"}, 1.0),
+        ("(x**3)/3 - 2*x + C", {"variable": "x", "integrand": "x**2 - 2"}, 1.0),
+    ]
+
+    for answer, metadata, expected in test_cases:
+        score = dataset.score_answer(answer, metadata)
+        assert score == expected, f"Failed case: {answer} | Expected {expected}, got {score}"

From ccff85f81cb7175893648a7a01d3e58dd6b60359 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 21:56:17 +0100
Subject: [PATCH 80/94] run scripts/generate_gallery.py

---
 GALLERY.md | 202 ++++++++++++++++++++---------------------------------
 1 file changed, 77 insertions(+), 125 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index 9b1ec4df..e134f23c 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -23,6 +23,7 @@ This gallery shows examples from all available datasets using their default conf
 - [letter_jumble](#letter_jumble)
 - [maze](#maze)
 - [mini_sudoku](#mini_sudoku)
+- [n_queens](#n_queens)
 - [number_filtering](#number_filtering)
 - [number_sequence](#number_sequence)
 - [number_sorting](#number_sorting)
@@ -36,7 +37,6 @@ This gallery shows examples from all available datasets using their default conf
 - [simple_geometry](#simple_geometry)
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
-- [n_queens](#n_queens)
 - [syllogism](#syllogism)
 - [time_intervals](#time_intervals)
 - [tower_of_hanoi](#tower_of_hanoi)
@@ -999,6 +999,82 @@ Metadata: {'puzzle': [[0, 0, 0, 0], [1, 3, 4, 0], [3, 1, 2, 4], [4, 0, 0, 0]], '
 
 ````
 
+### n_queens
+Generates N Queens puzzles with configurable difficulty
+
+Default configuration:
+```python
+n = 8
+min_remove = 1
+max_remove = 7
+size = 500
+seed = 42
+```
+
+Example tasks:
+````
+Example 1:
+Question: Solve this N Queens puzzle:
+_ _ _ _ _ _ Q _
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ Q _ _ _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+The board size is 8x8 and your job is to place 1 queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+
+Answer: {'_ _ _ _ _ _ Q _\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _'}
+Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 1}
+
+Example 2:
+Question: Solve this N Queens puzzle:
+_ Q _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ Q _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ _ _ Q _ _ _
+
+The board size is 8x8 and your job is to place 3 queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+
+Answer: {'_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\n_ _ _ _ _ Q _ _\n_ _ _ _ _ _ _ Q\n_ _ Q _ _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ _ _ Q _ _ _'}
+Metadata: {'puzzle': [['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']], 'solution': [[['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']]], 'num_removed': 3}
+
+Example 3:
+Question: Solve this N Queens puzzle:
+_ _ _ _ _ _ _ _
+_ Q _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ _ _ _
+_ _ _ _ _ Q _ _
+
+The board size is 8x8 and your job is to place 5 queen(s) on the board such that no two queens attack each other.
+
+No two queens attack each other if they are not in the same row, column, or diagonal.
+
+Place a queen by replacing an underscore (_) with a Q.
+
+Answer: {'_ _ _ _ _ _ _ Q\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _', '_ _ _ _ Q _ _ _\n_ Q _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\nQ _ _ _ _ _ _ _\n_ _ _ Q _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _', '_ _ _ _ _ _ Q _\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _'}
+Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 5}
+
+````
+
 ### number_filtering
 Generates number filtering tasks
 
@@ -1546,130 +1622,6 @@ Metadata: {'puzzle': [[0, 0, 1, 2, 3, 0, 0, 0, 9], [3, 0, 0, 1, 8, 5, 6, 7, 2],
 
 ````
 
-
-### n_queens
-
-Generates N-Queens puzzles with configurable board size and number of starting queens
-
-Default configuration:
-```python
-n = 8
-min_remove = 1
-max_remove = 7
-size = 500
-```
-
-Example tasks:
-````
-Example 1
-Question: Solve this N Queens puzzle:
-_ _ _ _ _ _ Q _
-_ Q _ _ _ _ _ _
-_ _ _ Q _ _ _ _
-_ _ _ _ _ _ _ _
-_ _ _ _ _ _ _ Q
-_ _ _ _ Q _ _ _
-_ _ Q _ _ _ _ _
-_ _ _ _ _ Q _ _
-
-The board size is 8x8 and your job is to place 1 queen(s) on the board such that no two queens attack each other.
-
-No two queens attack each other if they are not in the same row, column, or diagonal.
-
-Place a queen by replacing an underscore (_) with a Q.
-
-Answer 1:
-_ _ _ _ _ _ Q _
-_ Q _ _ _ _ _ _
-_ _ _ Q _ _ _ _
-Q _ _ _ _ _ _ _
-_ _ _ _ _ _ _ Q
-_ _ _ _ Q _ _ _
-_ _ Q _ _ _ _ _
-_ _ _ _ _ Q _ _
-
-Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 1}
-
-Example 2
-Question: Solve this N Queens puzzle:
-_ Q _ _ _ _ _ _
-_ _ _ _ _ _ _ _
-_ _ _ _ _ Q _ _
-_ _ _ _ _ _ _ Q
-_ _ _ _ _ _ _ _
-_ _ _ _ _ _ _ _
-_ _ _ _ _ _ Q _
-_ _ _ _ Q _ _ _
-
-The board size is 8x8 and your job is to place 3 queen(s) on the board such that no two queens attack each other.
-
-No two queens attack each other if they are not in the same row, column, or diagonal.
-
-Place a queen by replacing an underscore (_) with a Q.
-
-Answer 1:
-_ Q _ _ _ _ _ _
-_ _ _ Q _ _ _ _
-_ _ _ _ _ Q _ _
-_ _ _ _ _ _ _ Q
-_ _ Q _ _ _ _ _
-Q _ _ _ _ _ _ _
-_ _ _ _ _ _ Q _
-_ _ _ _ Q _ _ _
-
-Metadata: {'puzzle': [['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']], 'solution': [[['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']]], 'num_removed': 3}
-
-Example 3
-Question: Solve this N Queens puzzle:
-_ _ _ _ _ _ _ _
-_ Q _ _ _ _ _ _
-_ _ _ _ _ _ _ _
-Q _ _ _ _ _ _ _
-_ _ _ _ _ _ _ _
-_ _ _ _ _ _ _ _
-_ _ _ _ _ _ _ _
-_ _ _ _ _ Q _ _
-
-The board size is 8x8 and your job is to place 5 queen(s) on the board such that no two queens attack each other.
-
-No two queens attack each other if they are not in the same row, column, or diagonal.
-
-Place a queen by replacing an underscore (_) with a Q.
-
-Answer 1:
-_ _ _ _ Q _ _ _
-_ Q _ _ _ _ _ _
-_ _ _ _ _ _ _ Q
-Q _ _ _ _ _ _ _
-_ _ _ Q _ _ _ _
-_ _ _ _ _ _ Q _
-_ _ Q _ _ _ _ _
-_ _ _ _ _ Q _ _
-
-Answer 2:
-_ _ _ _ _ _ Q _
-_ Q _ _ _ _ _ _
-_ _ _ Q _ _ _ _
-Q _ _ _ _ _ _ _
-_ _ _ _ _ _ _ Q
-_ _ _ _ Q _ _ _
-_ _ Q _ _ _ _ _
-_ _ _ _ _ Q _ _
-
-Answer 3:
-_ _ _ _ _ _ _ Q
-_ Q _ _ _ _ _ _
-_ _ _ Q _ _ _ _
-Q _ _ _ _ _ _ _
-_ _ _ _ _ _ Q _
-_ _ _ _ Q _ _ _
-_ _ Q _ _ _ _ _
-_ _ _ _ _ Q _ _
-
-Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 5}
-````
-
-
 ### syllogism
 Generates syllogism reasoning tasks
 

From b0267747087b839004d98c731ba1f62a57ed9b83 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:15:47 +0100
Subject: [PATCH 81/94] refactor: Update test cases to use 'solutions' instead
 of 'solution' in metadata

---
 tests/test_n_queens.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_n_queens.py b/tests/test_n_queens.py
index 8476864a..f5b8108c 100644
--- a/tests/test_n_queens.py
+++ b/tests/test_n_queens.py
@@ -49,17 +49,17 @@ def test_nqueens_dataset_items():
 
         # Check metadata
         assert "puzzle" in item["metadata"]
-        assert "solution" in item["metadata"]
+        assert "solutions" in item["metadata"]
         assert "num_removed" in item["metadata"]
 
         puzzle = item["metadata"]["puzzle"]
-        solution = item["metadata"]["solution"]
+        solutions = item["metadata"]["solutions"]
         num_removed = item["metadata"]["num_removed"]
 
         # Verify board dimensions
         assert len(puzzle) == 8
         assert all(len(row) == 8 for row in puzzle)
-        for board in solution:
+        for board in solutions:
             assert len(board) == 8
             assert all(len(row) == 8 for row in board)
 
@@ -69,7 +69,7 @@ def test_nqueens_dataset_items():
         assert removed_count == num_removed
 
         # Verify solution validity
-        for board in solution:
+        for board in solutions:
             assert is_valid_solution(board)
 
             # Verify puzzle matches solution where filled
@@ -98,7 +98,7 @@ def test_nqueens_board_generation():
 
     for i in range(len(dataset)):
         item = dataset[i]
-        for board in item["metadata"]["solution"]:
+        for board in item["metadata"]["solutions"]:
             assert is_valid_solution(board)
 
 

From 751773828f58c64e539c951388a7c43a9070affd Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:15:49 +0100
Subject: [PATCH 82/94] test: Add unit test for score_answer method in N-Queens
 dataset

---
 tests/test_n_queens.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/test_n_queens.py b/tests/test_n_queens.py
index f5b8108c..946685be 100644
--- a/tests/test_n_queens.py
+++ b/tests/test_n_queens.py
@@ -102,6 +102,27 @@ def test_nqueens_board_generation():
             assert is_valid_solution(board)
 
 
+def test_nqueens_score_answer():
+    """Test the score_answer method"""
+    config = NQueensConfig(n=8, size=10, seed=42)
+    dataset = NQueensDataset(config)
+
+    # Test a few items
+    for i in range(len(dataset)):
+        item = dataset[i]
+        
+        # Test correct answer gets score 1.0
+        valid_answer = item["metadata"]["valid_answers"][0]
+        assert dataset.score_answer(valid_answer, item) == 1.0
+        
+        # Test invalid answer gets score 0.01
+        invalid_answer = "_ _ _ _\n_ _ _ _\n_ _ _ _\n_ _ _ _"
+        assert dataset.score_answer(invalid_answer, item) == 0.01
+        
+        # Test None answer gets score 0.0
+        assert dataset.score_answer(None, item) == 0.0
+
+
 def is_valid_solution(board: list[list[str]]) -> bool:
     """Helper function to verify N Queens solution validity"""
     rows, cols, diags, off_diags = set(), set(), set(), set()

From 057b9f203481356d7f5386a9dcec9a0dc560b464 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:18:54 +0100
Subject: [PATCH 83/94] auto-load simple/intermediate integration tasks, stable
 order for n_queens (set was not stable)

---
 GALLERY.md                        | 113 ++++++++++++++++++++++++++++--
 reasoning_gym/algebra/__init__.py |  10 ++-
 reasoning_gym/games/n_queens.py   |  13 ++--
 tests/test_n_queens.py            |   6 +-
 4 files changed, 127 insertions(+), 15 deletions(-)

diff --git a/GALLERY.md b/GALLERY.md
index e134f23c..be5a3a65 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -17,6 +17,7 @@ This gallery shows examples from all available datasets using their default conf
 - [fraction_simplification](#fraction_simplification)
 - [game_of_life](#game_of_life)
 - [gcd](#gcd)
+- [intermediate_integration](#intermediate_integration)
 - [lcm](#lcm)
 - [leg_counting](#leg_counting)
 - [letter_counting](#letter_counting)
@@ -35,6 +36,7 @@ This gallery shows examples from all available datasets using their default conf
 - [sentence_reordering](#sentence_reordering)
 - [simple_equations](#simple_equations)
 - [simple_geometry](#simple_geometry)
+- [simple_integration](#simple_integration)
 - [spell_backward](#spell_backward)
 - [sudoku](#sudoku)
 - [syllogism](#syllogism)
@@ -746,6 +748,48 @@ Metadata: {'numbers': [297, 30], 'result': 3}
 
 ````
 
+### intermediate_integration
+Generates intermediate integration problem - either
+    by substitution or by parts
+
+Default configuration:
+```python
+problem_types = ('substitution', 'by_parts')
+substitution_types = ('linear', 'trigonometric', 'exponential', 'radical')
+by_parts_types = ('polynomial_exp_trig', 'log_inverse_trig', 'cyclic', 'repeated_parts')
+seed = 42
+size = 500
+linear_lower_bound = 1
+linear_upper_bound = 10
+min_linear_degree = 2
+max_linear_degree = 4
+outer_constant_min = 1
+outer_constant_max = 3
+min_poly_degree = 1
+max_poly_degree = 3
+symbols = ('x', 'X')
+operators = ('+', '-')
+```
+
+Example tasks:
+````
+Example 1:
+Question: Find the indefinite integral: ∫ -3*exp(3*x + 9) dx
+Answer: -exp(3*x + 9) + C
+Metadata: {'integrand': '-3*exp(3*x + 9)', 'problem_type': 'substitution', 'variable': 'x', 'type': 'exponential', 'expected_answer_expression': -exp(3*x + 9)}
+
+Example 2:
+Question: Evaluate the indefinite integral: ∫ -6*sin(2*X + 10)*cos(2*X + 10)**4 dx
+Answer: 3*cos(2*X + 10)**5/5 + C
+Metadata: {'integrand': '-6*sin(2*X + 10)*cos(2*X + 10)**4', 'problem_type': 'substitution', 'variable': 'X', 'type': 'trigonometric', 'expected_answer_expression': 3*cos(2*X + 10)**5/5}
+
+Example 3:
+Question: Find the indefinite integral: ∫ 2*asin(x) dx
+Answer: 2*Integral(asin(x), x) + C
+Metadata: {'integrand': '2*asin(x)', 'problem_type': 'by_parts', 'variable': 'x', 'type': 'log_inverse_trig', 'expected_answer_expression': 2*Integral(asin(x), x)}
+
+````
+
 ### lcm
 Generates Least Common Multiple (LCM) tasks
 
@@ -1030,8 +1074,15 @@ No two queens attack each other if they are not in the same row, column, or diag
 
 Place a queen by replacing an underscore (_) with a Q.
 
-Answer: {'_ _ _ _ _ _ Q _\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _'}
-Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 1}
+Answer: _ _ _ _ _ _ Q _
+_ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+_ _ _ _ Q _ _ _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solutions': [[['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 1, 'valid_answers': ['_ _ _ _ _ _ Q _\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _']}
 
 Example 2:
 Question: Solve this N Queens puzzle:
@@ -1050,8 +1101,15 @@ No two queens attack each other if they are not in the same row, column, or diag
 
 Place a queen by replacing an underscore (_) with a Q.
 
-Answer: {'_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\n_ _ _ _ _ Q _ _\n_ _ _ _ _ _ _ Q\n_ _ Q _ _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ _ _ Q _ _ _'}
-Metadata: {'puzzle': [['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']], 'solution': [[['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']]], 'num_removed': 3}
+Answer: _ Q _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+_ _ _ _ _ Q _ _
+_ _ _ _ _ _ _ Q
+_ _ Q _ _ _ _ _
+Q _ _ _ _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ _ _ Q _ _ _
+Metadata: {'puzzle': [['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']], 'solutions': [[['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_']]], 'num_removed': 3, 'valid_answers': ['_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\n_ _ _ _ _ Q _ _\n_ _ _ _ _ _ _ Q\n_ _ Q _ _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ _ _ Q _ _ _']}
 
 Example 3:
 Question: Solve this N Queens puzzle:
@@ -1070,8 +1128,15 @@ No two queens attack each other if they are not in the same row, column, or diag
 
 Place a queen by replacing an underscore (_) with a Q.
 
-Answer: {'_ _ _ _ _ _ _ Q\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _', '_ _ _ _ Q _ _ _\n_ Q _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\nQ _ _ _ _ _ _ _\n_ _ _ Q _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _', '_ _ _ _ _ _ Q _\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _'}
-Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solution': [[['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 5}
+Answer: _ _ _ _ Q _ _ _
+_ Q _ _ _ _ _ _
+_ _ _ _ _ _ _ Q
+Q _ _ _ _ _ _ _
+_ _ _ Q _ _ _ _
+_ _ _ _ _ _ Q _
+_ _ Q _ _ _ _ _
+_ _ _ _ _ Q _ _
+Metadata: {'puzzle': [['_', '_', '_', '_', '_', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], 'solutions': [[['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']], [['_', '_', '_', '_', '_', '_', '_', 'Q'], ['_', 'Q', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', 'Q', '_', '_', '_', '_'], ['Q', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', 'Q', '_'], ['_', '_', '_', '_', 'Q', '_', '_', '_'], ['_', '_', 'Q', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', 'Q', '_', '_']]], 'num_removed': 5, 'valid_answers': ['_ _ _ _ Q _ _ _\n_ Q _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\nQ _ _ _ _ _ _ _\n_ _ _ Q _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _', '_ _ _ _ _ _ Q _\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ _ Q\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _', '_ _ _ _ _ _ _ Q\n_ Q _ _ _ _ _ _\n_ _ _ Q _ _ _ _\nQ _ _ _ _ _ _ _\n_ _ _ _ _ _ Q _\n_ _ _ _ Q _ _ _\n_ _ Q _ _ _ _ _\n_ _ _ _ _ Q _ _']}
 
 ````
 
@@ -1512,6 +1577,42 @@ Metadata: {'n_sides': 6, 'known_angles': [143.0, 148.0, 39.0, 55.0, 107.0], 'sum
 
 ````
 
+### simple_integration
+Generates simple integration problems with one variable
+
+Default configuration:
+```python
+min_terms = 2
+max_terms = 5
+min_degree = 1
+max_degree = 10
+min_bounds = 1
+max_bounds = 10
+operators = ('+', '-')
+symbols = ('x', 'X')
+seed = 42
+size = 500
+```
+
+Example tasks:
+````
+Example 1:
+Question: Find the indefinite integral: ∫ 70*x**6 + 12*x**2/5 dx
+Answer: 10*x**7 + 4*x**3/5 + C
+Metadata: {'integrand': '70*x**6 + 12*x**2/5', 'variable': 'x', 'expected_answer_expression': 10*x**7 + 4*x**3/5}
+
+Example 2:
+Question: Find the indefinite integral: ∫ 49*x**6/10 + 48*x**5 - 4*x - 10/9 dx
+Answer: 7*x**7/10 + 8*x**6 - 2*x**2 - 10*x/9 + C
+Metadata: {'integrand': '49*x**6/10 + 48*x**5 - 4*x - 10/9', 'variable': 'x', 'expected_answer_expression': 7*x**7/10 + 8*x**6 - 2*x**2 - 10*x/9}
+
+Example 3:
+Question: Find the indefinite integral: ∫ -28*X**3 + 8*X dx
+Answer: -7*X**4 + 4*X**2 + C
+Metadata: {'integrand': '-28*X**3 + 8*X', 'variable': 'X', 'expected_answer_expression': -7*X**4 + 4*X**2}
+
+````
+
 ### spell_backward
 Generates tasks to spell words backward
 
diff --git a/reasoning_gym/algebra/__init__.py b/reasoning_gym/algebra/__init__.py
index 69d4b91e..fc7a867a 100644
--- a/reasoning_gym/algebra/__init__.py
+++ b/reasoning_gym/algebra/__init__.py
@@ -1,9 +1,15 @@
+from .intermediate_integration import IntermediateIntegrationConfig, IntermediateIntegrationDataset
 from .polynomial_equations import PolynomialEquationsConfig, PolynomialEquationsDataset
 from .simple_equations import SimpleEquationsConfig, SimpleEquationsDataset
+from .simple_integration import SimpleIntegrationConfig, SimpleIntegrationDataset
 
 __all__ = [
-    "SimpleEquationsDataset",
-    "SimpleEquationsConfig",
+    "IntermediateIntegrationConfig",
+    "IntermediateIntegrationDataset",
     "PolynomialEquationsConfig",
     "PolynomialEquationsDataset",
+    "SimpleEquationsDataset",
+    "SimpleEquationsConfig",
+    "SimpleIntegrationConfig",
+    "SimpleIntegrationDataset",
 ]
diff --git a/reasoning_gym/games/n_queens.py b/reasoning_gym/games/n_queens.py
index 0af85a0d..1fef6c62 100644
--- a/reasoning_gym/games/n_queens.py
+++ b/reasoning_gym/games/n_queens.py
@@ -136,16 +136,21 @@ class NQueensDataset(ProceduralDataset):
 
         # Filter all solutions that are intractable from the puzzle's starting state
         valid_solutions = [board for board in self._solutions if self._is_tractable_solution(puzzle, board)]
-        valid_solutions_str = {self._board_to_string(board) for board in valid_solutions}
+        valid_solutions_str = sorted({self._board_to_string(board) for board in valid_solutions})
 
         return {
             "question": QUESTION_TEMPLATE.format(puzzle=puzzle_str, n=len(puzzle), num_removed=num_removed),
-            "answer": valid_solutions_str,
-            "metadata": {"puzzle": puzzle, "solution": valid_solutions, "num_removed": num_removed},
+            "answer": rng.choice(valid_solutions_str),  # choose arbitrary answer (e.g. for SFT)
+            "metadata": {
+                "puzzle": puzzle,
+                "solutions": valid_solutions,
+                "num_removed": num_removed,
+                "valid_answers": valid_solutions_str,
+            },
         }
 
     def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
-        valid_solutions = entry["answer"]
+        valid_solutions = entry["metadata"]["valid_answers"]
         reward = 0.0
         if answer is not None:
             if answer in valid_solutions:
diff --git a/tests/test_n_queens.py b/tests/test_n_queens.py
index 946685be..16911220 100644
--- a/tests/test_n_queens.py
+++ b/tests/test_n_queens.py
@@ -110,15 +110,15 @@ def test_nqueens_score_answer():
     # Test a few items
     for i in range(len(dataset)):
         item = dataset[i]
-        
+
         # Test correct answer gets score 1.0
         valid_answer = item["metadata"]["valid_answers"][0]
         assert dataset.score_answer(valid_answer, item) == 1.0
-        
+
         # Test invalid answer gets score 0.01
         invalid_answer = "_ _ _ _\n_ _ _ _\n_ _ _ _\n_ _ _ _"
         assert dataset.score_answer(invalid_answer, item) == 0.01
-        
+
         # Test None answer gets score 0.0
         assert dataset.score_answer(None, item) == 0.0
 

From 82196bd2dfa98d4780e8a22593b26e90706409fc Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:26:24 +0100
Subject: [PATCH 84/94] bump version to 0.1.3, uploaded to pypi

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 52d04319..964c0b4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "reasoning_gym"
-version = "0.1.2"
+version = "0.1.3"
 authors = [
   { name = "Open-Thought community", email = "andreas.koepf@xamla.com" },
 ]

From 01cc239746fdbf2ec9b9f006565413e1cd50ff1d Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:35:43 +0100
Subject: [PATCH 85/94] add quantum lock answer format hint

---
 reasoning_gym/graphs/quantum_lock.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reasoning_gym/graphs/quantum_lock.py b/reasoning_gym/graphs/quantum_lock.py
index 402b6f0c..5863a5bf 100644
--- a/reasoning_gym/graphs/quantum_lock.py
+++ b/reasoning_gym/graphs/quantum_lock.py
@@ -28,7 +28,7 @@ class QuantumLockDataset(ProceduralDataset):
         self._prompt_templates = [
             """\
 In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
-You must press the shortest correct sequence of buttons to reach the target value.
+You must press the shortest correct sequence of buttons to reach the target value. Your answer should be a sequence of buttons separated by '→', for example: A → B → C
 
 Start: {initial_value} ({initial_state})
 Target: {target_value}

From 84e4f1c5bccd85543ae3cdfa166d37462bf7e271 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:42:21 +0100
Subject: [PATCH 86/94] feat: Add task augmentation functions mirror, inverse,
 and identity to arc_1d.py

---
 reasoning_gym/cognition/arc_1d.py | 25 +++++++++++++++++++++++++
 tests/test_arc_1d.py              | 18 ++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index d9bb2f82..2aa65d76 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -1030,6 +1030,31 @@ def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, L
     return {"input": question, "output": answer}
 
 
+def task_mirror(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+    """Mirror the input and output arrays of a task result."""
+    if task_result is None:
+        return None
+    return {
+        "input": list(reversed(task_result["input"])),
+        "output": list(reversed(task_result["output"]))
+    }
+
+
+def task_inverse(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+    """Swap the input and output arrays of a task result."""
+    if task_result is None:
+        return None
+    return {
+        "input": task_result["output"],
+        "output": task_result["input"]
+    }
+
+
+def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+    """Return the task result unchanged."""
+    return task_result
+
+
 def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where left half of blocks are colored differently."""
     pos = 0
diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index f724037f..082869c6 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -46,6 +46,24 @@ def test_all_arc_1d_tasks():
     # Fixed move_pix value for testing
     move_pix = 2
 
+    # Test task augmentation functions
+    base_task = task_move_n_pix(rng, size, move_pix, True)
+    assert base_task is not None
+    
+    mirrored = task_mirror(base_task)
+    assert mirrored is not None
+    assert mirrored["input"] == list(reversed(base_task["input"]))
+    assert mirrored["output"] == list(reversed(base_task["output"]))
+    
+    inversed = task_inverse(base_task)
+    assert inversed is not None
+    assert inversed["input"] == base_task["output"]
+    assert inversed["output"] == base_task["input"]
+    
+    identical = task_identity(base_task)
+    assert identical is not None
+    assert identical == base_task
+
     tasks = [
         (task_move_n_pix, {"move_pix": move_pix, "solid": True}),
         (task_move_n_pix_wrapped, {"move_pix": move_pix, "solid": True}),

From 905ef7b89d9c47d05c12e61e012703faa669b740 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:42:43 +0100
Subject: [PATCH 87/94] feat: Add missing task transformation imports to
 test_arc_1d.py

---
 tests/test_arc_1d.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 082869c6..1ce05148 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -20,7 +20,10 @@ from reasoning_gym.cognition.arc_1d import (
     task_gravity_counting,
     task_gravity_one_step,
     task_gravity_weighted_colors,
+    task_identity,
+    task_inverse,
     task_mark_size_two_blocks,
+    task_mirror,
     task_move_block_by_own_size,
     task_move_n_pix,
     task_move_n_pix_wrapped,

From b599d6e1a2cd90984e5434d3e91c7055067c61cb Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:49:00 +0100
Subject: [PATCH 88/94] feat: Add Arc1D dataset with comprehensive task
 generation and configuration

---
 reasoning_gym/cognition/arc_1d.py | 170 +++++++++++++++++++++++++++++-
 1 file changed, 169 insertions(+), 1 deletion(-)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index 2aa65d76..5fb643a6 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -1,5 +1,8 @@
 from random import Random
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Callable, Tuple
+
+from ..dataset import ProceduralDataset
+from ..factory import register_dataset
 
 
 def gen_field(size: int, color: int = 0) -> List[int]:
@@ -1007,6 +1010,167 @@ def task_repeat_pattern_full(rng: Random, size: int) -> Optional[Dict[str, List[
     return {"input": question, "output": answer}
 
 
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple
+
+@dataclass
+class Arc1DConfig:
+    """Configuration for ARC 1D task generation"""
+    min_size: int = 10  # Minimum grid size
+    max_size: int = 30  # Maximum grid size 
+    num_train: int = 3  # Number of training examples
+    seed: Optional[int] = None
+    size: int = 500
+
+    def validate(self) -> None:
+        """Validate configuration parameters"""
+        assert self.min_size > 0, "min_size must be positive"
+        assert self.max_size >= self.min_size, "max_size must be >= min_size"
+        assert self.num_train > 0, "num_train must be positive"
+        assert self.size > 0, "size must be positive"
+
+
+# Table of all ARC 1D task functions with their parameters
+ARC_1D_TASKS = {
+    # Move tasks
+    "move_1pix_solid": (task_move_n_pix, {"move_pix": 1, "solid": True}),
+    "move_2pix_solid": (task_move_n_pix, {"move_pix": 2, "solid": True}),
+    "move_3pix_solid": (task_move_n_pix, {"move_pix": 3, "solid": True}),
+    "move_4pix_solid": (task_move_n_pix, {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful": (task_move_n_pix, {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful": (task_move_n_pix, {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful": (task_move_n_pix, {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful": (task_move_n_pix, {"move_pix": 4, "solid": False}),
+    
+    # Move wrapped tasks
+    "move_1pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": True}),
+    "move_2pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
+    "move_3pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": True}),
+    "move_4pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": False}),
+
+    # Gravity tasks
+    "gravity": (task_gravity, {}),
+    "gravity_counting": (task_gravity_counting, {}),
+    "gravity_antigravity": (task_gravity_antigravity, {}),
+    "gravity_one_step": (task_gravity_one_step, {}),
+    "gravity_weighted_colors": (task_gravity_weighted_colors, {}),
+
+    # Block tasks
+    "block_touch_dot": (task_block_touch_dot, {}),
+    "block_touch_dot_1pix": (task_block_touch_dot_n_pix, {"move_pix": 1}),
+    "block_touch_dot_2pix": (task_block_touch_dot_n_pix, {"move_pix": 2}),
+    "block_touch_dot_3pix": (task_block_touch_dot_n_pix, {"move_pix": 3}),
+    "block_touch_dot_4pix": (task_block_touch_dot_n_pix, {"move_pix": 4}),
+    "block_scale_to_dot": (task_block_scale_to_dot, {}),
+    "block_and_noise_remove": (task_block_and_noise_remove, {}),
+    "block_and_noise_remove_inside": (task_block_and_noise_remove_inside, {}),
+    "move_block_by_own_size": (task_move_block_by_own_size, {}),
+
+    # Pattern tasks
+    "two_points_and_fill": (task_two_points_and_fill, {}),
+    "copy_block_to_dots": (task_copy_block_to_dots, {}),
+    "copy_block_to_dots_colors": (task_copy_block_to_dots_colors, {}),
+    "repeat_pattern_full": (task_repeat_pattern_full, {}),
+
+    # Reflection tasks
+    "reflect_block_with_border_pixel": (task_reflect_block_with_border_pixel, {}),
+    "reflect_block_random": (task_reflect_block_with_border_pixel_random, {}),
+    "reflect_block_around_dot": (task_reflect_block_around_dot, {}),
+
+    # Color tasks
+    "paint_biggest_block": (task_paint_biggest_block, {}),
+    "recolor_blocks_by_size": (task_recolor_blocks_by_size, {}),
+    "change_to_five": (task_change_to_five, {}),
+    "recolor_blocks_from_palette": (task_recolor_blocks_from_palette, {}),
+    "color_left_half_blocks": (task_color_left_half_blocks, {}),
+
+    # Sorting tasks
+    "sort_blocks_by_size": (task_sort_blocks_by_size, {}),
+    "sort_complete_sequence": (task_sort_complete_sequence, {}),
+
+    # Fill tasks
+    "duplicate_block_from_seeds": (task_duplicate_block_from_seeds, {}),
+    "fill_from_pixel": (task_fill_from_pixel, {}),
+    "fill_until_collision": (task_fill_until_collision, {}),
+    
+    # Marking tasks
+    "mark_size_two_blocks": (task_mark_size_two_blocks, {}),
+}
+
+
+class Arc1DDataset(ProceduralDataset):
+    """Generates ARC 1D tasks by randomly selecting from available task generators"""
+
+    def __init__(self, config: Arc1DConfig):
+        super().__init__(config=config, seed=config.seed, size=config.size)
+        self.task_names = list(ARC_1D_TASKS.keys())
+
+    def __getitem__(self, idx: int) -> dict:
+        """Generate a single ARC 1D task with training examples
+
+        Args:
+            idx: Index of the item to generate
+
+        Returns:
+            dict with keys:
+                - question: str, the task description and examples
+                - answer: str, the expected output format
+                - metadata: dict with generation parameters
+        """
+        # Create deterministic RNG from base seed and idx
+        item_rng = random.Random(self.seed + idx)
+        
+        # Select random task
+        task_name = item_rng.choice(self.task_names)
+        task_func, task_kwargs = ARC_1D_TASKS[task_name]
+        
+        # Generate training examples
+        train_examples = []
+        size = item_rng.randint(self.config.min_size, self.config.max_size)
+        
+        for _ in range(self.config.num_train):
+            example = None
+            while example is None:
+                example = task_func(item_rng, size, **task_kwargs)
+        
+            train_examples.append(example)
+            
+        # Generate test example
+        test_example = None
+        while test_example is None:
+            test_example = task_func(item_rng, size, **task_kwargs)
+            
+        # Format question
+        question = "Find the common rule that maps an input grid to an output grid, given the examples below.\n\n"
+        
+        # Add training examples
+        for i, example in enumerate(train_examples, 1):
+            question += f"Example {i}:\n"
+            question += "Input:  " + " ".join(str(x) for x in example["input"]) + "\n"
+            question += "Output: " + " ".join(str(x) for x in example["output"]) + "\n\n"
+            
+        # Add test input
+        question += "Below is a test input grid. Predict the corresponding output grid by applying the rule you found. "
+        question += "Describe how you derived the rule and your overall reasoning process in detail before you submit your answer. "
+        question += "Your final answer must be placed in <output></output> tags and should be just be the text output grid itself.\n\n"
+        question += "Input:\n"
+        question += " ".join(str(x) for x in test_example["input"])
+
+        return {
+            "question": question,
+            "answer": " ".join(str(x) for x in test_example["output"]),
+            "metadata": {
+                "task_name": task_name,
+                "size": size,
+                "train_examples": train_examples,
+                "test_example": test_example,
+            },
+        }
+
 def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where color 2 is heavier than color 1 in gravity."""
     # Generate random field with only colors 1 and 2
@@ -1055,6 +1219,10 @@ def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[
     return task_result
 
 
+# Register the dataset
+register_dataset("arc_1d", Arc1DDataset, Arc1DConfig)
+
+
 def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
     """Generate a task where left half of blocks are colored differently."""
     pos = 0

From a060348a9ce5e17b662438fe831b1b294105cd7d Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 22:49:28 +0100
Subject: [PATCH 89/94] fix: Resolve undefined task function references in
 arc_1d.py

---
 reasoning_gym/cognition/arc_1d.py | 109 ++++++++++++++++--------------
 1 file changed, 57 insertions(+), 52 deletions(-)

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index 5fb643a6..badea2c4 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -1,3 +1,4 @@
+import random
 from random import Random
 from typing import Dict, List, Optional, Callable, Tuple
 
@@ -1030,6 +1031,62 @@ class Arc1DConfig:
         assert self.size > 0, "size must be positive"
 
 
+def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where color 2 is heavier than color 1 in gravity."""
+    # Generate random field with only colors 1 and 2
+    question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
+
+    # Count colors
+    count_1 = sum(1 for x in question if x == 1)
+    count_2 = sum(1 for x in question if x == 2)
+
+    # Create answer with sorted colors
+    answer = gen_field(size)
+
+    # Place heavier color 2 first
+    for i in range(count_2):
+        answer[i] = 2
+
+    # Then place color 1
+    for i in range(count_1):
+        answer[count_2 + i] = 1
+
+    return {"input": question, "output": answer}
+
+
+def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where left half of blocks are colored differently."""
+    pos = 0
+    question = gen_field(size)
+    blocks = []
+
+    # Generate blocks with gap 1
+    while pos < size:
+        if rng.random() < 0.4:
+            block_size = rng.randint(2, 8)
+            if pos + block_size >= size:
+                break
+
+            blocks.append((pos, block_size))
+            for i in range(block_size):
+                question[pos + i] = 2
+            pos += block_size + 1  # block size + gap
+        else:
+            pos += 1
+
+    if len(blocks) < 2:
+        return None
+
+    # Create answer with half-colored blocks
+    answer = question.copy()
+    for pos, block_size in blocks:
+        half_size = block_size // 2
+        for i in range(half_size):
+            answer[pos + i] = 8
+
+    return {"input": question, "output": answer}
+
+
 # Table of all ARC 1D task functions with their parameters
 ARC_1D_TASKS = {
     # Move tasks
@@ -1171,27 +1228,6 @@ class Arc1DDataset(ProceduralDataset):
             },
         }
 
-def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where color 2 is heavier than color 1 in gravity."""
-    # Generate random field with only colors 1 and 2
-    question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
-
-    # Count colors
-    count_1 = sum(1 for x in question if x == 1)
-    count_2 = sum(1 for x in question if x == 2)
-
-    # Create answer with sorted colors
-    answer = gen_field(size)
-
-    # Place heavier color 2 first
-    for i in range(count_2):
-        answer[i] = 2
-
-    # Then place color 1
-    for i in range(count_1):
-        answer[count_2 + i] = 1
-
-    return {"input": question, "output": answer}
 
 
 def task_mirror(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
@@ -1223,34 +1259,3 @@ def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[
 register_dataset("arc_1d", Arc1DDataset, Arc1DConfig)
 
 
-def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where left half of blocks are colored differently."""
-    pos = 0
-    question = gen_field(size)
-    blocks = []
-
-    # Generate blocks with gap 1
-    while pos < size:
-        if rng.random() < 0.4:
-            block_size = rng.randint(2, 8)
-            if pos + block_size >= size:
-                break
-
-            blocks.append((pos, block_size))
-            for i in range(block_size):
-                question[pos + i] = 2
-            pos += block_size + 1  # block size + gap
-        else:
-            pos += 1
-
-    if len(blocks) < 2:
-        return None
-
-    # Create answer with half-colored blocks
-    answer = question.copy()
-    for pos, block_size in blocks:
-        half_size = block_size // 2
-        for i in range(half_size):
-            answer[pos + i] = 8
-
-    return {"input": question, "output": answer}

From 9a1270dd95b1404b7c61976cc94e6167446f935b Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 23:03:56 +0100
Subject: [PATCH 90/94] add arc_1d dataset

---
 GALLERY.md                                    |   89 +-
 reasoning_gym/__init__.py                     |    2 +-
 reasoning_gym/cognition/__init__.py           |   11 +-
 reasoning_gym/cognition/arc_1d.py             | 1194 +----------------
 reasoning_gym/cognition/arc_1d_tasks.py       | 1145 ++++++++++++++++
 ...arithmetic.py => test_basic_arithmetic.py} |    0
 6 files changed, 1256 insertions(+), 1185 deletions(-)
 create mode 100644 reasoning_gym/cognition/arc_1d_tasks.py
 rename tests/{test_arithmetic.py => test_basic_arithmetic.py} (100%)

diff --git a/GALLERY.md b/GALLERY.md
index be5a3a65..56e5836f 100644
--- a/GALLERY.md
+++ b/GALLERY.md
@@ -4,6 +4,7 @@ This gallery shows examples from all available datasets using their default conf
 ## Available Datasets
 - [advanced_geometry](#advanced_geometry)
 - [aiw](#aiw)
+- [arc_1d](#arc_1d)
 - [base_conversion](#base_conversion)
 - [basic_arithmetic](#basic_arithmetic)
 - [bf](#bf)
@@ -122,6 +123,88 @@ Metadata: {'task_type': 'friends'}
 
 ````
 
+### arc_1d
+Generates ARC 1D tasks by randomly selecting from available task generators
+
+Default configuration:
+```python
+min_size = 10
+max_size = 30
+num_train = 3
+seed = 42
+size = 500
+```
+
+Example tasks:
+````
+Example 1:
+Question: Find the common rule that maps an input grid to an output grid, given the examples below.
+
+Example 1:
+Input:  7 1 0 0 5 5 0 5 5 0 0 0 0
+Output: 7 1 0 0 7 7 0 1 1 0 0 0 0
+
+Example 2:
+Input:  5 1 0 5 5 0 5 5 0 0 0 0 0
+Output: 5 1 0 5 5 0 1 1 0 0 0 0 0
+
+Example 3:
+Input:  2 6 0 0 5 5 0 5 5 0 0 0 0
+Output: 2 6 0 0 2 2 0 6 6 0 0 0 0
+
+Below is a test input grid. Predict the corresponding output grid by applying the rule you found. Describe how you derived the rule and your overall reasoning process in detail before you submit your answer. Your final answer must be placed in <output></output> tags and should be just be the text output grid itself.
+
+Input:
+6 0 0 0 0 0 0 5 5 5 0 0 0
+Answer: 6 0 0 0 0 0 0 6 6 6 0 0 0
+Metadata: {'task_name': 'recolor_blocks_from_palette', 'size': 13, 'train_examples': [{'input': [7, 1, 0, 0, 5, 5, 0, 5, 5, 0, 0, 0, 0], 'output': [7, 1, 0, 0, 7, 7, 0, 1, 1, 0, 0, 0, 0]}, {'input': [5, 1, 0, 5, 5, 0, 5, 5, 0, 0, 0, 0, 0], 'output': [5, 1, 0, 5, 5, 0, 1, 1, 0, 0, 0, 0, 0]}, {'input': [2, 6, 0, 0, 5, 5, 0, 5, 5, 0, 0, 0, 0], 'output': [2, 6, 0, 0, 2, 2, 0, 6, 6, 0, 0, 0, 0]}], 'test_example': {'input': [6, 0, 0, 0, 0, 0, 0, 5, 5, 5, 0, 0, 0], 'output': [6, 0, 0, 0, 0, 0, 0, 6, 6, 6, 0, 0, 0]}}
+
+Example 2:
+Question: Find the common rule that maps an input grid to an output grid, given the examples below.
+
+Example 1:
+Input:  0 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0
+Output: 0 0 0 0 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0
+
+Example 2:
+Input:  0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0
+Output: 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2
+
+Example 3:
+Input:  0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0
+Output: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0
+
+Below is a test input grid. Predict the corresponding output grid by applying the rule you found. Describe how you derived the rule and your overall reasoning process in detail before you submit your answer. Your final answer must be placed in <output></output> tags and should be just be the text output grid itself.
+
+Input:
+0 0 0 0 0 0 6 6 6 6 6 6 6 6 6 0 0 0 0
+Answer: 0 0 0 0 0 0 0 0 0 6 6 6 6 6 6 6 6 6 0
+Metadata: {'task_name': 'move_3pix_solid', 'size': 19, 'train_examples': [{'input': [0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0], 'output': [0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0]}, {'input': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0], 'output': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2]}, {'input': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], 'output': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]}], 'test_example': {'input': [0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0], 'output': [0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0]}}
+
+Example 3:
+Question: Find the common rule that maps an input grid to an output grid, given the examples below.
+
+Example 1:
+Input:  0 0 0 0 0 0 0 2 0 0 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0
+Output: 0 0 0 0 0 0 0 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0
+
+Example 2:
+Input:  0 0 0 2 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 0 0 0 0 0
+Output: 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0
+
+Example 3:
+Input:  0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 3 0 0 0 0
+Output: 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 0 0
+
+Below is a test input grid. Predict the corresponding output grid by applying the rule you found. Describe how you derived the rule and your overall reasoning process in detail before you submit your answer. Your final answer must be placed in <output></output> tags and should be just be the text output grid itself.
+
+Input:
+0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7 7 7 7 2 0
+Answer: 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7 7 7 7 7 0
+Metadata: {'task_name': 'block_scale_to_dot', 'size': 26, 'train_examples': [{'input': [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0], 'output': [0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0]}, {'input': [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0], 'output': [0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0]}, {'input': [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0], 'output': [0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0]}], 'test_example': {'input': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 0], 'output': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0]}}
+
+````
+
 ### base_conversion
 Generates base conversion tasks
 
@@ -1368,7 +1451,7 @@ Example tasks:
 ````
 Example 1:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
-You must press the shortest correct sequence of buttons to reach the target value.
+You must press the shortest correct sequence of buttons to reach the target value. Your answer should be a sequence of buttons separated by '→', for example: A → B → C
 
 Start: 0 (red)
 Target: 46
@@ -1381,7 +1464,7 @@ Metadata: {'difficulty': 10, 'solution_path': ['A', 'B', 'C', 'C', 'A', 'C'], 't
 
 Example 2:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
-You must press the shortest correct sequence of buttons to reach the target value.
+You must press the shortest correct sequence of buttons to reach the target value. Your answer should be a sequence of buttons separated by '→', for example: A → B → C
 
 Start: 0 (red)
 Target: 30
@@ -1394,7 +1477,7 @@ Metadata: {'difficulty': 10, 'solution_path': ['C', 'A', 'C', 'A', 'C', 'A', 'C'
 
 Example 3:
 Question: In front of you are some buttons, a light, and a number. The light will toggle between red and green whenever you press a button. Each button performs a mathematical operation to the number, but the operation may depend on the state of the light.
-You must press the shortest correct sequence of buttons to reach the target value.
+You must press the shortest correct sequence of buttons to reach the target value. Your answer should be a sequence of buttons separated by '→', for example: A → B → C
 
 Start: 0 (red)
 Target: 45
diff --git a/reasoning_gym/__init__.py b/reasoning_gym/__init__.py
index 49068294..054cbd95 100644
--- a/reasoning_gym/__init__.py
+++ b/reasoning_gym/__init__.py
@@ -5,7 +5,7 @@ Reasoning Gym - A library of procedural dataset generators for training reasonin
 from . import algebra, algorithmic, arithmetic, cognition, data, games, geometry, graphs, logic
 from .factory import create_dataset, register_dataset
 
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 __all__ = [
     "algebra",
     "algorithmic",
diff --git a/reasoning_gym/cognition/__init__.py b/reasoning_gym/cognition/__init__.py
index fddd97b1..38baf31b 100644
--- a/reasoning_gym/cognition/__init__.py
+++ b/reasoning_gym/cognition/__init__.py
@@ -6,18 +6,21 @@ Cognition tasks for training reasoning capabilities:
 - Working memory
 """
 
+from .arc_1d import Arc1DConfig, Arc1DDataset
 from .color_cube_rotation import ColorCubeRotationConfig, ColorCubeRotationDataset
 from .figlet_fonts import FigletFontConfig, FigletFontDataset
 from .number_sequences import NumberSequenceConfig, NumberSequenceDataset
 from .rubiks_cube import RubiksCubeConfig, RubiksCubeDataset
 
 __all__ = [
-    "NumberSequenceConfig",
-    "NumberSequenceDataset",
+    "Arc1DConfig",
+    "Arc1DDataset",
     "ColorCubeRotationConfig",
     "ColorCubeRotationDataset",
-    "RubiksCubeConfig",
-    "RubiksCubeDataset",
     "FigletFontConfig",
     "FigletFontDataset",
+    "NumberSequenceConfig",
+    "NumberSequenceDataset",
+    "RubiksCubeConfig",
+    "RubiksCubeDataset",
 ]
diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index badea2c4..332a5851 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -1,1024 +1,17 @@
-import random
+from dataclasses import dataclass
 from random import Random
-from typing import Dict, List, Optional, Callable, Tuple
+from typing import Optional
 
 from ..dataset import ProceduralDataset
 from ..factory import register_dataset
 
 
-def gen_field(size: int, color: int = 0) -> List[int]:
-    """Generate a field of given size filled with specified color (default 0)."""
-    return [color] * size
-
-
-def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
-    """Write a block into a field at given position."""
-    result = field.copy()
-    for i, color in enumerate(block):
-        result[pos + i] = color
-    return result
-
-
-def task_move_n_pix(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block is moved to the right by move_pix pixels."""
-    if size <= move_pix + 1:
-        return None
-
-    block_size = rng.randint(1, size - move_pix - 1)
-    block_pos = rng.randint(0, size - block_size - move_pix)
-
-    if solid:
-        color = rng.randint(1, 9)
-        block = [color] * block_size
-    else:
-        block = [rng.randint(1, 9) for _ in range(block_size)]
-
-    question = write_block(block_pos, block, gen_field(size))
-    answer = write_block(block_pos + move_pix, block, gen_field(size))
-
-    return {"input": question, "output": answer}
-
-
-def task_move_n_pix_wrapped(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block is moved to the right by move_pix pixels with wrapping."""
-    block_size = rng.randint(1, size)
-    block_pos = rng.randint(0, size)
-
-    if solid:
-        color = rng.randint(1, 9)
-        block = [color] * block_size
-    else:
-        block = [rng.randint(1, 9) for _ in range(block_size)]
-
-    question = gen_field(size)
-    answer = gen_field(size)
-
-    for i, color in enumerate(block):
-        question[(block_pos + i) % size] = color
-        answer[(block_pos + move_pix + i) % size] = color
-
-    return {"input": question, "output": answer}
-
-
-def task_gravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where all non-zero elements are attracted to the left."""
-    density = 0.5
-    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
-
-    non_zero = [x for x in question if x != 0]
-    answer = non_zero + [0] * (size - len(non_zero))
-
-    return {"input": question, "output": answer}
-
-
-def task_gravity_counting(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where non-zero elements are counted and represented as a sequence of 1s."""
-    density = 0.5
-    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
-
-    count = sum(1 for x in question if x != 0)
-    answer = [1] * count + [0] * (size - count)
-
-    return {"input": question, "output": answer}
-
-
-def task_gravity_antigravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where color 1 moves right and color 2 moves left."""
-    density = 0.5
-    question = [rng.randint(1, 2) if rng.random() < density else 0 for _ in range(size)]
-
-    color1 = [x for x in question if x == 1]
-    color2 = [x for x in question if x == 2]
-    answer = [2] * len(color2) + [0] * (size - len(color1) - len(color2)) + [1] * len(color1)
-
-    return {"input": question, "output": answer}
-
-
-def task_block_touch_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block moves to touch (but not cover) a dot."""
-    dot_color = 1
-    block_color = rng.randint(2, 9)
-
-    block_size = rng.randint(1, size)
-    dot_pos = rng.randint(0, size)
-
-    can_place_left = dot_pos >= block_size
-    can_place_right = dot_pos + block_size < size
-
-    if not (can_place_left or can_place_right):
-        return None
-
-    if can_place_left and can_place_right:
-        side = rng.choice(["left", "right"])
-    elif can_place_left:
-        side = "left"
-    else:
-        side = "right"
-
-    if side == "left":
-        q_block_pos = rng.randint(0, dot_pos - block_size)
-        a_block_pos = dot_pos - block_size
-    else:
-        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
-        a_block_pos = dot_pos + 1
-
-    question = gen_field(size)
-    question[dot_pos] = dot_color
-    question = write_block(q_block_pos, [block_color] * block_size, question)
-
-    answer = gen_field(size)
-    answer[dot_pos] = dot_color
-    answer = write_block(a_block_pos, [block_color] * block_size, answer)
-
-    return {"input": question, "output": answer}
-
-
-def task_block_touch_dot_n_pix(rng: Random, size: int, move_pix: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block moves move_pix pixels toward a dot."""
-    dot_color = 2
-    block_color = rng.randint(3, 9)
-
-    block_size = rng.randint(1, size)
-    dot_pos = rng.randint(0, size)
-
-    can_place_left = dot_pos >= block_size
-    can_place_right = dot_pos + block_size < size
-
-    if not (can_place_left or can_place_right):
-        return None
-
-    if can_place_left and can_place_right:
-        side = rng.choice(["left", "right"])
-    elif can_place_left:
-        side = "left"
-    else:
-        side = "right"
-
-    if side == "left":
-        q_block_pos = rng.randint(0, dot_pos - block_size)
-        distance = (dot_pos - block_size) - q_block_pos
-        move = min(distance, move_pix)
-        a_block_pos = q_block_pos + move
-    else:
-        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
-        distance = q_block_pos - (dot_pos + 1)
-        move = min(distance, move_pix)
-        a_block_pos = q_block_pos - move
-
-    question = gen_field(size)
-    question[dot_pos] = dot_color
-    question = write_block(q_block_pos, [block_color] * block_size, question)
-
-    answer = gen_field(size)
-    answer[dot_pos] = dot_color
-    answer = write_block(a_block_pos, [block_color] * block_size, answer)
-
-    return {"input": question, "output": answer}
-
-
-def task_block_scale_to_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block scales to touch a dot (keeping one end fixed)."""
-    dot_color = 2
-    block_color = rng.randint(3, 9)
-
-    block_size = rng.randint(1, size)
-    dot_pos = rng.randint(0, size)
-
-    can_place_left = dot_pos >= block_size
-    can_place_right = dot_pos + block_size < size
-
-    if not (can_place_left or can_place_right):
-        return None
-
-    if can_place_left and can_place_right:
-        side = rng.choice(["left", "right"])
-    elif can_place_left:
-        side = "left"
-    else:
-        side = "right"
-
-    if side == "left":
-        q_block_pos = rng.randint(0, dot_pos - block_size)
-        new_size = dot_pos - q_block_pos + 1
-        a_block_pos = q_block_pos
-    else:
-        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
-        new_size = (q_block_pos + block_size) - dot_pos
-        a_block_pos = dot_pos
-
-    question = gen_field(size)
-    question[dot_pos] = dot_color
-    question = write_block(q_block_pos, [block_color] * block_size, question)
-
-    answer = gen_field(size)
-    answer[dot_pos] = dot_color
-    answer = write_block(a_block_pos, [block_color] * new_size, answer)
-
-    return {"input": question, "output": answer}
-
-
-def task_two_points_and_fill(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where space between two points of same color is filled with that color."""
-    color = rng.randint(1, 9)
-
-    pos1 = rng.randint(0, size - 1)
-    pos2 = rng.randint(0, size - 1)
-    if pos1 == pos2:
-        return None
-
-    pos1, pos2 = min(pos1, pos2), max(pos1, pos2)
-
-    question = gen_field(size)
-    question[pos1] = color
-    question[pos2] = color
-
-    answer = question.copy()
-    for i in range(pos1, pos2 + 1):
-        answer[i] = color
-
-    return {"input": question, "output": answer}
-
-
-def task_reflect_block_with_border_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block with a border pixel is reflected."""
-    block_size = rng.randint(2, size)
-    if block_size > size:
-        return None
-
-    c1 = rng.randint(1, 9)
-    c2 = rng.randint(1, 9)
-    if c1 == c2:
-        return None
-
-    side = "left" if rng.random() < 0.5 else "right"
-    pos = rng.randint(0, size - block_size)
-
-    block = [c1] * block_size
-    if side == "left":
-        block[0] = c2
-    else:
-        block[block_size - 1] = c2
-
-    question = write_block(pos, block, gen_field(size))
-    reversed_block = block[::-1]  # Reverse the block
-    answer = write_block(pos, reversed_block, gen_field(size))
-
-    return {"input": question, "output": answer}
-
-
-def task_reflect_block_with_border_pixel_random(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a random-colored block with a border pixel is reflected."""
-    block_size = rng.randint(2, size)
-    if block_size > size:
-        return None
-
-    side = "left" if rng.random() < 0.5 else "right"
-    pos = rng.randint(0, size - block_size)
-
-    block = [rng.randint(1, 9) for _ in range(block_size)]
-    border_color = rng.randint(1, 9)
-
-    if side == "left":
-        if block[0] == border_color:
-            return None
-        block[0] = border_color
-    else:
-        if block[block_size - 1] == border_color:
-            return None
-        block[block_size - 1] = border_color
-
-    question = write_block(pos, block, gen_field(size))
-    reversed_block = block[::-1]  # Reverse the block
-    answer = write_block(pos, reversed_block, gen_field(size))
-
-    return {"input": question, "output": answer}
-
-
-def task_reflect_block_around_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block is reflected around a dot."""
-    dot_color = 2
-
-    dot_pos = rng.randint(0, size)
-    block_size = rng.randint(1, size)
-    block_pos = rng.randint(0, size - block_size)
-    block_end = block_pos + block_size - 1
-
-    # Check if block is strictly to left or right of dot
-    strictly_left = block_end < dot_pos
-    strictly_right = block_pos > dot_pos
-
-    if not (strictly_left or strictly_right):
-        return None
-
-    block_color = rng.randint(3, 9)  # Different from dot color
-    block = [block_color] * block_size
-
-    # Calculate reflection bounds
-    min_reflect = 2 * dot_pos - block_end
-    max_reflect = 2 * dot_pos - block_pos
-    if min_reflect < 0 or max_reflect >= size:
-        return None
-
-    question = gen_field(size)
-    question = write_block(block_pos, block, question)
-    question[dot_pos] = dot_color
-
-    answer = gen_field(size)
-    answer[dot_pos] = dot_color
-    for i in range(block_size):
-        reflect_idx = 2 * dot_pos - (block_pos + i)
-        answer[reflect_idx] = block[i]
-
-    return {"input": question, "output": answer}
-
-
-def task_block_and_noise_remove(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where noise around a block needs to be removed."""
-    block_size = rng.randint(2, size)
-    if block_size > size:
-        return None
-
-    block_pos = rng.randint(0, size - block_size)
-    color = rng.randint(1, 9)
-
-    # Create field with block
-    field = gen_field(size)
-    for i in range(block_size):
-        field[block_pos + i] = color
-
-    # Track forbidden positions for noise
-    forbidden = [False] * size
-    for i in range(block_pos, block_pos + block_size):
-        forbidden[i] = True
-    if block_pos > 0:
-        forbidden[block_pos - 1] = True
-    if block_pos + block_size < size:
-        forbidden[block_pos + block_size] = True
-
-    # Add noise
-    noise_count = rng.randint(1, 3)
-    noise_positions = []
-
-    for _ in range(noise_count):
-        allowed = [i for i in range(size) if not forbidden[i]]
-        if not allowed:
-            break
-        noise_pos = rng.choice(allowed)
-        noise_positions.append(noise_pos)
-        field[noise_pos] = color
-        forbidden[noise_pos] = True
-        if noise_pos > 0:
-            forbidden[noise_pos - 1] = True
-        if noise_pos + 1 < size:
-            forbidden[noise_pos + 1] = True
-
-    if len(noise_positions) < noise_count:
-        return None
-
-    question = field
-    answer = field.copy()
-    for pos in noise_positions:
-        answer[pos] = 0
-
-    return {"input": question, "output": answer}
-
-
-def task_block_and_noise_remove_inside(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where noise inside a block needs to be removed."""
-    if size <= 6:
-        return None
-
-    block_size = rng.randint(6, size)
-    if block_size > size:
-        return None
-
-    block_pos = rng.randint(0, size - block_size)
-    color = rng.randint(1, 9)
-
-    # Create field with block
-    field = gen_field(size)
-    for i in range(block_size):
-        field[block_pos + i] = color
-
-    # Add noise inside block
-    max_noise = max(1, (block_size // 2) - 1)
-    noise_count = rng.randint(1, max_noise)
-
-    positions = list(range(block_size))
-    rng.shuffle(positions)
-    noise_positions = positions[:noise_count]
-
-    for offset in noise_positions:
-        pos = block_pos + offset
-        noise_color = rng.randint(1, 9)
-        while noise_color == color:
-            noise_color = rng.randint(1, 9)
-        field[pos] = noise_color
-
-    question = field
-    answer = field.copy()
-    for offset in noise_positions:
-        answer[block_pos + offset] = color
-
-    return {"input": question, "output": answer}
-
-
-def task_copy_block_to_dots(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block pattern is copied to dot positions."""
-    block_size = 3 if rng.random() < 0.5 else 5
-    if block_size >= size:
-        return None
-
-    color = rng.randint(1, 9)
-    block = [color] * block_size
-
-    # Generate dots with minimum distance to prevent overlap
-    min_gap = block_size
-    dot_positions = []
-    pos = block_size + block_size // 2 + 1
-
-    while pos <= size - block_size:
-        if rng.random() < 0.5:  # Control dot density
-            dot_positions.append(pos)
-            pos += min_gap
-        pos += 1
-
-    if not dot_positions:
-        return None
-
-    question = gen_field(size)
-    question = write_block(0, block, question)
-    for pos in dot_positions:
-        question[pos] = color
-
-    answer = gen_field(size)
-    answer = write_block(0, block, answer)
-    for pos in dot_positions:
-        block_start = pos - block_size // 2
-        answer = write_block(block_start, block, answer)
-
-    return {"input": question, "output": answer}
-
-
-def task_copy_block_to_dots_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block pattern is copied to dot positions with matching colors."""
-    block_size = 3 if rng.random() < 0.5 else 5
-    if block_size >= size:
-        return None
-
-    block_color = rng.randint(1, 9)
-    block = [block_color] * block_size
-
-    # Generate dots with minimum distance to prevent overlap
-    min_gap = block_size
-    dot_positions = []
-    dot_colors = []
-    pos = block_size + block_size // 2 + 1
-
-    while pos < size - block_size:
-        if rng.random() < 0.5:
-            dot_color = rng.randint(1, 9)
-            dot_positions.append(pos)
-            dot_colors.append(dot_color)
-            pos += min_gap
-        pos += 1
-
-    if not dot_positions:
-        return None
-
-    question = gen_field(size)
-    question = write_block(0, block, question)
-    for i, pos in enumerate(dot_positions):
-        question[pos] = dot_colors[i]
-
-    answer = gen_field(size)
-    answer = write_block(0, block, answer)
-    for i, pos in enumerate(dot_positions):
-        block_start = pos - block_size // 2
-        colored_block = [dot_colors[i]] * block_size
-        answer = write_block(block_start, colored_block, answer)
-
-    return {"input": question, "output": answer}
-
-
-def task_paint_biggest_block(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where the largest block is painted a different color."""
-    target_color = 1
-    initial_color = rng.randint(2, 9)
-
-    # Generate random blocks
-    question = gen_field(size)
-    blocks = []
-    pos = 0
-
-    while pos < size:
-        if rng.random() < 0.4 and size - pos >= 2:
-            block_size = rng.randint(2, min(size - pos, 6))
-            blocks.append((pos, block_size))
-            for i in range(block_size):
-                question[pos + i] = initial_color
-            pos += block_size + 1
-        else:
-            pos += 1
-
-    if len(blocks) < 2:
-        return None
-
-    # Find biggest block
-    biggest_pos, biggest_size = max(blocks, key=lambda x: x[1])
-
-    # Check if there are multiple blocks of the same size
-    biggest_count = sum(1 for _, size in blocks if size == biggest_size)
-    if biggest_count > 1:
-        return None
-
-    answer = question.copy()
-    for i in range(biggest_size):
-        answer[biggest_pos + i] = target_color
-
-    return {"input": question, "output": answer}
-
-
-def task_sort_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where blocks are sorted by size with 1 pixel gaps."""
-    color = rng.randint(1, 9)
-    blocks = []
-    pos = 0
-
-    # Generate random blocks with random sizes
-    while pos < size:
-        if rng.random() < 0.4 and size - pos >= 2:
-            block_size = rng.randint(1, min(size - pos, 6))
-            blocks.append((pos, block_size))
-            pos += block_size + rng.randint(1, 4)  # Random gaps
-        else:
-            pos += 1
-
-    if len(blocks) < 2:
-        return None
-
-    # Create input field
-    question = gen_field(size)
-    for pos, block_size in blocks:
-        for i in range(block_size):
-            question[pos + i] = color
-
-    # Sort blocks by size
-    blocks.sort(key=lambda x: x[1])
-
-    # Check if sorted blocks fit with gaps
-    total_space = sum(size for _, size in blocks) + len(blocks) - 1
-    if total_space > size:
-        return None
-
-    # Create answer field with sorted blocks
-    answer = gen_field(size)
-    current_pos = 0
-
-    for _, block_size in blocks:
-        for i in range(block_size):
-            answer[current_pos + i] = color
-        current_pos += block_size + 1  # One pixel gap
-
-    return {"input": question, "output": answer}
-
-
-def task_sort_complete_sequence(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a complete sequence of block sizes is sorted."""
-    # Calculate max possible block size given total array size
-    max_size = 1
-    total_space = 0
-    while total_space + max_size + 1 <= size:
-        total_space += max_size + 1
-        max_size += 1
-    max_size -= 1
-
-    if max_size < 2:
-        return None
-
-    color = rng.randint(1, 9)
-
-    # Create sequence of all sizes from 1 to max_size
-    blocks = list(range(1, max_size + 1))
-    rng.shuffle(blocks)
-
-    # Create input field with shuffled blocks
-    question = gen_field(size)
-    pos = 0
-    for block_size in blocks:
-        for i in range(block_size):
-            question[pos + i] = color
-        pos += block_size + 1
-
-    # Create answer field with sorted blocks
-    answer = gen_field(size)
-    pos = 0
-    for block_size in range(1, max_size + 1):
-        for i in range(block_size):
-            answer[pos + i] = color
-        pos += block_size + 1
-
-    return {"input": question, "output": answer}
-
-
-def task_recolor_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where two blocks are recolored based on their size."""
-    # Generate two different random sizes
-    size1 = rng.randint(2, 8)
-    size2 = rng.randint(2, 8)
-    while size2 == size1:
-        size2 = rng.randint(2, 8)
-
-    # Ensure both blocks fit with at least 1 gap
-    if size1 + size2 + 1 > size:
-        return None
-
-    # Place blocks with gap
-    pos1 = rng.randint(0, size - (size1 + size2 + 1))
-    pos2 = rng.randint(pos1 + size1 + 1, size - size2)
-
-    # Create input field with both blocks color 3
-    question = gen_field(size)
-    for i in range(size1):
-        question[pos1 + i] = 3
-    for i in range(size2):
-        question[pos2 + i] = 3
-
-    # Create answer field with recolored blocks
-    answer = question.copy()
-    if size1 > size2:
-        for i in range(size1):
-            answer[pos1 + i] = 1
-        for i in range(size2):
-            answer[pos2 + i] = 2
-    else:
-        for i in range(size1):
-            answer[pos1 + i] = 2
-        for i in range(size2):
-            answer[pos2 + i] = 1
-
-    return {"input": question, "output": answer}
-
-
-def task_gravity_one_step(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where non-zero elements move one step left if possible."""
-    question = [rng.randint(1, 9) if rng.random() < 0.5 else 0 for _ in range(size)]
-    answer = question.copy()
-
-    # Move each non-zero pixel one step left if possible
-    for i in range(1, size):
-        if answer[i] != 0 and answer[i - 1] == 0:
-            answer[i - 1] = answer[i]
-            answer[i] = 0
-
-    return {"input": question, "output": answer}
-
-
-def task_move_block_by_own_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block moves right by its own size."""
-    block_size = rng.randint(1, size // 2)  # Ensure space for movement
-    pos = rng.randint(0, size - block_size * 2)  # Space for block and movement
-    color = rng.randint(1, 9)
-
-    question = gen_field(size)
-    block = [color] * block_size
-    question = write_block(pos, block, question)
-
-    answer = write_block(pos + block_size, block, gen_field(size))
-
-    return {"input": question, "output": answer}
-
-
-def task_change_to_five(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where all non-zero colors change to 5."""
-    density = 0.5
-    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
-    answer = [5 if x != 0 else 0 for x in question]
-
-    return {"input": question, "output": answer}
-
-
-def task_recolor_blocks_from_palette(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where blocks are recolored using a color palette."""
-    # Generate blocks of same size
-    block_size = rng.randint(2, 4)
-    blocks = []
-    pos = 0
-
-    while pos + block_size <= size:
-        if rng.random() < 0.4:
-            blocks.append(pos)
-            pos += block_size + 1
-        else:
-            pos += 1
-
-    # Ensure we have space for palette
-    while blocks and blocks[-1] + block_size + len(blocks) + 1 >= size:
-        blocks.pop()
-
-    if not blocks:
-        return None
-
-    # Shift blocks right to make room for palette
-    palette_size = len(blocks)
-    blocks = [pos + palette_size + 1 for pos in blocks]
-
-    # Generate color palette
-    colors = []
-    for _ in range(len(blocks)):
-        while True:
-            color = rng.randint(1, 9)
-            if color not in colors:
-                colors.append(color)
-                break
-
-    # Create question with color palette and blocks
-    question = gen_field(size)
-
-    # Place color palette at start
-    for i, color in enumerate(colors):
-        question[i] = color
-
-    # Place blocks of color 5
-    for block_pos in blocks:
-        for i in range(block_size):
-            question[block_pos + i] = 5
-
-    # Create answer with recolored blocks
-    answer = question.copy()
-    for block_idx, block_pos in enumerate(blocks):
-        color = colors[block_idx]
-        for i in range(block_size):
-            answer[block_pos + i] = color
-
-    return {"input": question, "output": answer}
-
-
-def task_duplicate_block_from_seeds(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a block is duplicated from seed pixels."""
-    block_size = rng.randint(2, 4)
-    if block_size + 1 >= size:
-        return None
-    if size <= 3 + block_size:
-        return None
-
-    # Position block with space for seeds
-    block_pos = rng.randint(2, size - block_size - 1)
-
-    # Decide seed placement
-    left_seed = rng.random() < 0.5
-    right_seed = rng.random() < 0.5
-    if not (left_seed or right_seed):
-        return None
-
-    # Create input
-    question = gen_field(size)
-
-    # Place main block
-    for i in range(block_size):
-        question[block_pos + i] = 1
-
-    # Place seeds with gaps
-    seeds = []
-    if left_seed:
-        color = rng.randint(1, 9)
-        question[block_pos - 2] = color
-        seeds.append(("left", block_pos - 2, color))
-    if right_seed:
-        color = rng.randint(1, 9)
-        question[block_pos + block_size + 1] = color
-        seeds.append(("right", block_pos + block_size + 1, color))
-
-    # Create answer with duplicated blocks
-    answer = question.copy()
-
-    for side, seed_pos, color in seeds:
-        if side == "left":
-            # For left seed, blocks end at seed
-            end_pos = seed_pos
-            while end_pos >= 0:
-                start_pos = end_pos - block_size + 1
-                for pos in range(max(0, start_pos), end_pos + 1):
-                    answer[pos] = color
-                if start_pos < 1:
-                    break
-                end_pos = start_pos - 2  # -1 for gap
-        else:  # side == "right"
-            # For right seed, blocks start at seed
-            start_pos = seed_pos
-            while start_pos < size:
-                for offset in range(min(block_size, size - start_pos)):
-                    answer[start_pos + offset] = color
-                if start_pos + block_size + 1 >= size:
-                    break
-                start_pos = start_pos + block_size + 1  # +1 for gap
-
-    return {"input": question, "output": answer}
-
-
-def task_fill_from_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a pixel fills in one direction until hitting another pixel."""
-    block_size = rng.randint(3, 6)
-    if block_size >= size - 2:
-        return None
-
-    # Position block with space for seed
-    block_pos = rng.randint(1, size - block_size - 1)
-
-    # Create input
-    question = gen_field(size)
-
-    # Place main block
-    block_color = rng.randint(1, 9)
-    for i in range(block_size):
-        question[block_pos + i] = block_color
-
-    # Place seed pixel and determine fill direction
-    seed_color = rng.randint(1, 9)
-    while seed_color == block_color:
-        seed_color = rng.randint(1, 9)
-
-    is_left = rng.random() < 0.5
-
-    if is_left:
-        question[block_pos - 1] = seed_color
-    else:
-        question[block_pos + block_size] = seed_color
-
-    # Create answer with fill
-    answer = question.copy()
-
-    if is_left:
-        # Fill from seed to left border
-        for i in range(block_pos):
-            answer[i] = seed_color
-    else:
-        # Fill from seed to right border
-        for i in range(block_pos + block_size, size):
-            answer[i] = seed_color
-
-    return {"input": question, "output": answer}
-
-
-def task_mark_size_two_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where size-2 blocks are marked with surrounding pixels."""
-    blocks = []
-    pos = 0
-
-    # Generate blocks with minimum gap of 2
-    while pos < size:
-        if rng.random() < 0.4:
-            block_size = rng.randint(1, 3)
-            # Check if we have space for block and potential markers
-            needed_space = block_size + (2 if block_size == 2 else 0)
-            if pos + needed_space < size:
-                blocks.append((pos, block_size))
-                pos += block_size + 2  # Minimum gap of 2
-
-        pos += 1
-
-    if len(blocks) < 2:
-        return None
-
-    # Verify gaps between blocks (including markers)
-    valid = True
-    for i in range(len(blocks) - 1):
-        pos1, size1 = blocks[i]
-        pos2, _ = blocks[i + 1]
-        needed_gap = 3 if size1 == 2 else 2
-        if pos2 - (pos1 + size1) < needed_gap:
-            valid = False
-            break
-    if not valid:
-        return None
-
-    # Create input with blocks
-    question = gen_field(size)
-    for pos, block_size in blocks:
-        # Place block
-        for i in range(block_size):
-            question[pos + i] = 1
-
-    # Create answer with markers
-    answer = question.copy()
-    for pos, block_size in blocks:
-        if block_size == 2:
-            # Add markers for size 2 blocks
-            if pos > 0:
-                answer[pos - 1] = 3
-            if pos + block_size < size:
-                answer[pos + block_size] = 3
-
-    return {"input": question, "output": answer}
-
-
-def task_fill_until_collision(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where pixels fill empty space until collision."""
-    # At least 4 positions for meaningful puzzle
-    if size < 4:
-        return None
-
-    is_left = rng.random() < 0.5
-    question = gen_field(size)
-
-    # Place the side marker
-    if is_left:
-        question[0] = 5
-    else:
-        question[size - 1] = 5
-
-    # Place 2-4 random pixels
-    num_pixels = rng.randint(2, 4)
-    positions = []
-
-    if is_left:
-        # Skip first position
-        for _ in range(num_pixels):
-            while True:
-                pos = rng.randint(1, size - 1)
-                if pos not in positions:
-                    positions.append(pos)
-                    break
-    else:
-        # Skip last position
-        for _ in range(num_pixels):
-            while True:
-                pos = rng.randint(0, size - 2)
-                if pos not in positions:
-                    positions.append(pos)
-                    break
-
-    # Color random pixels
-    for pos in positions:
-        question[pos] = rng.randint(1, 9)
-
-    positions.sort()
-
-    # Create answer
-    answer = question.copy()
-
-    if is_left:
-        # Fill right from each pixel
-        prev_pos = 0  # Start from marker
-        for pos in positions:
-            color = question[pos]
-            # Fill from previous position to current
-            for i in range(prev_pos + 1, pos):
-                answer[i] = color
-            prev_pos = pos
-    else:
-        # Fill left from each pixel
-        prev_pos = size - 1  # Start from marker
-        for pos in reversed(positions):
-            color = question[pos]
-            # Fill from current position to previous
-            for i in range(pos + 1, prev_pos):
-                answer[i] = color
-            prev_pos = pos
-
-    return {"input": question, "output": answer}
-
-
-def task_repeat_pattern_full(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where a pattern is repeated to fill the space."""
-    # Generate initial pattern
-    pattern_size = rng.randint(2, 5)
-    pattern = [rng.randint(1, 9) for _ in range(pattern_size)]
-
-    # Calculate total size needed for 2 repetitions
-    double_size = pattern_size * 2
-    if double_size >= size:
-        return None
-
-    # Create input with 2 repetitions
-    question = gen_field(size)
-    for i in range(pattern_size):
-        question[i] = pattern[i]
-        question[i + pattern_size] = pattern[i]
-
-    # Create answer with maximum repetitions
-    answer = gen_field(size)
-    pos = 0
-    while pos + pattern_size <= size:
-        for i in range(pattern_size):
-            answer[pos + i] = pattern[i]
-        pos += pattern_size
-
-    # Fill remaining space (if any) with pattern elements
-    for i in range(pos, size):
-        answer[i] = pattern[i - pos]
-
-    return {"input": question, "output": answer}
-
-
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple
-
 @dataclass
 class Arc1DConfig:
     """Configuration for ARC 1D task generation"""
+
     min_size: int = 10  # Minimum grid size
-    max_size: int = 30  # Maximum grid size 
+    max_size: int = 30  # Maximum grid size
     num_train: int = 3  # Number of training examples
     seed: Optional[int] = None
     size: int = 500
@@ -1031,139 +24,14 @@ class Arc1DConfig:
         assert self.size > 0, "size must be positive"
 
 
-def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where color 2 is heavier than color 1 in gravity."""
-    # Generate random field with only colors 1 and 2
-    question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
-
-    # Count colors
-    count_1 = sum(1 for x in question if x == 1)
-    count_2 = sum(1 for x in question if x == 2)
-
-    # Create answer with sorted colors
-    answer = gen_field(size)
-
-    # Place heavier color 2 first
-    for i in range(count_2):
-        answer[i] = 2
-
-    # Then place color 1
-    for i in range(count_1):
-        answer[count_2 + i] = 1
-
-    return {"input": question, "output": answer}
-
-
-def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
-    """Generate a task where left half of blocks are colored differently."""
-    pos = 0
-    question = gen_field(size)
-    blocks = []
-
-    # Generate blocks with gap 1
-    while pos < size:
-        if rng.random() < 0.4:
-            block_size = rng.randint(2, 8)
-            if pos + block_size >= size:
-                break
-
-            blocks.append((pos, block_size))
-            for i in range(block_size):
-                question[pos + i] = 2
-            pos += block_size + 1  # block size + gap
-        else:
-            pos += 1
-
-    if len(blocks) < 2:
-        return None
-
-    # Create answer with half-colored blocks
-    answer = question.copy()
-    for pos, block_size in blocks:
-        half_size = block_size // 2
-        for i in range(half_size):
-            answer[pos + i] = 8
-
-    return {"input": question, "output": answer}
-
-
-# Table of all ARC 1D task functions with their parameters
-ARC_1D_TASKS = {
-    # Move tasks
-    "move_1pix_solid": (task_move_n_pix, {"move_pix": 1, "solid": True}),
-    "move_2pix_solid": (task_move_n_pix, {"move_pix": 2, "solid": True}),
-    "move_3pix_solid": (task_move_n_pix, {"move_pix": 3, "solid": True}),
-    "move_4pix_solid": (task_move_n_pix, {"move_pix": 4, "solid": True}),
-    "move_1pix_colorful": (task_move_n_pix, {"move_pix": 1, "solid": False}),
-    "move_2pix_colorful": (task_move_n_pix, {"move_pix": 2, "solid": False}),
-    "move_3pix_colorful": (task_move_n_pix, {"move_pix": 3, "solid": False}),
-    "move_4pix_colorful": (task_move_n_pix, {"move_pix": 4, "solid": False}),
-    
-    # Move wrapped tasks
-    "move_1pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": True}),
-    "move_2pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
-    "move_3pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": True}),
-    "move_4pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": True}),
-    "move_1pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": False}),
-    "move_2pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": False}),
-    "move_3pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": False}),
-    "move_4pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": False}),
-
-    # Gravity tasks
-    "gravity": (task_gravity, {}),
-    "gravity_counting": (task_gravity_counting, {}),
-    "gravity_antigravity": (task_gravity_antigravity, {}),
-    "gravity_one_step": (task_gravity_one_step, {}),
-    "gravity_weighted_colors": (task_gravity_weighted_colors, {}),
-
-    # Block tasks
-    "block_touch_dot": (task_block_touch_dot, {}),
-    "block_touch_dot_1pix": (task_block_touch_dot_n_pix, {"move_pix": 1}),
-    "block_touch_dot_2pix": (task_block_touch_dot_n_pix, {"move_pix": 2}),
-    "block_touch_dot_3pix": (task_block_touch_dot_n_pix, {"move_pix": 3}),
-    "block_touch_dot_4pix": (task_block_touch_dot_n_pix, {"move_pix": 4}),
-    "block_scale_to_dot": (task_block_scale_to_dot, {}),
-    "block_and_noise_remove": (task_block_and_noise_remove, {}),
-    "block_and_noise_remove_inside": (task_block_and_noise_remove_inside, {}),
-    "move_block_by_own_size": (task_move_block_by_own_size, {}),
-
-    # Pattern tasks
-    "two_points_and_fill": (task_two_points_and_fill, {}),
-    "copy_block_to_dots": (task_copy_block_to_dots, {}),
-    "copy_block_to_dots_colors": (task_copy_block_to_dots_colors, {}),
-    "repeat_pattern_full": (task_repeat_pattern_full, {}),
-
-    # Reflection tasks
-    "reflect_block_with_border_pixel": (task_reflect_block_with_border_pixel, {}),
-    "reflect_block_random": (task_reflect_block_with_border_pixel_random, {}),
-    "reflect_block_around_dot": (task_reflect_block_around_dot, {}),
-
-    # Color tasks
-    "paint_biggest_block": (task_paint_biggest_block, {}),
-    "recolor_blocks_by_size": (task_recolor_blocks_by_size, {}),
-    "change_to_five": (task_change_to_five, {}),
-    "recolor_blocks_from_palette": (task_recolor_blocks_from_palette, {}),
-    "color_left_half_blocks": (task_color_left_half_blocks, {}),
-
-    # Sorting tasks
-    "sort_blocks_by_size": (task_sort_blocks_by_size, {}),
-    "sort_complete_sequence": (task_sort_complete_sequence, {}),
-
-    # Fill tasks
-    "duplicate_block_from_seeds": (task_duplicate_block_from_seeds, {}),
-    "fill_from_pixel": (task_fill_from_pixel, {}),
-    "fill_until_collision": (task_fill_until_collision, {}),
-    
-    # Marking tasks
-    "mark_size_two_blocks": (task_mark_size_two_blocks, {}),
-}
-
-
 class Arc1DDataset(ProceduralDataset):
     """Generates ARC 1D tasks by randomly selecting from available task generators"""
 
     def __init__(self, config: Arc1DConfig):
+        from .arc_1d_tasks import ARC_1D_TASKS
+
         super().__init__(config=config, seed=config.seed, size=config.size)
+        self.ARC_1D_TASKS = ARC_1D_TASKS
         self.task_names = list(ARC_1D_TASKS.keys())
 
     def __getitem__(self, idx: int) -> dict:
@@ -1179,37 +47,37 @@ class Arc1DDataset(ProceduralDataset):
                 - metadata: dict with generation parameters
         """
         # Create deterministic RNG from base seed and idx
-        item_rng = random.Random(self.seed + idx)
-        
+        item_rng = Random(self.seed + idx)
+
         # Select random task
         task_name = item_rng.choice(self.task_names)
-        task_func, task_kwargs = ARC_1D_TASKS[task_name]
-        
+        task_func, task_kwargs = self.ARC_1D_TASKS[task_name]
+
         # Generate training examples
         train_examples = []
         size = item_rng.randint(self.config.min_size, self.config.max_size)
-        
+
         for _ in range(self.config.num_train):
             example = None
             while example is None:
                 example = task_func(item_rng, size, **task_kwargs)
-        
+
             train_examples.append(example)
-            
+
         # Generate test example
         test_example = None
         while test_example is None:
             test_example = task_func(item_rng, size, **task_kwargs)
-            
+
         # Format question
         question = "Find the common rule that maps an input grid to an output grid, given the examples below.\n\n"
-        
+
         # Add training examples
         for i, example in enumerate(train_examples, 1):
             question += f"Example {i}:\n"
             question += "Input:  " + " ".join(str(x) for x in example["input"]) + "\n"
             question += "Output: " + " ".join(str(x) for x in example["output"]) + "\n\n"
-            
+
         # Add test input
         question += "Below is a test input grid. Predict the corresponding output grid by applying the rule you found. "
         question += "Describe how you derived the rule and your overall reasoning process in detail before you submit your answer. "
@@ -1229,33 +97,5 @@ class Arc1DDataset(ProceduralDataset):
         }
 
 
-
-def task_mirror(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
-    """Mirror the input and output arrays of a task result."""
-    if task_result is None:
-        return None
-    return {
-        "input": list(reversed(task_result["input"])),
-        "output": list(reversed(task_result["output"]))
-    }
-
-
-def task_inverse(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
-    """Swap the input and output arrays of a task result."""
-    if task_result is None:
-        return None
-    return {
-        "input": task_result["output"],
-        "output": task_result["input"]
-    }
-
-
-def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
-    """Return the task result unchanged."""
-    return task_result
-
-
 # Register the dataset
 register_dataset("arc_1d", Arc1DDataset, Arc1DConfig)
-
-
diff --git a/reasoning_gym/cognition/arc_1d_tasks.py b/reasoning_gym/cognition/arc_1d_tasks.py
new file mode 100644
index 00000000..9e1334e6
--- /dev/null
+++ b/reasoning_gym/cognition/arc_1d_tasks.py
@@ -0,0 +1,1145 @@
+from random import Random
+from typing import Dict, List, Optional
+
+
+def gen_field(size: int, color: int = 0) -> List[int]:
+    """Generate a field of given size filled with specified color (default 0)."""
+    return [color] * size
+
+
+def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
+    """Write a block into a field at given position."""
+    result = field.copy()
+    for i, color in enumerate(block):
+        result[pos + i] = color
+    return result
+
+
+def task_move_n_pix(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block is moved to the right by move_pix pixels."""
+    if size <= move_pix + 1:
+        return None
+
+    block_size = rng.randint(1, size - move_pix - 1)
+    block_pos = rng.randint(0, size - block_size - move_pix)
+
+    if solid:
+        color = rng.randint(1, 9)
+        block = [color] * block_size
+    else:
+        block = [rng.randint(1, 9) for _ in range(block_size)]
+
+    question = write_block(block_pos, block, gen_field(size))
+    answer = write_block(block_pos + move_pix, block, gen_field(size))
+
+    return {"input": question, "output": answer}
+
+
+def task_move_n_pix_wrapped(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block is moved to the right by move_pix pixels with wrapping."""
+    block_size = rng.randint(1, size)
+    block_pos = rng.randint(0, size)
+
+    if solid:
+        color = rng.randint(1, 9)
+        block = [color] * block_size
+    else:
+        block = [rng.randint(1, 9) for _ in range(block_size)]
+
+    question = gen_field(size)
+    answer = gen_field(size)
+
+    for i, color in enumerate(block):
+        question[(block_pos + i) % size] = color
+        answer[(block_pos + move_pix + i) % size] = color
+
+    return {"input": question, "output": answer}
+
+
+def task_gravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where all non-zero elements are attracted to the left."""
+    density = 0.5
+    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
+
+    non_zero = [x for x in question if x != 0]
+    answer = non_zero + [0] * (size - len(non_zero))
+
+    return {"input": question, "output": answer}
+
+
+def task_gravity_counting(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where non-zero elements are counted and represented as a sequence of 1s."""
+    density = 0.5
+    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
+
+    count = sum(1 for x in question if x != 0)
+    answer = [1] * count + [0] * (size - count)
+
+    return {"input": question, "output": answer}
+
+
+def task_gravity_antigravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where color 1 moves right and color 2 moves left."""
+    density = 0.5
+    question = [rng.randint(1, 2) if rng.random() < density else 0 for _ in range(size)]
+
+    color1 = [x for x in question if x == 1]
+    color2 = [x for x in question if x == 2]
+    answer = [2] * len(color2) + [0] * (size - len(color1) - len(color2)) + [1] * len(color1)
+
+    return {"input": question, "output": answer}
+
+
+def task_block_touch_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block slides to touch (but not cover) a dot; None if it fits on neither side."""
+    dot_color = 1
+    block_color = rng.randint(2, 9)
+
+    block_size = rng.randint(1, size)
+    dot_pos = rng.randint(0, size - 1)  # randint is inclusive: size itself would index past the field
+
+    can_place_left = dot_pos >= block_size  # whole block fits in [0, dot_pos)
+    can_place_right = dot_pos + block_size < size  # whole block fits in (dot_pos, size)
+
+    if not (can_place_left or can_place_right):
+        return None
+
+    if can_place_left and can_place_right:
+        side = rng.choice(["left", "right"])
+    elif can_place_left:
+        side = "left"
+    else:
+        side = "right"
+
+    if side == "left":
+        q_block_pos = rng.randint(0, dot_pos - block_size)
+        a_block_pos = dot_pos - block_size  # block's last cell lands directly left of the dot
+    else:
+        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
+        a_block_pos = dot_pos + 1  # block's first cell lands directly right of the dot
+
+    question = gen_field(size)
+    question[dot_pos] = dot_color
+    question = write_block(q_block_pos, [block_color] * block_size, question)
+
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    answer = write_block(a_block_pos, [block_color] * block_size, answer)
+
+    return {"input": question, "output": answer}
+
+
+def task_block_touch_dot_n_pix(rng: Random, size: int, move_pix: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block moves up to move_pix pixels toward a dot; None if it fits on neither side."""
+    dot_color = 2
+    block_color = rng.randint(3, 9)
+
+    block_size = rng.randint(1, size)
+    dot_pos = rng.randint(0, size - 1)  # randint is inclusive: size itself would index past the field
+
+    can_place_left = dot_pos >= block_size  # whole block fits in [0, dot_pos)
+    can_place_right = dot_pos + block_size < size  # whole block fits in (dot_pos, size)
+
+    if not (can_place_left or can_place_right):
+        return None
+
+    if can_place_left and can_place_right:
+        side = rng.choice(["left", "right"])
+    elif can_place_left:
+        side = "left"
+    else:
+        side = "right"
+
+    if side == "left":
+        q_block_pos = rng.randint(0, dot_pos - block_size)
+        distance = (dot_pos - block_size) - q_block_pos  # empty cells between block and dot
+        move = min(distance, move_pix)  # stop once the block is adjacent to the dot
+        a_block_pos = q_block_pos + move
+    else:
+        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
+        distance = q_block_pos - (dot_pos + 1)  # empty cells between dot and block
+        move = min(distance, move_pix)  # stop once the block is adjacent to the dot
+        a_block_pos = q_block_pos - move
+
+    question = gen_field(size)
+    question[dot_pos] = dot_color
+    question = write_block(q_block_pos, [block_color] * block_size, question)
+
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    answer = write_block(a_block_pos, [block_color] * block_size, answer)
+
+    return {"input": question, "output": answer}
+
+
+def task_block_scale_to_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block grows toward a dot with its far end fixed; None if it fits on neither side."""
+    dot_color = 2
+    block_color = rng.randint(3, 9)
+
+    block_size = rng.randint(1, size)
+    dot_pos = rng.randint(0, size - 1)  # randint is inclusive: size itself would index past the field
+
+    can_place_left = dot_pos >= block_size  # whole block fits in [0, dot_pos)
+    can_place_right = dot_pos + block_size < size  # whole block fits in (dot_pos, size)
+
+    if not (can_place_left or can_place_right):
+        return None
+
+    if can_place_left and can_place_right:
+        side = rng.choice(["left", "right"])
+    elif can_place_left:
+        side = "left"
+    else:
+        side = "right"
+
+    if side == "left":
+        q_block_pos = rng.randint(0, dot_pos - block_size)
+        new_size = dot_pos - q_block_pos + 1  # spans q_block_pos..dot_pos inclusive
+        a_block_pos = q_block_pos
+    else:
+        q_block_pos = rng.randint(dot_pos + 1, size - block_size)
+        new_size = (q_block_pos + block_size) - dot_pos  # spans dot_pos..original right edge inclusive
+        a_block_pos = dot_pos
+
+    question = gen_field(size)
+    question[dot_pos] = dot_color
+    question = write_block(q_block_pos, [block_color] * block_size, question)
+
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    answer = write_block(a_block_pos, [block_color] * new_size, answer)  # NOTE(review): overwrites the dot cell; confirm intended
+
+    return {"input": question, "output": answer}
+
+
+def task_two_points_and_fill(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where space between two points of same color is filled with that color."""
+    color = rng.randint(1, 9)
+
+    pos1 = rng.randint(0, size - 1)
+    pos2 = rng.randint(0, size - 1)
+    if pos1 == pos2:
+        return None
+
+    pos1, pos2 = min(pos1, pos2), max(pos1, pos2)
+
+    question = gen_field(size)
+    question[pos1] = color
+    question[pos2] = color
+
+    answer = question.copy()
+    for i in range(pos1, pos2 + 1):
+        answer[i] = color
+
+    return {"input": question, "output": answer}
+
+
+def task_reflect_block_with_border_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block with a border pixel is reflected."""
+    block_size = rng.randint(2, size)
+    if block_size > size:
+        return None
+
+    c1 = rng.randint(1, 9)
+    c2 = rng.randint(1, 9)
+    if c1 == c2:
+        return None
+
+    side = "left" if rng.random() < 0.5 else "right"
+    pos = rng.randint(0, size - block_size)
+
+    block = [c1] * block_size
+    if side == "left":
+        block[0] = c2
+    else:
+        block[block_size - 1] = c2
+
+    question = write_block(pos, block, gen_field(size))
+    reversed_block = block[::-1]  # Reverse the block
+    answer = write_block(pos, reversed_block, gen_field(size))
+
+    return {"input": question, "output": answer}
+
+
+def task_reflect_block_with_border_pixel_random(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a random-colored block with a border pixel is reflected."""
+    block_size = rng.randint(2, size)
+    if block_size > size:
+        return None
+
+    side = "left" if rng.random() < 0.5 else "right"
+    pos = rng.randint(0, size - block_size)
+
+    block = [rng.randint(1, 9) for _ in range(block_size)]
+    border_color = rng.randint(1, 9)
+
+    if side == "left":
+        if block[0] == border_color:
+            return None
+        block[0] = border_color
+    else:
+        if block[block_size - 1] == border_color:
+            return None
+        block[block_size - 1] = border_color
+
+    question = write_block(pos, block, gen_field(size))
+    reversed_block = block[::-1]  # Reverse the block
+    answer = write_block(pos, reversed_block, gen_field(size))
+
+    return {"input": question, "output": answer}
+
+
+def task_reflect_block_around_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block is reflected around a dot."""
+    dot_color = 2
+
+    dot_pos = rng.randint(0, size)
+    block_size = rng.randint(1, size)
+    block_pos = rng.randint(0, size - block_size)
+    block_end = block_pos + block_size - 1
+
+    # Check if block is strictly to left or right of dot
+    strictly_left = block_end < dot_pos
+    strictly_right = block_pos > dot_pos
+
+    if not (strictly_left or strictly_right):
+        return None
+
+    block_color = rng.randint(3, 9)  # Different from dot color
+    block = [block_color] * block_size
+
+    # Calculate reflection bounds
+    min_reflect = 2 * dot_pos - block_end
+    max_reflect = 2 * dot_pos - block_pos
+    if min_reflect < 0 or max_reflect >= size:
+        return None
+
+    question = gen_field(size)
+    question = write_block(block_pos, block, question)
+    question[dot_pos] = dot_color
+
+    answer = gen_field(size)
+    answer[dot_pos] = dot_color
+    for i in range(block_size):
+        reflect_idx = 2 * dot_pos - (block_pos + i)
+        answer[reflect_idx] = block[i]
+
+    return {"input": question, "output": answer}
+
+
+def task_block_and_noise_remove(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where noise around a block needs to be removed."""
+    block_size = rng.randint(2, size)
+    if block_size > size:
+        return None
+
+    block_pos = rng.randint(0, size - block_size)
+    color = rng.randint(1, 9)
+
+    # Create field with block
+    field = gen_field(size)
+    for i in range(block_size):
+        field[block_pos + i] = color
+
+    # Track forbidden positions for noise
+    forbidden = [False] * size
+    for i in range(block_pos, block_pos + block_size):
+        forbidden[i] = True
+    if block_pos > 0:
+        forbidden[block_pos - 1] = True
+    if block_pos + block_size < size:
+        forbidden[block_pos + block_size] = True
+
+    # Add noise
+    noise_count = rng.randint(1, 3)
+    noise_positions = []
+
+    for _ in range(noise_count):
+        allowed = [i for i in range(size) if not forbidden[i]]
+        if not allowed:
+            break
+        noise_pos = rng.choice(allowed)
+        noise_positions.append(noise_pos)
+        field[noise_pos] = color
+        forbidden[noise_pos] = True
+        if noise_pos > 0:
+            forbidden[noise_pos - 1] = True
+        if noise_pos + 1 < size:
+            forbidden[noise_pos + 1] = True
+
+    if len(noise_positions) < noise_count:
+        return None
+
+    question = field
+    answer = field.copy()
+    for pos in noise_positions:
+        answer[pos] = 0
+
+    return {"input": question, "output": answer}
+
+
+def task_block_and_noise_remove_inside(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where off-color noise pixels inside a solid block are restored to the block color."""
+    if size <= 6:  # Require a field strictly larger than the minimum block width of 6
+        return None
+
+    block_size = rng.randint(6, size)
+    if block_size > size:  # Defensive only: randint(6, size) never returns more than size
+        return None
+
+    block_pos = rng.randint(0, size - block_size)
+    color = rng.randint(1, 9)
+
+    # Create field with block
+    field = gen_field(size)
+    for i in range(block_size):
+        field[block_pos + i] = color
+
+    # Add noise inside block: between 1 and max(1, block_size // 2 - 1) pixels
+    max_noise = max(1, (block_size // 2) - 1)
+    noise_count = rng.randint(1, max_noise)
+
+    positions = list(range(block_size))  # Offsets relative to block_pos
+    rng.shuffle(positions)
+    noise_positions = positions[:noise_count]
+
+    for offset in noise_positions:
+        pos = block_pos + offset
+        noise_color = rng.randint(1, 9)
+        while noise_color == color:  # Redraw until the noise differs from the block color
+            noise_color = rng.randint(1, 9)
+        field[pos] = noise_color
+
+    question = field
+    answer = field.copy()  # Copy so that restoring the block does not alter the question
+    for offset in noise_positions:
+        answer[block_pos + offset] = color
+
+    return {"input": question, "output": answer}
+
+
+def task_copy_block_to_dots(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the block at the field start is copied, centered, onto each same-colored dot."""
+    block_size = 3 if rng.random() < 0.5 else 5
+    if block_size >= size:
+        return None
+
+    color = rng.randint(1, 9)
+    block = [color] * block_size
+
+    # Generate dots with minimum distance to prevent overlap
+    min_gap = block_size
+    dot_positions = []
+    pos = block_size + block_size // 2 + 1  # Earliest dot; a copy centered here starts at block_size + 1
+
+    while pos <= size - block_size:
+        if rng.random() < 0.5:  # Control dot density
+            dot_positions.append(pos)
+            pos += min_gap
+        pos += 1
+
+    if not dot_positions:
+        return None
+
+    question = gen_field(size)
+    question = write_block(0, block, question)  # presumably writes `block` starting at index 0 - helper not shown
+    for pos in dot_positions:
+        question[pos] = color
+
+    answer = gen_field(size)
+    answer = write_block(0, block, answer)
+    for pos in dot_positions:
+        block_start = pos - block_size // 2  # Center the copy on the dot
+        answer = write_block(block_start, block, answer)
+
+    return {"input": question, "output": answer}
+
+
+def task_copy_block_to_dots_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the start block's shape is copied onto each dot, taking that dot's own color."""
+    block_size = 3 if rng.random() < 0.5 else 5
+    if block_size >= size:
+        return None
+
+    block_color = rng.randint(1, 9)
+    block = [block_color] * block_size
+
+    # Generate dots with minimum distance to prevent overlap
+    min_gap = block_size
+    dot_positions = []
+    dot_colors = []  # dot_colors[i] is the color of the dot at dot_positions[i]
+    pos = block_size + block_size // 2 + 1  # Earliest dot; a copy centered here starts at block_size + 1
+
+    while pos < size - block_size:
+        if rng.random() < 0.5:  # Place a dot here with probability 0.5
+            dot_color = rng.randint(1, 9)
+            dot_positions.append(pos)
+            dot_colors.append(dot_color)
+            pos += min_gap
+        pos += 1
+
+    if not dot_positions:
+        return None
+
+    question = gen_field(size)
+    question = write_block(0, block, question)
+    for i, pos in enumerate(dot_positions):
+        question[pos] = dot_colors[i]
+
+    answer = gen_field(size)
+    answer = write_block(0, block, answer)
+    for i, pos in enumerate(dot_positions):
+        block_start = pos - block_size // 2  # Center the copy on the dot
+        colored_block = [dot_colors[i]] * block_size
+        answer = write_block(block_start, colored_block, answer)
+
+    return {"input": question, "output": answer}
+
+
+def task_paint_biggest_block(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the single largest block is repainted with color 1."""
+    target_color = 1
+    initial_color = rng.randint(2, 9)  # Never 1, so the repainted block always differs from the rest
+
+    # Generate random blocks
+    question = gen_field(size)
+    blocks = []  # (start, length) pairs
+    pos = 0
+
+    while pos < size:
+        if rng.random() < 0.4 and size - pos >= 2:
+            block_size = rng.randint(2, min(size - pos, 6))
+            blocks.append((pos, block_size))
+            for i in range(block_size):
+                question[pos + i] = initial_color
+            pos += block_size + 1  # Skip one cell so consecutive blocks stay separated
+        else:
+            pos += 1
+
+    if len(blocks) < 2:
+        return None
+
+    # Find biggest block
+    biggest_pos, biggest_size = max(blocks, key=lambda x: x[1])
+
+    # Reject ties: with several blocks of the maximal size the answer would be ambiguous
+    biggest_count = sum(1 for _, size in blocks if size == biggest_size)
+    if biggest_count > 1:
+        return None
+
+    answer = question.copy()
+    for i in range(biggest_size):
+        answer[biggest_pos + i] = target_color
+
+    return {"input": question, "output": answer}
+
+
+def task_sort_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where same-colored blocks are re-laid from index 0 in ascending size with 1 pixel gaps."""
+    color = rng.randint(1, 9)
+    blocks = []  # (start, length) pairs
+    pos = 0
+
+    # Generate random blocks with random sizes
+    while pos < size:
+        if rng.random() < 0.4 and size - pos >= 2:
+            block_size = rng.randint(1, min(size - pos, 6))
+            blocks.append((pos, block_size))
+            pos += block_size + rng.randint(1, 4)  # Random gaps
+        else:
+            pos += 1
+
+    if len(blocks) < 2:
+        return None
+
+    # Create input field
+    question = gen_field(size)
+    for pos, block_size in blocks:
+        for i in range(block_size):
+            question[pos + i] = color
+
+    # Sort blocks by size (ascending)
+    blocks.sort(key=lambda x: x[1])
+
+    # Check if sorted blocks fit with gaps (`size` inside the generator is local to it; the parameter is intact)
+    total_space = sum(size for _, size in blocks) + len(blocks) - 1
+    if total_space > size:
+        return None
+
+    # Create answer field with sorted blocks
+    answer = gen_field(size)
+    current_pos = 0
+
+    for _, block_size in blocks:
+        for i in range(block_size):
+            answer[current_pos + i] = color
+        current_pos += block_size + 1  # One pixel gap
+
+    return {"input": question, "output": answer}
+
+
+def task_sort_complete_sequence(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where shuffled blocks of every size 1..max_size are rearranged in ascending size."""
+    # Calculate max possible block size given total array size
+    max_size = 1
+    total_space = 0
+    while total_space + max_size + 1 <= size:  # Each block is budgeted its length plus one gap cell
+        total_space += max_size + 1
+        max_size += 1
+    max_size -= 1  # The loop leaves max_size one past the last size that fit
+
+    if max_size < 2:
+        return None
+
+    color = rng.randint(1, 9)
+
+    # Create sequence of all sizes from 1 to max_size
+    blocks = list(range(1, max_size + 1))
+    rng.shuffle(blocks)
+
+    # Create input field with shuffled blocks
+    question = gen_field(size)
+    pos = 0
+    for block_size in blocks:
+        for i in range(block_size):
+            question[pos + i] = color
+        pos += block_size + 1  # One empty cell between blocks
+
+    # Create answer field with sorted blocks
+    answer = gen_field(size)
+    pos = 0
+    for block_size in range(1, max_size + 1):
+        for i in range(block_size):
+            answer[pos + i] = color
+        pos += block_size + 1  # One empty cell between blocks
+
+    return {"input": question, "output": answer}
+
+
+def task_recolor_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where two color-3 blocks are recolored: the larger becomes 1, the smaller becomes 2."""
+    # Generate two different random sizes
+    size1 = rng.randint(2, 8)
+    size2 = rng.randint(2, 8)
+    while size2 == size1:
+        size2 = rng.randint(2, 8)
+
+    # Ensure both blocks fit with at least 1 gap
+    if size1 + size2 + 1 > size:
+        return None
+
+    # Place blocks with gap: block 1 first, block 2 at least one empty cell to its right
+    pos1 = rng.randint(0, size - (size1 + size2 + 1))
+    pos2 = rng.randint(pos1 + size1 + 1, size - size2)
+
+    # Create input field with both blocks color 3
+    question = gen_field(size)
+    for i in range(size1):
+        question[pos1 + i] = 3
+    for i in range(size2):
+        question[pos2 + i] = 3
+
+    # Create answer field with recolored blocks (sizes always differ, so there is no tie case)
+    answer = question.copy()
+    if size1 > size2:
+        for i in range(size1):
+            answer[pos1 + i] = 1
+        for i in range(size2):
+            answer[pos2 + i] = 2
+    else:
+        for i in range(size1):
+            answer[pos1 + i] = 2
+        for i in range(size2):
+            answer[pos2 + i] = 1
+
+    return {"input": question, "output": answer}
+
+
+def task_gravity_one_step(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where each non-zero pixel moves one step left if the cell to its left is free."""
+    question = [rng.randint(1, 9) if rng.random() < 0.5 else 0 for _ in range(size)]
+    answer = question.copy()
+
+    # Single in-place pass, left to right: a cell vacated at i lets the pixel at i + 1 follow in the same pass
+    for i in range(1, size):
+        if answer[i] != 0 and answer[i - 1] == 0:
+            answer[i - 1] = answer[i]
+            answer[i] = 0
+
+    return {"input": question, "output": answer}
+
+
+def task_move_block_by_own_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a block moves right by its own size; returns None when size < 2."""
+    if size < 2:  # rng.randint(1, size // 2) would raise ValueError on an empty range
+        return None
+    block_size = rng.randint(1, size // 2)  # Ensure space for movement
+    pos = rng.randint(0, size - block_size * 2)  # Space for block and movement
+    color = rng.randint(1, 9)
+    block = [color] * block_size
+
+    question = write_block(pos, block, gen_field(size))
+    answer = write_block(pos + block_size, block, gen_field(size))  # Same block, shifted by its own length
+
+    return {"input": question, "output": answer}
+
+
+def task_change_to_five(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where every non-zero pixel becomes 5 and zeros stay zero."""
+    density = 0.5  # Probability that a cell holds a non-zero color
+    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
+    answer = [5 if x != 0 else 0 for x in question]
+
+    return {"input": question, "output": answer}
+
+
+def task_recolor_blocks_from_palette(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the i-th color-5 block takes the i-th color of a palette at the field start."""
+    # Generate blocks of same size
+    block_size = rng.randint(2, 4)
+    blocks = []
+    pos = 0
+
+    while pos + block_size <= size:
+        if rng.random() < 0.4:
+            blocks.append(pos)
+            pos += block_size + 1
+        else:
+            pos += 1
+
+    # Drop trailing blocks until the palette fits and at most 9 remain (only 9 distinct colors exist)
+    while blocks and (len(blocks) > 9 or blocks[-1] + block_size + len(blocks) + 1 >= size):
+        blocks.pop()
+
+    if not blocks:
+        return None
+
+    # Shift blocks right to make room for palette (palette cells plus one empty separator cell)
+    palette_size = len(blocks)
+    blocks = [pos + palette_size + 1 for pos in blocks]
+
+    # Generate color palette: one distinct color per block (redraws end because len(blocks) <= 9)
+    colors = []
+    for _ in range(len(blocks)):
+        while True:
+            color = rng.randint(1, 9)
+            if color not in colors:
+                colors.append(color)
+                break
+
+    # Create question with color palette and blocks
+    question = gen_field(size)
+
+    # Place color palette at start
+    for i, color in enumerate(colors):
+        question[i] = color
+
+    # Place blocks of color 5
+    for block_pos in blocks:
+        for i in range(block_size):
+            question[block_pos + i] = 5
+
+    # Create answer with recolored blocks: block number k receives palette color number k
+    answer = question.copy()
+    for block_idx, block_pos in enumerate(blocks):
+        color = colors[block_idx]
+        for i in range(block_size):
+            answer[block_pos + i] = color
+
+    return {"input": question, "output": answer}
+
+
+def task_duplicate_block_from_seeds(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where each seed pixel beside a block expands into repeated seed-colored copies of it."""
+    block_size = rng.randint(2, 4)
+    if block_size + 1 >= size:
+        return None
+    if size <= 3 + block_size:  # Need two cells on each side of the block (gap + seed)
+        return None
+
+    # Position block with space for seeds: the right seed sits at block_pos + block_size + 1 and must be < size
+    block_pos = rng.randint(2, size - block_size - 2)
+
+    # Decide seed placement
+    left_seed = rng.random() < 0.5
+    right_seed = rng.random() < 0.5
+    if not (left_seed or right_seed):
+        return None
+
+    # Create input
+    question = gen_field(size)
+
+    # Place main block
+    for i in range(block_size):
+        question[block_pos + i] = 1
+
+    # Place seeds with gaps (one empty cell between the block and each seed)
+    seeds = []
+    if left_seed:
+        color = rng.randint(1, 9)
+        question[block_pos - 2] = color
+        seeds.append(("left", block_pos - 2, color))
+    if right_seed:
+        color = rng.randint(1, 9)
+        question[block_pos + block_size + 1] = color
+        seeds.append(("right", block_pos + block_size + 1, color))
+
+    # Create answer with duplicated blocks
+    answer = question.copy()
+
+    for side, seed_pos, color in seeds:
+        if side == "left":
+            # For left seed, blocks end at seed
+            end_pos = seed_pos
+            while end_pos >= 0:
+                start_pos = end_pos - block_size + 1
+                for pos in range(max(0, start_pos), end_pos + 1):  # Clip the last copy at the left border
+                    answer[pos] = color
+                if start_pos < 1:
+                    break
+                end_pos = start_pos - 2  # -1 for gap
+        else:  # side == "right"
+            # For right seed, blocks start at seed
+            start_pos = seed_pos
+            while start_pos < size:
+                for offset in range(min(block_size, size - start_pos)):  # Clip the last copy at the right border
+                    answer[start_pos + offset] = color
+                if start_pos + block_size + 1 >= size:
+                    break
+                start_pos = start_pos + block_size + 1  # +1 for gap
+
+    return {"input": question, "output": answer}
+
+
+def task_fill_from_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a seed pixel touching a block fills the field from itself to the border behind it."""
+    block_size = rng.randint(3, 6)
+    if block_size >= size - 2:
+        return None
+
+    # Position block with space for seed (at least one free cell on each side)
+    block_pos = rng.randint(1, size - block_size - 1)
+
+    # Create input
+    question = gen_field(size)
+
+    # Place main block
+    block_color = rng.randint(1, 9)
+    for i in range(block_size):
+        question[block_pos + i] = block_color
+
+    # Place seed pixel and determine fill direction (seed color always differs from the block color)
+    seed_color = rng.randint(1, 9)
+    while seed_color == block_color:
+        seed_color = rng.randint(1, 9)
+
+    is_left = rng.random() < 0.5
+
+    if is_left:
+        question[block_pos - 1] = seed_color  # Seed directly left of the block
+    else:
+        question[block_pos + block_size] = seed_color  # Seed directly right of the block
+
+    # Create answer with fill
+    answer = question.copy()
+
+    if is_left:
+        # Fill from seed to left border
+        for i in range(block_pos):
+            answer[i] = seed_color
+    else:
+        # Fill from seed to right border
+        for i in range(block_pos + block_size, size):
+            answer[i] = seed_color
+
+    return {"input": question, "output": answer}
+
+
+def task_mark_size_two_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where each size-2 block of color 1 gets a color-3 marker pixel on its free sides."""
+    blocks = []  # (start, length) pairs
+    pos = 0
+
+    # Generate blocks with minimum gap of 2
+    while pos < size:
+        if rng.random() < 0.4:
+            block_size = rng.randint(1, 3)
+            # Check if we have space for block and potential markers
+            needed_space = block_size + (2 if block_size == 2 else 0)
+            if pos + needed_space < size:
+                blocks.append((pos, block_size))
+                pos += block_size + 2  # Minimum gap of 2 (the unconditional +1 below adds one more cell)
+
+        pos += 1
+
+    if len(blocks) < 2:
+        return None
+
+    # Verify gaps between blocks (including markers)
+    valid = True
+    for i in range(len(blocks) - 1):
+        pos1, size1 = blocks[i]
+        pos2, _ = blocks[i + 1]
+        needed_gap = 3 if size1 == 2 else 2  # A size-2 block needs one extra cell for its right marker
+        if pos2 - (pos1 + size1) < needed_gap:
+            valid = False
+            break
+    if not valid:
+        return None
+
+    # Create input with blocks
+    question = gen_field(size)
+    for pos, block_size in blocks:
+        # Place block
+        for i in range(block_size):
+            question[pos + i] = 1
+
+    # Create answer with markers
+    answer = question.copy()
+    for pos, block_size in blocks:
+        if block_size == 2:
+            # Add markers for size 2 blocks (skipped on a side that lies outside the field)
+            if pos > 0:
+                answer[pos - 1] = 3
+            if pos + block_size < size:
+                answer[pos + block_size] = 3
+
+    return {"input": question, "output": answer}
+
+
+def task_fill_until_collision(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where pixels stretch toward a color-5 side marker until they hit the next obstacle."""
+    # At least 4 positions for meaningful puzzle
+    if size < 4:
+        return None
+
+    is_left = rng.random() < 0.5
+    question = gen_field(size)
+
+    # Place the side marker
+    if is_left:
+        question[0] = 5
+    else:
+        question[size - 1] = 5
+
+    # Place 2-4 random pixels, capped at the size - 1 free cells so the unique draws below always terminate
+    num_pixels = rng.randint(2, min(4, size - 1))
+    positions = []
+
+    if is_left:
+        # Skip first position
+        for _ in range(num_pixels):
+            while True:
+                pos = rng.randint(1, size - 1)
+                if pos not in positions:
+                    positions.append(pos)
+                    break
+    else:
+        # Skip last position
+        for _ in range(num_pixels):
+            while True:
+                pos = rng.randint(0, size - 2)
+                if pos not in positions:
+                    positions.append(pos)
+                    break
+
+    # Color random pixels
+    for pos in positions:
+        question[pos] = rng.randint(1, 9)
+
+    positions.sort()
+
+    # Create answer
+    answer = question.copy()
+
+    if is_left:
+        # Each pixel fills the empty cells to its left, up to the previous pixel (or the marker)
+        prev_pos = 0  # Start from marker
+        for pos in positions:
+            color = question[pos]
+            # Fill from previous position to current
+            for i in range(prev_pos + 1, pos):
+                answer[i] = color
+            prev_pos = pos
+    else:
+        # Each pixel fills the empty cells to its right, up to the next pixel (or the marker)
+        prev_pos = size - 1  # Start from marker
+        for pos in reversed(positions):
+            color = question[pos]
+            # Fill from current position to previous
+            for i in range(pos + 1, prev_pos):
+                answer[i] = color
+            prev_pos = pos
+
+    return {"input": question, "output": answer}
+
+
+def task_repeat_pattern_full(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where a pattern shown twice at the field start is repeated to fill the whole field."""
+    # Generate initial pattern
+    pattern_size = rng.randint(2, 5)
+    pattern = [rng.randint(1, 9) for _ in range(pattern_size)]
+
+    # Calculate total size needed for 2 repetitions; the field must be strictly larger than that
+    double_size = pattern_size * 2
+    if double_size >= size:
+        return None
+
+    # Create input with 2 repetitions
+    question = gen_field(size)
+    for i in range(pattern_size):
+        question[i] = pattern[i]
+        question[i + pattern_size] = pattern[i]
+
+    # Create answer with maximum repetitions
+    answer = gen_field(size)
+    pos = 0
+    while pos + pattern_size <= size:
+        for i in range(pattern_size):
+            answer[pos + i] = pattern[i]
+        pos += pattern_size
+
+    # Fill remaining space (if any) with pattern elements, i.e. a truncated final repetition
+    for i in range(pos, size):
+        answer[i] = pattern[i - pos]
+
+    return {"input": question, "output": answer}
+
+
+def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where pixels pile up at the field start: all 2s (heavier) first, then all 1s."""
+    # Generate random field with only colors 1 and 2 (each cell is non-zero with probability 0.5)
+    question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
+
+    # Count colors
+    count_1 = sum(1 for x in question if x == 1)
+    count_2 = sum(1 for x in question if x == 2)
+
+    # Create answer with sorted colors
+    answer = gen_field(size)
+
+    # Place heavier color 2 first
+    for i in range(count_2):
+        answer[i] = 2
+
+    # Then place color 1 directly after the 2s
+    for i in range(count_1):
+        answer[count_2 + i] = 1
+
+    return {"input": question, "output": answer}
+
+
+def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+    """Generate a task where the left half (rounded down) of every color-2 block is recolored to 8."""
+    pos = 0
+    question = gen_field(size)
+    blocks = []  # (start, length) pairs
+
+    # Generate blocks with gap 1
+    while pos < size:
+        if rng.random() < 0.4:
+            block_size = rng.randint(2, 8)
+            if pos + block_size >= size:  # The first block that does not fit ends generation entirely
+                break
+
+            blocks.append((pos, block_size))
+            for i in range(block_size):
+                question[pos + i] = 2
+            pos += block_size + 1  # block size + gap
+        else:
+            pos += 1
+
+    if len(blocks) < 2:
+        return None
+
+    # Create answer with half-colored blocks
+    answer = question.copy()
+    for pos, block_size in blocks:
+        half_size = block_size // 2  # Odd-sized blocks keep their middle cell at color 2
+        for i in range(half_size):
+            answer[pos + i] = 8
+
+    return {"input": question, "output": answer}
+
+
+def task_mirror(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+    """Return a new task whose input and output lists are both reversed; None is passed through."""
+    if task_result is None:
+        return None
+    return {"input": list(reversed(task_result["input"])), "output": list(reversed(task_result["output"]))}
+
+
+def task_inverse(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+    """Return a new task with input and output swapped (lists are shared, not copied); None is passed through."""
+    if task_result is None:
+        return None
+    return {"input": task_result["output"], "output": task_result["input"]}
+
+
+def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+    """Return the given task result as is (the same object, including None)."""
+    return task_result
+
+
+# Table of all ARC 1D task functions with their parameters
+ARC_1D_TASKS = {
+    # Move tasks
+    "move_1pix_solid": (task_move_n_pix, {"move_pix": 1, "solid": True}),
+    "move_2pix_solid": (task_move_n_pix, {"move_pix": 2, "solid": True}),
+    "move_3pix_solid": (task_move_n_pix, {"move_pix": 3, "solid": True}),
+    "move_4pix_solid": (task_move_n_pix, {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful": (task_move_n_pix, {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful": (task_move_n_pix, {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful": (task_move_n_pix, {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful": (task_move_n_pix, {"move_pix": 4, "solid": False}),
+    # Move wrapped tasks
+    "move_1pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": True}),
+    "move_2pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
+    "move_3pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": True}),
+    "move_4pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": False}),
+    # Gravity tasks
+    "gravity": (task_gravity, {}),
+    "gravity_counting": (task_gravity_counting, {}),
+    "gravity_antigravity": (task_gravity_antigravity, {}),
+    "gravity_one_step": (task_gravity_one_step, {}),
+    "gravity_weighted_colors": (task_gravity_weighted_colors, {}),
+    # Block tasks
+    "block_touch_dot": (task_block_touch_dot, {}),
+    "block_touch_dot_1pix": (task_block_touch_dot_n_pix, {"move_pix": 1}),
+    "block_touch_dot_2pix": (task_block_touch_dot_n_pix, {"move_pix": 2}),
+    "block_touch_dot_3pix": (task_block_touch_dot_n_pix, {"move_pix": 3}),
+    "block_touch_dot_4pix": (task_block_touch_dot_n_pix, {"move_pix": 4}),
+    "block_scale_to_dot": (task_block_scale_to_dot, {}),
+    "block_and_noise_remove": (task_block_and_noise_remove, {}),
+    "block_and_noise_remove_inside": (task_block_and_noise_remove_inside, {}),
+    "move_block_by_own_size": (task_move_block_by_own_size, {}),
+    # Pattern tasks
+    "two_points_and_fill": (task_two_points_and_fill, {}),
+    "copy_block_to_dots": (task_copy_block_to_dots, {}),
+    "copy_block_to_dots_colors": (task_copy_block_to_dots_colors, {}),
+    "repeat_pattern_full": (task_repeat_pattern_full, {}),
+    # Reflection tasks
+    "reflect_block_with_border_pixel": (task_reflect_block_with_border_pixel, {}),
+    "reflect_block_random": (task_reflect_block_with_border_pixel_random, {}),
+    "reflect_block_around_dot": (task_reflect_block_around_dot, {}),
+    # Color tasks
+    "paint_biggest_block": (task_paint_biggest_block, {}),
+    "recolor_blocks_by_size": (task_recolor_blocks_by_size, {}),
+    "change_to_five": (task_change_to_five, {}),
+    "recolor_blocks_from_palette": (task_recolor_blocks_from_palette, {}),
+    "color_left_half_blocks": (task_color_left_half_blocks, {}),
+    # Sorting tasks
+    "sort_blocks_by_size": (task_sort_blocks_by_size, {}),
+    "sort_complete_sequence": (task_sort_complete_sequence, {}),
+    # Fill tasks
+    "duplicate_block_from_seeds": (task_duplicate_block_from_seeds, {}),
+    "fill_from_pixel": (task_fill_from_pixel, {}),
+    "fill_until_collision": (task_fill_until_collision, {}),
+    # Marking tasks
+    "mark_size_two_blocks": (task_mark_size_two_blocks, {}),
+}
diff --git a/tests/test_arithmetic.py b/tests/test_basic_arithmetic.py
similarity index 100%
rename from tests/test_arithmetic.py
rename to tests/test_basic_arithmetic.py

From 978a0879f7e98cd4980b9eefb12bb765d6385102 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 23:21:46 +0100
Subject: [PATCH 91/94] feat: Add mirrored and inverse task variations to
 ARC_1D_TASKS

---
 reasoning_gym/cognition/arc_1d_tasks.py | 76 +++++++++++++++++--------
 1 file changed, 52 insertions(+), 24 deletions(-)

diff --git a/reasoning_gym/cognition/arc_1d_tasks.py b/reasoning_gym/cognition/arc_1d_tasks.py
index 9e1334e6..dc1ff971 100644
--- a/reasoning_gym/cognition/arc_1d_tasks.py
+++ b/reasoning_gym/cognition/arc_1d_tasks.py
@@ -1084,30 +1084,57 @@ def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[
 
 # Table of all ARC 1D task functions with their parameters
 ARC_1D_TASKS = {
-    # Move tasks
-    "move_1pix_solid": (task_move_n_pix, {"move_pix": 1, "solid": True}),
-    "move_2pix_solid": (task_move_n_pix, {"move_pix": 2, "solid": True}),
-    "move_3pix_solid": (task_move_n_pix, {"move_pix": 3, "solid": True}),
-    "move_4pix_solid": (task_move_n_pix, {"move_pix": 4, "solid": True}),
-    "move_1pix_colorful": (task_move_n_pix, {"move_pix": 1, "solid": False}),
-    "move_2pix_colorful": (task_move_n_pix, {"move_pix": 2, "solid": False}),
-    "move_3pix_colorful": (task_move_n_pix, {"move_pix": 3, "solid": False}),
-    "move_4pix_colorful": (task_move_n_pix, {"move_pix": 4, "solid": False}),
-    # Move wrapped tasks
-    "move_1pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": True}),
-    "move_2pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
-    "move_3pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": True}),
-    "move_4pix_solid_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": True}),
-    "move_1pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": False}),
-    "move_2pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": False}),
-    "move_3pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": False}),
-    "move_4pix_colorful_wrapped": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": False}),
-    # Gravity tasks
-    "gravity": (task_gravity, {}),
-    "gravity_counting": (task_gravity_counting, {}),
-    "gravity_antigravity": (task_gravity_antigravity, {}),
-    "gravity_one_step": (task_gravity_one_step, {}),
-    "gravity_weighted_colors": (task_gravity_weighted_colors, {}),
+    # Move tasks - right direction
+    "move_1pix_solid_right": (task_move_n_pix, {"move_pix": 1, "solid": True}),
+    "move_2pix_solid_right": (task_move_n_pix, {"move_pix": 2, "solid": True}),
+    "move_3pix_solid_right": (task_move_n_pix, {"move_pix": 3, "solid": True}),
+    "move_4pix_solid_right": (task_move_n_pix, {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful_right": (task_move_n_pix, {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful_right": (task_move_n_pix, {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful_right": (task_move_n_pix, {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful_right": (task_move_n_pix, {"move_pix": 4, "solid": False}),
+    
+    # Move tasks - left direction (mirrored)
+    "move_1pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 1, "solid": True}),
+    "move_2pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 2, "solid": True}),
+    "move_3pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 3, "solid": True}),
+    "move_4pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 4, "solid": False}),
+    # Move wrapped tasks - right direction
+    "move_1pix_solid_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": True}),
+    "move_2pix_solid_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
+    "move_3pix_solid_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": True}),
+    "move_4pix_solid_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": False}),
+
+    # Move wrapped tasks - left direction (mirrored)
+    "move_1pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 1, "solid": True}),
+    "move_2pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 2, "solid": True}),
+    "move_3pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 3, "solid": True}),
+    "move_4pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 4, "solid": True}),
+    "move_1pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 1, "solid": False}),
+    "move_2pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 2, "solid": False}),
+    "move_3pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 3, "solid": False}),
+    "move_4pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 4, "solid": False}),
+    # Gravity tasks - right direction
+    "gravity_right": (task_gravity, {}),
+    "gravity_counting_right": (task_gravity_counting, {}),
+    "gravity_antigravity_right": (task_gravity_antigravity, {}),
+    "gravity_one_step_right": (task_gravity_one_step, {}),
+    "gravity_weighted_colors_right": (task_gravity_weighted_colors, {}),
+
+    # Gravity tasks - left direction (mirrored)
+    "gravity_left": (lambda rng, size, **kwargs: task_mirror(task_gravity(rng, size, **kwargs)), {}),
+    "gravity_counting_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_counting(rng, size, **kwargs)), {}),
+    "gravity_antigravity_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_antigravity(rng, size, **kwargs)), {}),
+    "gravity_one_step_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_one_step(rng, size, **kwargs)), {}),
+    "gravity_weighted_colors_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_weighted_colors(rng, size, **kwargs)), {}),
     # Block tasks
     "block_touch_dot": (task_block_touch_dot, {}),
     "block_touch_dot_1pix": (task_block_touch_dot_n_pix, {"move_pix": 1}),
@@ -1120,6 +1147,7 @@ ARC_1D_TASKS = {
     "move_block_by_own_size": (task_move_block_by_own_size, {}),
     # Pattern tasks
     "two_points_and_fill": (task_two_points_and_fill, {}),
+    "two_points_and_fill_inv": (lambda rng, size, **kwargs: task_inverse(task_two_points_and_fill(rng, size, **kwargs)), {}),
     "copy_block_to_dots": (task_copy_block_to_dots, {}),
     "copy_block_to_dots_colors": (task_copy_block_to_dots_colors, {}),
     "repeat_pattern_full": (task_repeat_pattern_full, {}),

From b7532f66cad7799251a591a4afe968ab754e3cb6 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 23:30:15 +0100
Subject: [PATCH 92/94] test: Rewrite test_arc_1d.py to cover Arc1DDataset

---
 tests/test_arc_1d.py | 192 ++++++++++++++++++-------------------------
 1 file changed, 79 insertions(+), 113 deletions(-)

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 1ce05148..10443e76 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -1,122 +1,88 @@
-import random
-
 import pytest
 
-from reasoning_gym.cognition.arc_1d import (
-    task_block_and_noise_remove,
-    task_block_and_noise_remove_inside,
-    task_block_scale_to_dot,
-    task_block_touch_dot,
-    task_block_touch_dot_n_pix,
-    task_change_to_five,
-    task_color_left_half_blocks,
-    task_copy_block_to_dots,
-    task_copy_block_to_dots_colors,
-    task_duplicate_block_from_seeds,
-    task_fill_from_pixel,
-    task_fill_until_collision,
-    task_gravity,
-    task_gravity_antigravity,
-    task_gravity_counting,
-    task_gravity_one_step,
-    task_gravity_weighted_colors,
-    task_identity,
-    task_inverse,
-    task_mark_size_two_blocks,
-    task_mirror,
-    task_move_block_by_own_size,
-    task_move_n_pix,
-    task_move_n_pix_wrapped,
-    task_paint_biggest_block,
-    task_recolor_blocks_by_size,
-    task_recolor_blocks_from_palette,
-    task_reflect_block_around_dot,
-    task_reflect_block_with_border_pixel,
-    task_reflect_block_with_border_pixel_random,
-    task_repeat_pattern_full,
-    task_sort_blocks_by_size,
-    task_sort_complete_sequence,
-    task_two_points_and_fill,
-)
+from reasoning_gym.cognition import Arc1DDataset, Arc1DConfig
 
 
-def test_all_arc_1d_tasks():
-    """Test that all ARC 1D task functions can be executed without exceptions."""
-    rng = random.Random(42)  # Fixed seed for reproducibility
-    size = 20  # Reasonable size for testing
+def test_arc_1d_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = Arc1DConfig(min_size=0)
+        config.validate()
 
-    # Test all task functions
-    # Fixed move_pix value for testing
-    move_pix = 2
+    with pytest.raises(AssertionError):
+        config = Arc1DConfig(min_size=30, max_size=20)
+        config.validate()
 
-    # Test task augmentation functions
-    base_task = task_move_n_pix(rng, size, move_pix, True)
-    assert base_task is not None
-    
-    mirrored = task_mirror(base_task)
-    assert mirrored is not None
-    assert mirrored["input"] == list(reversed(base_task["input"]))
-    assert mirrored["output"] == list(reversed(base_task["output"]))
-    
-    inversed = task_inverse(base_task)
-    assert inversed is not None
-    assert inversed["input"] == base_task["output"]
-    assert inversed["output"] == base_task["input"]
-    
-    identical = task_identity(base_task)
-    assert identical is not None
-    assert identical == base_task
+    with pytest.raises(AssertionError):
+        config = Arc1DConfig(num_train=0)
+        config.validate()
 
-    tasks = [
-        (task_move_n_pix, {"move_pix": move_pix, "solid": True}),
-        (task_move_n_pix_wrapped, {"move_pix": move_pix, "solid": True}),
-        (task_gravity, {}),
-        (task_gravity_counting, {}),
-        (task_gravity_antigravity, {}),
-        (task_block_touch_dot, {}),
-        (task_block_touch_dot_n_pix, {"move_pix": move_pix}),
-        (task_block_scale_to_dot, {}),
-        (task_two_points_and_fill, {}),
-        (task_reflect_block_with_border_pixel, {}),
-        (task_reflect_block_with_border_pixel_random, {}),
-        (task_reflect_block_around_dot, {}),
-        (task_block_and_noise_remove, {}),
-        (task_block_and_noise_remove_inside, {}),
-        (task_copy_block_to_dots, {}),
-        (task_copy_block_to_dots_colors, {}),
-        (task_paint_biggest_block, {}),
-        (task_sort_blocks_by_size, {}),
-        (task_sort_complete_sequence, {}),
-        (task_recolor_blocks_by_size, {}),
-        (task_gravity_one_step, {}),
-        (task_move_block_by_own_size, {}),
-        (task_change_to_five, {}),
-        (task_recolor_blocks_from_palette, {}),
-        (task_duplicate_block_from_seeds, {}),
-        (task_fill_from_pixel, {}),
-        (task_mark_size_two_blocks, {}),
-        (task_fill_until_collision, {}),
-        (task_repeat_pattern_full, {}),
-        (task_gravity_weighted_colors, {}),
-        (task_color_left_half_blocks, {}),
-    ]
 
-    for task_func, kwargs in tasks:
-        # Try multiple times as some functions might return None for certain inputs
-        success = False
-        for _ in range(10):  # Try up to 10 times
-            try:
-                result = task_func(rng, size, **kwargs)
-                if result is not None:
-                    success = True
-                    # Basic structure checks
-                    assert isinstance(result, dict)
-                    assert "input" in result
-                    assert "output" in result
-                    assert len(result["input"]) == size
-                    assert len(result["output"]) == size
-                    break
-            except Exception as e:
-                pytest.fail(f"Task {task_func.__name__} failed with error: {str(e)}")
+def test_arc_1d_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = Arc1DConfig(seed=42, size=10)
+    dataset1 = Arc1DDataset(config)
+    dataset2 = Arc1DDataset(config)
 
-        assert success, f"Task {task_func.__name__} always returned None in 10 attempts"
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_arc_1d_items():
+    """Test basic properties of generated items"""
+    config = Arc1DConfig(min_size=10, max_size=15, num_train=2, size=50, seed=42)
+    dataset = Arc1DDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # Check metadata contents
+        metadata = item["metadata"]
+        assert "task_name" in metadata
+        assert "size" in metadata
+        assert "train_examples" in metadata
+        assert "test_example" in metadata
+
+        # Verify size constraints
+        assert config.min_size <= metadata["size"] <= config.max_size
+
+        # Check training examples
+        train_examples = metadata["train_examples"]
+        assert len(train_examples) == config.num_train
+        for example in train_examples:
+            assert "input" in example
+            assert "output" in example
+            assert len(example["input"]) == metadata["size"]
+            assert len(example["output"]) == metadata["size"]
+
+        # Check test example
+        test_example = metadata["test_example"]
+        assert "input" in test_example
+        assert "output" in test_example
+        assert len(test_example["input"]) == metadata["size"]
+        assert len(test_example["output"]) == metadata["size"]
+
+
+def test_arc_1d_iteration():
+    """Test that iteration respects dataset size"""
+    config = Arc1DConfig(size=5, seed=42)  # Small size for testing
+    dataset = Arc1DDataset(config)
+
+    # Test manual iteration
+    items = []
+    for item in dataset:
+        items.append(item)
+    assert len(items) == config.size, "Iterator should yield exactly size items"
+
+    # Test list conversion
+    items = list(dataset)
+    assert len(items) == config.size, "Iterator should yield exactly size items"
+
+    # Test multiple iterations
+    first_items = list(dataset)
+    second_items = list(dataset)
+    assert first_items == second_items, "Multiple iterations should yield same items"

From a9549057e9438412421792d28652c9ffaf6e0c4e Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 23:31:20 +0100
Subject: [PATCH 93/94] test: Add scoring tests for Arc1D dataset answer
 evaluation

---
 tests/test_arc_1d.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 10443e76..197c4d9d 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -86,3 +86,22 @@ def test_arc_1d_iteration():
     first_items = list(dataset)
     second_items = list(dataset)
     assert first_items == second_items, "Multiple iterations should yield same items"
+
+
+def test_arc_1d_scoring():
+    """Test answer scoring logic"""
+    config = Arc1DConfig(size=1, seed=42)
+    dataset = Arc1DDataset(config)
+    entry = dataset[0]
+
+    # Test exact match
+    assert dataset.score_answer(entry["answer"], entry) == 1.0
+
+    # Test partial match (answer contained within response)
+    assert dataset.score_answer(f"The answer is: {entry['answer']}", entry) == 0.5
+
+    # Test incorrect answer
+    assert dataset.score_answer("wrong answer", entry) == 0.01
+
+    # Test None answer
+    assert dataset.score_answer(None, entry) == 0.0

From 3aeec71523802a8ca9f7ec6c75cd9b014f098fd6 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Sun, 2 Feb 2025 23:45:25 +0100
Subject: [PATCH 94/94] add attribution for arc-1d and unit tests

---
 reasoning_gym/cognition/arc_1d.py       |  13 ++-
 reasoning_gym/cognition/arc_1d_tasks.py |  98 ++++++++++++++-----
 tests/test_arc_1d.py                    |   2 +-
 tests/test_arc_1d_tasks.py              | 122 ++++++++++++++++++++++++
 4 files changed, 211 insertions(+), 24 deletions(-)
 create mode 100644 tests/test_arc_1d_tasks.py

diff --git a/reasoning_gym/cognition/arc_1d.py b/reasoning_gym/cognition/arc_1d.py
index 332a5851..7e399f20 100644
--- a/reasoning_gym/cognition/arc_1d.py
+++ b/reasoning_gym/cognition/arc_1d.py
@@ -25,7 +25,18 @@ class Arc1DConfig:
 
 
 class Arc1DDataset(ProceduralDataset):
-    """Generates ARC 1D tasks by randomly selecting from available task generators"""
+    """
+    Generates ARC 1D tasks by randomly selecting from available task generators
+
+    This dataset is a procedural variant of the 1D-ARC dataset which is described in the paper:
+    `LLMs and the Abstraction and Reasoning Corpus:  Successes, Failures, and the Importance
+    of Object-based Representations` (https://arxiv.org/abs/2305.18354)
+
+    Ilya Sheprut (optozorax) created Rust generators for most of the ARC 1D tasks. For
+    reasoning-gym, the Rust tasks were machine-converted to Python via Sonnet.
+
+    Ilya's original rust code can be found here: https://github.com/optozorax/arc_1d/
+    """
 
     def __init__(self, config: Arc1DConfig):
         from .arc_1d_tasks import ARC_1D_TASKS
diff --git a/reasoning_gym/cognition/arc_1d_tasks.py b/reasoning_gym/cognition/arc_1d_tasks.py
index dc1ff971..61151b34 100644
--- a/reasoning_gym/cognition/arc_1d_tasks.py
+++ b/reasoning_gym/cognition/arc_1d_tasks.py
@@ -1093,16 +1093,39 @@ ARC_1D_TASKS = {
     "move_2pix_colorful_right": (task_move_n_pix, {"move_pix": 2, "solid": False}),
     "move_3pix_colorful_right": (task_move_n_pix, {"move_pix": 3, "solid": False}),
     "move_4pix_colorful_right": (task_move_n_pix, {"move_pix": 4, "solid": False}),
-    
     # Move tasks - left direction (mirrored)
-    "move_1pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 1, "solid": True}),
-    "move_2pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 2, "solid": True}),
-    "move_3pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 3, "solid": True}),
-    "move_4pix_solid_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 4, "solid": True}),
-    "move_1pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 1, "solid": False}),
-    "move_2pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 2, "solid": False}),
-    "move_3pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 3, "solid": False}),
-    "move_4pix_colorful_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)), {"move_pix": 4, "solid": False}),
+    "move_1pix_solid_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 1, "solid": True},
+    ),
+    "move_2pix_solid_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 2, "solid": True},
+    ),
+    "move_3pix_solid_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 3, "solid": True},
+    ),
+    "move_4pix_solid_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 4, "solid": True},
+    ),
+    "move_1pix_colorful_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 1, "solid": False},
+    ),
+    "move_2pix_colorful_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 2, "solid": False},
+    ),
+    "move_3pix_colorful_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 3, "solid": False},
+    ),
+    "move_4pix_colorful_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix(rng, size, **kwargs)),
+        {"move_pix": 4, "solid": False},
+    ),
     # Move wrapped tasks - right direction
     "move_1pix_solid_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 1, "solid": True}),
     "move_2pix_solid_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": True}),
@@ -1112,29 +1135,57 @@ ARC_1D_TASKS = {
     "move_2pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 2, "solid": False}),
     "move_3pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 3, "solid": False}),
     "move_4pix_colorful_wrapped_right": (task_move_n_pix_wrapped, {"move_pix": 4, "solid": False}),
-
     # Move wrapped tasks - left direction (mirrored)
-    "move_1pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 1, "solid": True}),
-    "move_2pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 2, "solid": True}),
-    "move_3pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 3, "solid": True}),
-    "move_4pix_solid_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 4, "solid": True}),
-    "move_1pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 1, "solid": False}),
-    "move_2pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 2, "solid": False}),
-    "move_3pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 3, "solid": False}),
-    "move_4pix_colorful_wrapped_left": (lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)), {"move_pix": 4, "solid": False}),
+    "move_1pix_solid_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 1, "solid": True},
+    ),
+    "move_2pix_solid_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 2, "solid": True},
+    ),
+    "move_3pix_solid_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 3, "solid": True},
+    ),
+    "move_4pix_solid_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 4, "solid": True},
+    ),
+    "move_1pix_colorful_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 1, "solid": False},
+    ),
+    "move_2pix_colorful_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 2, "solid": False},
+    ),
+    "move_3pix_colorful_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 3, "solid": False},
+    ),
+    "move_4pix_colorful_wrapped_left": (
+        lambda rng, size, **kwargs: task_mirror(task_move_n_pix_wrapped(rng, size, **kwargs)),
+        {"move_pix": 4, "solid": False},
+    ),
     # Gravity tasks - right direction
     "gravity_right": (task_gravity, {}),
     "gravity_counting_right": (task_gravity_counting, {}),
     "gravity_antigravity_right": (task_gravity_antigravity, {}),
     "gravity_one_step_right": (task_gravity_one_step, {}),
     "gravity_weighted_colors_right": (task_gravity_weighted_colors, {}),
-
     # Gravity tasks - left direction (mirrored)
     "gravity_left": (lambda rng, size, **kwargs: task_mirror(task_gravity(rng, size, **kwargs)), {}),
     "gravity_counting_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_counting(rng, size, **kwargs)), {}),
-    "gravity_antigravity_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_antigravity(rng, size, **kwargs)), {}),
+    "gravity_antigravity_left": (
+        lambda rng, size, **kwargs: task_mirror(task_gravity_antigravity(rng, size, **kwargs)),
+        {},
+    ),
     "gravity_one_step_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_one_step(rng, size, **kwargs)), {}),
-    "gravity_weighted_colors_left": (lambda rng, size, **kwargs: task_mirror(task_gravity_weighted_colors(rng, size, **kwargs)), {}),
+    "gravity_weighted_colors_left": (
+        lambda rng, size, **kwargs: task_mirror(task_gravity_weighted_colors(rng, size, **kwargs)),
+        {},
+    ),
     # Block tasks
     "block_touch_dot": (task_block_touch_dot, {}),
     "block_touch_dot_1pix": (task_block_touch_dot_n_pix, {"move_pix": 1}),
@@ -1147,7 +1198,10 @@ ARC_1D_TASKS = {
     "move_block_by_own_size": (task_move_block_by_own_size, {}),
     # Pattern tasks
     "two_points_and_fill": (task_two_points_and_fill, {}),
-    "two_points_and_fill_inv": (lambda rng, size, **kwargs: task_inverse(task_two_points_and_fill(rng, size, **kwargs)), {}),
+    "two_points_and_fill_inv": (
+        lambda rng, size, **kwargs: task_inverse(task_two_points_and_fill(rng, size, **kwargs)),
+        {},
+    ),
     "copy_block_to_dots": (task_copy_block_to_dots, {}),
     "copy_block_to_dots_colors": (task_copy_block_to_dots_colors, {}),
     "repeat_pattern_full": (task_repeat_pattern_full, {}),
diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py
index 197c4d9d..1679fb50 100644
--- a/tests/test_arc_1d.py
+++ b/tests/test_arc_1d.py
@@ -1,6 +1,6 @@
 import pytest
 
-from reasoning_gym.cognition import Arc1DDataset, Arc1DConfig
+from reasoning_gym.cognition import Arc1DConfig, Arc1DDataset
 
 
 def test_arc_1d_config_validation():
diff --git a/tests/test_arc_1d_tasks.py b/tests/test_arc_1d_tasks.py
new file mode 100644
index 00000000..98d9d2ef
--- /dev/null
+++ b/tests/test_arc_1d_tasks.py
@@ -0,0 +1,122 @@
+import random
+
+import pytest
+
+from reasoning_gym.cognition.arc_1d_tasks import (
+    task_block_and_noise_remove,
+    task_block_and_noise_remove_inside,
+    task_block_scale_to_dot,
+    task_block_touch_dot,
+    task_block_touch_dot_n_pix,
+    task_change_to_five,
+    task_color_left_half_blocks,
+    task_copy_block_to_dots,
+    task_copy_block_to_dots_colors,
+    task_duplicate_block_from_seeds,
+    task_fill_from_pixel,
+    task_fill_until_collision,
+    task_gravity,
+    task_gravity_antigravity,
+    task_gravity_counting,
+    task_gravity_one_step,
+    task_gravity_weighted_colors,
+    task_identity,
+    task_inverse,
+    task_mark_size_two_blocks,
+    task_mirror,
+    task_move_block_by_own_size,
+    task_move_n_pix,
+    task_move_n_pix_wrapped,
+    task_paint_biggest_block,
+    task_recolor_blocks_by_size,
+    task_recolor_blocks_from_palette,
+    task_reflect_block_around_dot,
+    task_reflect_block_with_border_pixel,
+    task_reflect_block_with_border_pixel_random,
+    task_repeat_pattern_full,
+    task_sort_blocks_by_size,
+    task_sort_complete_sequence,
+    task_two_points_and_fill,
+)
+
+
+def test_all_arc_1d_tasks():
+    """Test that all ARC 1D task functions can be executed without exceptions."""
+    rng = random.Random(42)  # Fixed seed for reproducibility
+    size = 20  # Reasonable size for testing
+
+    # Test all task functions
+    # Fixed move_pix value for testing
+    move_pix = 2
+
+    # Test task augmentation functions
+    base_task = task_move_n_pix(rng, size, move_pix, True)
+    assert base_task is not None
+
+    mirrored = task_mirror(base_task)
+    assert mirrored is not None
+    assert mirrored["input"] == list(reversed(base_task["input"]))
+    assert mirrored["output"] == list(reversed(base_task["output"]))
+
+    inversed = task_inverse(base_task)
+    assert inversed is not None
+    assert inversed["input"] == base_task["output"]
+    assert inversed["output"] == base_task["input"]
+
+    identical = task_identity(base_task)
+    assert identical is not None
+    assert identical == base_task
+
+    tasks = [
+        (task_move_n_pix, {"move_pix": move_pix, "solid": True}),
+        (task_move_n_pix_wrapped, {"move_pix": move_pix, "solid": True}),
+        (task_gravity, {}),
+        (task_gravity_counting, {}),
+        (task_gravity_antigravity, {}),
+        (task_block_touch_dot, {}),
+        (task_block_touch_dot_n_pix, {"move_pix": move_pix}),
+        (task_block_scale_to_dot, {}),
+        (task_two_points_and_fill, {}),
+        (task_reflect_block_with_border_pixel, {}),
+        (task_reflect_block_with_border_pixel_random, {}),
+        (task_reflect_block_around_dot, {}),
+        (task_block_and_noise_remove, {}),
+        (task_block_and_noise_remove_inside, {}),
+        (task_copy_block_to_dots, {}),
+        (task_copy_block_to_dots_colors, {}),
+        (task_paint_biggest_block, {}),
+        (task_sort_blocks_by_size, {}),
+        (task_sort_complete_sequence, {}),
+        (task_recolor_blocks_by_size, {}),
+        (task_gravity_one_step, {}),
+        (task_move_block_by_own_size, {}),
+        (task_change_to_five, {}),
+        (task_recolor_blocks_from_palette, {}),
+        (task_duplicate_block_from_seeds, {}),
+        (task_fill_from_pixel, {}),
+        (task_mark_size_two_blocks, {}),
+        (task_fill_until_collision, {}),
+        (task_repeat_pattern_full, {}),
+        (task_gravity_weighted_colors, {}),
+        (task_color_left_half_blocks, {}),
+    ]
+
+    for task_func, kwargs in tasks:
+        # Try multiple times as some functions might return None for certain inputs
+        success = False
+        for _ in range(10):  # Try up to 10 times
+            try:
+                result = task_func(rng, size, **kwargs)
+                if result is not None:
+                    success = True
+                    # Basic structure checks
+                    assert isinstance(result, dict)
+                    assert "input" in result
+                    assert "output" in result
+                    assert len(result["input"]) == size
+                    assert len(result["output"]) == size
+                    break
+            except Exception as e:
+                pytest.fail(f"Task {task_func.__name__} failed with error: {str(e)}")
+
+        assert success, f"Task {task_func.__name__} always returned None in 10 attempts"