use native types List->list, Dict->dict, Set->set, Tuple->tuple

2026-04-19 12:58:07 +00:00 · 2025-02-21 15:13:19 +01:00 · 2025-02-21 15:13:19 +01:00 · 3e7ff3b084
commit 3e7ff3b084
parent 5d02064b5a
95 changed files with 754 additions and 760 deletions
--- a/eval/eval.py
+++ b/eval/eval.py
@ -5,7 +5,7 @@ import os
 import re
 import time
 from datetime import datetime
-from typing import Any, Dict, List
+from typing import Any

 from openai import AsyncOpenAI
 from tqdm.asyncio import tqdm_asyncio
@ -44,7 +44,7 @@ class AsyncOpenRouterEvaluator:
        match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
        return match.group(1).strip() if match else response

-    async def process_single_question(self, entry: Dict, dataset) -> Dict:
+    async def process_single_question(self, entry: dict, dataset) -> dict:
        """Process a single question and return the result."""
        response = await self.get_model_response(entry["question"])
        answer = self.parse_model_response(response)
@ -58,7 +58,7 @@ class AsyncOpenRouterEvaluator:
            "metadata": entry["metadata"],
        }

-    async def evaluate_dataset(self, dataset_config: Dict[str, Any]) -> Dict[str, Any]:
+    async def evaluate_dataset(self, dataset_config: dict[str, Any]) -> dict[str, Any]:
        """Evaluate a single dataset with concurrent question processing."""
        dataset_name = dataset_config.pop("name")
        print(f"\nEvaluating dataset: {dataset_name}")
@ -92,7 +92,7 @@ class AsyncOpenRouterEvaluator:
            print(f"Error evaluating dataset {dataset_name}: {str(e)}")
            return None

-    async def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    async def evaluate_datasets(self, dataset_configs: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Evaluate multiple datasets concurrently."""
        tasks = [self.evaluate_dataset(config) for config in dataset_configs]

--- a/eval/r1/eval.py
+++ b/eval/r1/eval.py
@ -5,7 +5,7 @@ import logging
 import os
 from dataclasses import asdict
 from datetime import datetime
-from typing import Any, Dict, List
+from typing import Any

 import aiohttp
 from eval_config import EvalConfig
@ -32,7 +32,7 @@ class OpenRouterEvaluator:
        }
        self.semaphore = asyncio.Semaphore(10)  # Control concurrency

-    def save_results(self, results: List[Dict[str, Any]], dataset, dataset_name) -> Dict[str, Any]:
+    def save_results(self, results: list[dict[str, Any]], dataset, dataset_name) -> dict[str, Any]:
        file_name = f"{self.output_dir}/{dataset_name}.json"
        total_score = sum(r["score"] for r in results)

@ -52,7 +52,7 @@ class OpenRouterEvaluator:
            json.dump(metrics, f, indent=2)
        return metrics

-    def prepare_messages(self, prompt: str) -> List[Dict[str, str]]:
+    def prepare_messages(self, prompt: str) -> list[dict[str, str]]:
        return {
            "model": self.model,
            "messages": [
@ -92,7 +92,7 @@ class OpenRouterEvaluator:

        raise Exception("Failed to get valid response after retries")

-    async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> Dict[str, Any]:
+    async def process_entry(self, session: aiohttp.ClientSession, dataset: Any, entry: Any) -> dict[str, Any]:
        """Process a single entry with concurrency control."""
        async with self.semaphore:
            response = await self.get_model_response(session, entry["question"])
@ -108,7 +108,7 @@ class OpenRouterEvaluator:
                "metadata": str(entry["metadata"]),
            }

-    async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> Dict[str, Any]:
+    async def evaluate_dataset(self, session: aiohttp.ClientSession, dataset_name: str) -> dict[str, Any]:
        """Evaluate a single dataset asynchronously."""
        self.logger.info(f"\nEvaluating dataset: {dataset_name}")
        dataset = reasoning_gym.create_dataset(
@ -119,7 +119,7 @@ class OpenRouterEvaluator:
        results = await asyncio.gather(*tasks)
        return self.save_results(results, dataset, dataset_name)

-    async def evaluate_datasets(self) -> List[Dict[str, Any]]:
+    async def evaluate_datasets(self) -> list[dict[str, Any]]:
        """Main async evaluation entry point."""
        all_results = []
        async with aiohttp.ClientSession(headers=self.headers) as session:
--- a/eval/r1/eval_config.py
+++ b/eval/r1/eval_config.py
@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union
+from typing import Union

 import yaml

@ -9,7 +9,7 @@ from reasoning_gym.utils import SYSTEM_PROMPTS
@dataclass
 class EvalConfig:
    category: str
-    datasets: Union[str, List[str]]
+    datasets: Union[str, list[str]]
    eval_dir: str
    dataset_size: int
    dataset_seed: int
--- a/examples/OpenRLHF/custom_reward.py
+++ b/examples/OpenRLHF/custom_reward.py
@ -6,7 +6,7 @@ import math
 import os
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional

 import torch
 import torch.nn as nn
@ -109,7 +109,7 @@ class AlgorithmicRewardExperienceMaker(NaiveExperienceMaker):
        self.dataset = dataset

    @torch.no_grad()
-    def generate_samples(self, all_prompts: List[Tuple[str, Any]], **generate_kwargs) -> List[Samples]:
+    def generate_samples(self, all_prompts: list[tuple[str, Any]], **generate_kwargs) -> list[Samples]:
        """
        Generate samples and return in batches.
        """
--- a/examples/veRL/main_ppo_custom_reward_server.py
+++ b/examples/veRL/main_ppo_custom_reward_server.py
@ -1,7 +1,7 @@
 # This example is an adapted version of Bytedance's code:
 # https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/verl/trainer/main_ppo.py
 import os
-from typing import Dict, List, Optional
+from typing import Optional

 import hydra
 import ray
@ -64,12 +64,12 @@ class ReasoningGymDataset(Dataset):
            self.client.create_experiment(dataset_name, config)

        # Cache for batches
-        self._batch_cache: dict[int, List[BatchEntry]] = {}
+        self._batch_cache: dict[int, list[BatchEntry]] = {}

    def __len__(self) -> int:
        return self.size

-    def _get_batch(self, batch_idx: int) -> List[BatchEntry]:
+    def _get_batch(self, batch_idx: int) -> list[BatchEntry]:
        """Fetch or retrieve cached batch"""
        if batch_idx not in self._batch_cache:
            base_index = batch_idx * self.batch_size
--- a/examples/word_ladder/main.py
+++ b/examples/word_ladder/main.py
@ -14,7 +14,7 @@ from typing import Any, Dict
 from examples.word_ladder.utils import create_word_ladders, generate_reasoning


-def create_dataset(jsonl_path: Path, config: Dict[str, Any]) -> bool:
+def create_dataset(jsonl_path: Path, config: dict[str, Any]) -> bool:
    """
    Creates the word ladder dataset, handling potential exhaustion gracefully.

--- a/notebooks/gsm-symbolic-cot.txt
+++ b/notebooks/gsm-symbolic-cot.txt
@ -15,7 +15,7 @@ OUTPUT 1: Output in the form which should be generated
 from random import Random
 from typing import Dict, Any

-def generate_from_variables(time_per_interval: int, distance_per_interval: int, total_distance: int) -> Dict[str, Any]:
+def generate_from_variables(time_per_interval: int, distance_per_interval: int, total_distance: int) -> dict[str, Any]:
    intervals = total_distance // distance_per_interval
    total_time = intervals * time_per_interval

@ -36,7 +36,7 @@ def generate_from_variables(time_per_interval: int, distance_per_interval: int,
        }}
    }}

-def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
+def generate_example(rng: Random, difficulty: float = 1.0) -> dict[str, Any]:
    # Generate random values scaled by difficulty
    distance_per_interval = int(rng.randint(2, int(10 * difficulty)))
    time_per_interval = int(rng.randint(5, int(30 * difficulty)))
@ -57,7 +57,7 @@ def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
        }}
    }}

-def original_example() -> Dict[str, Any]:
+def original_example() -> dict[str, Any]:
   return generate_from_variables(10, 3, 42)
 ```

@ -79,7 +79,7 @@ from random import Random
 from typing import Dict, Any

 def generate_from_variables(name: str, food: str, rate_per_min: int, batch_size: int,
-                          time_per_batch: int, total_amount: int) -> Dict[str, Any]:
+                          time_per_batch: int, total_amount: int) -> dict[str, Any]:
    peel_time = total_amount // rate_per_min
    num_batches = total_amount // batch_size
    cook_time = num_batches * time_per_batch
@ -110,7 +110,7 @@ def generate_from_variables(name: str, food: str, rate_per_min: int, batch_size:
        }
    }

-def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
+def generate_example(rng: Random, difficulty: float = 1.0) -> dict[str, Any]:
    names = ["Emily", "Sarah", "Emma", "Sophia", "Olivia", "Ava", "Isabella", "Mia"]
    foods = ["shrimp", "onion", "carrot", "mushroom", "clam"]

@ -139,7 +139,7 @@ def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
        }
    }

-def original_example() -> Dict[str, Any]:
+def original_example() -> dict[str, Any]:
    return generate_from_variables("Emily", "shrimp", 6, 30, 10, 90)
 ```

@ -161,7 +161,7 @@ from random import Random
 from typing import Dict, Any

 def generate_from_variables(family: str, item: str, total: int, n1: int, n2: int,
-                          flavor1: str, flavor2: str, flavor3: str) -> Dict[str, Any]:
+                          flavor1: str, flavor2: str, flavor3: str) -> dict[str, Any]:
    n3 = total - (n1 + n2)

    question = f"The {family} family is busy making {item}s. So far, they've made {total} {item}s. They have {n1} {flavor1} {item}s, {n2} {flavor2} {item}s, and some {flavor3} {item}s. How many {flavor3} {item}s have they made?"
@ -186,7 +186,7 @@ def generate_from_variables(family: str, item: str, total: int, n1: int, n2: int
        }
    }

-def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
+def generate_example(rng: Random, difficulty: float = 1.0) -> dict[str, Any]:
    families = ["Smith", "Johnson", "Williams", "Brown", "Jones"]
    items = ["cupcake", "muffin", "brownie", "biscuit"]
    flavors = ["vanilla", "strawberry", "blueberry", "lemon", "peanut butter"]
@ -217,7 +217,7 @@ def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
        }
    }

-def original_example() -> Dict[str, Any]:
+def original_example() -> dict[str, Any]:
    return generate_from_variables("Adams", "cookie", 7995, 2595, 3075,
                                 "rainbow", "oatmeal", "chocolate chip")
 ```
@ -241,7 +241,7 @@ from typing import Dict, Any

 def generate_from_variables(name: str, event: str, food: str, obj: str,
                          package_husband: int, used_spoons: int,
-                          remaining_spoons: int) -> Dict[str, Any]:
+                          remaining_spoons: int) -> dict[str, Any]:

    total_spoons = remaining_spoons + used_spoons
    package_julia = total_spoons - package_husband
@ -268,7 +268,7 @@ def generate_from_variables(name: str, event: str, food: str, obj: str,
        }
    }

-def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
+def generate_example(rng: Random, difficulty: float = 1.0) -> dict[str, Any]:
    names = ['Emma', 'Olivia', 'Ava', 'Isabella', 'Sophia', 'Mia', 'Charlotte']
    events = ['lunch party', 'birthday party', 'potluck party', 'baby shower', 'game night']
    foods = ['roast chicken', 'grilled salmon', 'beef stew', 'vegetable lasagna',
@ -298,7 +298,7 @@ def generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:
        }
    }

-def original_example() -> Dict[str, Any]:
+def original_example() -> dict[str, Any]:
    return generate_from_variables('Julia', 'dinner party', 'stew', 'spoons',
                                 5, 3, 12)
 ```
--- a/reasoning_gym/algebra/complex_arithmetic.py
+++ b/reasoning_gym/algebra/complex_arithmetic.py
@ -2,7 +2,7 @@ import cmath
 import math
 import random
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -13,7 +13,7 @@ class ComplexArithmeticConfig:
    max_real: int = 10
    min_imag: int = -10
    max_imag: int = 10
-    operations: Tuple[str, ...] = ("+", "-", "*", "/")
+    operations: tuple[str, ...] = ("+", "-", "*", "/")
    seed: Optional[int] = None
    size: int = 500

--- a/reasoning_gym/algebra/intermediate_integration.py
+++ b/reasoning_gym/algebra/intermediate_integration.py
@ -241,7 +241,7 @@ In addition, when doing calculation, use the following instructions together wit
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the problem"""
        reward = 0.0
        metadata = entry["metadata"]
--- a/reasoning_gym/algebra/polynomial_equations.py
+++ b/reasoning_gym/algebra/polynomial_equations.py
@ -1,8 +1,7 @@
 import math
 import random
-import string
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Optional

 from sympy import Eq, Symbol, expand, solve

@ -21,7 +20,7 @@ class PolynomialEquationsConfig:
    max_value: int = 100  # Maximum value for coefficients
    min_degree: int = 1  # Minimum polynomial degree
    max_degree: int = 3  # Maximum polynomial degree
-    operators: Tuple[str, ...] = (
+    operators: tuple[str, ...] = (
        "+",
        "-",
    )  # Allowed operators between terms, Avoid adding '*' or '/' because they will affect the degree
@ -163,7 +162,7 @@ In solving the equations, please abide by the following instruction:

        return polynomial_expr

-    def _parse_score_to_list(self, answer: Optional[str]) -> List[float]:
+    def _parse_score_to_list(self, answer: Optional[str]) -> list[float]:
        """Parses a comma-separated string of scores into a sorted list of floats.

        This method takes a string containing comma-separated numeric values,
@ -193,7 +192,7 @@ In solving the equations, please abide by the following instruction:

        return sorted(output_float_vals)  # Return the sorted list of floats

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """
        Score an answer based on its numerical distance to oracle solutions using exponential decay.
        This function compares a predicted answer (or list of answers) to a set of oracle solutions
--- a/reasoning_gym/algebra/polynomial_multiplication.py
+++ b/reasoning_gym/algebra/polynomial_multiplication.py
@ -1,7 +1,6 @@
 import random
-import warnings
 from dataclasses import dataclass
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Optional

 import sympy as sp
 from sympy.polys.monomials import itermonomials
@ -23,10 +22,10 @@ class PolynomialMultiplicationConfig:
    max_degree: int = 3  # Maximum polynomial degree
    min_polynomials: int = 2  # Minimum number of polynomials being multiplied
    max_polynomials: int = 3  # Maximum number of polynomials being multiplied
-    variables: Tuple[str] = ("x", "y", "z")  # Tuple of variable names, that will be chosen randomly
+    variables: tuple[str] = ("x", "y", "z")  # Tuple of variable names, that will be chosen randomly
    allow_cross_variable_product: bool = False  # Generate tasks like "Multiply (x^2+3x-1)*(y^2-5)"
    allow_multivariate_polynomials: bool = False  # Generate multivariate tasks like "Multiply (2x^2 + 3y)*(5x^2+3x-1)"
-    operators: Tuple[str, ...] = (
+    operators: tuple[str, ...] = (
        "+",
        "-",
    )  # Allowed operators between terms, Avoid adding '*' or '/' because they will affect the degree
@ -146,7 +145,7 @@ In addition, When doing calculation, Use the following instructions together wit

        return polynomial_expr

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        reward = 0.0
        metadata = entry["metadata"]
        if answer is not None:
--- a/reasoning_gym/algebra/simple_equations.py
+++ b/reasoning_gym/algebra/simple_equations.py
@ -1,7 +1,7 @@
 import random
 import string
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional

 from sympy import Symbol

@ -69,7 +69,7 @@ class SimpleEquationsDataset(ProceduralDataset):
        """Get a random lowercase variable name"""
        return rng.choice(string.ascii_lowercase)

-    def _generate_equation(self, rng: random.Random, variable: str) -> Tuple[str, int]:
+    def _generate_equation(self, rng: random.Random, variable: str) -> tuple[str, int]:
        """Generate an equation and its solution

        Args:
--- a/reasoning_gym/algebra/simple_integration.py
+++ b/reasoning_gym/algebra/simple_integration.py
@ -86,7 +86,7 @@ In addition, When doing calculation, Use the following instructions together wit
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the problem"""
        reward = 0.0
        metadata = entry["metadata"]
--- a/reasoning_gym/algorithmic/ab.py
+++ b/reasoning_gym/algorithmic/ab.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -129,14 +129,14 @@ Return the final state of the program.
            "metadata": {},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the AB task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/algorithmic/base_conversion.py
+++ b/reasoning_gym/algorithmic/base_conversion.py
@ -2,7 +2,7 @@

 from dataclasses import dataclass
 from random import Random
-from typing import Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -61,7 +61,7 @@ class BaseConversionDataset(ProceduralDataset):
        else:
            return f"base-{base}"

-    def _generate_conversion(self, rng: Random) -> Tuple[int, int, int]:
+    def _generate_conversion(self, rng: Random) -> tuple[int, int, int]:
        """Generate random value and source/target bases"""
        value = rng.randint(self.config.min_value, self.config.max_value)

--- a/reasoning_gym/algorithmic/binary_matrix.py
+++ b/reasoning_gym/algorithmic/binary_matrix.py
@ -7,7 +7,7 @@ https://leetcode.com/problems/01-matrix/description/
 from collections import deque
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -112,7 +112,7 @@ class BinaryMatrixDataset(ProceduralDataset):
        """Get a string representation of the matrix"""
        return "\n".join(" ".join(str(x) for x in row) for row in matrix)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Overwrite this method in derived classes if a single oracle answer is not available."""
        oracle_answer = entry["answer"]
        if answer is not None:
--- a/reasoning_gym/algorithmic/cryptarithm.py
+++ b/reasoning_gym/algorithmic/cryptarithm.py
@ -211,14 +211,14 @@ class CryptarithmDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the Cryptarithm task.

        The function awards 1.0 for a correct format and answers for all alphabet pairs.

        Args:
            answer (Optional[str]): The user's answer already parsed by `extract_answer`
-            answer_str (Dict[str, any]): The original dataset answer_str containing the correct answer. ie "A=1,B=3..."
+            entry (dict[str, Any]): The original dataset entry containing the correct answer, e.g. "A=1,B=3..."

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/algorithmic/game_of_life.py
+++ b/reasoning_gym/algorithmic/game_of_life.py
@ -1,7 +1,7 @@
 import json
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 import cellpylib as cpl

@ -86,14 +86,14 @@ class GameOfLifeDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the GoL task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/algorithmic/graph_color.py
+++ b/reasoning_gym/algorithmic/graph_color.py
@ -1,7 +1,7 @@
 import json
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -209,14 +209,14 @@ Return your solution as a JSON map of vertices to colors. (For example: {{0: 1,
            "metadata": {"possible_answer": solution, "puzzle": puzzle},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the GraphColor task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/algorithmic/group_anagrams.py
+++ b/reasoning_gym/algorithmic/group_anagrams.py
@ -10,7 +10,7 @@ import json
 from collections import defaultdict
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..data import get_data_file_path
 from ..factory import ProceduralDataset, register_dataset
@ -88,7 +88,7 @@ class GroupAnagramsDataset(ProceduralDataset):
        anagrams = list(res.values())
        return self._sort_nested_list(anagrams)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Score a single Group Anagrams question"""
        reward = 0.0
        if answer is not None:
--- a/reasoning_gym/algorithmic/letter_jumble.py
+++ b/reasoning_gym/algorithmic/letter_jumble.py
@ -3,7 +3,7 @@
 import re
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from reasoning_gym.data import read_data_file

@ -123,14 +123,14 @@ class LetterJumbleDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves this task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/algorithmic/number_filtering.py
+++ b/reasoning_gym/algorithmic/number_filtering.py
@ -2,7 +2,7 @@

 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -39,7 +39,7 @@ class NumberFilteringDataset(ProceduralDataset):
        """Format a number with specified decimal places"""
        return f"{num:.{decimals}f}"

-    def _generate_numbers(self, rng: Random) -> Tuple[List[float], List[str]]:
+    def _generate_numbers(self, rng: Random) -> tuple[list[float], list[str]]:
        """Generate list of numbers and their string representations"""
        count = rng.randint(self.config.min_numbers, self.config.max_numbers)
        numbers = []
--- a/reasoning_gym/algorithmic/number_sorting.py
+++ b/reasoning_gym/algorithmic/number_sorting.py
@ -2,7 +2,7 @@

 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -46,7 +46,7 @@ Please follow the instruction below:
        # Reparse to ensure exact decimal representation
        return f"{float(formatted):.{decimals}f}"

-    def _generate_numbers(self, rng: Random) -> Tuple[List[float], List[str]]:
+    def _generate_numbers(self, rng: Random) -> tuple[list[float], list[str]]:
        """Generate list of numbers and their string representations"""
        count = rng.randint(self.config.min_numbers, self.config.max_numbers)
        decimals = rng.randint(self.config.min_decimals, self.config.max_decimals)
--- a/reasoning_gym/algorithmic/palindrome_generation.py
+++ b/reasoning_gym/algorithmic/palindrome_generation.py
@ -90,7 +90,7 @@ class PalindromeDataset(ProceduralDataset):
        """Return the palindrome string from the letter set."""
        return "".join(letters)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided is a valid palindrome.
        The answer is expected to be a single string

--- a/reasoning_gym/algorithmic/palindrome_partitioning.py
+++ b/reasoning_gym/algorithmic/palindrome_partitioning.py
@ -8,7 +8,7 @@ import json
 import string
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -93,7 +93,7 @@ class PalindromePartitioningDataset(ProceduralDataset):
        _partition(0)
        return self._sort_list(res)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Score a single Palindrome Partitioning question"""
        if answer is not None:
            try:
--- a/reasoning_gym/algorithmic/pool_matrix.py
+++ b/reasoning_gym/algorithmic/pool_matrix.py
@ -1,9 +1,8 @@
 """Perform average / max pooling on a matrix"""

-from copy import deepcopy
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 import numpy as np

@ -95,7 +94,7 @@ class PoolMatrixDataset(ProceduralDataset):
            ]
        )

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Score the answer based on the metadata"""

        reward = 0.0
--- a/reasoning_gym/algorithmic/ransom_note.py
+++ b/reasoning_gym/algorithmic/ransom_note.py
@ -7,7 +7,7 @@ https://leetcode.com/problems/ransom-note/description/
 from collections import defaultdict
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -95,14 +95,14 @@ class RansomNoteDataset(ProceduralDataset):
            "metadata": {"ransom_note": ransom_note, "magazine": magazine, "solution": answer, "solvable": solvable},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves this task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/algorithmic/sentence_reordering.py
+++ b/reasoning_gym/algorithmic/sentence_reordering.py
@ -92,7 +92,7 @@ class SentenceReorderingDataset(ProceduralDataset):
            "metadata": {"word_count": word_count},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        reward = 0.0
        expected_answer = entry["answer"]
        if answer is not None:
--- a/reasoning_gym/algorithmic/spell_backward.py
+++ b/reasoning_gym/algorithmic/spell_backward.py
@ -49,7 +49,7 @@ class SpellBackwardDataset(ProceduralDataset):
            "metadata": {"word": word, "word_len": len(word)},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        reward = 0.0
        expected_answer = entry["answer"]
        if answer is not None:
--- a/reasoning_gym/algorithmic/spiral_matrix.py
+++ b/reasoning_gym/algorithmic/spiral_matrix.py
@ -6,7 +6,7 @@ https://leetcode.com/problems/spiral-matrix/description/

 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -116,7 +116,7 @@ class SpiralMatrixDataset(ProceduralDataset):
            "metadata": {"matrix": matrix, "solution": answer},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Overwrite this method in derived classes if a single oracle answer is not available."""
        oracle_answer = entry["answer"].strip()

--- a/reasoning_gym/algorithmic/string_insertion.py
+++ b/reasoning_gym/algorithmic/string_insertion.py
@ -5,7 +5,7 @@ https://github.com/yongchao98/CodeSteer-v1.0/blob/main/create_dataset/create_dat

 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -80,7 +80,7 @@ class StringInsertionDataset(ProceduralDataset):
                i += 1
        return "".join(output)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Overwrite this method in derived classes if a single oracle answer is not available."""
        oracle_answer = entry["answer"]
        if answer is not None:
--- a/reasoning_gym/algorithmic/word_ladder.py
+++ b/reasoning_gym/algorithmic/word_ladder.py
@ -3,7 +3,7 @@
 from collections import deque
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Any, Optional

 from ..data import get_data_file_path
 from ..factory import ProceduralDataset, register_dataset
@ -82,7 +82,7 @@ class WordLadderDataset(ProceduralDataset):
        super().__init__(config=config, seed=config.seed, size=config.size)

    @classmethod
-    def _load_words_from_csv(cls, min_length: int = 3, max_length: int = 5) -> Dict[int, Set[str]]:
+    def _load_words_from_csv(cls, min_length: int = 3, max_length: int = 5) -> dict[int, set[str]]:
        """Load words from CSV file organized by length"""
        # Validate length range before processing
        assert 3 <= min_length <= max_length <= 5, "Word length must be between 3 and 5 inclusive"
@ -117,7 +117,7 @@ class WordLadderDataset(ProceduralDataset):

        return word_sets

-    def _get_neighbors(self, word: str, word_set: Set[str]) -> Set[str]:
+    def _get_neighbors(self, word: str, word_set: set[str]) -> set[str]:
        """Get neighbors from either precomputed graph or by computing on demand"""
        # Try precomputed graph first
        if len(word) in self.word_graphs and word in self.word_graphs[len(word)]:
@ -132,7 +132,7 @@ class WordLadderDataset(ProceduralDataset):
                    neighbors.add(neighbor)
        return neighbors

-    def _build_word_graph(self, word_length: int) -> Dict[str, Set[str]]:
+    def _build_word_graph(self, word_length: int) -> dict[str, set[str]]:
        """Build graph of word connections for given length, using caching"""
        # Return cached graph if it exists
        if word_length in self.word_graphs:
@ -156,7 +156,7 @@ class WordLadderDataset(ProceduralDataset):
        self.word_graphs[word_length] = graph
        return self.word_graphs[word_length]

-    def _find_path(self, start: str, end: str, word_set: Set[str]) -> Optional[List[str]]:
+    def _find_path(self, start: str, end: str, word_set: set[str]) -> Optional[list[str]]:
        """Simplified path finding using BFS for shortest paths"""
        # Early exit if words are direct neighbors
        if end in self._get_neighbors(start, word_set):
@ -181,7 +181,7 @@ class WordLadderDataset(ProceduralDataset):

        return None

-    def _generate_word_pair(self, rng: Random, length: int) -> Tuple[str, str, List[str]]:
+    def _generate_word_pair(self, rng: Random, length: int) -> tuple[str, str, list[str]]:
        """Simplified word pair generation"""
        word_set = self.word_sets[length]
        words_list = sorted(word_set)
@ -220,7 +220,7 @@ class WordLadderDataset(ProceduralDataset):
            "metadata": {"start_word": start, "end_word": end, "word_length": length, "chain_length": len(path)},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        if answer is None:
            return 0

--- a/reasoning_gym/algorithmic/word_sorting.py
+++ b/reasoning_gym/algorithmic/word_sorting.py
@ -4,7 +4,7 @@ import re
 from dataclasses import dataclass
 from enum import StrEnum
 from random import Random
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Optional

 from ..data import read_data_file
 from ..factory import ProceduralDataset, register_dataset
@ -84,7 +84,7 @@ class WordSortingDataset(ProceduralDataset):
            return "".join(c.upper() if rng.choice([True, False]) else c.lower() for c in word)
        return word  # ORIGINAL case

-    def _generate_words(self, rng: Random) -> Tuple[List[str], List[str]]:
+    def _generate_words(self, rng: Random) -> tuple[list[str], list[str]]:
        """Generate list of words and their transformed versions"""
        count = rng.randint(self.config.min_words, self.config.max_words)

@ -122,7 +122,7 @@ class WordSortingDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        oracle_answer = entry["metadata"]["sorted_words"]
        if answer is not None and len(answer) > 0:
            parsed_answer = [word.strip() for word in re.split(r",\s*", answer)]
--- a/reasoning_gym/arc/arc_1d_tasks.py
+++ b/reasoning_gym/arc/arc_1d_tasks.py
@ -1,13 +1,13 @@
 from random import Random
-from typing import Dict, List, Optional
+from typing import Optional


-def gen_field(size: int, color: int = 0) -> List[int]:
+def gen_field(size: int, color: int = 0) -> list[int]:
    """Generate a field of given size filled with specified color (default 0)."""
    return [color] * size


-def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
+def write_block(pos: int, block: list[int], field: list[int]) -> list[int]:
    """Write a block into a field at given position."""
    result = field.copy()
    for i, color in enumerate(block):
@ -15,7 +15,7 @@ def write_block(pos: int, block: List[int], field: List[int]) -> List[int]:
    return result


-def task_move_n_pix(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
+def task_move_n_pix(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block is moved to the right by move_pix pixels."""
    if size <= move_pix + 1:
        return None
@ -35,7 +35,7 @@ def task_move_n_pix(rng: Random, size: int, move_pix: int, solid: bool) -> Optio
    return {"input": question, "output": answer}


-def task_move_n_pix_wrapped(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[Dict[str, List[int]]]:
+def task_move_n_pix_wrapped(rng: Random, size: int, move_pix: int, solid: bool) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block is moved to the right by move_pix pixels with wrapping."""
    block_size = rng.randint(1, size)
    block_pos = rng.randint(0, size)
@ -56,7 +56,7 @@ def task_move_n_pix_wrapped(rng: Random, size: int, move_pix: int, solid: bool)
    return {"input": question, "output": answer}


-def task_gravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_gravity(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where all non-zero elements are attracted to the left."""
    density = 0.5
    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
@ -67,7 +67,7 @@ def task_gravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
    return {"input": question, "output": answer}


-def task_gravity_counting(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_gravity_counting(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where non-zero elements are counted and represented as a sequence of 1s."""
    density = 0.5
    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
@ -78,7 +78,7 @@ def task_gravity_counting(rng: Random, size: int) -> Optional[Dict[str, List[int
    return {"input": question, "output": answer}


-def task_gravity_antigravity(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_gravity_antigravity(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where color 1 moves right and color 2 moves left."""
    density = 0.5
    question = [rng.randint(1, 2) if rng.random() < density else 0 for _ in range(size)]
@ -90,7 +90,7 @@ def task_gravity_antigravity(rng: Random, size: int) -> Optional[Dict[str, List[
    return {"input": question, "output": answer}


-def task_block_touch_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_block_touch_dot(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block moves to touch (but not cover) a dot."""
    dot_color = 1
    block_color = rng.randint(2, 9)
@ -129,7 +129,7 @@ def task_block_touch_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]
    return {"input": question, "output": answer}


-def task_block_touch_dot_n_pix(rng: Random, size: int, move_pix: int) -> Optional[Dict[str, List[int]]]:
+def task_block_touch_dot_n_pix(rng: Random, size: int, move_pix: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block moves move_pix pixels toward a dot."""
    dot_color = 2
    block_color = rng.randint(3, 9)
@ -172,7 +172,7 @@ def task_block_touch_dot_n_pix(rng: Random, size: int, move_pix: int) -> Optiona
    return {"input": question, "output": answer}


-def task_block_scale_to_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_block_scale_to_dot(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block scales to touch a dot (keeping one end fixed)."""
    dot_color = 2
    block_color = rng.randint(3, 9)
@ -213,7 +213,7 @@ def task_block_scale_to_dot(rng: Random, size: int) -> Optional[Dict[str, List[i
    return {"input": question, "output": answer}


-def task_two_points_and_fill(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_two_points_and_fill(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where space between two points of same color is filled with that color."""
    color = rng.randint(1, 9)

@ -235,7 +235,7 @@ def task_two_points_and_fill(rng: Random, size: int) -> Optional[Dict[str, List[
    return {"input": question, "output": answer}


-def task_reflect_block_with_border_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_reflect_block_with_border_pixel(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block with a border pixel is reflected."""
    block_size = rng.randint(2, size)
    if block_size > size:
@ -262,7 +262,7 @@ def task_reflect_block_with_border_pixel(rng: Random, size: int) -> Optional[Dic
    return {"input": question, "output": answer}


-def task_reflect_block_with_border_pixel_random(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_reflect_block_with_border_pixel_random(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a random-colored block with a border pixel is reflected."""
    block_size = rng.randint(2, size)
    if block_size > size:
@ -290,7 +290,7 @@ def task_reflect_block_with_border_pixel_random(rng: Random, size: int) -> Optio
    return {"input": question, "output": answer}


-def task_reflect_block_around_dot(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_reflect_block_around_dot(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block is reflected around a dot."""
    dot_color = 2

@ -328,7 +328,7 @@ def task_reflect_block_around_dot(rng: Random, size: int) -> Optional[Dict[str,
    return {"input": question, "output": answer}


-def task_block_and_noise_remove(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_block_and_noise_remove(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where noise around a block needs to be removed."""
    block_size = rng.randint(2, size)
    if block_size > size:
@ -379,7 +379,7 @@ def task_block_and_noise_remove(rng: Random, size: int) -> Optional[Dict[str, Li
    return {"input": question, "output": answer}


-def task_block_and_noise_remove_inside(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_block_and_noise_remove_inside(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where noise inside a block needs to be removed."""
    if size <= 6:
        return None
@ -419,7 +419,7 @@ def task_block_and_noise_remove_inside(rng: Random, size: int) -> Optional[Dict[
    return {"input": question, "output": answer}


-def task_copy_block_to_dots(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_copy_block_to_dots(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block pattern is copied to dot positions."""
    block_size = 3 if rng.random() < 0.5 else 5
    if block_size >= size:
@ -456,7 +456,7 @@ def task_copy_block_to_dots(rng: Random, size: int) -> Optional[Dict[str, List[i
    return {"input": question, "output": answer}


-def task_copy_block_to_dots_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_copy_block_to_dots_colors(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block pattern is copied to dot positions with matching colors."""
    block_size = 3 if rng.random() < 0.5 else 5
    if block_size >= size:
@ -497,7 +497,7 @@ def task_copy_block_to_dots_colors(rng: Random, size: int) -> Optional[Dict[str,
    return {"input": question, "output": answer}


-def task_paint_biggest_block(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_paint_biggest_block(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where the largest block is painted a different color."""
    target_color = 1
    initial_color = rng.randint(2, 9)
@ -535,7 +535,7 @@ def task_paint_biggest_block(rng: Random, size: int) -> Optional[Dict[str, List[
    return {"input": question, "output": answer}


-def task_sort_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_sort_blocks_by_size(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where blocks are sorted by size with 1 pixel gaps."""
    color = rng.randint(1, 9)
    blocks = []
@ -579,7 +579,7 @@ def task_sort_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[
    return {"input": question, "output": answer}


-def task_sort_complete_sequence(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_sort_complete_sequence(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a complete sequence of block sizes is sorted."""
    # Calculate max possible block size given total array size
    max_size = 1
@ -617,7 +617,7 @@ def task_sort_complete_sequence(rng: Random, size: int) -> Optional[Dict[str, Li
    return {"input": question, "output": answer}


-def task_recolor_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_recolor_blocks_by_size(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where two blocks are recolored based on their size."""
    # Generate two different random sizes
    size1 = rng.randint(2, 8)
@ -656,7 +656,7 @@ def task_recolor_blocks_by_size(rng: Random, size: int) -> Optional[Dict[str, Li
    return {"input": question, "output": answer}


-def task_gravity_one_step(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_gravity_one_step(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where non-zero elements move one step left if possible."""
    question = [rng.randint(1, 9) if rng.random() < 0.5 else 0 for _ in range(size)]
    answer = question.copy()
@ -670,7 +670,7 @@ def task_gravity_one_step(rng: Random, size: int) -> Optional[Dict[str, List[int
    return {"input": question, "output": answer}


-def task_move_block_by_own_size(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_move_block_by_own_size(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block moves right by its own size."""
    block_size = rng.randint(1, size // 2)  # Ensure space for movement
    pos = rng.randint(0, size - block_size * 2)  # Space for block and movement
@ -685,7 +685,7 @@ def task_move_block_by_own_size(rng: Random, size: int) -> Optional[Dict[str, Li
    return {"input": question, "output": answer}


-def task_change_to_five(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_change_to_five(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where all non-zero colors change to 5."""
    density = 0.5
    question = [rng.randint(1, 9) if rng.random() < density else 0 for _ in range(size)]
@ -694,7 +694,7 @@ def task_change_to_five(rng: Random, size: int) -> Optional[Dict[str, List[int]]
    return {"input": question, "output": answer}


-def task_recolor_blocks_from_palette(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_recolor_blocks_from_palette(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where blocks are recolored using a color palette."""
    # Generate blocks of same size
    block_size = rng.randint(2, 4)
@ -750,7 +750,7 @@ def task_recolor_blocks_from_palette(rng: Random, size: int) -> Optional[Dict[st
    return {"input": question, "output": answer}


-def task_duplicate_block_from_seeds(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_duplicate_block_from_seeds(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a block is duplicated from seed pixels."""
    block_size = rng.randint(2, 4)
    if block_size + 1 >= size:
@ -812,7 +812,7 @@ def task_duplicate_block_from_seeds(rng: Random, size: int) -> Optional[Dict[str
    return {"input": question, "output": answer}


-def task_fill_from_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_fill_from_pixel(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a pixel fills in one direction until hitting another pixel."""
    block_size = rng.randint(3, 6)
    if block_size >= size - 2:
@ -856,7 +856,7 @@ def task_fill_from_pixel(rng: Random, size: int) -> Optional[Dict[str, List[int]
    return {"input": question, "output": answer}


-def task_mark_size_two_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_mark_size_two_blocks(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where size-2 blocks are marked with surrounding pixels."""
    blocks = []
    pos = 0
@ -908,7 +908,7 @@ def task_mark_size_two_blocks(rng: Random, size: int) -> Optional[Dict[str, List
    return {"input": question, "output": answer}


-def task_fill_until_collision(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_fill_until_collision(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where pixels fill empty space until collision."""
    # At least 4 positions for meaningful puzzle
    if size < 4:
@ -975,7 +975,7 @@ def task_fill_until_collision(rng: Random, size: int) -> Optional[Dict[str, List
    return {"input": question, "output": answer}


-def task_repeat_pattern_full(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_repeat_pattern_full(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where a pattern is repeated to fill the space."""
    # Generate initial pattern
    pattern_size = rng.randint(2, 5)
@ -1007,7 +1007,7 @@ def task_repeat_pattern_full(rng: Random, size: int) -> Optional[Dict[str, List[
    return {"input": question, "output": answer}


-def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where color 2 is heavier than color 1 in gravity."""
    # Generate random field with only colors 1 and 2
    question = [rng.randint(1, 2) if rng.random() < 0.5 else 0 for _ in range(size)]
@ -1030,7 +1030,7 @@ def task_gravity_weighted_colors(rng: Random, size: int) -> Optional[Dict[str, L
    return {"input": question, "output": answer}


-def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, List[int]]]:
+def task_color_left_half_blocks(rng: Random, size: int) -> Optional[dict[str, list[int]]]:
    """Generate a task where left half of blocks are colored differently."""
    pos = 0
    question = gen_field(size)
@ -1063,21 +1063,21 @@ def task_color_left_half_blocks(rng: Random, size: int) -> Optional[Dict[str, Li
    return {"input": question, "output": answer}


-def task_mirror(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+def task_mirror(task_result: Optional[dict[str, list[int]]]) -> Optional[dict[str, list[int]]]:
    """Mirror the input and output arrays of a task result."""
    if task_result is None:
        return None
    return {"input": list(reversed(task_result["input"])), "output": list(reversed(task_result["output"]))}


-def task_inverse(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+def task_inverse(task_result: Optional[dict[str, list[int]]]) -> Optional[dict[str, list[int]]]:
    """Swap the input and output arrays of a task result."""
    if task_result is None:
        return None
    return {"input": task_result["output"], "output": task_result["input"]}


-def task_identity(task_result: Optional[Dict[str, List[int]]]) -> Optional[Dict[str, List[int]]]:
+def task_identity(task_result: Optional[dict[str, list[int]]]) -> Optional[dict[str, list[int]]]:
    """Return the task result unchanged."""
    return task_result

--- a/reasoning_gym/arc/board_format.py
+++ b/reasoning_gym/arc/board_format.py
@ -1,5 +1,4 @@
 from dataclasses import dataclass, field
-from typing import List, Tuple

 ARC_PROMPT_TEMPLATE = """Find the common rule that maps an input grid to an output grid, given the examples below.

@ -21,7 +20,7 @@ class BoardFormattingOptions:


 def format_board(
-    board: List[List[int]], formatting_options: BoardFormattingOptions, with_board_shape: bool = False
+    board: list[list[int]], formatting_options: BoardFormattingOptions, with_board_shape: bool = False
 ) -> str:
    """
    Format a board as a string
@ -65,7 +64,7 @@ def format_board(

 def format_board_pair(
    index: int,
-    pair: dict[str, List[List[int]]],
+    pair: dict[str, list[list[int]]],
    formatting_options: BoardFormattingOptions,
 ) -> str:
    """
@ -82,7 +81,7 @@ def format_board_pair(
    return f"Example {index}:\n\nInput:\n{input_element}\nOutput:\n{output_element}\n\n"


-def parse_board(formatted_str: str, formatting_options: BoardFormattingOptions) -> Tuple[Tuple[int, ...], ...]:
+def parse_board(formatted_str: str, formatting_options: BoardFormattingOptions) -> tuple[tuple[int, ...], ...]:
    """
    Convert a formatted board string back to a tuple grid using formatting options
    """
--- a/reasoning_gym/arc/rearc.py
+++ b/reasoning_gym/arc/rearc.py
@ -95,7 +95,7 @@ class ReArcDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: str, entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: str, entry: dict[str, Any]) -> float:
        reward = 0.0
        metadata = entry["metadata"]
        if answer is not None:
--- a/reasoning_gym/arc/rearc_utils/utils.py
+++ b/reasoning_gym/arc/rearc_utils/utils.py
@ -1,5 +1,5 @@
 import random
-from typing import Any, List, Tuple
+from typing import Any

 from .dsl import *

@ -40,7 +40,7 @@ def get_pso_difficulty(example: dict) -> float:
    return (pix_pct + col_pct + obj_dens) / 3


-def unifint(rng: random.Random, diff_lb: float, diff_ub: float, bounds: Tuple[int, int]) -> int:
+def unifint(rng: random.Random, diff_lb: float, diff_ub: float, bounds: tuple[int, int]) -> int:
    """
    rng
    diff_lb: lower bound for difficulty, must be in range [0, diff_ub]
@ -83,7 +83,7 @@ def strip_prefix(string: str, prefix: str) -> str:
    return string[len(prefix) :]


-def format_grid(grid: List[List[int]]) -> Grid:
+def format_grid(grid: list[list[int]]) -> Grid:
    """
    grid type casting
    """
--- a/reasoning_gym/arithmetic/basic_arithmetic.py
+++ b/reasoning_gym/arithmetic/basic_arithmetic.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Literal, Optional

 from reasoning_gym import utils

@ -234,7 +234,7 @@ class BasicArithmeticDataset(ProceduralDataset):
            template = rng.choice(templates)
            return template.format(expression)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        oracle_answer = entry["answer"].strip()
        return utils.compute_reward(answer, oracle_answer, allow_commas=False)

--- a/reasoning_gym/arithmetic/calendar_arithmetic.py
+++ b/reasoning_gym/arithmetic/calendar_arithmetic.py
@ -4,7 +4,7 @@ import random
 from dataclasses import dataclass
 from datetime import date, timedelta
 from enum import Enum, StrEnum, auto
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -51,7 +51,7 @@ class CalendarTask(StrEnum):
@dataclass
 class CalendarArithmeticConfig:
    year: int = 2022
-    tasks: Optional[List[str]] = None
+    tasks: Optional[list[str]] = None
    offset_upper_bound: int = 100
    leap_year_range: int = 200
    seed: Optional[int] = 42
@ -131,7 +131,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
            "metadata": metadata,
        }

-    def _weekday_offset(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _weekday_offset(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        Task: Given a starting date and a day offset (which may be positive or negative),
        ask what day of the week it will be.
@ -170,7 +170,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        }
        return question, target_weekday, metadata

-    def _weekday_of_date(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _weekday_of_date(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        task: Ask what day of the week a given date was.
        example:
@ -193,7 +193,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        }
        return question, answer_weekday, metadata

-    def _weekday_of_date_from_first_day(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _weekday_of_date_from_first_day(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        task: Given an hypothetical weekday for January 1, ask what weekday a later date in the year falls on.
        example:
@ -235,7 +235,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        }
        return question, answer_weekday, metadata

-    def _recurring_event_day(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _recurring_event_day(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        task: For a recurring event defined by an ordinal weekday pattern in a month,
        ask on which day of the month the event occurs.
@ -294,7 +294,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        }
        return question, str(event_day), metadata

-    def _count_days(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _count_days(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        task: Ask how many times a given weekday occurs in a specified range.
        example:
@ -334,7 +334,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        }
        return question, str(count), metadata

-    def _count_business_days(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _count_business_days(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        task: Count the number of business days (Monday-Friday) between two dates.
        example:
@ -385,7 +385,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        }
        return question, str(count), metadata

-    def _is_leap_year(self, rng: random.Random) -> Tuple[str, str, dict]:
+    def _is_leap_year(self, rng: random.Random) -> tuple[str, str, dict]:
        """
        task: Given a year, determine whether it is a leap year.
        example:
@ -426,7 +426,7 @@ class CalendarArithmeticDataset(ProceduralDataset):
        random_days = rng.randint(0, delta)
        return start_date + timedelta(days=random_days)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        # we suppose the answer is the last occurence of the expected answer type
        if answer is None:
            return 0.0
--- a/reasoning_gym/arithmetic/chain_sum.py
+++ b/reasoning_gym/arithmetic/chain_sum.py
@ -1,6 +1,6 @@
 import random
 from dataclasses import dataclass
-from typing import Dict, Optional
+from typing import Any, Optional

 from reasoning_gym import utils

@ -110,7 +110,7 @@ class ChainSumDataset(ProceduralDataset):
        expression = " ".join(expression_parts)
        return expression, result

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        oracle_answer = entry["answer"].strip()
        return utils.compute_reward(answer, oracle_answer)

--- a/reasoning_gym/arithmetic/decimal_arithmetic.py
+++ b/reasoning_gym/arithmetic/decimal_arithmetic.py
@ -2,7 +2,7 @@ import ast
 from dataclasses import dataclass
 from decimal import ROUND_HALF_UP, Decimal, getcontext
 from random import Random
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -25,7 +25,7 @@ class DecimalArithmeticConfig:
        ), "precision must be 2 or more higher than max_num_decimal_places"


-def build_grouped_expression(operands: List[str], operators: List[str], rng: Random) -> str:
+def build_grouped_expression(operands: list[str], operators: list[str], rng: Random) -> str:
    """
    Recursively build an arithmetic expression string from operands and operators,
    inserting parentheses at random.
@ -53,7 +53,7 @@ def generate_arithmetic_problem(
    min_num_decimal_places: int,
    max_num_decimal_places: int,
    terms: int = 2,
-    operations: Optional[List[str]] = None,
+    operations: Optional[list[str]] = None,
 ) -> str:
    """
    Generates a simple arithmetic problem with decimal numbers (as a string) formatted
@ -72,8 +72,8 @@ def generate_arithmetic_problem(
    if operations is None:
        operations = ["+", "-", "*", "/"]

-    operands: List[str] = []
-    operators: List[str] = []
+    operands: list[str] = []
+    operators: list[str] = []

    for i in range(terms):
        # Choose a random number of decimal places for this term.
@ -149,7 +149,7 @@ class DecimalArithmeticDataset(ProceduralDataset):
    def __init__(self, config: DecimalArithmeticConfig) -> None:
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def __getitem__(self, idx: int) -> Dict[str, Any]:
+    def __getitem__(self, idx: int) -> dict[str, Any]:
        """
        Generate a single arithmetic task.

@ -180,7 +180,7 @@ class DecimalArithmeticDataset(ProceduralDataset):

        return {"question": problem_str, "answer": answer, "metadata": {}}

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """
        Compares the user's answer (converted to Decimal) with the correct answer.
        Instead of requiring exact equality, we allow an error up to one unit in the
--- a/reasoning_gym/arithmetic/decimal_chain_sum.py
+++ b/reasoning_gym/arithmetic/decimal_chain_sum.py
@ -133,7 +133,7 @@ class DecimalChainSumDataset(ProceduralDataset):
        result = result.quantize(Decimal(f"0.{'0' * max(decimal_places)}"))
        return expression, result

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Score the answer by comparing decimal values instead of strings.
        Args:
            answer: The answer to score
--- a/reasoning_gym/arithmetic/dice.py
+++ b/reasoning_gym/arithmetic/dice.py
@ -2,7 +2,7 @@ from dataclasses import dataclass
 from functools import reduce
 from math import gcd
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -125,14 +125,14 @@ class DiceDataset(ProceduralDataset):
            "metadata": {},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the Dice task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/arithmetic/fraction_simplification.py
+++ b/reasoning_gym/arithmetic/fraction_simplification.py
@ -4,7 +4,7 @@ import re
 from dataclasses import dataclass
 from math import gcd
 from random import Random
-from typing import Any, Dict, Optional, Sequence, Tuple
+from typing import Any, Optional, Sequence

 from ..factory import ProceduralDataset, register_dataset

@ -42,7 +42,7 @@ class FractionSimplificationDataset(ProceduralDataset):
    def __init__(self, config: FractionSimplificationConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _generate_fraction(self, rng: Random) -> Tuple[int, int, int, int]:
+    def _generate_fraction(self, rng: Random) -> tuple[int, int, int, int]:
        """Generate a random fraction and its simplified form.
        Returns (numerator, denominator, simplified_num, simplified_den)"""
        # Try to generate valid fractions until we get one that meets our criteria
@ -134,7 +134,7 @@ class FractionSimplificationDataset(ProceduralDataset):
        except:
            return None

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]):
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]):
        reward = 0.0
        metadata = entry["metadata"]
        try:
--- a/reasoning_gym/arithmetic/gcd.py
+++ b/reasoning_gym/arithmetic/gcd.py
@ -4,7 +4,7 @@ from dataclasses import dataclass
 from functools import reduce
 from math import gcd
 from random import Random
-from typing import List, Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -34,7 +34,7 @@ class GCDDataset(ProceduralDataset):
    def __init__(self, config: GCDConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _generate_numbers(self, rng: Random) -> Tuple[List[int], int]:
+    def _generate_numbers(self, rng: Random) -> tuple[list[int], int]:
        """Generate a list of random positive integers and their GCD.
        Will try up to 3 times to find numbers with GCD > 1."""

--- a/reasoning_gym/arithmetic/gsm_symbolic/generators_00_49.py
+++ b/reasoning_gym/arithmetic/gsm_symbolic/generators_00_49.py
--- a/reasoning_gym/arithmetic/gsm_symbolic/generators_50_99.py
+++ b/reasoning_gym/arithmetic/gsm_symbolic/generators_50_99.py
--- a/reasoning_gym/arithmetic/lcm.py
+++ b/reasoning_gym/arithmetic/lcm.py
@ -4,7 +4,7 @@ from dataclasses import dataclass
 from functools import reduce
 from math import lcm
 from random import Random
-from typing import List, Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -34,11 +34,11 @@ class LCMDataset(ProceduralDataset):
    def __init__(self, config: LCMConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _generate_numbers(self, rng: Random) -> Tuple[List[int], int]:
+    def _generate_numbers(self, rng: Random) -> tuple[list[int], int]:
        """Generate a list of random positive integers and their LCM.
        Will try up to 3 times to find numbers with LCM < product."""

-        def calculate_product(nums: List[int]) -> int:
+        def calculate_product(nums: list[int]) -> int:
            return reduce(lambda x, y: x * y, nums)

        # Try up to 3 times to get LCM < product
--- a/reasoning_gym/arithmetic/leg_counting.py
+++ b/reasoning_gym/arithmetic/leg_counting.py
@ -93,7 +93,7 @@ class LegCountingDataset(ProceduralDataset):
    def __init__(self, config: LegCountingConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _generate_animals(self, rng: Random) -> Dict[str, int]:
+    def _generate_animals(self, rng: Random) -> dict[str, int]:
        """Generate a random set of animals and their counts"""
        num_types = rng.randint(self.config.min_animals, self.config.max_animals)
        animals = {}
--- a/reasoning_gym/arithmetic/number_format.py
+++ b/reasoning_gym/arithmetic/number_format.py
@ -2,7 +2,7 @@

 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -67,7 +67,7 @@ class NumberFormatDataset(ProceduralDataset):
                output.append(f"{candidate:.15e}")
        return output

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Overwrite this method in derived classes if a single oracle answer is not available."""
        oracle_answer = entry["metadata"]["solution"]
        if answer is not None and len(answer) > 0:
--- a/reasoning_gym/arithmetic/power_function.py
+++ b/reasoning_gym/arithmetic/power_function.py
@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from math import pow
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -46,7 +46,7 @@ class PowerFunctionDataset(ProceduralDataset):
    def __init__(self, config: PowerFunctionConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Overwrite this method in derived classes if a single oracle answer is not available."""
        oracle_answer = entry["answer"]
        if answer is not None:
--- a/reasoning_gym/arithmetic/prime_factorization.py
+++ b/reasoning_gym/arithmetic/prime_factorization.py
@ -3,7 +3,7 @@
 import math
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, List, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -29,7 +29,7 @@ class PrimeFactorizationDataset(ProceduralDataset):
    def __init__(self, config: PrimeFactorizationConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _prime_factors(self, n: int) -> List[int]:
+    def _prime_factors(self, n: int) -> list[int]:
        """Compute prime factors of a number"""
        factors = []
        d = 2
@ -44,11 +44,11 @@ class PrimeFactorizationDataset(ProceduralDataset):
                break
        return factors

-    def _normalize_answer(self, answer: str) -> List[int]:
+    def _normalize_answer(self, answer: str) -> list[int]:
        """Parse and sort factors from a string"""
        return sorted([int(factor.strip()) for factor in answer.split("×")])

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        oracle_answer = entry["answer"]
        reward = 0.0
        if answer is not None:
--- a/reasoning_gym/arithmetic/products.py
+++ b/reasoning_gym/arithmetic/products.py
@ -1,6 +1,6 @@
 import random
 from dataclasses import dataclass
-from typing import Dict, Optional
+from typing import Any, Optional

 from reasoning_gym import utils

@ -102,7 +102,7 @@ class ProductsDataset(ProceduralDataset):
        expression = " ".join(expression_parts)
        return expression, result

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        oracle_answer = entry["answer"].strip()
        return utils.compute_reward(answer, oracle_answer)

--- a/reasoning_gym/arithmetic/time_intervals.py
+++ b/reasoning_gym/arithmetic/time_intervals.py
@ -1,7 +1,7 @@
 import random
 from dataclasses import dataclass, field
 from datetime import date, datetime, time, timedelta
-from typing import List, Optional
+from typing import Optional

 import pytz
 from dateutil import parser
@ -19,7 +19,7 @@ class TimeIntervalsConfig:
    min_date: date = date(1900, 1, 1)
    max_date: date = date(3000, 1, 1)
    max_date_difference_days: int = 100
-    task_types: List[str] = field(
+    task_types: list[str] = field(
        default_factory=lambda: ["time", "time_seconds", "time_ms", "date", "datetime", "datetime_tz"]
    )
    seed: Optional[int] = None
--- a/reasoning_gym/coaching/base_curriculum.py
+++ b/reasoning_gym/coaching/base_curriculum.py
@ -11,7 +11,7 @@ class BaseCurriculum:
        self._attributes: dict[str, AttributeDefinition] = {}
        self._current_levels: dict[str, int] = {}

-    def generate_configuration(self, defaults: Optional[dict[str, any]] = None) -> ConfigT:
+    def generate_configuration(self, defaults: Optional[dict[str, Any]] = None) -> ConfigT:
        config_args = defaults.copy() if defaults is not None else {}
        for attr in self._attributes.values():
            if isinstance(attr, RangeAttributeDefinition):
--- a/reasoning_gym/coaching/coach.py
+++ b/reasoning_gym/coaching/coach.py
@ -6,7 +6,7 @@ from collections import OrderedDict
 from dataclasses import dataclass, field
 from pathlib import Path
 from statistics import mean, stdev
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Optional, Union

 from ..dataset import ProceduralDataset

@ -15,7 +15,7 @@ from ..dataset import ProceduralDataset
 class ScoreStats:
    """Container for score statistics with mean, std, min, max"""

-    scores: OrderedDict[Tuple[Tuple[str, Any], ...], Tuple[int, float, float, float, float]]
+    scores: OrderedDict[tuple[tuple[str, Any], ...], tuple[int, float, float, float, float]]

    def __str__(self) -> str:
        """Create a formatted report of the statistics
@ -41,7 +41,7 @@ class ScoreStats:
 class GroupedScores:
    """Container for grouped scores with total count"""

-    scores: OrderedDict[Tuple[Tuple[str, Any], ...], List[float]]
+    scores: OrderedDict[tuple[tuple[str, Any], ...], list[float]]
    total_scores: int

    def __str__(self) -> str:
@ -114,11 +114,11 @@ class GroupedScores:
 class ScoreBoard:
    """Tracks scores and metadata for coaching sessions"""

-    scores: List[float] = field(default_factory=list)
-    metadata: List[Dict[str, Any]] = field(default_factory=list)
-    conversations: List[Optional[List[Dict]]] = field(default_factory=list)
+    scores: list[float] = field(default_factory=list)
+    metadata: list[dict[str, Any]] = field(default_factory=list)
+    conversations: list[Optional[list[dict]]] = field(default_factory=list)

-    def add_score(self, score: float, metadata: Dict[str, Any], conversation: Optional[List[Dict]] = None) -> None:
+    def add_score(self, score: float, metadata: dict[str, Any], conversation: Optional[list[dict]] = None) -> None:
        """Add a new score entry with associated metadata and optional conversation

        Args:
@ -140,7 +140,7 @@ class ScoreBoard:
        """Return the number of stored scores"""
        return len(self.scores)

-    def _metadata_to_key(self, metadata: Dict[str, Any]) -> Tuple[Tuple[str, Any], ...]:
+    def _metadata_to_key(self, metadata: dict[str, Any]) -> tuple[tuple[str, Any], ...]:
        """Convert metadata dict to tuple of key-value pairs, sorted by key

        If source_dataset and source_index are present in metadata, they will be
@ -222,7 +222,7 @@ class Coach(ProceduralDataset):
        return self.dataset[idx]

    def score_answer(
-        self, answer: Optional[str], entry: Dict[str, Any], conversation: Optional[List[Dict]] = None
+        self, answer: Optional[str], entry: dict[str, Any], conversation: Optional[list[dict]] = None
    ) -> float:
        """Score answer and track results

--- a/reasoning_gym/coaching/registry.py
+++ b/reasoning_gym/coaching/registry.py
@ -1,6 +1,6 @@
 """Registry for managing active experiments."""

-from typing import Dict, List, Optional
+from typing import Optional

 from ..composite import CompositeConfig
 from .experiment import Experiment
@ -25,7 +25,7 @@ class ExperimentRegistry:
        """Get an experiment by name."""
        return self._experiments.get(name)

-    def list_experiments(self) -> List[str]:
+    def list_experiments(self) -> list[str]:
        """List all registered experiment names."""
        return list(self._experiments.keys())

--- a/reasoning_gym/code/bf.py
+++ b/reasoning_gym/code/bf.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 import bfi

@ -108,14 +108,14 @@ int main() {{
        # bf = Minify.minify(bf) # Is this necessary?
        return bf

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the BF task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/cognition/color_cube_rotation.py
+++ b/reasoning_gym/cognition/color_cube_rotation.py
@ -1,7 +1,7 @@
 import random
 from dataclasses import dataclass
 from enum import StrEnum
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -38,7 +38,7 @@ class Side(StrEnum):
 class Cube:
    """Represents a cube with colored sides"""

-    colors: Dict[Side, Color]
+    colors: dict[Side, Color]

    def rotate_front_to_top(self) -> None:
        """Rotate cube so front face becomes top"""
@ -162,7 +162,7 @@ class ColorCubeRotationDataset(ProceduralDataset):
            rotation_map[from_side]()

    def _generate_story(
-        self, initial_state: Dict[Side, Color], rotations: List[Side], target_side: Side, rng: random.Random
+        self, initial_state: dict[Side, Color], rotations: list[Side], target_side: Side, rng: random.Random
    ) -> str:
        """Generate story describing cube state and rotations"""
        # Describe initial state
@ -189,7 +189,7 @@ class ColorCubeRotationDataset(ProceduralDataset):

        return "\n".join(story_parts)

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        reward = 0.0
        metadata = entry["metadata"]
        if answer is not None:
--- a/reasoning_gym/cognition/figlet_fonts.py
+++ b/reasoning_gym/cognition/figlet_fonts.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 import pyfiglet

@ -119,7 +119,7 @@ class FigletFontDataset(ProceduralDataset):
            "metadata": {"font": chosen_font, "space_letters": self.config.space_letters},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the figlet task.

        The function awards 1.0 for a correct answer and 0.1 points for each correct letter in the correct position,
@ -127,7 +127,7 @@ class FigletFontDataset(ProceduralDataset):

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/cognition/number_sequences.py
+++ b/reasoning_gym/cognition/number_sequences.py
@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from enum import StrEnum
 from random import Random
-from typing import List, Optional
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -42,12 +42,12 @@ class NumberSequenceConfig:
 class PatternRule:
    """Represents a composable sequence pattern rule"""

-    def __init__(self, operations: List[Operation], parameters: List[int], subrules: List["PatternRule"] = None):
+    def __init__(self, operations: list[Operation], parameters: list[int], subrules: list["PatternRule"] = None):
        self.operations = operations
        self.parameters = parameters
        self.subrules = subrules or []

-    def apply(self, sequence: List[int], position: int) -> int:
+    def apply(self, sequence: list[int], position: int) -> int:
        """Apply the rule to generate the next number"""
        result = sequence[position]  # Start with current number

@ -75,7 +75,7 @@ class PatternRule:
        return result

    @classmethod
-    def compose(cls, rules: List["PatternRule"]) -> "PatternRule":
+    def compose(cls, rules: list["PatternRule"]) -> "PatternRule":
        """Create a new rule that composes multiple rules together"""
        return cls([Operation.COMPOSE], [0], subrules=rules)

@ -129,7 +129,7 @@ class PatternGenerator:

        return PatternRule(operations, parameters)

-    def is_interesting(self, sequence: List[int], max_value: int = 1000) -> bool:
+    def is_interesting(self, sequence: list[int], max_value: int = 1000) -> bool:
        """Check if sequence is interesting enough"""
        if not sequence:
            return False
--- a/reasoning_gym/cognition/rectangle_count.py
+++ b/reasoning_gym/cognition/rectangle_count.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -145,14 +145,14 @@ class RectangleCountDataset(ProceduralDataset):
            "metadata": {"puzzle": puzzle, "solution": answer},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the RectangleCount task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/cognition/rubiks_cube.py
+++ b/reasoning_gym/cognition/rubiks_cube.py
@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, List, Optional
+from typing import Any, Optional

 from magiccube.cube import Cube, CubeMove, CubeMoveType
 from magiccube.solver.basic.basic_solver import BasicSolver
@ -36,7 +36,7 @@ class RubiksCubeDataset(ProceduralDataset):
        ]
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _generate_random_moves(self, rng: Random, cube: Cube, num_steps: int = 50, wide=None) -> List[CubeMove]:
+    def _generate_random_moves(self, rng: Random, cube: Cube, num_steps: int = 50, wide=None) -> list[CubeMove]:
        """Generate a list of random moves (but don't apply them).
        By default scramble only uses wide moves to cubes with size >=4."""

@ -106,7 +106,7 @@ class RubiksCubeDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the cube"""
        reward = 0.0  # default reward
        if answer is not None:
--- a/reasoning_gym/composite.py
+++ b/reasoning_gym/composite.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass, replace
 from random import Random
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional

 import yaml

@ -30,7 +30,7 @@ class CompositeConfig:

    size: int = 500
    seed: Optional[int] = None
-    datasets: List[DatasetSpec] = None
+    datasets: list[DatasetSpec] = None

    def validate(self):
        """Validate configuration parameters"""
@ -120,7 +120,7 @@ class CompositeDataset(ProceduralDataset):

        return item

-    def update_dataset_config(self, dataset_name: str, config_updates: Dict[str, Any]) -> None:
+    def update_dataset_config(self, dataset_name: str, config_updates: dict[str, Any]) -> None:
        """Update configuration of a specific dataset

        Args:
@ -175,7 +175,7 @@ class CompositeDataset(ProceduralDataset):
                self.weights[i] = weight
                break

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Forward scoring to appropriate dataset"""
        dataset_name = entry["metadata"]["source_dataset"]
        return self.datasets[dataset_name].score_answer(answer, entry)
--- a/reasoning_gym/dataset.py
+++ b/reasoning_gym/dataset.py
@ -4,10 +4,10 @@ from abc import ABC, abstractmethod
 from collections.abc import Iterable, Sized
 from copy import deepcopy
 from random import Random
-from typing import Any, Dict, Iterator, Optional, Type, TypeVar
+from typing import Any, Iterator, Optional, Type, TypeVar


-class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
+class ProceduralDataset(ABC, Sized, Iterable[dict[str, Any]]):
    """Abstract base class for procedural dataset generators"""

    def __init__(self, config: Any, seed: Optional[int] = None, size: int = 500):
@ -28,7 +28,7 @@ class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
        self._current_idx = 0
        return self

-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
        """Get next item in iteration"""
        if self._current_idx >= self.size:
            raise StopIteration
@ -51,7 +51,7 @@ class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
        """
        raise NotImplementedError

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Overwrite this method in derived classes if a single oracle answer is not available."""
        oracle_answer = entry["answer"].strip()
        reward = 0.0
@ -70,7 +70,7 @@ class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
 T = TypeVar("T", bound="ProceduralDataset")


-class ReseedingDataset(Iterable[Dict[str, Any]]):
+class ReseedingDataset(Iterable[dict[str, Any]]):
    """Wrapper that makes any ProceduralDataset infinite by reseeding when reaching the end"""

    def __init__(self, dataset: T, chunk_size: int = 500):
@ -100,14 +100,14 @@ class ReseedingDataset(Iterable[Dict[str, Any]]):
        # Create new dataset instance with chunk config
        return self.dataset_cls(new_config)

-    def __iter__(self) -> Iterator[Dict[str, Any]]:
+    def __iter__(self) -> Iterator[dict[str, Any]]:
        """Make the dataset iterable"""
        self._current_chunk = 0
        self._current_dataset = self._create_chunk(0)
        self._current_idx = 0
        return self

-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
        """Get next item, creating new chunk if needed"""
        if self._current_idx >= self.chunk_size:
            # Move to next chunk
@ -119,6 +119,6 @@ class ReseedingDataset(Iterable[Dict[str, Any]]):
        self._current_idx += 1
        return item

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Forward scoring to the wrapped dataset's implementation"""
        return self.dataset.score_answer(answer, entry)
--- a/reasoning_gym/factory.py
+++ b/reasoning_gym/factory.py
@ -1,5 +1,5 @@
 from dataclasses import is_dataclass
-from typing import Dict, Type, TypeVar
+from typing import Type, TypeVar

 from .dataset import ProceduralDataset

@ -8,7 +8,7 @@ ConfigT = TypeVar("ConfigT")
 DatasetT = TypeVar("DatasetT", bound=ProceduralDataset)

 # Global registry of datasets
-DATASETS: Dict[str, tuple[Type[ProceduralDataset], Type]] = {}
+DATASETS: dict[str, tuple[Type[ProceduralDataset], Type]] = {}


 def register_dataset(name: str, dataset_cls: Type[DatasetT], config_cls: Type[ConfigT]) -> None:
--- a/reasoning_gym/games/countdown.py
+++ b/reasoning_gym/games/countdown.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional

 import sympy
 from sympy import Symbol, symbols
@ -89,7 +89,7 @@ class CountdownDataset(ProceduralDataset):
            },
        }

-    def _generate_candidate_expression(self, rng: Random, num_terms: int) -> Tuple[sympy.Expr, List[int], List[Symbol]]:
+    def _generate_candidate_expression(self, rng: Random, num_terms: int) -> tuple[sympy.Expr, list[int], list[Symbol]]:
        """Generate a candidate expression with random numbers and operators

        Args:
@ -140,7 +140,7 @@ class CountdownDataset(ProceduralDataset):

        return expr, numbers, syms

-    def _generate_expression(self, rng: Random) -> Tuple[str, List[int], int]:
+    def _generate_expression(self, rng: Random) -> tuple[str, list[int], int]:
        """Generate a valid expression and its result

        Returns:
@ -171,7 +171,7 @@ class CountdownDataset(ProceduralDataset):

        raise ValueError(f"Failed to generate valid expression after {max_attempts} attempts")

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the problem"""
        reward = 0.0
        metadata = entry["metadata"]
--- a/reasoning_gym/games/futoshiki.py
+++ b/reasoning_gym/games/futoshiki.py
@ -4,7 +4,7 @@ import copy
 import itertools
 from dataclasses import dataclass
 from random import Random
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -85,7 +85,7 @@ class FutoshikiDataset(ProceduralDataset):
        }

    def _puzzle_to_string(
-        self, puzzle_grid: List[List[int]], constraints: Dict[Tuple[Tuple[int, int], Tuple[int, int]], str]
+        self, puzzle_grid: list[list[int]], constraints: dict[tuple[tuple[int, int], tuple[int, int]], str]
    ) -> str:
        """
        Formats a Futoshiki puzzle grid as a string with constraints.
@ -161,9 +161,9 @@ class FutoshikiDataset(ProceduralDataset):

    def _solve_logical(
        self,
-        grid: List[List[int]],
-        constraints: Dict[Tuple[Tuple[int, int], Tuple[int, int]], str],
-    ) -> Tuple[List[List[int]], List[List[Set[int]]]]:
+        grid: list[list[int]],
+        constraints: dict[tuple[tuple[int, int], tuple[int, int]], str],
+    ) -> tuple[list[list[int]], list[list[set[int]]]]:
        """
        Apply logical rules to progress solution.
        Returns current state if logical rules can't progress further.
@ -172,7 +172,7 @@ class FutoshikiDataset(ProceduralDataset):
        size, working_grid = len(grid), copy.deepcopy(grid)

        # Starting point all numbers are candidates for all unfilled squares
-        candidates: List[List[Set[int]]] = [
+        candidates: list[list[set[int]]] = [
            [set(range(1, len(grid) + 1)) if grid[r][c] == 0 else {grid[r][c]} for c in range(len(grid))]
            for r in range(len(grid))
        ]
@ -214,7 +214,7 @@ class FutoshikiDataset(ProceduralDataset):

            # Eliminate candidates based on constraints
            # Based on currently filled values, eliminate candidates that violate constraints
-            def _eliminate_by_constraint(rc_less: Tuple[int, int], rc_greater: Tuple[int, int]) -> bool:
+            def _eliminate_by_constraint(rc_less: tuple[int, int], rc_greater: tuple[int, int]) -> bool:
                r_less, c_less = rc_less
                r_greater, c_greater = rc_greater
                progress = False
@ -399,9 +399,9 @@ class FutoshikiDataset(ProceduralDataset):

    def _solve(
        self,
-        grid: List[List[int]],
-        constraints: Dict[Tuple[Tuple[int, int], Tuple[int, int]], str],
-    ) -> List[List[int]] | None:
+        grid: list[list[int]],
+        constraints: dict[tuple[tuple[int, int], tuple[int, int]], str],
+    ) -> list[list[int]] | None:
        """
        Backtracking Futoshiki solver. Used to verify generated puzzles.
        Applies logical rules first then backtracks to fill gaps.
@ -442,11 +442,11 @@ class FutoshikiDataset(ProceduralDataset):

    def _is_valid(
        self,
-        grid: List[List[int]],
+        grid: list[list[int]],
        row: int,
        col: int,
        val: int,
-        constraints: Dict[Tuple[Tuple[int, int], Tuple[int, int]], str],
+        constraints: dict[tuple[tuple[int, int], tuple[int, int]], str],
    ) -> bool:
        """Check row, col, and inequality constraints for placing val in grid[row][col]."""
        size = len(grid)
@ -482,7 +482,7 @@ class FutoshikiDataset(ProceduralDataset):
        grid[row][col] = original_val
        return True

-    def _generate_random_solution(self, size: int, rng: Random) -> List[List[int]]:
+    def _generate_random_solution(self, size: int, rng: Random) -> list[list[int]]:
        """
        Generates a random valid completed Futoshiki solution with numbers 1..size.
        Ensures each row and column has unique numbers.
@ -514,8 +514,8 @@ class FutoshikiDataset(ProceduralDataset):
        raise ValueError("Could not generate a random solution.")

    def _generate_random_constraints(
-        self, solution: List[List[int]], difficulty: int, rng: Random
-    ) -> Dict[Tuple[Tuple[int, int], Tuple[int, int]], str]:
+        self, solution: list[list[int]], difficulty: int, rng: Random
+    ) -> dict[tuple[tuple[int, int], tuple[int, int]], str]:
        """
        Randomly add inequality constraints that match the solution.
        We only add constraints for adjacent cells (horizontal or vertical).
@ -570,10 +570,10 @@ class FutoshikiDataset(ProceduralDataset):

    def _remove_clues(
        self,
-        grid: List[List[int]],
-        constraints: Dict[Tuple[Tuple[int, int], Tuple[int, int]], str],
+        grid: list[list[int]],
+        constraints: dict[tuple[tuple[int, int], tuple[int, int]], str],
        rng: Random,
-    ) -> List[List[int]]:
+    ) -> list[list[int]]:
        """
        Remove clues from a full solution to try to maintain a unique-solution puzzle.
        We remove in random order until we reach our target, or can't without losing uniqueness.
--- a/reasoning_gym/games/knight_swap.py
+++ b/reasoning_gym/games/knight_swap.py
@ -2,7 +2,7 @@ import collections
 import json
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, FrozenSet, List, Optional, Set, Tuple
+from typing import FrozenSet, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -81,7 +81,7 @@ class KnightSwapLogic:
        return {abs(a_col - b_col), abs(a_row - b_row)} == {1, 2}

    @staticmethod
-    def is_connected(graph: Dict[str, List[str]]) -> bool:
+    def is_connected(graph: dict[str, list[str]]) -> bool:
        """Check if a graph is connected (all nodes reachable from any starting node)."""
        if not graph:
            return True
@ -98,7 +98,7 @@ class KnightSwapLogic:
        return len(visited) == len(graph)

    @staticmethod
-    def generate_board(num_nodes: int, rng: Random, max_attempts: int = 1000) -> Dict[str, List[str]]:
+    def generate_board(num_nodes: int, rng: Random, max_attempts: int = 1000) -> dict[str, list[str]]:
        """Generate a random connected board where edges represent valid knight moves."""
        candidates = ["A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3", "D1", "D2", "D3"]
        attempts = 0
@ -120,8 +120,8 @@ class KnightSwapLogic:

    @staticmethod
    def solve_swap(
-        board: Dict[str, List[str]], pieces: Dict[str, str], start_turn: str = "w"
-    ) -> Optional[List[Tuple[str, str, str]]]:
+        board: dict[str, list[str]], pieces: dict[str, str], start_turn: str = "w"
+    ) -> Optional[list[tuple[str, str, str]]]:
        """Find a sequence of moves to swap white and black pieces positions."""

        @dataclass(frozen=True)
@ -177,7 +177,7 @@ class KnightSwapDataset(ProceduralDataset):
        super().__init__(config=config, seed=config.seed, size=config.size)
        self.game_logic = KnightSwapLogic()

-    def _format_board(self, board: Dict[str, List[str]], pieces: Dict[str, str]) -> str:
+    def _format_board(self, board: dict[str, list[str]], pieces: dict[str, str]) -> str:
        """Format the board state as a string."""
        positions = list(board.keys())
        if not positions:
@ -206,13 +206,13 @@ class KnightSwapDataset(ProceduralDataset):

        return "\n".join(lines)

-    def _format_moves(self, moves: List[Tuple[str, str, str]]) -> str:
+    def _format_moves(self, moves: list[tuple[str, str, str]]) -> str:
        """Format the solution moves as a string."""
        if not moves:
            return "No"
        return json.dumps([f"{color},{start},{end}" for color, start, end in moves])

-    def __getitem__(self, idx: int) -> Dict:
+    def __getitem__(self, idx: int) -> dict:
        """Generate a single Knight Swap puzzle."""
        rng = Random(self.seed + idx)

@ -303,7 +303,7 @@ class KnightSwapDataset(ProceduralDataset):

        raise ValueError(f"Failed to generate valid puzzle after trying {self.config.max_attempts} different boards")

-    def score_answer(self, answer: Optional[str], entry: Dict) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict) -> float:
        """Score the user's solution for the Knight Swap puzzle.

        The answer should be either:
--- a/reasoning_gym/games/mini_sudoku.py
+++ b/reasoning_gym/games/mini_sudoku.py
@ -3,7 +3,7 @@
 import copy
 from dataclasses import dataclass
 from random import Random
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -46,7 +46,7 @@ class MiniSudokuDataset(ProceduralDataset):
        self._current_idx += 1
        return item

-    def _is_valid(self, board: List[List[int]], row: int, col: int, num: int) -> bool:
+    def _is_valid(self, board: list[list[int]], row: int, col: int, num: int) -> bool:
        """Check if number can be placed at position"""
        # Check row
        if num in board[row]:
@ -64,7 +64,7 @@ class MiniSudokuDataset(ProceduralDataset):
                    return False
        return True

-    def _solve(self, board: List[List[int]]) -> bool:
+    def _solve(self, board: list[list[int]]) -> bool:
        """Solve mini sudoku using backtracking"""
        empty = self._find_empty(board)
        if not empty:
@ -79,7 +79,7 @@ class MiniSudokuDataset(ProceduralDataset):
                board[row][col] = 0
        return False

-    def _find_empty(self, board: List[List[int]]) -> Optional[Tuple[int, int]]:
+    def _find_empty(self, board: list[list[int]]) -> Optional[tuple[int, int]]:
        """Find an empty cell"""
        for i in range(4):
            for j in range(4):
@ -87,7 +87,7 @@ class MiniSudokuDataset(ProceduralDataset):
                    return (i, j)
        return None

-    def _generate_solved_board(self, rng: Random) -> List[List[int]]:
+    def _generate_solved_board(self, rng: Random) -> list[list[int]]:
        """Generate a complete solved mini sudoku board"""
        board = [[0] * 4 for _ in range(4)]

@ -115,10 +115,10 @@ class MiniSudokuDataset(ProceduralDataset):

        raise RuntimeError("Failed to generate valid mini sudoku board")

-    def _count_solutions(self, board: List[List[int]], limit: int = 2) -> int:
+    def _count_solutions(self, board: list[list[int]], limit: int = 2) -> int:
        """Count the number of solutions for a given board"""

-        def _count_solutions_helper(board: List[List[int]]) -> int:
+        def _count_solutions_helper(board: list[list[int]]) -> int:
            empty = self._find_empty(board)
            if not empty:
                return 1
@ -136,7 +136,7 @@ class MiniSudokuDataset(ProceduralDataset):

        return _count_solutions_helper(board)

-    def _create_puzzle(self, solved_board: List[List[int]], num_empty: int, rng: Random) -> List[List[int]]:
+    def _create_puzzle(self, solved_board: list[list[int]], num_empty: int, rng: Random) -> list[list[int]]:
        """Create puzzle by removing numbers from solved board"""
        puzzle = [row[:] for row in solved_board]
        cells = [(i, j) for i in range(4) for j in range(4)]
@ -157,7 +157,7 @@ class MiniSudokuDataset(ProceduralDataset):

        return puzzle

-    def _board_to_string(self, board: List[List[int]]) -> str:
+    def _board_to_string(self, board: list[list[int]]) -> str:
        """Convert board to string representation"""
        return "\n".join(" ".join(str(x) if x != 0 else "_" for x in row) for row in board)

--- a/reasoning_gym/games/n_queens.py
+++ b/reasoning_gym/games/n_queens.py
@ -7,7 +7,7 @@ https://en.wikipedia.org/wiki/Eight_queens_puzzle
 from copy import deepcopy
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, List, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -65,7 +65,7 @@ class NQueensDataset(ProceduralDataset):
        super().__init__(config=config, seed=config.seed, size=config.size)
        self._solutions = self._get_all_solutions(config.n)

-    def _get_all_solutions(self, n: int) -> List[List[List[str]]]:
+    def _get_all_solutions(self, n: int) -> list[list[list[str]]]:
        """Get all solutions for the N Queens puzzle"""

        visited_cols = set()
@ -97,7 +97,7 @@ class NQueensDataset(ProceduralDataset):
        backtrack(0)
        return res

-    def _create_puzzle(self, solved_board: List[List[str]], num_removed: int, rng: Random) -> List[List[str]]:
+    def _create_puzzle(self, solved_board: list[list[str]], num_removed: int, rng: Random) -> list[list[str]]:
        """Create puzzle by removing queens from solved board"""
        puzzle = deepcopy(solved_board)
        queens = [(i, j) for i in range(len(puzzle)) for j in range(len(puzzle)) if puzzle[i][j] == "Q"]
@ -107,15 +107,15 @@ class NQueensDataset(ProceduralDataset):
            puzzle[x][y] = "_"
        return puzzle

-    def _board_to_string(self, board: List[List[str]]) -> str:
+    def _board_to_string(self, board: list[list[str]]) -> str:
        """Convert board to string representation"""
        return "\n".join(" ".join(x for x in row) for row in board)

-    def _string_to_board(self, board_str: str) -> List[List[str]]:
+    def _string_to_board(self, board_str: str) -> list[list[str]]:
        """Convert string representation to board"""
        return [list(row.split()) for row in board_str.strip().split("\n")]

-    def _is_tractable_solution(self, puzzle: List[List[str]], solution: List[List[str]]) -> bool:
+    def _is_tractable_solution(self, puzzle: list[list[str]], solution: list[list[str]]) -> bool:
        """Check if a solution is achievable from the starting state of the puzzle"""
        for r in range(len(puzzle)):
            for c in range(len(puzzle)):
@ -150,7 +150,7 @@ class NQueensDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        valid_solutions = entry["metadata"]["valid_answers"]
        if answer is not None:
            if answer in valid_solutions:
--- a/reasoning_gym/games/rush_hour.py
+++ b/reasoning_gym/games/rush_hour.py
@ -6,7 +6,7 @@ https://www.michaelfogleman.com/rush/
 import random
 import re
 from dataclasses import dataclass
-from typing import List, Optional, Tuple
+from typing import Optional

 from ..data import get_data_file_path
 from ..factory import ProceduralDataset, register_dataset
@ -105,7 +105,7 @@ class RushHourDataset(ProceduralDataset):
        super().__init__(config=config, seed=config.seed, size=config.size)

        # Load and filter puzzles from data file
-        self.puzzles: List[Tuple[str, int]] = []  # (board_config, min_moves)
+        self.puzzles: list[tuple[str, int]] = []  # (board_config, min_moves)

        data_path = get_data_file_path("rush_18k.txt")
        with data_path.open() as f:
--- a/reasoning_gym/games/sokoban.py
+++ b/reasoning_gym/games/sokoban.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 import numpy as np

@ -80,14 +80,14 @@ Here is your puzzle:
            "metadata": {"gamestr": gamestr, "difficulty": difficulty},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the Sokoban task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/games/sudoku.py
+++ b/reasoning_gym/games/sudoku.py
@ -3,7 +3,7 @@
 import copy
 from dataclasses import dataclass
 from random import Random
-from typing import Any, List, Optional, Set, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -47,7 +47,7 @@ class SudokuDataset(ProceduralDataset):
        self._current_idx += 1
        return item

-    def _is_valid(self, board: List[List[int]], row: int, col: int, num: int) -> bool:
+    def _is_valid(self, board: list[list[int]], row: int, col: int, num: int) -> bool:
        """Check if number can be placed at position"""
        # Check row
        if num in board[row]:
@ -65,7 +65,7 @@ class SudokuDataset(ProceduralDataset):
                    return False
        return True

-    def _get_possible_values(self, board: List[List[int]], row: int, col: int) -> Set[int]:
+    def _get_possible_values(self, board: list[list[int]], row: int, col: int) -> set[int]:
        """Get all possible values for a cell."""
        row_values = set(board[row])
        col_values = set(board[i][col] for i in range(9))
@ -80,7 +80,7 @@ class SudokuDataset(ProceduralDataset):
        used_values = row_values | col_values | box_values
        return set(range(1, 10)) - used_values

-    def _solve(self, board: List[List[int]]) -> bool:
+    def _solve(self, board: list[list[int]]) -> bool:
        """Solve sudoku using backtracking"""
        empty = self._find_empty(board)
        if not empty:
@ -94,7 +94,7 @@ class SudokuDataset(ProceduralDataset):
            board[row][col] = 0
        return False

-    def _find_empty(self, board: List[List[int]]) -> Optional[Tuple[int, int]]:
+    def _find_empty(self, board: list[list[int]]) -> Optional[tuple[int, int]]:
        """Find an empty cell"""
        for i in range(9):
            for j in range(9):
@ -102,7 +102,7 @@ class SudokuDataset(ProceduralDataset):
                    return (i, j)
        return None

-    def _generate_solved_board(self, rng: Random) -> List[List[int]]:
+    def _generate_solved_board(self, rng: Random) -> list[list[int]]:
        """Generate a complete solved sudoku board"""
        board = [[0] * 9 for _ in range(9)]

@ -120,10 +120,10 @@ class SudokuDataset(ProceduralDataset):
        self._solve(board)
        return board

-    def _count_solutions(self, board: List[List[int]], limit: int = 2) -> int:
+    def _count_solutions(self, board: list[list[int]], limit: int = 2) -> int:
        """Count the number of solutions for a given board"""

-        def _get_min_possibilities_cell(board: List[List[int]]) -> Optional[Tuple[int, int, Set[int]]]:
+        def _get_min_possibilities_cell(board: list[list[int]]) -> Optional[tuple[int, int, set[int]]]:
            """
            Get the cell with the lowest number of possibilities.
            Returns None if the board is already solved.
@ -145,7 +145,7 @@ class SudokuDataset(ProceduralDataset):

            return (*min_cell, min_values) if min_cell else None

-        def _count_solutions_helper(board: List[List[int]]) -> int:
+        def _count_solutions_helper(board: list[list[int]]) -> int:
            cell_info = _get_min_possibilities_cell(board)
            if not cell_info:
                return 1
@ -162,7 +162,7 @@ class SudokuDataset(ProceduralDataset):

        return _count_solutions_helper(board)

-    def _create_puzzle(self, solved_board: List[List[int]], num_empty: int, rng: Random) -> List[List[int]]:
+    def _create_puzzle(self, solved_board: list[list[int]], num_empty: int, rng: Random) -> list[list[int]]:
        """Create puzzle by removing numbers from solved board"""
        puzzle = [row[:] for row in solved_board]
        cells = [(i, j) for i in range(9) for j in range(9)]
@ -183,7 +183,7 @@ class SudokuDataset(ProceduralDataset):

        return puzzle

-    def _board_to_string(self, board: List[List[int]]) -> str:
+    def _board_to_string(self, board: list[list[int]]) -> str:
        """Convert board to string representation"""
        return "\n".join(" ".join(str(x) if x != 0 else "_" for x in row) for row in board)

--- a/reasoning_gym/games/tower_of_hanoi.py
+++ b/reasoning_gym/games/tower_of_hanoi.py
@ -4,7 +4,7 @@ import math
 import random
 import re
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -62,23 +62,23 @@ class MoveGenerator:
    It maintains the current state of all pegs to ensure move validity.
    """

-    def __init__(self, num_disks: int, pegs: List[int], start: int, target: int):
+    def __init__(self, num_disks: int, pegs: list[int], start: int, target: int):
        self.num_disks = num_disks
        self.pegs = pegs
        self.start = start
        self.target = target
        self.auxiliary_pegs = [peg for peg in pegs if peg not in (start, target)]
-        self.pegs_state: Dict[int, List[int]] = {peg: [] for peg in pegs}
+        self.pegs_state: dict[int, list[int]] = {peg: [] for peg in pegs}
        for disk in range(num_disks, 0, -1):  # Largest disk at the bottom
            self.pegs_state[start].append(disk)
-        self.moves: List[str] = []
-        self.memo: Dict[Tuple[int, int], int] = {}  # Memoization for T(n, k)
+        self.moves: list[str] = []
+        self.memo: dict[tuple[int, int], int] = {}  # Memoization for T(n, k)

-    def generate_moves(self) -> List[str]:
+    def generate_moves(self) -> list[str]:
        self.move(n=self.num_disks, source=self.start, target=self.target, auxiliary_pegs=self.auxiliary_pegs)
        return self.moves

-    def move(self, n: int, source: int, target: int, auxiliary_pegs: List[int]):
+    def move(self, n: int, source: int, target: int, auxiliary_pegs: list[int]):
        if n == 0:
            return
        if n == 1:
@ -175,10 +175,10 @@ class HanoiDataset(ProceduralDataset):
        Returns:
            dict with:
            - "question": Text describing the problem setup.
-            - "answer": List of moves to solve the puzzle.
+            - "answer": list of moves to solve the puzzle.
            - "metadata": Configuration and solution details.
            - "initial_state": (Optional) ASCII visualization of the initial pegs.
-            - "states": (Optional) List of ASCII visualizations after each move.
+            - "states": (Optional) list of ASCII visualizations after each move.
        """
        rng = random.Random(self.seed + idx if self.seed is not None else None)

@ -282,11 +282,11 @@ class HanoiDataset(ProceduralDataset):

        if self.visualize:
            result["initial_state"] = initial_state_str
-            result["states"] = states  # List of all states including initial and after each move
+            result["states"] = states  # list of all states including initial and after each move

        return result

-    def _visualize_state(self, pegs_state: Dict[int, List[int]]) -> str:
+    def _visualize_state(self, pegs_state: dict[int, list[int]]) -> str:
        """
        Create an ASCII visualization of the current state of the pegs.
        Adapts to variable number of pegs.
@ -321,7 +321,7 @@ class HanoiDataset(ProceduralDataset):

        return visualization

-    def _validate_move(self, pegs_state: Dict[int, List[int]], move: str) -> bool:
+    def _validate_move(self, pegs_state: dict[int, list[int]], move: str) -> bool:
        """
        Validate that a move adheres to the Tower of Hanoi rules.

@ -356,7 +356,7 @@ class HanoiDataset(ProceduralDataset):
            print(f"Error validating move '{move}': {e}")
            return False

-    def _parse_move(self, move: str) -> Tuple[int, int, int]:
+    def _parse_move(self, move: str) -> tuple[int, int, int]:
        """
        Parse a move string and extract disk number, from peg, and to peg.

@ -376,7 +376,7 @@ class HanoiDataset(ProceduralDataset):
        to_peg = int(match.group(3))
        return disk, from_peg, to_peg

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """
        Score the user's solution for the Tower of Hanoi puzzle.

--- a/reasoning_gym/games/tsumego.py
+++ b/reasoning_gym/games/tsumego.py
@ -19,7 +19,7 @@ TODO: Generate multi-step Tsumego problems.
 import re
 from dataclasses import dataclass
 from random import Random
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -62,11 +62,11 @@ class TsumegoDataset(ProceduralDataset):
        super().__init__(config=config, seed=config.seed, size=config.size)

    # New helper method for board copying
-    def _copy_board(self, board: List[List[str]]) -> List[List[str]]:
+    def _copy_board(self, board: list[list[str]]) -> list[list[str]]:
        """Return a deep copy of the board."""
        return [row[:] for row in board]

-    def _get_liberties(self, board: List[List[str]], row: int, col: int) -> Set[Tuple[int, int]]:
+    def _get_liberties(self, board: list[list[str]], row: int, col: int) -> set[tuple[int, int]]:
        """Get empty adjacent points (liberties) for a stone"""
        size = len(board)
        liberties = set()
@ -76,7 +76,7 @@ class TsumegoDataset(ProceduralDataset):
                liberties.add((r, c))
        return liberties

-    def _get_group(self, board: List[List[str]], row: int, col: int) -> Set[Tuple[int, int]]:
+    def _get_group(self, board: list[list[str]], row: int, col: int) -> set[tuple[int, int]]:
        """Get all stones in the same group (connected stones of same color)"""
        size = len(board)
        color = board[row][col]
@ -94,14 +94,14 @@ class TsumegoDataset(ProceduralDataset):
                    queue.append((nr, nc))
        return group

-    def _count_liberties(self, board: List[List[str]], group: Set[Tuple[int, int]]) -> int:
+    def _count_liberties(self, board: list[list[str]], group: set[tuple[int, int]]) -> int:
        """Count total liberties for a group of stones"""
        liberties = set()
        for row, col in group:
            liberties.update(self._get_liberties(board, row, col))
        return len(liberties)

-    def _would_capture(self, board: List[List[str]], row: int, col: int, color: str) -> bool:
+    def _would_capture(self, board: list[list[str]], row: int, col: int, color: str) -> bool:
        """Check if a move would capture any opponent stones"""
        size = len(board)
        opponent = "O" if color == "X" else "X"
@ -120,7 +120,7 @@ class TsumegoDataset(ProceduralDataset):
                    return True
        return False

-    def _is_valid_move(self, board: List[List[str]], row: int, col: int, color: str) -> bool:
+    def _is_valid_move(self, board: list[list[str]], row: int, col: int, color: str) -> bool:
        """Check if a move is legal (not suicide, unless it captures)"""
        size = len(board)
        if not (0 <= row < size and 0 <= col < size):
@ -139,7 +139,7 @@ class TsumegoDataset(ProceduralDataset):
        group = self._get_group(board_copy, row, col)
        return self._count_liberties(board_copy, group) > 0

-    def _make_move(self, board: List[List[str]], row: int, col: int, color: str) -> bool:
+    def _make_move(self, board: list[list[str]], row: int, col: int, color: str) -> bool:
        """Make a move and update ko point. Returns True if move was valid."""
        if not self._is_valid_move(board, row, col, color):
            return False
@ -164,7 +164,7 @@ class TsumegoDataset(ProceduralDataset):

        return True

-    def _generate_capture_problem(self, size: int, rng: Random) -> Tuple[List[List[str]], Tuple[int, int]]:
+    def _generate_capture_problem(self, size: int, rng: Random) -> tuple[list[list[str]], tuple[int, int]]:
        """Generate a capture problem"""
        board = [["." for _ in range(size)] for _ in range(size)]
        stones_placed = 0
@ -235,7 +235,7 @@ class TsumegoDataset(ProceduralDataset):
            tries += 1
        raise RuntimeError("Failed to generate a capture problem")

-    def _board_to_string(self, board: List[List[str]]) -> str:
+    def _board_to_string(self, board: list[list[str]]) -> str:
        """Convert board to string representation"""
        size = len(board)
        # Column labels
@ -272,7 +272,7 @@ class TsumegoDataset(ProceduralDataset):
            "metadata": {"difficulty": {"board_size": size}, "board": board, "solution": solution_str},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Score the answer against the solution"""
        if answer is None:
            return 0.0
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@ -1,11 +1,10 @@
 import random
-import re
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional

 import numpy as np
 import sympy
-from sympy.geometry import Point, Segment, Triangle
+from sympy.geometry import Point

 from ..factory import ProceduralDataset, register_dataset

@ -23,7 +22,7 @@ class AdvancedGeometryConfig:

    # Probability or list of tasks we want to generate
    # For demonstration, we have three categories:
-    task_types: List[str] = field(
+    task_types: list[str] = field(
        default_factory=lambda: [
            "orthocenter",
            "incircle_radius",
@ -228,7 +227,7 @@ class AdvancedGeometryDataset(ProceduralDataset):
        }
        return question, answer_str, metadata

-    def score_answer(self, answer: str | None, entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: str | None, entry: dict[str, Any]) -> float:
        reward = 0.0
        expected_answer = entry["answer"]
        metadata = entry["metadata"]
--- a/reasoning_gym/graphs/course_schedule.py
+++ b/reasoning_gym/graphs/course_schedule.py
@ -8,7 +8,7 @@ https://leetcode.com/problems/course-schedule/description/
 from collections import defaultdict
 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -54,7 +54,7 @@ class CourseScheduleDataset(ProceduralDataset):
    def __init__(self, config: CourseScheduleConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

-    def _can_finish(self, num_courses: int, prerequisites: List[List[int]]) -> bool:
+    def _can_finish(self, num_courses: int, prerequisites: list[list[int]]) -> bool:
        adj = defaultdict(list)
        for course, prereq in prerequisites:
            adj[course].append(prereq)
@ -81,7 +81,7 @@ class CourseScheduleDataset(ProceduralDataset):

        return True

-    def _create_prerequisites(self, rng: Random, courses: List[int], solvable: bool) -> List[List[int]]:
+    def _create_prerequisites(self, rng: Random, courses: list[int], solvable: bool) -> list[list[int]]:
        """Create a list of prerequisites for each course"""
        prerequisites = []
        # Generate a valid course schedule
--- a/reasoning_gym/graphs/family_relationships.py
+++ b/reasoning_gym/graphs/family_relationships.py
@ -2,7 +2,7 @@ import random
 from dataclasses import dataclass, field
 from enum import StrEnum
 from itertools import count
-from typing import List, Optional, Set, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -37,8 +37,8 @@ class Person:
    gender: Gender
    id: int
    spouse: Optional["Person"] = None
-    parents: List["Person"] = field(default_factory=list)
-    children: List["Person"] = field(default_factory=list)
+    parents: list["Person"] = field(default_factory=list)
+    children: list["Person"] = field(default_factory=list)

    def __hash__(self):
        return self.id
@ -65,7 +65,7 @@ class FamilyRelationshipsConfig:

    min_family_size: int = 4
    max_family_size: int = 8
-    male_names: List[str] = field(
+    male_names: list[str] = field(
        default_factory=lambda: [
            "James",
            "John",
@ -112,7 +112,7 @@ class FamilyRelationshipsConfig:
            "Finn",
        ]
    )
-    female_names: List[str] = field(
+    female_names: list[str] = field(
        default_factory=lambda: [
            "Mary",
            "Patricia",
@ -207,7 +207,7 @@ class FamilyRelationshipsDataset(ProceduralDataset):
            },
        }

-    def _generate_family(self, rng: random.Random) -> Set[Person]:
+    def _generate_family(self, rng: random.Random) -> set[Person]:
        """Generate a random family tree"""
        family_size = rng.randint(self.config.min_family_size, self.config.max_family_size)
        family = set()
@ -292,8 +292,8 @@ class FamilyRelationshipsDataset(ProceduralDataset):
        return family

    def _get_relationship_question(
-        self, rng: random.Random, family: Set[Person]
-    ) -> Tuple[Person, Person, Relationship]:
+        self, rng: random.Random, family: set[Person]
+    ) -> tuple[Person, Person, Relationship]:
        """Select two family members and determine their relationship"""
        person1, person2 = rng.sample(list(family), 2)

@ -326,7 +326,7 @@ class FamilyRelationshipsDataset(ProceduralDataset):

        return person1, person2, relationship

-    def _generate_story(self, family: Set[Person]) -> str:
+    def _generate_story(self, family: set[Person]) -> str:
        """Generate a story describing the family relationships"""
        story_parts = []

--- a/reasoning_gym/graphs/largest_island.py
+++ b/reasoning_gym/graphs/largest_island.py
@ -7,7 +7,7 @@ https://leetcode.com/problems/max-area-of-island/description/
 from collections import deque
 from dataclasses import dataclass
 from random import Random
-from typing import List, Optional
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -58,7 +58,7 @@ class LargestIslandDataset(ProceduralDataset):
    def _is_valid_cell(self, r: int, c: int) -> bool:
        return 0 <= r < self.config.rows and 0 <= c < self.config.cols

-    def _create_grid(self, rng: Random) -> List[List[int]]:
+    def _create_grid(self, rng: Random) -> list[list[int]]:
        """Create a random grid of islands using a random walk algorithm"""
        grid = [[0] * self.config.cols for _ in range(self.config.rows)]
        directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Up, Down, Left, Right
@ -81,7 +81,7 @@ class LargestIslandDataset(ProceduralDataset):

        return grid

-    def _get_largest_island(self, grid: List[List[int]]) -> int:
+    def _get_largest_island(self, grid: list[list[int]]) -> int:
        """Find the largest island in the grid"""
        directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Up, Down, Left, Right
        visited = set()
@ -108,11 +108,11 @@ class LargestIslandDataset(ProceduralDataset):

        return max_area

-    def _grid_to_string(self, grid: List[List[int]]) -> str:
+    def _grid_to_string(self, grid: list[list[int]]) -> str:
        """Convert grid to a string representation"""
        return "\n".join(" ".join(str(cell) for cell in row) for row in grid)

-    def _string_to_board(self, grid_str: str) -> List[List[int]]:
+    def _string_to_board(self, grid_str: str) -> list[list[int]]:
        """Convert string representation to a grid"""
        return [[int(cell) for cell in row.split()] for row in grid_str.split("\n")]

--- a/reasoning_gym/graphs/quantum_lock.py
+++ b/reasoning_gym/graphs/quantum_lock.py
@ -164,7 +164,7 @@ Buttons:
        # If no solution found, regenerate
        return self.generate_quantum_puzzle(rng, difficulty)

-    def score_answer(self, answer: Optional[str], entry: dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the task.

        The function awards 1.0 for a correct answer and less otherwise.
--- a/reasoning_gym/logic/aiw.py
+++ b/reasoning_gym/logic/aiw.py
@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from enum import StrEnum
 from random import Random
 from string import Template
-from typing import List, Optional
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -20,15 +20,15 @@ class AliceInWonderlandConfig:
    """Configuration options for the Alice in Wonderland dataset.

    Attributes:
-        male_names (List[str]): List of male names to use in questions.
-        female_names (List[str]): List of female names to use in questions. Must include 'Alice'.
-        task_types (List[TaskType]): List of task types to include in dataset.
+        male_names (list[str]): List of male names to use in questions.
+        female_names (list[str]): List of female names to use in questions. Must include 'Alice'.
+        task_types (list[TaskType]): List of task types to include in dataset.
        seed (Optional[int]): Seed for random number generation.
        size (int): Number of samples in the dataset.
        max_entities (int): Max number of siblings/friends/colleagues in questions.
    """

-    male_names: List[str] = field(
+    male_names: list[str] = field(
        default_factory=lambda: [
            "James",
            "John",
@ -43,7 +43,7 @@ class AliceInWonderlandConfig:
            "Bob",
        ]
    )
-    female_names: List[str] = field(
+    female_names: list[str] = field(
        default_factory=lambda: [
            "Mary",
            "Patricia",
@ -58,7 +58,7 @@ class AliceInWonderlandConfig:
            "Alice",
        ]
    )
-    task_types: List[TaskType] = field(
+    task_types: list[TaskType] = field(
        default_factory=lambda: [TaskType.SIBLINGS, TaskType.FRIENDS, TaskType.COLLEAGUES]  # Added Colleagues
    )
    seed: Optional[int] = None
--- a/reasoning_gym/logic/circuit_logic.py
+++ b/reasoning_gym/logic/circuit_logic.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -17,7 +17,7 @@ def _repeat(s: str, n: int) -> str:
    return s * n


-def _matrix_put(matrix: List[List[str]], h: int, w: int, x: int, y: int, s: str, direction: str):
+def _matrix_put(matrix: list[list[str]], h: int, w: int, x: int, y: int, s: str, direction: str):
    """Place a string `s` into the 2D `matrix` starting at (x,y),
    advancing in `direction` ('RIGHT' or 'DOWN')."""
    if x >= w or y >= h:
@ -119,14 +119,14 @@ class CircuitLogicDataset(ProceduralDataset):
        self._current_idx = 0
        return self

-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
        if self._current_idx >= self.config.size:
            raise StopIteration
        item = self[self._current_idx]
        self._current_idx += 1
        return item

-    def __getitem__(self, idx: int) -> Dict[str, Any]:
+    def __getitem__(self, idx: int) -> dict[str, Any]:
        """
        Generate one random circuit logic item using ASCII drawing.
        """
@ -142,14 +142,14 @@ class CircuitLogicDataset(ProceduralDataset):

    def _generate_circuit(
        self, rng: Random, num_terms: int, min_inputs: int, max_inputs: int, neg_prob: float, allow_reuse: bool
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
        """
        Generate circuit logic (ASCII drawing + expression + evaluation)
        """
        final_gate_name, final_gate_sym = rng.choice(self.final_gate_options)
        final_gate_width = 2 + len(final_gate_sym)

-        distinct_inputs: List[str] = []
+        distinct_inputs: list[str] = []

        def get_random_input() -> str:
            if allow_reuse and distinct_inputs and rng.random() < 0.5:
@ -159,8 +159,8 @@ class CircuitLogicDataset(ProceduralDataset):
                distinct_inputs.append(name)
                return name

-        term_ops: List[Tuple[str, str, str]] = []
-        term_strings: List[str] = []
+        term_ops: list[tuple[str, str, str]] = []
+        term_strings: list[str] = []
        for _ in range(num_terms):
            op = rng.choice(self.internal_ops)
            term_ops.append(op)
@ -400,7 +400,7 @@ class CircuitLogicDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        if answer is None or len(answer) == 0:
            return 0.0

--- a/reasoning_gym/logic/contrib/logic_puzzle/clues.py
+++ b/reasoning_gym/logic/contrib/logic_puzzle/clues.py
@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from functools import wraps
 from itertools import product
-from typing import Iterable, List, Tuple
+from typing import Iterable

 from reasoning_gym.logic.contrib.logic_puzzle.literals import Literal
 from reasoning_gym.logic.contrib.logic_puzzle.sat_utils import from_dnf, neg
@ -39,7 +39,7 @@ class Clue(ABC):
    """Base class for the types of clues that we allow."""

    @abstractmethod
-    def as_cnf(self) -> Iterable[Tuple[str]]: ...
+    def as_cnf(self) -> Iterable[tuple[str]]: ...

    @abstractmethod
    def __repr__(self) -> str: ...
@ -67,7 +67,7 @@ class found_at(Clue):
    value: Literal
    house: int

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return [(comb(self.value, self.house),)]

    @_capitalize_first
@ -89,7 +89,7 @@ class not_at(Clue):
    value: Literal
    house: int

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return [(neg(comb(self.value, self.house)),)]

    @_capitalize_first
@ -110,9 +110,9 @@ class same_house(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return from_dnf((comb(self.value1, i), comb(self.value2, i)) for i in self.houses)

    @_capitalize_first
@ -134,9 +134,9 @@ class consecutive(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return from_dnf((comb(self.value1, i), comb(self.value2, j)) for i, j in zip(self.houses, self.houses[1:]))

    @_capitalize_first
@ -156,9 +156,9 @@ class beside(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return from_dnf(
            [(comb(self.value1, i), comb(self.value2, j)) for i, j in zip(self.houses, self.houses[1:])]
            + [(comb(self.value2, i), comb(self.value1, j)) for i, j in zip(self.houses, self.houses[1:])]
@ -182,9 +182,9 @@ class left_of(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return from_dnf(
            (comb(self.value1, i), comb(self.value2, j)) for i, j in product(self.houses, self.houses) if i < j
        )
@ -207,9 +207,9 @@ class right_of(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return sat_utils.from_dnf(
            (comb(self.value1, i), comb(self.value2, j)) for i, j in product(self.houses, self.houses) if i > j
        )
@ -233,9 +233,9 @@ class one_between(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return from_dnf(
            [(comb(self.value1, i), comb(self.value2, j)) for i, j in zip(self.houses, self.houses[2:])]
            + [(comb(self.value2, i), comb(self.value1, j)) for i, j in zip(self.houses, self.houses[2:])]
@ -257,9 +257,9 @@ class two_between(Clue):

    value1: Literal
    value2: Literal
-    houses: Tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))
+    houses: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3, 4, 5))

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        return from_dnf(
            [(comb(self.value1, i), comb(self.value2, j)) for i, j in zip(self.houses, self.houses[3:])]
            + [(comb(self.value2, i), comb(self.value1, j)) for i, j in zip(self.houses, self.houses[3:])]
--- a/reasoning_gym/logic/contrib/logic_puzzle/generate.py
+++ b/reasoning_gym/logic/contrib/logic_puzzle/generate.py
@ -7,7 +7,7 @@ This is a driver script that can be used to generate new zebra puzzles.
 from collections import OrderedDict
 from itertools import product
 from random import Random
-from typing import Dict, Iterable, List, Set, Tuple, Type
+from typing import Iterable, Type

 from tabulate import tabulate

@ -18,18 +18,18 @@ from reasoning_gym.logic.contrib.logic_puzzle.sat_utils import itersolve
 from .clues import Clue, beside, consecutive, found_at, left_of, not_at, one_between, right_of, same_house, two_between


-def generate_found_at(puzzle: Puzzle, solution: OrderedDict[Literal, int]) -> Set[Clue]:
+def generate_found_at(puzzle: Puzzle, solution: OrderedDict[Literal, int]) -> set[Clue]:
    """Generate the `found_at` / `not_at` Clue instances"""
-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for element, loc in solution.items():
        clues.add(found_at(element, loc))

    return clues


-def generate_not_found_at(puzzle: Puzzle, solution: Dict[Literal, int]) -> Set[Clue]:
+def generate_not_found_at(puzzle: Puzzle, solution: dict[Literal, int]) -> set[Clue]:
    """Generate the `found_at` / `not_at` Clue instances"""
-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for element, loc in solution.items():
        for house in puzzle.houses:
            if house != loc:
@ -38,13 +38,13 @@ def generate_not_found_at(puzzle: Puzzle, solution: Dict[Literal, int]) -> Set[C
    return clues


-def generate_same_house(puzzle: Puzzle, solution: OrderedDict[Literal, int]) -> Set[Clue]:
+def generate_same_house(puzzle: Puzzle, solution: OrderedDict[Literal, int]) -> set[Clue]:
    """Generate the `same_house` Clue instances"""

-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for house in puzzle.houses:
        items_at_house = {item: loc for item, loc in solution.items() if loc == house}
-        pairs: Set[Tuple[Literal, Literal]] = {
+        pairs: set[tuple[Literal, Literal]] = {
            (item1, item2) for item1, item2 in product(items_at_house, repeat=2) if item1 != item2
        }
        for pair in pairs:
@ -53,18 +53,18 @@ def generate_same_house(puzzle: Puzzle, solution: OrderedDict[Literal, int]) ->
    return clues


-def generate_consecutive_beside(puzzle: Puzzle, solution: OrderedDict[Literal, int]) -> Set[Clue]:
+def generate_consecutive_beside(puzzle: Puzzle, solution: OrderedDict[Literal, int]) -> set[Clue]:
    """Generate the `consecutive` / `beside` Clue instances

     (Note that consecutive is just a more informative version of beside. Since they have the same
     structure, for every possible combination we'll just keep one.)
    """

-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for left, right in zip(puzzle.houses, puzzle.houses[1:]):
        items_left = {item: loc for item, loc in solution.items() if loc == left}
        items_right = {item: loc for item, loc in solution.items() if loc == right}
-        pairs: Set[Tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
+        pairs: set[tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
         # iterate in sorted order so the result does not depend on set ordering (hash randomization)
        for pair in sorted(pairs):
            # consecutive is just a more informative version of beside, but they have same structure
@ -77,20 +77,20 @@ def generate_consecutive_beside(puzzle: Puzzle, solution: OrderedDict[Literal, i
    return clues


-def generate_left_right_of(puzzle: Puzzle, solution: Dict[Literal, int]) -> Set[Clue]:
+def generate_left_right_of(puzzle: Puzzle, solution: dict[Literal, int]) -> set[Clue]:
    """Generate the `left_of` / `right_of` Clue instances
     Note that since (x left-of y) is guaranteed to be redundant with (y right-of x), we only add
    one of these clues to the final set.
    """

-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for left, right in product(puzzle.houses, puzzle.houses):
        if left >= right:
            continue

        items_left = {item: loc for item, loc in solution.items() if loc == left}
        items_right = {item: loc for item, loc in solution.items() if loc == right}
-        pairs: Set[Tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
+        pairs: set[tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
         # iterate in sorted order so the rng draws below do not depend on set ordering (hash randomization)
        for pair in sorted(pairs):
            if puzzle.rng.randint(0, 1) == 0:
@ -101,28 +101,28 @@ def generate_left_right_of(puzzle: Puzzle, solution: Dict[Literal, int]) -> Set[
    return clues


-def generate_one_between(puzzle: Puzzle, solution: Dict[Literal, int]) -> Set[Clue]:
+def generate_one_between(puzzle: Puzzle, solution: dict[Literal, int]) -> set[Clue]:
    """Generate the `one_between` Clue instances"""

-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for left, right in zip(puzzle.houses, puzzle.houses[2:]):
        items_left = {item: loc for item, loc in solution.items() if loc == left}
        items_right = {item: loc for item, loc in solution.items() if loc == right}
-        pairs: Set[Tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
+        pairs: set[tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
        for pair in pairs:
            clues.add(one_between(pair[0], pair[1], puzzle.houses))

    return clues


-def generate_two_between(puzzle: Puzzle, solution: Dict[Literal, int]) -> Set[Clue]:
+def generate_two_between(puzzle: Puzzle, solution: dict[Literal, int]) -> set[Clue]:
    """Generate the `two_between` Clue instances"""

-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()
    for left, right in zip(puzzle.houses, puzzle.houses[3:]):
        items_left = {item: loc for item, loc in solution.items() if loc == left}
        items_right = {item: loc for item, loc in solution.items() if loc == right}
-        pairs: Set[Tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
+        pairs: set[tuple[Literal, Literal]] = {(item1, item2) for item1, item2 in product(items_left, items_right)}
        for pair in pairs:
            clues.add(two_between(pair[0], pair[1], puzzle.houses))

@ -144,7 +144,7 @@ def has_unique_solution(puzzle: Puzzle, clues: Iterable[Clue]) -> bool:
            return False


-def try_to_remove(puzzle: Puzzle, clues: Set[Clue], n: int, must_have=set()) -> Set[Clue]:
+def try_to_remove(puzzle: Puzzle, clues: set[Clue], n: int, must_have=set()) -> set[Clue]:
    """
    Attempt to remove n clues from a set of candidate clues; if we are able to, return the new,
    smaller set of clues. If not, return the original set.
@ -152,7 +152,7 @@ def try_to_remove(puzzle: Puzzle, clues: Set[Clue], n: int, must_have=set()) ->

    def weight(clue: Clue) -> float:
        # relative probabilities of each type of clue being selected for removal
-        weights: Dict[Type[Clue], float] = {
+        weights: dict[Type[Clue], float] = {
            not_at: 0.75,
            found_at: 0.75,
            same_house: 0.75,
@ -167,7 +167,7 @@ def try_to_remove(puzzle: Puzzle, clues: Set[Clue], n: int, must_have=set()) ->

     # sort the clues so the weighted rng choice does not depend on set ordering (hash randomization)
    weights = [weight(clue) for clue in sorted(clues)]
-    candidates: Set[Clue] = set(puzzle.rng.choices(sorted(clues), weights, k=n))
+    candidates: set[Clue] = set(puzzle.rng.choices(sorted(clues), weights, k=n))
    candidates = candidates - must_have
    clues = clues.difference(candidates)
    if has_unique_solution(puzzle, clues):
@ -180,8 +180,8 @@ def try_to_remove(puzzle: Puzzle, clues: Set[Clue], n: int, must_have=set()) ->


 def reduce_individually(
-    puzzle: Puzzle, clues: Set[Clue], removed: Set[Clue], must_have=set()
-) -> Tuple[Set[Clue], Set[Clue]]:
+    puzzle: Puzzle, clues: set[Clue], removed: set[Clue], must_have=set()
+) -> tuple[set[Clue], set[Clue]]:
    """
    Attempt to remove each candidate clue one by one.

@ -202,7 +202,7 @@ def reduce_individually(
    return clues, removed


-def reduce_clues(puzzle: Puzzle, clues: Set[Clue], must_have=set()) -> Tuple[Set[Clue], Set[Clue]]:
+def reduce_clues(puzzle: Puzzle, clues: set[Clue], must_have=set()) -> tuple[set[Clue], set[Clue]]:
    """
    Reduce a set of clues to a minimally solvable set.

@ -265,7 +265,7 @@ def reduce_clues(puzzle: Puzzle, clues: Set[Clue], must_have=set()) -> Tuple[Set

    # secondary reduction time! While we can still remove clues, do so; then we're done.
    # print(f"Starting the secondary reduction.")
-    removed_clues: Set[Clue] = set()
+    removed_clues: set[Clue] = set()
    while True:
        minimal_clues_size = len(minimal_clues)
        minimal_clues, removed_clues = reduce_individually(puzzle, minimal_clues, removed_clues, must_have)
@ -304,12 +304,12 @@ def question_generation(rng: Random, col_name, table_data):
    return questions_data


-def generate_solution_dict(rng: Random, selected_elements: List[Literal], n: int) -> OrderedDict[Literal, int]:
+def generate_solution_dict(rng: Random, selected_elements: list[Literal], n: int) -> OrderedDict[Literal, int]:
    solution = OrderedDict()
    house_ids = list(range(1, n + 1))
    for element in selected_elements:
        rng.shuffle(house_ids)
-        attributes: List[Literal] = sorted(element.__members__.values())
+        attributes: list[Literal] = sorted(element.__members__.values())
        for i in range(n):
            solution[attributes[i]] = house_ids[i]
    return solution
@ -376,7 +376,7 @@ def generate_puzzle(rng: Random, K=2, M=3) -> tuple[OrderedDict, Puzzle]:
    context = str(puzzle)

    # generate all the clues
-    clues: Set[Clue] = set()
+    clues: set[Clue] = set()

    for generate_function in clue_types:
        clues = clues.union(generate_function(puzzle, solution))
--- a/reasoning_gym/logic/contrib/logic_puzzle/puzzle.py
+++ b/reasoning_gym/logic/contrib/logic_puzzle/puzzle.py
@ -7,7 +7,7 @@ from __future__ import annotations

 from contextlib import contextmanager
 from random import Random
-from typing import Generator, Iterable, List, Set, Tuple, Type
+from typing import Generator, Iterable, Type

 from reasoning_gym.logic.contrib.logic_puzzle.clues import (
    Clue,
@ -82,12 +82,12 @@ class Puzzle:
            self.literals = list(elements)

        self.houses = tuple(range(1, n_houses + 1))
-        self.clues: Set[Clue] = set()
-        self.constraints: List[Tuple[str]] = []
-        self.extra_clues: Set[Clue] = set()
+        self.clues: set[Clue] = set()
+        self.constraints: list[tuple[str]] = []
+        self.extra_clues: set[Clue] = set()
        self.solution = None

-    def _add_constraint(self, constraints: List[Tuple[str]]) -> Puzzle:
+    def _add_constraint(self, constraints: list[tuple[str]]) -> Puzzle:
        self.constraints.extend(constraints)
        return self

@ -128,7 +128,7 @@ class Puzzle:

        return self

-    def as_cnf(self) -> List[Tuple[str]]:
+    def as_cnf(self) -> list[tuple[str]]:
        """Express puzzle as solvable CNF"""

        # this would be a comprehension if we could use iterable unpacking
@ -195,8 +195,8 @@ they smoke, and what pet they own.
 """

 if __name__ == "__main__":
-    enum_classes: List[Type[Literal]] = [Color, Nationality, Animal, Drink, Cigar]
-    literals: List[Literal] = [el for group in enum_classes for el in group]
+    enum_classes: list[Type[Literal]] = [Color, Nationality, Animal, Drink, Cigar]
+    literals: list[Literal] = [el for group in enum_classes for el in group]

    # set up the puzzle with constraints and clues
    puzzle = Puzzle(rng=Random(), element_types=[Color, Nationality, Drink, Cigar, Animal])
@ -245,7 +245,7 @@ in between them that neither is sitting in).
 """

 if __name__ == "__main__":
-    enum_classes: List[Type[Literal]] = [Mother, Children, Flower, Food]
+    enum_classes: list[Type[Literal]] = [Mother, Children, Flower, Food]
    literals = [el for group in enum_classes for el in group]

    # set up the puzzle with constraints and clues
--- a/reasoning_gym/logic/contrib/logic_puzzle/sat_utils.py
+++ b/reasoning_gym/logic/contrib/logic_puzzle/sat_utils.py
@ -4,15 +4,15 @@ __author__ = "Raymond Hettinger"

 from functools import lru_cache
 from itertools import combinations
-from typing import Dict, FrozenSet, Iterable, List, Set, Tuple
+from typing import FrozenSet, Iterable

 import pycosat

 Element = str  # literal; any string, but here it's <element house#> e.g., "tushar 5" or "chai 2"
-CNF = List[Tuple[Element, ...]]
+CNF = list[tuple[Element, ...]]


-def make_translate(cnf: CNF) -> Tuple[Dict[Element, int], Dict[int, Element]]:
+def make_translate(cnf: CNF) -> tuple[dict[Element, int], dict[int, Element]]:
    """Make a translator from symbolic CNF to pycosat's numbered clauses.

    Return literal to number dictionary and reverse lookup dict.
@ -22,7 +22,7 @@ def make_translate(cnf: CNF) -> Tuple[Dict[Element, int], Dict[int, Element]]:
     {1: 'a', 2: 'b', 3: 'c', -1: '~a', -3: '~c', -2: '~b'})
    """

-    lit2num: Dict[Element, int] = {}
+    lit2num: dict[Element, int] = {}
    for clause in cnf:
        for literal in clause:
            if literal not in lit2num:
@ -36,7 +36,7 @@ def make_translate(cnf: CNF) -> Tuple[Dict[Element, int], Dict[int, Element]]:
    return lit2num, num2var


-def translate(cnf: CNF, uniquify=False) -> Tuple[List[Tuple[int, ...]], Dict[int, Element]]:
+def translate(cnf: CNF, uniquify=False) -> tuple[list[tuple[int, ...]], dict[int, Element]]:
    """Translate a symbolic CNF to a numbered CNF and return reverse mapping.

    >>> translate([['~P', 'Q'],['~P', 'R']])
@ -78,14 +78,14 @@ def neg(element: str) -> str:
    return element[1:] if element.startswith("~") else "~" + element


-def from_dnf(groups: Iterable[Tuple[str, ...]]) -> CNF:
+def from_dnf(groups: Iterable[tuple[str, ...]]) -> CNF:
    """Convert from or-of-ands to and-of-ors

    >>> from_dnf([['~P'], ['Q', 'R']])
    [('~P', 'Q'), ('~P', 'R')]
    """

-    cnf: Set[FrozenSet[str]] = {frozenset()}
+    cnf: set[FrozenSet[str]] = {frozenset()}
    for group in groups:
        nl = {frozenset([literal]): neg(literal) for literal in group}
        # The "clause | literal" prevents dup lits: {x, x, y} -> {x, y}
@ -134,7 +134,7 @@ class Q:
        return f"{self.__class__.__name__}(elements={self.elements!r})"


-def all_of(elements: List[Element]) -> CNF:
+def all_of(elements: list[Element]) -> CNF:
    """Forces inclusion of matching rows on a truth table"""
    return Q(elements) == len(elements)

--- a/reasoning_gym/logic/propositional_logic.py
+++ b/reasoning_gym/logic/propositional_logic.py
@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from enum import StrEnum
 from random import Random
-from typing import Any, List, Optional, Set
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -123,7 +123,7 @@ class PropositionalLogicDataset(ProceduralDataset):
            },
        }

-    def _generate_premises(self, rng: Random, variables: List[str], num_statements: int) -> List[Expression]:
+    def _generate_premises(self, rng: Random, variables: list[str], num_statements: int) -> list[Expression]:
        """Generate a list of premise statements"""
        premises = []
        for _ in range(num_statements):
@ -131,7 +131,7 @@ class PropositionalLogicDataset(ProceduralDataset):
            premises.append(self._generate_expression(rng, variables, depth))
        return premises

-    def _generate_expression(self, rng: Random, variables: List[str], depth: int) -> Expression:
+    def _generate_expression(self, rng: Random, variables: list[str], depth: int) -> Expression:
        """Generate a random logical expression"""
        if depth <= 1:
            return Expression(None, rng.choice(variables))
@ -144,7 +144,7 @@ class PropositionalLogicDataset(ProceduralDataset):
            right = self._generate_expression(rng, variables, depth - 1)
            return Expression(operator, left, right)

-    def _find_valid_conclusion(self, rng: Random, premises: List[Expression], variables: List[str]) -> Expression:
+    def _find_valid_conclusion(self, rng: Random, premises: list[Expression], variables: list[str]) -> Expression:
        """Find a valid conclusion that follows from the premises"""
        # Try random conclusions until we find a valid one
        for _ in range(100):
@ -155,7 +155,7 @@ class PropositionalLogicDataset(ProceduralDataset):
        # Fallback to a simple conclusion
        return Expression(None, variables[0])

-    def _is_valid_conclusion(self, premises: List[Expression], conclusion: Expression) -> bool:
+    def _is_valid_conclusion(self, premises: list[Expression], conclusion: Expression) -> bool:
        """Check if conclusion follows from premises using truth tables"""
        variables = self._collect_variables(premises + [conclusion])

@ -166,7 +166,7 @@ class PropositionalLogicDataset(ProceduralDataset):
                return False
        return True

-    def _collect_variables(self, expressions: List[Expression]) -> Set[str]:
+    def _collect_variables(self, expressions: list[Expression]) -> set[str]:
        """Collect all variables used in expressions"""
        variables = set()
        for expr in expressions:
@ -179,7 +179,7 @@ class PropositionalLogicDataset(ProceduralDataset):
                    variables.update(self._collect_variables([expr.right]))
        return variables

-    def _generate_assignments(self, variables: Set[str]) -> List[dict[str, bool]]:
+    def _generate_assignments(self, variables: set[str]) -> list[dict[str, bool]]:
        """Generate all possible truth value assignments"""
        assignments = []
        for i in range(2 ** len(variables)):
--- a/reasoning_gym/logic/self_reference.py
+++ b/reasoning_gym/logic/self_reference.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset

@ -349,14 +349,14 @@ class SelfReferenceDataset(ProceduralDataset):
            "metadata": {},
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the SelfReference task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/logic/syllogisms.py
+++ b/reasoning_gym/logic/syllogisms.py
@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from enum import StrEnum
 from random import Random
-from typing import List, Optional, Tuple
+from typing import Optional

 from ..factory import ProceduralDataset, register_dataset

@ -105,7 +105,7 @@ class SyllogismDataset(ProceduralDataset):
        super().__init__(config=config, seed=config.seed, size=config.size)
        self.terms = self.DEFAULT_TERMS

-    def _get_allowed_quantifiers(self) -> List[Quantifier]:
+    def _get_allowed_quantifiers(self) -> list[Quantifier]:
        """Get list of allowed quantifiers based on config"""
        quantifiers = []
        if self.config.allow_all:
@ -120,9 +120,9 @@ class SyllogismDataset(ProceduralDataset):

    @staticmethod
    def _is_valid_syllogism(
-        premise1: Tuple[Quantifier, "Term", "Term"],
-        premise2: Tuple[Quantifier, "Term", "Term"],
-        conclusion: Tuple[Quantifier, "Term", "Term"],
+        premise1: tuple[Quantifier, "Term", "Term"],
+        premise2: tuple[Quantifier, "Term", "Term"],
+        conclusion: tuple[Quantifier, "Term", "Term"],
    ) -> bool:
        """
        Checks whether a given syllogism is valid under classical (Aristotelian) rules,
@ -247,7 +247,7 @@ class SyllogismDataset(ProceduralDataset):
            return f"{quantifier.value} {subject.plural} are {predicate.plural}"

    def _check_logical_equivalence(
-        self, premise: Tuple[Quantifier, Term, Term], conclusion: Tuple[Quantifier, Term, Term]
+        self, premise: tuple[Quantifier, Term, Term], conclusion: tuple[Quantifier, Term, Term]
    ) -> bool:
        """Check if a conclusion is logically equivalent to a premise"""
        p_quant, p_subj, p_pred = premise
--- a/reasoning_gym/logic/zebra_puzzles.py
+++ b/reasoning_gym/logic/zebra_puzzles.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from random import Random
-from typing import Dict, Optional
+from typing import Any, Optional

 from ..factory import ProceduralDataset, register_dataset
 from .contrib.logic_puzzle.generate import generate_puzzle
@ -55,14 +55,14 @@ class ZebraDataset(ProceduralDataset):
            },
        }

-    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Determine if the solution provided solves the Zebra task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
-            entry (Dict[str, any]): The original dataset entry containing the correct answer.
+            entry (dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
--- a/reasoning_gym/version_manager.py
+++ b/reasoning_gym/version_manager.py
@ -1,6 +1,6 @@
 """Version manager for tracking dataset versions."""

-from typing import Dict, Optional, Tuple
+from typing import Any, Optional

 from .dataset import ProceduralDataset

@ -12,7 +12,7 @@ class DatasetVersionManager:
        """Initialize the version manager."""
        self.current_version = 0
        # version_id -> (dataset_name, dataset_instance)
-        self.datasets: Dict[int, Tuple[str, ProceduralDataset]] = {}
+        self.datasets: dict[int, tuple[str, ProceduralDataset]] = {}

    def register_dataset(self, name: str, dataset: ProceduralDataset) -> int:
        """
@ -29,7 +29,7 @@ class DatasetVersionManager:
        self.datasets[self.current_version] = (name, dataset)
        return self.current_version

-    def get_dataset(self, version_id: int) -> Optional[Tuple[str, ProceduralDataset]]:
+    def get_dataset(self, version_id: int) -> Optional[tuple[str, ProceduralDataset]]:
        """
        Retrieve a dataset by its version ID.

@ -41,7 +41,7 @@ class DatasetVersionManager:
        """
        return self.datasets.get(version_id)

-    def get_entry(self, version_id: int, index: int) -> Dict[str, any]:
+    def get_entry(self, version_id: int, index: int) -> dict[str, Any]:
        """
        Get a specific entry from a versioned dataset.

--- a/tests/test_coaching.py
+++ b/tests/test_coaching.py
@ -1,7 +1,6 @@
 import json
 import math
 from collections import OrderedDict
-from pathlib import Path

 import pytest

--- a/tools/cli/rgc/client.py
+++ b/tools/cli/rgc/client.py
@ -105,7 +105,7 @@ class RGClient:
        response.raise_for_status()
        return BatchResponse.model_validate(response.json())

-    def score_outputs(self, experiment: str, entry_answers: List[AnswerItem]) -> ScoringResponse:
+    def score_outputs(self, experiment: str, entry_answers: list[AnswerItem]) -> ScoringResponse:
        """Score a batch of answers.

        Args:
--- a/tools/server/models.py
+++ b/tools/server/models.py
@ -1,6 +1,6 @@
 """Pydantic models for API request/response data."""

-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional

 from pydantic import BaseModel, Field

@ -11,7 +11,7 @@ class ExperimentCreate(BaseModel):
    name: str = Field(..., description="Unique name for the experiment")
    size: int = Field(500, description="Size of the dataset")
    seed: Optional[int] = Field(None, description="Random seed for reproducibility")
-    datasets: Dict[str, Dict[str, Any]] = Field(..., description="Dictionary of datasets configurations")
+    datasets: dict[str, dict[str, Any]] = Field(..., description="Dictionary of datasets configurations")


 class ExperimentResponse(BaseModel):
@ -20,19 +20,19 @@ class ExperimentResponse(BaseModel):
    name: str = Field(..., description="Name of the experiment")
    size: int = Field(..., description="Size of the dataset")
    seed: Optional[int] = Field(None, description="Random seed used")
-    datasets: Dict[str, Dict[str, Any]] = Field(..., description="Current dataset configurations")
+    datasets: dict[str, dict[str, Any]] = Field(..., description="Current dataset configurations")


 class ExperimentList(BaseModel):
    """Response model for listing experiments."""

-    experiments: List[str] = Field(default_factory=list, description="List of registered experiment names")
+    experiments: list[str] = Field(default_factory=list, description="List of registered experiment names")


 class DatasetConfigUpdate(BaseModel):
    """Request model for updating dataset configuration."""

-    config: Dict[str, Any] = Field(..., description="Configuration parameters to update")
+    config: dict[str, Any] = Field(..., description="Configuration parameters to update")


 class ErrorResponse(BaseModel):
@ -46,13 +46,13 @@ class BatchEntry(BaseModel):

    question: str = Field(..., description="The question text")
    entry_id: str = Field(..., description="Unique identifier in format '{version}.{index}'")
-    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata about the entry")
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata about the entry")


 class BatchResponse(BaseModel):
    """Response containing a batch of entries"""

-    entries: List[BatchEntry] = Field(..., description="List of batch entries")
+    entries: list[BatchEntry] = Field(..., description="List of batch entries")


 class AnswerItem(BaseModel):
@ -65,11 +65,11 @@ class AnswerItem(BaseModel):
 class ScoringRequest(BaseModel):
    """Request for scoring model outputs"""

-    answers: List[AnswerItem] = Field(..., description="List of entries to score")
+    answers: list[AnswerItem] = Field(..., description="List of entries to score")


 class ScoringResponse(BaseModel):
    """Response containing scores for answers"""

-    scores: List[float] = Field(..., description="List of scores in same order as request")
-    entry_ids: List[str] = Field(..., description="List of entry_ids in same order as request")
+    scores: list[float] = Field(..., description="List of scores in same order as request")
+    entry_ids: list[str] = Field(..., description="List of entry_ids in same order as request")