mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-28 17:29:39 +00:00
add simple dataset gallery generation script
This commit is contained in:
parent
71ccd41adb
commit
5a88cf2529
6 changed files with 1352 additions and 105 deletions
1309
GALLERY.md
Normal file
1309
GALLERY.md
Normal file
File diff suppressed because it is too large
Load diff
63
python
63
python
|
|
@ -1,63 +0,0 @@
|
||||||
"""Generate a markdown gallery of all available datasets with examples"""
|
|
||||||
|
|
||||||
import inspect
import itertools
import textwrap
from pathlib import Path

from reasoning_gym.factory import DATASETS, create_dataset
|
|
||||||
|
|
||||||
|
|
||||||
def generate_gallery() -> str:
    """Generate markdown content for the gallery.

    Builds a document with an index of all registered datasets followed by a
    section showing up to three example items per dataset.

    Returns:
        The complete gallery as a single markdown string.
    """
    content = ["# Dataset Gallery\n"]

    # Index of anchor links to each dataset section.
    content.append("## Available Datasets\n")
    for name in sorted(DATASETS.keys()):
        # GitHub-style anchors are lowercase with dashes; without .lower()
        # mixed-case dataset names produced dead links.
        anchor = name.replace("_", "-").lower()
        content.append(f"- [{name}](#{anchor})\n")
    content.append("\n")

    # One section with up to three examples per dataset.
    content.append("## Examples\n")
    for name in sorted(DATASETS.keys()):
        dataset = create_dataset(name)
        content.append(f"### {name}\n")

        # inspect.cleandoc handles the common docstring layout where the
        # first line carries no indentation; the previous
        # textwrap.dedent(doc.strip()) was a no-op in that case and left
        # continuation lines indented in the rendered markdown.
        if dataset.__class__.__doc__:
            content.append(f"{inspect.cleandoc(dataset.__class__.__doc__)}\n")

        content.append("```\n")
        # Show at most 3 examples without iterating the whole dataset.
        for i, item in enumerate(itertools.islice(dataset, 3)):
            content.append(f"Example {i+1}:\n")
            content.append(f"Question: {item['question']}\n")
            content.append(f"Answer: {item['answer']}\n")
            content.append(f"Metadata: {item['metadata']}\n")
            content.append("\n")
        content.append("```\n\n")

    return "".join(content)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Generate gallery markdown file"""
    # GALLERY.md lives in the repository root, one level above this script.
    output_path = Path(__file__).parent.parent / "GALLERY.md"
    output_path.write_text(generate_gallery())
    print(f"Generated gallery at {output_path}")


if __name__ == "__main__":
    main()
|
|
||||||
|
|
@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
|
||||||
from collections.abc import Iterable, Sized
|
from collections.abc import Iterable, Sized
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from random import Random
|
from random import Random
|
||||||
from typing import Any, Dict, Iterator, Optional, TypeVar, Type
|
from typing import Any, Dict, Iterator, Optional, Type, TypeVar
|
||||||
|
|
||||||
|
|
||||||
class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
|
class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
|
||||||
|
|
@ -66,15 +66,15 @@ class ProceduralDataset(ABC, Sized, Iterable[Dict[str, Any]]):
|
||||||
return reward
|
return reward
|
||||||
|
|
||||||
|
|
||||||
T = TypeVar('T', bound='ProceduralDataset')
|
T = TypeVar("T", bound="ProceduralDataset")
|
||||||
|
|
||||||
|
|
||||||
class ReseedingDataset(Iterable[Dict[str, Any]]):
|
class ReseedingDataset(Iterable[Dict[str, Any]]):
|
||||||
"""Wrapper that makes any ProceduralDataset infinite by reseeding when reaching the end"""
|
"""Wrapper that makes any ProceduralDataset infinite by reseeding when reaching the end"""
|
||||||
|
|
||||||
def __init__(self, dataset: T, chunk_size: int = 500):
|
def __init__(self, dataset: T, chunk_size: int = 500):
|
||||||
"""Initialize with dataset instance and chunk size
|
"""Initialize with dataset instance and chunk size
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dataset: The ProceduralDataset instance to wrap
|
dataset: The ProceduralDataset instance to wrap
|
||||||
chunk_size: Size of each generated chunk before reseeding
|
chunk_size: Size of each generated chunk before reseeding
|
||||||
|
|
@ -82,12 +82,12 @@ class ReseedingDataset(Iterable[Dict[str, Any]]):
|
||||||
self.dataset = dataset
|
self.dataset = dataset
|
||||||
self.dataset_cls: Type[T] = type(dataset)
|
self.dataset_cls: Type[T] = type(dataset)
|
||||||
self.chunk_size = chunk_size
|
self.chunk_size = chunk_size
|
||||||
|
|
||||||
# Start with chunk 0
|
# Start with chunk 0
|
||||||
self._current_chunk = 0
|
self._current_chunk = 0
|
||||||
self._current_dataset = self._create_chunk(0)
|
self._current_dataset = self._create_chunk(0)
|
||||||
self._current_idx = 0
|
self._current_idx = 0
|
||||||
|
|
||||||
def _create_chunk(self, chunk_num: int) -> T:
|
def _create_chunk(self, chunk_num: int) -> T:
|
||||||
"""Create a new dataset chunk with unique seed"""
|
"""Create a new dataset chunk with unique seed"""
|
||||||
# Create new config with modified seed
|
# Create new config with modified seed
|
||||||
|
|
@ -95,17 +95,17 @@ class ReseedingDataset(Iterable[Dict[str, Any]]):
|
||||||
if hasattr(new_config, "seed"):
|
if hasattr(new_config, "seed"):
|
||||||
# Derive new seed from chunk number using dataset's seed, wrapping around at 2^32
|
# Derive new seed from chunk number using dataset's seed, wrapping around at 2^32
|
||||||
new_config.seed = (self.dataset.seed + chunk_num) % (2**32)
|
new_config.seed = (self.dataset.seed + chunk_num) % (2**32)
|
||||||
|
|
||||||
# Create new dataset instance with chunk config
|
# Create new dataset instance with chunk config
|
||||||
return self.dataset_cls(new_config)
|
return self.dataset_cls(new_config)
|
||||||
|
|
||||||
def __iter__(self) -> Iterator[Dict[str, Any]]:
    """Rewind to the first chunk and return self as the iterator."""
    # Every fresh iteration restarts at chunk 0, so repeated passes over
    # the wrapper yield the same deterministic sequence.
    self._current_idx = 0
    self._current_chunk = 0
    self._current_dataset = self._create_chunk(0)
    return self
|
||||||
|
|
||||||
def __next__(self) -> Dict[str, Any]:
|
def __next__(self) -> Dict[str, Any]:
|
||||||
"""Get next item, creating new chunk if needed"""
|
"""Get next item, creating new chunk if needed"""
|
||||||
if self._current_idx >= self.chunk_size:
|
if self._current_idx >= self.chunk_size:
|
||||||
|
|
@ -113,11 +113,11 @@ class ReseedingDataset(Iterable[Dict[str, Any]]):
|
||||||
self._current_chunk += 1
|
self._current_chunk += 1
|
||||||
self._current_dataset = self._create_chunk(self._current_chunk)
|
self._current_dataset = self._create_chunk(self._current_chunk)
|
||||||
self._current_idx = 0
|
self._current_idx = 0
|
||||||
|
|
||||||
item = self._current_dataset[self._current_idx]
|
item = self._current_dataset[self._current_idx]
|
||||||
self._current_idx += 1
|
self._current_idx += 1
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
    """Forward scoring to the wrapped dataset's implementation.

    Args:
        answer: The candidate answer to score, or None.
        entry: The dataset item the answer responds to.

    Returns:
        The wrapped dataset's reward for this answer.
    """
    # Fix: the annotation previously used the builtin ``any`` (a function)
    # instead of ``typing.Any``; ``Any`` is already imported in this module.
    return self.dataset.score_answer(answer, entry)
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,9 @@
|
||||||
from .family_relationships import FamilyRelationshipsConfig, FamilyRelationshipsDataset
|
from .family_relationships import FamilyRelationshipsConfig, FamilyRelationshipsDataset
|
||||||
|
from .quantum_lock import QuantumLockConfig, QuantumLockDataset
|
||||||
|
|
||||||
__all__ = ["FamilyRelationshipsDataset", "FamilyRelationshipsConfig"]
|
__all__ = [
|
||||||
|
"FamilyRelationshipsDataset",
|
||||||
|
"FamilyRelationshipsConfig",
|
||||||
|
"QuantumLockConfig",
|
||||||
|
"QuantumLockDataset",
|
||||||
|
]
|
||||||
|
|
|
||||||
28
scripts/generate_gallery.py
Normal file → Executable file
28
scripts/generate_gallery.py
Normal file → Executable file
|
|
@ -2,19 +2,21 @@
|
||||||
"""Generate a markdown gallery of all available datasets with examples"""
|
"""Generate a markdown gallery of all available datasets with examples"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
import textwrap
|
import textwrap
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import reasoning_gym.cognition.figlet_fonts
|
||||||
|
import reasoning_gym.cognition.rubiks_cube
|
||||||
from reasoning_gym.factory import DATASETS, create_dataset
|
from reasoning_gym.factory import DATASETS, create_dataset
|
||||||
|
|
||||||
|
|
||||||
def generate_gallery() -> str:
|
def generate_gallery() -> str:
|
||||||
"""Generate markdown content for the gallery"""
|
"""Generate markdown content for the gallery"""
|
||||||
|
|
||||||
# Start with header
|
# Start with header
|
||||||
content = ["# Reasoning Gym Dataset Gallery\n"]
|
content = ["# Reasoning Gym Dataset Gallery\n"]
|
||||||
content.append("This gallery shows examples from all available datasets using their default configurations.\n\n")
|
content.append("This gallery shows examples from all available datasets using their default configurations.\n\n")
|
||||||
|
|
||||||
# Add index
|
# Add index
|
||||||
content.append("## Available Datasets\n")
|
content.append("## Available Datasets\n")
|
||||||
for name in sorted(DATASETS.keys()):
|
for name in sorted(DATASETS.keys()):
|
||||||
|
|
@ -22,21 +24,21 @@ def generate_gallery() -> str:
|
||||||
anchor = name.replace("_", "-").lower()
|
anchor = name.replace("_", "-").lower()
|
||||||
content.append(f"- [{name}](#{anchor})\n")
|
content.append(f"- [{name}](#{anchor})\n")
|
||||||
content.append("\n")
|
content.append("\n")
|
||||||
|
|
||||||
# Add examples for each dataset
|
# Add examples for each dataset
|
||||||
content.append("## Dataset Examples\n")
|
content.append("## Dataset Examples\n")
|
||||||
for name in sorted(DATASETS.keys()):
|
for name in sorted(DATASETS.keys()):
|
||||||
dataset = create_dataset(name)
|
dataset = create_dataset(name)
|
||||||
|
|
||||||
# Add dataset header with anchor
|
# Add dataset header with anchor
|
||||||
anchor = name.replace("_", "-").lower()
|
anchor = name.replace("_", "-").lower()
|
||||||
content.append(f"### {name} {{{anchor}}}\n")
|
content.append(f"### {name} {{{anchor}}}\n")
|
||||||
|
|
||||||
# Get dataset class docstring if available
|
# Get dataset class docstring if available
|
||||||
if dataset.__class__.__doc__:
|
if dataset.__class__.__doc__:
|
||||||
doc = textwrap.dedent(dataset.__class__.__doc__.strip())
|
doc = textwrap.dedent(dataset.__class__.__doc__.strip())
|
||||||
content.append(f"{doc}\n\n")
|
content.append(f"{doc}\n\n")
|
||||||
|
|
||||||
# Show configuration
|
# Show configuration
|
||||||
content.append("Default configuration:\n")
|
content.append("Default configuration:\n")
|
||||||
content.append("```python\n")
|
content.append("```python\n")
|
||||||
|
|
@ -44,7 +46,7 @@ def generate_gallery() -> str:
|
||||||
if not key.startswith("_"):
|
if not key.startswith("_"):
|
||||||
content.append(f"{key} = {value}\n")
|
content.append(f"{key} = {value}\n")
|
||||||
content.append("```\n\n")
|
content.append("```\n\n")
|
||||||
|
|
||||||
# Show examples
|
# Show examples
|
||||||
content.append("Example tasks:\n")
|
content.append("Example tasks:\n")
|
||||||
content.append("```\n")
|
content.append("```\n")
|
||||||
|
|
@ -54,11 +56,11 @@ def generate_gallery() -> str:
|
||||||
content.append(f"Example {i+1}:\n")
|
content.append(f"Example {i+1}:\n")
|
||||||
content.append(f"Question: {item['question']}\n")
|
content.append(f"Question: {item['question']}\n")
|
||||||
content.append(f"Answer: {item['answer']}\n")
|
content.append(f"Answer: {item['answer']}\n")
|
||||||
if item.get('metadata'):
|
if item.get("metadata"):
|
||||||
content.append(f"Metadata: {item['metadata']}\n")
|
content.append(f"Metadata: {item['metadata']}\n")
|
||||||
content.append("\n")
|
content.append("\n")
|
||||||
content.append("```\n\n")
|
content.append("```\n\n")
|
||||||
|
|
||||||
return "".join(content)
|
return "".join(content)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -68,13 +70,13 @@ def main():
|
||||||
script_dir = Path(__file__).parent
|
script_dir = Path(__file__).parent
|
||||||
if not script_dir.exists():
|
if not script_dir.exists():
|
||||||
script_dir.mkdir(parents=True)
|
script_dir.mkdir(parents=True)
|
||||||
|
|
||||||
gallery_path = script_dir.parent / "GALLERY.md"
|
gallery_path = script_dir.parent / "GALLERY.md"
|
||||||
gallery_content = generate_gallery()
|
gallery_content = generate_gallery()
|
||||||
|
|
||||||
with open(gallery_path, "w") as f:
|
with open(gallery_path, "w") as f:
|
||||||
f.write(gallery_content)
|
f.write(gallery_content)
|
||||||
|
|
||||||
print(f"Generated gallery at {gallery_path}")
|
print(f"Generated gallery at {gallery_path}")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,46 +1,39 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from reasoning_gym.dataset import ReseedingDataset
|
|
||||||
from reasoning_gym.arithmetic.basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig
|
from reasoning_gym.arithmetic.basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig
|
||||||
|
from reasoning_gym.dataset import ReseedingDataset
|
||||||
|
|
||||||
|
|
||||||
def test_reseeding_dataset_iteration():
    """Test that ReseedingDataset provides infinite iteration with consistent chunks"""
    # Small deterministic base dataset to wrap.
    config = BasicArithmeticDatasetConfig(
        min_terms=2, max_terms=3, min_digits=1, max_digits=2, operators=["+"], allow_parentheses=False, seed=42, size=10
    )
    base_dataset = BasicArithmeticDataset(config)

    # Wrap with a small chunk size so reseeding happens within 10 items.
    chunk_size = 3
    infinite_dataset = ReseedingDataset(base_dataset, chunk_size=chunk_size)

    # Two independent passes over the first 10 items must agree exactly.
    first_items = [item["question"] for _, item in zip(range(10), infinite_dataset)]
    second_items = [item["question"] for _, item in zip(range(10), infinite_dataset)]
    assert first_items == second_items, "Items should be deterministic across iterations"

    # Consecutive chunks must not repeat each other.
    chunk1 = first_items[:chunk_size]
    chunk2 = first_items[chunk_size : 2 * chunk_size]
    assert chunk1 != chunk2, "Different chunks should generate different items"

    # score_answer is forwarded to the wrapped dataset.
    test_item = next(iter(infinite_dataset))
    assert infinite_dataset.score_answer("wrong", test_item) == 0.01
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue