diff --git a/pyproject.toml b/pyproject.toml
index 49f2d033..3129330b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,9 @@ cli = [
     "pyyaml>=6.0.1",
     "httpx>=0.27.0",
 ]
+scripts = [
+    "datasets>=3.5.0"
+]
 
 [project.urls]
 "Homepage" = "https://github.com/open-thought/reasoning-gym"
diff --git a/scripts/hf_dataset/example_hf_dataset_config.yaml b/scripts/hf_dataset/example_hf_dataset_config.yaml
new file mode 100644
index 00000000..7388c038
--- /dev/null
+++ b/scripts/hf_dataset/example_hf_dataset_config.yaml
@@ -0,0 +1,44 @@
+# Example configuration for generating Reasoning Gym datasets for Hugging Face
+# Used with save_hf_dataset.py (command line options take precedence over this file)
+
+reasoning_gym:
+  # Total size of the dataset to generate (overridden by --size)
+  dataset_size: 20000
+
+  # Datasets to include in the composite dataset
+  datasets:
+    # Example algorithmic tasks
+    spell_backward:
+      # Weight of this dataset in the composite (weights should sum to 1.0)
+      weight: 0.33
+      # Dataset-specific configuration
+      config:
+        min_word_len: 3
+        max_word_len: 10
+
+    letter_jumble:
+      weight: 0.34
+      config:
+        min_word_len: 1  # Minimum word length
+        max_word_len: 50  # Maximum word length
+        min_words: 3  # Minimum words per task
+        max_words: 40  # Maximum words per task
+
+    word_sorting:
+      weight: 0.33
+      config:
+        min_words: 3
+        max_words: 10
+        min_word_length: 3
+        max_word_length: 12
+
+# Hugging Face upload settings
+huggingface:
+  # Repository ID (required here unless --repo-id is given on the command line)
+  repo_id: "username/reasoning-gym-dataset"
+
+  # Whether to make the repository private (--private also enables this)
+  private: false
+
+  # Dataset split name (overridden by --split)
+  split: "train"
diff --git a/scripts/hf_dataset/save_hf_dataset.py b/scripts/hf_dataset/save_hf_dataset.py
new file mode 100644
index 00000000..641b9784
--- /dev/null
+++ b/scripts/hf_dataset/save_hf_dataset.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+"""
+Script to generate Reasoning Gym datasets and save them to the Hugging Face Hub.
+
+Settings are resolved with the precedence: command line > YAML config file >
+built-in defaults.
+"""
+
+import argparse
+from typing import Dict, List, Optional
+
+import yaml
+from datasets import Dataset
+from tqdm import tqdm
+
+from reasoning_gym.composite import DatasetSpec
+from reasoning_gym.factory import DATASETS, create_dataset
+
+# Fallbacks used when neither the command line nor the config file sets a value
+DEFAULT_DATASET_SIZE = 20000
+DEFAULT_SPLIT = "train"
+
+
+def generate_dataset(
+    dataset_names: List[str],
+    dataset_size: int = DEFAULT_DATASET_SIZE,
+    seed: int = 42,
+    weights: Optional[Dict[str, float]] = None,
+    configs: Optional[Dict[str, Dict]] = None,
+) -> Dataset:
+    """
+    Generate a dataset from the specified Reasoning Gym datasets.
+
+    The `weights` and `configs` mappings passed by the caller are not modified.
+
+    Args:
+        dataset_names: List of dataset names to include
+        dataset_size: Total size of the dataset to generate
+        seed: Random seed for dataset generation
+        weights: Optional dictionary mapping dataset names to weights.
+            Names without an entry get weight 0.0 (a warning is printed).
+        configs: Optional dictionary mapping dataset names to configurations.
+            Names without an entry get an empty configuration.
+
+    Returns:
+        A Hugging Face Dataset object
+
+    Raises:
+        ValueError: If `dataset_names` is empty or contains an unknown name
+    """
+    if not dataset_names:
+        raise ValueError("At least one dataset name is required")
+
+    # Validate dataset names
+    for name in dataset_names:
+        if name not in DATASETS:
+            raise ValueError(f"Dataset '{name}' not found. Available datasets: {sorted(DATASETS.keys())}")
+
+    # Resolve weights into a fresh dict so the caller's mapping is never mutated
+    if weights is None:
+        equal_weight = 1.0 / len(dataset_names)
+        resolved_weights = {name: equal_weight for name in dataset_names}
+    else:
+        resolved_weights = {}
+        for name in dataset_names:
+            if name not in weights:
+                print(f"Warning: No weight provided for {name}, setting to 0.0")
+            resolved_weights[name] = weights.get(name, 0.0)
+
+    # Datasets without an explicit (non-empty) config get an empty one
+    provided_configs = configs or {}
+    resolved_configs = {name: provided_configs.get(name) or {} for name in dataset_names}
+
+    # Create dataset specs
+    dataset_specs = [
+        DatasetSpec(name=name, weight=resolved_weights[name], config=resolved_configs[name])
+        for name in dataset_names
+    ]
+
+    # Create composite dataset
+    data_source = create_dataset("composite", seed=seed, size=dataset_size, datasets=dataset_specs)
+
+    # Generate all examples
+    examples = [data_source[idx] for idx in tqdm(range(dataset_size), desc="Generating examples")]
+
+    # Convert to HF Dataset
+    return Dataset.from_list(examples)
+
+
+def save_to_hub(
+    dataset: Dataset,
+    repo_id: str,
+    token: Optional[str] = None,
+    private: bool = False,
+    commit_message: str = "Upload reasoning_gym dataset",
+    split: Optional[str] = None,
+) -> str:
+    """
+    Save the dataset to the Hugging Face Hub.
+
+    Args:
+        dataset: HF Dataset to save
+        repo_id: Hugging Face repo ID (e.g., "username/dataset-name")
+        token: HF API token
+        private: Whether the repository should be private
+        commit_message: Commit message
+        split: Dataset split name, forwarded to `Dataset.push_to_hub`
+
+    Returns:
+        URL of the uploaded dataset
+    """
+    # Forward `split` as well; it used to be accepted but never passed on
+    dataset.push_to_hub(
+        repo_id,
+        split=split,
+        token=token,
+        private=private,
+        commit_message=commit_message,
+    )
+
+    url = f"https://huggingface.co/datasets/{repo_id}"
+    print(f"Dataset pushed to {url}")
+    return url
+
+
+def load_config(config_path: str) -> dict:
+    """
+    Load dataset configuration from a YAML file.
+
+    Args:
+        config_path: Path to the YAML configuration file
+
+    Returns:
+        Dictionary containing the configuration (empty for an empty file)
+
+    Raises:
+        ValueError: If the top-level YAML value is not a mapping
+    """
+    with open(config_path, "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+
+    # yaml.safe_load returns None for an empty document
+    if config is None:
+        return {}
+    if not isinstance(config, dict):
+        raise ValueError(f"Config file '{config_path}' must contain a YAML mapping at the top level")
+    return config
+
+
+def main():
+    """Parse the command line, generate the composite dataset and upload it."""
+    parser = argparse.ArgumentParser(description="Generate and upload Reasoning Gym datasets to HF Hub")
+    parser.add_argument("--dataset", type=str, required=False, help="Dataset names (comma-separated list)")
+    parser.add_argument("--config", type=str, required=False, help="Path to dataset configuration YAML file")
+    # --size and --split default to None so an explicit value can be told apart from "not given"
+    parser.add_argument(
+        "--size", type=int, default=None, help=f"Total dataset size (default: {DEFAULT_DATASET_SIZE})"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--repo-id", type=str, help="Hugging Face repository ID (e.g., 'username/dataset-name')")
+    parser.add_argument("--private", action="store_true", help="Make the HF repository private")
+    parser.add_argument(
+        "--split",
+        type=str,
+        choices=["train", "test", "validation"],
+        default=None,
+        help=f"Dataset split name (default: {DEFAULT_SPLIT})",
+    )
+    args = parser.parse_args()
+
+    # Load the config file once; a missing or null section behaves like an empty one
+    config = load_config(args.config) if args.config else {}
+    rg_config = config.get("reasoning_gym") or {}
+    hf_config = config.get("huggingface") or {}
+
+    # Command line values win; the config file only fills in what was not given
+    repo_id = args.repo_id or hf_config.get("repo_id")
+    if not repo_id:
+        parser.error(
+            "--repo-id is required. Provide it via command line or in the config file under huggingface.repo_id"
+        )
+    size = args.size if args.size is not None else rg_config.get("dataset_size", DEFAULT_DATASET_SIZE)
+    split = args.split or hf_config.get("split", DEFAULT_SPLIT)
+    # --private can only switch privacy on, so the config can never undo an explicit flag
+    private = args.private or bool(hf_config.get("private", False))
+
+    # Datasets, weights and per-dataset configs from the config file
+    dataset_names = []
+    weights = {}
+    configs = {}
+    config_datasets = rg_config.get("datasets") or {}
+    for name, ds_config in config_datasets.items():
+        ds_config = ds_config or {}  # a bare "name:" entry parses as None
+        dataset_names.append(name)
+        weights[name] = ds_config.get("weight", 1.0 / len(config_datasets))
+        configs[name] = ds_config.get("config") or {}
+
+    # Override with command line arguments if provided
+    if args.dataset:
+        dataset_names = [name.strip() for name in args.dataset.split(",") if name.strip()]
+        if dataset_names:
+            # Reset weights if datasets are provided
+            equal_weight = 1.0 / len(dataset_names)
+            weights = {name: equal_weight for name in dataset_names}
+
+    # Validate that we have dataset names
+    if not dataset_names:
+        parser.error("No datasets specified. Use --dataset or --config to specify datasets.")
+
+    print(f"Generating dataset with {len(dataset_names)} datasets: {', '.join(dataset_names)}")
+    print(f"Dataset size: {size}")
+    print(f"Dataset seed: {args.seed}")
+    print(f"Repository ID: {repo_id}")
+
+    # Generate the dataset
+    dataset = generate_dataset(
+        dataset_names=dataset_names,
+        dataset_size=size,
+        seed=args.seed,
+        weights=weights,
+        configs=configs,
+    )
+
+    # Save to hub with specified split
+    save_to_hub(
+        dataset=dataset,
+        repo_id=repo_id,
+        private=private,
+        commit_message=f"Upload reasoning_gym dataset with {len(dataset_names)} datasets: {', '.join(dataset_names)}",
+        split=split,
+    )
+
+    print("Done!")
+
+
+if __name__ == "__main__":
+    # Example usage:
+    # python save_hf_dataset.py --config example_hf_dataset_config.yaml
+    # python save_hf_dataset.py --dataset "number_sorting,spell_backward,word_sorting" --repo-id "username/reasoning-gym-dataset"
+    main()