Feat: Add script to save datasets on HuggingFace (#416)

* feat: add script to save datasets on HuggingFace
* fix
* refactor
* fix formatting

---------

Co-authored-by: Oliver Stanley <olivergestanley@gmail.com>
2026-04-19 12:58:07 +00:00 · 2025-04-28 19:04:53 +02:00 · 2025-04-28 19:04:53 +02:00 · dca8117e7e
commit dca8117e7e
parent 73e3cb33a4
3 changed files with 279 additions and 0 deletions
--- a/scripts/hf_dataset/example_hf_dataset_config.yaml
+++ b/scripts/hf_dataset/example_hf_dataset_config.yaml
@@ -0,0 +1,44 @@
+# Example configuration for generating Reasoning Gym datasets for Hugging Face
+# Used with save_hf_dataset.py
+
+reasoning_gym:
+  # Total size of the dataset to generate
+  dataset_size: 20000
+
+  # Datasets to include in the composite dataset
+  datasets:
+    # Example algorithmic tasks
+    spell_backward:
+      # Weight of this dataset in the composite (weights should sum to 1.0)
+      weight: 0.33
+      # Dataset-specific configuration
+      config:
+        min_word_len: 3
+        max_word_len: 10
+
+    letter_jumble:
+      weight: 0.34
+      config:
+        min_word_len: 1  # Minimum word length
+        max_word_len: 50  # Maximum word length
+        min_words: 3  # Minimum words per task
+        max_words: 40
+
+    word_sorting:
+      weight: 0.33
+      config:
+        min_words: 3
+        max_words: 10
+        min_word_length: 3
+        max_word_length: 12
+
+# Hugging Face upload settings
+huggingface:
+  # Repository ID (required when using this config file directly)
+  repo_id: "username/reasoning-gym-dataset"
+
+  # Whether to make the repository private
+  private: false
+
+  # Dataset split name
+  split: "train"