mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Feat: Add script to save datasets on HuggingFace (#416)
* feat: add script to save datasets on HuggingFace
* fix
* refactor
* fix formatting

---------

Co-authored-by: Oliver Stanley <olivergestanley@gmail.com>
This commit is contained in:
parent
73e3cb33a4
commit
dca8117e7e
3 changed files with 279 additions and 0 deletions
44
scripts/hf_dataset/example_hf_dataset_config.yaml
Normal file
44
scripts/hf_dataset/example_hf_dataset_config.yaml
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
# Example configuration for generating Reasoning Gym datasets for Hugging Face
|
||||
# Used with save_hf_dataset.py
|
||||
|
||||
# Settings for the composite dataset: its overall size and the weighted
# datasets it is built from.
reasoning_gym:
  # Total size of the dataset to generate
  dataset_size: 20000

  # Datasets to include in the composite dataset
  datasets:
    # Example algorithmic tasks
    spell_backward:
      # Weight of this dataset in the composite (weights should sum to 1.0)
      weight: 0.33
      # Dataset-specific configuration
      config:
        min_word_len: 3
        max_word_len: 10

    letter_jumble:
      weight: 0.34
      config:
        min_word_len: 1  # Minimum word length
        max_word_len: 50  # Maximum word length
        min_words: 3  # Minimum words per task
        max_words: 40  # Maximum words per task

    word_sorting:
      weight: 0.33
      config:
        min_words: 3
        max_words: 10
        # NOTE(review): spelled *_word_length here but *_word_len in the two
        # datasets above - presumably each dataset defines its own key names;
        # confirm against the dataset's config before renaming.
        min_word_length: 3
        max_word_length: 12
|
||||
|
||||
# Hugging Face upload settings
huggingface:
  # Repository ID (required when using this config file directly).
  # The value below is a placeholder in "<owner>/<dataset-name>" form.
  repo_id: "username/reasoning-gym-dataset"

  # Whether to make the repository private
  private: false

  # Dataset split name
  split: "train"
|
||||
Loading…
Add table
Add a link
Reference in a new issue