reasoning-gym/scripts/hf_dataset/example_hf_dataset_config.yaml
rasdani dca8117e7e
Feat: Add script to save datasets on HuggingFace (#416)
* feat: add script to save datasets on HuggingFace

* fix

* refactor

* fix formatting

---------

Co-authored-by: Oliver Stanley <olivergestanley@gmail.com>
2025-04-28 18:04:53 +01:00

44 lines
1.1 KiB
YAML

# Example configuration for generating Reasoning Gym datasets for Hugging Face
# Used with save_hf_dataset.py
#
# Layout:
#   reasoning_gym - what to generate (total size + weighted dataset mix)
#   huggingface   - where/how to upload the generated dataset

reasoning_gym:
  # Total size of the dataset to generate
  dataset_size: 20000

  # Datasets to include in the composite dataset
  datasets:
    # Example algorithmic tasks
    spell_backward:
      # Weight of this dataset in the composite (weights should sum to 1.0;
      # here 0.33 + 0.34 + 0.33 = 1.0)
      weight: 0.33
      # Dataset-specific configuration
      config:
        min_word_len: 3
        max_word_len: 10

    letter_jumble:
      weight: 0.34
      config:
        min_word_len: 1  # Minimum word length
        max_word_len: 50  # Maximum word length
        min_words: 3  # Minimum words per task
        max_words: 40  # Maximum words per task

    word_sorting:
      weight: 0.33
      config:
        min_words: 3
        max_words: 10
        # NOTE: this dataset spells the keys `*_word_length`, unlike the
        # `*_word_len` keys used by the two datasets above.
        min_word_length: 3
        max_word_length: 12

# Hugging Face upload settings
huggingface:
  # Repository ID (required when using this config file directly)
  repo_id: "username/reasoning-gym-dataset"
  # Whether to make the repository private
  private: false
  # Dataset split name
  split: "train"