mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
* feat: add script to save datasets on HuggingFace * fix * refactor * fix formatting --------- Co-authored-by: Oliver Stanley <olivergestanley@gmail.com>
44 lines
1.1 KiB
YAML
44 lines
1.1 KiB
YAML
# Example configuration for generating Reasoning Gym datasets for Hugging Face
# Used with save_hf_dataset.py

reasoning_gym:
  # Total size of the dataset to generate
  dataset_size: 20000

  # Datasets to include in the composite dataset
  datasets:
    # Example algorithmic tasks
    spell_backward:
      # Weight of this dataset in the composite (weights should sum to 1.0;
      # the three weights below are 0.33 + 0.34 + 0.33)
      weight: 0.33
      # Dataset-specific configuration
      config:
        min_word_len: 3
        max_word_len: 10

    letter_jumble:
      weight: 0.34
      config:
        min_word_len: 1  # Minimum word length
        max_word_len: 50  # Maximum word length
        min_words: 3  # Minimum words per task
        max_words: 40  # Maximum words per task

    word_sorting:
      weight: 0.33
      config:
        min_words: 3
        max_words: 10
        min_word_length: 3
        max_word_length: 12

# Hugging Face upload settings
huggingface:
  # Repository ID (required when using this config file directly)
  repo_id: "username/reasoning-gym-dataset"

  # Whether to make the repository private
  private: false

  # Dataset split name
  split: "train"