mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
84 lines
2.7 KiB
Python
84 lines
2.7 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
main.py – Orchestrates the overall flow:
|
||
1. Generate word ladder sets
|
||
2. Submit chain-of-thought reasoning requests in batches via the LLM
|
||
3. Upload the final dataset to HuggingFace Hub (if needed)
|
||
"""
|
||
|
||
import sys
|
||
import uuid
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from examples.word_ladder.utils import create_word_ladders, generate_reasoning
|
||
|
||
|
||
def create_dataset(jsonl_path: Path, config: dict[str, Any]) -> bool:
    """
    Creates the word ladder dataset, handling potential exhaustion gracefully.

    Args:
        jsonl_path: Path handed to the generator (as a string) as the dataset
            file location.
        config: Dataset configuration, forwarded unchanged to the generator.

    Returns:
        bool: True if dataset was created (even if truncated), False if creation failed
    """
    try:
        print("Step 1: Algorithmically creating word ladder chains...")
        create_word_ladders.create_word_ladder_dataset(str(jsonl_path), config=config)
        return True

    except IndexError as e:
        # The generator ran out of unique puzzles; some examples may already
        # have been written before it stopped.
        print("\nNote: Dataset generation stopped early due to exhaustion of unique puzzles.")
        print(f"Reason: {e}")
        # Only a non-empty regular file counts as a usable partial dataset;
        # a missing or zero-byte file would leave nothing to submit later.
        if jsonl_path.is_file() and jsonl_path.stat().st_size > 0:
            print("Continuing with the partial dataset that was successfully generated.")
            return True
        return False

    except Exception as e:
        # Unexpected error during dataset creation: report it and signal
        # failure to the caller instead of propagating.
        print(f"\nError: Failed to create dataset: {e}")
        return False
|
||
|
||
|
||
def main() -> None:
    """
    Run the word ladder pipeline end to end.

    Builds the dataset configuration, generates the puzzles into a uniquely
    named JSONL file inside an ``output`` directory next to this script, and
    then submits that file for reasoning generation. Exits the process with
    status 1 if dataset creation or batch submission fails.
    """
    # Centralized configuration for the dataset
    config = {
        "dataset_name": "word_ladder",
        "dataset_config": {
            "min_word_length": 3,
            "max_word_length": 3,
            "min_chain_length": -1,  # set to -1 for the shortest possible path
            "max_chain_length": 7,
            "size": 2000,  # Generate a small-ish dataset for demonstration
        },
    }

    # Generate a friendly unique identifier and compose the file path
    unique_id = uuid.uuid4().hex[:8]
    output_dir = Path(__file__).resolve().parent / "output"
    output_dir.mkdir(exist_ok=True)  # Create output directory if it doesn't exist
    jsonl_path = output_dir / f"word_ladders_{unique_id}.jsonl"

    # Step 1: Create the dataset
    if not create_dataset(jsonl_path, config):
        print("Exiting due to dataset creation failure.")
        sys.exit(1)

    # Step 2: Generate reasoning
    try:
        print("\nStep 2: Submitting reasoning batches for the dataset...")
        generate_reasoning.submit_reasoning_batches(input_path=str(jsonl_path))
    except Exception as e:
        # Any submission failure is fatal for the run: report it and exit.
        print(f"\nError: Failed to submit reasoning batches: {e}")
        sys.exit(1)

    # Step 3: Check Anthropic batch results (not implemented in this script)
    # Step 4: Upload to HuggingFace 🤗 (not implemented in this script)

    print("\nComplete!")
|
||
|
||
|
||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|