This commit is contained in:
Cavit Erginsoy 2025-02-03 11:35:30 +00:00
parent 1e27021e11
commit 6c564b3dd9
13 changed files with 305 additions and 317 deletions

View file

@ -6,10 +6,10 @@ main.py Orchestrates the overall flow:
3. Upload the final dataset to HuggingFace Hub (if needed)
"""
import uuid
import sys
import uuid
from pathlib import Path
from typing import Dict, Any
from typing import Any, Dict
from examples.word_ladder.utils import create_word_ladders, generate_reasoning
@ -17,7 +17,7 @@ from examples.word_ladder.utils import create_word_ladders, generate_reasoning
def create_dataset(jsonl_path: Path, config: Dict[str, Any]) -> bool:
"""
Creates the word ladder dataset, handling potential exhaustion gracefully.
Returns:
bool: True if dataset was created (even if truncated), False if creation failed
"""
@ -25,7 +25,7 @@ def create_dataset(jsonl_path: Path, config: Dict[str, Any]) -> bool:
print("Step 1: Algorithmically creating word ladder chains...")
create_word_ladders.create_word_ladder_dataset(str(jsonl_path), config=config)
return True
except IndexError as e:
# Dataset was exhausted but some examples were generated
print("\nNote: Dataset generation stopped early due to exhaustion of unique puzzles.")
@ -34,23 +34,24 @@ def create_dataset(jsonl_path: Path, config: Dict[str, Any]) -> bool:
print("Continuing with the partial dataset that was successfully generated.")
return True
return False
except Exception as e:
# Unexpected error during dataset creation
print(f"\nError: Failed to create dataset: {str(e)}")
return False
def main():
# Centralized configuration for the dataset
config = {
'dataset_name': 'word_ladder',
'dataset_config': {
'min_word_length': 3,
'max_word_length': 5,
'min_chain_length':-1, # set to -1 for the shortest possible path
'max_chain_length':10,
'size': 100, # Generate a small-ish dataset for demonstration
}
"dataset_name": "word_ladder",
"dataset_config": {
"min_word_length": 3,
"max_word_length": 3,
"min_chain_length": -1, # set to -1 for the shortest possible path
"max_chain_length": 7,
"size": 2000, # Generate a small-ish dataset for demonstration
},
}
# Generate a friendly unique identifier and compose the file path
@ -64,21 +65,20 @@ def main():
print("Exiting due to dataset creation failure.")
sys.exit(1)
# Step 2: Generate reasoning
'''
try:
print("\nStep 2: Submitting reasoning batches for the dataset...")
generate_reasoning.submit_reasoning_batches(input_path=str(jsonl_path))
except Exception as e:
print(f"\nError: Failed to submit reasoning batches: {str(e)}")
sys.exit(1)
'''
# Step 3: Check Anthropic batch results
# Step 4: Upload to HuggingFace 🤗
print("\nComplete!")
if __name__ == "__main__":
main()
main()