# atropos/configs/rubiks_training.yaml
# Joshua Jerin b298a0eeb6 train
# 2025-05-18 16:00:43 -07:00
#
# 28 lines
# No EOL
# 883 B
# YAML
---
# Rubik's Cube Solver Training Configuration
# Flattened configuration for TrainerConfig

# --- Model & optimization ---
model_name: "NousResearch/DeepHermes-3-Llama-3-8B-Preview" # Target model for training
learning_rate: 5.0e-6
batch_size: 8
gradient_accumulation_steps: 4 # effective batch size = 8 * 4 = 32
sequence_length: 2048 # max tokens per training sample
warmup_steps: 100
total_steps: 2000
eval_every: 50 # evaluate every N optimizer steps
save_every: 250 # checkpoint every N optimizer steps
checkpoint_dir: "./rubiks_checkpoints"

# --- Logging (Weights & Biases) ---
use_wandb: true
wandb_project: "atropos-rubiks-cube"
wandb_run_name: "rubiks-solver-training"

# --- Data ---
# NOTE(review): absolute, machine-specific path — prefer a repo-relative path
# (e.g. "./environments/rubiks_process_results_22.jsonl") so this config is
# portable across machines; confirm before changing, as the trainer may
# resolve paths from a different working directory.
train_file: "/Users/joshuajerin/Desktop/jarvis/atropos/environments/rubiks_process_results_22.jsonl"
validation_size: 0.1 # 10% for validation
prefer_higher_scores: true
max_samples: -1 # Use all samples (no cap)

# --- RL method (GRPO) ---
method: "GRPO" # Group Relative Policy Optimization
temperature: 0.7 # sampling temperature for rollouts
top_p: 0.9 # nucleus sampling cutoff
beta: 0.1 # KL penalty coefficient against the reference model
# NOTE(review): "DeepHermes-3-Llama-3-3B-Preview" does not match a published
# model id — the 3B DeepHermes release is based on Llama 3.2
# ("NousResearch/DeepHermes-3-Llama-3.2-3B-Preview"); verify this name loads.
# Also note GRPO conventionally uses the *same* model as the KL reference;
# confirm a smaller reference model is intentional.
reference_model: "NousResearch/DeepHermes-3-Llama-3-3B-Preview" # Smaller reference model