mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-30 17:40:36 +00:00
train
This commit is contained in:
parent
7e1de80695
commit
b298a0eeb6
2 changed files with 492 additions and 0 deletions
28
configs/rubiks_training.yaml
Normal file
28
configs/rubiks_training.yaml
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
---
# Rubik's Cube Solver Training Configuration

# Flattened configuration for TrainerConfig
model_name: "NousResearch/DeepHermes-3-Llama-3-8B-Preview"  # Target model for training
learning_rate: 5.0e-6
batch_size: 8
gradient_accumulation_steps: 4
sequence_length: 2048
warmup_steps: 100

total_steps: 2000
eval_every: 50
save_every: 250
checkpoint_dir: "./rubiks_checkpoints"
use_wandb: true
wandb_project: "atropos-rubiks-cube"
wandb_run_name: "rubiks-solver-training"

# NOTE(review): machine-specific absolute path — will break on any other host;
# consider a repo-relative path. Value kept as committed.
train_file: "/Users/joshuajerin/Desktop/jarvis/atropos/environments/rubiks_process_results_22.jsonl"
validation_size: 0.1  # 10% for validation
prefer_higher_scores: true
max_samples: -1  # Use all samples

method: "GRPO"  # Group Relative Policy Optimization
temperature: 0.7
top_p: 0.9
beta: 0.1  # KL penalty coefficient
reference_model: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"  # Smaller reference model
|
||||
Loading…
Add table
Add a link
Reference in a new issue