atropos/example_trainer/configs/math_zero_lora.yaml
Jai Suphavadeeprasit 527433b5bc change OPD style
2026-02-19 17:08:27 -05:00

16 lines
524 B
YAML

# Environment settings for the "math-zero-lora-env" run.
# Every setting below must be indented under `env`: at column 0 the keys
# would parse as separate top-level keys and `env` itself would be null.
env:
  # Tokenizer identifier and the rollout server endpoint used by the env.
  tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
  rollout_server_url: "http://localhost:8002"

  # Token-length settings.
  # NOTE(review): start_tok_length equals max_token_length here; confirm
  # that starting at the maximum is intended.
  max_token_length: 8192
  start_tok_length: 8192

  # Sampling / schedule sizes.
  group_size: 8
  batch_size: 64
  total_steps: 120
  steps_per_eval: 20

  # Weights & Biases logging.
  use_wandb: true
  wandb_name: "math-zero-lora-env"

  # Optional teacher-behavior steering for on-policy distillation.
  # Uncomment to enable; keep these at the same indentation as the other
  # `env` keys so they stay inside this mapping.
  # teacher_system_prompt: "Use simple language and avoid jargon."
  # teacher_prefix_text: "Style: concise, non-jargony.\n\n"

  # NOTE(review): presumably the fraction of the eval set used per
  # evaluation; verify against the consumer of this config.
  eval_limit_ratio: 0.1
  max_num_workers_per_node: 24