restart issues

This commit is contained in:
Jai Suphavadeeprasit 2026-02-12 17:14:11 -05:00
parent 2364d9d8f8
commit 917193d2ea
3 changed files with 172 additions and 23 deletions

View file

@@ -300,11 +300,12 @@ LORA_ONLY_TRAINER_PID=$!
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
# NOTE: lora_restart shares the GPU with the trainer's model (~8GB), so use a lower vLLM GPU memory utilization.
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_restart \
--vllm-port $LORA_RESTART_VLLM_PORT \
--vllm-gpu-memory-utilization 0.70 \
--vllm-gpu-memory-utilization 0.20 \
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \