mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-22 16:48:57 +00:00
restart issues
This commit is contained in:
parent
2364d9d8f8
commit
917193d2ea
3 changed files with 172 additions and 23 deletions
|
|
@ -300,11 +300,12 @@ LORA_ONLY_TRAINER_PID=$!
|
|||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
|
||||
# NOTE: lora_restart shares GPU with trainer's model (~8GB), so use lower vLLM memory
|
||||
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_restart \
|
||||
--vllm-port $LORA_RESTART_VLLM_PORT \
|
||||
--vllm-gpu-memory-utilization 0.70 \
|
||||
--vllm-gpu-memory-utilization 0.20 \
|
||||
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue