#!/usr/bin/env bash
set -euo pipefail

# Runs three GSM8K test trainings with separate infra/ports:
# 1) shared_vllm
# 2) lora_only (+ layer filtering support)
# 3) lora_restart (+ layer filtering support)
#
# Usage:
#   chmod +x example_trainer/run_gsm8k_lora_matrix.sh
#   ./example_trainer/run_gsm8k_lora_matrix.sh
#
# Optional environment overrides:
#   MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
#   TRAINING_STEPS=10
#   LORA_LAYER_INDICES="20-31"
#   WANDB_PROJECT="gsm8k-grpo-smoke"
#   WANDB_GROUP="gsm8k-$(date +%Y%m%d-%H%M%S)"
#   START_API_PORT=8002
#   START_VLLM_PORT=9001
#   PYTHON_BIN=python3
#   OUTPUT_BASE_DIR="$PWD"      # logs/saves base (defaults to launch directory)
#   SHARED_GPU=0
#   LORA_ONLY_TRAINER_GPU=1
#   LORA_ONLY_VLLM_GPU=2
#   LORA_RESTART_TRAINER_GPU=3
#   LORA_RESTART_VLLM_GPU=4
#   DRY_RUN=1                   # print commands only, do not execute
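#
# Example invocation (illustrative values; adjust GPUs/ports to your machine):
#   DRY_RUN=1 TRAINING_STEPS=2 LORA_LAYER_INDICES="20-31" \
#     ./example_trainer/run_gsm8k_lora_matrix.sh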

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LAUNCH_DIR="$PWD"
cd "$ROOT_DIR"

PYTHON_BIN="${PYTHON_BIN:-python3}"
MODEL_NAME="${MODEL_NAME:-NousResearch/Hermes-3-Llama-3.1-8B}"
TRAINING_STEPS="${TRAINING_STEPS:-10}"
BATCH_SIZE="${BATCH_SIZE:-4}"
GRAD_ACCUM="${GRAD_ACCUM:-4}"
LR="${LR:-1e-5}"
KL_COEF="${KL_COEF:-0.1}"
CLIP_EPS="${CLIP_EPS:-0.2}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.45}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
DTYPE="${DTYPE:-bfloat16}"
LORA_R="${LORA_R:-16}"
LORA_ALPHA="${LORA_ALPHA:-32}"
LORA_DROPOUT="${LORA_DROPOUT:-0.05}"
LORA_TARGET_MODULES="${LORA_TARGET_MODULES:-q_proj v_proj}"
LORA_LAYER_INDICES="${LORA_LAYER_INDICES:-}"
WANDB_PROJECT="${WANDB_PROJECT:-gsm8k-grpo-smoke}"
WANDB_GROUP="${WANDB_GROUP:-gsm8k-$(date +%Y%m%d-%H%M%S)}"
START_API_PORT="${START_API_PORT:-8002}"
START_VLLM_PORT="${START_VLLM_PORT:-9001}"
OUTPUT_BASE_DIR="${OUTPUT_BASE_DIR:-$LAUNCH_DIR}"

# GPU pinning (prefer one process per GPU)
SHARED_GPU="${SHARED_GPU:-0}"
LORA_ONLY_TRAINER_GPU="${LORA_ONLY_TRAINER_GPU:-1}"
LORA_ONLY_VLLM_GPU="${LORA_ONLY_VLLM_GPU:-2}"
LORA_RESTART_TRAINER_GPU="${LORA_RESTART_TRAINER_GPU:-3}"
LORA_RESTART_VLLM_GPU="${LORA_RESTART_VLLM_GPU:-4}"
DRY_RUN="${DRY_RUN:-0}"

SHARED_API_PORT="$START_API_PORT"
SHARED_VLLM_PORT="$START_VLLM_PORT"
LORA_ONLY_API_PORT="$((START_API_PORT + 1))"
LORA_ONLY_VLLM_PORT="$((START_VLLM_PORT + 1))"
LORA_RESTART_API_PORT="$((START_API_PORT + 2))"
LORA_RESTART_VLLM_PORT="$((START_VLLM_PORT + 2))"

run_pids=()
run_ports=()

log() {
  echo "[$(date '+%H:%M:%S')] $*"
}

kill_port() {
  local port="$1"
  if lsof -i ":${port}" -sTCP:LISTEN >/dev/null 2>&1; then
    lsof -ti ":${port}" -sTCP:LISTEN | xargs -r kill -9 || true
  fi
}
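# Note: kill_port depends on lsof. On hosts without it, `fuser -k "${port}/tcp"`
# is an approximate substitute (not used here, to keep behavior unchanged).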

wait_for_http() {
  local url="$1"
  local timeout="${2:-180}"
  local name="${3:-endpoint}"
  local start
  start="$(date +%s)"
  while true; do
    if curl -fsS "$url" >/dev/null 2>&1; then
      log "Ready: ${name} (${url})"
      return 0
    fi
    if (( $(date +%s) - start > timeout )); then
      log "Timeout waiting for ${name}: ${url}"
      return 1
    fi
    sleep 2
  done
}
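# Example: wait_for_http "http://localhost:${START_API_PORT}/info" 60 "run-api"
# Under `set -e`, a timeout here aborts the script and the EXIT trap cleans up.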

start_process() {
  local name="$1"
  local logfile="$2"
  shift 2
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] start ${name} (log: ${logfile})"
    printf ' '
    printf '%q ' "$@"
    printf '\n'
    return 0
  fi
  log "Starting ${name} (log: ${logfile})"
  "$@" >"$logfile" 2>&1 &
  local pid=$!
  run_pids+=("$pid")
  log "${name} PID=${pid}"
}
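# Example: start_process "run_api" "$mode_dir/run_api.log" run-api --port "$api_port"
# The command runs in the background; its PID is recorded for cleanup_run.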

cleanup_run() {
  log "Cleaning up run processes..."
  for pid in "${run_pids[@]:-}"; do
    kill "$pid" >/dev/null 2>&1 || true
  done
  sleep 1
  for pid in "${run_pids[@]:-}"; do
    kill -9 "$pid" >/dev/null 2>&1 || true
  done
  for port in "${run_ports[@]:-}"; do
    kill_port "$port"
  done
  run_pids=()
  run_ports=()
}

add_lora_layer_flag() {
  if [[ -n "$LORA_LAYER_INDICES" ]]; then
    echo "--lora-layer-indices" "$LORA_LAYER_INDICES"
  fi
}
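# add_lora_layer_flag prints nothing when LORA_LAYER_INDICES is empty, so the
# unquoted $(add_lora_layer_flag) expansions below contribute zero arguments.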

common_trainer_flags() {
  echo \
    --model-name "$MODEL_NAME" \
    --training-steps "$TRAINING_STEPS" \
    --batch-size "$BATCH_SIZE" \
    --gradient-accumulation-steps "$GRAD_ACCUM" \
    --lr "$LR" \
    --kl-coef "$KL_COEF" \
    --clip-eps "$CLIP_EPS" \
    --use-wandb \
    --wandb-project "$WANDB_PROJECT" \
    --wandb-group "$WANDB_GROUP"
}
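# Both helpers above are consumed unquoted, via $(common_trainer_flags) and
# $(add_lora_layer_flag), and rely on word splitting; override values are
# therefore assumed to contain no whitespace.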

start_gsm8k_env() {
  local vllm_port="$1"
  local logfile="$2"
  start_process "gsm8k_env" "$logfile" \
    "$PYTHON_BIN" environments/gsm8k_server.py serve \
    --env.group_size 4 \
    --env.max_num 200 \
    --slurm.num_requests_per_time_interval 16 \
    --slurm.time_interval 10 \
    --openai.api_key "dummy" \
    --openai.base_url "http://localhost:${vllm_port}/v1" \
    --openai.model_name "$MODEL_NAME" \
    --openai.server_type vllm
}
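# The environment talks to vLLM's OpenAI-compatible endpoint; "dummy" is a
# placeholder key, since a local vLLM server started without --api-key does
# not validate it.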

run_shared_vllm() {
  log "========== RUN: shared_vllm =========="
  local api_port="$SHARED_API_PORT"
  local vllm_port="$SHARED_VLLM_PORT"
  local mode_dir="${OUTPUT_BASE_DIR}/logs/gsm8k_shared_vllm"
  local save_dir="${OUTPUT_BASE_DIR}/saves/gsm8k_shared_vllm"
  local bridge_dir="${mode_dir}/bridge"
  mkdir -p "$mode_dir"
  mkdir -p "$save_dir"
  mkdir -p "$bridge_dir"

  run_ports+=("$api_port" "$vllm_port")
  kill_port "$api_port"
  kill_port "$vllm_port"

  start_process "run_api" "$mode_dir/run_api.log" run-api --port "$api_port"
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] wait for http://localhost:${api_port}/info"
  else
    wait_for_http "http://localhost:${api_port}/info" 60 "run-api"
  fi

  start_process "vllm_shared" "$mode_dir/vllm.log" \
    env CUDA_VISIBLE_DEVICES="$SHARED_GPU" VLLM_ENABLE_SHARED_WEIGHTS=1 LOGDIR="$bridge_dir" \
    "$PYTHON_BIN" -m example_trainer.vllm_api_server \
    --model "$MODEL_NAME" \
    --port "$vllm_port" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --max-model-len "$MAX_MODEL_LEN" \
    --dtype "$DTYPE" \
    --enforce-eager
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] wait for http://localhost:${vllm_port}/health"
  else
    wait_for_http "http://localhost:${vllm_port}/health" 300 "shared vLLM"
  fi

  start_gsm8k_env "$vllm_port" "$mode_dir/env.log"

  log "Starting trainer: shared_vllm"
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] trainer command (shared_vllm):"
    printf ' '
    printf '%q ' env CUDA_VISIBLE_DEVICES="$SHARED_GPU" "$PYTHON_BIN" -m example_trainer.grpo \
      $(common_trainer_flags) \
      --weight-bridge-mode shared_vllm \
      --device cuda:0 \
      --save-path "$save_dir" \
      --vllm-port "$vllm_port" \
      --vllm-config-path "${bridge_dir}/vllm_bridge_config.json" \
      --atropos-url "http://localhost:${api_port}"
    printf '\n'
    log "[DRY RUN] trainer log path: $mode_dir/trainer.log"
  else
    env CUDA_VISIBLE_DEVICES="$SHARED_GPU" "$PYTHON_BIN" -m example_trainer.grpo \
      $(common_trainer_flags) \
      --weight-bridge-mode shared_vllm \
      --device cuda:0 \
      --save-path "$save_dir" \
      --vllm-port "$vllm_port" \
      --vllm-config-path "${bridge_dir}/vllm_bridge_config.json" \
      --atropos-url "http://localhost:${api_port}" | tee "$mode_dir/trainer.log"
  fi

  cleanup_run
}

run_lora_only() {
  log "========== RUN: lora_only =========="
  local api_port="$LORA_ONLY_API_PORT"
  local vllm_port="$LORA_ONLY_VLLM_PORT"
  local mode_dir="${OUTPUT_BASE_DIR}/logs/gsm8k_lora_only"
  local save_dir="${OUTPUT_BASE_DIR}/saves/gsm8k_lora_only"
  mkdir -p "$mode_dir"
  mkdir -p "$save_dir"

  run_ports+=("$api_port" "$vllm_port")
  kill_port "$api_port"
  kill_port "$vllm_port"

  start_process "run_api" "$mode_dir/run_api.log" run-api --port "$api_port"
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] wait for http://localhost:${api_port}/info"
  else
    wait_for_http "http://localhost:${api_port}/info" 60 "run-api"
  fi

  start_process "vllm_lora_only" "$mode_dir/vllm.log" \
    env CUDA_VISIBLE_DEVICES="$LORA_ONLY_VLLM_GPU" \
    "$PYTHON_BIN" -m example_trainer.vllm_api_server \
    --model "$MODEL_NAME" \
    --port "$vllm_port" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --max-model-len "$MAX_MODEL_LEN" \
    --dtype "$DTYPE" \
    --enable-lora \
    --enforce-eager
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] wait for http://localhost:${vllm_port}/health"
  else
    wait_for_http "http://localhost:${vllm_port}/health" 300 "lora_only vLLM"
  fi

  start_gsm8k_env "$vllm_port" "$mode_dir/env.log"

  log "Starting trainer: lora_only"
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] trainer command (lora_only):"
    printf ' '
    printf '%q ' env CUDA_VISIBLE_DEVICES="$LORA_ONLY_TRAINER_GPU" "$PYTHON_BIN" -m example_trainer.grpo \
      $(common_trainer_flags) \
      --weight-bridge-mode lora_only \
      --device cuda:0 \
      --save-path "$save_dir" \
      --vllm-port "$vllm_port" \
      --atropos-url "http://localhost:${api_port}" \
      --lora-r "$LORA_R" \
      --lora-alpha "$LORA_ALPHA" \
      --lora-dropout "$LORA_DROPOUT" \
      --lora-target-modules $LORA_TARGET_MODULES \
      $(add_lora_layer_flag)
    printf '\n'
    log "[DRY RUN] trainer log path: $mode_dir/trainer.log"
  else
    env CUDA_VISIBLE_DEVICES="$LORA_ONLY_TRAINER_GPU" "$PYTHON_BIN" -m example_trainer.grpo \
      $(common_trainer_flags) \
      --weight-bridge-mode lora_only \
      --device cuda:0 \
      --save-path "$save_dir" \
      --vllm-port "$vllm_port" \
      --atropos-url "http://localhost:${api_port}" \
      --lora-r "$LORA_R" \
      --lora-alpha "$LORA_ALPHA" \
      --lora-dropout "$LORA_DROPOUT" \
      --lora-target-modules $LORA_TARGET_MODULES \
      $(add_lora_layer_flag) | tee "$mode_dir/trainer.log"
  fi

  cleanup_run
}

run_lora_restart() {
  log "========== RUN: lora_restart =========="
  local api_port="$LORA_RESTART_API_PORT"
  local vllm_port="$LORA_RESTART_VLLM_PORT"
  local mode_dir="${OUTPUT_BASE_DIR}/logs/gsm8k_lora_restart"
  local save_dir="${OUTPUT_BASE_DIR}/saves/gsm8k_lora_restart"
  mkdir -p "$mode_dir"
  mkdir -p "$save_dir"

  run_ports+=("$api_port" "$vllm_port")
  kill_port "$api_port"
  kill_port "$vllm_port"

  start_process "run_api" "$mode_dir/run_api.log" run-api --port "$api_port"
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] wait for http://localhost:${api_port}/info"
  else
    wait_for_http "http://localhost:${api_port}/info" 60 "run-api"
  fi

  log "Starting trainer: lora_restart (it launches its own vLLM)"
  if [[ "$DRY_RUN" == "1" ]]; then
    log "[DRY RUN] trainer command (lora_restart):"
    printf ' '
    printf '%q ' env CUDA_VISIBLE_DEVICES="$LORA_RESTART_TRAINER_GPU" "$PYTHON_BIN" -m example_trainer.grpo \
      $(common_trainer_flags) \
      --weight-bridge-mode lora_restart \
      --device cuda:0 \
      --save-path "$save_dir" \
      --vllm-port "$vllm_port" \
      --vllm-gpu "$LORA_RESTART_VLLM_GPU" \
      --vllm-restart-interval 3 \
      --atropos-url "http://localhost:${api_port}" \
      --lora-r "$LORA_R" \
      --lora-alpha "$LORA_ALPHA" \
      --lora-dropout "$LORA_DROPOUT" \
      --lora-target-modules $LORA_TARGET_MODULES \
      $(add_lora_layer_flag)
    printf '\n'
    log "[DRY RUN] then wait for http://localhost:${vllm_port}/health"
    log "[DRY RUN] then start GSM8K env pointed at http://localhost:${vllm_port}/v1"
    log "[DRY RUN] trainer log path: $mode_dir/trainer.log"
  else
    env CUDA_VISIBLE_DEVICES="$LORA_RESTART_TRAINER_GPU" "$PYTHON_BIN" -m example_trainer.grpo \
      $(common_trainer_flags) \
      --weight-bridge-mode lora_restart \
      --device cuda:0 \
      --save-path "$save_dir" \
      --vllm-port "$vllm_port" \
      --vllm-gpu "$LORA_RESTART_VLLM_GPU" \
      --vllm-restart-interval 3 \
      --atropos-url "http://localhost:${api_port}" \
      --lora-r "$LORA_R" \
      --lora-alpha "$LORA_ALPHA" \
      --lora-dropout "$LORA_DROPOUT" \
      --lora-target-modules $LORA_TARGET_MODULES \
      $(add_lora_layer_flag) >"$mode_dir/trainer.log" 2>&1 &
    trainer_pid=$!
    run_pids+=("$trainer_pid")

    wait_for_http "http://localhost:${vllm_port}/health" 420 "lora_restart vLLM"
    start_gsm8k_env "$vllm_port" "$mode_dir/env.log"

    wait "$trainer_pid"
    cat "$mode_dir/trainer.log"
  fi

  cleanup_run
}

trap cleanup_run EXIT INT TERM
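# cleanup_run empties run_pids/run_ports after each mode, so the EXIT trap is
# effectively a no-op once the final run has already been cleaned up.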

log "Model: $MODEL_NAME"
log "W&B project/group: $WANDB_PROJECT / $WANDB_GROUP"
log "Dry run mode: $DRY_RUN"
log "Output base directory (logs + saves): $OUTPUT_BASE_DIR"
log "Port plan:"
log "  shared_vllm:  run-api=${SHARED_API_PORT}, vllm=${SHARED_VLLM_PORT}"
log "  lora_only:    run-api=${LORA_ONLY_API_PORT}, vllm=${LORA_ONLY_VLLM_PORT}"
log "  lora_restart: run-api=${LORA_RESTART_API_PORT}, vllm=${LORA_RESTART_VLLM_PORT}"
log "GPU plan:"
log "  shared_vllm:  trainer+vllm on GPU ${SHARED_GPU} (required for shared weights)"
log "  lora_only:    trainer GPU ${LORA_ONLY_TRAINER_GPU}, vllm GPU ${LORA_ONLY_VLLM_GPU}"
log "  lora_restart: trainer GPU ${LORA_RESTART_TRAINER_GPU}, vllm GPU ${LORA_RESTART_VLLM_GPU}"
if [[ -n "$LORA_LAYER_INDICES" ]]; then
  log "LoRA layer indices: $LORA_LAYER_INDICES"
else
  log "LoRA layer indices: all matching layers"
fi

run_shared_vllm
run_lora_only
run_lora_restart

log "All GSM8K mode runs completed."