restart issues 2

This commit is contained in:
Jai Suphavadeeprasit 2026-02-12 17:32:10 -05:00
parent 917193d2ea
commit c53febd0a8

View file

@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Minimal test for vLLM restart cycle - no training, just launch/terminate/relaunch.
Tests whether GPU memory is properly released between restarts.
"""
import os
import sys
import time
import argparse
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="Qwen/Qwen3-4B-Instruct-2507")
parser.add_argument("--port", type=int, default=9099)
parser.add_argument("--gpu", type=int, default=0)
parser.add_argument("--memory-util", type=float, default=0.3)
parser.add_argument("--restarts", type=int, default=3, help="Number of restart cycles to test")
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
import torch
from trainers import _launch_vllm_with_lora, _terminate_vllm
from config import TrainingConfig
print("=" * 60)
print("vLLM RESTART CYCLE TEST")
print("=" * 60)
print(f"Model: {args.model}")
print(f"Port: {args.port}")
print(f"GPU: {args.gpu}")
print(f"Memory utilization: {args.memory_util}")
print(f"Restart cycles: {args.restarts}")
print("=" * 60)
# Check initial GPU memory
if torch.cuda.is_available():
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"\nInitial GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free")
# Create a minimal config
config = TrainingConfig(
model_name=args.model,
vllm_port=args.port,
vllm_gpu_memory_utilization=args.memory_util,
max_model_len=4096, # Small for quick test
lora_r=16,
lora_alpha=32,
weight_bridge_mode="lora_restart",
save_path="/tmp/vllm_restart_test",
)
# Create dummy adapter directory
os.makedirs(config.save_path, exist_ok=True)
adapter_path = os.path.join(config.save_path, "dummy_adapter")
# We need to create a real adapter for vLLM to load
# Let's skip the adapter for this test and just test launch/terminate
print("\n" + "=" * 60)
print("Testing vLLM launch/terminate cycle (no adapter)")
print("=" * 60)
from vllm_manager import kill_process_on_port, wait_for_vllm_ready
import subprocess
script_dir = os.path.dirname(os.path.abspath(__file__))
server_script = os.path.join(os.path.dirname(script_dir), "vllm_api_server.py")
for cycle in range(args.restarts):
print(f"\n{'='*60}")
print(f"CYCLE {cycle + 1}/{args.restarts}")
print(f"{'='*60}")
# Check memory before launch
if torch.cuda.is_available():
torch.cuda.empty_cache()
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"[Before launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
# Launch vLLM (without LoRA for simplicity)
print(f"\n[{cycle+1}] Launching vLLM...")
cmd = [
"python", server_script,
"--model", args.model,
"--port", str(args.port),
"--gpu-memory-utilization", str(args.memory_util),
"--max-model-len", "4096",
]
print(f" Command: {' '.join(cmd)}")
log_file = f"/tmp/vllm_restart_test/vllm_cycle_{cycle}.log"
with open(log_file, "w") as f:
proc = subprocess.Popen(
cmd,
stdout=f,
stderr=subprocess.STDOUT,
env=os.environ.copy(),
)
print(f" PID: {proc.pid}")
print(f" Log: {log_file}")
# Wait for vLLM to be ready
print(f" Waiting for vLLM to be ready...")
start_time = time.time()
if wait_for_vllm_ready(args.port, timeout=300):
elapsed = time.time() - start_time
print(f" ✓ vLLM ready in {elapsed:.1f}s")
else:
print(f" ✗ vLLM failed to start!")
print(f" Check log: {log_file}")
with open(log_file, "r") as f:
print(f" Last 20 lines:\n{''.join(f.readlines()[-20:])}")
proc.kill()
return 1
# Check memory after launch
if torch.cuda.is_available():
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"[After launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
# Keep vLLM running for a bit
print(f"\n Letting vLLM run for 5s...")
time.sleep(5)
# Terminate vLLM
print(f"\n[{cycle+1}] Terminating vLLM...")
_terminate_vllm(proc, args.port)
# Check memory after terminate
if torch.cuda.is_available():
torch.cuda.empty_cache()
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"[After terminate] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
print("\n" + "=" * 60)
print("TEST COMPLETE!")
print("=" * 60)
if torch.cuda.is_available():
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
return 0
if __name__ == "__main__":
sys.exit(main())