diff --git a/example_trainer/scripts/test_vllm_restart_only.py b/example_trainer/scripts/test_vllm_restart_only.py
new file mode 100755
index 00000000..fb55412b
--- /dev/null
+++ b/example_trainer/scripts/test_vllm_restart_only.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Minimal test for the vLLM restart cycle - no training, just launch/terminate/relaunch.
+Tests whether GPU memory is properly released between restarts.
+"""
+import os
+import sys
+import time
+import argparse
+
+# Add the parent directory to the path so sibling modules can be imported
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen3-4B-Instruct-2507")
+    parser.add_argument("--port", type=int, default=9099)
+    parser.add_argument("--gpu", type=int, default=0)
+    parser.add_argument("--memory-util", type=float, default=0.3)
+    parser.add_argument("--restarts", type=int, default=3, help="Number of restart cycles to test")
+    args = parser.parse_args()
+
+    # Must be set before torch is imported so CUDA only sees the chosen GPU
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
+
+    import torch
+    from trainers import _terminate_vllm
+    from config import TrainingConfig
+
+    print("=" * 60)
+    print("vLLM RESTART CYCLE TEST")
+    print("=" * 60)
+    print(f"Model: {args.model}")
+    print(f"Port: {args.port}")
+    print(f"GPU: {args.gpu}")
+    print(f"Memory utilization: {args.memory_util}")
+    print(f"Restart cycles: {args.restarts}")
+    print("=" * 60)
+
+    # Check initial GPU memory; mem_get_info() returns (free, total) in bytes
+    if torch.cuda.is_available():
+        free_mem = torch.cuda.mem_get_info()[0] / 1e9
+        total_mem = torch.cuda.mem_get_info()[1] / 1e9
+        print(f"\nInitial GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free")
+
+    # Create a minimal config
+    config = TrainingConfig(
+        model_name=args.model,
+        vllm_port=args.port,
+        vllm_gpu_memory_utilization=args.memory_util,
+        max_model_len=4096,  # Small for a quick test
+        lora_r=16,
+        lora_alpha=32,
+        weight_bridge_mode="lora_restart",
+        save_path="/tmp/vllm_restart_test",
+    )
+
+    # Ensure the output directory for the server logs exists
+    os.makedirs(config.save_path, exist_ok=True)
+
+    # vLLM would need a real adapter on disk to load, so skip the adapter
+    # entirely and test only the launch/terminate cycle.
+    print("\n" + "=" * 60)
+    print("Testing vLLM launch/terminate cycle (no adapter)")
+    print("=" * 60)
+
+    from vllm_manager import wait_for_vllm_ready
+    import subprocess
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    server_script = os.path.join(os.path.dirname(script_dir), "vllm_api_server.py")
+
+    for cycle in range(args.restarts):
+        print(f"\n{'=' * 60}")
+        print(f"CYCLE {cycle + 1}/{args.restarts}")
+        print(f"{'=' * 60}")
+
+        # Check memory before launch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            free_mem = torch.cuda.mem_get_info()[0] / 1e9
+            total_mem = torch.cuda.mem_get_info()[1] / 1e9
+            print(f"[Before launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100 * free_mem / total_mem:.0f}%)")
+
+        # Launch vLLM (without LoRA for simplicity)
+        print(f"\n[{cycle + 1}] Launching vLLM...")
+        cmd = [
+            "python", server_script,
+            "--model", args.model,
+            "--port", str(args.port),
+            "--gpu-memory-utilization", str(args.memory_util),
+            "--max-model-len", "4096",
+        ]
+        print(f"  Command: {' '.join(cmd)}")
+
+        log_file = os.path.join(config.save_path, f"vllm_cycle_{cycle}.log")
+        with open(log_file, "w") as f:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                env=os.environ.copy(),
+            )
+        print(f"  PID: {proc.pid}")
+        print(f"  Log: {log_file}")
+
+        # Wait for vLLM to be ready
+        print("  Waiting for vLLM to be ready...")
+        start_time = time.time()
+        if wait_for_vllm_ready(args.port, timeout=300):
+            elapsed = time.time() - start_time
+            print(f"  ✓ vLLM ready in {elapsed:.1f}s")
+        else:
+            print("  ✗ vLLM failed to start!")
+            print(f"  Check log: {log_file}")
+            with open(log_file, "r") as f:
+                print(f"  Last 20 lines:\n{''.join(f.readlines()[-20:])}")
+            proc.kill()
+            return 1
+
+        # Check memory after launch
+        if torch.cuda.is_available():
+            free_mem = torch.cuda.mem_get_info()[0] / 1e9
+            total_mem = torch.cuda.mem_get_info()[1] / 1e9
+            print(f"[After launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100 * free_mem / total_mem:.0f}%)")
+
+        # Keep vLLM running for a bit
+        print("\n  Letting vLLM run for 5s...")
+        time.sleep(5)
+
+        # Terminate vLLM
+        print(f"\n[{cycle + 1}] Terminating vLLM...")
+        _terminate_vllm(proc, args.port)
+
+        # Check memory after terminate
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            free_mem = torch.cuda.mem_get_info()[0] / 1e9
+            total_mem = torch.cuda.mem_get_info()[1] / 1e9
+            print(f"[After terminate] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100 * free_mem / total_mem:.0f}%)")
+
+    print("\n" + "=" * 60)
+    print("TEST COMPLETE!")
+    print("=" * 60)
+
+    if torch.cuda.is_available():
+        free_mem = torch.cuda.mem_get_info()[0] / 1e9
+        total_mem = torch.cuda.mem_get_info()[1] / 1e9
+        print(f"Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100 * free_mem / total_mem:.0f}%)")
+
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
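For reference, a minimal sketch of the readiness poll that wait_for_vllm_ready is expected to perform. This is an assumption, not the repo's actual implementation: the function name, the /health endpoint path, and the boolean return convention are all inferred from how the script calls the helper.

    import time
    import urllib.error
    import urllib.request

    def wait_for_vllm_ready_sketch(port: int, timeout: float = 300.0) -> bool:
        """Poll the server's HTTP endpoint until it answers or the timeout elapses."""
        deadline = time.time() + timeout
        url = f"http://localhost:{port}/health"  # assumed health endpoint
        while time.time() < deadline:
            try:
                with urllib.request.urlopen(url, timeout=5) as resp:
                    if resp.status == 200:
                        return True
            except (urllib.error.URLError, OSError):
                pass  # server not accepting connections yet; keep polling
            time.sleep(2)
        return False

A poll like this is why the script can trust the [After launch] memory reading: by the time the helper returns True, the server process has finished allocating its KV cache, so the free-memory delta reflects vLLM's full footprint.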