restart issues 2

2026-04-25 17:10:42 +00:00 · 2026-02-12 17:32:10 -05:00 · 2026-02-12 17:32:10 -05:00 · c53febd0a8
commit c53febd0a8
parent 917193d2ea
1 changed files with 154 additions and 0 deletions
--- a/example_trainer/scripts/test_vllm_restart_only.py
+++ b/example_trainer/scripts/test_vllm_restart_only.py
@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Minimal test for vLLM restart cycle - no training, just launch/terminate/relaunch.
+Tests whether GPU memory is properly released between restarts.
+"""
+import os
+import sys
+import time
+import argparse
+
+# Prepend the parent of this script's directory to sys.path for main()'s trainers/config/vllm_manager imports.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen3-4B-Instruct-2507")
+    parser.add_argument("--port", type=int, default=9099)
+    parser.add_argument("--gpu", type=int, default=0)
+    parser.add_argument("--memory-util", type=float, default=0.3)
+    parser.add_argument("--restarts", type=int, default=3, help="Number of restart cycles to test")
+    args = parser.parse_args()
+    
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
+    
+    import torch
+    from trainers import _launch_vllm_with_lora, _terminate_vllm
+    from config import TrainingConfig
+    
+    print("=" * 60)
+    print("vLLM RESTART CYCLE TEST")
+    print("=" * 60)
+    print(f"Model: {args.model}")
+    print(f"Port: {args.port}")
+    print(f"GPU: {args.gpu}")
+    print(f"Memory utilization: {args.memory_util}")
+    print(f"Restart cycles: {args.restarts}")
+    print("=" * 60)
+    
+    # Check initial GPU memory
+    if torch.cuda.is_available():
+        free_mem = torch.cuda.mem_get_info()[0] / 1e9
+        total_mem = torch.cuda.mem_get_info()[1] / 1e9
+        print(f"\nInitial GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free")
+    
+    # Create a minimal config
+    config = TrainingConfig(
+        model_name=args.model,
+        vllm_port=args.port,
+        vllm_gpu_memory_utilization=args.memory_util,
+        max_model_len=4096,  # Small for quick test
+        lora_r=16,
+        lora_alpha=32,
+        weight_bridge_mode="lora_restart",
+        save_path="/tmp/vllm_restart_test",
+    )
+    
+    # Create dummy adapter directory
+    os.makedirs(config.save_path, exist_ok=True)
+    adapter_path = os.path.join(config.save_path, "dummy_adapter")
+    
+    # We need to create a real adapter for vLLM to load
+    # Let's skip the adapter for this test and just test launch/terminate
+    print("\n" + "=" * 60)
+    print("Testing vLLM launch/terminate cycle (no adapter)")
+    print("=" * 60)
+    
+    from vllm_manager import kill_process_on_port, wait_for_vllm_ready
+    import subprocess
+    
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    server_script = os.path.join(os.path.dirname(script_dir), "vllm_api_server.py")
+    
+    for cycle in range(args.restarts):
+        print(f"\n{'='*60}")
+        print(f"CYCLE {cycle + 1}/{args.restarts}")
+        print(f"{'='*60}")
+        
+        # Check memory before launch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            free_mem = torch.cuda.mem_get_info()[0] / 1e9
+            total_mem = torch.cuda.mem_get_info()[1] / 1e9
+            print(f"[Before launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
+        
+        # Launch vLLM (without LoRA for simplicity)
+        print(f"\n[{cycle+1}] Launching vLLM...")
+        cmd = [
+            "python", server_script,
+            "--model", args.model,
+            "--port", str(args.port),
+            "--gpu-memory-utilization", str(args.memory_util),
+            "--max-model-len", "4096",
+        ]
+        print(f"  Command: {' '.join(cmd)}")
+        
+        log_file = f"/tmp/vllm_restart_test/vllm_cycle_{cycle}.log"
+        with open(log_file, "w") as f:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                env=os.environ.copy(),
+            )
+        print(f"  PID: {proc.pid}")
+        print(f"  Log: {log_file}")
+        
+        # Wait for vLLM to be ready
+        print(f"  Waiting for vLLM to be ready...")
+        start_time = time.time()
+        if wait_for_vllm_ready(args.port, timeout=300):
+            elapsed = time.time() - start_time
+            print(f"  ✓ vLLM ready in {elapsed:.1f}s")
+        else:
+            print(f"  ✗ vLLM failed to start!")
+            print(f"  Check log: {log_file}")
+            with open(log_file, "r") as f:
+                print(f"  Last 20 lines:\n{''.join(f.readlines()[-20:])}")
+            proc.kill()
+            return 1
+        
+        # Check memory after launch
+        if torch.cuda.is_available():
+            free_mem = torch.cuda.mem_get_info()[0] / 1e9
+            total_mem = torch.cuda.mem_get_info()[1] / 1e9
+            print(f"[After launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
+        
+        # Keep vLLM running for a bit
+        print(f"\n  Letting vLLM run for 5s...")
+        time.sleep(5)
+        
+        # Terminate vLLM
+        print(f"\n[{cycle+1}] Terminating vLLM...")
+        _terminate_vllm(proc, args.port)
+        
+        # Check memory after terminate
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            free_mem = torch.cuda.mem_get_info()[0] / 1e9
+            total_mem = torch.cuda.mem_get_info()[1] / 1e9
+            print(f"[After terminate] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
+    
+    print("\n" + "=" * 60)
+    print("TEST COMPLETE!")
+    print("=" * 60)
+    
+    if torch.cuda.is_available():
+        free_mem = torch.cuda.mem_get_info()[0] / 1e9
+        total_mem = torch.cuda.mem_get_info()[1] / 1e9
+        print(f"Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
+    
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())