# atropos/example_trainer/scripts/test_vllm_restart_only.py

#!/usr/bin/env python3
"""
Minimal test for vLLM restart cycle - no training, just launch/terminate/relaunch.
Tests whether GPU memory is properly released between restarts.
"""
import os
import sys
import time
import argparse

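# Put the directory above this script's directory on sys.path so that main()
# can import `trainers`, `config` and `vllm_manager` as top-level modules.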
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

def _format_gpu_memory(label, free_bytes, total_bytes, show_percent=True):
    """Build one GPU-memory report line from raw byte counts.

    Args:
        label: Text placed in front of "GPU memory:" (e.g. "[Before launch]").
        free_bytes: Free device memory in bytes.
        total_bytes: Total device memory in bytes.
        show_percent: Append the free percentage in parentheses when True.

    Returns:
        The formatted line, with sizes expressed in GB (1e9 bytes).
    """
    free_gb = free_bytes / 1e9
    total_gb = total_bytes / 1e9
    line = f"{label} GPU memory: {free_gb:.1f}/{total_gb:.1f} GB free"
    if show_percent:
        line += f" ({100*free_gb/total_gb:.0f}%)"
    return line


def _kill_and_reap(proc):
    """Force-kill *proc* if it is still running and collect its exit status.

    Waiting after kill() avoids leaving a zombie behind; the wait is bounded
    so a stuck child cannot hang the test script itself.
    """
    import subprocess

    if proc.poll() is not None:
        return  # Already exited and reaped.
    proc.kill()
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        print(f"  Warning: PID {proc.pid} did not exit within 10s of kill()")


def main():
    """Run the vLLM launch/terminate cycle test.

    Parses the command line, then for each requested cycle launches
    ``vllm_api_server.py`` as a subprocess, waits until it reports ready,
    lets it run for five seconds and terminates it via ``_terminate_vllm``.
    Free/total GPU memory is printed before launch, after launch and after
    termination of every cycle (when CUDA is available).

    Returns:
        0 when every cycle completed, 1 when vLLM failed to become ready.

    Raises:
        SystemExit: From argparse on invalid arguments (exit status 2).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="Qwen/Qwen3-4B-Instruct-2507")
    parser.add_argument("--port", type=int, default=9099)
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--memory-util", type=float, default=0.3)
    parser.add_argument("--restarts", type=int, default=3, help="Number of restart cycles to test")
    args = parser.parse_args()

    # Reject unusable values up front, before any heavy import or a
    # multi-minute wait on a server that can never start.
    if args.restarts < 1:
        parser.error("--restarts must be at least 1")
    if not 0.0 < args.memory_util <= 1.0:
        parser.error("--memory-util must be in the range (0, 1]")

    # Set before torch is imported below; the vLLM child receives the same
    # value through env=os.environ.copy().
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    import subprocess

    import torch
    from config import TrainingConfig
    from trainers import _terminate_vllm
    from vllm_manager import wait_for_vllm_ready

    def report_gpu_memory(label, show_percent=True, release_cache=False, prefix=""):
        """Print a GPU memory line; does nothing when CUDA is unavailable."""
        if not torch.cuda.is_available():
            return
        if release_cache:
            torch.cuda.empty_cache()
        # One call, so free and total come from the same snapshot.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        print(prefix + _format_gpu_memory(label, free_bytes, total_bytes, show_percent))

    print("=" * 60)
    print("vLLM RESTART CYCLE TEST")
    print("=" * 60)
    print(f"Model: {args.model}")
    print(f"Port: {args.port}")
    print(f"GPU: {args.gpu}")
    print(f"Memory utilization: {args.memory_util}")
    print(f"Restart cycles: {args.restarts}")
    print("=" * 60)

    report_gpu_memory("Initial", show_percent=False, prefix="\n")

    # Small context length keeps each launch quick; shared by the config and
    # the server command line so the two cannot drift apart.
    max_model_len = 4096

    # Create a minimal config
    config = TrainingConfig(
        model_name=args.model,
        vllm_port=args.port,
        vllm_gpu_memory_utilization=args.memory_util,
        max_model_len=max_model_len,
        lora_r=16,
        lora_alpha=32,
        weight_bridge_mode="lora_restart",
        save_path="/tmp/vllm_restart_test",
    )

    # The per-cycle server logs are written into this directory.
    os.makedirs(config.save_path, exist_ok=True)

    # No LoRA adapter is involved: only plain launch/terminate is exercised.
    print("\n" + "=" * 60)
    print("Testing vLLM launch/terminate cycle (no adapter)")
    print("=" * 60)

    script_dir = os.path.dirname(os.path.abspath(__file__))
    server_script = os.path.join(os.path.dirname(script_dir), "vllm_api_server.py")

    for cycle in range(args.restarts):
        print(f"\n{'='*60}")
        print(f"CYCLE {cycle + 1}/{args.restarts}")
        print(f"{'='*60}")

        report_gpu_memory("[Before launch]", release_cache=True)

        # Launch vLLM (without LoRA for simplicity)
        print(f"\n[{cycle+1}] Launching vLLM...")
        cmd = [
            # Same interpreter (and therefore same environment) as this
            # script, rather than whatever "python" resolves to on PATH.
            sys.executable, server_script,
            "--model", args.model,
            "--port", str(args.port),
            "--gpu-memory-utilization", str(args.memory_util),
            "--max-model-len", str(max_model_len),
        ]
        print(f"  Command: {' '.join(cmd)}")

        log_file = os.path.join(config.save_path, f"vllm_cycle_{cycle}.log")
        # The child keeps its own duplicate of the descriptor, so the
        # parent's handle can be closed as soon as Popen returns.
        with open(log_file, "w") as log_handle:
            proc = subprocess.Popen(
                cmd,
                stdout=log_handle,
                stderr=subprocess.STDOUT,
                env=os.environ.copy(),
            )
        print(f"  PID: {proc.pid}")
        print(f"  Log: {log_file}")

        try:
            # Wait for vLLM to be ready
            print("  Waiting for vLLM to be ready...")
            start_time = time.time()
            if not wait_for_vllm_ready(args.port, timeout=300):
                print("  ✗ vLLM failed to start!")
                print(f"  Check log: {log_file}")
                # errors="replace": a log containing undecodable bytes must
                # not mask the real failure with a UnicodeDecodeError.
                with open(log_file, "r", errors="replace") as log_handle:
                    tail = log_handle.readlines()[-20:]
                print(f"  Last 20 lines:\n{''.join(tail)}")
                _kill_and_reap(proc)
                return 1
            elapsed = time.time() - start_time
            print(f"  ✓ vLLM ready in {elapsed:.1f}s")

            report_gpu_memory("[After launch]")

            # Keep vLLM running for a bit
            print("\n  Letting vLLM run for 5s...")
            time.sleep(5)

            # Terminate vLLM
            print(f"\n[{cycle+1}] Terminating vLLM...")
            _terminate_vllm(proc, args.port)
        except BaseException:
            # Includes KeyboardInterrupt: never leave a vLLM server behind
            # holding GPU memory when the test itself is aborted.
            _kill_and_reap(proc)
            raise

        report_gpu_memory("[After terminate]", release_cache=True)

    print("\n" + "=" * 60)
    print("TEST COMPLETE!")
    print("=" * 60)

    report_gpu_memory("Final")

    return 0

if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())