testing lora

2026-04-19 12:57:58 +00:00 · 2026-02-12 08:40:35 -05:00 · 2026-02-12 08:40:35 -05:00 · d7e661117d
commit d7e661117d
parent 71f7cc5b27
1 changed files with 327 additions and 0 deletions
--- a/example_trainer/scripts/benchmark_lora_vs_shared.py
+++ b/example_trainer/scripts/benchmark_lora_vs_shared.py
@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+"""
+Benchmark LoRA vs Shared vLLM inference performance.
+
+This script:
+1. Starts two vLLM instances (one with LoRA, one without)
+2. Optionally loads a LoRA adapter
+3. Sends identical prompts to both
+4. Measures and compares TPS (tokens per second)
+
+Usage:
+    python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507
+    python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507 --lora-path ./checkpoints/final_adapter
+"""
+
+import argparse
+import json
+import os
+import signal
+import subprocess
+import sys
+import time
+from typing import Optional
+
+import requests
+
+# Complex math prompt that requires extended reasoning
+BENCHMARK_PROMPT = """You are a mathematics expert. Solve this problem step by step, showing all your work:
+
+A rectangular garden has a perimeter of 56 meters. The length is 4 meters more than twice the width. 
+
+1) Set up the equations
+2) Solve for width and length
+3) Calculate the area
+4) If we want to put a circular fountain in the center with radius equal to 1/4 of the width, what area remains for planting?
+5) Express the planting area as a percentage of the total garden area
+
+Show all calculations clearly and verify your answer."""
+
+# Longer prompt for extended generation
+LONG_PROMPT = """Write a detailed technical explanation of how transformer neural networks work, covering:
+
+1. The attention mechanism - explain self-attention, multi-head attention, and how queries, keys, and values work
+2. The encoder-decoder architecture vs decoder-only models
+3. Positional encoding - why it's needed and different approaches
+4. Layer normalization and residual connections
+5. The feed-forward network component
+6. How training works with cross-entropy loss and backpropagation through attention
+
+Include mathematical formulas where appropriate and explain the intuition behind each component. This should be comprehensive enough for someone with basic ML knowledge to understand transformers deeply."""
+
+
+def wait_for_server(port: int, timeout: int = 300) -> bool:
+    """Wait for vLLM server to be ready."""
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            resp = requests.get(f"http://localhost:{port}/health", timeout=5)
+            if resp.status_code == 200:
+                return True
+        except Exception:
+            pass
+        time.sleep(2)
+    return False
+
+
+def start_vllm_server(
+    model: str,
+    port: int,
+    gpu_id: int,
+    enable_lora: bool = False,
+    max_lora_rank: int = 32,
+    log_file: str = "vllm.log",
+) -> subprocess.Popen:
+    """Start a vLLM server."""
+    cmd = [
+        "python", "example_trainer/vllm_api_server.py",
+        "--model", model,
+        "--port", str(port),
+        "--gpu-memory-utilization", "0.45",
+        "--max-model-len", "8192",
+        "--dtype", "bfloat16",
+    ]
+    
+    if enable_lora:
+        cmd.extend([
+            "--enable-lora",
+            "--max-lora-rank", str(max_lora_rank),
+            "--enforce-eager",  # Required for LoRA
+        ])
+    
+    env = os.environ.copy()
+    env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+    
+    log_f = open(log_file, "w")
+    proc = subprocess.Popen(
+        cmd,
+        env=env,
+        stdout=log_f,
+        stderr=subprocess.STDOUT,
+    )
+    return proc
+
+
+def load_lora_adapter(port: int, adapter_path: str) -> bool:
+    """Load a LoRA adapter into vLLM."""
+    try:
+        resp = requests.post(
+            f"http://localhost:{port}/lora/load",
+            json={"adapter_path": adapter_path, "adapter_name": "benchmark_adapter"},
+            timeout=30,
+        )
+        return resp.status_code == 200
+    except Exception as e:
+        print(f"Failed to load LoRA adapter: {e}")
+        return False
+
+
+def benchmark_inference(
+    port: int,
+    prompt: str,
+    max_tokens: int = 2048,
+    num_runs: int = 3,
+) -> dict:
+    """Benchmark inference on a vLLM server."""
+    results = {
+        "times": [],
+        "tokens": [],
+        "tps": [],
+    }
+    
+    for i in range(num_runs):
+        start = time.time()
+        try:
+            resp = requests.post(
+                f"http://localhost:{port}/generate",
+                json={
+                    "prompt": prompt,
+                    "max_tokens": max_tokens,
+                    "temperature": 0.7,
+                },
+                timeout=300,
+            )
+            elapsed = time.time() - start
+            
+            if resp.status_code == 200:
+                data = resp.json()
+                output_text = data.get("text", [""])[0]
+                # Rough token count (words * 1.3)
+                output_tokens = len(output_text.split()) * 1.3
+                
+                results["times"].append(elapsed)
+                results["tokens"].append(output_tokens)
+                results["tps"].append(output_tokens / elapsed if elapsed > 0 else 0)
+                
+                print(f"  Run {i+1}: {elapsed:.2f}s, ~{output_tokens:.0f} tokens, {output_tokens/elapsed:.1f} TPS")
+            else:
+                print(f"  Run {i+1}: FAILED ({resp.status_code})")
+        except Exception as e:
+            print(f"  Run {i+1}: ERROR - {e}")
+    
+    if results["times"]:
+        results["avg_time"] = sum(results["times"]) / len(results["times"])
+        results["avg_tokens"] = sum(results["tokens"]) / len(results["tokens"])
+        results["avg_tps"] = sum(results["tps"]) / len(results["tps"])
+    
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark LoRA vs Shared vLLM inference")
+    parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Instruct-2507",
+                        help="Model to benchmark")
+    parser.add_argument("--lora-path", type=str, default=None,
+                        help="Path to LoRA adapter (optional)")
+    parser.add_argument("--max-tokens", type=int, default=2048,
+                        help="Max tokens to generate")
+    parser.add_argument("--num-runs", type=int, default=3,
+                        help="Number of benchmark runs per server")
+    parser.add_argument("--lora-gpu", type=int, default=0,
+                        help="GPU for LoRA server")
+    parser.add_argument("--shared-gpu", type=int, default=1,
+                        help="GPU for shared/base server")
+    parser.add_argument("--lora-port", type=int, default=9001,
+                        help="Port for LoRA server")
+    parser.add_argument("--shared-port", type=int, default=9002,
+                        help="Port for shared/base server")
+    parser.add_argument("--prompt", type=str, choices=["math", "long"], default="long",
+                        help="Which prompt to use")
+    parser.add_argument("--skip-lora", action="store_true",
+                        help="Skip LoRA server (test base only)")
+    parser.add_argument("--skip-shared", action="store_true",
+                        help="Skip shared/base server (test LoRA only)")
+    args = parser.parse_args()
+    
+    prompt = LONG_PROMPT if args.prompt == "long" else BENCHMARK_PROMPT
+    
+    procs = []
+    
+    def cleanup():
+        print("\nCleaning up...")
+        for p in procs:
+            try:
+                p.terminate()
+                p.wait(timeout=5)
+            except Exception:
+                p.kill()
+    
+    signal.signal(signal.SIGINT, lambda s, f: (cleanup(), sys.exit(0)))
+    signal.signal(signal.SIGTERM, lambda s, f: (cleanup(), sys.exit(0)))
+    
+    try:
+        print("=" * 70)
+        print("vLLM Inference Benchmark: LoRA vs Base Model")
+        print("=" * 70)
+        print(f"Model: {args.model}")
+        print(f"LoRA adapter: {args.lora_path or 'None (base model only)'}")
+        print(f"Max tokens: {args.max_tokens}")
+        print(f"Num runs: {args.num_runs}")
+        print(f"Prompt type: {args.prompt}")
+        print("=" * 70)
+        
+        # Start LoRA server
+        if not args.skip_lora:
+            print(f"\n[1/4] Starting LoRA-enabled vLLM on GPU {args.lora_gpu}, port {args.lora_port}...")
+            print("      Flags: --enable-lora --enforce-eager (no CUDA graphs)")
+            lora_proc = start_vllm_server(
+                args.model, args.lora_port, args.lora_gpu,
+                enable_lora=True, log_file="benchmark_lora.log"
+            )
+            procs.append(lora_proc)
+        
+        # Start base/shared server
+        if not args.skip_shared:
+            print(f"\n[2/4] Starting base vLLM on GPU {args.shared_gpu}, port {args.shared_port}...")
+            print("      Flags: (none) - uses CUDA graphs for faster inference")
+            shared_proc = start_vllm_server(
+                args.model, args.shared_port, args.shared_gpu,
+                enable_lora=False, log_file="benchmark_shared.log"
+            )
+            procs.append(shared_proc)
+        
+        # Wait for servers
+        print("\n[3/4] Waiting for servers to be ready...")
+        
+        lora_ready = False
+        shared_ready = False
+        
+        if not args.skip_lora:
+            print(f"  Waiting for LoRA server (port {args.lora_port})...")
+            lora_ready = wait_for_server(args.lora_port, timeout=300)
+            if lora_ready:
+                print(f"  ✓ LoRA server ready")
+                
+                # Load LoRA adapter if provided
+                if args.lora_path:
+                    print(f"  Loading LoRA adapter from {args.lora_path}...")
+                    if load_lora_adapter(args.lora_port, args.lora_path):
+                        print(f"  ✓ LoRA adapter loaded")
+                    else:
+                        print(f"  ✗ Failed to load LoRA adapter")
+            else:
+                print(f"  ✗ LoRA server failed to start")
+        
+        if not args.skip_shared:
+            print(f"  Waiting for base server (port {args.shared_port})...")
+            shared_ready = wait_for_server(args.shared_port, timeout=300)
+            if shared_ready:
+                print(f"  ✓ Base server ready")
+            else:
+                print(f"  ✗ Base server failed to start")
+        
+        # Run benchmarks
+        print("\n[4/4] Running benchmarks...")
+        print("-" * 70)
+        
+        lora_results = None
+        shared_results = None
+        
+        if lora_ready and not args.skip_lora:
+            print(f"\nLoRA Server (--enable-lora --enforce-eager):")
+            lora_results = benchmark_inference(
+                args.lora_port, prompt, args.max_tokens, args.num_runs
+            )
+        
+        if shared_ready and not args.skip_shared:
+            print(f"\nBase Server (CUDA graphs enabled):")
+            shared_results = benchmark_inference(
+                args.shared_port, prompt, args.max_tokens, args.num_runs
+            )
+        
+        # Print comparison
+        print("\n" + "=" * 70)
+        print("RESULTS SUMMARY")
+        print("=" * 70)
+        
+        if lora_results and "avg_tps" in lora_results:
+            print(f"\nLoRA Mode (--enable-lora --enforce-eager):")
+            print(f"  Avg time:   {lora_results['avg_time']:.2f}s")
+            print(f"  Avg tokens: {lora_results['avg_tokens']:.0f}")
+            print(f"  Avg TPS:    {lora_results['avg_tps']:.1f}")
+        
+        if shared_results and "avg_tps" in shared_results:
+            print(f"\nBase Mode (CUDA graphs):")
+            print(f"  Avg time:   {shared_results['avg_time']:.2f}s")
+            print(f"  Avg tokens: {shared_results['avg_tokens']:.0f}")
+            print(f"  Avg TPS:    {shared_results['avg_tps']:.1f}")
+        
+        if lora_results and shared_results and "avg_tps" in lora_results and "avg_tps" in shared_results:
+            speedup = shared_results["avg_tps"] / lora_results["avg_tps"] if lora_results["avg_tps"] > 0 else 0
+            time_diff = lora_results["avg_time"] - shared_results["avg_time"]
+            print(f"\nComparison:")
+            print(f"  Base is {speedup:.2f}x faster in TPS")
+            print(f"  Base saves {time_diff:.2f}s per request")
+            print(f"  --enforce-eager overhead: ~{(1 - 1/speedup) * 100:.1f}%")
+        
+        print("\n" + "=" * 70)
+        print("Note: The main difference is --enforce-eager which disables CUDA graphs.")
+        print("This is REQUIRED for LoRA hot-swapping but costs ~10-30% performance.")
+        print("=" * 70)
+        
+    finally:
+        cleanup()
+
+
+if __name__ == "__main__":
+    main()