testing lora

This commit is contained in:
Jai Suphavadeeprasit 2026-02-12 08:40:35 -05:00
parent 71f7cc5b27
commit d7e661117d

View file

@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
Benchmark LoRA vs Shared vLLM inference performance.
This script:
1. Starts two vLLM instances (one with LoRA, one without)
2. Optionally loads a LoRA adapter
3. Sends identical prompts to both
4. Measures and compares TPS (tokens per second)
Usage:
python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507
python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507 --lora-path ./checkpoints/final_adapter
"""
import argparse
import json
import os
import signal
import subprocess
import sys
import time
from typing import Optional
import requests
# Multi-step math word problem meant to require extended reasoning.
# main() sends this prompt when the script is run with `--prompt math`.
BENCHMARK_PROMPT = """You are a mathematics expert. Solve this problem step by step, showing all your work:
A rectangular garden has a perimeter of 56 meters. The length is 4 meters more than twice the width.
1) Set up the equations
2) Solve for width and length
3) Calculate the area
4) If we want to put a circular fountain in the center with radius equal to 1/4 of the width, what area remains for planting?
5) Express the planting area as a percentage of the total garden area
Show all calculations clearly and verify your answer."""
# Long-form technical writing prompt meant to elicit extended generation.
# This is the default prompt (`--prompt long`).
LONG_PROMPT = """Write a detailed technical explanation of how transformer neural networks work, covering:
1. The attention mechanism - explain self-attention, multi-head attention, and how queries, keys, and values work
2. The encoder-decoder architecture vs decoder-only models
3. Positional encoding - why it's needed and different approaches
4. Layer normalization and residual connections
5. The feed-forward network component
6. How training works with cross-entropy loss and backpropagation through attention
Include mathematical formulas where appropriate and explain the intuition behind each component. This should be comprehensive enough for someone with basic ML knowledge to understand transformers deeply."""
def wait_for_server(port: int, timeout: int = 300) -> bool:
    """Poll a local server's ``/health`` endpoint until it answers HTTP 200.

    Args:
        port: Local TCP port the server is expected to listen on.
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less returns False without issuing any request.

    Returns:
        True as soon as ``/health`` returns status 200, False if the deadline
        passes first.
    """
    poll_interval = 2.0
    # Monotonic clock: a wall-clock adjustment (NTP, DST) must not stretch or
    # cut short the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"http://localhost:{port}/health", timeout=5)
        except requests.RequestException:
            # Connection refused / timed out: the server is not up yet, so
            # keep polling. Anything that is not a transport error propagates.
            pass
        else:
            if resp.status_code == 200:
                return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
    return False
def start_vllm_server(
    model: str,
    port: int,
    gpu_id: int,
    enable_lora: bool = False,
    max_lora_rank: int = 32,
    log_file: str = "vllm.log",
) -> subprocess.Popen:
    """Launch ``example_trainer/vllm_api_server.py`` as a background process.

    The server is started with the interpreter running this script, pinned to
    a single GPU through ``CUDA_VISIBLE_DEVICES``, with stdout and stderr
    redirected to ``log_file`` (truncated on start). The script path is
    relative, so the current working directory must contain
    ``example_trainer/``.

    Args:
        model: Model name or path passed to ``--model``.
        port: Port passed to ``--port``.
        gpu_id: GPU index exported as ``CUDA_VISIBLE_DEVICES`` for the child.
        enable_lora: When True, add ``--enable-lora``, ``--max-lora-rank`` and
            ``--enforce-eager`` to the command line.
        max_lora_rank: Value for ``--max-lora-rank`` (only used with LoRA).
        log_file: Path of the file receiving the server's combined output.

    Returns:
        The ``subprocess.Popen`` handle of the started server. The caller is
        responsible for terminating it.
    """
    cmd = [
        # Use the running interpreter so the child sees the same environment
        # (virtualenv, installed packages) as this script.
        sys.executable or "python",
        "example_trainer/vllm_api_server.py",
        "--model", model,
        "--port", str(port),
        "--gpu-memory-utilization", "0.45",
        "--max-model-len", "8192",
        "--dtype", "bfloat16",
    ]
    if enable_lora:
        cmd.extend([
            "--enable-lora",
            "--max-lora-rank", str(max_lora_rank),
            # Disables CUDA graphs; this benchmark treats it as required for LoRA.
            "--enforce-eager",
        ])
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    # The child receives its own copy of the log descriptor when it is
    # spawned, so the parent's handle can (and should) be closed right away.
    with open(log_file, "w") as log_f:
        return subprocess.Popen(
            cmd,
            env=env,
            stdout=log_f,
            stderr=subprocess.STDOUT,
        )
def load_lora_adapter(
    port: int,
    adapter_path: str,
    adapter_name: str = "benchmark_adapter",
) -> bool:
    """Ask a running server to load a LoRA adapter via ``POST /lora/load``.

    Args:
        port: Local port of the server.
        adapter_path: Adapter location sent as ``adapter_path`` in the JSON body.
        adapter_name: Name sent as ``adapter_name`` in the JSON body.

    Returns:
        True if the server answered HTTP 200, False on any other status or
        when the request itself failed. The reason for a failure is printed.
    """
    try:
        resp = requests.post(
            f"http://localhost:{port}/lora/load",
            json={"adapter_path": adapter_path, "adapter_name": adapter_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Failed to load LoRA adapter: {e}")
        return False
    if resp.status_code != 200:
        # Surface the server's answer; a bare False gives no hint about why
        # the adapter was rejected.
        print(f"Failed to load LoRA adapter: HTTP {resp.status_code}: {resp.text[:200]}")
        return False
    return True
def benchmark_inference(
    port: int,
    prompt: str,
    max_tokens: int = 2048,
    num_runs: int = 3,
) -> dict:
    """Time repeated ``POST /generate`` requests against one local server.

    Each run sends ``prompt`` with ``temperature=0.7`` and measures the
    duration of the whole HTTP round trip. The output length is only
    estimated (whitespace-separated words * 1.3), so the reported TPS is
    approximate.

    Args:
        port: Local port of the server to benchmark.
        prompt: Prompt text sent on every run.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        num_runs: Number of sequential requests to issue.

    Returns:
        A dict with the per-run lists ``times``, ``tokens`` and ``tps``. When
        at least one run succeeded it also contains ``avg_time``,
        ``avg_tokens`` and ``avg_tps``. Failed runs are reported on stdout and
        excluded from the results.
    """
    results = {
        "times": [],
        "tokens": [],
        "tps": [],
    }
    url = f"http://localhost:{port}/generate"
    payload = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.7,
    }
    for i in range(num_runs):
        try:
            # perf_counter is the high-resolution monotonic clock meant for
            # measuring intervals; wall-clock time can jump mid-run.
            start = time.perf_counter()
            resp = requests.post(url, json=payload, timeout=300)
            elapsed = time.perf_counter() - start
            if resp.status_code != 200:
                print(f" Run {i+1}: FAILED ({resp.status_code})")
                continue
            data = resp.json()
            # A missing or empty "text" list counts as an empty completion.
            # NOTE(review): if the server echoes the prompt inside "text", the
            # estimate below includes the prompt's words - confirm against
            # vllm_api_server.py.
            texts = data.get("text") or [""]
            output_text = texts[0]
            # Rough token count (words * 1.3)
            output_tokens = len(output_text.split()) * 1.3
            tps = output_tokens / elapsed if elapsed > 0 else 0
        except Exception as e:
            # Deliberately best-effort: one failed run (transport error,
            # malformed response) is reported and must not abort the others.
            print(f" Run {i+1}: ERROR - {e}")
            continue
        results["times"].append(elapsed)
        results["tokens"].append(output_tokens)
        results["tps"].append(tps)
        # Reuse the guarded tps value so a zero elapsed time cannot raise here.
        print(f" Run {i+1}: {elapsed:.2f}s, ~{output_tokens:.0f} tokens, {tps:.1f} TPS")
    if results["times"]:
        results["avg_time"] = sum(results["times"]) / len(results["times"])
        results["avg_tokens"] = sum(results["tokens"]) / len(results["tokens"])
        results["avg_tps"] = sum(results["tps"]) / len(results["tps"])
    return results
def main():
    """Run the LoRA-vs-base vLLM benchmark end to end.

    Parses the command line, starts the requested servers (a LoRA-enabled one
    and/or a base one), waits for each to become healthy, optionally loads a
    LoRA adapter, benchmarks every server that came up with the same prompt,
    prints a summary, and always terminates the servers it started.
    """
    parser = argparse.ArgumentParser(description="Benchmark LoRA vs Shared vLLM inference")
    parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Instruct-2507",
                        help="Model to benchmark")
    parser.add_argument("--lora-path", type=str, default=None,
                        help="Path to LoRA adapter (optional)")
    parser.add_argument("--max-tokens", type=int, default=2048,
                        help="Max tokens to generate")
    parser.add_argument("--num-runs", type=int, default=3,
                        help="Number of benchmark runs per server")
    parser.add_argument("--lora-gpu", type=int, default=0,
                        help="GPU for LoRA server")
    parser.add_argument("--shared-gpu", type=int, default=1,
                        help="GPU for shared/base server")
    parser.add_argument("--lora-port", type=int, default=9001,
                        help="Port for LoRA server")
    parser.add_argument("--shared-port", type=int, default=9002,
                        help="Port for shared/base server")
    parser.add_argument("--prompt", type=str, choices=["math", "long"], default="long",
                        help="Which prompt to use")
    parser.add_argument("--skip-lora", action="store_true",
                        help="Skip LoRA server (test base only)")
    parser.add_argument("--skip-shared", action="store_true",
                        help="Skip shared/base server (test LoRA only)")
    args = parser.parse_args()
    prompt = LONG_PROMPT if args.prompt == "long" else BENCHMARK_PROMPT
    procs = []

    def cleanup():
        """Terminate every server started by this run and reap it."""
        print("\nCleaning up...")
        for p in procs:
            if p.poll() is not None:
                continue  # already exited
            p.terminate()
            try:
                p.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # Did not stop gracefully: force it and reap the child so no
                # zombie process is left behind.
                p.kill()
                p.wait()

    def handle_signal(signum, frame):
        # Raising SystemExit unwinds through the ``finally`` below, so the
        # servers are cleaned up exactly once.
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)
    try:
        print("=" * 70)
        print("vLLM Inference Benchmark: LoRA vs Base Model")
        print("=" * 70)
        print(f"Model: {args.model}")
        print(f"LoRA adapter: {args.lora_path or 'None (base model only)'}")
        print(f"Max tokens: {args.max_tokens}")
        print(f"Num runs: {args.num_runs}")
        print(f"Prompt type: {args.prompt}")
        print("=" * 70)
        # Start LoRA server
        if not args.skip_lora:
            print(f"\n[1/4] Starting LoRA-enabled vLLM on GPU {args.lora_gpu}, port {args.lora_port}...")
            print(" Flags: --enable-lora --enforce-eager (no CUDA graphs)")
            lora_proc = start_vllm_server(
                args.model, args.lora_port, args.lora_gpu,
                enable_lora=True, log_file="benchmark_lora.log"
            )
            procs.append(lora_proc)
        # Start base/shared server
        if not args.skip_shared:
            print(f"\n[2/4] Starting base vLLM on GPU {args.shared_gpu}, port {args.shared_port}...")
            print(" Flags: (none) - uses CUDA graphs for faster inference")
            shared_proc = start_vllm_server(
                args.model, args.shared_port, args.shared_gpu,
                enable_lora=False, log_file="benchmark_shared.log"
            )
            procs.append(shared_proc)
        # Wait for servers
        print("\n[3/4] Waiting for servers to be ready...")
        lora_ready = False
        shared_ready = False
        if not args.skip_lora:
            print(f" Waiting for LoRA server (port {args.lora_port})...")
            lora_ready = wait_for_server(args.lora_port, timeout=300)
            if lora_ready:
                print(" ✓ LoRA server ready")
                # Load LoRA adapter if provided
                if args.lora_path:
                    print(f" Loading LoRA adapter from {args.lora_path}...")
                    if load_lora_adapter(args.lora_port, args.lora_path):
                        print(" ✓ LoRA adapter loaded")
                    else:
                        print(" ✗ Failed to load LoRA adapter")
            else:
                print(" ✗ LoRA server failed to start")
        if not args.skip_shared:
            print(f" Waiting for base server (port {args.shared_port})...")
            shared_ready = wait_for_server(args.shared_port, timeout=300)
            if shared_ready:
                print(" ✓ Base server ready")
            else:
                print(" ✗ Base server failed to start")
        # Run benchmarks (the *_ready flags can only be True for servers that
        # were actually started, so no extra skip check is needed)
        print("\n[4/4] Running benchmarks...")
        print("-" * 70)
        lora_results = None
        shared_results = None
        if lora_ready:
            print("\nLoRA Server (--enable-lora --enforce-eager):")
            lora_results = benchmark_inference(
                args.lora_port, prompt, args.max_tokens, args.num_runs
            )
        if shared_ready:
            print("\nBase Server (CUDA graphs enabled):")
            shared_results = benchmark_inference(
                args.shared_port, prompt, args.max_tokens, args.num_runs
            )
        _print_summary(lora_results, shared_results)
    finally:
        cleanup()


def _print_summary(lora_results: Optional[dict], shared_results: Optional[dict]) -> None:
    """Print per-server averages and, when both are available, their comparison."""
    lora_ok = bool(lora_results) and "avg_tps" in lora_results
    shared_ok = bool(shared_results) and "avg_tps" in shared_results
    print("\n" + "=" * 70)
    print("RESULTS SUMMARY")
    print("=" * 70)
    if lora_ok:
        print("\nLoRA Mode (--enable-lora --enforce-eager):")
        print(f" Avg time: {lora_results['avg_time']:.2f}s")
        print(f" Avg tokens: {lora_results['avg_tokens']:.0f}")
        print(f" Avg TPS: {lora_results['avg_tps']:.1f}")
    if shared_ok:
        print("\nBase Mode (CUDA graphs):")
        print(f" Avg time: {shared_results['avg_time']:.2f}s")
        print(f" Avg tokens: {shared_results['avg_tokens']:.0f}")
        print(f" Avg TPS: {shared_results['avg_tps']:.1f}")
    if lora_ok and shared_ok:
        lora_tps = lora_results["avg_tps"]
        speedup = shared_results["avg_tps"] / lora_tps if lora_tps > 0 else 0
        time_diff = lora_results["avg_time"] - shared_results["avg_time"]
        print("\nComparison:")
        print(f" Base is {speedup:.2f}x faster in TPS")
        print(f" Base saves {time_diff:.2f}s per request")
        if speedup > 0:
            print(f" --enforce-eager overhead: ~{(1 - 1/speedup) * 100:.1f}%")
        else:
            # Either server measured zero throughput; 1/speedup is undefined.
            print(" --enforce-eager overhead: n/a (zero throughput measured)")
    print("\n" + "=" * 70)
    print("Note: The main difference is --enforce-eager which disables CUDA graphs.")
    print("This is REQUIRED for LoRA hot-swapping but costs ~10-30% performance.")
    print("=" * 70)
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()