mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-24 17:04:55 +00:00
group tests
This commit is contained in:
parent
7bffa4bfe7
commit
bbda8bedb2
1 changed files with 47 additions and 14 deletions
|
|
@ -12,23 +12,24 @@ With --auto-env, each model gets its own isolated stack:
|
|||
- trainer
|
||||
|
||||
Usage:
|
||||
# RECOMMENDED: Fully automated parallel test (each model gets isolated stack)
|
||||
# RECOMMENDED: Fully automated parallel test with W&B logging
|
||||
python -m example_trainer.test_multi_model \
|
||||
--models qwen3-4b hermes-8b nemotron-14b devstral-24b \
|
||||
--parallel \
|
||||
--gpus 0 1 2 3 \
|
||||
--auto-env
|
||||
--auto-env \
|
||||
--use-wandb \
|
||||
--wandb-project multi-model-test
|
||||
|
||||
# Sequential test on one GPU
|
||||
python -m example_trainer.test_multi_model \
|
||||
--models qwen3-4b hermes-8b \
|
||||
--sequential \
|
||||
--gpu 0 \
|
||||
--auto-env
|
||||
--auto-env \
|
||||
--use-wandb
|
||||
|
||||
# Manual mode (you must start run-api and gsm8k_server yourself)
|
||||
# First start: run-api --port 8002 &
|
||||
# Then start gsm8k for your model
|
||||
python -m example_trainer.test_multi_model \
|
||||
--models qwen3-4b \
|
||||
--sequential \
|
||||
|
|
@ -36,9 +37,9 @@ Usage:
|
|||
--atropos-url http://localhost:8002
|
||||
|
||||
Port allocation with --auto-env:
|
||||
Model 0: run-api:8002, vLLM:9001
|
||||
Model 1: run-api:8003, vLLM:9002
|
||||
Model 2: run-api:8004, vLLM:9003
|
||||
Model 0: run-api:8002, vLLM:9001, GPU from --gpus[0]
|
||||
Model 1: run-api:8003, vLLM:9002, GPU from --gpus[1]
|
||||
Model 2: run-api:8004, vLLM:9003, GPU from --gpus[2]
|
||||
...
|
||||
"""
|
||||
|
||||
|
|
@ -195,6 +196,8 @@ def run_model_test(
|
|||
training_steps: int,
|
||||
vllm_port_offset: int = 0,
|
||||
auto_env: bool = False,
|
||||
use_wandb: bool = False,
|
||||
wandb_project: str = "multi-model-test",
|
||||
) -> Dict:
|
||||
"""
|
||||
Run a complete training test for a single model.
|
||||
|
|
@ -288,13 +291,15 @@ def run_model_test(
|
|||
# which is required for CUDA IPC with ptrace_scope=1
|
||||
run_script = script_dir / "run.py"
|
||||
|
||||
# Don't use CUDA_VISIBLE_DEVICES - use --device instead
|
||||
# run.py sets CUDA_VISIBLE_DEVICES internally based on --device
|
||||
run_env = os.environ.copy()
|
||||
run_env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
|
||||
run_env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
||||
|
||||
run_cmd = [
|
||||
sys.executable, "-u", str(run_script),
|
||||
"--model", model_config.model_id,
|
||||
"--device", f"cuda:{gpu_id}", # This controls GPU selection
|
||||
"--vllm-port", str(vllm_port),
|
||||
"--gpu-memory-utilization", str(model_config.gpu_memory_utilization),
|
||||
"--max-model-len", str(model_config.max_model_len),
|
||||
|
|
@ -307,6 +312,10 @@ def run_model_test(
|
|||
"--log-dir", str(log_dir),
|
||||
]
|
||||
|
||||
# Add wandb flags if enabled
|
||||
if use_wandb:
|
||||
run_cmd.extend(["--use-wandb", "--wandb-project", wandb_project])
|
||||
|
||||
print(f"[{model_name}] Starting unified trainer (vLLM + GRPO) for {training_steps} steps...")
|
||||
with open(trainer_log, "w") as tlog:
|
||||
trainer_process = subprocess.Popen(
|
||||
|
|
@ -386,6 +395,8 @@ def run_parallel_tests(
|
|||
base_dir: str,
|
||||
training_steps: int,
|
||||
auto_env: bool = False,
|
||||
use_wandb: bool = False,
|
||||
wandb_project: str = "multi-model-test",
|
||||
) -> List[Dict]:
|
||||
"""Run tests for multiple models in parallel."""
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
|
@ -396,7 +407,7 @@ def run_parallel_tests(
|
|||
def run_and_store(model, gpu, port_offset):
|
||||
result = run_model_test(
|
||||
model, gpu, atropos_url, atropos_port, base_dir, timestamp,
|
||||
training_steps, port_offset, auto_env
|
||||
training_steps, port_offset, auto_env, use_wandb, wandb_project
|
||||
)
|
||||
with result_lock:
|
||||
results.append(result)
|
||||
|
|
@ -423,6 +434,8 @@ def run_sequential_tests(
|
|||
base_dir: str,
|
||||
training_steps: int,
|
||||
auto_env: bool = False,
|
||||
use_wandb: bool = False,
|
||||
wandb_project: str = "multi-model-test",
|
||||
) -> List[Dict]:
|
||||
"""Run tests for multiple models sequentially on one GPU."""
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
|
@ -431,7 +444,8 @@ def run_sequential_tests(
|
|||
for i, model in enumerate(models):
|
||||
result = run_model_test(
|
||||
model, gpu_id, atropos_url, atropos_port, base_dir, timestamp,
|
||||
training_steps, port_offset=0, auto_env=auto_env
|
||||
training_steps, port_offset=0, auto_env=auto_env,
|
||||
use_wandb=use_wandb, wandb_project=wandb_project
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
|
|
@ -548,7 +562,18 @@ Available models: """ + ", ".join(TEST_MODELS.keys())
|
|||
parser.add_argument(
|
||||
"--auto-env",
|
||||
action="store_true",
|
||||
help="Automatically start gsm8k environment for each model (requires run-api to be running)",
|
||||
help="Automatically start run-api and gsm8k environment for each model",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-wandb",
|
||||
action="store_true",
|
||||
help="Enable Weights & Biases logging for training runs",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--wandb-project",
|
||||
type=str,
|
||||
default="multi-model-test",
|
||||
help="W&B project name for logging",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
@ -580,19 +605,27 @@ Available models: """ + ", ".join(TEST_MODELS.keys())
|
|||
gpus = gpus * (len(models) // len(gpus) + 1)
|
||||
|
||||
print(f"Using GPUs: {gpus[:len(models)]}")
|
||||
if args.use_wandb:
|
||||
print(f"W&B logging enabled (project: {args.wandb_project})")
|
||||
results = run_parallel_tests(
|
||||
models, gpus[:len(models)],
|
||||
args.atropos_url, args.atropos_port,
|
||||
args.output_dir, args.training_steps,
|
||||
auto_env=args.auto_env
|
||||
auto_env=args.auto_env,
|
||||
use_wandb=args.use_wandb,
|
||||
wandb_project=args.wandb_project,
|
||||
)
|
||||
else:
|
||||
print(f"Using GPU: {args.gpu}")
|
||||
if args.use_wandb:
|
||||
print(f"W&B logging enabled (project: {args.wandb_project})")
|
||||
results = run_sequential_tests(
|
||||
models, args.gpu,
|
||||
args.atropos_url, args.atropos_port,
|
||||
args.output_dir, args.training_steps,
|
||||
auto_env=args.auto_env
|
||||
auto_env=args.auto_env,
|
||||
use_wandb=args.use_wandb,
|
||||
wandb_project=args.wandb_project,
|
||||
)
|
||||
|
||||
# Print summary
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue