error handling

This commit is contained in:
Jai Suphavadeeprasit 2025-12-29 20:23:58 -05:00
parent 9e53076a82
commit 80f67f979a
4 changed files with 38 additions and 1 deletion

View file

@@ -336,6 +336,14 @@ class VLLMWeightBridge:
group_name="weight_update_group",
)
print("[Bridge] ✓ NCCL group created")
# Barrier synchronization to ensure both sides are ready
print("[Bridge] Waiting for all ranks to be ready...")
try:
dist.barrier(group=self.gloo_group)
print("[Bridge] ✓ All ranks synchronized and ready")
except Exception as e:
print(f"[Bridge] Warning: Barrier sync failed: {e}")
def _initialize_http_mode(self) -> None:
"""Initialize HTTP-based weight synchronization (fallback)."""