Skip to content
Closed
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions nemo_deploy/llm/megatronllm_deployable_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import logging
import os
import random
import socket
import time
from typing import Any, Dict, Optional

Expand Down Expand Up @@ -66,6 +67,12 @@ def __init__(
**model_config_kwargs,
):
# Use replica-specific environment variables to avoid conflicts
master_addr = "127.0.0.1"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do the master address and port need to be hard-coded here?

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0)) # Bind to port 0 lets OS pick a free port
new_port = s.getsockname()[1]
self.master_port = str(new_port)

os.environ["MASTER_PORT"] = master_port
# All ranks must use the SAME MASTER_ADDR (rank 0 node IP)
os.environ["MASTER_ADDR"] = master_addr if master_addr else ray._private.services.get_node_ip_address()
Expand All @@ -82,6 +89,11 @@ def __init__(
LOGGER.info(f"Replica {replica_id} - MASTER_PORT: {os.environ['MASTER_PORT']}")
LOGGER.info(f"Replica {replica_id} - MASTER_ADDR: {os.environ['MASTER_ADDR']}")

if rank != 0:
sleep_time = 5 + rank # Rank 1 waits 6s, Rank 2 waits 7s, etc.
LOGGER.info(f"Worker {rank}: Sleeping {sleep_time}s to avoid JIT lock contention...")
time.sleep(sleep_time)

try:
self.model = MegatronLLMDeployableNemo2(
nemo_checkpoint_filepath=nemo_checkpoint_filepath,
Expand Down Expand Up @@ -192,9 +204,8 @@ def __init__(
deployment_node_id = node.get("NodeID")
break

rank_0_worker = ModelWorker.options(
scheduling_strategy=NodeAffinitySchedulingStrategy(node_id=deployment_node_id, soft=False)
).remote(
# Common arguments for rank 0 worker
rank_0_kwargs = dict(
nemo_checkpoint_filepath=nemo_checkpoint_filepath,
rank=0,
world_size=num_gpus,
Expand All @@ -216,6 +227,14 @@ def __init__(
micro_batch_size=micro_batch_size,
**model_config_kwargs,
)

# Use node affinity if we found a matching node, otherwise use default scheduling
if deployment_node_id is not None:
rank_0_worker = ModelWorker.options(
scheduling_strategy=NodeAffinitySchedulingStrategy(node_id=deployment_node_id, soft=True)
).remote(**rank_0_kwargs)
else:
rank_0_worker = ModelWorker.remote(**rank_0_kwargs)
worker_futures.append(rank_0_worker)

# Wait for rank 0 to start before creating other workers
Expand Down