From cce3bf49a8e0a92ce7e76e25e2b51ce55571264c Mon Sep 17 00:00:00 2001 From: srinivamd <52507740+srinivamd@users.noreply.github.com> Date: Mon, 1 Jun 2026 01:23:06 -0700 Subject: [PATCH] [ROCm] Fix duplicate hipMemMap call in map_block() causing SIGSEGV (#ROCM-25272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ROCm code path in map_block() calls hipMemMap() twice with identical arguments — a copy-paste bug from the CUDA path which only calls cuMemMap_() once. Mapping over an already-mapped VA range causes undefined behavior in the HIP VMM driver, leading to SIGSEGV in IntraNodeComm::rendezvous() during the first allreduce call. This bug is dormant on upstream pytorch/pytorch because test_intra_node_comm_all_reduce has @skipIfRocm. On ROCm/pytorch release/2.12, the skip was replaced with @runOnRocmArch(MI300_ARCH), exposing the crash on MI300X/gfx942. Fixes: ROCM-25272 --- .../c10d/symm_mem/CUDASymmetricMemoryUtils.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp index 98a272468a84d..072128c7d3113 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp @@ -72,7 +72,7 @@ IpcChannel::~IpcChannel() { } void IpcChannel::send_fd(int dst_pid, int fd) { - // Because file descriptors are process-local kernel objects, and we can’t + // Because file descriptors are process-local kernel objects, and we can't // pass them via normal socket payloads (like write() or send()). Unix domain // sockets provide a mechanism to pass actual FDs via sendmsg()/recvmsg(). 
// Define destination socket address @@ -251,12 +251,6 @@ void map_block( 0, reinterpret_cast<hipMemGenericAllocationHandle_t>(handle), 0ULL)); - C10_CUDA_CHECK(hipMemMap( - *ptr, - size, - 0, - reinterpret_cast<hipMemGenericAllocationHandle_t>(handle), - 0ULL)); hipMemAccessDesc desc; desc.location.type = hipMemLocationTypeDevice;