From 292642c9e4ea408e57d3783eda48225afe8c2c0a Mon Sep 17 00:00:00 2001 From: henrylhtsang Date: Wed, 25 Feb 2026 15:29:19 -0800 Subject: [PATCH 1/2] Fix CUtensorMap segfault due to alignas(64) vs Python allocator mismatch CUtensorMap_st in cuda.h is declared with alignas(64), but when embedded inline as _pvt_val in a Cython cdef class, the Python object allocator (PyObject_Malloc) only guarantees 8-16 byte alignment. The compiler sees alignas(64) and may generate aligned instructions (e.g. movaps) for zero-initializing the struct, causing SIGSEGV on the unaligned memory. Fix by heap-allocating the CUtensorMap buffer with posix_memalign(64) instead of embedding it inline in the Python object. --- cuda_bindings/cuda/bindings/driver.pxd.in | 3 ++- cuda_bindings/cuda/bindings/driver.pyx.in | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in index ed992b8bd0..ab152a0b6d 100644 --- a/cuda_bindings/cuda/bindings/driver.pxd.in +++ b/cuda_bindings/cuda/bindings/driver.pxd.in @@ -3211,8 +3211,9 @@ cdef class CUtensorMap_st: getPtr() Get memory address of class instance """ - cdef cydriver.CUtensorMap_st _pvt_val + cdef void* _pvt_buf cdef cydriver.CUtensorMap_st* _pvt_ptr + cdef bint _owns_buf {{endif}} {{if 'CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st' in found_struct}} diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 60f510dde2..a9672d386f 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -6,7 +6,9 @@ from typing import Any, Optional import cython import ctypes from libc.stdlib cimport calloc, malloc, free +from libc.string cimport memset from libc cimport string +from posix.stdlib cimport posix_memalign from libc.stdint cimport int32_t, uint32_t, int64_t, uint64_t, uintptr_t from libc.stddef cimport wchar_t from libc.limits cimport CHAR_MIN @@ -18132,14 +18134,25 @@ cdef class CUtensorMap_st: Get memory address of class instance """ def __cinit__(self, void_ptr _ptr = 0): + cdef int rc if _ptr == 0: - self._pvt_ptr = &self._pvt_val - else: + rc = posix_memalign(&self._pvt_buf, 64, sizeof(cydriver.CUtensorMap_st)) + if rc != 0 or self._pvt_buf is NULL: + raise MemoryError("Failed to allocate 64-byte aligned CUtensorMap") + memset(self._pvt_buf, 0, sizeof(cydriver.CUtensorMap_st)) + self._pvt_ptr = self._pvt_buf + self._owns_buf = True + else: + self._pvt_buf = NULL self._pvt_ptr = _ptr + self._owns_buf = False def __init__(self, void_ptr _ptr = 0): pass def __dealloc__(self): - pass + if self._owns_buf and self._pvt_buf is not NULL: + free(self._pvt_buf) + self._pvt_buf = NULL + self._pvt_ptr = NULL def getPtr(self): return self._pvt_ptr def __repr__(self): From a03d844433c2c14dd8dbdf3fb30ead16407e1328 Mon Sep 17 00:00:00 2001 From: henrylhtsang Date: Thu, 26 Feb 2026 15:23:38 -0800 Subject: [PATCH 2/2] Use std::aligned_alloc instead of posix_memalign for C++17 compatibility --- cuda_bindings/cuda/bindings/driver.pyx.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index a9672d386f..795d4be0c0 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -8,7 +8,6 @@ import ctypes from libc.stdlib cimport calloc, malloc, free from libc.string cimport memset from libc cimport string -from posix.stdlib cimport posix_memalign from libc.stdint cimport int32_t, uint32_t, int64_t, uint64_t, uintptr_t from libc.stddef cimport wchar_t from libc.limits cimport CHAR_MIN @@ -18,6 +17,8 @@ from cpython.bytes cimport PyBytes_FromStringAndSize from ._internal._fast_enum import FastEnum as _FastEnum import cuda.bindings.driver from libcpp.map cimport map +cdef extern from "" nogil: + void *aligned_alloc(size_t alignment, size_t size) _driver = globals() include "_lib/utils.pxi" @@ -18134,10 +18135,9 @@ cdef class CUtensorMap_st: Get memory address of class instance """ def __cinit__(self, void_ptr _ptr = 0): - cdef int rc if _ptr == 0: - rc = posix_memalign(&self._pvt_buf, 64, sizeof(cydriver.CUtensorMap_st)) - if rc != 0 or self._pvt_buf is NULL: + self._pvt_buf = aligned_alloc(64, sizeof(cydriver.CUtensorMap_st)) + if self._pvt_buf is NULL: raise MemoryError("Failed to allocate 64-byte aligned CUtensorMap") memset(self._pvt_buf, 0, sizeof(cydriver.CUtensorMap_st)) self._pvt_ptr = self._pvt_buf