Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cuda_bindings/cuda/bindings/driver.pxd.in
Original file line number Diff line number Diff line change
Expand Up @@ -3211,8 +3211,9 @@ cdef class CUtensorMap_st:
getPtr()
Get memory address of class instance
"""
cdef cydriver.CUtensorMap_st _pvt_val
cdef void* _pvt_buf
Copy link
Member

@leofang leofang Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where the confusion arises. Given how Cython compiles, this member maps to the actual CUtensorMap from cuda.h, which should carry the alignment information (the headers are build-time dependencies of cuda.bindings and are included by the generated .cpp files). It is therefore unclear to me how this struct member could lose its alignment requirement after Cython emits the C++ code that turns this cdef class into a C struct (plus free functions).

Given what Ralf alluded to above (the required alignment changed from 64 B in CUDA 12 to 128 B in CUDA 13), I suspect there is a nontrivial mix-and-match of CUDA 12 and CUDA 13 components somewhere in your environment.

cdef cydriver.CUtensorMap_st* _pvt_ptr
cdef bint _owns_buf
{{endif}}
{{if 'CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st' in found_struct}}

Expand Down
19 changes: 16 additions & 3 deletions cuda_bindings/cuda/bindings/driver.pyx.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from typing import Any, Optional
import cython
import ctypes
from libc.stdlib cimport calloc, malloc, free
from libc.string cimport memset
from libc cimport string
from libc.stdint cimport int32_t, uint32_t, int64_t, uint64_t, uintptr_t
from libc.stddef cimport wchar_t
Expand All @@ -16,6 +17,8 @@ from cpython.bytes cimport PyBytes_FromStringAndSize
from ._internal._fast_enum import FastEnum as _FastEnum
import cuda.bindings.driver
from libcpp.map cimport map
# Declare the C11/C++17 aligned allocator so it can be called from Cython.
# Memory obtained from aligned_alloc() is released with free().
# C11 specifies that `size` should be an integral multiple of `alignment`.
# NOTE(review): aligned_alloc is not provided by every C runtime (MSVC is
# believed to lack it) -- confirm Windows builds before relying on this.
cdef extern from "<cstdlib>" nogil:
void *aligned_alloc(size_t alignment, size_t size)

_driver = globals()
include "_lib/utils.pxi"
Expand Down Expand Up @@ -18133,13 +18136,23 @@ cdef class CUtensorMap_st:
"""
def __cinit__(self, void_ptr _ptr = 0):
    """Bind this wrapper to a ``CUtensorMap_st`` instance.

    Parameters
    ----------
    _ptr : void_ptr, optional
        Address of an existing ``CUtensorMap_st``. When it is 0 (the
        default) a zero-initialised, suitably aligned struct is allocated
        and owned by this object; otherwise the given address is wrapped
        without taking ownership.

    Raises
    ------
    MemoryError
        If the aligned allocation fails.
    """
    # The struct's required alignment is reported to be 64 bytes with
    # CUDA 12 headers and 128 bytes with CUDA 13. A 128-byte aligned block
    # satisfies both, so the allocation does not depend on which toolkit
    # the bindings were built against.
    cdef size_t alignment = 128
    # aligned_alloc() expects the size to be a multiple of the alignment,
    # so round the struct size up instead of assuming it already is.
    cdef size_t nbytes = ((sizeof(cydriver.CUtensorMap_st) + alignment - 1) // alignment) * alignment
    if _ptr == 0:
        self._pvt_buf = aligned_alloc(alignment, nbytes)
        if self._pvt_buf is NULL:
            raise MemoryError("Failed to allocate 128-byte aligned CUtensorMap")
        # Start from an all-zero struct (including any rounding padding).
        memset(self._pvt_buf, 0, nbytes)
        self._pvt_ptr = <cydriver.CUtensorMap_st *>self._pvt_buf
        self._owns_buf = True
    else:
        # Borrowed storage: never freed by __dealloc__.
        self._pvt_buf = NULL
        self._pvt_ptr = <cydriver.CUtensorMap_st *>_ptr
        self._owns_buf = False
def __init__(self, void_ptr _ptr = 0):
    """Accept the same argument as ``__cinit__``; no further setup is done here."""
def __dealloc__(self):
    """Release the backing buffer if this object allocated it.

    Storage supplied by the caller (``_owns_buf`` false) is left untouched.
    """
    if self._owns_buf:
        if self._pvt_buf is not NULL:
            free(self._pvt_buf)
            # Clear both pointers so nothing refers to the freed block.
            self._pvt_ptr = NULL
            self._pvt_buf = NULL
def getPtr(self):
    """Return the memory address of the wrapped struct as an integer."""
    cdef void_ptr address = <void_ptr>self._pvt_ptr
    return address
def __repr__(self):
Expand Down