NVIDIA · cpcloud · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -58,11 +58,10 @@ def elems_to_bytes(nelems, dt):
 
 
 def main():
-    print("CUDA Clock sample")
+    import pytest
 
     if platform.machine() == "armv7l":
-        print("clock_nvrtc is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("clock_nvrtc is not supported on ARMv7")
 
     timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
     hinput = np.empty(NUM_THREADS * 2, dtype="float32")

diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -90,8 +90,9 @@ def main():
         f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}"
     )
     if deviceProps.major < 2:
-        print("Test requires SM 2.0 or higher for support of Texture Arrays.  Test will exit...")
-        sys.exit()
+        import pytest
+
+        pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.")
 
     # Generate input data for layered texture
     width = 64
@@ -208,12 +209,10 @@ def main():
     checkCudaErrors(cudart.cudaFree(d_data))
     checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
 
-    print("Comparing kernel output to expected data")
     MIN_EPSILON_ERROR = 5.0e-3
     if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR:
-        print("Failed")
-        sys.exit(-1)
-    print("Passed")
+        print("Failed", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
@@ -24,32 +24,27 @@
 
 
 def main():
-    print("Starting...")
+    import pytest
 
     if platform.system() == "Darwin":
-        print("simpleP2P is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("simpleP2P is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on ARMv7")
 
     if platform.machine() == "aarch64":
-        print("simpleP2P is not supported on aarch64 - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on aarch64")
 
     if platform.machine() == "sbsa":
-        print("simpleP2P is not supported on sbsa - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on sbsa")
 
     # Number of GPUs
     print("Checking for multiple GPUs...")
     gpu_n = checkCudaErrors(cudart.cudaGetDeviceCount())
     print(f"CUDA-capable device count: {gpu_n}")
 
     if gpu_n < 2:
-        print("Two or more GPUs with Peer-to-Peer access capability are required")
-        return
+        pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required")
 
     prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)]
     # Check possibility for peer access
@@ -80,9 +75,7 @@ def main():
             break
 
     if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1:
-        print("Two or more GPUs with Peer-to-Peer access capability are required.")
-        print("Peer to Peer access is not available amongst GPUs in the system, waiving test.")
-        return
+        pytest.skip("Peer to Peer access is not available amongst GPUs in the system")
 
     # Use first pair of p2p capable GPUs detected
     gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]]
@@ -239,9 +232,8 @@ def main():
         checkCudaErrors(cudart.cudaSetDevice(i))
 
     if error_count != 0:
-        print("Test failed!")
-        sys.exit(-1)
-    print("Test passed!")
+        print("Test failed!", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
@@ -32,28 +32,26 @@ def main():
     idev = 0
     bPinGenericMemory = False
 
+    import pytest
+
     if platform.system() == "Darwin":
-        print("simpleZeroCopy is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("simpleZeroCopy is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on ARMv7")
 
     if platform.machine() == "aarch64":
-        print("simpleZeroCopy is not supported on aarch64 - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on aarch64")
 
     if platform.machine() == "sbsa":
-        print("simpleZeroCopy is not supported on sbsa - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on sbsa")
 
     if checkCmdLineFlag("help"):
-        print("Usage:  simpleZeroCopy [OPTION]\n")
-        print("Options:")
-        print("  device=[device #]  Specify the device to be used")
-        print("  use_generic_memory (optional) use generic page-aligned for system memory")
-        return
+        print("Usage:  simpleZeroCopy [OPTION]\n", file=sys.stderr)
+        print("Options:", file=sys.stderr)
+        print("  device=[device #]  Specify the device to be used", file=sys.stderr)
+        print("  use_generic_memory (optional) use generic page-aligned for system memory", file=sys.stderr)
+        sys.exit(1)
 
     # Get the device selected by the user or default to 0, and then set it.
     if checkCmdLineFlag("device="):
@@ -78,8 +76,7 @@ def main():
     deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev))
 
     if not deviceProp.canMapHostMemory:
-        print(f"Device {idev} does not support mapping CPU host memory!")
-        return
+        pytest.skip(f"Device {idev} does not support mapping CPU host memory!")
 
     checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost))
 
@@ -177,9 +174,8 @@ def main():
         checkCudaErrors(cudart.cudaFreeHost(c))
 
     if errorNorm / refNorm >= 1.0e-7:
-        print("FAILED")
-        sys.exit(-1)
-    print("PASSED")
+        print("FAILED", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
@@ -165,28 +165,24 @@ def verify(testData, length):
 
 
 def main():
+    import pytest
+
     if os.name == "nt":
-        print("Atomics not supported on Windows")
-        return
+        pytest.skip("Atomics not supported on Windows")
 
     # set device
     dev_id = findCudaDevice()
     device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id))
 
     if not device_prop.managedMemory:
-        # This samples requires being run on a device that supports Unified Memory
-        print("Unified Memory not supported on this device")
-        return
+        pytest.skip("Unified Memory not supported on this device")
 
     computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id))
     if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited:
-        # This sample requires being run with a default or process exclusive mode
-        print("This sample requires a device in either default or process exclusive mode")
-        return
+        pytest.skip("This sample requires a device in either default or process exclusive mode")
 
     if device_prop.major < 6:
-        print("Requires a minimum CUDA compute 6.0 capability, waiving testing.")
-        return
+        pytest.skip("Requires a minimum CUDA compute 6.0 capability")
 
     numThreads = 256
     numBlocks = 64
@@ -240,9 +236,9 @@ def main():
     else:
         checkCudaErrors(cudart.cudaFree(atom_arr))
 
-    print("systemWideAtomics completed, returned {}".format("OK" if testResult else "ERROR!"))
     if not testResult:
-        sys.exit(-1)
+        print("systemWideAtomics completed with errors", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
@@ -31,7 +31,6 @@
 
 
 def main():
-    print("Vector Addition (Driver API)")
     N = 50000
     nbytes = N * np.dtype(np.float32).itemsize
 
@@ -45,8 +44,9 @@ def main():
         cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
     )
     if not uvaSupported:
-        print("Accessing pageable memory directly requires UVA")
-        return
+        import pytest
+
+        pytest.skip("Accessing pageable memory directly requires UVA")
 
     kernelHelper = common.KernelHelper(vectorAddDrv, int(cuDevice))
     _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
@@ -106,9 +106,9 @@ def main():
     checkCudaErrors(cuda.cuMemFree(d_C))
 
     checkCudaErrors(cuda.cuCtxDestroy(cuContext))
-    print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL"))
     if i + 1 != N:
-        sys.exit(-1)
+        print("Result = FAIL", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
@@ -189,23 +189,19 @@ def simpleFreeMultiDeviceMmap(dptr, size):
 
 
 def main():
-    print("Vector Addition (Driver API)")
+    import pytest
 
     if platform.system() == "Darwin":
-        print("vectorAddMMAP is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("vectorAddMMAP is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on ARMv7")
 
     if platform.machine() == "aarch64":
-        print("vectorAddMMAP is not supported on aarch64 - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on aarch64")
 
     if platform.machine() == "sbsa":
-        print("vectorAddMMAP is not supported on sbsa - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on sbsa")
 
     N = 50000
     size = N * np.dtype(np.float32).itemsize
@@ -224,8 +220,7 @@ def main():
     )
     print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.")
     if not attributeVal:
-        print(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
-        return
+        pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
 
     # The vector addition happens on cuDevice, so the allocations need to be mapped there.
     mappingDevices = [cuDevice]
@@ -298,9 +293,9 @@ def main():
 
     checkCudaErrors(cuda.cuCtxDestroy(cuContext))
 
-    print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL"))
     if i + 1 != N:
-        sys.exit(-1)
+        print("Result = FAIL", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
@@ -92,9 +92,6 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c):
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    if errorNorm / refNorm < 1.0e-6:
-        print("basicStreamOrderedAllocation PASSED")
-
     checkCudaErrors(cudart.cudaStreamDestroy(stream))
 
     return errorNorm / refNorm < 1.0e-6
@@ -188,25 +185,23 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c):
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    if errorNorm / refNorm < 1.0e-6:
-        print("streamOrderedAllocationPostSync PASSED")
-
     checkCudaErrors(cudart.cudaStreamDestroy(stream))
 
     return errorNorm / refNorm < 1.0e-6
 
 
 def main():
+    import pytest
+
     if platform.system() == "Darwin":
-        print("streamOrderedAllocation is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("streamOrderedAllocation is not supported on Mac OSX")
 
     cuda.cuInit(0)
     if checkCmdLineFlag("help"):
-        print("Usage:  streamOrderedAllocation [OPTION]\n")
-        print("Options:")
-        print("  device=[device #]  Specify the device to be used")
-        return
+        print("Usage:  streamOrderedAllocation [OPTION]\n", file=sys.stderr)
+        print("Options:", file=sys.stderr)
+        print("  device=[device #]  Specify the device to be used", file=sys.stderr)
+        sys.exit(1)
 
     dev = findCudaDevice()
 
@@ -218,8 +213,7 @@ def main():
             cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev)
         )
     if not isMemPoolSupported:
-        print("Waiving execution as device does not support Memory Pools")
-        return
+        pytest.skip("Waiving execution as device does not support Memory Pools")
 
     global _vectorAddGPU
     kernelHelper = common.KernelHelper(streamOrderedAllocation, dev)
@@ -241,7 +235,7 @@ def main():
     ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c)
 
     if not ret1 or not ret2:
-        sys.exit(-1)
+        sys.exit(1)
 
 
 if __name__ == "__main__":