diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index 2786e54b3f..576c708400 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -58,11 +58,10 @@ def elems_to_bytes(nelems, dt): def main(): - print("CUDA Clock sample") + import pytest if platform.machine() == "armv7l": - print("clock_nvrtc is not supported on ARMv7 - waiving sample") - return + pytest.skip("clock_nvrtc is not supported on ARMv7") timer = np.empty(NUM_BLOCKS * 2, dtype="int64") hinput = np.empty(NUM_THREADS * 2, dtype="float32") diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index 4681d29c1d..889bb75bd6 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -90,8 +90,9 @@ def main(): f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}" ) if deviceProps.major < 2: - print("Test requires SM 2.0 or higher for support of Texture Arrays. Test will exit...") - sys.exit() + import pytest + + pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.") # Generate input data for layered texture width = 64 @@ -208,12 +209,10 @@ def main(): checkCudaErrors(cudart.cudaFree(d_data)) checkCudaErrors(cudart.cudaFreeArray(cu_3darray)) - print("Comparing kernel output to expected data") MIN_EPSILON_ERROR = 5.0e-3 if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR: - print("Failed") - sys.exit(-1) - print("Passed") + print("Failed", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index ee5f4ea921..7f59667a79 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -24,23 +24,19 @@ def main(): - print("Starting...") + import pytest if platform.system() == "Darwin": - print("simpleP2P is not supported on Mac OSX - waiving sample") - return + pytest.skip("simpleP2P is not supported on Mac OSX") if platform.machine() == "armv7l": - print("simpleP2P is not supported on ARMv7 - waiving sample") - return + pytest.skip("simpleP2P is not supported on ARMv7") if platform.machine() == "aarch64": - print("simpleP2P is not supported on aarch64 - waiving sample") - return + pytest.skip("simpleP2P is not supported on aarch64") if platform.machine() == "sbsa": - print("simpleP2P is not supported on sbsa - waiving sample") - return + pytest.skip("simpleP2P is not supported on sbsa") # Number of GPUs print("Checking for multiple GPUs...") @@ -48,8 +44,7 @@ def main(): print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: - print("Two or more GPUs with Peer-to-Peer access capability are required") - return + pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required") prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access @@ -80,9 +75,7 @@ def main(): break if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: - print("Two or more GPUs with Peer-to-Peer access capability are required.") - print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") - return + pytest.skip("Peer to Peer access is not available amongst GPUs in the system") # Use first pair of p2p capable GPUs detected gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] @@ -239,9 +232,8 @@ def main(): checkCudaErrors(cudart.cudaSetDevice(i)) if error_count != 0: - print("Test failed!") - sys.exit(-1) - print("Test passed!") + print("Test failed!", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index e0ce7ae0aa..034f7f66cb 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -32,28 +32,26 @@ def main(): idev = 0 bPinGenericMemory = False + import pytest + if platform.system() == "Darwin": - print("simpleZeroCopy is not supported on Mac OSX - waiving sample") - return + pytest.skip("simpleZeroCopy is not supported on Mac OSX") if platform.machine() == "armv7l": - print("simpleZeroCopy is not supported on ARMv7 - waiving sample") - return + pytest.skip("simpleZeroCopy is not supported on ARMv7") if platform.machine() == "aarch64": - print("simpleZeroCopy is not supported on aarch64 - waiving sample") - return + pytest.skip("simpleZeroCopy is not supported on aarch64") if platform.machine() == "sbsa": - print("simpleZeroCopy is not supported on sbsa - waiving sample") - return + pytest.skip("simpleZeroCopy is not supported on sbsa") if checkCmdLineFlag("help"): - print("Usage: simpleZeroCopy [OPTION]\n") - print("Options:") - print(" device=[device #] Specify the device to be used") - print(" use_generic_memory (optional) use generic page-aligned for system memory") - return + print("Usage: simpleZeroCopy [OPTION]\n", file=sys.stderr) + print("Options:", file=sys.stderr) + print(" device=[device #] Specify the device to be used", file=sys.stderr) + print(" use_generic_memory (optional) use generic page-aligned for system memory", file=sys.stderr) + sys.exit(1) # Get the device selected by the user or default to 0, and then set it. if checkCmdLineFlag("device="): @@ -78,8 +76,7 @@ def main(): deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev)) if not deviceProp.canMapHostMemory: - print(f"Device {idev} does not support mapping CPU host memory!") - return + pytest.skip(f"Device {idev} does not support mapping CPU host memory!") checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) @@ -177,9 +174,8 @@ def main(): checkCudaErrors(cudart.cudaFreeHost(c)) if errorNorm / refNorm >= 1.0e-7: - print("FAILED") - sys.exit(-1) - print("PASSED") + print("FAILED", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index c86b246482..305b27648b 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -165,28 +165,24 @@ def verify(testData, length): def main(): + import pytest + if os.name == "nt": - print("Atomics not supported on Windows") - return + pytest.skip("Atomics not supported on Windows") # set device dev_id = findCudaDevice() device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id)) if not device_prop.managedMemory: - # This samples requires being run on a device that supports Unified Memory - print("Unified Memory not supported on this device") - return + pytest.skip("Unified Memory not supported on this device") computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id)) if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited: - # This sample requires being run with a default or process exclusive mode - print("This sample requires a device in either default or process exclusive mode") - return + pytest.skip("This sample requires a device in either default or process exclusive mode") if device_prop.major < 6: - print("Requires a minimum CUDA compute 6.0 capability, waiving testing.") - return + pytest.skip("Requires a minimum CUDA compute 6.0 capability") numThreads = 256 numBlocks = 64 @@ -240,9 +236,9 @@ def main(): else: checkCudaErrors(cudart.cudaFree(atom_arr)) - print("systemWideAtomics completed, returned {}".format("OK" if testResult else "ERROR!")) if not testResult: - sys.exit(-1) + print("systemWideAtomics completed with errors", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 71a9c59352..e2730b7f45 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -31,7 +31,6 @@ def main(): - print("Vector Addition (Driver API)") N = 50000 nbytes = N * np.dtype(np.float32).itemsize @@ -45,8 +44,9 @@ def main(): cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) ) if not uvaSupported: - print("Accessing pageable memory directly requires UVA") - return + import pytest + + pytest.skip("Accessing pageable memory directly requires UVA") kernelHelper = common.KernelHelper(vectorAddDrv, int(cuDevice)) _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") @@ -106,9 +106,9 @@ def main(): checkCudaErrors(cuda.cuMemFree(d_C)) checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL")) if i + 1 != N: - sys.exit(-1) + print("Result = FAIL", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index 15c6e9821c..304e8a7142 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -189,23 +189,19 @@ def simpleFreeMultiDeviceMmap(dptr, size): def main(): - print("Vector Addition (Driver API)") + import pytest if platform.system() == "Darwin": - print("vectorAddMMAP is not supported on Mac OSX - waiving sample") - return + pytest.skip("vectorAddMMAP is not supported on Mac OSX") if platform.machine() == "armv7l": - print("vectorAddMMAP is not supported on ARMv7 - waiving sample") - return + pytest.skip("vectorAddMMAP is not supported on ARMv7") if platform.machine() == "aarch64": - print("vectorAddMMAP is not supported on aarch64 - waiving sample") - return + pytest.skip("vectorAddMMAP is not supported on aarch64") if platform.machine() == "sbsa": - print("vectorAddMMAP is not supported on sbsa - waiving sample") - return + pytest.skip("vectorAddMMAP is not supported on sbsa") N = 50000 size = N * np.dtype(np.float32).itemsize @@ -224,8 +220,7 @@ def main(): ) print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.") if not attributeVal: - print(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.") - return + pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.") # The vector addition happens on cuDevice, so the allocations need to be mapped there. mappingDevices = [cuDevice] @@ -298,9 +293,9 @@ def main(): checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL")) if i + 1 != N: - sys.exit(-1) + print("Result = FAIL", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index 682250c7c0..3fd8b6caa5 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -92,9 +92,6 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): errorNorm = math.sqrt(errorNorm) refNorm = math.sqrt(refNorm) - if errorNorm / refNorm < 1.0e-6: - print("basicStreamOrderedAllocation PASSED") - checkCudaErrors(cudart.cudaStreamDestroy(stream)) return errorNorm / refNorm < 1.0e-6 @@ -188,25 +185,23 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): errorNorm = math.sqrt(errorNorm) refNorm = math.sqrt(refNorm) - if errorNorm / refNorm < 1.0e-6: - print("streamOrderedAllocationPostSync PASSED") - checkCudaErrors(cudart.cudaStreamDestroy(stream)) return errorNorm / refNorm < 1.0e-6 def main(): + import pytest + if platform.system() == "Darwin": - print("streamOrderedAllocation is not supported on Mac OSX - waiving sample") - return + pytest.skip("streamOrderedAllocation is not supported on Mac OSX") cuda.cuInit(0) if checkCmdLineFlag("help"): - print("Usage: streamOrderedAllocation [OPTION]\n") - print("Options:") - print(" device=[device #] Specify the device to be used") - return + print("Usage: streamOrderedAllocation [OPTION]\n", file=sys.stderr) + print("Options:", file=sys.stderr) + print(" device=[device #] Specify the device to be used", file=sys.stderr) + sys.exit(1) dev = findCudaDevice() @@ -218,8 +213,7 @@ def main(): cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) ) if not isMemPoolSupported: - print("Waiving execution as device does not support Memory Pools") - return + pytest.skip("Waiving execution as device does not support Memory Pools") global _vectorAddGPU kernelHelper = common.KernelHelper(streamOrderedAllocation, dev) @@ -241,7 +235,7 @@ def main(): ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c) if not ret1 or not ret2: - sys.exit(-1) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index b82c9b02b4..e1ed910c60 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -742,8 +742,8 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): h_C = checkCudaErrors(cudart.cudaMallocHost(mem_size_C)) if h_C == 0: - print("Failed to allocate host matri C!") - exit(-1) + print("Failed to allocate host matrix C!", file=sys.stderr) + sys.exit(1) d_A = checkCudaErrors(cudart.cudaMalloc(mem_size_A)) d_B = checkCudaErrors(cudart.cudaMalloc(mem_size_B)) @@ -916,7 +916,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): ) ) # arguments - print("done") checkCudaErrors(cudart.cudaStreamSynchronize(stream)) # Execute the kernel @@ -1075,7 +1074,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) checkCudaErrors(cudart.cudaStreamSynchronize(stream)) - print("Checking computed result for correctness: ") correct = True # test relative error by the formula @@ -1090,10 +1088,14 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): rel_err = abs_err / abs_val / dot_length if rel_err > eps: - print(f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}") + print( + f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}", + file=sys.stderr, + ) correct = False - print("Result = PASS" if correct else "Result = FAIL") + if not correct: + print("Result = FAIL", file=sys.stderr) # Clean up memory checkCudaErrors(cudart.cudaFreeHost(h_A)) @@ -1114,31 +1116,35 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): def main(): - common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) + import pytest - print("[globalToShmemAsyncCopy] - Starting...") + common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) if platform.machine() == "qnx": - print("globalToShmemAsyncCopy is not supported on QNX - waiving sample") - return + pytest.skip("globalToShmemAsyncCopy is not supported on QNX") version = checkCudaErrors(cuda.cuDriverGetVersion()) if version < 11010: - print("CUDA Toolkit 11.1 or greater is required") - return + pytest.skip("CUDA Toolkit 11.1 or greater is required") if checkCmdLineFlag("help") or checkCmdLineFlag("?"): - print("Usage device=n (n >= 0 for deviceID)") - print(" wA=WidthA hA=HeightA (Width x Height of Matrix A)") - print(" wB=WidthB hB=HeightB (Width x Height of Matrix B)") - print(" kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)") - print(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)") + print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr) + print(" wA=WidthA hA=HeightA (Width x Height of Matrix A)", file=sys.stderr) + print(" wB=WidthB hB=HeightB (Width x Height of Matrix B)", file=sys.stderr) + print( + " kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)", file=sys.stderr + ) + print( + " (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)", + file=sys.stderr, + ) print( - " (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)" + " (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)", + file=sys.stderr, ) - print(" (7 - NaiveLargeChunk without memcpy_async)") - print(" Note: Outer matrix dimensions of A & B matrices must be equal.") - return + print(" (7 - NaiveLargeChunk without memcpy_async)", file=sys.stderr) + print(" Note: Outer matrix dimensions of A & B matrices must be equal.", file=sys.stderr) + sys.exit(1) # This will pick the best possible CUDA capable device, otherwise # override the device ID based on input provided at the command line @@ -1169,8 +1175,8 @@ def main(): dimsB.y = int(getCmdLineArgumentInt("hB=")) if dimsA.x != dimsB.y: - print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})") - sys.exit(-1) + print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})", file=sys.stderr) + sys.exit(1) selected_kernel = kernels.AsyncCopyMultiStageLargeChunk @@ -1180,15 +1186,14 @@ def main(): if kernel_number < 8: selected_kernel = kernels(kernel_number) else: - print("Error: kernel number should be between 0 to 7, you have entered %d".format()) - sys.exit(-1) + print("Error: kernel number should be between 0 to 7", file=sys.stderr) + sys.exit(1) major = checkCudaErrors( cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) ) if major < 7: - print("globalToShmemAsyncCopy requires SM 7.0 or higher. Exiting...") - return + pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.") print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})") @@ -1213,7 +1218,7 @@ def main(): matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel) if matrix_result != 0: - sys.exit(-1) + sys.exit(1) if __name__ == "__main__": diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index b1725bb899..5f528df7cb 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -198,36 +198,31 @@ def genTridiag(I, J, val, N, nz): def main(): tol = 1e-5 - print(f"Starting [{sSDKname}]...\n") + import pytest + # WAIVE: Due to bug in NVRTC return if platform.system() == "Darwin": - print("conjugateGradientMultiBlockCG is not supported on Mac OSX - waiving sample") - return + pytest.skip("conjugateGradientMultiBlockCG is not supported on Mac OSX") if platform.machine() == "armv7l": - print("conjugateGradientMultiBlockCG is not supported on ARMv7 - waiving sample") - return + pytest.skip("conjugateGradientMultiBlockCG is not supported on ARMv7") if platform.machine() == "qnx": - print("conjugateGradientMultiBlockCG is not supported on QNX - waiving sample") - return + pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX") # This will pick the best possible CUDA capable device devID = findCudaDevice() deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) if not deviceProp.managedMemory: - # This sample requires being run on a device that supports Unified Memory - print("Unified Memory not supported on this device") - return + pytest.skip("Unified Memory not supported on this device") # This sample requires being run on a device that supports Cooperative Kernel # Launch if not deviceProp.cooperativeLaunch: - print(f"\nSelected GPU {devID:%d} does not support Cooperative Kernel Launch, Waiving the run") - return + pytest.skip(f"Selected GPU {devID} does not support Cooperative Kernel Launch") # Statistics about the GPU device print( @@ -351,7 +346,6 @@ def main(): checkCudaErrors(cudart.cudaEventDestroy(stop)) print(f"Test Summary: Error amount = {err:f}") - print("&&&& conjugateGradientMultiBlockCG %s\n" % ("PASSED" if math.sqrt(dot_result_local) < tol else "FAILED")) - if math.sqrt(dot_result_local) >= tol: - sys.exit(-1) + print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr) + sys.exit(1) diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index ee1bef5acb..b6afe06628 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -66,9 +66,11 @@ def __init__(self, code, devID): logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog)) log = b" " * logSize checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log)) - print(log.decode()) - print(err) - exit(-1) + import sys + + print(log.decode(), file=sys.stderr) + print(err, file=sys.stderr) + sys.exit(1) if use_cubin: dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog)) diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 9cc759b500..427f6bdff8 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -10,6 +10,7 @@ # # ################################################################################ +import sys import time import cupy as cp @@ -79,14 +80,14 @@ def main(): result3 = cp.empty_like(a) # Prepare launch configuration - block_size = 256 - grid_size = (size + block_size - 1) // block_size - config = LaunchConfig(grid=grid_size, block=block_size) + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) # Sync before graph capture dev.sync() - print("Building CUDA graph...") + print("Building CUDA graph...", file=sys.stderr) # Build the graph graph_builder = stream.create_graph_builder() @@ -105,13 +106,11 @@ def main(): # Complete the graph graph = graph_builder.end_building().complete() - print("Graph built successfully!") - # Upload the graph to the stream graph.upload(stream) # Execute the entire graph with a single launch - print("Executing graph...") + print("Executing graph...", file=sys.stderr) start_time = time.time() graph.launch(stream) stream.sync() @@ -125,14 +124,12 @@ def main(): expected_result2 = expected_result1 * c expected_result3 = expected_result2 - a - print("Verifying results...") assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" - print("All results verified successfully!") # Demonstrate performance benefit by running the same operations without graph - print("\nRunning same operations without graph for comparison...") + print("\nRunning same operations without graph for comparison...", file=sys.stderr) # Reset results result1.fill(0) @@ -163,8 +160,6 @@ def main(): cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream - print("\nExample completed successfully!") - if __name__ == "__main__": main() diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py index 09e955efdf..7fcfd805ee 100644 --- a/cuda_core/examples/gl_interop_plasma.py +++ b/cuda_core/examples/gl_interop_plasma.py @@ -93,8 +93,8 @@ def setup_cuda(kernel_source): dev.set_current() stream = dev.create_stream() - opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") - prog = Program(kernel_source, code_type="c++", options=opts) + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(kernel_source, code_type="c++", options=program_options) mod = prog.compile("cubin") kernel = mod.get_kernel("plasma") @@ -353,7 +353,6 @@ def on_close(): resource.close() pyglet.app.run(interval=0) - print("done!") # ======================== GPU code (CUDA + GLSL) ============================ diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index b0040708b6..a7dc8b0e8c 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -266,7 +266,7 @@ def main(): import matplotlib.pyplot as plt except ImportError: print("this example requires matplotlib installed in order to display the image", file=sys.stderr) - sys.exit(0) + sys.exit(1) result_to_display = [] lib = MockLibrary() @@ -298,4 +298,3 @@ def main(): if __name__ == "__main__": main() - print("done!") diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index 123b1f6a11..d65481a670 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -27,7 +27,7 @@ if np.__version__ < "2.1.0": print("This example requires NumPy 2.1.0 or later", file=sys.stderr) - sys.exit(0) + sys.exit(1) # Kernel for memory operations code = """ @@ -101,7 +101,7 @@ assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed" # Copy data between different memory types -print("\nCopying data between memory types...") +print("\nCopying data between memory types...", file=sys.stderr) # Copy from device to pinned memory device_buffer.copy_to(pinned_buffer, stream=stream) @@ -131,5 +131,3 @@ assert device_buffer.handle == 0, "Device buffer should be closed" assert pinned_buffer.handle == 0, "Pinned buffer should be closed" assert new_device_buffer.handle == 0, "New device buffer should be closed" - -print("Memory management example completed!") diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 433d63c9eb..ab4067d1d8 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -34,7 +34,7 @@ # Get PyTorch's current stream pt_stream = torch.cuda.current_stream() -print(f"PyTorch stream: {pt_stream}") +print(f"PyTorch stream: {pt_stream}", file=sys.stderr) # Create a wrapper class that implements __cuda_stream__ @@ -47,7 +47,7 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -s = dev.create_stream(PyTorchStreamWrapper(pt_stream)) +stream = dev.create_stream(PyTorchStreamWrapper(pt_stream)) # prepare program program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") @@ -59,7 +59,7 @@ def __cuda_stream__(self): ) # Run in single precision -ker = mod.get_kernel("saxpy_kernel") +kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float32 # prepare input/output @@ -74,17 +74,16 @@ def __cuda_stream__(self): block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) -ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) +kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on our stream -launch(s, config, ker, *ker_args) +launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a.item() * x + y) -print("Single precision test passed!") # let's repeat again with double precision -ker = mod.get_kernel("saxpy_kernel") +kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float64 # prepare input @@ -101,12 +100,10 @@ def __cuda_stream__(self): block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) -ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) +kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on PyTorch's stream -launch(s, config, ker, *ker_args) +launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a * x + y) -print("Double precision test passed!") -print("All tests passed successfully!") diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index aa0d77eff9..937206e5ef 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -34,7 +34,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() # prepare program program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") @@ -50,7 +50,7 @@ ) # run in single precision -ker = mod.get_kernel("saxpy") +kernel = mod.get_kernel("saxpy") dtype = cp.float32 # prepare input/output @@ -60,24 +60,24 @@ x = rng.random(size, dtype=dtype) y = rng.random(size, dtype=dtype) out = cp.empty_like(x) -dev.sync() # cupy runs on a different stream from s, so sync before accessing +dev.sync() # cupy runs on a different stream from stream, so sync before accessing # prepare launch block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) -ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) +kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) -# launch kernel on stream s -launch(s, config, ker, *ker_args) -s.sync() +# launch kernel on stream +launch(stream, config, kernel, *kernel_args) +stream.sync() # check result assert cp.allclose(out, a * x + y) # let's repeat again, this time allocates our own out buffer instead of cupy's # run in double precision -ker = mod.get_kernel("saxpy") +kernel = mod.get_kernel("saxpy") dtype = cp.float64 # prepare input @@ -90,18 +90,18 @@ # prepare output buf = dev.allocate( size * 8, # = dtype.itemsize - stream=s, + stream=stream, ) # prepare launch block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) -ker_args = (a, x.data.ptr, y.data.ptr, buf, size) +kernel_args = (a, x.data.ptr, y.data.ptr, buf, size) -# launch kernel on stream s -launch(s, config, ker, *ker_args) -s.sync() +# launch kernel on stream +launch(stream, config, kernel, *kernel_args) +stream.sync() # check result # we wrap output buffer as a cupy array for simplicity @@ -112,7 +112,5 @@ # clean up resources that we allocate # cupy cleans up automatically the rest -buf.close(s) -s.close() - -print("done!") +buf.close(stream) +stream.close() diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index 8b14cf0767..baf86ebc03 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -237,5 +237,7 @@ def show_device_properties(): if __name__ == "__main__": - assert len(sys.argv) == 1, "no command-line arguments expected" + if len(sys.argv) != 1: + print("no command-line arguments expected", file=sys.stderr) + sys.exit(1) show_device_properties() diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 497a4309cf..19e271e712 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -12,11 +12,11 @@ import sys import cupy as cp -from cuda.core import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system if system.get_num_devices() < 2: print("this example requires at least 2 GPUs", file=sys.stderr) - sys.exit(0) + sys.exit(1) dtype = cp.float32 size = 50000 @@ -39,9 +39,9 @@ } } """ -prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"}) +prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")) mod_add = prog_add.compile("cubin") -ker_add = mod_add.get_kernel("vector_add") +add_kernel = mod_add.get_kernel("vector_add") # Set GPU 1 dev1 = Device(1) @@ -61,9 +61,9 @@ } } """ -prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"}) +prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")) mod_sub = prog_sub.compile("cubin") -ker_sub = mod_sub.get_kernel("vector_sub") +sub_kernel = mod_sub.get_kernel("vector_sub") # This adaptor ensures that any foreign stream (ex: from CuPy) that have not @@ -99,7 +99,7 @@ def __cuda_stream__(self): stream0.wait(cp_stream0) # Launch the add kernel on GPU 0 / stream 0 -launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) +launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) # Allocate memory on GPU 1 # Note: This runs on CuPy's current stream for GPU 1. @@ -114,7 +114,7 @@ def __cuda_stream__(self): stream1.wait(cp_stream1) # Launch the subtract kernel on GPU 1 / stream 1 -launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) +launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) # Synchronize both GPUs are validate the results dev0.set_current() @@ -123,5 +123,3 @@ def __cuda_stream__(self): dev1.set_current() stream1.sync() assert cp.allclose(z, x - y) - -print("done") diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index a20377cc76..308b95d74d 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -23,8 +23,8 @@ try: from cffi import FFI except ImportError: - print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) - FFI = None + print("cffi is not installed, this example requires cffi", file=sys.stderr) + sys.exit(1) import numpy as np from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory @@ -80,8 +80,6 @@ def my_func(arr): def run(): global my_func - if not FFI: - return # Here is a concrete (very naive!) implementation on CPU: cpu_code = string.Template(r""" extern "C" diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index e91ddc25cc..dd8c3c8557 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -20,8 +20,8 @@ try: import cupy as cp except ImportError: - print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) - cp = None + print("cupy is not installed, this example requires cupy", file=sys.stderr) + sys.exit(1) import numpy as np from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory @@ -56,7 +56,7 @@ # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both # of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) -def my_func(arr, work_stream, gpu_ker): +def my_func(arr, work_stream, kernel): # Create a memory view over arr (assumed to be a 1D array of int32). The stream # ordering is taken care of, so that arr can be safely accessed on our work # stream (ordered after a data stream on which arr is potentially prepared). @@ -72,7 +72,7 @@ def my_func(arr, work_stream, gpu_ker): block = 256 grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size)) + launch(work_stream, config, kernel, view.ptr, np.uint64(size)) # Here we're being conservative and synchronize over our work stream, # assuming we do not know the data stream; if we know then we could # just order the data stream after the work stream here, e.g. @@ -85,8 +85,6 @@ def my_func(arr, work_stream, gpu_ker): def run(): global my_func - if not cp: - return None # Here is a concrete (very naive!) implementation on GPU: gpu_code = string.Template(r""" extern "C" @@ -102,24 +100,24 @@ def run(): # To know the GPU's compute capability, we need to identify which GPU to use. dev = Device(0) dev.set_current() - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) - mod = gpu_prog.compile(target_type="cubin") - gpu_ker = mod.get_kernel(func_name) + prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) + mod = prog.compile(target_type="cubin") + kernel = mod.get_kernel(func_name) - s = dev.create_stream() + stream = dev.create_stream() try: # Create input array on GPU arr_gpu = cp.ones(1024, dtype=cp.int32) print(f"before: {arr_gpu[:10]=}") # Run the workload - my_func(arr_gpu, s, gpu_ker) + my_func(arr_gpu, stream, kernel) # Check the result print(f"after: {arr_gpu[:10]=}") assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) finally: - s.close() + stream.close() if __name__ == "__main__": diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index f1ea8b8579..b72ca3ca7e 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -24,15 +24,17 @@ if np.lib.NumpyVersion(np.__version__) < "2.2.5": print("This example requires NumPy 2.2.5 or later", file=sys.stderr) - sys.exit(0) + sys.exit(1) # prepare include cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) if cuda_path is None: print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr) - sys.exit(0) + sys.exit(1) cuda_include = os.path.join(cuda_path, "include") -assert os.path.isdir(cuda_include) +if not os.path.isdir(cuda_include): + print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr) + sys.exit(1) include_path = [cuda_include] cccl_include = os.path.join(cuda_include, "cccl") if os.path.isdir(cccl_include): @@ -80,7 +82,7 @@ "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)", file=sys.stderr, ) - sys.exit(0) + sys.exit(1) arch = "".join(f"{i}" for i in arch) # prepare program & compile kernel @@ -91,7 +93,7 @@ options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) mod = prog.compile(target_type="cubin") -ker = mod.get_kernel("check_cluster_info") +kernel = mod.get_kernel("check_cluster_info") # prepare launch config grid = 4 @@ -119,7 +121,7 @@ block_dims[:] = 0 # launch kernel on the default stream -launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer) +launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer) dev.sync() # verify results @@ -133,15 +135,6 @@ expected_grid_blocks = grid * cluster # 4 * 2 = 8 actual_grid_blocks = grid_dims[0] -print("\nVerification:") -print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster") -print(f"Expected total blocks: {expected_grid_blocks}") -print(f"Actual total blocks: {actual_grid_blocks}") - -if actual_grid_blocks == expected_grid_blocks: - print("✓ Grid conversion is correct!") -else: - print("✗ Grid conversion failed!") - sys.exit(1) - -print("done!") +assert actual_grid_blocks == expected_grid_blocks, ( + f"Grid conversion failed: expected {expected_grid_blocks} total blocks, got {actual_grid_blocks}" +) diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index d31ab77208..b4ad2efd1b 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -29,7 +29,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() # prepare program program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") @@ -37,7 +37,7 @@ mod = prog.compile("cubin", name_expressions=("vector_add",)) # run in single precision -ker = mod.get_kernel("vector_add") +kernel = mod.get_kernel("vector_add") dtype = cp.float32 # prepare input/output @@ -47,7 +47,7 @@ b = rng.random(size, dtype=dtype) c = cp.empty_like(a) -# cupy runs on a different stream from s, so sync before accessing +# cupy runs on a different stream from stream, so sync before accessing dev.sync() # prepare launch @@ -55,10 +55,9 @@ grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) -# launch kernel on stream s -launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) -s.sync() +# launch kernel on stream +launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) +stream.sync() # check result assert cp.allclose(c, a + b) -print("done!")