diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
index 2786e54b3f..576c708400 100644
--- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
+++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -58,11 +58,10 @@ def elems_to_bytes(nelems, dt):
 
 
 def main():
-    print("CUDA Clock sample")
+    import pytest
 
     if platform.machine() == "armv7l":
-        print("clock_nvrtc is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("clock_nvrtc is not supported on ARMv7")
 
     timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
     hinput = np.empty(NUM_THREADS * 2, dtype="float32")
diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
index 4681d29c1d..889bb75bd6 100644
--- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -90,8 +90,9 @@ def main():
         f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}"
     )
     if deviceProps.major < 2:
-        print("Test requires SM 2.0 or higher for support of Texture Arrays.  Test will exit...")
-        sys.exit()
+        import pytest
+
+        pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.")
 
     # Generate input data for layered texture
     width = 64
@@ -208,12 +209,10 @@ def main():
     checkCudaErrors(cudart.cudaFree(d_data))
     checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
 
-    print("Comparing kernel output to expected data")
     MIN_EPSILON_ERROR = 5.0e-3
     if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR:
-        print("Failed")
-        sys.exit(-1)
-    print("Passed")
+        print("Failed", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
index ee5f4ea921..7f59667a79 100644
--- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
@@ -24,23 +24,19 @@
 
 
 def main():
-    print("Starting...")
+    import pytest
 
     if platform.system() == "Darwin":
-        print("simpleP2P is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("simpleP2P is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on ARMv7")
 
     if platform.machine() == "aarch64":
-        print("simpleP2P is not supported on aarch64 - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on aarch64")
 
     if platform.machine() == "sbsa":
-        print("simpleP2P is not supported on sbsa - waiving sample")
-        return
+        pytest.skip("simpleP2P is not supported on sbsa")
 
     # Number of GPUs
     print("Checking for multiple GPUs...")
@@ -48,8 +44,7 @@ def main():
     print(f"CUDA-capable device count: {gpu_n}")
 
     if gpu_n < 2:
-        print("Two or more GPUs with Peer-to-Peer access capability are required")
-        return
+        pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required")
 
     prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)]
     # Check possibility for peer access
@@ -80,9 +75,7 @@ def main():
             break
 
     if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1:
-        print("Two or more GPUs with Peer-to-Peer access capability are required.")
-        print("Peer to Peer access is not available amongst GPUs in the system, waiving test.")
-        return
+        pytest.skip("Peer to Peer access is not available amongst GPUs in the system")
 
     # Use first pair of p2p capable GPUs detected
     gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]]
@@ -239,9 +232,8 @@ def main():
         checkCudaErrors(cudart.cudaSetDevice(i))
 
     if error_count != 0:
-        print("Test failed!")
-        sys.exit(-1)
-    print("Test passed!")
+        print("Test failed!", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
index e0ce7ae0aa..034f7f66cb 100644
--- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
@@ -32,28 +32,26 @@ def main():
     idev = 0
     bPinGenericMemory = False
 
+    import pytest
+
     if platform.system() == "Darwin":
-        print("simpleZeroCopy is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("simpleZeroCopy is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on ARMv7")
 
     if platform.machine() == "aarch64":
-        print("simpleZeroCopy is not supported on aarch64 - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on aarch64")
 
     if platform.machine() == "sbsa":
-        print("simpleZeroCopy is not supported on sbsa - waiving sample")
-        return
+        pytest.skip("simpleZeroCopy is not supported on sbsa")
 
     if checkCmdLineFlag("help"):
-        print("Usage:  simpleZeroCopy [OPTION]\n")
-        print("Options:")
-        print("  device=[device #]  Specify the device to be used")
-        print("  use_generic_memory (optional) use generic page-aligned for system memory")
-        return
+        print("Usage:  simpleZeroCopy [OPTION]\n", file=sys.stderr)
+        print("Options:", file=sys.stderr)
+        print("  device=[device #]  Specify the device to be used", file=sys.stderr)
+        print("  use_generic_memory (optional) use generic page-aligned for system memory", file=sys.stderr)
+        sys.exit(1)
 
     # Get the device selected by the user or default to 0, and then set it.
     if checkCmdLineFlag("device="):
@@ -78,8 +76,7 @@ def main():
     deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev))
 
     if not deviceProp.canMapHostMemory:
-        print(f"Device {idev} does not support mapping CPU host memory!")
-        return
+        pytest.skip(f"Device {idev} does not support mapping CPU host memory!")
 
     checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost))
 
@@ -177,9 +174,8 @@ def main():
         checkCudaErrors(cudart.cudaFreeHost(c))
 
     if errorNorm / refNorm >= 1.0e-7:
-        print("FAILED")
-        sys.exit(-1)
-    print("PASSED")
+        print("FAILED", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
index c86b246482..305b27648b 100644
--- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
+++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
@@ -165,28 +165,24 @@ def verify(testData, length):
 
 
 def main():
+    import pytest
+
     if os.name == "nt":
-        print("Atomics not supported on Windows")
-        return
+        pytest.skip("Atomics not supported on Windows")
 
     # set device
     dev_id = findCudaDevice()
     device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id))
 
     if not device_prop.managedMemory:
-        # This samples requires being run on a device that supports Unified Memory
-        print("Unified Memory not supported on this device")
-        return
+        pytest.skip("Unified Memory not supported on this device")
 
     computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id))
     if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited:
-        # This sample requires being run with a default or process exclusive mode
-        print("This sample requires a device in either default or process exclusive mode")
-        return
+        pytest.skip("This sample requires a device in either default or process exclusive mode")
 
     if device_prop.major < 6:
-        print("Requires a minimum CUDA compute 6.0 capability, waiving testing.")
-        return
+        pytest.skip("Requires a minimum CUDA compute 6.0 capability")
 
     numThreads = 256
     numBlocks = 64
@@ -240,9 +236,9 @@ def main():
     else:
         checkCudaErrors(cudart.cudaFree(atom_arr))
 
-    print("systemWideAtomics completed, returned {}".format("OK" if testResult else "ERROR!"))
     if not testResult:
-        sys.exit(-1)
+        print("systemWideAtomics completed with errors", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
index 71a9c59352..e2730b7f45 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
@@ -31,7 +31,6 @@
 
 
 def main():
-    print("Vector Addition (Driver API)")
     N = 50000
     nbytes = N * np.dtype(np.float32).itemsize
 
@@ -45,8 +44,9 @@ def main():
         cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
     )
     if not uvaSupported:
-        print("Accessing pageable memory directly requires UVA")
-        return
+        import pytest
+
+        pytest.skip("Accessing pageable memory directly requires UVA")
 
     kernelHelper = common.KernelHelper(vectorAddDrv, int(cuDevice))
     _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
@@ -106,9 +106,9 @@ def main():
     checkCudaErrors(cuda.cuMemFree(d_C))
 
     checkCudaErrors(cuda.cuCtxDestroy(cuContext))
-    print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL"))
     if i + 1 != N:
-        sys.exit(-1)
+        print("Result = FAIL", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
index 15c6e9821c..304e8a7142 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
@@ -189,23 +189,19 @@ def simpleFreeMultiDeviceMmap(dptr, size):
 
 
 def main():
-    print("Vector Addition (Driver API)")
+    import pytest
 
     if platform.system() == "Darwin":
-        print("vectorAddMMAP is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("vectorAddMMAP is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on ARMv7")
 
     if platform.machine() == "aarch64":
-        print("vectorAddMMAP is not supported on aarch64 - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on aarch64")
 
     if platform.machine() == "sbsa":
-        print("vectorAddMMAP is not supported on sbsa - waiving sample")
-        return
+        pytest.skip("vectorAddMMAP is not supported on sbsa")
 
     N = 50000
     size = N * np.dtype(np.float32).itemsize
@@ -224,8 +220,7 @@ def main():
     )
     print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.")
     if not attributeVal:
-        print(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
-        return
+        pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
 
     # The vector addition happens on cuDevice, so the allocations need to be mapped there.
     mappingDevices = [cuDevice]
@@ -298,9 +293,9 @@ def main():
 
     checkCudaErrors(cuda.cuCtxDestroy(cuContext))
 
-    print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL"))
     if i + 1 != N:
-        sys.exit(-1)
+        print("Result = FAIL", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
index 682250c7c0..3fd8b6caa5 100644
--- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
+++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
@@ -92,9 +92,6 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c):
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    if errorNorm / refNorm < 1.0e-6:
-        print("basicStreamOrderedAllocation PASSED")
-
     checkCudaErrors(cudart.cudaStreamDestroy(stream))
 
     return errorNorm / refNorm < 1.0e-6
@@ -188,25 +185,23 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c):
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    if errorNorm / refNorm < 1.0e-6:
-        print("streamOrderedAllocationPostSync PASSED")
-
     checkCudaErrors(cudart.cudaStreamDestroy(stream))
 
     return errorNorm / refNorm < 1.0e-6
 
 
 def main():
+    import pytest
+
     if platform.system() == "Darwin":
-        print("streamOrderedAllocation is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("streamOrderedAllocation is not supported on Mac OSX")
 
     cuda.cuInit(0)
     if checkCmdLineFlag("help"):
-        print("Usage:  streamOrderedAllocation [OPTION]\n")
-        print("Options:")
-        print("  device=[device #]  Specify the device to be used")
-        return
+        print("Usage:  streamOrderedAllocation [OPTION]\n", file=sys.stderr)
+        print("Options:", file=sys.stderr)
+        print("  device=[device #]  Specify the device to be used", file=sys.stderr)
+        sys.exit(1)
 
     dev = findCudaDevice()
 
@@ -218,8 +213,7 @@ def main():
             cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev)
         )
     if not isMemPoolSupported:
-        print("Waiving execution as device does not support Memory Pools")
-        return
+        pytest.skip("Waiving execution as device does not support Memory Pools")
 
     global _vectorAddGPU
     kernelHelper = common.KernelHelper(streamOrderedAllocation, dev)
@@ -241,7 +235,7 @@ def main():
     ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c)
 
     if not ret1 or not ret2:
-        sys.exit(-1)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
index b82c9b02b4..e1ed910c60 100644
--- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -742,8 +742,8 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     h_C = checkCudaErrors(cudart.cudaMallocHost(mem_size_C))
 
     if h_C == 0:
-        print("Failed to allocate host matri C!")
-        exit(-1)
+        print("Failed to allocate host matrix C!", file=sys.stderr)
+        sys.exit(1)
 
     d_A = checkCudaErrors(cudart.cudaMalloc(mem_size_A))
     d_B = checkCudaErrors(cudart.cudaMalloc(mem_size_B))
@@ -916,7 +916,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
             )
         )  # arguments
 
-    print("done")
     checkCudaErrors(cudart.cudaStreamSynchronize(stream))
 
     # Execute the kernel
@@ -1075,7 +1074,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream))
     checkCudaErrors(cudart.cudaStreamSynchronize(stream))
 
-    print("Checking computed result for correctness: ")
     correct = True
 
     # test relative error by the formula
@@ -1090,10 +1088,14 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
         rel_err = abs_err / abs_val / dot_length
 
         if rel_err > eps:
-            print(f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}")
+            print(
+                f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}",
+                file=sys.stderr,
+            )
             correct = False
 
-    print("Result = PASS" if correct else "Result = FAIL")
+    if not correct:
+        print("Result = FAIL", file=sys.stderr)
 
     # Clean up memory
     checkCudaErrors(cudart.cudaFreeHost(h_A))
@@ -1114,31 +1116,35 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
 
 
 def main():
-    common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))
+    import pytest
 
-    print("[globalToShmemAsyncCopy] - Starting...")
+    common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))
 
     if platform.machine() == "qnx":
-        print("globalToShmemAsyncCopy is not supported on QNX - waiving sample")
-        return
+        pytest.skip("globalToShmemAsyncCopy is not supported on QNX")
 
     version = checkCudaErrors(cuda.cuDriverGetVersion())
     if version < 11010:
-        print("CUDA Toolkit 11.1 or greater is required")
-        return
+        pytest.skip("CUDA Toolkit 11.1 or greater is required")
 
     if checkCmdLineFlag("help") or checkCmdLineFlag("?"):
-        print("Usage device=n (n >= 0 for deviceID)")
-        print("      wA=WidthA hA=HeightA (Width x Height of Matrix A)")
-        print("      wB=WidthB hB=HeightB (Width x Height of Matrix B)")
-        print("      kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)")
-        print("                            (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)")
+        print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr)
+        print("      wA=WidthA hA=HeightA (Width x Height of Matrix A)", file=sys.stderr)
+        print("      wB=WidthB hB=HeightB (Width x Height of Matrix B)", file=sys.stderr)
+        print(
+            "      kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)", file=sys.stderr
+        )
+        print(
+            "                            (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)",
+            file=sys.stderr,
+        )
         print(
-            "                            (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)"
+            "                            (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)",
+            file=sys.stderr,
         )
-        print("                            (7 - NaiveLargeChunk without memcpy_async)")
-        print("  Note: Outer matrix dimensions of A & B matrices must be equal.")
-        return
+        print("                            (7 - NaiveLargeChunk without memcpy_async)", file=sys.stderr)
+        print("  Note: Outer matrix dimensions of A & B matrices must be equal.", file=sys.stderr)
+        sys.exit(1)
 
     # This will pick the best possible CUDA capable device, otherwise
     # override the device ID based on input provided at the command line
@@ -1169,8 +1175,8 @@ def main():
         dimsB.y = int(getCmdLineArgumentInt("hB="))
 
     if dimsA.x != dimsB.y:
-        print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})")
-        sys.exit(-1)
+        print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})", file=sys.stderr)
+        sys.exit(1)
 
     selected_kernel = kernels.AsyncCopyMultiStageLargeChunk
 
@@ -1180,15 +1186,14 @@ def main():
         if kernel_number < 8:
             selected_kernel = kernels(kernel_number)
         else:
-            print("Error: kernel number should be between 0 to 7, you have entered %d".format())
-            sys.exit(-1)
+            print("Error: kernel number should be between 0 to 7", file=sys.stderr)
+            sys.exit(1)
 
     major = checkCudaErrors(
         cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)
     )
     if major < 7:
-        print("globalToShmemAsyncCopy requires SM 7.0 or higher.  Exiting...")
-        return
+        pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.")
 
     print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})")
 
@@ -1213,7 +1218,7 @@ def main():
     matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel)
 
     if matrix_result != 0:
-        sys.exit(-1)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
index b1725bb899..5f528df7cb 100644
--- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
+++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
@@ -198,36 +198,31 @@ def genTridiag(I, J, val, N, nz):
 def main():
     tol = 1e-5
 
-    print(f"Starting [{sSDKname}]...\n")
+    import pytest
+
     # WAIVE: Due to bug in NVRTC
     return
 
     if platform.system() == "Darwin":
-        print("conjugateGradientMultiBlockCG is not supported on Mac OSX - waiving sample")
-        return
+        pytest.skip("conjugateGradientMultiBlockCG is not supported on Mac OSX")
 
     if platform.machine() == "armv7l":
-        print("conjugateGradientMultiBlockCG is not supported on ARMv7 - waiving sample")
-        return
+        pytest.skip("conjugateGradientMultiBlockCG is not supported on ARMv7")
 
     if platform.machine() == "qnx":
-        print("conjugateGradientMultiBlockCG is not supported on QNX - waiving sample")
-        return
+        pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX")
 
     # This will pick the best possible CUDA capable device
     devID = findCudaDevice()
     deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(devID))
 
     if not deviceProp.managedMemory:
-        # This sample requires being run on a device that supports Unified Memory
-        print("Unified Memory not supported on this device")
-        return
+        pytest.skip("Unified Memory not supported on this device")
 
     # This sample requires being run on a device that supports Cooperative Kernel
     # Launch
     if not deviceProp.cooperativeLaunch:
-        print(f"\nSelected GPU {devID:%d} does not support Cooperative Kernel Launch, Waiving the run")
-        return
+        pytest.skip(f"Selected GPU {devID} does not support Cooperative Kernel Launch")
 
     # Statistics about the GPU device
     print(
@@ -351,7 +346,6 @@ def main():
     checkCudaErrors(cudart.cudaEventDestroy(stop))
 
     print(f"Test Summary:  Error amount = {err:f}")
-    print("&&&& conjugateGradientMultiBlockCG %s\n" % ("PASSED" if math.sqrt(dot_result_local) < tol else "FAILED"))
-
     if math.sqrt(dot_result_local) >= tol:
-        sys.exit(-1)
+        print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr)
+        sys.exit(1)
diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
index ee1bef5acb..b6afe06628 100644
--- a/cuda_bindings/examples/common/common.py
+++ b/cuda_bindings/examples/common/common.py
@@ -66,9 +66,11 @@ def __init__(self, code, devID):
             logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
             log = b" " * logSize
             checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
-            print(log.decode())
-            print(err)
-            exit(-1)
+            import sys
+
+            print(log.decode(), file=sys.stderr)
+            print(err, file=sys.stderr)
+            sys.exit(1)
 
         if use_cubin:
             dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog))
diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
index 9cc759b500..427f6bdff8 100644
--- a/cuda_core/examples/cuda_graphs.py
+++ b/cuda_core/examples/cuda_graphs.py
@@ -10,6 +10,7 @@
 #
 # ################################################################################
 
+import sys
 import time
 
 import cupy as cp
@@ -79,14 +80,14 @@ def main():
     result3 = cp.empty_like(a)
 
     # Prepare launch configuration
-    block_size = 256
-    grid_size = (size + block_size - 1) // block_size
-    config = LaunchConfig(grid=grid_size, block=block_size)
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
 
     # Sync before graph capture
     dev.sync()
 
-    print("Building CUDA graph...")
+    print("Building CUDA graph...", file=sys.stderr)
 
     # Build the graph
     graph_builder = stream.create_graph_builder()
@@ -105,13 +106,11 @@ def main():
     # Complete the graph
     graph = graph_builder.end_building().complete()
 
-    print("Graph built successfully!")
-
     # Upload the graph to the stream
     graph.upload(stream)
 
     # Execute the entire graph with a single launch
-    print("Executing graph...")
+    print("Executing graph...", file=sys.stderr)
     start_time = time.time()
     graph.launch(stream)
     stream.sync()
@@ -125,14 +124,12 @@ def main():
     expected_result2 = expected_result1 * c
     expected_result3 = expected_result2 - a
 
-    print("Verifying results...")
     assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch"
     assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch"
     assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch"
-    print("All results verified successfully!")
 
     # Demonstrate performance benefit by running the same operations without graph
-    print("\nRunning same operations without graph for comparison...")
+    print("\nRunning same operations without graph for comparison...", file=sys.stderr)
 
     # Reset results
     result1.fill(0)
@@ -163,8 +160,6 @@ def main():
 
     cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
 
-    print("\nExample completed successfully!")
-
 
 if __name__ == "__main__":
     main()
diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py
index 09e955efdf..7fcfd805ee 100644
--- a/cuda_core/examples/gl_interop_plasma.py
+++ b/cuda_core/examples/gl_interop_plasma.py
@@ -93,8 +93,8 @@ def setup_cuda(kernel_source):
     dev.set_current()
     stream = dev.create_stream()
 
-    opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(kernel_source, code_type="c++", options=opts)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(kernel_source, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("plasma")
 
@@ -353,7 +353,6 @@ def on_close():
         resource.close()
 
     pyglet.app.run(interval=0)
-    print("done!")
 
 
 # ======================== GPU code (CUDA + GLSL) ============================
diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index b0040708b6..a7dc8b0e8c 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -266,7 +266,7 @@ def main():
             import matplotlib.pyplot as plt
         except ImportError:
             print("this example requires matplotlib installed in order to display the image", file=sys.stderr)
-            sys.exit(0)
+            sys.exit(1)
 
     result_to_display = []
     lib = MockLibrary()
@@ -298,4 +298,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-    print("done!")
diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py
index 123b1f6a11..d65481a670 100644
--- a/cuda_core/examples/memory_ops.py
+++ b/cuda_core/examples/memory_ops.py
@@ -27,7 +27,7 @@
 
 if np.__version__ < "2.1.0":
     print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
-    sys.exit(0)
+    sys.exit(1)
 
 # Kernel for memory operations
 code = """
@@ -101,7 +101,7 @@
 assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
 
 # Copy data between different memory types
-print("\nCopying data between memory types...")
+print("\nCopying data between memory types...", file=sys.stderr)
 
 # Copy from device to pinned memory
 device_buffer.copy_to(pinned_buffer, stream=stream)
@@ -131,5 +131,3 @@
 assert device_buffer.handle == 0, "Device buffer should be closed"
 assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
 assert new_device_buffer.handle == 0, "New device buffer should be closed"
-
-print("Memory management example completed!")
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py
index 433d63c9eb..ab4067d1d8 100644
--- a/cuda_core/examples/pytorch_example.py
+++ b/cuda_core/examples/pytorch_example.py
@@ -34,7 +34,7 @@
 
 # Get PyTorch's current stream
 pt_stream = torch.cuda.current_stream()
-print(f"PyTorch stream: {pt_stream}")
+print(f"PyTorch stream: {pt_stream}", file=sys.stderr)
 
 
 # Create a wrapper class that implements __cuda_stream__
@@ -47,7 +47,7 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python
 
 
-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -59,7 +59,7 @@ def __cuda_stream__(self):
 )
 
 # Run in single precision
-ker = mod.get_kernel("saxpy_kernel<float>")
+kernel = mod.get_kernel("saxpy_kernel<float>")
 dtype = torch.float32
 
 # prepare input/output
@@ -74,17 +74,16 @@ def __cuda_stream__(self):
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on our stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a.item() * x + y)
-print("Single precision test passed!")
 
 # let's repeat again with double precision
-ker = mod.get_kernel("saxpy_kernel<double>")
+kernel = mod.get_kernel("saxpy_kernel<double>")
 dtype = torch.float64
 
 # prepare input
@@ -101,12 +100,10 @@ def __cuda_stream__(self):
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on PyTorch's stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a * x + y)
-print("Double precision test passed!")
-print("All tests passed successfully!")
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index aa0d77eff9..937206e5ef 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -34,7 +34,7 @@
 
 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -50,7 +50,7 @@
 )
 
 # run in single precision
-ker = mod.get_kernel("saxpy<float>")
+kernel = mod.get_kernel("saxpy<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -60,24 +60,24 @@
 x = rng.random(size, dtype=dtype)
 y = rng.random(size, dtype=dtype)
 out = cp.empty_like(x)
-dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+dev.sync()  # cupy runs on a different stream from ours, so sync before accessing
 
 # prepare launch
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on our stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 assert cp.allclose(out, a * x + y)
 
 # let's repeat again, this time allocates our own out buffer instead of cupy's
 # run in double precision
-ker = mod.get_kernel("saxpy<double>")
+kernel = mod.get_kernel("saxpy<double>")
 dtype = cp.float64
 
 # prepare input
@@ -90,18 +90,18 @@
 # prepare output
 buf = dev.allocate(
     size * 8,  # = dtype.itemsize
-    stream=s,
+    stream=stream,
 )
 
 # prepare launch
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on our stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 # we wrap output buffer as a cupy array for simplicity
@@ -112,7 +112,5 @@
 
 # clean up resources that we allocate
 # cupy cleans up automatically the rest
-buf.close(s)
-s.close()
-
-print("done!")
+buf.close(stream)
+stream.close()
diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py
index 8b14cf0767..baf86ebc03 100644
--- a/cuda_core/examples/show_device_properties.py
+++ b/cuda_core/examples/show_device_properties.py
@@ -237,5 +237,7 @@ def show_device_properties():
 
 
 if __name__ == "__main__":
-    assert len(sys.argv) == 1, "no command-line arguments expected"
+    if len(sys.argv) != 1:
+        print("no command-line arguments expected", file=sys.stderr)
+        sys.exit(1)
     show_device_properties()
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index 497a4309cf..19e271e712 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -12,11 +12,11 @@
 import sys
 
 import cupy as cp
-from cuda.core import Device, LaunchConfig, Program, launch, system
+from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
 
 if system.get_num_devices() < 2:
     print("this example requires at least 2 GPUs", file=sys.stderr)
-    sys.exit(0)
+    sys.exit(1)
 
 dtype = cp.float32
 size = 50000
@@ -39,9 +39,9 @@
     }
 }
 """
-prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
 mod_add = prog_add.compile("cubin")
-ker_add = mod_add.get_kernel("vector_add")
+add_kernel = mod_add.get_kernel("vector_add")
 
 # Set GPU 1
 dev1 = Device(1)
@@ -61,9 +61,9 @@
     }
 }
 """
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
 mod_sub = prog_sub.compile("cubin")
-ker_sub = mod_sub.get_kernel("vector_sub")
+sub_kernel = mod_sub.get_kernel("vector_sub")
 
 
 # This adaptor ensures that any foreign stream (ex: from CuPy) that have not
@@ -99,7 +99,7 @@ def __cuda_stream__(self):
 stream0.wait(cp_stream0)
 
 # Launch the add kernel on GPU 0 / stream 0
-launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
 
 # Allocate memory on GPU 1
 # Note: This runs on CuPy's current stream for GPU 1.
@@ -114,7 +114,7 @@ def __cuda_stream__(self):
 stream1.wait(cp_stream1)
 
 # Launch the subtract kernel on GPU 1 / stream 1
-launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
 
 # Synchronize both GPUs are validate the results
 dev0.set_current()
@@ -123,5 +123,3 @@ def __cuda_stream__(self):
 dev1.set_current()
 stream1.sync()
 assert cp.allclose(z, x - y)
-
-print("done")
diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py
index a20377cc76..308b95d74d 100644
--- a/cuda_core/examples/strided_memory_view_cpu.py
+++ b/cuda_core/examples/strided_memory_view_cpu.py
@@ -23,8 +23,8 @@
 try:
     from cffi import FFI
 except ImportError:
-    print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
-    FFI = None
+    print("cffi is not installed, this example requires cffi", file=sys.stderr)
+    sys.exit(1)
 import numpy as np
 from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory
 
@@ -80,8 +80,6 @@ def my_func(arr):
 
 def run():
     global my_func
-    if not FFI:
-        return
     # Here is a concrete (very naive!) implementation on CPU:
     cpu_code = string.Template(r"""
     extern "C"
diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py
index e91ddc25cc..dd8c3c8557 100644
--- a/cuda_core/examples/strided_memory_view_gpu.py
+++ b/cuda_core/examples/strided_memory_view_gpu.py
@@ -20,8 +20,8 @@
 try:
     import cupy as cp
 except ImportError:
-    print("cupy is not installed, the GPU example will be skipped", file=sys.stderr)
-    cp = None
+    print("cupy is not installed, this example requires cupy", file=sys.stderr)
+    sys.exit(1)
 import numpy as np
 from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
 from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory
@@ -56,7 +56,7 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream, gpu_ker):
+def my_func(arr, work_stream, kernel):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
@@ -72,7 +72,7 @@ def my_func(arr, work_stream, gpu_ker):
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
-    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    launch(work_stream, config, kernel, view.ptr, np.uint64(size))
     # Here we're being conservative and synchronize over our work stream,
     # assuming we do not know the data stream; if we know then we could
     # just order the data stream after the work stream here, e.g.
@@ -85,8 +85,6 @@ def my_func(arr, work_stream, gpu_ker):
 
 def run():
     global my_func
-    if not cp:
-        return None
     # Here is a concrete (very naive!) implementation on GPU:
     gpu_code = string.Template(r"""
     extern "C"
@@ -102,24 +100,24 @@ def run():
     # To know the GPU's compute capability, we need to identify which GPU to use.
     dev = Device(0)
     dev.set_current()
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
+    prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel(func_name)
 
-    s = dev.create_stream()
+    stream = dev.create_stream()
     try:
         # Create input array on GPU
         arr_gpu = cp.ones(1024, dtype=cp.int32)
         print(f"before: {arr_gpu[:10]=}")
 
         # Run the workload
-        my_func(arr_gpu, s, gpu_ker)
+        my_func(arr_gpu, stream, kernel)
 
         # Check the result
         print(f"after: {arr_gpu[:10]=}")
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
-        s.close()
+        stream.close()
 
 
 if __name__ == "__main__":
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py
index f1ea8b8579..b72ca3ca7e 100644
--- a/cuda_core/examples/thread_block_cluster.py
+++ b/cuda_core/examples/thread_block_cluster.py
@@ -24,15 +24,17 @@
 
 if np.lib.NumpyVersion(np.__version__) < "2.2.5":
     print("This example requires NumPy 2.2.5 or later", file=sys.stderr)
-    sys.exit(0)
+    sys.exit(1)
 
 # prepare include
 cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
 if cuda_path is None:
     print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
-    sys.exit(0)
+    sys.exit(1)
 cuda_include = os.path.join(cuda_path, "include")
-assert os.path.isdir(cuda_include)
+if not os.path.isdir(cuda_include):
+    print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr)
+    sys.exit(1)
 include_path = [cuda_include]
 cccl_include = os.path.join(cuda_include, "cccl")
 if os.path.isdir(cccl_include):
@@ -80,7 +82,7 @@
         "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)",
         file=sys.stderr,
     )
-    sys.exit(0)
+    sys.exit(1)
 arch = "".join(f"{i}" for i in arch)
 
 # prepare program & compile kernel
@@ -91,7 +93,7 @@
     options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
 )
 mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
+kernel = mod.get_kernel("check_cluster_info")
 
 # prepare launch config
 grid = 4
@@ -119,7 +121,7 @@
 block_dims[:] = 0
 
 # launch kernel on the default stream
-launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
 dev.sync()
 
 # verify results
@@ -133,15 +135,6 @@
 expected_grid_blocks = grid * cluster  # 4 * 2 = 8
 actual_grid_blocks = grid_dims[0]
 
-print("\nVerification:")
-print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster")
-print(f"Expected total blocks: {expected_grid_blocks}")
-print(f"Actual total blocks: {actual_grid_blocks}")
-
-if actual_grid_blocks == expected_grid_blocks:
-    print("✓ Grid conversion is correct!")
-else:
-    print("✗ Grid conversion failed!")
-    sys.exit(1)
-
-print("done!")
+assert actual_grid_blocks == expected_grid_blocks, (
+    f"Grid conversion failed: expected {expected_grid_blocks} total blocks, got {actual_grid_blocks}"
+)
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
index d31ab77208..b4ad2efd1b 100644
--- a/cuda_core/examples/vector_add.py
+++ b/cuda_core/examples/vector_add.py
@@ -29,7 +29,7 @@
 
 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
@@ -37,7 +37,7 @@
 mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
 # run in single precision
-ker = mod.get_kernel("vector_add<float>")
+kernel = mod.get_kernel("vector_add<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -47,7 +47,7 @@
 b = rng.random(size, dtype=dtype)
 c = cp.empty_like(a)
 
-# cupy runs on a different stream from s, so sync before accessing
+# cupy runs on a different stream from ours, so sync before accessing
 dev.sync()
 
 # prepare launch
@@ -55,10 +55,9 @@
 grid = (size + block - 1) // block
 config = LaunchConfig(grid=grid, block=block)
 
-# launch kernel on stream s
-launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-s.sync()
+# launch kernel on our stream
+launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+stream.sync()
 
 # check result
 assert cp.allclose(c, a + b)
-print("done!")