diff --git a/.github/workflows/build_test_tutorial.yml b/.github/workflows/build_test_tutorial.yml
new file mode 100644
index 0000000..5ecb18d
--- /dev/null
+++ b/.github/workflows/build_test_tutorial.yml
@@ -0,0 +1,99 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Copyright (c) 2024.
+
+name: "Test tutorial"
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}-test_tutorial
+  cancel-in-progress: true
+
+jobs:
+
+  test-tutorial:
+
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [
+          "ubuntu-22.04",
+          "ubuntu-22.04-arm",
+          "macos-14",
+          "windows-2022"
+        ]
+        python-version: [
+          "3.10", "3.11", "3.12",
+          "3.13", "3.14", "3.14t"
+        ]
+        include: [
+          {runs-on: "ubuntu-22.04", name: "ubuntu_x86_64", os: "ubuntu"},
+          {runs-on: "ubuntu-22.04-arm", name: "ubuntu_aarch64", os: "ubuntu"},
+          {runs-on: "windows-2022", name: "windows_amd64", os: "windows"},
+          {runs-on: "macos-14", name: "macos_arm64", os: "macos"},
+        ]
+        exclude:
+          # <frozen importlib._bootstrap>:491: Warning: Numpy built with MINGW-W64 on Windows 64 bits is experimental, and only available for testing. You are advised not to use it for production.
+          - runs-on: windows-2022
+            python-version: "3.14"
+
+          - runs-on: windows-2022
+            python-version: "3.14t"
+
+          - runs-on: macos-14
+            python-version: "3.10"
+
+          - runs-on: macos-14
+            python-version: "3.11"
+
+          - runs-on: macos-14
+            python-version: "3.12"
+
+          - runs-on: macos-14
+            python-version: "3.13"
+
+          - runs-on: macos-14
+            python-version: "3.14"
+
+    runs-on: ${{ matrix.runs-on }}
+
+    name: "Test tutorial ${{ matrix.name }} ${{ matrix.python-version }}"
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: "Check out repository"
+        uses: actions/checkout@v4.2.2
+        with:
+          submodules: false
+
+      - name: "Install Python"
+        uses: actions/setup-python@v6.0.0
+        with:
+          python-version: "${{ matrix.python-version }}"
+
+      - name: "Install requirements"
+        run: python -m pip install -r requirements.txt
+
+      - name: "Build tutorial"
+        run: |
+          # Configure with the prefix printed by mlir_wheel and the lit found on PATH; expansions are quoted so a path with spaces stays one argument.
+          cmake -B build -S "$PWD" -DCMAKE_PREFIX_PATH="$(python -m mlir_wheel --root-dir)" -DLLVM_EXTERNAL_LIT="$(which lit)"
+          cmake --build build --target tutorial-opt
+
+      - name: "Test tutorial"
+        run: |
+          # Reuses the build tree configured by the "Build tutorial" step.
+          cmake --build build --target check-tutorial
+
diff --git a/python/tiny.egg-info/PKG-INFO b/python/tiny.egg-info/PKG-INFO
deleted file mode 100644
index 5d44949..0000000
--- a/python/tiny.egg-info/PKG-INFO
+++ /dev/null
@@ -1,5 +0,0 @@
-Metadata-Version: 2.4
-Name: tiny
-Version: 0.1.0
-Summary: Python DSL for MLIR Tutorial dialects
-Requires-Python: >=3.10
diff --git a/python/tiny.egg-info/dependency_links.txt b/python/tiny.egg-info/dependency_links.txt
deleted file mode 100644
index 8b13789..0000000
--- a/python/tiny.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/python/tiny.egg-info/top_level.txt b/python/tiny.egg-info/top_level.txt
deleted file mode 100644
index 51c58a0..0000000
--- a/python/tiny.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-tiny
diff --git a/python/tiny/ch1/__init__.py b/python/tiny/ch1/__init__.py
index b39f285..0b28b8f 100644
--- a/python/tiny/ch1/__init__.py
+++ b/python/tiny/ch1/__init__.py
@@ -2,8 +2,3 @@
 
 from .dsl import Index, F16Vector, Ptr
 from .dsl import print_ir, compile_and_print
-
-__all__ = [
-    "Index", "F16Vector", "Ptr",
-    "print_ir", "compile_and_print",
-]
diff --git a/python/tiny/ch1/dsl.py b/python/tiny/ch1/dsl.py
index 8d2f863..c01e439 100644
--- a/python/tiny/ch1/dsl.py
+++ b/python/tiny/ch1/dsl.py
@@ -1,22 +1,22 @@
 """DSL wrapper types for Tiny dialect."""
 
 from mlir.ir import (
-    Value, Type, Operation,
-    IndexType, VectorType, F16Type,
-    IntegerAttr, DenseElementsAttr,
+    Value,
+    Type,
+    Operation,
+    IndexType,
+    VectorType,
+    F16Type,
+    IntegerAttr,
+    DenseElementsAttr,
+    register_value_caster,
 )
 import numpy as np
 
 
-class Index:
+@register_value_caster(IndexType.static_typeid)
+class Index(Value):
     """Wrapper for index-typed SSA values."""
-    _value: Value
-
-    @staticmethod
-    def _wrap(value: Value) -> "Index":
-        idx = Index()
-        idx._value = value
-        return idx
 
     @staticmethod
     def constant(val: int) -> "Index":
@@ -28,31 +28,34 @@ def constant(val: int) -> "Index":
             results=[idx_type],
             attributes={"value": attr},
         )
-        return Index._wrap(op.result)
+        return op.result
 
     def _binop(self, other: "Index", op_name: str) -> "Index":
         op = Operation.create(
             f"tiny.{op_name}",
             results=[IndexType.get()],
-            operands=[self._value, other._value],
+            operands=[self, other],
         )
-        return Index._wrap(op.result)
+        return op.result
 
-    def __add__(self, other): return self._binop(other, "addi")
-    def __sub__(self, other): return self._binop(other, "subi")
-    def __mul__(self, other): return self._binop(other, "muli")
-    def __floordiv__(self, other): return self._binop(other, "divi")
+    def __add__(self, other):
+        return self._binop(other, "addi")
 
+    def __sub__(self, other):
+        return self._binop(other, "subi")
 
-class F16Vector:
-    """Wrapper for vector<Nxf16> SSA values."""
-    _value: Value
+    def __mul__(self, other):
+        return self._binop(other, "muli")
 
-    @staticmethod
-    def _wrap(value: Value) -> "F16Vector":
-        vec = F16Vector()
-        vec._value = value
-        return vec
+    def __floordiv__(self, other):
+        return self._binop(other, "divi")
+
+    def __repr__(self):
+        return super().__repr__().replace("Value", "Index")  # base-class repr; repr(self) here would recurse forever
+
+
+class F16Vector(Value):
+    """Wrapper for vector<Nxf16> SSA values."""
 
     @staticmethod
     def constant(vals: list[float], size: int = None) -> "F16Vector":
@@ -66,20 +69,27 @@ def constant(vals: list[float], size: int = None) -> "F16Vector":
             results=[vec_type],
             attributes={"value": attr},
         )
-        return F16Vector._wrap(op.result)
+        return op.result
 
     def _binop(self, other: "F16Vector", op_name: str) -> "F16Vector":
         op = Operation.create(
             f"tiny.{op_name}",
-            results=[self._value.type],
-            operands=[self._value, other._value],
+            results=[self.type],
+            operands=[self, other],
         )
-        return F16Vector._wrap(op.result)
+        return op.result
 
-    def __add__(self, other): return self._binop(other, "addf")
-    def __sub__(self, other): return self._binop(other, "subf")
-    def __mul__(self, other): return self._binop(other, "mulf")
-    def __truediv__(self, other): return self._binop(other, "divf")
+    def __add__(self, other):
+        return self._binop(other, "addf")
+
+    def __sub__(self, other):
+        return self._binop(other, "subf")
+
+    def __mul__(self, other):
+        return self._binop(other, "mulf")
+
+    def __truediv__(self, other):
+        return self._binop(other, "divf")
 
     def sum(self) -> "F16Vector":
         """Reduce to vector<1xf16> via tiny.sum."""
@@ -87,13 +97,24 @@ def sum(self) -> "F16Vector":
         op = Operation.create(
             "tiny.sum",
             results=[result_type],
-            operands=[self._value],
+            operands=[self],
         )
-        return F16Vector._wrap(op.result)
+        return op.result
+
+    def __repr__(self):
+        return super().__repr__().replace("Value", "F16Vector")  # base-class repr; repr(self) here would recurse forever
+
+
+@register_value_caster(VectorType.static_typeid)
+def maybe_wrap_vector(val: Value):
+    if isinstance(val.type.element_type, F16Type):
+        return F16Vector(val)
+    return val
 
 
 class Ptr:
     """Wrapper for !tiny.ptr SSA values."""
+
     _value: Value
 
     @staticmethod
@@ -113,16 +134,16 @@ def load(self, offset: Index, num_elements: int) -> F16Vector:
         op = Operation.create(
             "tiny.load",
             results=[vec_type],
-            operands=[self._value, offset._value],
+            operands=[self._value, offset],
         )
-        return F16Vector._wrap(op.result)
+        return op.result
 
     def store(self, offset: Index, vec: F16Vector) -> None:
         """Store vector<Nxf16> to pointer at offset."""
         Operation.create(
             "tiny.store",
             results=[],
-            operands=[vec._value, self._value, offset._value],
+            operands=[vec, self._value, offset],
         )
 
 
@@ -130,6 +151,7 @@ def store(self, offset: Index, vec: F16Vector) -> None:
 
 from ..compiler import MLIRModule, TutorialOpt
 
+
 def _get_type_map():
     """Type map for ch1 tiny dialect."""
     return {
@@ -137,6 +159,7 @@ def _get_type_map():
         Index: (IndexType.get, Index),
     }
 
+
 def print_ir(fn):
     """Print generated Tiny dialect IR (verified and pretty-printed)."""
     opt = TutorialOpt()
@@ -145,6 +168,7 @@ def print_ir(fn):
         print(tiny_ir)
     return fn
 
+
 def compile_and_print(fn):
     """Compile and print all lowering stages."""
     opt = TutorialOpt()
@@ -159,7 +183,10 @@ def compile_and_print(fn):
     print("=== After tiny-to-arith ===")
     print(arith_ir)
 
-    llvm_ir = opt.run(tiny_ir, ["tiny-to-arith", "canonicalize", "cse", "tiny-to-llvm", "convert-to-llvm"])
+    llvm_ir = opt.run(
+        tiny_ir,
+        ["tiny-to-arith", "canonicalize", "cse", "tiny-to-llvm", "convert-to-llvm"],
+    )
     print("=== LLVM Dialect ===")
     print(llvm_ir)
 
diff --git a/python/tiny/ch2/__init__.py b/python/tiny/ch2/__init__.py
index bed5466..7b633a1 100644
--- a/python/tiny/ch2/__init__.py
+++ b/python/tiny/ch2/__init__.py
@@ -2,9 +2,3 @@
 
 from ..ch1 import Index, F16Vector, Ptr
 from .dsl import accumulate, print_ir, compile_and_print
-
-__all__ = [
-    "Index", "F16Vector", "Ptr",
-    "accumulate",
-    "print_ir", "compile_and_print",
-]
diff --git a/python/tiny/ch2/dsl.py b/python/tiny/ch2/dsl.py
index f8364da..8da4826 100644
--- a/python/tiny/ch2/dsl.py
+++ b/python/tiny/ch2/dsl.py
@@ -1,7 +1,10 @@
 """DSL for TinyLoop dialect (Chapter 2)."""
 
 from mlir.ir import (
-    Value, Block, Operation, InsertionPoint,
+    Value,
+    Block,
+    Operation,
+    InsertionPoint,
     IndexType,
 )
 
@@ -13,6 +16,9 @@
 def _wrap_value(value: Value, template):
     """Wrap MLIR value using the same type as template."""
     # Try passing template for types that need it (e.g., Tile)
+    if isinstance(value, (F16Vector, Index)):
+        return value
+
     try:
         return type(template)._wrap(value, template=template)
     except TypeError:
@@ -39,15 +45,20 @@ def loop_body(i: Index, acc: F16Vector):
 
     def decorator(body_fn):
         # Get init value types and MLIR values
-        init_values = [v._value for v in inits]
-        init_types = [v._value.type for v in inits]
+        init_values = [
+            v if isinstance(v, (Index, F16Vector)) else v._value for v in inits
+        ]
+        init_types = [
+            v.type if isinstance(v, (Index, F16Vector)) else v._value.type
+            for v in inits
+        ]
         result_types = init_types  # Results match init types
 
         # Create the accumulate op with one region
         op = Operation.create(
             "tiny_loop.accumulate",
             results=result_types,
-            operands=[bound._value, step._value] + init_values,
+            operands=[bound, step] + init_values,
             regions=1,  # One region for the body
         )
 
@@ -58,16 +69,20 @@ def decorator(body_fn):
 
         # Execute body with wrapped arguments
         with InsertionPoint(block):
-            iv = Index._wrap(block.arguments[0])
-            iter_args = [_wrap_value(block.arguments[i+1], inits[i])
-                         for i in range(len(inits))]
+            iv = block.arguments[0]
+            iter_args = [
+                _wrap_value(block.arguments[i + 1], inits[i]) for i in range(len(inits))
+            ]
 
             # Call user's body function
             if inits:
                 results = body_fn(iv, *iter_args)
                 if not isinstance(results, (list, tuple)):
                     results = [results]
-                yield_values = [r._value for r in results]
+                yield_values = [
+                    r if isinstance(r, (Index, F16Vector)) else r._value
+                    for r in results
+                ]
             else:
                 body_fn(iv)
                 yield_values = []
@@ -77,8 +92,9 @@ def decorator(body_fn):
 
         # Wrap and return results
         if result_types:
-            return [_wrap_value(op.results[i], inits[i])
-                    for i in range(len(result_types))]
+            return [
+                _wrap_value(op.results[i], inits[i]) for i in range(len(result_types))
+            ]
         return None
 
     return decorator
@@ -86,6 +102,7 @@ def decorator(body_fn):
 
 # --- Convenience functions ---
 
+
 def _get_type_map():
     """Type map for ch2 (same as ch1)."""
     return {
@@ -124,7 +141,9 @@ def compile_and_print(fn):
     print(arith_ir)
 
     # Lower to LLVM
-    llvm_ir = opt.run(arith_ir, ["tiny-to-llvm", "convert-scf-to-cf", "convert-to-llvm"])
+    llvm_ir = opt.run(
+        arith_ir, ["tiny-to-llvm", "convert-scf-to-cf", "convert-to-llvm"]
+    )
     print("=== LLVM Dialect ===")
     print(llvm_ir)
 
diff --git a/python/tiny/ch3/__init__.py b/python/tiny/ch3/__init__.py
index dfc4e09..be7d2e5 100644
--- a/python/tiny/ch3/__init__.py
+++ b/python/tiny/ch3/__init__.py
@@ -2,15 +2,13 @@
 
 from ..ch1 import Index, Ptr
 from .dsl import (
-    Layout, Tile, load_tile, store_tile,
-    block_id_x, block_id_y, block_id_z,
-    print_ir, compile_and_print,
+    Layout,
+    Tile,
+    load_tile,
+    store_tile,
+    block_id_x,
+    block_id_y,
+    block_id_z,
+    print_ir,
+    compile_and_print,
 )
-
-__all__ = [
-    "Index", "Ptr",
-    "Layout", "Tile",
-    "load_tile", "store_tile",
-    "block_id_x", "block_id_y", "block_id_z",
-    "print_ir", "compile_and_print",
-]
diff --git a/python/tiny/ch3/dsl.py b/python/tiny/ch3/dsl.py
index f2d47ec..9bec316 100644
--- a/python/tiny/ch3/dsl.py
+++ b/python/tiny/ch3/dsl.py
@@ -1,6 +1,16 @@
 """DSL for TinyTile dialect (Chapter 3)."""
 
-from mlir.ir import Value, Type, Operation, Attribute, IndexType, F16Type, F64Type, FloatAttr, VectorType
+from mlir.ir import (
+    Value,
+    Type,
+    Operation,
+    Attribute,
+    IndexType,
+    F16Type,
+    F64Type,
+    FloatAttr,
+    VectorType,
+)
 from ..ch1.dsl import Index, Ptr, F16Vector
 from ..compiler import MLIRModule, TutorialOpt
 
@@ -18,13 +28,21 @@ def num_elements(self) -> int:
 
 class Tile:
     """Wrapper for !tiny_tile.tile SSA values."""
+
     _value: Value
     _height: int
     _width: int
     _layout: Layout
 
     @staticmethod
-    def _wrap(value: Value, height: int = None, width: int = None, layout: Layout = None, *, template: "Tile" = None) -> "Tile":
+    def _wrap(
+        value: Value,
+        height: int = None,
+        width: int = None,
+        layout: Layout = None,
+        *,
+        template: "Tile" = None,
+    ) -> "Tile":
         """Wrap an MLIR value as a Tile.
 
         Can be called with explicit dimensions or with a template tile to copy from.
@@ -50,8 +68,10 @@ def _get_type(height: int, width: int, layout: Layout) -> Type:
         )
 
     def _binop(self, other: "Tile", kind: str) -> "Tile":
-        if self._layout.thread != other._layout.thread or \
-           self._layout.vector_size != other._layout.vector_size:
+        if (
+            self._layout.thread != other._layout.thread
+            or self._layout.vector_size != other._layout.vector_size
+        ):
             raise ValueError("Operands must have the same layout")
         result_type = Tile._get_type(self._height, self._width, self._layout)
         kind_attr = Attribute.parse(f"#tiny_tile<ew_kind {kind}>")
@@ -59,14 +79,21 @@ def _binop(self, other: "Tile", kind: str) -> "Tile":
             "tiny_tile.elementwise",
             results=[result_type],
             operands=[self._value, other._value],
-            attributes={"kind": kind_attr}
+            attributes={"kind": kind_attr},
         )
         return Tile._wrap(op.result, self._height, self._width, self._layout)
 
-    def __add__(self, other): return self._binop(other, "add")
-    def __sub__(self, other): return self._binop(other, "sub")
-    def __mul__(self, other): return self._binop(other, "mul")
-    def __truediv__(self, other): return self._binop(other, "div")
+    def __add__(self, other):
+        return self._binop(other, "add")
+
+    def __sub__(self, other):
+        return self._binop(other, "sub")
+
+    def __mul__(self, other):
+        return self._binop(other, "mul")
+
+    def __truediv__(self, other):
+        return self._binop(other, "div")
 
     @staticmethod
     def splat(value: float, height: int, width: int, layout: Layout) -> "Tile":
@@ -92,11 +119,18 @@ def sum(self) -> F16Vector:
             results=[VectorType.get([1], F16Type.get())],
             operands=[self._value],
         )
-        return F16Vector._wrap(op.result)
-
-
-def load_tile(ptr: Ptr, row: Index, col: Index, stride: Index,
-              height: int, width: int, layout: Layout) -> Tile:
+        return op.result
+
+
+def load_tile(
+    ptr: Ptr,
+    row: Index,
+    col: Index,
+    stride: Index,
+    height: int,
+    width: int,
+    layout: Layout,
+) -> Tile:
     """Load a tile from memory (tiny_tile.load).
 
     The layout specifies how elements are distributed across threads.
@@ -105,7 +139,7 @@ def load_tile(ptr: Ptr, row: Index, col: Index, stride: Index,
     op = Operation.create(
         "tiny_tile.load",
         results=[tile_type],
-        operands=[ptr._value, row._value, col._value, stride._value],
+        operands=[ptr._value, row, col, stride],
     )
     return Tile._wrap(op.result, height, width, layout)
 
@@ -120,6 +154,7 @@ def store_tile(tile: Tile, ptr: Ptr, row: Index, col: Index, stride: Index):
 
 # --- GPU block/thread ID functions ---
 
+
 def block_id_x() -> Index:
     """Get the block ID in the x dimension (gpu.block_id x)."""
     op = Operation.create(
@@ -127,7 +162,7 @@ def block_id_x() -> Index:
         results=[IndexType.get()],
         attributes={"dimension": Attribute.parse("#gpu<dim x>")},
     )
-    return Index._wrap(op.result)
+    return op.result
 
 
 def block_id_y() -> Index:
@@ -137,7 +172,7 @@ def block_id_y() -> Index:
         results=[IndexType.get()],
         attributes={"dimension": Attribute.parse("#gpu<dim y>")},
     )
-    return Index._wrap(op.result)
+    return op.result
 
 
 def block_id_z() -> Index:
@@ -147,11 +182,12 @@ def block_id_z() -> Index:
         results=[IndexType.get()],
         attributes={"dimension": Attribute.parse("#gpu<dim z>")},
     )
-    return Index._wrap(op.result)
+    return op.result
 
 
 # --- Convenience functions ---
 
+
 def _get_type_map():
     """Type map for ch3."""
     return {
diff --git a/python/tiny/compiler.py b/python/tiny/compiler.py
index 79d4fb2..519f4d4 100644
--- a/python/tiny/compiler.py
+++ b/python/tiny/compiler.py
@@ -1,6 +1,6 @@
 """Common compiler infrastructure for the tutorial DSL."""
 
-from mlir.ir import Context, Location, Module, InsertionPoint, IndexType, Type
+from mlir.ir import Context, Location, Module, InsertionPoint
 import mlir.dialects.func as func_d
 import subprocess
 import os
@@ -24,7 +24,7 @@ class TutorialOpt:
     """Wrapper around tutorial-opt for running passes."""
 
     def __init__(self, binary_path: str = None):
-        self.binary = binary_path or _get_tutorial_opt()
+        self.binary = (binary_path or _get_tutorial_opt()).strip()
 
     def run(self, ir: str, passes: list[str]) -> str:
         """Run passes on IR string, return transformed IR."""
@@ -62,6 +62,9 @@ def build_func(self, fn: Callable, type_map: dict) -> Module:
             fn: Function to compile (uses type annotations for args)
             type_map: Maps annotation types to (mlir_type, wrapper_class) tuples
         """
+        # prevent circular import
+        from tiny.ch1 import Ptr
+
         sig = inspect.signature(fn)
         self.module = Module.create()
 
@@ -82,7 +85,10 @@ def build_func(self, fn: Callable, type_map: dict) -> Module:
                 args = []
                 for i, param in enumerate(sig.parameters.values()):
                     _, wrapper_cls = type_map[param.annotation]
-                    args.append(wrapper_cls._wrap(func_op.arguments[i]))
+                    if wrapper_cls is Ptr:
+                        args.append(wrapper_cls._wrap(func_op.arguments[i]))
+                    else:
+                        args.append(func_op.arguments[i])
 
                 # Execute user function body
                 fn(*args)
@@ -92,7 +98,9 @@ def build_func(self, fn: Callable, type_map: dict) -> Module:
 
         return self.module
 
-    def build_func_verified(self, fn: Callable, type_map: dict, opt: "TutorialOpt") -> str:
+    def build_func_verified(
+        self, fn: Callable, type_map: dict, opt: "TutorialOpt"
+    ) -> str:
         """Build and verify module, return pretty-printed IR.
 
         Runs the generated IR through tutorial-opt to verify it's valid
diff --git a/tutorial/ch1-cpu-vector-dsl/square.py b/tutorial/ch1-cpu-vector-dsl/square.py
index 7b49d90..1b15f6d 100644
--- a/tutorial/ch1-cpu-vector-dsl/square.py
+++ b/tutorial/ch1-cpu-vector-dsl/square.py
@@ -2,6 +2,7 @@
 
 from tiny.ch1 import Ptr, Index, compile_and_print
 
+
 @compile_and_print
 def square(a: Ptr, result: Ptr, offset: Index):
     """Square a vector of 16 f16 elements."""
diff --git a/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py b/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py
index 1083e9c..396ad30 100644
--- a/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py
+++ b/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py
@@ -2,8 +2,13 @@
 
 from tiny.ch2 import accumulate
 from tiny.ch3 import (
-    Ptr, Index, Layout, Tile, load_tile,
-    block_id_x, compile_and_print,
+    Ptr,
+    Index,
+    Layout,
+    Tile,
+    load_tile,
+    block_id_x,
+    compile_and_print,
 )
 
 # 1x256 tile = 256 elements
@@ -13,7 +18,7 @@
 
 
 @compile_and_print
-def dot(a: Ptr, b: Ptr, out: Ptr, M: Index, K: Index):
+def dot(a: Ptr, b: Ptr, out: Ptr, K: Index):
     """
     Given two arrays of shape MxK, we perform a dot product over them.
 
@@ -35,7 +40,7 @@ def dot(a: Ptr, b: Ptr, out: Ptr, M: Index, K: Index):
     # once.
     @accumulate(K, tile_w, inits=[acc_init])
     def _(tile_idx: Index, acc: Tile):
-        # Load a[bid,tile_idx : tile_idx + tile_w]
+        # Load a[bid, tile_idx : tile_idx + tile_w]
         a_tile = load_tile(a, bid, tile_idx, K, TILE_H, TILE_W, LAYOUT)
         b_tile = load_tile(b, bid, tile_idx, K, TILE_H, TILE_W, LAYOUT)