diff --git a/.github/workflows/build_test_tutorial.yml b/.github/workflows/build_test_tutorial.yml new file mode 100644 index 0000000..5ecb18d --- /dev/null +++ b/.github/workflows/build_test_tutorial.yml @@ -0,0 +1,99 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Copyright (c) 2024. + +name: "Test tutorial" + +on: + workflow_dispatch: + pull_request: + branches: + - main + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.number || github.sha }}-test_tutorial + cancel-in-progress: true + +jobs: + + test-tutorial: + + strategy: + fail-fast: false + matrix: + runs-on: [ + "ubuntu-22.04", + "ubuntu-22.04-arm", + "macos-14", + "windows-2022" + ] + python-version: [ + "3.10", "3.11", "3.12", + "3.13", "3.14", "3.14t" + ] + include: [ + {runs-on: "ubuntu-22.04", name: "ubuntu_x86_64", os: "ubuntu"}, + {runs-on: "ubuntu-22.04-arm", name: "ubuntu_aarch64", os: "ubuntu"}, + {runs-on: "windows-2022", name: "windows_amd64", os: "windows"}, + {runs-on: "macos-14", name: "macos_arm64", os: "macos"}, + ] + exclude: + # :491: Warning: Numpy built with MINGW-W64 on Windows 64 bits is experimental, and only available for testing. You are advised not to use it for production. 
+ - runs-on: windows-2022 + python-version: "3.14" + + - runs-on: windows-2022 + python-version: "3.14t" + + - runs-on: macos-14 + python-version: "3.10" + + - runs-on: macos-14 + python-version: "3.11" + + - runs-on: macos-14 + python-version: "3.12" + + - runs-on: macos-14 + python-version: "3.13" + + - runs-on: macos-14 + python-version: "3.14" + + runs-on: ${{ matrix.runs-on }} + + name: "Test tutorial ${{ matrix.name }} ${{ matrix.python-version }}" + + defaults: + run: + shell: bash + + steps: + - name: "Check out repository" + uses: actions/checkout@v4.2.2 + with: + submodules: false + + - name: "Install Python" + uses: actions/setup-python@v6.0.0 + with: + python-version: "${{ matrix.python-version }}" + + - name: "Install requirements" + run: python -m pip install -r requirements.txt + + - name: "Build tutorial" + run: | + + cmake -B build -S $PWD -DCMAKE_PREFIX_PATH=$(python -m mlir_wheel --root-dir) -DLLVM_EXTERNAL_LIT=$(which lit) + cmake --build build --target tutorial-opt + + - name: "Test tutorial" + run: | + + cmake --build build --target check-tutorial + diff --git a/python/tiny.egg-info/PKG-INFO b/python/tiny.egg-info/PKG-INFO deleted file mode 100644 index 5d44949..0000000 --- a/python/tiny.egg-info/PKG-INFO +++ /dev/null @@ -1,5 +0,0 @@ -Metadata-Version: 2.4 -Name: tiny -Version: 0.1.0 -Summary: Python DSL for MLIR Tutorial dialects -Requires-Python: >=3.10 diff --git a/python/tiny.egg-info/dependency_links.txt b/python/tiny.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/python/tiny.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/python/tiny.egg-info/top_level.txt b/python/tiny.egg-info/top_level.txt deleted file mode 100644 index 51c58a0..0000000 --- a/python/tiny.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -tiny diff --git a/python/tiny/ch1/__init__.py b/python/tiny/ch1/__init__.py index b39f285..0b28b8f 100644 --- a/python/tiny/ch1/__init__.py +++ 
b/python/tiny/ch1/__init__.py @@ -2,8 +2,3 @@ from .dsl import Index, F16Vector, Ptr from .dsl import print_ir, compile_and_print - -__all__ = [ - "Index", "F16Vector", "Ptr", - "print_ir", "compile_and_print", -] diff --git a/python/tiny/ch1/dsl.py b/python/tiny/ch1/dsl.py index 8d2f863..c01e439 100644 --- a/python/tiny/ch1/dsl.py +++ b/python/tiny/ch1/dsl.py @@ -1,22 +1,22 @@ """DSL wrapper types for Tiny dialect.""" from mlir.ir import ( - Value, Type, Operation, - IndexType, VectorType, F16Type, - IntegerAttr, DenseElementsAttr, + Value, + Type, + Operation, + IndexType, + VectorType, + F16Type, + IntegerAttr, + DenseElementsAttr, + register_value_caster, ) import numpy as np -class Index: +@register_value_caster(IndexType.static_typeid) +class Index(Value): """Wrapper for index-typed SSA values.""" - _value: Value - - @staticmethod - def _wrap(value: Value) -> "Index": - idx = Index() - idx._value = value - return idx @staticmethod def constant(val: int) -> "Index": @@ -28,31 +28,34 @@ def constant(val: int) -> "Index": results=[idx_type], attributes={"value": attr}, ) - return Index._wrap(op.result) + return op.result def _binop(self, other: "Index", op_name: str) -> "Index": op = Operation.create( f"tiny.{op_name}", results=[IndexType.get()], - operands=[self._value, other._value], + operands=[self, other], ) - return Index._wrap(op.result) + return op.result - def __add__(self, other): return self._binop(other, "addi") - def __sub__(self, other): return self._binop(other, "subi") - def __mul__(self, other): return self._binop(other, "muli") - def __floordiv__(self, other): return self._binop(other, "divi") + def __add__(self, other): + return self._binop(other, "addi") + def __sub__(self, other): + return self._binop(other, "subi") -class F16Vector: - """Wrapper for vector SSA values.""" - _value: Value + def __mul__(self, other): + return self._binop(other, "muli") - @staticmethod - def _wrap(value: Value) -> "F16Vector": - vec = F16Vector() - vec._value = 
value - return vec + def __floordiv__(self, other): + return self._binop(other, "divi") + + def __repr__(self): + return super().__repr__().replace("Value", "Index") + + +class F16Vector(Value): + """Wrapper for vector SSA values.""" @staticmethod def constant(vals: list[float], size: int = None) -> "F16Vector": @@ -66,20 +69,27 @@ def constant(vals: list[float], size: int = None) -> "F16Vector": results=[vec_type], attributes={"value": attr}, ) - return F16Vector._wrap(op.result) + return op.result def _binop(self, other: "F16Vector", op_name: str) -> "F16Vector": op = Operation.create( f"tiny.{op_name}", - results=[self._value.type], - operands=[self._value, other._value], + results=[self.type], + operands=[self, other], ) - return F16Vector._wrap(op.result) + return op.result - def __add__(self, other): return self._binop(other, "addf") - def __sub__(self, other): return self._binop(other, "subf") - def __mul__(self, other): return self._binop(other, "mulf") - def __truediv__(self, other): return self._binop(other, "divf") + def __add__(self, other): + return self._binop(other, "addf") + + def __sub__(self, other): + return self._binop(other, "subf") + + def __mul__(self, other): + return self._binop(other, "mulf") + + def __truediv__(self, other): + return self._binop(other, "divf") def sum(self) -> "F16Vector": """Reduce to vector<1xf16> via tiny.sum.""" @@ -87,13 +97,24 @@ def sum(self) -> "F16Vector": op = Operation.create( "tiny.sum", results=[result_type], - operands=[self._value], + operands=[self], ) - return F16Vector._wrap(op.result) + return op.result + + def __repr__(self): + return super().__repr__().replace("Value", "F16Vector") + + +@register_value_caster(VectorType.static_typeid) +def maybe_wrap_vector(val: Value): + if isinstance(val.type.element_type, F16Type): + return F16Vector(val) + return val class Ptr: """Wrapper for !tiny.ptr SSA values.""" + _value: Value @staticmethod @@ -113,16 +134,16 @@ def load(self, offset: Index, num_elements: int) -> F16Vector: 
op = Operation.create( "tiny.load", results=[vec_type], - operands=[self._value, offset._value], + operands=[self._value, offset], ) - return F16Vector._wrap(op.result) + return op.result def store(self, offset: Index, vec: F16Vector) -> None: """Store vector to pointer at offset.""" Operation.create( "tiny.store", results=[], - operands=[vec._value, self._value, offset._value], + operands=[vec, self._value, offset], ) @@ -130,6 +151,7 @@ def store(self, offset: Index, vec: F16Vector) -> None: from ..compiler import MLIRModule, TutorialOpt + def _get_type_map(): """Type map for ch1 tiny dialect.""" return { @@ -137,6 +159,7 @@ def _get_type_map(): Index: (IndexType.get, Index), } + def print_ir(fn): """Print generated Tiny dialect IR (verified and pretty-printed).""" opt = TutorialOpt() @@ -145,6 +168,7 @@ def print_ir(fn): print(tiny_ir) return fn + def compile_and_print(fn): """Compile and print all lowering stages.""" opt = TutorialOpt() @@ -159,7 +183,10 @@ def compile_and_print(fn): print("=== After tiny-to-arith ===") print(arith_ir) - llvm_ir = opt.run(tiny_ir, ["tiny-to-arith", "canonicalize", "cse", "tiny-to-llvm", "convert-to-llvm"]) + llvm_ir = opt.run( + tiny_ir, + ["tiny-to-arith", "canonicalize", "cse", "tiny-to-llvm", "convert-to-llvm"], + ) print("=== LLVM Dialect ===") print(llvm_ir) diff --git a/python/tiny/ch2/__init__.py b/python/tiny/ch2/__init__.py index bed5466..7b633a1 100644 --- a/python/tiny/ch2/__init__.py +++ b/python/tiny/ch2/__init__.py @@ -2,9 +2,3 @@ from ..ch1 import Index, F16Vector, Ptr from .dsl import accumulate, print_ir, compile_and_print - -__all__ = [ - "Index", "F16Vector", "Ptr", - "accumulate", - "print_ir", "compile_and_print", -] diff --git a/python/tiny/ch2/dsl.py b/python/tiny/ch2/dsl.py index f8364da..8da4826 100644 --- a/python/tiny/ch2/dsl.py +++ b/python/tiny/ch2/dsl.py @@ -1,7 +1,10 @@ """DSL for TinyLoop dialect (Chapter 2).""" from mlir.ir import ( - Value, Block, Operation, InsertionPoint, + Value, + Block, + 
Operation, + InsertionPoint, IndexType, ) @@ -13,6 +16,9 @@ def _wrap_value(value: Value, template): """Wrap MLIR value using the same type as template.""" # Try passing template for types that need it (e.g., Tile) + if isinstance(value, (F16Vector, Index)): + return value + try: return type(template)._wrap(value, template=template) except TypeError: @@ -39,15 +45,20 @@ def loop_body(i: Index, acc: F16Vector): def decorator(body_fn): # Get init value types and MLIR values - init_values = [v._value for v in inits] - init_types = [v._value.type for v in inits] + init_values = [ + v if isinstance(v, (Index, F16Vector)) else v._value for v in inits + ] + init_types = [ + v.type if isinstance(v, (Index, F16Vector)) else v._value.type + for v in inits + ] result_types = init_types # Results match init types # Create the accumulate op with one region op = Operation.create( "tiny_loop.accumulate", results=result_types, - operands=[bound._value, step._value] + init_values, + operands=[bound, step] + init_values, regions=1, # One region for the body ) @@ -58,16 +69,20 @@ def decorator(body_fn): # Execute body with wrapped arguments with InsertionPoint(block): - iv = Index._wrap(block.arguments[0]) - iter_args = [_wrap_value(block.arguments[i+1], inits[i]) - for i in range(len(inits))] + iv = block.arguments[0] + iter_args = [ + _wrap_value(block.arguments[i + 1], inits[i]) for i in range(len(inits)) + ] # Call user's body function if inits: results = body_fn(iv, *iter_args) if not isinstance(results, (list, tuple)): results = [results] - yield_values = [r._value for r in results] + yield_values = [ + r if isinstance(r, (Index, F16Vector)) else r._value + for r in results + ] else: body_fn(iv) yield_values = [] @@ -77,8 +92,9 @@ def decorator(body_fn): # Wrap and return results if result_types: - return [_wrap_value(op.results[i], inits[i]) - for i in range(len(result_types))] + return [ + _wrap_value(op.results[i], inits[i]) for i in range(len(result_types)) + ] return None 
return decorator @@ -86,6 +102,7 @@ def decorator(body_fn): # --- Convenience functions --- + def _get_type_map(): """Type map for ch2 (same as ch1).""" return { @@ -124,7 +141,9 @@ def compile_and_print(fn): print(arith_ir) # Lower to LLVM - llvm_ir = opt.run(arith_ir, ["tiny-to-llvm", "convert-scf-to-cf", "convert-to-llvm"]) + llvm_ir = opt.run( + arith_ir, ["tiny-to-llvm", "convert-scf-to-cf", "convert-to-llvm"] + ) print("=== LLVM Dialect ===") print(llvm_ir) diff --git a/python/tiny/ch3/__init__.py b/python/tiny/ch3/__init__.py index dfc4e09..be7d2e5 100644 --- a/python/tiny/ch3/__init__.py +++ b/python/tiny/ch3/__init__.py @@ -2,15 +2,13 @@ from ..ch1 import Index, Ptr from .dsl import ( - Layout, Tile, load_tile, store_tile, - block_id_x, block_id_y, block_id_z, - print_ir, compile_and_print, + Layout, + Tile, + load_tile, + store_tile, + block_id_x, + block_id_y, + block_id_z, + print_ir, + compile_and_print, ) - -__all__ = [ - "Index", "Ptr", - "Layout", "Tile", - "load_tile", "store_tile", - "block_id_x", "block_id_y", "block_id_z", - "print_ir", "compile_and_print", -] diff --git a/python/tiny/ch3/dsl.py b/python/tiny/ch3/dsl.py index f2d47ec..9bec316 100644 --- a/python/tiny/ch3/dsl.py +++ b/python/tiny/ch3/dsl.py @@ -1,6 +1,16 @@ """DSL for TinyTile dialect (Chapter 3).""" -from mlir.ir import Value, Type, Operation, Attribute, IndexType, F16Type, F64Type, FloatAttr, VectorType +from mlir.ir import ( + Value, + Type, + Operation, + Attribute, + IndexType, + F16Type, + F64Type, + FloatAttr, + VectorType, +) from ..ch1.dsl import Index, Ptr, F16Vector from ..compiler import MLIRModule, TutorialOpt @@ -18,13 +28,21 @@ def num_elements(self) -> int: class Tile: """Wrapper for !tiny_tile.tile SSA values.""" + _value: Value _height: int _width: int _layout: Layout @staticmethod - def _wrap(value: Value, height: int = None, width: int = None, layout: Layout = None, *, template: "Tile" = None) -> "Tile": + def _wrap( + value: Value, + height: int = None, + 
width: int = None, + layout: Layout = None, + *, + template: "Tile" = None, + ) -> "Tile": """Wrap an MLIR value as a Tile. Can be called with explicit dimensions or with a template tile to copy from. @@ -50,8 +68,10 @@ def _get_type(height: int, width: int, layout: Layout) -> Type: ) def _binop(self, other: "Tile", kind: str) -> "Tile": - if self._layout.thread != other._layout.thread or \ - self._layout.vector_size != other._layout.vector_size: + if ( + self._layout.thread != other._layout.thread + or self._layout.vector_size != other._layout.vector_size + ): raise ValueError("Operands must have the same layout") result_type = Tile._get_type(self._height, self._width, self._layout) kind_attr = Attribute.parse(f"#tiny_tile") @@ -59,14 +79,21 @@ def _binop(self, other: "Tile", kind: str) -> "Tile": "tiny_tile.elementwise", results=[result_type], operands=[self._value, other._value], - attributes={"kind": kind_attr} + attributes={"kind": kind_attr}, ) return Tile._wrap(op.result, self._height, self._width, self._layout) - def __add__(self, other): return self._binop(other, "add") - def __sub__(self, other): return self._binop(other, "sub") - def __mul__(self, other): return self._binop(other, "mul") - def __truediv__(self, other): return self._binop(other, "div") + def __add__(self, other): + return self._binop(other, "add") + + def __sub__(self, other): + return self._binop(other, "sub") + + def __mul__(self, other): + return self._binop(other, "mul") + + def __truediv__(self, other): + return self._binop(other, "div") @staticmethod def splat(value: float, height: int, width: int, layout: Layout) -> "Tile": @@ -92,11 +119,18 @@ def sum(self) -> F16Vector: results=[VectorType.get([1], F16Type.get())], operands=[self._value], ) - return F16Vector._wrap(op.result) - - -def load_tile(ptr: Ptr, row: Index, col: Index, stride: Index, - height: int, width: int, layout: Layout) -> Tile: + return op.result + + +def load_tile( + ptr: Ptr, + row: Index, + col: Index, + 
stride: Index, + height: int, + width: int, + layout: Layout, +) -> Tile: """Load a tile from memory (tiny_tile.load). The layout specifies how elements are distributed across threads. @@ -105,7 +139,7 @@ def load_tile(ptr: Ptr, row: Index, col: Index, stride: Index, op = Operation.create( "tiny_tile.load", results=[tile_type], - operands=[ptr._value, row._value, col._value, stride._value], + operands=[ptr._value, row, col, stride], ) return Tile._wrap(op.result, height, width, layout) @@ -120,6 +154,7 @@ def store_tile(tile: Tile, ptr: Ptr, row: Index, col: Index, stride: Index): # --- GPU block/thread ID functions --- + def block_id_x() -> Index: """Get the block ID in the x dimension (gpu.block_id x).""" op = Operation.create( @@ -127,7 +162,7 @@ def block_id_x() -> Index: results=[IndexType.get()], attributes={"dimension": Attribute.parse("#gpu")}, ) - return Index._wrap(op.result) + return op.result def block_id_y() -> Index: @@ -137,7 +172,7 @@ def block_id_y() -> Index: results=[IndexType.get()], attributes={"dimension": Attribute.parse("#gpu")}, ) - return Index._wrap(op.result) + return op.result def block_id_z() -> Index: @@ -147,11 +182,12 @@ def block_id_z() -> Index: results=[IndexType.get()], attributes={"dimension": Attribute.parse("#gpu")}, ) - return Index._wrap(op.result) + return op.result # --- Convenience functions --- + def _get_type_map(): """Type map for ch3.""" return { diff --git a/python/tiny/compiler.py b/python/tiny/compiler.py index 79d4fb2..519f4d4 100644 --- a/python/tiny/compiler.py +++ b/python/tiny/compiler.py @@ -1,6 +1,6 @@ """Common compiler infrastructure for the tutorial DSL.""" -from mlir.ir import Context, Location, Module, InsertionPoint, IndexType, Type +from mlir.ir import Context, Location, Module, InsertionPoint import mlir.dialects.func as func_d import subprocess import os @@ -24,7 +24,7 @@ class TutorialOpt: """Wrapper around tutorial-opt for running passes.""" def __init__(self, binary_path: str = None): - 
self.binary = binary_path or _get_tutorial_opt() + self.binary = (binary_path or _get_tutorial_opt()).strip() def run(self, ir: str, passes: list[str]) -> str: """Run passes on IR string, return transformed IR.""" @@ -62,6 +62,9 @@ def build_func(self, fn: Callable, type_map: dict) -> Module: fn: Function to compile (uses type annotations for args) type_map: Maps annotation types to (mlir_type, wrapper_class) tuples """ + # prevent circular import + from tiny.ch1 import Ptr + sig = inspect.signature(fn) self.module = Module.create() @@ -82,7 +85,10 @@ def build_func(self, fn: Callable, type_map: dict) -> Module: args = [] for i, param in enumerate(sig.parameters.values()): _, wrapper_cls = type_map[param.annotation] - args.append(wrapper_cls._wrap(func_op.arguments[i])) + if wrapper_cls is Ptr: + args.append(wrapper_cls._wrap(func_op.arguments[i])) + else: + args.append(func_op.arguments[i]) # Execute user function body fn(*args) @@ -92,7 +98,9 @@ def build_func(self, fn: Callable, type_map: dict) -> Module: return self.module - def build_func_verified(self, fn: Callable, type_map: dict, opt: "TutorialOpt") -> str: + def build_func_verified( + self, fn: Callable, type_map: dict, opt: "TutorialOpt" + ) -> str: """Build and verify module, return pretty-printed IR. 
Runs the generated IR through tutorial-opt to verify it's valid diff --git a/tutorial/ch1-cpu-vector-dsl/square.py b/tutorial/ch1-cpu-vector-dsl/square.py index 7b49d90..1b15f6d 100644 --- a/tutorial/ch1-cpu-vector-dsl/square.py +++ b/tutorial/ch1-cpu-vector-dsl/square.py @@ -2,6 +2,7 @@ from tiny.ch1 import Ptr, Index, compile_and_print + @compile_and_print def square(a: Ptr, result: Ptr, offset: Index): """Square a vector of 16 f16 elements.""" diff --git a/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py b/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py index 1083e9c..396ad30 100644 --- a/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py +++ b/tutorial/ch3-gpu-tile-dsl/gpu_dot_product.py @@ -2,8 +2,13 @@ from tiny.ch2 import accumulate from tiny.ch3 import ( - Ptr, Index, Layout, Tile, load_tile, - block_id_x, compile_and_print, + Ptr, + Index, + Layout, + Tile, + load_tile, + block_id_x, + compile_and_print, ) # 1x256 tile = 256 elements @@ -13,7 +18,7 @@ @compile_and_print -def dot(a: Ptr, b: Ptr, out: Ptr, M: Index, K: Index): +def dot(a: Ptr, b: Ptr, out: Ptr, K: Index): """ Given two arrays of shape MxK, we perform a dot product over them. @@ -35,7 +40,7 @@ def dot(a: Ptr, b: Ptr, out: Ptr, M: Index, K: Index): # once. @accumulate(K, tile_w, inits=[acc_init]) def _(tile_idx: Index, acc: Tile): - # Load a[bid,tile_idx : tile_idx + tile_w] + # Load a[bid, tile_idx : tile_idx + tile_w] a_tile = load_tile(a, bid, tile_idx, K, TILE_H, TILE_W, LAYOUT) b_tile = load_tile(b, bid, tile_idx, K, TILE_H, TILE_W, LAYOUT)