From 08202516d24d401a4674b851e1d73c4564eaa38b Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Thu, 19 Oct 2023 10:17:54 +0000
Subject: [PATCH 1/4] [mlir][SVE] Add an e2e test for vectorization of
 linalg.matmul

Adds an end-to-end test for scalable vectorization of linalg.matmul.
This is the most basic case where the dimension along which we vectorize
fits perfectly within SVE registers. I will be extending this to more
generic cases in forthcoming patches.

Depends on #68794.
---
 .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir     | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
new file mode 100644
index 0000000000000..9b2aaefc5d631
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -0,0 +1,77 @@
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule \
+// RUN: -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// RUN: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \
+// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | \
+// RUN: FileCheck %s
+
+func.func @printTestEnd() {
+  %0 = llvm.mlir.addressof @str_sve_end : !llvm.ptr<array<24 x i8>>
+  %1 = llvm.mlir.constant(0 : index) : i64
+  %2 = llvm.getelementptr %0[%1, %1]
+    : (!llvm.ptr<array<24 x i8>>, i64, i64) -> !llvm.ptr<i8>
+  llvm.call @printCString(%2) : (!llvm.ptr<i8>) -> ()
+  return
+}
+
+func.func @entry() {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+  %step = arith.constant 1 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  %vscale = vector.vscale
+  %vl_fp = arith.muli %c4, %vscale : index
+  %A_alloc = bufferization.alloc_tensor(%c2, %c1) : tensor<?x?xf32>
+  %B_alloc = bufferization.alloc_tensor(%c1, %vl_fp) : tensor<?x?xf32>
+  %C_alloc = bufferization.alloc_tensor(%c2, %vl_fp) : tensor<?x?xf32>
+
+  %pi = arith.constant 3.14 : f32
+  %A = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %B = linalg.fill ins(%pi : f32) outs(%B_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+  %C_out = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>) outs(%C_in: tensor<?x?xf32>) -> tensor<?x?xf32>
+
+  // There are at least 4 f32 elements in every SVE vector, i.e.
+  // * %vscale is >= 1.
+  // For implementations with wider vectors, you should see more elements being
+  // printed.
+  // CHECK: {{\[}}[9.8596, 9.8596, 9.8596, 9.8596
+  // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596
+
+  %xf = tensor.cast %C_out : tensor<?x?xf32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // CHECK: SVE: END OF TEST OUTPUT
+  func.call @printTestEnd() : () -> ()
+
+  return
+}
+
+transform.sequence failures(propagate) {
+^bb1(%module_op: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
+  %func_op = get_parent_op %0 : (!transform.any_op) -> !transform.op<"func.func">
+  // The tile sizes match the output matrix sizes
+  %1, %loops:3 = transform.structured.tile_using_for %0 [2, [4], 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+  %2 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
+  // The vector sizes match the output matrix sizes
+  // TODO: Use variables to reuse "shared" sizes
+  transform.structured.vectorize %2 vector_sizes [2, [4], 1] : !transform.any_op
+
+  transform.apply_patterns to %func_op {
+    transform.apply_patterns.vector.reduction_to_contract
+    transform.apply_patterns.vector.transfer_permutation_patterns
+    transform.apply_patterns.vector.lower_masked_transfers
+  } : !transform.op<"func.func">
+  transform.apply_patterns to %func_op {
+    transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+    transform.apply_patterns.vector.lower_outerproduct
+  } : !transform.op<"func.func">
+}
+
+llvm.func @printCString(!llvm.ptr<i8>)
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+llvm.mlir.global internal constant @str_sve_end("SVE: END OF TEST OUTPUT\0A")

From f39c97d64bd9e721f46cf6f367739a061ad3324b Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Thu, 19 Oct 2023 12:38:24 +0000
Subject: [PATCH 2/4] fixup!
 [mlir][SVE] Add an e2e test for vectorization of linalg.matmul

Use vector.print instead of @printCString
---
 .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir     | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index 9b2aaefc5d631..df6144bfdc019 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -4,15 +4,6 @@
 // RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | \
 // RUN: FileCheck %s
 
-func.func @printTestEnd() {
-  %0 = llvm.mlir.addressof @str_sve_end : !llvm.ptr<array<24 x i8>>
-  %1 = llvm.mlir.constant(0 : index) : i64
-  %2 = llvm.getelementptr %0[%1, %1]
-    : (!llvm.ptr<array<24 x i8>>, i64, i64) -> !llvm.ptr<i8>
-  llvm.call @printCString(%2) : (!llvm.ptr<i8>) -> ()
-  return
-}
-
 func.func @entry() {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -45,7 +36,7 @@ func.func @entry() {
   call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
 
   // CHECK: SVE: END OF TEST OUTPUT
-  func.call @printTestEnd() : () -> ()
+  vector.print str "SVE: END OF TEST OUTPUT"
 
   return
 }
@@ -72,6 +63,4 @@ transform.sequence failures(propagate) {
   } : !transform.op<"func.func">
 }
 
-llvm.func @printCString(!llvm.ptr<i8>)
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)
-llvm.mlir.global internal constant @str_sve_end("SVE: END OF TEST OUTPUT\0A")

From bd61b2ce570f8c87d9dfd489269e996061302879 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Mon, 23 Oct 2023 07:26:40 +0000
Subject: [PATCH 3/4] fixup! [mlir][SVE] Add an e2e test for vectorization of
 linalg.matmul

Simplify check lines
---
 .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir     | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index df6144bfdc019..922c5d89e3217 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -25,17 +25,21 @@ func.func @entry() {
 
   %C_out = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>) outs(%C_in: tensor<?x?xf32>) -> tensor<?x?xf32>
 
-  // There are at least 4 f32 elements in every SVE vector, i.e.
-  // * %vscale is >= 1.
-  // For implementations with wider vectors, you should see more elements being
-  // printed.
-  // CHECK: {{\[}}[9.8596, 9.8596, 9.8596, 9.8596
+  // CHECK-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT"
+
+  // There are at least 4 x f32 elements in every SVE vector, i.e.
+  // * %vscale >= 1.
+  // Hence, when checking the output there will always be at least 4 elements
+  // in every row. For implementations with wider vectors, you should see more
+  // elements being printed.
+  // CHECK: [9.8596, 9.8596, 9.8596, 9.8596
   // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596
 
   %xf = tensor.cast %C_out : tensor<?x?xf32> to tensor<*xf32>
   call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
 
-  // CHECK: SVE: END OF TEST OUTPUT
+  // CHECK-NEXT: SVE: END OF TEST OUTPUT
   vector.print str "SVE: END OF TEST OUTPUT"
 
   return

From 2b08169fe9fffe410df5c745b0133d07b24775c1 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Thu, 26 Oct 2023 11:50:50 +0000
Subject: [PATCH 4/4] fixup!
 [mlir][SVE] Add an e2e test for vectorization of linalg.matmul

Use CHECK-NEXT
---
 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index 922c5d89e3217..bc94161d5d375 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -33,7 +33,8 @@ func.func @entry() {
   // Hence, when checking the output there will always be at least 4 elements
   // in every row. For implementations with wider vectors, you should see more
   // elements being printed.
-  // CHECK: [9.8596, 9.8596, 9.8596, 9.8596
+  // CHECK-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [2, 16] strides = [16, 1] data =
+  // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596
   // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596
 
   %xf = tensor.cast %C_out : tensor<?x?xf32> to tensor<*xf32>
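
For reference, the 9.8596 value expected by the CHECK lines follows directly
from the test's fill values: the reduction dimension is K = 1, so every
element of the result is the single product 3.14 * 3.14 = 9.8596. Below is a
minimal NumPy sketch of that reference computation. It is not part of the
patch, and vscale = 4 (i.e. 512-bit SVE registers, matching the
"sizes = [2, 16]" in the CHECK line above) is an assumption; the test itself
only requires vscale >= 1.

import numpy as np

# Reference model of the e2e test above. vscale = 4 is an assumption
# (512-bit SVE); any vscale >= 1 is valid, giving rows of 4 * vscale
# f32 elements.
vscale = 4
M, K, N = 2, 1, 4 * vscale

A = np.full((M, K), 3.14, dtype=np.float32)  # linalg.fill with %pi
B = np.full((K, N), 3.14, dtype=np.float32)  # linalg.fill with %pi
C = A @ B  # K = 1, so C[i][j] = 3.14 * 3.14 = 9.8596

print(C.shape)  # (2, 16), cf. "sizes = [2, 16]" in the CHECK line
print(C[0, 0])  # 9.8596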