[mlir][nfc] Update vectorize-tensor-extract.mlir (4/N) #119697
Conversation
Tests in "vectorize-tensor-extract.mlir" are inconsistent and would benefit from refactoring to: * Clearly categorize tests into "contiguous load," "gather load," and "scalar load + broadcast" cases, reflecting the structure of tensor.extract vectorization. * Unify variable naming (both MLIR and FileCheck). * Ensure all tests exercise unmasked vectorization (masked vectorization is covered in "vectorize-tensor-extract-masked.mlir"). * Improve and standardize formatting. These changes will make it easier to identify the test cases being exercised and simplify future maintenance or refactoring. This is patch 3/N in the series. Below is a summary of the changes in this patch. ---------------------------------------------------------------------- Summary of patch 4/N ---------------------------------------------------------------------- * Cluster all tests for "contiguous load" and "gather load" together (in 2 seperate groups). Note that this is merely moving things around. ---------------------------------------------------------------------- Previous patches ---------------------------------------------------------------------- * llvm#118977 * llvm#119080 * llvm#119121
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-linalg
Author: Andrzej Warzyński (banach-space)
Full diff: https://github.com/llvm/llvm-project/pull/119697.diff 1 Files Affected:
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index cfa5b5d569ca9d..e66fbe968d9b0e 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -2,34 +2,9 @@
// RUN: -transform-preload-library='transform-library-paths=%p/td/vectorize-with-patterns.mlir' \
// RUN: -transform-interpreter=entry-point=vectorize_with_patterns %s | FileCheck %s
-#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
- %1 = linalg.generic {
- indexing_maps = [#map0, #map1],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]
- } ins(%arg1 : tensor<4x3xi32>) outs(%arg2 : tensor<4x7x3x2xf32>) {
- ^bb0(%arg3: i32, %arg4: f32):
- %2 = arith.index_cast %arg3 : i32 to index
- %3 = tensor.extract %arg0[%2] : tensor<3xf32>
- linalg.yield %3 : f32
- } -> tensor<4x7x3x2xf32>
- return %1 : tensor<4x7x3x2xf32>
-}
-// CHECK-LABEL: func.func @vectorize_1d_tensor_extract
-// CHECK-SAME: %[[ARG0:.*]]: tensor<3xf32>
-// CHECK-SAME: %[[ARG1:.*]]: tensor<4x3xi32>
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
-// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
-// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
-// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
-// CHECK: %[[INDICES:.*]] = vector.transpose %[[BROADCAST]]
-// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
-// CHECK: vector.transfer_write %[[GATHER]]
-
-// -----
+//===----------------------------------------------------------------------===//
+// Contiguous load
+//===----------------------------------------------------------------------===//
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_transfer_read_basic(
@@ -112,6 +87,142 @@ func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16
// -----
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+ %c79 = arith.constant 79 : index
+ %1 = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]
+ } outs(%extracted_slice : tensor<1x4xf32>) {
+ ^bb0(%out: f32):
+ %2 = linalg.index 1 : index
+ %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+ %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x4xf32>
+ return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 79 : index
+// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK: %[[VAL_10:.*]] = vector.extract %[[VAL_9]][0] : index from vector<4xindex>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
+// CHECK: }
+
+// -----
+
+func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
+ %c0 = arith.constant 0 : index
+ %c256 = arith.constant 256 : index
+ %output = tensor.empty() : tensor<1x1x4xf32>
+ %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%output : tensor<1x1x4xf32>) {
+ ^bb0(%out: f32):
+ %13 = linalg.index 0 : index
+ %14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>(%arg0, %13, %arg2)
+ %15 = linalg.index 2 : index
+ %16 = linalg.index 1 : index
+ %17 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 * 24 + d2 + d3)>(%arg1, %16, %15, %arg3)
+ %extracted_0 = tensor.extract %input_1[%c0, %14] : tensor<1x20xi32>
+ %18 = arith.index_cast %extracted_0 : i32 to index
+ %19 = arith.maxsi %18, %c0 : index
+ %20 = arith.minsi %19, %c256 : index
+ %extracted_1 = tensor.extract %input_2[%20, %17] : tensor<257x24xf32>
+ linalg.yield %extracted_1 : f32
+ } -> tensor<1x1x4xf32>
+ return %1 : tensor<1x1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_tensor_extract(
+// CHECK-SAME: %[[INPUT_1:.*]]: tensor<1x20xi32>,
+// CHECK-SAME: %[[INPUT_2:.*]]: tensor<257x24xf32>,
+// CHECK-SAME: %[[INPUT_3:.*]]: index, %[[INPUT_4:.*]]: index, %[[INPUT_5:.*]]: index,
+// CHECK: %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
+// CHECK: %[[SCALAR:.*]] = arith.addi %[[INPUT_3]], %[[INPUT_5]] : index
+// First `vector.transfer_read` from the generic Op - loop invariant scalar load.
+// CHECK: vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[SCALAR]]]
+// CHECK-SAME: tensor<1x20xi32>, vector<i32>
+// The following `tensor.extract` from the generic Op is a contiguous load (all Ops used
+// for address calculation also satisfy the required conditions).
+// CHECK: vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+ %c16 = arith.constant 16 : index
+ %1 = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]
+ } outs(%extracted_slice : tensor<1x4xf32>) {
+ ^bb0(%out: f32):
+ %2 = linalg.index 0 : index
+ %3 = linalg.index 1 : index
+ %4 = arith.maxsi %2, %c16 : index
+ %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x4xf32>
+ return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+
+// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<16> : vector<4x1xindex>
+// CHECK-DAG: %[[IDX0:.+]] = vector.extract %[[CST_1]][0, 0] : index from vector<4x1xindex>
+// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<4xindex>
+
+// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[IDX0]], %[[IDX1]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
+// CHECK: }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Gather load
+//===----------------------------------------------------------------------===//
+
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
+ %1 = linalg.generic {
+ indexing_maps = [#map0, #map1],
+ iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+ } ins(%arg1 : tensor<4x3xi32>) outs(%arg2 : tensor<4x7x3x2xf32>) {
+ ^bb0(%arg3: i32, %arg4: f32):
+ %2 = arith.index_cast %arg3 : i32 to index
+ %3 = tensor.extract %arg0[%2] : tensor<3xf32>
+ linalg.yield %3 : f32
+ } -> tensor<4x7x3x2xf32>
+ return %1 : tensor<4x7x3x2xf32>
+}
+// CHECK-LABEL: func.func @vectorize_1d_tensor_extract
+// CHECK-SAME: %[[ARG0:.*]]: tensor<3xf32>
+// CHECK-SAME: %[[ARG1:.*]]: tensor<4x3xi32>
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
+// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
+// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
+// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
+// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
+// CHECK: %[[INDICES:.*]] = vector.transpose %[[BROADCAST]]
+// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
+// CHECK: vector.transfer_write %[[GATHER]]
+
+// -----
+
#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
@@ -159,7 +270,7 @@ func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load(%
%c0 = arith.constant 0 : index
%0 = tensor.empty() : tensor<8x1xf32>
%1 = linalg.generic {
- indexing_maps = [#map],
+ indexing_maps = [#map],
iterator_types = ["parallel", "parallel"]
} outs(%0 : tensor<8x1xf32>) {
^bb0(%arg5: f32):
@@ -303,78 +414,6 @@ func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32
// CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
// CHECK: return %[[VAL_14]] : tensor<5xf32>
-// -----
-
-// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
-func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
- %c79 = arith.constant 79 : index
- %1 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- } outs(%extracted_slice : tensor<1x4xf32>) {
- ^bb0(%out: f32):
- %2 = linalg.index 1 : index
- %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
- %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
- linalg.yield %extracted : f32
- } -> tensor<1x4xf32>
- return %1 : tensor<1x4xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
-// CHECK-SAME: %[[VAL_1:.*]]: index,
-// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 79 : index
-// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
-// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
-// CHECK: %[[VAL_10:.*]] = vector.extract %[[VAL_9]][0] : index from vector<4xindex>
-// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
-// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
-// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
-// CHECK: }
-
-// -----
-
-func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
- %c0 = arith.constant 0 : index
- %c256 = arith.constant 256 : index
- %output = tensor.empty() : tensor<1x1x4xf32>
- %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%output : tensor<1x1x4xf32>) {
- ^bb0(%out: f32):
- %13 = linalg.index 0 : index
- %14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>(%arg0, %13, %arg2)
- %15 = linalg.index 2 : index
- %16 = linalg.index 1 : index
- %17 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 * 24 + d2 + d3)>(%arg1, %16, %15, %arg3)
- %extracted_0 = tensor.extract %input_1[%c0, %14] : tensor<1x20xi32>
- %18 = arith.index_cast %extracted_0 : i32 to index
- %19 = arith.maxsi %18, %c0 : index
- %20 = arith.minsi %19, %c256 : index
- %extracted_1 = tensor.extract %input_2[%20, %17] : tensor<257x24xf32>
- linalg.yield %extracted_1 : f32
- } -> tensor<1x1x4xf32>
- return %1 : tensor<1x1x4xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_tensor_extract(
-// CHECK-SAME: %[[INPUT_1:.*]]: tensor<1x20xi32>,
-// CHECK-SAME: %[[INPUT_2:.*]]: tensor<257x24xf32>,
-// CHECK-SAME: %[[INPUT_3:.*]]: index, %[[INPUT_4:.*]]: index, %[[INPUT_5:.*]]: index,
-// CHECK: %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
-// CHECK: %[[SCALAR:.*]] = arith.addi %[[INPUT_3]], %[[INPUT_5]] : index
-// First `vector.transfer_read` from the generic Op - loop invariant scalar load.
-// CHECK: vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[SCALAR]]]
-// CHECK-SAME: tensor<1x20xi32>, vector<i32>
-// The following `tensor.extract` from the generic Op s a contiguous load (all Ops used
-// for address calculation also satisfy the required conditions).
-// CHECK: vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>
-
-// -----
-
// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Gather load.
func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
%c16 = arith.constant 16 : index
@@ -410,8 +449,6 @@ func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16
// CHECK: return %[[VAL_14]] : tensor<1x4xf32>
// CHECK: }
-// -----
-
// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Gather load.
func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
%c79 = arith.constant 79 : index
@@ -445,41 +482,6 @@ func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32
// -----
-// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
-func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
- %c16 = arith.constant 16 : index
- %1 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- } outs(%extracted_slice : tensor<1x4xf32>) {
- ^bb0(%out: f32):
- %2 = linalg.index 0 : index
- %3 = linalg.index 1 : index
- %4 = arith.maxsi %2, %c16 : index
- %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
- linalg.yield %extracted : f32
- } -> tensor<1x4xf32>
- return %1 : tensor<1x4xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-
-// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<16> : vector<4x1xindex>
-// CHECK-DAG: %[[IDX0:.+]] = vector.extract %[[CST_1]][0, 0] : index from vector<4x1xindex>
-// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<4xindex>
-
-// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[IDX0]], %[[IDX1]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
-// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
-// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
-// CHECK: }
-
-// -----
-
// The vectorizer assumes it's a gather load whenever using a block argument to calculate an index.
#map = affine_map<(d0) -> (d0)>
func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {
Apologies, this landed without the summary.
Tests in "vectorize-tensor-extract.mlir" are inconsistent and would
benefit from refactoring to:
"scalar load + broadcast" cases, reflecting the structure of
tensor.extract vectorization.
is covered in "vectorize-tensor-extract-masked.mlir").
These changes will make it easier to identify the test cases being
exercised and simplify future maintenance or refactoring.
This is patch 3/N in the series. Below is a summary of the changes in
this patch.
Summary of patch 4/N
2 seperate groups).
Note that this is merely moving things around (i.e. I've not changed any
of the tests).
Previous patches