diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index cf3f2b70580da..307c9a3d23103 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -1850,6 +1850,10 @@ def TileReductionUsingForOp : Op:$tile_sizes); + DefaultValuedAttr:$reduction_dims, + DefaultValuedAttr:$tile_sizes); let results = (outs Variadic:$fill_op, TransformHandleTypeInterface:$split_op, TransformHandleTypeInterface:$combining_op, @@ -1913,6 +1918,7 @@ def TileReductionUsingForOp : Op mappingVector = {}; + SCFTilingOptions &setMapping(ArrayRef mapping) { + mappingVector = llvm::to_vector(mapping); + return *this; + } + + //-------------------------------------------------------------------------// + // Options related reduction tiling + //-------------------------------------------------------------------------// + /// Specify how reduction dimensions should be tiled. - /// - /// Tiling can be thought of as splitting a dimension into 2 and materializing - /// the outer dimension as a loop: - /// - /// op[original] -> op[original / x, x] -> loop[original] { op[x] } - /// - /// For parallel dimensions, the split can only happen in one way, with both - /// dimensions being parallel. For reduction dimensions however, there is a - /// choice in how we split the reduction dimension. This enum exposes this - /// choice. 
- enum class ReductionTilingStrategy { - // [reduction] -> [reduction1, reduction2] - // -> loop[reduction1] { [reduction2] } - FullReduction, - // [reduction] -> [reduction1, parallel2] - // -> loop[reduction1] { [parallel2] }; merge[reduction1] - PartialReductionOuterReduction, - // [reduction] -> [parallel1, reduction2] - // -> loop[parallel1] { [reduction2] }; merge[parallel1] - PartialReductionOuterParallel - }; ReductionTilingStrategy reductionStrategy = ReductionTilingStrategy::FullReduction; SCFTilingOptions & @@ -115,13 +108,13 @@ struct SCFTilingOptions { return *this; } - /// Specify mapping of loops to devices. This is only respected when the loop - /// constructs support such a mapping (like `scf.forall`). Will be ignored - /// when using loop constructs that dont support such a mapping (like - /// `scf.for`) - SmallVector mappingVector = {}; - SCFTilingOptions &setMapping(ArrayRef mapping) { - mappingVector = llvm::to_vector(mapping); + /// Specify the reduction dimensions to be tiled. Note that this needs to be + /// specified. If left unspecified, then none of the reduction dimensions are + /// tiled. + SetVector reductionDims; + SCFTilingOptions &setReductionDims(ArrayRef dims) { + reductionDims.clear(); + reductionDims.insert(dims.begin(), dims.end()); return *this; } }; diff --git a/mlir/include/mlir/Interfaces/TilingInterface.h b/mlir/include/mlir/Interfaces/TilingInterface.h index b33aa1489c311..8693cbea7f0b0 100644 --- a/mlir/include/mlir/Interfaces/TilingInterface.h +++ b/mlir/include/mlir/Interfaces/TilingInterface.h @@ -36,6 +36,27 @@ struct TilingResult { SmallVector generatedSlices; }; +/// Tiling can be thought of as splitting a dimension into 2 and +/// materializing the outer dimension as a loop: +/// +/// op[original] -> op[original / x, x] -> loop[original] { op[x] } +/// +/// For parallel dimensions, the split can only happen in one way, with both +/// dimensions being parallel. 
For reduction dimensions however, there is a +/// choice in how we split the reduction dimension. This enum exposes this +/// choice. +enum class ReductionTilingStrategy { + // [reduction] -> [reduction1, reduction2] + // -> loop[reduction1] { [reduction2] } + FullReduction, + // [reduction] -> [reduction1, parallel2] + // -> loop[reduction1] { [parallel2] }; merge[reduction1] + PartialReductionOuterReduction, + // [reduction] -> [parallel1, reduction2] + // -> loop[parallel1] { [reduction2] }; merge[parallel1] + PartialReductionOuterParallel +}; + /// Container for the result of merge operation of tiling. /// - `mergeOps` contains operations created during the merge. /// - `replacements` contains the values that represents the result of the diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td index cdf3d01ce8a84..43a27e1cb6cdf 100644 --- a/mlir/include/mlir/Interfaces/TilingInterface.td +++ b/mlir/include/mlir/Interfaces/TilingInterface.td @@ -384,7 +384,7 @@ def PartialReductionOpInterface : "::mlir::OpBuilder &":$b, "Location":$loc, "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes, - "::mlir::ArrayRef":$reductionDim), + "const ::mlir::SetVector &":$reductionDims), /*methodBody=*/"", /*defaultImplementation=*/[{ return failure(); @@ -402,10 +402,11 @@ def PartialReductionOpInterface : /*args=*/(ins "::mlir::OpBuilder &":$b, "Location ":$loc, + "::mlir::ReductionTilingStrategy":$tilingStrategy, "ValueRange":$init, "::mlir::ArrayRef<::mlir::OpFoldResult>":$offsets, "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes, - "::mlir::ArrayRef":$reductionDims), + "const ::llvm::SetVector &":$reductionDims), /*methodBody=*/"", /*defaultImplementation=*/[{ return failure(); @@ -423,7 +424,7 @@ def PartialReductionOpInterface : "::mlir::OpBuilder &":$b, "Location ":$loc, "ValueRange":$partialReduce, - "::mlir::ArrayRef":$reductionDim), + "const ::mlir::SetVector &":$reductionDims), /*methodBody=*/"", 
/*defaultImplementation=*/[{ return failure(); @@ -443,9 +444,9 @@ def PartialReductionOpInterface : "unsigned":$resultNumber, "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets, "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes, + "const ::mlir::SetVector &":$reductionDims, "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultOffsets, - "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultSizes, - "::mlir::ArrayRef":$reductionDims), + "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultSizes), /*methodBody=*/"", /*defaultImplementation=*/[{ return failure(); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 5d55adbf46f36..952f9cbd694d0 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2966,10 +2966,11 @@ void transform::TileReductionUsingForOp::build( // TODO: support mixed static-dynamic (see TileUsingForallOp). 
MLIRContext *ctx = builder.getContext(); auto opTy = transform::AnyOpType::get(ctx); - auto staticTileSizesAttr = builder.getDenseI64ArrayAttr(staticTileSizes); + auto staticTileSizesAttr = builder.getI64ArrayAttr(staticTileSizes); build(builder, result, /*resultTypes=*/TypeRange{opTy, opTy, opTy, opTy}, /*target=*/target, + /*reduction_dims=*/nullptr, /*tile_sizes=*/staticTileSizesAttr); } @@ -2985,12 +2986,30 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForOp::applyToOne( target->getLoc(), "Operation should implement PartialReductionOpInterface"); } - FailureOr result = scf::tileReductionUsingScf( - rewriter, partialReductionOp, - getAsOpFoldResult(rewriter.getI64ArrayAttr(getTileSizes()))); - if (failed(result)) - return emitDefaultSilenceableFailure(target); + SmallVector reductionDims = + extractFromIntegerArrayAttr(getReductionDims()); + if (reductionDims.empty()) { + for (auto [idx, iteratorType] : + llvm::enumerate(partialReductionOp.getLoopIteratorTypes())) { + if (iteratorType == utils::IteratorType::reduction) + reductionDims.push_back(idx); + } + } + + scf::SCFTilingOptions options; + options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp); + options.setReductionTilingStrategy( + ReductionTilingStrategy::PartialReductionOuterReduction); + options.setTileSizes(getAsOpFoldResult(getTileSizesAttr())); + options.setReductionDims(reductionDims); + FailureOr result = + scf::tileUsingSCF(rewriter, partialReductionOp, options); + + if (failed(result)) { + return emitSilenceableFailure(getLoc(), + "failed to tile using partial reduction"); + } rewriter.replaceOp(target, result->replacements); for (Value initValue : result->initialValues) results.push_back(initValue.getDefiningOp()); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp index 4162aa0b71e6d..8a5a2e54cdda2 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -109,8 
+109,7 @@ static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b, } FailureOr -mlir::linalg::computeStaticContinuousTileSizes(LinalgOp op, - unsigned dimension, +mlir::linalg::computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize) { assert(!op.hasDynamicShape() && @@ -183,8 +182,8 @@ mlir::linalg::computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, // Find the trip count of the iteration space dimension for which the tile // sizes are computed. - Value loopRange = getValueOrCreateConstantIndexOp(b, loc, - loopRanges[dimension].size); + Value loopRange = + getValueOrCreateConstantIndexOp(b, loc, loopRanges[dimension].size); ContinuousTileSizeSpecification spec; // Compute the tile sizes and the respective numbers of tiles. @@ -633,16 +632,18 @@ FailureOr linalg::tileReductionUsingForall( if (!tileSizes.empty() && tileSizes.size() != numThreads.size()) return b.notifyMatchFailure(op, "if tile sizes are present it must have as " "many elements as number of threads"); - int reductionDim = static_cast(redDims.front()); if (redDims.front() >= numThreads.size()) return b.notifyMatchFailure( op, "reduction dimension must be mapped to threads"); // 1. Create the inital tensor value. + unsigned reductionDim = redDims.front(); + SetVector reductionDims; + reductionDims.insert(reductionDim); FailureOr> maybeInitTensors = op.generateInitialTensorForPartialReduction(b, loc, numThreads, - reductionDim); + reductionDims); if (failed(maybeInitTensors)) return b.notifyMatchFailure( op, "Failed to create inital tensors for partial reduction"); @@ -780,7 +781,7 @@ FailureOr linalg::tileReductionUsingForall( // 7. Merge the partial reductions. 
b.setInsertionPointAfter(forallOp); FailureOr mergeResult = - op.mergeReductions(b, loc, forallOp->getResults(), reductionDim); + op.mergeReductions(b, loc, forallOp->getResults(), reductionDims); if (failed(mergeResult)) { return failure(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 7c14cc16437fe..f649bc49a8fbd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include @@ -327,23 +328,48 @@ struct LinalgOpTilingInterface // External Model for implementing `PartialReductionInterface` for `LinalgOp`s. //===----------------------------------------------------------------------===// -/// Return an AffineMap for a partial result for the given result number, -/// assuming the partial tiling strategy is outer-reduction loop + -/// inner-parallel tile. The returned AffineMap can be used as the replacement -/// AffineMap for the inner-parallel tile linalg op for the given result number. -/// -/// The new AffineMap is the old AffineMap with reduction dimensions appended -/// at end. -static AffineMap getPartialResultAffineMap(LinalgOp linalgOp, - ArrayRef reductionDims, - unsigned resultNumber) { - AffineMap map = - linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(resultNumber)); - for (int redPos : reductionDims) { - map = map.insertResult(getAffineDimExpr(redPos, linalgOp.getContext()), - map.getNumResults()); +/// Return an AffineMaps to use for the `outs` operands of the linalg op +/// generated for partial results. 
The new AffineMap is the AffineMap of the +/// untiled op with reduction dimensions appended at end in order in which they +/// were specified during tiling. +static SmallVector +getPartialResultAffineMaps(LinalgOp linalgOp, + const SetVector &reductionDims) { + auto partialReductionMaps = llvm::map_to_vector( + linalgOp.getDpsInitsMutable(), [&](OpOperand &opOperand) { + AffineMap map = linalgOp.getMatchingIndexingMap(&opOperand); + for (auto redPos : reductionDims) { + map = + map.insertResult(getAffineDimExpr(redPos, linalgOp.getContext()), + map.getNumResults()); + } + return map; + }); + return partialReductionMaps; +} + +/// Return the slice of the `initValue` to use as input to the partial reduction +/// op generated. +static Operation *getInitSliceForOuterReduction( + OpBuilder &b, Location loc, Value initValue, ArrayRef offsets, + ArrayRef sizes, const SetVector &reductionDims, + AffineMap partialReductionMap) { + int64_t initRank = partialReductionMap.getNumResults(); + SmallVector initOffsets, initSizes; + SmallVector initStrides(initRank, b.getIndexAttr(1)); + for (AffineExpr dimExpr : partialReductionMap.getResults()) { + unsigned dim = cast(dimExpr).getPosition(); + if (reductionDims.contains(dim)) { + initOffsets.push_back(b.getIndexAttr(0)); + } else { + initOffsets.push_back(offsets[dim]); + } + initSizes.push_back(sizes[dim]); } - return map; + // TODO: Use SubsetExtractOpInterface here once available. 
+ auto extractSlice = b.create( + loc, initValue, initOffsets, initSizes, initStrides); + return extractSlice; } /// External model implementation of PartialReductionInterface for @@ -354,13 +380,16 @@ struct LinalgOpPartialReductionInterface LinalgOpPartialReductionInterface, LinalgOpTy> { FailureOr> generateInitialTensorForPartialReduction( Operation *op, OpBuilder &b, Location loc, ArrayRef sizes, - ArrayRef reductionDims) const { + const SetVector &reductionDims) const { auto linalgOp = cast(op); - OpBuilder::InsertionGuard guard(b); + OpBuilder::InsertionGuard guard(b); if (linalgOp.hasPureBufferSemantics()) return op->emitOpError("expected operation to have tensor semantics"); + SmallVector partialResultMaps = + getPartialResultAffineMaps(linalgOp, reductionDims); + // LinalgOp implements TilingInterface. auto tilingInterfaceOp = cast(linalgOp.getOperation()); SmallVector shape = @@ -377,8 +406,8 @@ struct LinalgOpPartialReductionInterface } SmallVector inits; - for (int initIdx = 0, e = linalgOp.getNumDpsInits(); initIdx < e; - ++initIdx) { + for (auto [initIdx, result, partialMap] : + llvm::enumerate(linalgOp->getResults(), partialResultMaps)) { SmallVector combinerOps; if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx, combinerOps) || @@ -392,16 +421,13 @@ struct LinalgOpPartialReductionInterface "Failed to get an identity value for the reduction operation."); // Append the new partial result dimensions. 
- AffineMap partialMap = - getPartialResultAffineMap(linalgOp, reductionDims, initIdx); SmallVector partialResultShape; for (AffineExpr dimExpr : partialMap.getResults()) { auto dim = cast(dimExpr); partialResultShape.push_back(tiledShape[dim.getPosition()]); } - Type elType = - getElementTypeOrSelf(linalgOp->getResult(initIdx).getType()); + Type elType = getElementTypeOrSelf(result.getType()); Value emptyTensor = b.create(loc, partialResultShape, elType); Value constantOp = b.create(loc, *identity); @@ -415,23 +441,25 @@ struct LinalgOpPartialReductionInterface FailureOr tileToPartialReduction(Operation *op, OpBuilder &b, Location loc, + ReductionTilingStrategy tilingStrategy, ValueRange init, ArrayRef offsets, ArrayRef sizes, - ArrayRef reductionDims) const { + const SetVector &reductionDims) const { + if (tilingStrategy != + ReductionTilingStrategy::PartialReductionOuterReduction) { + // TODO: Add support for `PartialReductionOuterParallel` strategy. + return op->emitOpError("unsupported partial reduction tiling with " + "`PartialReductionOuterParallel` strategy"); + } OpBuilder::InsertionGuard guard(b); auto linalgOp = cast(op); + SmallVector partialReductionMaps = + getPartialResultAffineMaps(linalgOp, reductionDims); + // Step 1. Extend init maps to have reduction dimension dims, since we // are converting them to parallel dimensions. - SmallVector newInitMaps; - newInitMaps.reserve(linalgOp.getNumDpsInits()); - for (int idx : llvm::seq(0, linalgOp.getNumDpsInits())) { - // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace - // this with a for range loop when we have it. - AffineMap newMap = - getPartialResultAffineMap(linalgOp, reductionDims, idx); - newInitMaps.push_back(newMap); - } + SmallVector newInitMaps = partialReductionMaps; // Step 2a: Extract a slice of the input operands. SmallVector tiledInputs = makeTiledShapes( @@ -443,31 +471,21 @@ struct LinalgOpPartialReductionInterface // Step 2b: Extract a slice of the init operands. 
SmallVector tiledInits; - for (auto [valueMap, valueToTile] : llvm::zip_equal(newInitMaps, init)) { - int64_t initRank = valueMap.getNumResults(); - SmallVector initOffset(initRank, b.getIndexAttr(0)); - SmallVector initStride(initRank, b.getIndexAttr(1)); - SmallVector initSizes; - for (AffineExpr dimExpr : valueMap.getResults()) { - auto dim = cast(dimExpr); - initSizes.push_back(sizes[dim.getPosition()]); - } - // TODO: Use SubsetExtractOpInterface here once available. - auto extractSlice = b.create( - loc, valueToTile, initOffset, initSizes, initStride); - tiledInits.push_back(extractSlice); - generatedSlices.push_back(extractSlice); + for (auto [partialReductionMap, valueToTile] : + llvm::zip_equal(partialReductionMaps, init)) { + Operation *sliceOp = + getInitSliceForOuterReduction(b, loc, valueToTile, offsets, sizes, + reductionDims, partialReductionMap); + tiledInits.push_back(sliceOp->getResult(0)); + generatedSlices.push_back(sliceOp); } // Update the indexing maps. SmallVector newMaps = linalgOp.getIndexingMapsArray(); - // Change the init maps. - for (int idx : llvm::seq(0, linalgOp.getNumDpsInits())) { - // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace - // this with a for range loop when we have it. - OpOperand *initOperand = linalgOp.getDpsInitOperand(idx); - int64_t mapIdx = linalgOp.getIndexingMapIndex(initOperand); - newMaps[mapIdx] = newInitMaps[idx]; + for (auto [initOperand, newInitMap] : + llvm::zip_equal(linalgOp.getDpsInitsMutable(), newInitMaps)) { + int mapIdx = linalgOp.getIndexingMapIndex(&initOperand); + newMaps[mapIdx] = newInitMap; } // Step 3. Change the reduction dim iterator types. @@ -477,9 +495,9 @@ struct LinalgOpPartialReductionInterface newIteratorTypes[dim] = utils::IteratorType::parallel; // Step 4. Create the new generic op. 
- auto genericOp = - b.create(loc, ValueRange(tiledInits).getTypes(), tiledInputs, - tiledInits, newMaps, newIteratorTypes); + auto resultTypes = ValueRange(tiledInits).getTypes(); + auto genericOp = b.create(loc, resultTypes, tiledInputs, + tiledInits, newMaps, newIteratorTypes); IRMapping mapping; op->getRegion(0).cloneInto(&genericOp.getRegion(), genericOp.getRegion().begin(), mapping); @@ -490,23 +508,24 @@ struct LinalgOpPartialReductionInterface generatedSlices}; } - FailureOr mergeReductions(Operation *op, OpBuilder &b, - Location loc, ValueRange partialReduce, - ArrayRef reductionDims) const { + FailureOr + mergeReductions(Operation *op, OpBuilder &b, Location loc, + ValueRange partialReduce, + const SetVector &reductionDims) const { auto linalgOp = cast(op); + SmallVector partialReductionMaps = + getPartialResultAffineMaps(linalgOp, reductionDims); // Permute the reduction dims as permuted by the partial result map. - - int64_t numInits = linalgOp.getNumDpsInits(); SmallVector mergeOperations; SmallVector replacements; - for (int idx : llvm::seq(numInits)) { + for (auto [idx, init, partialResult, partialMap] : llvm::enumerate( + linalgOp.getDpsInits(), partialReduce, partialReductionMaps)) { + unsigned initIdx = idx; // linalg.reduce's iteration space is the tiled result's iteration space // (and not the tiled operation's iteration space). To account for this, // permute the reduction dimensions based on the partial result map of the // tiled result. 
- AffineMap partialMap = - getPartialResultAffineMap(linalgOp, reductionDims, idx); SmallVector partialReductionDims; for (auto [resultNum, dimExpr] : llvm::enumerate(partialMap.getResults())) { @@ -516,15 +535,13 @@ struct LinalgOpPartialReductionInterface } } - Value partialResult = partialReduce[idx]; - Value init = linalgOp.getDpsInits()[idx]; - auto reduction = b.create( loc, partialResult, init, partialReductionDims, - [&linalgOp, &idx](OpBuilder &b, Location loc, ValueRange inputs) { + [&linalgOp, &initIdx](OpBuilder &b, Location loc, ValueRange inputs) { // Get the combiner op. SmallVector combinerOps; - matchReduction(linalgOp.getRegionOutputArgs(), idx, combinerOps); + matchReduction(linalgOp.getRegionOutputArgs(), initIdx, + combinerOps); Operation *clonedReductionOp = b.clone(*combinerOps[0]); // Combine the input at idx and output at numInits + idx. clonedReductionOp->setOperand(0, inputs[0]); @@ -542,14 +559,14 @@ struct LinalgOpPartialReductionInterface LogicalResult getPartialResultTilePosition( Operation *op, OpBuilder &b, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, + const SetVector &reductionDims, SmallVector &resultOffsets, - SmallVector &resultSizes, - ArrayRef reductionDims) const { + SmallVector &resultSizes) const { auto linalgOp = cast(op); + SmallVector partialReductionMaps = + getPartialResultAffineMaps(linalgOp, reductionDims); - AffineMap partialMap = - getPartialResultAffineMap(linalgOp, reductionDims, resultNumber); - for (AffineExpr dimExpr : partialMap.getResults()) { + for (AffineExpr dimExpr : partialReductionMaps[resultNumber].getResults()) { unsigned dim = cast(dimExpr).getPosition(); resultSizes.push_back(sizes[dim]); diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 3f29dd3ac5e48..e7c076024e67b 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -77,9 
+77,8 @@ fillInterchangeVector(ArrayRef interchangeVector, //===----------------------------------------------------------------------===// /// Verify the tile size options are set in a consistent manner. -static LogicalResult -verifyTileSizeOptions(RewriterBase &rewriter, Location loc, - const scf::SCFTilingOptions &options) { +static LogicalResult verifyOptions(RewriterBase &rewriter, Location loc, + const scf::SCFTilingOptions &options) { // Specifying number of threads is only supported on `scf.forall` op. if (options.numThreadsComputationFunction && options.loopType != scf::SCFTilingOptions::LoopType::ForallOp) { @@ -156,7 +155,9 @@ getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op, } /// Checks if any of the tiled loops are not parallel. -static void checkSafeToTileToForall(TilingInterface op, +static LogicalResult checkTileSizes(TilingInterface op, + scf::SCFTilingOptions::LoopType loopType, + ReductionTilingStrategy reductionStrategy, ArrayRef tileSizes, ArrayRef numThreads) { auto iterators = op.getLoopIteratorTypes(); @@ -165,28 +166,46 @@ static void checkSafeToTileToForall(TilingInterface op, assert((numThreads.empty() || (numThreads.size() == iterators.size())) && "when specified, expected number of threads to use for each loop"); + bool isParallelTiling = false, isReductionTiling = false; for (auto [index, iterator, tileSize] : llvm::enumerate(iterators, tileSizes)) { - // If num threads is specified, check that it is greater than one only for - // parallel dimensions. 
- if (!numThreads.empty()) { - if (std::optional constNumThreads = - getConstantIntValue(numThreads[index])) { - if (constNumThreads.value() > 1 && + if (!isConstantIntValue(tileSize, 0)) { + isParallelTiling |= iterator == utils::IteratorType::parallel; + isReductionTiling |= iterator == utils::IteratorType::reduction; + } + + if (loopType == scf::SCFTilingOptions::LoopType::ForallOp && + reductionStrategy == ReductionTilingStrategy::FullReduction) { + // If num threads is specified, check that it is greater than one only for + // parallel dimensions. + if (!numThreads.empty()) { + if (std::optional constNumThreads = + getConstantIntValue(numThreads[index])) { + if (constNumThreads.value() > 1 && + iterator != utils::IteratorType::parallel) { + op.emitWarning() << "tiling is not thread safe at axis #" << index; + } + } + continue; + } + + if (std::optional constTileSize = + getConstantIntValue(tileSize)) { + if (constTileSize.value() > 0 && iterator != utils::IteratorType::parallel) { op.emitWarning() << "tiling is not thread safe at axis #" << index; } } - continue; } + } - if (std::optional constTileSize = getConstantIntValue(tileSize)) { - if (constTileSize.value() > 0 && - iterator != utils::IteratorType::parallel) { - op.emitWarning() << "tiling is not thread safe at axis #" << index; - } - } + if (isParallelTiling && isReductionTiling && + reductionStrategy != ReductionTilingStrategy::FullReduction) { + return op->emitOpError( + "combined parallel and reduction tiling is not supported with partial " + "reduction tiling strategies"); } + return success(); } /// Check if `stride` evenly divides the trip count `size - offset`. 
@@ -575,35 +594,20 @@ createInitialTensorsForTiling(RewriterBase &rewriter, TilingInterface op, const scf::SCFTilingOptions &options) { SmallVector initTensors; Location loc = op->getLoc(); - switch (options.reductionStrategy) { - case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + if (options.reductionStrategy == ReductionTilingStrategy::FullReduction) { if (failed(tensor::getOrCreateDestinations(rewriter, loc, op, initTensors))) return failure(); return initTensors; - case scf::SCFTilingOptions::ReductionTilingStrategy:: - PartialReductionOuterReduction: { - auto redOp = dyn_cast(op.getOperation()); - if (!redOp) { - return rewriter.notifyMatchFailure( - op, "PartialReductionOuterReduction tiling strategy is only supported" - "for operations implementing PartialReductionOpInterface"); - } - // Get reduction dimensions. - // TODO: PartialReductionOpInterface should really query TilingInterface - // itself and find reduction dimensions. - SmallVector reductionDims; - for (auto [idx, iteratorType] : - llvm::enumerate(op.getLoopIteratorTypes())) { - if (iteratorType == utils::IteratorType::reduction) - reductionDims.push_back(idx); - } - return redOp.generateInitialTensorForPartialReduction( - rewriter, loc, tileSizes, reductionDims); } - default: - return rewriter.notifyMatchFailure(op, - "unhandled reduction tiling strategy"); + + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only supported" + "for operations implementing PartialReductionOpInterface"); } + return redOp.generateInitialTensorForPartialReduction( + rewriter, loc, tileSizes, options.reductionDims); } static FailureOr @@ -611,34 +615,20 @@ getTiledImplementation(RewriterBase &rewriter, TilingInterface op, ValueRange regionIterArg, ArrayRef offsets, ArrayRef sizes, const scf::SCFTilingOptions &options) { - switch (options.reductionStrategy) { - case 
scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + if (options.reductionStrategy == ReductionTilingStrategy::FullReduction) { return op.getTiledImplementation(rewriter, offsets, sizes); - case scf::SCFTilingOptions::ReductionTilingStrategy:: - PartialReductionOuterReduction: { - auto redOp = dyn_cast(op.getOperation()); - if (!redOp) { - return rewriter.notifyMatchFailure( - op, "PartialReductionOuterReduction tiling strategy is only " - "supported for operations " - "implementing PartialReductionOpInterface"); - } - // Get reduction dimensions. - // TODO: PartialReductionOpInterface should really query TilingInterface - // itself and find reduction dimensions. - SmallVector reductionDims; - for (auto [idx, iteratorType] : - llvm::enumerate(op.getLoopIteratorTypes())) { - if (iteratorType == utils::IteratorType::reduction) - reductionDims.push_back(idx); - } - return redOp.tileToPartialReduction(rewriter, op.getLoc(), regionIterArg, - offsets, sizes, reductionDims); } - default: - return rewriter.notifyMatchFailure(op, - "unhandled reduction tiling strategy"); + + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only " + "supported for operations " + "implementing PartialReductionOpInterface"); } + return redOp.tileToPartialReduction(rewriter, op.getLoc(), + options.reductionStrategy, regionIterArg, + offsets, sizes, options.reductionDims); } static LogicalResult @@ -649,70 +639,37 @@ getResultTilePosition(RewriterBase &rewriter, int64_t index, Value tiledResult, SmallVector &resultSize, const scf::SCFTilingOptions &options) { - switch (options.reductionStrategy) { - case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + if (options.reductionStrategy == ReductionTilingStrategy::FullReduction) { return op.getResultTilePosition(rewriter, index, offsets, sizes, resultOffset, resultSize); - case 
scf::SCFTilingOptions::ReductionTilingStrategy:: - PartialReductionOuterReduction: { - auto redOp = dyn_cast(op.getOperation()); - if (!redOp) { - return rewriter.notifyMatchFailure( - op, "PartialReductionOuterReduction tiling strategy is only supported" - "for operations implementing PartialReductionOpInterface"); - } - // Get reduction dimensions. - // TODO: PartialReductionOpInterface should really query TilingInterface - // itself and find reduction dimensions. - SmallVector reductionDims; - for (auto [idx, iteratorType] : - llvm::enumerate(op.getLoopIteratorTypes())) { - if (iteratorType == utils::IteratorType::reduction) - reductionDims.push_back(idx); - } - return redOp.getPartialResultTilePosition(rewriter, index, offsets, sizes, - resultOffset, resultSize, - reductionDims); } - default: - return rewriter.notifyMatchFailure(op, - "unhandled reduction tiling strategy"); + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only supported" + "for operations implementing PartialReductionOpInterface"); } + return redOp.getPartialResultTilePosition(rewriter, index, offsets, sizes, + options.reductionDims, resultOffset, + resultSize); } static FailureOr mergeTilingResults(RewriterBase &rewriter, TilingInterface op, ValueRange partialResults, const scf::SCFTilingOptions &options) { - switch (options.reductionStrategy) { - case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: - // No need to merge results for reduction tiling strategy. - return MergeResult{{}, partialResults}; - case scf::SCFTilingOptions::ReductionTilingStrategy:: - PartialReductionOuterReduction: { - auto redOp = dyn_cast(op.getOperation()); - if (!redOp) { - return rewriter.notifyMatchFailure( - op, "PartialReductionOuterReduction tiling strategy is only " - "supported for operations " - "implementing PartialReductionOpInterface"); - } - // Get reduction dimensions. 
- // TODO: PartialReductionOpInterface should really query TilingInterface - // itself and find reduction dimensions. - SmallVector reductionDims; - for (auto [idx, iteratorType] : - llvm::enumerate(op.getLoopIteratorTypes())) { - if (iteratorType == utils::IteratorType::reduction) - reductionDims.push_back(idx); - } - return redOp.mergeReductions(rewriter, op.getLoc(), partialResults, - reductionDims); - } - default: - return rewriter.notifyMatchFailure(op, - "unhandled reduction tiling strategy"); + assert(options.reductionStrategy != ReductionTilingStrategy::FullReduction && + "expected merge to be called for only partial reduction cases"); + + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only " + "supported for operations " + "implementing PartialReductionOpInterface"); } + return redOp.mergeReductions(rewriter, op.getLoc(), partialResults, + options.reductionDims); } /// Append the specified additional `newInitOperands` operands to the @@ -932,7 +889,7 @@ static LogicalResult addInitOperandsToLoopNest( FailureOr mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, const scf::SCFTilingOptions &options) { - if (failed(verifyTileSizeOptions(rewriter, op.getLoc(), options))) { + if (failed(verifyOptions(rewriter, op.getLoc(), options))) { return failure(); } @@ -949,8 +906,9 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // Check if it is safe to tile. This is hold over from previous iterations // of tile to for-all. Consider dropping it. - if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) { - checkSafeToTileToForall(op, tileSizes, numThreads); + if (failed(checkTileSizes(op, options.loopType, options.reductionStrategy, + tileSizes, numThreads))) { + return failure(); } // 3. 
If there is an interchange specified, permute the iteration domain and @@ -1073,8 +1031,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, [](OpResult r) -> Value { return r; }); // For the full reduction case, there is nothing more to do. - if (options.reductionStrategy == - scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction) { + if (options.reductionStrategy == ReductionTilingStrategy::FullReduction) { return scf::SCFTilingResult{ tilingResult->tiledOps, initTensors, loops, loopResults, tilingResult->generatedSlices, {}}; @@ -1102,9 +1059,13 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b, scf::SCFTilingOptions options; options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp); options.setReductionTilingStrategy( - scf::SCFTilingOptions::ReductionTilingStrategy:: - PartialReductionOuterReduction); + ReductionTilingStrategy::PartialReductionOuterReduction); options.setTileSizes(tileSize); + SmallVector reductionDims; + for (auto [index, iteratorType] : llvm::enumerate(op.getLoopIteratorTypes())) + if (iteratorType == utils::IteratorType::reduction) + reductionDims.push_back(index); + options.setReductionDims(reductionDims); return tileUsingSCF(b, op, options); } diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir index 9d34c80822d0e..009ab17786696 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir @@ -343,7 +343,6 @@ module attributes {transform.with_named_sequence} { module { func.func @fail_for_float_neutral(%arg0: tensor, %arg1: tensor) -> tensor { // expected-error @below {{'linalg.generic' op Failed to get an identity value for the reduction operation.}} - // expected-note @below {{when applied to this op}} %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor) outs(%arg1 : tensor) { ^bb0(%in: f32, %out: f32): %1 
= llvm.fmul %in, %in : f32 @@ -355,7 +354,7 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{transform.structured.tile_reduction_using_for failed to apply}} + // expected-error @below {{failed to tile using partial reduction}} %fill_op, %split_linalg_op, %combining_linalg_op, %for_op = transform.structured.tile_reduction_using_for %0 by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -480,3 +479,167 @@ module attributes {transform.with_named_sequence} { // CHECK: } // CHECK: linalg.reduce // CHECK: return + +// ----- + +// Check that only one of the reduction dimension can be tiled (in this case outer). + +#map = affine_map<(d0, d1, d2) -> (d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0)> +module { + func.func @reduction_tile_single_of_multiple_reduction_outer( + %arg0: tensor<86x128xf32>, %arg1: tensor<4096x86x128xf32>, %arg2: tensor<4096xf32>) -> tensor<4096xf32> { + %0 = linalg.generic { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "reduction"]} + ins(%arg0, %arg1 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%arg2 : tensor<4096xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %1 = arith.mulf %in, %in_0 : f32 + %2 = arith.addf %1, %out : f32 + linalg.yield %2 : f32 + } -> tensor<4096xf32> + return %0 : tensor<4096xf32> + } + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %fill_op, %split_linalg_op, %combining_linalg_op, 
%for_op = + transform.structured.tile_reduction_using_for %0 reduction_dims = [1] by tile_sizes = [0, 2] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } + } +} +// CHECK: #[[INIT_MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: @reduction_tile_single_of_multiple_reduction_outer( +// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4096xf32> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C86:.+]] = arith.constant 86 : index +// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<4096x2xf32> +// CHECK: %[[FILL:.+]] = linalg.fill +// CHECK-SAME: outs(%[[EMPTY]] : +// CHECK: %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[C86]] step %[[C2]] +// CHECK-SAME: iter_args(%[[ITER_ARG:.+]] = %[[FILL]]) +// CHECK: %[[PARTIAL_RESULT:.+]] = linalg.generic +// CHECK-SAME: indexing_maps = [#{{.+}}, #{{.+}}, #[[INIT_MAP]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"] +// CHECK-SAME: outs(%[[ITER_ARG]] : +// CHECK: scf.yield %[[PARTIAL_RESULT]] +// CHECK: %[[REDUCE:.+]] = linalg.reduce +// CHECK-SAME: ins(%[[RESULT]] : +// CHECK-SAME: outs(%[[INIT]] : +// CHECK-SAME: dimensions = [1] +// CHECK: return %[[REDUCE]] + +// ----- + +// Check that only one of the reduction dimension can be tiled (in this case inner). 
+ +#map = affine_map<(d0, d1, d2) -> (d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0)> +module { + func.func @reduction_tile_single_of_multiple_reduction_inner( + %arg0: tensor<86x128xf32>, %arg1: tensor<4096x86x128xf32>, %arg2: tensor<4096xf32>) -> tensor<4096xf32> { + %0 = linalg.generic { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "reduction"]} + ins(%arg0, %arg1 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%arg2 : tensor<4096xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %1 = arith.mulf %in, %in_0 : f32 + %2 = arith.addf %1, %out : f32 + linalg.yield %2 : f32 + } -> tensor<4096xf32> + return %0 : tensor<4096xf32> + } + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %fill_op, %split_linalg_op, %combining_linalg_op, %for_op = + transform.structured.tile_reduction_using_for %0 reduction_dims = [2] by tile_sizes = [0, 0, 64] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } + } +} +// CHECK: #[[INIT_MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: @reduction_tile_single_of_multiple_reduction_inner( +// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4096xf32> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<4096x64xf32> +// CHECK: %[[FILL:.+]] = linalg.fill +// CHECK-SAME: outs(%[[EMPTY]] : +// CHECK: %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[C128]] step %[[C64]] +// CHECK-SAME: iter_args(%[[ITER_ARG:.+]] = %[[FILL]]) +// CHECK: %[[PARTIAL_RESULT:.+]] = linalg.generic +// CHECK-SAME: 
indexing_maps = [#{{.+}}, #{{.+}}, #[[INIT_MAP]]] +// CHECK-SAME: iterator_types = ["parallel", "reduction", "parallel"] +// CHECK-SAME: outs(%[[ITER_ARG]] : +// CHECK: scf.yield %[[PARTIAL_RESULT]] +// CHECK: %[[REDUCE:.+]] = linalg.reduce +// CHECK-SAME: ins(%[[RESULT]] : +// CHECK-SAME: outs(%[[INIT]] : +// CHECK-SAME: dimensions = [1] +// CHECK: return %[[REDUCE]] + +// ----- + +// Check that both the reduction dimensions are tiled but the dimensions in the output are swapped. + +#map = affine_map<(d0, d1, d2) -> (d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0)> +module { + func.func @reduction_tile_single_of_multiple_reduction_reversed( + %arg0: tensor<86x128xf32>, %arg1: tensor<4096x86x128xf32>, %arg2: tensor<4096xf32>) -> tensor<4096xf32> { + %0 = linalg.generic { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "reduction"]} + ins(%arg0, %arg1 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%arg2 : tensor<4096xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %1 = arith.mulf %in, %in_0 : f32 + %2 = arith.addf %1, %out : f32 + linalg.yield %2 : f32 + } -> tensor<4096xf32> + return %0 : tensor<4096xf32> + } + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %fill_op, %split_linalg_op, %combining_linalg_op, %for_op = + transform.structured.tile_reduction_using_for %0 reduction_dims = [2, 1] by tile_sizes = [0, 2, 64] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } + } +} +// CHECK: #[[INIT_MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2, d1)> +// CHECK: @reduction_tile_single_of_multiple_reduction_reversed( +// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4096xf32> +// CHECK-DAG: %[[C0:.+]] 
= arith.constant 0 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index +// CHECK-DAG: %[[C86:.+]] = arith.constant 86 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<4096x64x2xf32> +// CHECK: %[[FILL:.+]] = linalg.fill +// CHECK-SAME: outs(%[[EMPTY]] : +// CHECK: %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C86]] step %[[C2]] +// CHECK-SAME: iter_args(%[[ITER_ARG:.+]] = %[[FILL]]) +// CHECK: %[[RESULT0:.+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[C128]] step %[[C64]] +// CHECK-SAME: iter_args(%[[ITER_ARG0:.+]] = %[[ITER_ARG]]) +// CHECK: %[[PARTIAL_RESULT:.+]] = linalg.generic +// CHECK-SAME: indexing_maps = [#{{.+}}, #{{.+}}, #[[INIT_MAP]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"] +// CHECK-SAME: outs(%[[ITER_ARG0]] : +// CHECK: scf.yield %[[PARTIAL_RESULT]] +// CHECK: scf.yield %[[RESULT0]] +// CHECK: %[[REDUCE:.+]] = linalg.reduce +// CHECK-SAME: ins(%[[RESULT]] : +// CHECK-SAME: outs(%[[INIT]] : +// CHECK-SAME: dimensions = [1, 2] +// CHECK: return %[[REDUCE]]