//===- Utils.cpp - Utilities to support the Linalg dialect ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements utilities for the Linalg dialect. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/SCF/EDSC/Builders.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" using namespace mlir; using namespace mlir::linalg; using namespace mlir::scf; Optional RegionMatcher::matchAsScalarBinaryOp(GenericOp op) { auto ®ion = op.region(); if (!llvm::hasSingleElement(region)) return llvm::None; Block &block = region.front(); if (block.getNumArguments() != 2 || !block.getArgument(0).getType().isSignlessIntOrFloat() || !block.getArgument(1).getType().isSignlessIntOrFloat()) return llvm::None; auto &ops = block.getOperations(); if (!llvm::hasSingleElement(block.without_terminator())) return llvm::None; using mlir::matchers::m_Val; auto a = m_Val(block.getArgument(0)); auto b = m_Val(block.getArgument(1)); auto addPattern = m_Op(m_Op(a, b)); if (addPattern.match(&ops.back())) return BinaryOpKind::IAdd; return llvm::None; } bool mlir::linalg::isParallelIteratorType(Attribute attr) { if (auto strAttr = attr.dyn_cast()) { return strAttr.getValue() == getParallelIteratorTypeName(); } return false; } bool mlir::linalg::isReductionIteratorType(Attribute attr) { if (auto strAttr = attr.dyn_cast()) { return strAttr.getValue() == getReductionIteratorTypeName(); } return false; } bool mlir::linalg::isWindowIteratorType(Attribute attr) { if (auto strAttr = attr.dyn_cast()) { return strAttr.getValue() == getWindowIteratorTypeName(); } return false; } /// Explicit instantiation of loop nest generator for different loop types. template struct mlir::linalg::GenerateLoopNest; template struct mlir::linalg::GenerateLoopNest; template struct mlir::linalg::GenerateLoopNest; /// Given a list of subview ranges, extract individual values for lower, upper /// bounds and steps and put them into the corresponding vectors. static void unpackRanges(ArrayRef ranges, SmallVectorImpl &lbs, SmallVectorImpl &ubs, SmallVectorImpl &steps) { for (Range range : ranges) { lbs.emplace_back(range.offset); ubs.emplace_back(range.size); steps.emplace_back(range.stride); } } namespace mlir { namespace linalg { SmallVector getStaticShape(LinalgOp linalgOp) { SmallVector res; for (Value v : linalgOp.getShapedOperands()) { auto shape = v.getType().cast().getShape(); res.append(shape.begin(), shape.end()); } if (linalgOp.getNumInitTensors()) return res; for (Value v : linalgOp.getOperation()->getResults()) { auto shape = v.getType().cast().getShape(); res.append(shape.begin(), shape.end()); } return res; } Optional> getStaticLoopRanges(LinalgOp linalgOp) { SmallVector viewSizes = getStaticShape(linalgOp); AffineMap invertedMap = linalgOp.getShapesToLoopsMap(); if (!invertedMap) return {}; return invertedMap.compose(viewSizes); } /// Specialization to build an scf "for" nest. template <> void GenerateLoopNest::doit( ArrayRef loopRanges, ValueRange iterArgInitValues, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional distributionOptions) { // Create procInfo so it dominates loops, if appropriate. OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); Location loc = edsc::ScopedContext::getLocation(); SmallVector procInfo; if (distributionOptions.hasValue()) procInfo = distributionOptions->procInfo(builder, loc, loopRanges); SmallVector lbs, ubs, steps; unpackRanges(loopRanges, lbs, ubs, steps); LoopNest loopNest = edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn); if (!distributionOptions.hasValue() || loopNest.loops.empty()) return; // Only supports cyclic distribution for now. for (auto it : llvm::zip(loopNest.loops, procInfo, distributionOptions->distributionMethod)) if (std::get<2>(it) == DistributionMethod::Cyclic) mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId, std::get<1>(it).nprocs); } /// Specialization to build affine "for" nest. template <> void GenerateLoopNest::doit( ArrayRef loopRanges, ValueRange iterArgInitValues, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional) { assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); SmallVector lbs, ubs, steps; unpackRanges(loopRanges, lbs, ubs, steps); // Affine loops require constant steps. SmallVector constantSteps; constantSteps.reserve(steps.size()); for (Value v : steps) { auto op = v.getDefiningOp(); assert(op && "Affine loops require constant steps"); constantSteps.push_back(op.getValue()); } auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) { bodyBuilderFn(ivs, {}); }; edsc::affineLoopNestBuilder(lbs, ubs, constantSteps, bodyBuilderWithoutIterArgsFn); } /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`. static void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc, Value procId, Value nprocs, Value &lb, Value &ub, Value &step) { using edsc::op::operator+; using edsc::op::operator*; lb = lb + (procId * step); step = nprocs * step; } /// Generates a loop nest consisting of scf.parallel and scf.for, depending /// on the `iteratorTypes.` Consecutive parallel loops create a single /// scf.parallel operation; each sequential loop creates a new scf.for /// operation. The body of the innermost loop is populated by /// `bodyBuilderFn` that accepts a range of induction variables for all /// loops. `ivStorage` is used to store the partial list of induction /// variables. // TODO: this function can be made iterative instead. However, it // will have at most as many recursive calls as nested loops, which rarely // exceeds 10. static void generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps, ArrayRef iteratorTypes, function_ref bodyBuilderFn, SmallVectorImpl &ivStorage, ArrayRef distributionMethod = {}) { assert(lbs.size() == ubs.size()); assert(lbs.size() == steps.size()); assert(lbs.size() == iteratorTypes.size()); // If there are no (more) loops to be generated, generate the body and be // done with it. if (iteratorTypes.empty()) return bodyBuilderFn(ivStorage); // Find the outermost parallel loops and drop their types from the list. unsigned nLoops = iteratorTypes.size(); unsigned nOuterPar = nLoops - iteratorTypes.drop_while(isParallelIteratorType).size(); // If there are no outer parallel loops, generate one sequential loop and // recurse. Note that we wouldn't have dropped anything from `iteratorTypes` // in this case. if (nOuterPar == 0) { edsc::loopNestBuilder(lbs[0], ubs[0], steps[0], [&](Value iv) { ivStorage.push_back(iv); generateParallelLoopNest(lbs.drop_front(), ubs.drop_front(), steps.drop_front(), iteratorTypes.drop_front(), bodyBuilderFn, ivStorage, distributionMethod); }); return; } if (distributionMethod.empty()) { // Generate a single parallel loop-nest operation for all outermost // parallel loops and recurse. edsc::OperationBuilder( lbs.take_front(nOuterPar), ubs.take_front(nOuterPar), steps.take_front(nOuterPar), [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { edsc::ScopedContext context(nestedBuilder, nestedLoc); ivStorage.append(localIvs.begin(), localIvs.end()); generateParallelLoopNest( lbs.drop_front(nOuterPar), ubs.drop_front(nOuterPar), steps.drop_front(nOuterPar), iteratorTypes.drop_front(nOuterPar), bodyBuilderFn, ivStorage, (distributionMethod.size() < nOuterPar) ? ArrayRef() : distributionMethod.drop_front(nOuterPar)); }); return; } // Process all consecutive similarly distributed loops simultaneously. DistributionMethod methodToUse = distributionMethod[0]; unsigned numProcessed = 1; for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) { if (distributionMethod[i] != methodToUse) break; numProcessed++; } switch (methodToUse) { case DistributionMethod::Cyclic: { // Generate a single parallel loop-nest operation for all outermost // parallel loops and recurse. edsc::OperationBuilder( lbs.take_front(numProcessed), ubs.take_front(numProcessed), steps.take_front(numProcessed), [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { edsc::ScopedContext context(nestedBuilder, nestedLoc); ivStorage.append(localIvs.begin(), localIvs.end()); generateParallelLoopNest( lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, (distributionMethod.size() < numProcessed) ? ArrayRef() : distributionMethod.drop_front(numProcessed)); }); return; } case DistributionMethod::CyclicNumProcsGeNumIters: { // Check (for the processed loops) that the iteration is in-bounds. using edsc::op::slt; using edsc::op::operator&&; Value cond = slt(lbs[0], ubs[0]); for (unsigned i = 1; i < numProcessed; ++i) cond = cond && slt(lbs[i], ubs[i]); ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); edsc::conditionBuilder(cond, [&]() { generateParallelLoopNest( lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed)); }); return; } case DistributionMethod::CyclicNumProcsEqNumIters: // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed // with inner loop generation. ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); generateParallelLoopNest( lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed)); return; } } /// Specialization for generating a mix of parallel and sequential scf loops. template <> void GenerateLoopNest::doit( ArrayRef loopRanges, ValueRange iterArgInitValues, ArrayRef iteratorTypes, function_ref bodyBuilderFn, Optional distributionOptions) { assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); // This function may be passed more iterator types than ranges. assert(iteratorTypes.size() >= loopRanges.size() && "expected iterator type for all ranges"); iteratorTypes = iteratorTypes.take_front(loopRanges.size()); SmallVector lbsStorage, ubsStorage, stepsStorage, ivs; unsigned numLoops = iteratorTypes.size(); ivs.reserve(numLoops); lbsStorage.reserve(numLoops); ubsStorage.reserve(numLoops); stepsStorage.reserve(numLoops); // Get the loop lb, ub, and step. unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage); // Modify the lb, ub, and step based on the distribution options. SmallVector distributionMethod; if (distributionOptions) { auto &options = distributionOptions.getValue(); OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); Location loc = edsc::ScopedContext::getLocation(); distributionMethod.assign(distributionOptions->distributionMethod.begin(), distributionOptions->distributionMethod.end()); SmallVector parallelLoopRanges; for (auto iteratorType : enumerate(iteratorTypes)) { if (isParallelIteratorType(iteratorType.value())) parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); } if (distributionMethod.size() < parallelLoopRanges.size()) parallelLoopRanges.resize(distributionMethod.size()); SmallVector procInfo = options.procInfo(builder, loc, parallelLoopRanges); unsigned index = 0; for (auto iteratorType : enumerate(iteratorTypes)) { if (index >= procInfo.size()) break; if (isParallelIteratorType(iteratorType.value())) { unsigned i = iteratorType.index(); updateBoundsForCyclicDistribution(builder, loc, procInfo[index].procId, procInfo[index].nprocs, lbsStorage[i], ubsStorage[i], stepsStorage[i]); index++; } } } ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) { bodyBuilderFn(ivs, {}); }; generateParallelLoopNest(lbs, ubs, steps, iteratorTypes, bodyBuilderWithoutIterArgsFn, ivs, distributionMethod); assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops"); } } // namespace linalg } // namespace mlir