diff options
author | David Green <david.green@arm.com> | 2024-02-13 08:31:07 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-13 08:31:07 +0000 |
commit | 815a84655262ac569db11357fef1651f3571e7ee (patch) | |
tree | d887eaf58053b54e943f4d1cfa6018bc336b6279 | |
parent | 070848c17c2944afa494d42d3ad42929f3379842 (diff) | |
download | llvm-815a84655262ac569db11357fef1651f3571e7ee.zip llvm-815a84655262ac569db11357fef1651f3571e7ee.tar.gz llvm-815a84655262ac569db11357fef1651f3571e7ee.tar.bz2 |
[Flang] Move genMinMaxlocReductionLoop to Transforms/Utils.cpp (#81380)
This is one option for attempting to move genMinMaxlocReductionLoop to a
better location. It moves it into Transforms and makes HLFIRTransforms
depend upon FIRTransforms.
It passes a build locally, both with and without -DBUILD_SHARED_LIBS,
and does OK on the Windows CI.
-rw-r--r-- | flang/include/flang/Optimizer/Support/Utils.h | 139 | ||||
-rw-r--r-- | flang/include/flang/Optimizer/Transforms/Utils.h | 38 | ||||
-rw-r--r-- | flang/lib/Optimizer/Dialect/CMakeLists.txt | 1 | ||||
-rw-r--r-- | flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt | 1 | ||||
-rw-r--r-- | flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp | 2 | ||||
-rw-r--r-- | flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp | 131 |
6 files changed, 170 insertions, 142 deletions
diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h index 4e06bf8..7a8a34c 100644 --- a/flang/include/flang/Optimizer/Support/Utils.h +++ b/flang/include/flang/Optimizer/Support/Utils.h @@ -18,7 +18,6 @@ #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" -#include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "flang/Optimizer/Support/FatalError.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -135,144 +134,6 @@ inline void intrinsicTypeTODO(fir::FirOpBuilder &builder, mlir::Type type, " in " + intrinsicName); } -using MinlocBodyOpGeneratorTy = llvm::function_ref<mlir::Value( - fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value, - mlir::Value, mlir::Value, const llvm::SmallVectorImpl<mlir::Value> &)>; -using InitValGeneratorTy = llvm::function_ref<mlir::Value( - fir::FirOpBuilder &, mlir::Location, const mlir::Type &)>; -using AddrGeneratorTy = llvm::function_ref<mlir::Value( - fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value, - mlir::Value)>; - -// Produces a loop nest for a Minloc intrinsic. 
-inline void genMinMaxlocReductionLoop( - fir::FirOpBuilder &builder, mlir::Value array, - fir::InitValGeneratorTy initVal, fir::MinlocBodyOpGeneratorTy genBody, - fir::AddrGeneratorTy getAddrFn, unsigned rank, mlir::Type elementType, - mlir::Location loc, mlir::Type maskElemType, mlir::Value resultArr, - bool maskMayBeLogicalScalar) { - mlir::IndexType idxTy = builder.getIndexType(); - - mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0); - - fir::SequenceType::Shape flatShape(rank, - fir::SequenceType::getUnknownExtent()); - mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType); - mlir::Type boxArrTy = fir::BoxType::get(arrTy); - array = builder.create<fir::ConvertOp>(loc, boxArrTy, array); - - mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType()); - mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1); - mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0); - mlir::Value flagRef = builder.createTemporary(loc, resultElemType); - builder.create<fir::StoreOp>(loc, zero, flagRef); - - mlir::Value init = initVal(builder, loc, elementType); - llvm::SmallVector<mlir::Value, Fortran::common::maxRank> bounds; - - assert(rank > 0 && "rank cannot be zero"); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - - // Compute all the upper bounds before the loop nest. - // It is not strictly necessary for performance, since the loop nest - // does not have any store operations and any LICM optimization - // should be able to optimize the redundancy. 
- for (unsigned i = 0; i < rank; ++i) { - mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); - auto dims = - builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, array, dimIdx); - mlir::Value len = dims.getResult(1); - // We use C indexing here, so len-1 as loopcount - mlir::Value loopCount = builder.create<mlir::arith::SubIOp>(loc, len, one); - bounds.push_back(loopCount); - } - // Create a loop nest consisting of OP operations. - // Collect the loops' induction variables into indices array, - // which will be used in the innermost loop to load the input - // array's element. - // The loops are generated such that the innermost loop processes - // the 0 dimension. - llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices; - for (unsigned i = rank; 0 < i; --i) { - mlir::Value step = one; - mlir::Value loopCount = bounds[i - 1]; - auto loop = - builder.create<fir::DoLoopOp>(loc, zeroIdx, loopCount, step, false, - /*finalCountValue=*/false, init); - init = loop.getRegionIterArgs()[0]; - indices.push_back(loop.getInductionVar()); - // Set insertion point to the loop body so that the next loop - // is inserted inside the current one. - builder.setInsertionPointToStart(loop.getBody()); - } - - // Reverse the indices such that they are ordered as: - // <dim-0-idx, dim-1-idx, ...> - std::reverse(indices.begin(), indices.end()); - mlir::Value reductionVal = - genBody(builder, loc, elementType, array, flagRef, init, indices); - - // Unwind the loop nest and insert ResultOp on each level - // to return the updated value of the reduction to the enclosing - // loops. - for (unsigned i = 0; i < rank; ++i) { - auto result = builder.create<fir::ResultOp>(loc, reductionVal); - // Proceed to the outer loop. - auto loop = mlir::cast<fir::DoLoopOp>(result->getParentOp()); - reductionVal = loop.getResult(0); - // Set insertion point after the loop operation that we have - // just processed. 
- builder.setInsertionPointAfter(loop.getOperation()); - } - // End of loop nest. The insertion point is after the outermost loop. - if (maskMayBeLogicalScalar) { - if (fir::IfOp ifOp = - mlir::dyn_cast<fir::IfOp>(builder.getBlock()->getParentOp())) { - builder.create<fir::ResultOp>(loc, reductionVal); - builder.setInsertionPointAfter(ifOp); - // Redefine flagSet to escape scope of ifOp - flagSet = builder.createIntegerConstant(loc, resultElemType, 1); - reductionVal = ifOp.getResult(0); - } - } - - // Check for case where array was full of max values. - // flag will be 0 if mask was never true, 1 if mask was true as some point, - // this is needed to avoid catching cases where we didn't access any elements - // e.g. mask=.FALSE. - mlir::Value flagValue = - builder.create<fir::LoadOp>(loc, resultElemType, flagRef); - mlir::Value flagCmp = builder.create<mlir::arith::CmpIOp>( - loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet); - fir::IfOp ifMaskTrueOp = - builder.create<fir::IfOp>(loc, flagCmp, /*withElseRegion=*/false); - builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front()); - - mlir::Value testInit = initVal(builder, loc, elementType); - fir::IfOp ifMinSetOp; - if (elementType.isa<mlir::FloatType>()) { - mlir::Value cmp = builder.create<mlir::arith::CmpFOp>( - loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal); - ifMinSetOp = builder.create<fir::IfOp>(loc, cmp, - /*withElseRegion*/ false); - } else { - mlir::Value cmp = builder.create<mlir::arith::CmpIOp>( - loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal); - ifMinSetOp = builder.create<fir::IfOp>(loc, cmp, - /*withElseRegion*/ false); - } - builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front()); - - // Load output array with 1s instead of 0s - for (unsigned int i = 0; i < rank; ++i) { - mlir::Value index = builder.createIntegerConstant(loc, idxTy, i); - mlir::Value resultElemAddr = - getAddrFn(builder, loc, resultElemType, resultArr, index); - 
builder.create<fir::StoreOp>(loc, flagSet, resultElemAddr); - } - builder.setInsertionPointAfter(ifMaskTrueOp); -} - inline fir::CUDADataAttributeAttr getCUDADataAttribute(mlir::MLIRContext *mlirContext, std::optional<Fortran::common::CUDADataAttr> cudaAttr) { diff --git a/flang/include/flang/Optimizer/Transforms/Utils.h b/flang/include/flang/Optimizer/Transforms/Utils.h new file mode 100644 index 0000000..49a616f --- /dev/null +++ b/flang/include/flang/Optimizer/Transforms/Utils.h @@ -0,0 +1,38 @@ +//===-- Optimizer/Transforms/Utils.h ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H +#define FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H + +namespace fir { + +using MinlocBodyOpGeneratorTy = llvm::function_ref<mlir::Value( + fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value, + mlir::Value, mlir::Value, const llvm::SmallVectorImpl<mlir::Value> &)>; +using InitValGeneratorTy = llvm::function_ref<mlir::Value( + fir::FirOpBuilder &, mlir::Location, const mlir::Type &)>; +using AddrGeneratorTy = llvm::function_ref<mlir::Value( + fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value, + mlir::Value)>; + +// Produces a loop nest for a Minloc intrinsic. 
+void genMinMaxlocReductionLoop(fir::FirOpBuilder &builder, mlir::Value array, + fir::InitValGeneratorTy initVal, + fir::MinlocBodyOpGeneratorTy genBody, + fir::AddrGeneratorTy getAddrFn, unsigned rank, + mlir::Type elementType, mlir::Location loc, + mlir::Type maskElemType, mlir::Value resultArr, + bool maskMayBeLogicalScalar); + +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt index 58a4276..745439b 100644 --- a/flang/lib/Optimizer/Dialect/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt @@ -13,7 +13,6 @@ add_flang_library(FIRDialect CanonicalizationPatternsIncGen MLIRIR FIROpsIncGen - HLFIROpsIncGen intrinsics_gen LINK_LIBS diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt index 603b328..ad569ce 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt @@ -21,6 +21,7 @@ add_flang_library(HLFIRTransforms FIRBuilder FIRDialectSupport FIRSupport + FIRTransforms HLFIRDialect MLIRIR ${dialect_libs} diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index 523671f..c2512c7 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -20,7 +20,7 @@ #include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/HLFIR/Passes.h" -#include "flang/Optimizer/Support/Utils.h" +#include "flang/Optimizer/Transforms/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index b415463..86343e2 100644 
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -31,8 +31,8 @@ #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" -#include "flang/Optimizer/Support/Utils.h" #include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/entry-names.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Matchers.h" @@ -558,6 +558,135 @@ static mlir::FunctionType genRuntimeMinlocType(fir::FirOpBuilder &builder, {boxRefType, boxType, boxType}, {}); } +// Produces a loop nest for a Minloc intrinsic. +void fir::genMinMaxlocReductionLoop( + fir::FirOpBuilder &builder, mlir::Value array, + fir::InitValGeneratorTy initVal, fir::MinlocBodyOpGeneratorTy genBody, + fir::AddrGeneratorTy getAddrFn, unsigned rank, mlir::Type elementType, + mlir::Location loc, mlir::Type maskElemType, mlir::Value resultArr, + bool maskMayBeLogicalScalar) { + mlir::IndexType idxTy = builder.getIndexType(); + + mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0); + + fir::SequenceType::Shape flatShape(rank, + fir::SequenceType::getUnknownExtent()); + mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType); + mlir::Type boxArrTy = fir::BoxType::get(arrTy); + array = builder.create<fir::ConvertOp>(loc, boxArrTy, array); + + mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType()); + mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1); + mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0); + mlir::Value flagRef = builder.createTemporary(loc, resultElemType); + builder.create<fir::StoreOp>(loc, zero, flagRef); + + mlir::Value init = initVal(builder, loc, elementType); + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> bounds; + + assert(rank > 0 && "rank cannot be zero"); + mlir::Value 
one = builder.createIntegerConstant(loc, idxTy, 1); + + // Compute all the upper bounds before the loop nest. + // It is not strictly necessary for performance, since the loop nest + // does not have any store operations and any LICM optimization + // should be able to optimize the redundancy. + for (unsigned i = 0; i < rank; ++i) { + mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); + auto dims = + builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, array, dimIdx); + mlir::Value len = dims.getResult(1); + // We use C indexing here, so len-1 as loopcount + mlir::Value loopCount = builder.create<mlir::arith::SubIOp>(loc, len, one); + bounds.push_back(loopCount); + } + // Create a loop nest consisting of OP operations. + // Collect the loops' induction variables into indices array, + // which will be used in the innermost loop to load the input + // array's element. + // The loops are generated such that the innermost loop processes + // the 0 dimension. + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices; + for (unsigned i = rank; 0 < i; --i) { + mlir::Value step = one; + mlir::Value loopCount = bounds[i - 1]; + auto loop = + builder.create<fir::DoLoopOp>(loc, zeroIdx, loopCount, step, false, + /*finalCountValue=*/false, init); + init = loop.getRegionIterArgs()[0]; + indices.push_back(loop.getInductionVar()); + // Set insertion point to the loop body so that the next loop + // is inserted inside the current one. + builder.setInsertionPointToStart(loop.getBody()); + } + + // Reverse the indices such that they are ordered as: + // <dim-0-idx, dim-1-idx, ...> + std::reverse(indices.begin(), indices.end()); + mlir::Value reductionVal = + genBody(builder, loc, elementType, array, flagRef, init, indices); + + // Unwind the loop nest and insert ResultOp on each level + // to return the updated value of the reduction to the enclosing + // loops. 
+ for (unsigned i = 0; i < rank; ++i) { + auto result = builder.create<fir::ResultOp>(loc, reductionVal); + // Proceed to the outer loop. + auto loop = mlir::cast<fir::DoLoopOp>(result->getParentOp()); + reductionVal = loop.getResult(0); + // Set insertion point after the loop operation that we have + // just processed. + builder.setInsertionPointAfter(loop.getOperation()); + } + // End of loop nest. The insertion point is after the outermost loop. + if (maskMayBeLogicalScalar) { + if (fir::IfOp ifOp = + mlir::dyn_cast<fir::IfOp>(builder.getBlock()->getParentOp())) { + builder.create<fir::ResultOp>(loc, reductionVal); + builder.setInsertionPointAfter(ifOp); + // Redefine flagSet to escape scope of ifOp + flagSet = builder.createIntegerConstant(loc, resultElemType, 1); + reductionVal = ifOp.getResult(0); + } + } + + // Check for case where array was full of max values. + // flag will be 0 if mask was never true, 1 if mask was true as some point, + // this is needed to avoid catching cases where we didn't access any elements + // e.g. mask=.FALSE. 
+ mlir::Value flagValue = + builder.create<fir::LoadOp>(loc, resultElemType, flagRef); + mlir::Value flagCmp = builder.create<mlir::arith::CmpIOp>( + loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet); + fir::IfOp ifMaskTrueOp = + builder.create<fir::IfOp>(loc, flagCmp, /*withElseRegion=*/false); + builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front()); + + mlir::Value testInit = initVal(builder, loc, elementType); + fir::IfOp ifMinSetOp; + if (elementType.isa<mlir::FloatType>()) { + mlir::Value cmp = builder.create<mlir::arith::CmpFOp>( + loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal); + ifMinSetOp = builder.create<fir::IfOp>(loc, cmp, + /*withElseRegion*/ false); + } else { + mlir::Value cmp = builder.create<mlir::arith::CmpIOp>( + loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal); + ifMinSetOp = builder.create<fir::IfOp>(loc, cmp, + /*withElseRegion*/ false); + } + builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front()); + + // Load output array with 1s instead of 0s + for (unsigned int i = 0; i < rank; ++i) { + mlir::Value index = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value resultElemAddr = + getAddrFn(builder, loc, resultElemType, resultArr, index); + builder.create<fir::StoreOp>(loc, flagSet, resultElemAddr); + } + builder.setInsertionPointAfter(ifMaskTrueOp); +} + static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, bool isMax, unsigned rank, int maskRank, |