//===- OpToFuncCallLowering.h - GPU ops lowering to custom calls *- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_ #define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_ #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Builders.h" namespace mlir { namespace { /// Detection trait tor the `getFastmath` instance method. template using has_get_fastmath_t = decltype(std::declval().getFastmath()); } // namespace /// Rewriting that replaces SourceOp with a CallOp to `f32Func` or `f64Func` or /// `f32ApproxFunc` or `f16Func` or `i32Type` depending on the element type and /// the fastMathFlag of that Op, if present. The function declaration is added /// in case it was not added before. /// /// If the input values are of bf16 type (or f16 type if f16Func is empty), the /// value is first casted to f32, the function called and then the result casted /// back. /// /// Example with NVVM: /// %exp_f32 = math.exp %arg_f32 : f32 /// /// will be transformed into /// llvm.call @__nv_expf(%arg_f32) : (f32) -> f32 /// /// If the fastMathFlag attribute of SourceOp is `afn` or `fast`, this Op lowers /// to the approximate calculation function. /// /// Also example with NVVM: /// %exp_f32 = math.exp %arg_f32 fastmath : f32 /// /// will be transformed into /// llvm.call @__nv_fast_expf(%arg_f32) : (f32) -> f32 /// /// Final example with NVVM: /// %pow_f32 = math.fpowi %arg_f32, %arg_i32 /// /// will be transformed into /// llvm.call @__nv_powif(%arg_f32, %arg_i32) : (f32, i32) -> f32 template struct OpToFuncCallLowering : public ConvertOpToLLVMPattern { public: explicit OpToFuncCallLowering(const LLVMTypeConverter &lowering, StringRef f32Func, StringRef f64Func, StringRef f32ApproxFunc, StringRef f16Func, StringRef i32Func = "", PatternBenefit benefit = 1) : ConvertOpToLLVMPattern(lowering, benefit), f32Func(f32Func), f64Func(f64Func), f32ApproxFunc(f32ApproxFunc), f16Func(f16Func), i32Func(i32Func) {} LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { using LLVM::LLVMFuncOp; static_assert( std::is_base_of, SourceOp>::value, "expected single result op"); bool isResultBool = op->getResultTypes().front().isInteger(1); if constexpr (!std::is_base_of, SourceOp>::value) { assert(op->getNumOperands() > 0 && "expected op to take at least one operand"); assert((op->getResultTypes().front() == op->getOperand(0).getType() || isResultBool) && "expected op with same operand and result types"); } if (!op->template getParentOfType()) { return rewriter.notifyMatchFailure( op, "expected op to be within a function region"); } SmallVector castedOperands; for (Value operand : adaptor.getOperands()) castedOperands.push_back(maybeCast(operand, rewriter)); Type castedOperandType = castedOperands.front().getType(); // At ABI level, booleans are treated as i32. Type resultType = isResultBool ? rewriter.getIntegerType(32) : castedOperandType; Type funcType = getFunctionType(resultType, castedOperands); StringRef funcName = getFunctionName(castedOperandType, op); if (funcName.empty()) return failure(); LLVMFuncOp funcOp = appendOrGetFuncOp(funcName, funcType, op); auto callOp = LLVM::CallOp::create(rewriter, op->getLoc(), funcOp, castedOperands); if (resultType == adaptor.getOperands().front().getType()) { rewriter.replaceOp(op, {callOp.getResult()}); return success(); } // Boolean result are mapping to i32 at the ABI level with zero values being // interpreted as false and non-zero values being interpreted as true. Since // there is no guarantee of a specific value being used to indicate true, // compare for inequality with zero (rather than truncate or shift). if (isResultBool) { Value zero = LLVM::ConstantOp::create(rewriter, op->getLoc(), rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0)); Value truncated = LLVM::ICmpOp::create(rewriter, op->getLoc(), LLVM::ICmpPredicate::ne, callOp.getResult(), zero); rewriter.replaceOp(op, {truncated}); return success(); } assert(callOp.getResult().getType().isF32() && "only f32 types are supposed to be truncated back"); Value truncated = LLVM::FPTruncOp::create( rewriter, op->getLoc(), adaptor.getOperands().front().getType(), callOp.getResult()); rewriter.replaceOp(op, {truncated}); return success(); } Value maybeCast(Value operand, PatternRewriter &rewriter) const { Type type = operand.getType(); if (!isa(type)) return operand; // If there's an f16 function, no need to cast f16 values. if (!f16Func.empty() && isa(type)) return operand; return LLVM::FPExtOp::create(rewriter, operand.getLoc(), Float32Type::get(rewriter.getContext()), operand); } Type getFunctionType(Type resultType, ValueRange operands) const { SmallVector operandTypes(operands.getTypes()); return LLVM::LLVMFunctionType::get(resultType, operandTypes); } LLVM::LLVMFuncOp appendOrGetFuncOp(StringRef funcName, Type funcType, Operation *op) const { using LLVM::LLVMFuncOp; auto funcAttr = StringAttr::get(op->getContext(), funcName); auto funcOp = SymbolTable::lookupNearestSymbolFrom(op, funcAttr); if (funcOp) return funcOp; auto parentFunc = op->getParentOfType(); assert(parentFunc && "expected there to be a parent function"); OpBuilder b(parentFunc); // Create a valid global location removing any metadata attached to the // location as debug info metadata inside of a function cannot be used // outside of that function. auto globalloc = op->getLoc()->findInstanceOfOrUnknown(); return LLVMFuncOp::create(b, globalloc, funcName, funcType); } StringRef getFunctionName(Type type, SourceOp op) const { bool useApprox = false; if constexpr (llvm::is_detected::value) { arith::FastMathFlags flag = op.getFastmath(); useApprox = ((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) && !f32ApproxFunc.empty(); } if (isa(type)) return f16Func; if (isa(type)) { if (useApprox) return f32ApproxFunc; return f32Func; } if (isa(type)) return f64Func; if (type.isInteger(32)) return i32Func; return ""; } const std::string f32Func; const std::string f64Func; const std::string f32ApproxFunc; const std::string f16Func; const std::string i32Func; }; } // namespace mlir #endif // MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_