Diffstat (limited to 'clang/lib/CodeGen/TargetBuiltins/BuiltinAMDGPU.cpp')
-rw-r--r-- | clang/lib/CodeGen/TargetBuiltins/BuiltinAMDGPU.cpp | 1512 |
1 file changed, 1512 insertions, 0 deletions
diff --git a/clang/lib/CodeGen/TargetBuiltins/BuiltinAMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/BuiltinAMDGPU.cpp
new file mode 100644
index 0000000..9bbf66e
--- /dev/null
+++ b/clang/lib/CodeGen/TargetBuiltins/BuiltinAMDGPU.cpp
@@ -0,0 +1,1512 @@
+//===------- BuiltinAMDGPU.cpp - Emit LLVM Code for builtins --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code to emit Builtin calls as LLVM code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGBuiltin.h"
+#include "clang/Basic/TargetBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm;
+
+namespace {
+
+// Has second type mangled argument.
+static Value *
+emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
+                                       Intrinsic::ID IntrinsicID,
+                                       Intrinsic::ID ConstrainedIntrinsicID) {
+  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
+
+  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
+  if (CGF.Builder.getIsFPConstrained()) {
+    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
+                                       {Src0->getType(), Src1->getType()});
+    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
+  }
+
+  Function *F =
+      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
+  return CGF.Builder.CreateCall(F, {Src0, Src1});
+}
+
+// If \p E is not null pointer, insert address space cast to match return
+// type of \p E if necessary.
+Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
+                             const CallExpr *E = nullptr) {
+  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
+  auto *Call = CGF.Builder.CreateCall(F);
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+  if (!E)
+    return Call;
+  QualType BuiltinRetType = E->getType();
+  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
+  if (RetTy == Call->getType())
+    return Call;
+  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
+}
+
+Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
+  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
+  auto *Call = CGF.Builder.CreateCall(F);
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
+// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
+/// Emit code based on Code Object ABI version.
+/// COV_4    : Emit code to use dispatch ptr
+/// COV_5+   : Emit code to use implicitarg ptr
+/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
+///            and use its value for COV_4 or COV_5+ approach. It is used for
+///            compiling device libraries in an ABI-agnostic way.
+Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
+  llvm::LoadInst *LD;
+
+  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
+
+  if (Cov == CodeObjectVersionKind::COV_None) {
+    StringRef Name = "__oclc_ABI_version";
+    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
+    if (!ABIVersionC)
+      ABIVersionC = new llvm::GlobalVariable(
+          CGF.CGM.getModule(), CGF.Int32Ty, false,
+          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
+          llvm::GlobalVariable::NotThreadLocal,
+          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
+
+    // This load will be eliminated by the IPSCCP because it is constant
+    // weak_odr without externally_initialized. Either changing it to weak or
+    // adding externally_initialized will keep the load.
+    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
+                                                      CGF.CGM.getIntAlign());
+
+    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
+        ABIVersion,
+        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
+
+    // Indexing the implicit kernarg segment.
+    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
+        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
+
+    // Indexing the HSA kernel_dispatch_packet struct.
+    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
+        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
+
+    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
+    LD = CGF.Builder.CreateLoad(
+        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+    Value *GEP = nullptr;
+    if (Cov >= CodeObjectVersionKind::COV_5) {
+      // Indexing the implicit kernarg segment.
+      GEP = CGF.Builder.CreateConstGEP1_32(
+          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
+    } else {
+      // Indexing the HSA kernel_dispatch_packet struct.
+      GEP = CGF.Builder.CreateConstGEP1_32(
+          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
+    }
+    LD = CGF.Builder.CreateLoad(
+        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  }
+
+  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
+  llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
+      APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
+  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
+  LD->setMetadata(llvm::LLVMContext::MD_noundef,
+                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
+  return LD;
+}
+
+// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
+Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
+  const unsigned XOffset = 12;
+  auto *DP = EmitAMDGPUDispatchPtr(CGF);
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
+  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
+  auto *LD = CGF.Builder.CreateLoad(
+      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
+
+  llvm::MDBuilder MDB(CGF.getLLVMContext());
+
+  // Known non-zero.
+  LD->setMetadata(llvm::LLVMContext::MD_range,
+                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
+  return LD;
+}
+} // namespace
+
+// Generates the IR for __builtin_read_exec_*.
+// Lowers the builtin to amdgcn_ballot intrinsic.
+static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
+                                      llvm::Type *RegisterType,
+                                      llvm::Type *ValueType, bool isExecHi) {
+  CodeGen::CGBuilderTy &Builder = CGF.Builder;
+  CodeGen::CodeGenModule &CGM = CGF.CGM;
+
+  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
+  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
+
+  if (isExecHi) {
+    Value *Rt2 = Builder.CreateLShr(Call, 32);
+    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
+    return Rt2;
+  }
+
+  return Call;
+}
+
+// Emit an intrinsic that has 1 float or double operand, and 1 integer.
+static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
+                               const CallExpr *E,
+                               unsigned IntrinsicID) {
+  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
+
+  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
+  return CGF.Builder.CreateCall(F, {Src0, Src1});
+}
+
+// For processing memory ordering and memory scope arguments of various
+// amdgcn builtins.
+// \p Order takes a C++11 compatible memory-ordering specifier and converts
+// it into LLVM's memory ordering specifier using atomic C ABI, and writes
+// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
+// specific SyncScopeID and writes it to \p SSID.
+void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
+                                              llvm::AtomicOrdering &AO,
+                                              llvm::SyncScope::ID &SSID) {
+  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
+
+  // Map C11/C++11 memory ordering to LLVM memory ordering
+  assert(llvm::isValidAtomicOrderingCABI(ord));
+  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
+  case llvm::AtomicOrderingCABI::acquire:
+  case llvm::AtomicOrderingCABI::consume:
+    AO = llvm::AtomicOrdering::Acquire;
+    break;
+  case llvm::AtomicOrderingCABI::release:
+    AO = llvm::AtomicOrdering::Release;
+    break;
+  case llvm::AtomicOrderingCABI::acq_rel:
+    AO = llvm::AtomicOrdering::AcquireRelease;
+    break;
+  case llvm::AtomicOrderingCABI::seq_cst:
+    AO = llvm::AtomicOrdering::SequentiallyConsistent;
+    break;
+  case llvm::AtomicOrderingCABI::relaxed:
+    AO = llvm::AtomicOrdering::Monotonic;
+    break;
+  }
+
+  // Some of the atomic builtins take the scope as a string name.
+  StringRef scp;
+  if (llvm::getConstantStringInfo(Scope, scp)) {
+    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+    return;
+  }
+
+  // Older builtins had an enum argument for the memory scope.
+  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
+  switch (scope) {
+  case 0: // __MEMORY_SCOPE_SYSTEM
+    SSID = llvm::SyncScope::System;
+    break;
+  case 1: // __MEMORY_SCOPE_DEVICE
+    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+    break;
+  case 2: // __MEMORY_SCOPE_WRKGRP
+    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+    break;
+  case 3: // __MEMORY_SCOPE_WVFRNT
+    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+    break;
+  case 4: // __MEMORY_SCOPE_SINGLE
+    SSID = llvm::SyncScope::SingleThread;
+    break;
+  default:
+    SSID = llvm::SyncScope::System;
+    break;
+  }
+}
+
+llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
+                                                          unsigned Idx,
+                                                          const CallExpr *E) {
+  llvm::Value *Arg = nullptr;
+  if ((ICEArguments & (1 << Idx)) == 0) {
+    Arg = EmitScalarExpr(E->getArg(Idx));
+  } else {
+    // If this is required to be a constant, constant fold it so that we
+    // know that the generated intrinsic gets a ConstantInt.
+ std::optional<llvm::APSInt> Result = + E->getArg(Idx)->getIntegerConstantExpr(getContext()); + assert(Result && "Expected argument to be a constant"); + Arg = llvm::ConstantInt::get(getLLVMContext(), *Result); + } + return Arg; +} + +void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, + const CallExpr *E) { + constexpr const char *Tag = "amdgpu-as"; + + LLVMContext &Ctx = Inst->getContext(); + SmallVector<MMRAMetadata::TagT, 3> MMRAs; + for (unsigned K = 2; K < E->getNumArgs(); ++K) { + llvm::Value *V = EmitScalarExpr(E->getArg(K)); + StringRef AS; + if (llvm::getConstantStringInfo(V, AS)) { + MMRAs.push_back({Tag, AS}); + // TODO: Delete the resulting unused constant? + continue; + } + CGM.Error(E->getExprLoc(), + "expected an address space name as a string literal"); + } + + llvm::sort(MMRAs); + MMRAs.erase(llvm::unique(MMRAs), MMRAs.end()); + Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); +} + +Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; + llvm::SyncScope::ID SSID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_div_scale: + case AMDGPU::BI__builtin_amdgcn_div_scalef: { + // Translate from the intrinsics's struct return to the builtin's out + // argument. + + Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3)); + + llvm::Value *X = EmitScalarExpr(E->getArg(0)); + llvm::Value *Y = EmitScalarExpr(E->getArg(1)); + llvm::Value *Z = EmitScalarExpr(E->getArg(2)); + + llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale, + X->getType()); + + llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z}); + + llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0); + llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1); + + llvm::Type *RealFlagType = FlagOutPtr.getElementType(); + + llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType); + Builder.CreateStore(FlagExt, FlagOutPtr); + return Result; + } + case AMDGPU::BI__builtin_amdgcn_div_fmas: + case AMDGPU::BI__builtin_amdgcn_div_fmasf: { + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + + llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas, + Src0->getType()); + llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3); + return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool}); + } + + case AMDGPU::BI__builtin_amdgcn_ds_swizzle: + return emitBuiltinWithOneOverloadedType<2>(*this, E, + Intrinsic::amdgcn_ds_swizzle); + case AMDGPU::BI__builtin_amdgcn_mov_dpp8: + case AMDGPU::BI__builtin_amdgcn_mov_dpp: + case AMDGPU::BI__builtin_amdgcn_update_dpp: { + llvm::SmallVector<llvm::Value *, 6> Args; + // Find out if any arguments are required to be integer constant + // expressions. + unsigned ICEArguments = 0; + ASTContext::GetBuiltinTypeError Error; + getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); + assert(Error == ASTContext::GE_None && "Should not codegen an error"); + llvm::Type *DataTy = ConvertType(E->getArg(0)->getType()); + unsigned Size = DataTy->getPrimitiveSizeInBits(); + llvm::Type *IntTy = + llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u)); + Function *F = + CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8 + ? 
Intrinsic::amdgcn_mov_dpp8 + : Intrinsic::amdgcn_update_dpp, + IntTy); + assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 || + E->getNumArgs() == 2); + bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp; + if (InsertOld) + Args.push_back(llvm::PoisonValue::get(IntTy)); + for (unsigned I = 0; I != E->getNumArgs(); ++I) { + llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E); + if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) && + Size < 32) { + if (!DataTy->isIntegerTy()) + V = Builder.CreateBitCast( + V, llvm::IntegerType::get(Builder.getContext(), Size)); + V = Builder.CreateZExtOrBitCast(V, IntTy); + } + llvm::Type *ExpTy = + F->getFunctionType()->getFunctionParamType(I + InsertOld); + Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy)); + } + Value *V = Builder.CreateCall(F, Args); + if (Size < 32 && !DataTy->isIntegerTy()) + V = Builder.CreateTrunc( + V, llvm::IntegerType::get(Builder.getContext(), Size)); + return Builder.CreateTruncOrBitCast(V, DataTy); + } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: + return emitBuiltinWithOneOverloadedType<6>( + *this, E, + BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 + ? Intrinsic::amdgcn_permlane16 + : Intrinsic::amdgcn_permlanex16); + case AMDGPU::BI__builtin_amdgcn_permlane64: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_permlane64); + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBuiltinWithOneOverloadedType<2>(*this, E, + Intrinsic::amdgcn_readlane); + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_readfirstlane); + case AMDGPU::BI__builtin_amdgcn_div_fixup: + case AMDGPU::BI__builtin_amdgcn_div_fixupf: + case AMDGPU::BI__builtin_amdgcn_div_fixuph: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_div_fixup); + case AMDGPU::BI__builtin_amdgcn_trig_preop: + case AMDGPU::BI__builtin_amdgcn_trig_preopf: + return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop); + case AMDGPU::BI__builtin_amdgcn_rcp: + case AMDGPU::BI__builtin_amdgcn_rcpf: + case AMDGPU::BI__builtin_amdgcn_rcph: + case AMDGPU::BI__builtin_amdgcn_rcp_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp); + case AMDGPU::BI__builtin_amdgcn_sqrt: + case AMDGPU::BI__builtin_amdgcn_sqrtf: + case AMDGPU::BI__builtin_amdgcn_sqrth: + case AMDGPU::BI__builtin_amdgcn_sqrt_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_sqrt); + case AMDGPU::BI__builtin_amdgcn_rsq: + case AMDGPU::BI__builtin_amdgcn_rsqf: + case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); + case AMDGPU::BI__builtin_amdgcn_rsq_clamp: + case AMDGPU::BI__builtin_amdgcn_rsq_clampf: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_rsq_clamp); + case AMDGPU::BI__builtin_amdgcn_sinf: + case AMDGPU::BI__builtin_amdgcn_sinh: + case AMDGPU::BI__builtin_amdgcn_sin_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin); + case AMDGPU::BI__builtin_amdgcn_cosf: + case AMDGPU::BI__builtin_amdgcn_cosh: + case AMDGPU::BI__builtin_amdgcn_cos_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos); + case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: + return EmitAMDGPUDispatchPtr(*this, E); + case AMDGPU::BI__builtin_amdgcn_logf: + case 
AMDGPU::BI__builtin_amdgcn_log_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log); + case AMDGPU::BI__builtin_amdgcn_exp2f: + case AMDGPU::BI__builtin_amdgcn_exp2_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_exp2); + case AMDGPU::BI__builtin_amdgcn_log_clampf: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_log_clamp); + case AMDGPU::BI__builtin_amdgcn_ldexp: + case AMDGPU::BI__builtin_amdgcn_ldexpf: { + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = + CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()}); + return Builder.CreateCall(F, {Src0, Src1}); + } + case AMDGPU::BI__builtin_amdgcn_ldexph: { + // The raw instruction has a different behavior for out of bounds exponent + // values (implicit truncation instead of saturate to short_min/short_max). + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = + CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty}); + return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)}); + } + case AMDGPU::BI__builtin_amdgcn_frexp_mant: + case AMDGPU::BI__builtin_amdgcn_frexp_mantf: + case AMDGPU::BI__builtin_amdgcn_frexp_manth: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_frexp_mant); + case AMDGPU::BI__builtin_amdgcn_frexp_exp: + case AMDGPU::BI__builtin_amdgcn_frexp_expf: { + Value *Src0 = EmitScalarExpr(E->getArg(0)); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp, + { Builder.getInt32Ty(), Src0->getType() }); + return Builder.CreateCall(F, Src0); + } + case AMDGPU::BI__builtin_amdgcn_frexp_exph: { + Value *Src0 = EmitScalarExpr(E->getArg(0)); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp, + { Builder.getInt16Ty(), Src0->getType() }); + return Builder.CreateCall(F, Src0); + } + case AMDGPU::BI__builtin_amdgcn_fract: + case AMDGPU::BI__builtin_amdgcn_fractf: + case AMDGPU::BI__builtin_amdgcn_fracth: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_fract); + case AMDGPU::BI__builtin_amdgcn_lerp: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_lerp); + case AMDGPU::BI__builtin_amdgcn_ubfe: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_ubfe); + case AMDGPU::BI__builtin_amdgcn_sbfe: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_sbfe); + case AMDGPU::BI__builtin_amdgcn_ballot_w32: + case AMDGPU::BI__builtin_amdgcn_ballot_w64: { + llvm::Type *ResultType = ConvertType(E->getType()); + llvm::Value *Src = EmitScalarExpr(E->getArg(0)); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); + return Builder.CreateCall(F, { Src }); + } + case AMDGPU::BI__builtin_amdgcn_tanhf: + case AMDGPU::BI__builtin_amdgcn_tanhh: + case AMDGPU::BI__builtin_amdgcn_tanh_bf16: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::amdgcn_tanh); + case AMDGPU::BI__builtin_amdgcn_uicmp: + case AMDGPU::BI__builtin_amdgcn_uicmpl: + case AMDGPU::BI__builtin_amdgcn_sicmp: + case AMDGPU::BI__builtin_amdgcn_sicmpl: { + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + + // FIXME-GFX10: How should 32 bit mask be handled? 
+ Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp, + { Builder.getInt64Ty(), Src0->getType() }); + return Builder.CreateCall(F, { Src0, Src1, Src2 }); + } + case AMDGPU::BI__builtin_amdgcn_fcmp: + case AMDGPU::BI__builtin_amdgcn_fcmpf: { + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + + // FIXME-GFX10: How should 32 bit mask be handled? + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp, + { Builder.getInt64Ty(), Src0->getType() }); + return Builder.CreateCall(F, { Src0, Src1, Src2 }); + } + case AMDGPU::BI__builtin_amdgcn_class: + case AMDGPU::BI__builtin_amdgcn_classf: + case AMDGPU::BI__builtin_amdgcn_classh: + return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class); + case AMDGPU::BI__builtin_amdgcn_fmed3f: + case AMDGPU::BI__builtin_amdgcn_fmed3h: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_fmed3); + case AMDGPU::BI__builtin_amdgcn_ds_append: + case AMDGPU::BI__builtin_amdgcn_ds_consume: { + Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ? + Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume; + Value *Src0 = EmitScalarExpr(E->getArg(0)); + Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() }); + return Builder.CreateCall(F, { Src0, Builder.getFalse() }); + } + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: + IID = Intrinsic::amdgcn_global_load_tr_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case 
AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: + IID = Intrinsic::amdgcn_global_load_tr_b128; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_global_load_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_global_load_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_ds_load_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_ds_load_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: + IID = Intrinsic::amdgcn_ds_load_tr8_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: + IID = Intrinsic::amdgcn_ds_load_tr16_b128; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_ds_read_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: + IID = Intrinsic::amdgcn_ds_read_tr8_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_ds_read_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16: + IID = Intrinsic::amdgcn_ds_read_tr16_b64; + break; + } + llvm::Type *LoadTy = ConvertType(E->getType()); + llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); + llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); + return Builder.CreateCall(F, {Addr}); + } + case AMDGPU::BI__builtin_amdgcn_load_to_lds: { + // Should this have asan instrumentation? + return emitBuiltinWithOneOverloadedType<5>(*this, E, + Intrinsic::amdgcn_load_to_lds); + } + case AMDGPU::BI__builtin_amdgcn_get_fpenv: { + Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, + {llvm::Type::getInt64Ty(getLLVMContext())}); + return Builder.CreateCall(F); + } + case AMDGPU::BI__builtin_amdgcn_set_fpenv: { + Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv, + {llvm::Type::getInt64Ty(getLLVMContext())}); + llvm::Value *Env = EmitScalarExpr(E->getArg(0)); + return Builder.CreateCall(F, {Env}); + } + case AMDGPU::BI__builtin_amdgcn_read_exec: + return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); + case AMDGPU::BI__builtin_amdgcn_read_exec_lo: + return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false); + case AMDGPU::BI__builtin_amdgcn_read_exec_hi: + return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true); + case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray: + case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h: + case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l: + case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: { + llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0)); + llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1)); + llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2)); + llvm::Value *RayDir = EmitScalarExpr(E->getArg(3)); + llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4)); + llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5)); + + // The builtins take these arguments as vec4 where the last element is + // ignored. The intrinsic takes them as vec3. 
+ RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin, + {0, 1, 2}); + RayDir = + Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2}); + RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir, + {0, 1, 2}); + + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray, + {NodePtr->getType(), RayDir->getType()}); + return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir, + RayInverseDir, TextureDescr}); + } + case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray: + case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray: + IID = Intrinsic::amdgcn_image_bvh8_intersect_ray; + break; + case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: + IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray; + break; + } + llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0)); + llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1)); + llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2)); + llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3)); + llvm::Value *RayDir = EmitScalarExpr(E->getArg(4)); + llvm::Value *Offset = EmitScalarExpr(E->getArg(5)); + llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6)); + + Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7)); + Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8)); + + llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID); + + llvm::CallInst *CI = Builder.CreateCall( + IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir, + Offset, TextureDescr}); + + llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0); + llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1); + llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2); + + Builder.CreateStore(RetRayOrigin, RetRayOriginPtr); + Builder.CreateStore(RetRayDir, RetRayDirPtr); + + return RetVData; + } + + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn; + break; + } + + SmallVector<Value *, 4> Args; + for (int i = 0, e = E->getNumArgs(); i != e; ++i) + Args.push_back(EmitScalarExpr(E->getArg(i))); + + Function *F = CGM.getIntrinsic(IID); + Value *Call = Builder.CreateCall(F, Args); + Value *Rtn = Builder.CreateExtractValue(Call, 0); + Value *A = Builder.CreateExtractValue(Call, 1); + llvm::Type *RetTy = ConvertType(E->getType()); + Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn, + (uint64_t)0); + // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns + // <2 x i64>, zext the second value. 
+ if (A->getType()->getPrimitiveSizeInBits() < + RetTy->getScalarType()->getPrimitiveSizeInBits()) + A = Builder.CreateZExt(A, RetTy->getScalarType()); + + return Builder.CreateInsertElement(I0, A, 1); + } + case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4: + case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { + llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8); + Function *F = CGM.getIntrinsic( + BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 + ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4 + : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4, + {VT, VT}); + + SmallVector<Value *, 9> Args; + for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I) + Args.push_back(EmitScalarExpr(E->getArg(I))); + return Builder.CreateCall(F, Args); + } + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case 
AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + // GFX1250 WMMA builtins + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case 
AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: { + + // These operations perform a matrix multiplication and accumulation of + // the form: + // D = A * B + C + // We need to specify one type for matrices AB and one for matrices CD. + // Sparse matrix operations can have different types for A and B as well as + // an additional type for sparsity index. + // Destination type should be put before types used for source operands. + SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes; + // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. + // There is no need for the variable opsel argument, so always set it to + // "false". + bool AppendFalseForOpselArg = false; + unsigned BuiltinWMMAOp; + // Need return type when D and C are of different types. + bool NeedReturnType = false; + + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + AppendFalseForOpselArg = true; + [[fallthrough]]; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + AppendFalseForOpselArg = true; + [[fallthrough]]; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: + case 
AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; 
// CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; + break; + // GFX1250 WMMA builtins + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16: + NeedReturnType = true; + ArgsForMatchingMatrixTypes = {1, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8; + break; + case 
AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: + ArgsForMatchingMatrixTypes = {4, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4: + ArgsForMatchingMatrixTypes = {5, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: + ArgsForMatchingMatrixTypes = {3, 0, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8; + break; + case 
AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8; + break; + } + + SmallVector<Value *, 6> Args; + for (int i = 0, e = E->getNumArgs(); i != e; ++i) + Args.push_back(EmitScalarExpr(E->getArg(i))); + if (AppendFalseForOpselArg) + Args.push_back(Builder.getFalse()); + + SmallVector<llvm::Type *, 6> ArgTypes; + if (NeedReturnType) + ArgTypes.push_back(ConvertType(E->getType())); + for (auto ArgIdx : ArgsForMatchingMatrixTypes) + ArgTypes.push_back(Args[ArgIdx]->getType()); + + Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); + return Builder.CreateCall(F, Args); + } + // amdgcn workgroup size + case AMDGPU::BI__builtin_amdgcn_workgroup_size_x: + return EmitAMDGPUWorkGroupSize(*this, 0); + case AMDGPU::BI__builtin_amdgcn_workgroup_size_y: + return EmitAMDGPUWorkGroupSize(*this, 1); + case AMDGPU::BI__builtin_amdgcn_workgroup_size_z: + return EmitAMDGPUWorkGroupSize(*this, 2); + + // amdgcn grid size + case AMDGPU::BI__builtin_amdgcn_grid_size_x: + return EmitAMDGPUGridSize(*this, 0); + case AMDGPU::BI__builtin_amdgcn_grid_size_y: + return EmitAMDGPUGridSize(*this, 1); + case AMDGPU::BI__builtin_amdgcn_grid_size_z: + return EmitAMDGPUGridSize(*this, 2); + + // r600 intrinsics + case AMDGPU::BI__builtin_r600_recipsqrt_ieee: + case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: + return emitBuiltinWithOneOverloadedType<1>(*this, E, + Intrinsic::r600_recipsqrt_ieee); + case AMDGPU::BI__builtin_amdgcn_alignbit: { + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType()); + return Builder.CreateCall(F, { Src0, Src1, Src2 }); + } + case AMDGPU::BI__builtin_amdgcn_fence: { + ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), AO, SSID); + FenceInst *Fence = Builder.CreateFence(AO, SSID); + if (E->getNumArgs() > 2) + AddAMDGPUFenceAddressSpaceMMRA(Fence, E); + return Fence; + } + case AMDGPU::BI__builtin_amdgcn_atomic_inc32: + case AMDGPU::BI__builtin_amdgcn_atomic_inc64: + case AMDGPU::BI__builtin_amdgcn_atomic_dec32: + case AMDGPU::BI__builtin_amdgcn_atomic_dec64: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: + 
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: { + llvm::AtomicRMWInst::BinOp BinOp; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_atomic_inc32: + case AMDGPU::BI__builtin_amdgcn_atomic_inc64: + BinOp = llvm::AtomicRMWInst::UIncWrap; + break; + case AMDGPU::BI__builtin_amdgcn_atomic_dec32: + case AMDGPU::BI__builtin_amdgcn_atomic_dec64: + BinOp = llvm::AtomicRMWInst::UDecWrap; + break; + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: + BinOp = llvm::AtomicRMWInst::FAdd; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: + BinOp = llvm::AtomicRMWInst::FMin; + break; + case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: + BinOp = llvm::AtomicRMWInst::FMax; + break; + } + + Address Ptr = CheckAtomicAlignment(*this, E); + Value *Val = EmitScalarExpr(E->getArg(1)); + llvm::Type *OrigTy = Val->getType(); + QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType(); + + bool Volatile; + + if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf || + BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf || + BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) { + // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument + Volatile = + cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue(); + } else { + // Infer volatile from the passed type. + Volatile = + PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified(); + } + + if (E->getNumArgs() >= 4) { + // Some of the builtins have explicit ordering and scope arguments. + ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3)), AO, SSID); + } else { + // Most of the builtins do not have syncscope/order arguments. For DS + // atomics the scope doesn't really matter, as they implicitly operate at + // workgroup scope. + // + // The global/flat cases need to use agent scope to consistently produce + // the native instruction instead of a cmpxchg expansion. + SSID = getLLVMContext().getOrInsertSyncScopeID("agent"); + AO = AtomicOrdering::Monotonic; + + // The v2bf16 builtin uses i16 instead of a natural bfloat type. 
+      if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+          BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+          BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
+        llvm::Type *V2BF16Ty = FixedVectorType::get(
+            llvm::Type::getBFloatTy(Builder.getContext()), 2);
+        Val = Builder.CreateBitCast(Val, V2BF16Ty);
+      }
+    }
+
+    llvm::AtomicRMWInst *RMW =
+        Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
+    if (Volatile)
+      RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
+    return Builder.CreateBitCast(RMW, OrigTy);
+  }
+  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
+  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    llvm::Type *ResultType = ConvertType(E->getType());
+    // s_sendmsg_rtn is mangled using return type only.
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
+    return Builder.CreateCall(F, {Arg});
+  }
+  case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
+  case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
+    // Because builtin types are limited, and the intrinsic uses a struct/pair
+    // output, marshal the pair-of-i32 to <2 x i32>.
+    Value *VDstOld = EmitScalarExpr(E->getArg(0));
+    Value *VSrcOld = EmitScalarExpr(E->getArg(1));
+    Value *FI = EmitScalarExpr(E->getArg(2));
+    Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
+    Function *F =
+        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
+                             ? Intrinsic::amdgcn_permlane16_swap
+                             : Intrinsic::amdgcn_permlane32_swap);
+    llvm::CallInst *Call =
+        Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
+
+    llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
+    llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
+
+    llvm::Type *ResultType = ConvertType(E->getType());
+
+    llvm::Value *Insert0 = Builder.CreateInsertElement(
+        llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
+    llvm::Value *AsVector =
+        Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
+    return AsVector;
+  }
+  case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
+  case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
+    return emitBuiltinWithOneOverloadedType<4>(*this, E,
+                                               Intrinsic::amdgcn_bitop3);
+  case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
+    // TODO: LLVM has this overloaded to allow for fat pointers, but since
+    // those haven't been plumbed through to Clang yet, default to creating the
+    // resource type.
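+    // For illustration only (not part of the patch), the call built below is
+    // expected to take roughly the form
+    //   call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(
+    //       ptr %base, i16 %stride, i32 %numrecords, i32 %flags)
+    // with the exact name mangling depending on the base pointer's address
+    // space.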
+    SmallVector<Value *, 4> Args;
+    for (unsigned I = 0; I < 4; ++I)
+      Args.push_back(EmitScalarExpr(E->getArg(I)));
+    llvm::PointerType *RetTy = llvm::PointerType::get(
+        Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
+    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
+                                   {RetTy, Args[0]->getType()});
+    return Builder.CreateCall(F, Args);
+  }
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
+    return emitBuiltinWithOneOverloadedType<5>(
+        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
+  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
+    llvm::Type *RetTy = nullptr;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
+      RetTy = Int8Ty;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
+      RetTy = Int16Ty;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
+      RetTy = Int32Ty;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
+      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
+      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
+      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
+      break;
+    }
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
+    return Builder.CreateCall(
+        F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
+            EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
+  }
+  case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
+    return emitBuiltinWithOneOverloadedType<2>(
+        *this, E, Intrinsic::amdgcn_s_prefetch_data);
+  case Builtin::BIlogbf:
+  case Builtin::BI__builtin_logbf: {
+    Value *Src0 = EmitScalarExpr(E->getArg(0));
+    Function *FrExpFunc = CGM.getIntrinsic(
+        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
+    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
+    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
+    Value *Add = Builder.CreateAdd(
+        Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
+    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
+    Value *Fabs =
+        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
+    Value *FCmpONE = Builder.CreateFCmpONE(
+        Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
+    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
+    Value *FCmpOEQ =
+        Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
+    Value *Sel2 = Builder.CreateSelect(
+        FCmpOEQ,
+        ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
+    return Sel2;
+  }
+  case Builtin::BIlogb:
+  case Builtin::BI__builtin_logb: {
+    Value *Src0 = EmitScalarExpr(E->getArg(0));
+    Function *FrExpFunc = CGM.getIntrinsic(
+        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
+    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
+    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
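+    // frexp yields an exponent for a mantissa normalized into [0.5, 1),
+    // whereas logb is defined in terms of a mantissa in [1, 2), hence the
+    // nsw add of -1 below before converting the exponent to floating point.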
+    Value *Add = Builder.CreateAdd(
+        Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
+    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
+    Value *Fabs =
+        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
+    Value *FCmpONE = Builder.CreateFCmpONE(
+        Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
+    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
+    Value *FCmpOEQ =
+        Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
+    Value *Sel2 = Builder.CreateSelect(
+        FCmpOEQ,
+        ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
+        Sel1);
+    return Sel2;
+  }
+  case Builtin::BIscalbnf:
+  case Builtin::BI__builtin_scalbnf:
+  case Builtin::BIscalbn:
+  case Builtin::BI__builtin_scalbn:
+    return emitBinaryExpMaybeConstrainedFPBuiltin(
+        *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
+  default:
+    return nullptr;
+  }
+}
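As an illustrative sketch (not part of the patch; the variable names and metadata
numbering are assumptions), the atomic fadd lowering above is expected to turn a
call such as

    float old = __builtin_amdgcn_global_atomic_fadd_f32(p, v);  // p: global float*

into IR of roughly this shape:

    %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") monotonic, align 4,
               !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0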