//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {

// Has second type mangled argument.
static Value *
emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                       Intrinsic::ID IntrinsicID,
                                       Intrinsic::ID ConstrainedIntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
  if (CGF.Builder.getIsFPConstrained()) {
    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                       {Src0->getType(), Src1->getType()});
    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
  }

  Function *F =
      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// If \p E is not null pointer, insert address space cast to match return
// type of \p E if necessary.
Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
                             const CallExpr *E = nullptr) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
  if (!E)
    return Call;
  QualType BuiltinRetType = E->getType();
  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
  if (RetTy == Call->getType())
    return Call;
  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
}

Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
  return Call;
}

// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
/// Emit code based on Code Object ABI version.
/// COV_4    : Emit code to use dispatch ptr
/// COV_5+   : Emit code to use implicitarg ptr
/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
///            and use its value for COV_4 or COV_5+ approach. It is used for
///            compiling device libraries in an ABI-agnostic way.
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { llvm::LoadInst *LD; auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion; if (Cov == CodeObjectVersionKind::COV_None) { StringRef Name = "__oclc_ABI_version"; auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name); if (!ABIVersionC) ABIVersionC = new llvm::GlobalVariable( CGF.CGM.getModule(), CGF.Int32Ty, false, llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr, llvm::GlobalVariable::NotThreadLocal, CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant)); // This load will be eliminated by the IPSCCP because it is constant // weak_odr without externally_initialized. Either changing it to weak or // adding externally_initialized will keep the load. Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC, CGF.CGM.getIntAlign()); Value *IsCOV5 = CGF.Builder.CreateICmpSGE( ABIVersion, llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5)); // Indexing the implicit kernarg segment. Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32( CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2); // Indexing the HSA kernel_dispatch_packet struct. Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32( CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2); auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP); LD = CGF.Builder.CreateLoad( Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2))); } else { Value *GEP = nullptr; if (Cov >= CodeObjectVersionKind::COV_5) { // Indexing the implicit kernarg segment. GEP = CGF.Builder.CreateConstGEP1_32( CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2); } else { // Indexing the HSA kernel_dispatch_packet struct. GEP = CGF.Builder.CreateConstGEP1_32( CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2); } LD = CGF.Builder.CreateLoad( Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2))); } llvm::MDBuilder MDHelper(CGF.getLLVMContext()); llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1), APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1)); LD->setMetadata(llvm::LLVMContext::MD_range, RNode); LD->setMetadata(llvm::LLVMContext::MD_noundef, llvm::MDNode::get(CGF.getLLVMContext(), {})); LD->setMetadata(llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(CGF.getLLVMContext(), {})); return LD; } // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively. Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) { const unsigned XOffset = 12; auto *DP = EmitAMDGPUDispatchPtr(CGF); // Indexing the HSA kernel_dispatch_packet struct. auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4); auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset); auto *LD = CGF.Builder.CreateLoad( Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4))); llvm::MDBuilder MDB(CGF.getLLVMContext()); // Known non-zero. LD->setMetadata(llvm::LLVMContext::MD_range, MDB.createRange(APInt(32, 1), APInt::getZero(32))); LD->setMetadata(llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(CGF.getLLVMContext(), {})); return LD; } } // namespace // Generates the IR for __builtin_read_exec_*. // Lowers the builtin to amdgcn_ballot intrinsic. 
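// For the read_exec_hi variant the 64-bit ballot result is shifted right by
// 32 and truncated to i32, so only the upper half of the exec mask is
// returned.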
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
                                      llvm::Type *RegisterType,
                                      llvm::Type *ValueType, bool isExecHi) {
  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;

  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});

  if (isExecHi) {
    Value *Rt2 = Builder.CreateLShr(Call, 32);
    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
    return Rt2;
  }

  return Call;
}

// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                               unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using atomic C ABI, and writes
// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
// specific SyncScopeID and writes it to \p SSID.
void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
                                              llvm::AtomicOrdering &AO,
                                              llvm::SyncScope::ID &SSID) {
  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();

  // Map C11/C++11 memory ordering to LLVM memory ordering
  assert(llvm::isValidAtomicOrderingCABI(ord));
  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
  case llvm::AtomicOrderingCABI::acquire:
  case llvm::AtomicOrderingCABI::consume:
    AO = llvm::AtomicOrdering::Acquire;
    break;
  case llvm::AtomicOrderingCABI::release:
    AO = llvm::AtomicOrdering::Release;
    break;
  case llvm::AtomicOrderingCABI::acq_rel:
    AO = llvm::AtomicOrdering::AcquireRelease;
    break;
  case llvm::AtomicOrderingCABI::seq_cst:
    AO = llvm::AtomicOrdering::SequentiallyConsistent;
    break;
  case llvm::AtomicOrderingCABI::relaxed:
    AO = llvm::AtomicOrdering::Monotonic;
    break;
  }

  // Some of the atomic builtins take the scope as a string name.
  StringRef scp;
  if (llvm::getConstantStringInfo(Scope, scp)) {
    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
    return;
  }

  // Older builtins had an enum argument for the memory scope.
  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
  switch (scope) {
  case 0: // __MEMORY_SCOPE_SYSTEM
    SSID = llvm::SyncScope::System;
    break;
  case 1: // __MEMORY_SCOPE_DEVICE
    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
    break;
  case 2: // __MEMORY_SCOPE_WRKGRP
    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
    break;
  case 3: // __MEMORY_SCOPE_WVFRNT
    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
    break;
  case 4: // __MEMORY_SCOPE_SINGLE
    SSID = llvm::SyncScope::SingleThread;
    break;
  default:
    SSID = llvm::SyncScope::System;
    break;
  }
}

llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
                                                          unsigned Idx,
                                                          const CallExpr *E) {
  llvm::Value *Arg = nullptr;
  if ((ICEArguments & (1 << Idx)) == 0) {
    Arg = EmitScalarExpr(E->getArg(Idx));
  } else {
    // If this is required to be a constant, constant fold it so that we
    // know that the generated intrinsic gets a ConstantInt.
    std::optional<llvm::APSInt> Result =
        E->getArg(Idx)->getIntegerConstantExpr(getContext());
    assert(Result && "Expected argument to be a constant");
    Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
  }
  return Arg;
}

void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                                     const CallExpr *E) {
  constexpr const char *Tag = "amdgpu-as";

  LLVMContext &Ctx = Inst->getContext();
  SmallVector<MMRAMetadata::TagT, 3> MMRAs;
  for (unsigned K = 2; K < E->getNumArgs(); ++K) {
    llvm::Value *V = EmitScalarExpr(E->getArg(K));
    StringRef AS;
    if (llvm::getConstantStringInfo(V, AS)) {
      MMRAs.push_back({Tag, AS});
      // TODO: Delete the resulting unused constant?
      continue;
    }
    CGM.Error(E->getExprLoc(),
              "expected an address space name as a string literal");
  }

  llvm::sort(MMRAs);
  MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}

Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsics's struct return to the builtin's out
    // argument.

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Function *Callee =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_scale, X->getType());

    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    llvm::Type *RealFlagType = FlagOutPtr.getElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas, Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }
  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    // Find out if any arguments are required to be integer constant
    // expressions.
    unsigned ICEArguments = 0;
    ASTContext::GetBuiltinTypeError Error;
    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
    assert(Error == ASTContext::GE_None && "Should not codegen an error");
    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
    unsigned Size = DataTy->getPrimitiveSizeInBits();
    llvm::Type *IntTy =
        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8 ?
                             Intrinsic::amdgcn_mov_dpp8 :
                             Intrinsic::amdgcn_update_dpp,
                         IntTy);
    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
           E->getNumArgs() == 2);
    bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
    if (InsertOld)
      Args.push_back(llvm::PoisonValue::get(IntTy));
    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
          Size < 32) {
        if (!DataTy->isIntegerTy())
          V = Builder.CreateBitCast(
              V, llvm::IntegerType::get(Builder.getContext(), Size));
        V = Builder.CreateZExtOrBitCast(V, IntTy);
      }
      llvm::Type *ExpTy =
          F->getFunctionType()->getFunctionParamType(I + InsertOld);
      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
    }
    Value *V = Builder.CreateCall(F, Args);
    if (Size < 32 && !DataTy->isIntegerTy())
      V = Builder.CreateTrunc(
          V, llvm::IntegerType::get(Builder.getContext(), Size));
    return Builder.CreateTruncOrBitCast(V, DataTy);
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16:
  case AMDGPU::BI__builtin_amdgcn_permlanex16:
    return emitBuiltinWithOneOverloadedType<6>(
        *this, E,
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
            ? Intrinsic::amdgcn_permlane16
            : Intrinsic::amdgcn_permlanex16);
  case AMDGPU::BI__builtin_amdgcn_permlane64:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_permlane64);
  case AMDGPU::BI__builtin_amdgcn_readlane:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_readlane);
  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_readfirstlane);
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
  case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_sqrt:
  case AMDGPU::BI__builtin_amdgcn_sqrtf:
  case AMDGPU::BI__builtin_amdgcn_sqrth:
  case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_sqrt);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
  case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
  case AMDGPU::BI__builtin_amdgcn_sin_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
  case AMDGPU::BI__builtin_amdgcn_cos_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
    return EmitAMDGPUDispatchPtr(*this, E);
  case AMDGPU::BI__builtin_amdgcn_logf:
  case AMDGPU::BI__builtin_amdgcn_log_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
  case AMDGPU::BI__builtin_amdgcn_exp2f:
  case
AMDGPU::BI__builtin_amdgcn_exp2_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_exp2); case AMDGPU::BI__builtin_amdgcn_log_clampf: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log_clamp); case AMDGPU::BI__builtin_amdgcn_ldexp: case AMDGPU::BI__builtin_amdgcn_ldexpf: { llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Function *F = CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()}); return Builder.CreateCall(F, {Src0, Src1}); } case AMDGPU::BI__builtin_amdgcn_ldexph: { // The raw instruction has a different behavior for out of bounds exponent // values (implicit truncation instead of saturate to short_min/short_max). llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Function *F = CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty}); return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)}); } case AMDGPU::BI__builtin_amdgcn_frexp_mant: case AMDGPU::BI__builtin_amdgcn_frexp_mantf: case AMDGPU::BI__builtin_amdgcn_frexp_manth: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_frexp_mant); case AMDGPU::BI__builtin_amdgcn_frexp_exp: case AMDGPU::BI__builtin_amdgcn_frexp_expf: { Value *Src0 = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp, { Builder.getInt32Ty(), Src0->getType() }); return Builder.CreateCall(F, Src0); } case AMDGPU::BI__builtin_amdgcn_frexp_exph: { Value *Src0 = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp, { Builder.getInt16Ty(), Src0->getType() }); return Builder.CreateCall(F, Src0); } case AMDGPU::BI__builtin_amdgcn_fract: case AMDGPU::BI__builtin_amdgcn_fractf: case AMDGPU::BI__builtin_amdgcn_fracth: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_fract); case AMDGPU::BI__builtin_amdgcn_lerp: return emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::amdgcn_lerp); case AMDGPU::BI__builtin_amdgcn_ubfe: return emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::amdgcn_ubfe); case AMDGPU::BI__builtin_amdgcn_sbfe: return emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::amdgcn_sbfe); case AMDGPU::BI__builtin_amdgcn_ballot_w32: case AMDGPU::BI__builtin_amdgcn_ballot_w64: { llvm::Type *ResultType = ConvertType(E->getType()); llvm::Value *Src = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); return Builder.CreateCall(F, { Src }); } case AMDGPU::BI__builtin_amdgcn_tanhf: case AMDGPU::BI__builtin_amdgcn_tanhh: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_tanh); case AMDGPU::BI__builtin_amdgcn_uicmp: case AMDGPU::BI__builtin_amdgcn_uicmpl: case AMDGPU::BI__builtin_amdgcn_sicmp: case AMDGPU::BI__builtin_amdgcn_sicmpl: { llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); // FIXME-GFX10: How should 32 bit mask be handled? 
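    // The icmp intrinsic is overloaded on both the result (mask) type and the
    // source operand type; these builtins currently always request a 64-bit
    // mask.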
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp, { Builder.getInt64Ty(), Src0->getType() }); return Builder.CreateCall(F, { Src0, Src1, Src2 }); } case AMDGPU::BI__builtin_amdgcn_fcmp: case AMDGPU::BI__builtin_amdgcn_fcmpf: { llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); // FIXME-GFX10: How should 32 bit mask be handled? Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp, { Builder.getInt64Ty(), Src0->getType() }); return Builder.CreateCall(F, { Src0, Src1, Src2 }); } case AMDGPU::BI__builtin_amdgcn_class: case AMDGPU::BI__builtin_amdgcn_classf: case AMDGPU::BI__builtin_amdgcn_classh: return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class); case AMDGPU::BI__builtin_amdgcn_fmed3f: case AMDGPU::BI__builtin_amdgcn_fmed3h: return emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::amdgcn_fmed3); case AMDGPU::BI__builtin_amdgcn_ds_append: case AMDGPU::BI__builtin_amdgcn_ds_consume: { Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ? Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume; Value *Src0 = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() }); return Builder.CreateCall(F, { Src0, Builder.getFalse() }); } case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: { Intrinsic::ID IID; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: case 
AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: IID = Intrinsic::amdgcn_global_load_tr_b128; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: IID = Intrinsic::amdgcn_global_load_tr4_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: IID = Intrinsic::amdgcn_global_load_tr6_b96; break; case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: IID = Intrinsic::amdgcn_ds_load_tr4_b64; break; case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: IID = Intrinsic::amdgcn_ds_load_tr6_b96; break; case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: IID = Intrinsic::amdgcn_ds_load_tr8_b64; break; case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: IID = Intrinsic::amdgcn_ds_load_tr16_b128; break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: IID = Intrinsic::amdgcn_ds_read_tr4_b64; break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: IID = Intrinsic::amdgcn_ds_read_tr8_b64; break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: IID = Intrinsic::amdgcn_ds_read_tr6_b96; break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16: IID = Intrinsic::amdgcn_ds_read_tr16_b64; break; } llvm::Type *LoadTy = ConvertType(E->getType()); llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr}); } case AMDGPU::BI__builtin_amdgcn_load_to_lds: { // Should this have asan instrumentation? return emitBuiltinWithOneOverloadedType<5>(*this, E, Intrinsic::amdgcn_load_to_lds); } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); return Builder.CreateCall(F); } case AMDGPU::BI__builtin_amdgcn_set_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_hi: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true); case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray: case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h: case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l: case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: { llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0)); llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1)); llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2)); llvm::Value *RayDir = EmitScalarExpr(E->getArg(3)); llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4)); llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5)); // The builtins take these arguments as vec4 where the last element is // ignored. The intrinsic takes them as vec3. 
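    // A shufflevector with mask <0, 1, 2> is used below to drop the unused
    // fourth lane from each vec4 argument.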
    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin, {0, 1, 2});
    RayDir = Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
    RayInverseDir =
        Builder.CreateShuffleVector(RayInverseDir, RayInverseDir, {0, 1, 2});

    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                  RayInverseDir, TextureDescr});
  }
  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
      break;
    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
      break;
    }
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

    llvm::CallInst *CI = Builder.CreateCall(
        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
                        Offset, TextureDescr});

    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
    Builder.CreateStore(RetRayDir, RetRayDirPtr);

    return RetVData;
  }
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
      break;
    }
    SmallVector<Value *, 4> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));

    Function *F = CGM.getIntrinsic(IID);
    Value *Call = Builder.CreateCall(F, Args);
    Value *Rtn = Builder.CreateExtractValue(Call, 0);
    Value *A = Builder.CreateExtractValue(Call, 1);
    llvm::Type *RetTy = ConvertType(E->getType());
    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                            (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, zext the second value.
    if (A->getType()->getPrimitiveSizeInBits() <
        RetTy->getScalarType()->getPrimitiveSizeInBits())
      A = Builder.CreateZExt(A, RetTy->getScalarType());

    return Builder.CreateInsertElement(I0, A, 1);
  }
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
    llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
        {VT, VT});

    SmallVector<Value *, 9> Args;
    for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
  case
AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: // GFX1250 WMMA builtins case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16: case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4: case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16: case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16: case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: { // These operations perform a matrix multiplication and accumulation of // the form: // D = A * B + C // We need to specify one type for matrices AB and 
    // one for matrices CD.
    // Sparse matrix operations can have different types for A and B as well as
    // an additional type for sparsity index.
    // Destination type should be put before types used for source operands.
    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
    // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
    // There is no need for the variable opsel argument, so always set it to
    // "false".
    bool AppendFalseForOpselArg = false;
    unsigned BuiltinWMMAOp;
    // Need return type when D and C are of different types.
    bool NeedReturnType = false;

    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp =
Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; break; case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; break; case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; break; case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; break; case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = 
Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; break; // GFX1250 WMMA builtins case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32: ArgsForMatchingMatrixTypes = {5, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16: ArgsForMatchingMatrixTypes = {5, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16: ArgsForMatchingMatrixTypes = {5, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16: ArgsForMatchingMatrixTypes = {5, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16: ArgsForMatchingMatrixTypes = {5, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16: NeedReturnType = true; ArgsForMatchingMatrixTypes = {1, 5}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8: ArgsForMatchingMatrixTypes = {3, 0}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8: ArgsForMatchingMatrixTypes = {3, 0}; 
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
      ArgsForMatchingMatrixTypes = {4, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
      ArgsForMatchingMatrixTypes = {5, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
      ArgsForMatchingMatrixTypes = {3, 0, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
      break;
    }

    SmallVector<Value *, 6> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    if (AppendFalseForOpselArg)
      Args.push_back(Builder.getFalse());

    SmallVector<llvm::Type *, 6> ArgTypes;
    if (NeedReturnType)
ArgTypes.push_back(ConvertType(E->getType())); for (auto ArgIdx : ArgsForMatchingMatrixTypes) ArgTypes.push_back(Args[ArgIdx]->getType()); Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } // amdgcn workgroup size case AMDGPU::BI__builtin_amdgcn_workgroup_size_x: return EmitAMDGPUWorkGroupSize(*this, 0); case AMDGPU::BI__builtin_amdgcn_workgroup_size_y: return EmitAMDGPUWorkGroupSize(*this, 1); case AMDGPU::BI__builtin_amdgcn_workgroup_size_z: return EmitAMDGPUWorkGroupSize(*this, 2); // amdgcn grid size case AMDGPU::BI__builtin_amdgcn_grid_size_x: return EmitAMDGPUGridSize(*this, 0); case AMDGPU::BI__builtin_amdgcn_grid_size_y: return EmitAMDGPUGridSize(*this, 1); case AMDGPU::BI__builtin_amdgcn_grid_size_z: return EmitAMDGPUGridSize(*this, 2); // r600 intrinsics case AMDGPU::BI__builtin_r600_recipsqrt_ieee: case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::r600_recipsqrt_ieee); case AMDGPU::BI__builtin_amdgcn_alignbit: { llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType()); return Builder.CreateCall(F, { Src0, Src1, Src2 }); } case AMDGPU::BI__builtin_amdgcn_fence: { ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), AO, SSID); FenceInst *Fence = Builder.CreateFence(AO, SSID); if (E->getNumArgs() > 2) AddAMDGPUFenceAddressSpaceMMRA(Fence, E); return Fence; } case AMDGPU::BI__builtin_amdgcn_atomic_inc32: case AMDGPU::BI__builtin_amdgcn_atomic_inc64: case AMDGPU::BI__builtin_amdgcn_atomic_dec32: case AMDGPU::BI__builtin_amdgcn_atomic_dec64: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: case AMDGPU::BI__builtin_amdgcn_ds_faddf: case AMDGPU::BI__builtin_amdgcn_ds_fminf: case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: { llvm::AtomicRMWInst::BinOp BinOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_atomic_inc32: case AMDGPU::BI__builtin_amdgcn_atomic_inc64: BinOp = llvm::AtomicRMWInst::UIncWrap; break; case AMDGPU::BI__builtin_amdgcn_atomic_dec32: case AMDGPU::BI__builtin_amdgcn_atomic_dec64: BinOp = llvm::AtomicRMWInst::UDecWrap; break; case AMDGPU::BI__builtin_amdgcn_ds_faddf: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: case 
  AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
      BinOp = llvm::AtomicRMWInst::FAdd;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
      BinOp = llvm::AtomicRMWInst::FMin;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
      BinOp = llvm::AtomicRMWInst::FMax;
      break;
    }

    Address Ptr = CheckAtomicAlignment(*this, E);
    Value *Val = EmitScalarExpr(E->getArg(1));
    llvm::Type *OrigTy = Val->getType();
    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();

    bool Volatile;

    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
      // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
      Volatile =
          cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
    } else {
      // Infer volatile from the passed type.
      Volatile =
          PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
    }

    if (E->getNumArgs() >= 4) {
      // Some of the builtins have explicit ordering and scope arguments.
      ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                              EmitScalarExpr(E->getArg(3)), AO, SSID);
    } else {
      // Most of the builtins do not have syncscope/order arguments. For DS
      // atomics the scope doesn't really matter, as they implicitly operate at
      // workgroup scope.
      //
      // The global/flat cases need to use agent scope to consistently produce
      // the native instruction instead of a cmpxchg expansion.
      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
      AO = AtomicOrdering::Monotonic;

      // The v2bf16 builtin uses i16 instead of a natural bfloat type.
      if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
        llvm::Type *V2BF16Ty = FixedVectorType::get(
            llvm::Type::getBFloatTy(Builder.getContext()), 2);
        Val = Builder.CreateBitCast(Val, V2BF16Ty);
      }
    }

    llvm::AtomicRMWInst *RMW =
        Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
    if (Volatile)
      RMW->setVolatile(true);

    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
      // Most targets require "amdgpu.no.fine.grained.memory" to emit the
      // native instruction for flat and global operations.
      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
      // instruction, but this only matters for float fadd.
      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
    }

    return Builder.CreateBitCast(RMW, OrigTy);
  }
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    llvm::Type *ResultType = ConvertType(E->getType());
    // s_sendmsg_rtn is mangled using return type only.
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
    return Builder.CreateCall(F, {Arg});
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
  case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited, and the intrinsic uses a struct/pair
    // output, marshal the pair-of-i32 to <2 x i32>.
    Value *VDstOld = EmitScalarExpr(E->getArg(0));
    Value *VSrcOld = EmitScalarExpr(E->getArg(1));
    Value *FI = EmitScalarExpr(E->getArg(2));
    Value *BoundCtrl = EmitScalarExpr(E->getArg(3));

    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
                             ? Intrinsic::amdgcn_permlane16_swap
                             : Intrinsic::amdgcn_permlane32_swap);
    llvm::CallInst *Call =
        Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});

    llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
    llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);

    llvm::Type *ResultType = ConvertType(E->getType());

    llvm::Value *Insert0 = Builder.CreateInsertElement(
        llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
    llvm::Value *AsVector =
        Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
    return AsVector;
  }
  case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
  case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
    return emitBuiltinWithOneOverloadedType<4>(*this, E,
                                               Intrinsic::amdgcn_bitop3);
  case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
    // TODO: LLVM has this overloaded to allow for fat pointers, but since
    // those haven't been plumbed through to Clang yet, default to creating the
    // resource type.
    SmallVector<Value *, 4> Args;
    for (unsigned I = 0; I < 4; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    llvm::PointerType *RetTy = llvm::PointerType::get(
        Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
                                   {RetTy, Args[0]->getType()});
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
    llvm::Type *RetTy = nullptr;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
      RetTy = Int8Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
      RetTy = Int16Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
      RetTy = Int32Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
      break;
    }
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
    return Builder.CreateCall(
        F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
            EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
  }
  case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
    return
emitBuiltinWithOneOverloadedType<2>( *this, E, Intrinsic::amdgcn_s_prefetch_data); case Builtin::BIlogbf: case Builtin::BI__builtin_logbf: { Value *Src0 = EmitScalarExpr(E->getArg(0)); Function *FrExpFunc = CGM.getIntrinsic( Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()}); CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0); Value *Exp = Builder.CreateExtractValue(FrExp, 1); Value *Add = Builder.CreateAdd( Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true); Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy()); Value *Fabs = emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs); Value *FCmpONE = Builder.CreateFCmpONE( Fabs, ConstantFP::getInfinity(Builder.getFloatTy())); Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs); Value *FCmpOEQ = Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy())); Value *Sel2 = Builder.CreateSelect( FCmpOEQ, ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1); return Sel2; } case Builtin::BIlogb: case Builtin::BI__builtin_logb: { Value *Src0 = EmitScalarExpr(E->getArg(0)); Function *FrExpFunc = CGM.getIntrinsic( Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()}); CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0); Value *Exp = Builder.CreateExtractValue(FrExp, 1); Value *Add = Builder.CreateAdd( Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true); Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy()); Value *Fabs = emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs); Value *FCmpONE = Builder.CreateFCmpONE( Fabs, ConstantFP::getInfinity(Builder.getDoubleTy())); Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs); Value *FCmpOEQ = Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy())); Value *Sel2 = Builder.CreateSelect( FCmpOEQ, ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true), Sel1); return Sel2; } case Builtin::BIscalbnf: case Builtin::BI__builtin_scalbnf: case Builtin::BIscalbn: case Builtin::BI__builtin_scalbn: return emitBinaryExpMaybeConstrainedFPBuiltin( *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp); default: return nullptr; } }