diff options
Diffstat (limited to 'llvm/lib')
155 files changed, 3954 insertions, 1954 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 759c553..2d52f34 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1373,7 +1373,7 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst, if (ConstantFP *CFP = dyn_cast<ConstantFP>(Operand)) return flushDenormalConstantFP(CFP, Inst, IsOutput); - if (isa<ConstantAggregateZero, UndefValue, ConstantExpr>(Operand)) + if (isa<ConstantAggregateZero, UndefValue>(Operand)) return Operand; Type *Ty = Operand->getType(); @@ -1389,6 +1389,9 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst, Ty = VecTy->getElementType(); } + if (isa<ConstantExpr>(Operand)) + return Operand; + if (const auto *CV = dyn_cast<ConstantVector>(Operand)) { SmallVector<Constant *, 16> NewElts; for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { @@ -2628,14 +2631,14 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_ceil_d: return ConstantFoldFP( ceil, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_fabs_ftz: case Intrinsic::nvvm_fabs: return ConstantFoldFP( fabs, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_floor_ftz_f: @@ -2643,7 +2646,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_floor_d: return ConstantFoldFP( floor, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_rcp_rm_ftz_f: @@ -2705,7 +2708,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return nullptr; return ConstantFoldFP( sqrt, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); // AMDGCN Intrinsics: diff --git 
a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 0990a0d..61a575c 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2682,6 +2682,21 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, return getAddExpr(NewOps, PreservedFlags); } } + + // Try to push the constant operand into a ZExt: A + zext (-A + B) -> zext + // (B), if trunc (A) + -A + B does not unsigned-wrap. + if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Ops[1])) { + const SCEV *B = ZExt->getOperand(0); + const SCEV *NarrowA = getTruncateExpr(A, B->getType()); + if (isa<SCEVAddExpr>(B) && + NarrowA == getNegativeSCEV(cast<SCEVAddExpr>(B)->getOperand(0)) && + getZeroExtendExpr(NarrowA, ZExt->getType()) == A && + hasFlags(StrengthenNoWrapFlags(this, scAddExpr, {NarrowA, B}, + SCEV::FlagAnyWrap), + SCEV::FlagNUW)) { + return getZeroExtendExpr(getAddExpr(NarrowA, B), ZExt->getType()); + } + } } // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y) diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index c871070..7025b83 100644 --- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const { Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } @@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const { Result.TBAA = Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return 
Result; } diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp index 15107c2..2e4063f 100644 --- a/llvm/lib/Analysis/UniformityAnalysis.cpp +++ b/llvm/lib/Analysis/UniformityAnalysis.cpp @@ -178,6 +178,7 @@ bool UniformityInfoWrapperPass::runOnFunction(Function &F) { void UniformityInfoWrapperPass::print(raw_ostream &OS, const Module *) const { OS << "UniformityInfo for function '" << m_function->getName() << "':\n"; + m_uniformityInfo.print(OS); } void UniformityInfoWrapperPass::releaseMemory() { diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 1b3da59..e9cf2ee 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; @@ -240,30 +246,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI, return Intrinsic::not_intrinsic; } -struct InterleaveIntrinsic { - Intrinsic::ID Interleave, Deinterleave; -}; - -static InterleaveIntrinsic 
InterleaveIntrinsics[] = { - {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2}, - {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3}, - {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4}, - {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5}, - {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6}, - {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7}, - {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8}, -}; - -Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - return InterleaveIntrinsics[Factor - 2].Interleave; -} - -Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - return InterleaveIntrinsics[Factor - 2].Deinterleave; -} - unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) { switch (ID) { case Intrinsic::vector_interleave2: diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp index 3b436af..f1765d7 100644 --- a/llvm/lib/BinaryFormat/SFrame.cpp +++ b/llvm/lib/BinaryFormat/SFrame.cpp @@ -35,3 +35,36 @@ ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() { }; return ArrayRef(ABIs); } + +ArrayRef<EnumEntry<sframe::FREType>> sframe::getFRETypes() { + static constexpr EnumEntry<sframe::FREType> FRETypes[] = { +#define HANDLE_SFRAME_FRE_TYPE(CODE, NAME) {#NAME, sframe::FREType::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FRETypes); +} + +ArrayRef<EnumEntry<sframe::FDEType>> sframe::getFDETypes() { + static constexpr EnumEntry<sframe::FDEType> FDETypes[] = { +#define HANDLE_SFRAME_FDE_TYPE(CODE, NAME) {#NAME, sframe::FDEType::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FDETypes); +} + +ArrayRef<EnumEntry<sframe::AArch64PAuthKey>> sframe::getAArch64PAuthKeys() { + static constexpr EnumEntry<sframe::AArch64PAuthKey> 
AArch64PAuthKeys[] = { +#define HANDLE_SFRAME_AARCH64_PAUTH_KEY(CODE, NAME) \ + {#NAME, sframe::AArch64PAuthKey::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(AArch64PAuthKeys); +} + +ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() { + static constexpr EnumEntry<sframe::FREOffset> FREOffsets[] = { +#define HANDLE_SFRAME_FRE_OFFSET(CODE, NAME) {#NAME, sframe::FREOffset::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FREOffsets); +} diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index a7c99b1..dcfd9aa 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2103,8 +2103,9 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { DI->eraseFromParent(); return; } - - DI->setDebugValueUndef(); + // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef. + if (!DI->isDebugLabel()) + DI->setDebugValueUndef(); DI->moveBefore(&*Loc); }; diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 9512f79..810dc29 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -101,6 +101,7 @@ CGOPT(EABI, EABIVersion) CGOPT(DebuggerKind, DebuggerTuningOpt) CGOPT(bool, EnableStackSizeSection) CGOPT(bool, EnableAddrsig) +CGOPT(bool, EnableCallGraphSection) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableStaticDataPartitioning) @@ -461,6 +462,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableAddrsig); + static cl::opt<bool> EnableCallGraphSection( + "call-graph-section", cl::desc("Emit a call graph section"), + cl::init(false)); + CGBINDOPT(EnableCallGraphSection); + static cl::opt<bool> EmitCallSiteInfo( "emit-call-site-info", cl::desc( @@ -595,6 +601,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.EnableMachineFunctionSplitter = 
getEnableMachineFunctionSplitter(); Options.EnableStaticDataPartitioning = getEnableStaticDataPartitioning(); Options.EmitAddrsig = getEnableAddrsig(); + Options.EmitCallGraphSection = getEnableCallGraphSection(); Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 8855740f..9b2851e 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, llvm_unreachable("Deinterleave node should already have ReplacementNode"); break; case ComplexDeinterleavingOperation::Splat: { - auto *NewTy = VectorType::getDoubleElementsVectorType( - cast<VectorType>(Node->Real->getType())); auto *R = dyn_cast<Instruction>(Node->Real); auto *I = dyn_cast<Instruction>(Node->Imag); if (R && I) { // Splats that are not constant are interleaved where they are located Instruction *InsertPoint = (I->comesBefore(R) ? 
R : I)->getNextNode(); IRBuilder<> IRB(InsertPoint); - ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2, - NewTy, {Node->Real, Node->Imag}); + ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag}); } else { - ReplacementNode = Builder.CreateIntrinsic( - Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag}); + ReplacementNode = + Builder.CreateVectorInterleave({Node->Real, Node->Imag}); } break; } @@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0); auto *A = replaceNode(Builder, Node->Operands[0]); auto *B = replaceNode(Builder, Node->Operands[1]); - auto *NewMaskTy = VectorType::getDoubleElementsVectorType( - cast<VectorType>(MaskReal->getType())); - auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, - NewMaskTy, {MaskReal, MaskImag}); + auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag}); ReplacementNode = Builder.CreateSelect(NewMask, A, B); break; } @@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle( } if (!NewInit) - NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {Init, Constant::getNullValue(VTy)}); + NewInit = + Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); @@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation( auto *OldPHIImag = ReductionInfo[Imag].first; auto *NewPHI = OldToNewPHI[OldPHIReal]; - auto *VTy = cast<VectorType>(Real->getType()); - auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - // We have to interleave initial origin values coming from IncomingBlock Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming); Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming); IRBuilder<> Builder(Incoming->getTerminator()); - auto *NewInit = 
Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {InitReal, InitImag}); + auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 012d873..9ba1782 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1009,7 +1009,8 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, for (unsigned I = 0; I < NumValues; ++I) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]); + MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy, + Offsets[I]); auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, MRI.getType(VRegs[I]), commonAlignment(BaseAlign, Offsets[I])); @@ -1039,7 +1040,8 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, for (unsigned I = 0; I < NumValues; ++I) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]); + MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy, + Offsets[I]); auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, MRI.getType(VRegs[I]), commonAlignment(BaseAlign, Offsets[I])); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8f513a..e84ba91 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, const TargetOptions &Options = MF->getTarget().Options; LLT DstType = MRI.getType(MI.getOperand(0).getReg()); - if (CanReassociate && - !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc))) + if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc)) 
return false; // Floating-point multiply-add with intermediate rounding. @@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, if (!HasFMAD && !HasFMA) return false; - AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD; + AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD; // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) return false; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index dc5dfab..fd38c30 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1409,7 +1409,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr; for (unsigned i = 0; i < Regs.size(); ++i) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8); Align BaseAlign = getMemOpAlign(LI); @@ -1448,7 +1448,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { for (unsigned i = 0; i < Vals.size(); ++i) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8); Align BaseAlign = getMemOpAlign(SI); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index ed7b07f..d9d3569 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4170,7 +4170,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad 
&LoadMI) { auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); - auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst); + auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrAddReg, PtrReg, OffsetCst); auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy, SmallPtr, *SmallMMO); @@ -4277,8 +4277,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) { LLT PtrTy = MRI.getType(PtrReg); auto OffsetCst = MIRBuilder.buildConstant( LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); - auto SmallPtr = - MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); + auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst); MachineMemOperand *LargeMMO = MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); @@ -5349,7 +5348,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, unsigned ByteOffset = Offset / 8; Register NewAddrReg; - MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset); + MIRBuilder.materializeObjectPtrOffset(NewAddrReg, AddrReg, OffsetTy, + ByteOffset); MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, ByteOffset, PartTy); @@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. 
return UnableToLegalize; - if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) { + if (MI.getFlag(MachineInstr::FmAfn)) { unsigned Flags = MI.getFlags(); auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags); MIRBuilder.buildFPTrunc(Dst, Src32, Flags); @@ -9822,7 +9822,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val, if (DstOff != 0) { auto Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); - Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + Ptr = MIB.buildObjectPtrOffset(PtrTy, Dst, Offset).getReg(0); } MIB.buildStore(Value, Ptr, *StoreMMO); @@ -9962,7 +9962,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, LLT SrcTy = MRI.getType(Src); Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset) .getReg(0); - LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0); } auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); @@ -9970,7 +9970,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, Register StorePtr = Dst; if (CurrOffset != 0) { LLT DstTy = MRI.getType(Dst); - StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LdVal, StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); @@ -10060,7 +10060,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, LLT SrcTy = MRI.getType(Src); auto Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset); - LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0); } LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); CurrOffset += CopyTy.getSizeInBytes(); @@ -10078,7 +10078,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, LLT DstTy = MRI.getType(Dst); auto Offset = 
MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset); - StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 121d7e8..27df7e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -208,11 +208,20 @@ MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0, return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1}, Flags); } +MachineInstrBuilder MachineIRBuilder::buildObjectPtrOffset(const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { + return buildPtrAdd(Res, Op0, Op1, + MachineInstr::MIFlag::NoUWrap | + MachineInstr::MIFlag::InBounds); +} + std::optional<MachineInstrBuilder> MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0, - const LLT ValueTy, uint64_t Value) { + const LLT ValueTy, uint64_t Value, + std::optional<unsigned> Flags) { assert(Res == 0 && "Res is a result argument"); - assert(ValueTy.isScalar() && "invalid offset type"); + assert(ValueTy.isScalar() && "invalid offset type"); if (Value == 0) { Res = Op0; @@ -221,7 +230,14 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0, Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0)); auto Cst = buildConstant(ValueTy, Value); - return buildPtrAdd(Res, Op0, Cst.getReg(0)); + return buildPtrAdd(Res, Op0, Cst.getReg(0), Flags); +} + +std::optional<MachineInstrBuilder> MachineIRBuilder::materializeObjectPtrOffset( + Register &Res, Register Op0, const LLT ValueTy, uint64_t Value) { + return materializePtrAdd(Res, Op0, ValueTy, Value, + MachineInstr::MIFlag::NoUWrap | + MachineInstr::MIFlag::InBounds); } MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res, diff --git 
a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 7153902..8b72c29 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -217,6 +217,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("nneg", MIToken::kw_nneg) .Case("disjoint", MIToken::kw_disjoint) .Case("samesign", MIToken::kw_samesign) + .Case("inbounds", MIToken::kw_inbounds) .Case("nofpexcept", MIToken::kw_nofpexcept) .Case("unpredictable", MIToken::kw_unpredictable) .Case("debug-location", MIToken::kw_debug_location) @@ -616,6 +617,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { .Case("!range", MIToken::md_range) .Case("!DIExpression", MIToken::md_diexpr) .Case("!DILocation", MIToken::md_dilocation) + .Case("!noalias.addrspace", MIToken::md_noalias_addrspace) .Default(MIToken::Error); } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index d7cd067..0627f17 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -78,6 +78,7 @@ struct MIToken { kw_nneg, kw_disjoint, kw_samesign, + kw_inbounds, kw_debug_location, kw_debug_instr_number, kw_dbg_instr_ref, @@ -151,6 +152,7 @@ struct MIToken { md_tbaa, md_alias_scope, md_noalias, + md_noalias_addrspace, md_range, md_diexpr, md_dilocation, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 3a364d5..6a464d9 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1477,7 +1477,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Token.is(MIToken::kw_nneg) || Token.is(MIToken::kw_disjoint) || Token.is(MIToken::kw_nusw) || - Token.is(MIToken::kw_samesign)) { + Token.is(MIToken::kw_samesign) || + Token.is(MIToken::kw_inbounds)) { // clang-format on // Mine frame and fast math flags if (Token.is(MIToken::kw_frame_setup)) @@ -1518,6 +1519,8 
@@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Flags |= MachineInstr::NoUSWrap; if (Token.is(MIToken::kw_samesign)) Flags |= MachineInstr::SameSign; + if (Token.is(MIToken::kw_inbounds)) + Flags |= MachineInstr::InBounds; lex(); } @@ -3482,6 +3485,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (parseMDNode(AAInfo.NoAlias)) return true; break; + case MIToken::md_noalias_addrspace: + lex(); + if (parseMDNode(AAInfo.NoAliasAddrSpace)) + return true; + break; case MIToken::md_range: lex(); if (parseMDNode(Range)) @@ -3490,7 +3498,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { // TODO: Report an error on duplicate metadata nodes. default: return error("expected 'align' or '!tbaa' or '!alias.scope' or " - "'!noalias' or '!range'"); + "'!noalias' or '!range' or '!noalias.addrspace'"); } } if (expectAndConsume(MIToken::rparen)) diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 1e9fcf3..3e99e57 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -504,13 +504,21 @@ bool MIRParserImpl::initializeCallSiteInfo( return error(Error, ArgRegPair.Reg.SourceRange); CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo); } + if (!YamlCSInfo.CalleeTypeIds.empty()) { + for (auto CalleeTypeId : YamlCSInfo.CalleeTypeIds) { + IntegerType *Int64Ty = Type::getInt64Ty(Context); + CSInfo.CalleeTypeIds.push_back(ConstantInt::get(Int64Ty, CalleeTypeId, + /*isSigned=*/false)); + } + } - if (TM.Options.EmitCallSiteInfo) + if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection) MF.addCallSiteInfo(&*CallI, std::move(CSInfo)); } - if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo) - return error(Twine("Call site info provided but not used")); + if (!YamlMF.CallSitesInfo.empty() && + !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)) + return error("call site info provided 
but not used"); return false; } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index bc4e299..ce1834a 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -525,24 +525,30 @@ static void convertCallSiteObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST) { const auto *TRI = MF.getSubtarget().getRegisterInfo(); - for (auto CSInfo : MF.getCallSitesInfo()) { + for (auto [MI, CallSiteInfo] : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; yaml::MachineInstrLoc CallLocation; // Prepare instruction position. - MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); + MachineBasicBlock::const_instr_iterator CallI = MI->getIterator(); CallLocation.BlockNum = CallI->getParent()->getNumber(); // Get call instruction offset from the beginning of block. CallLocation.Offset = std::distance(CallI->getParent()->instr_begin(), CallI); YmlCS.CallLocation = CallLocation; + + auto [ArgRegPairs, CalleeTypeIds] = CallSiteInfo; // Construct call arguments and theirs forwarding register info. - for (auto ArgReg : CSInfo.second.ArgRegPairs) { + for (auto ArgReg : ArgRegPairs) { yaml::CallSiteInfo::ArgRegPair YmlArgReg; YmlArgReg.ArgNo = ArgReg.ArgNo; printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI); YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg); } + // Get type ids. + for (auto *CalleeTypeId : CalleeTypeIds) { + YmlCS.CalleeTypeIds.push_back(CalleeTypeId->getZExtValue()); + } YMF.CallSitesInfo.push_back(std::move(YmlCS)); } @@ -814,6 +820,8 @@ static void printMI(raw_ostream &OS, MFPrintState &State, OS << "nusw "; if (MI.getFlag(MachineInstr::SameSign)) OS << "samesign "; + if (MI.getFlag(MachineInstr::InBounds)) + OS << "inbounds "; // NOTE: Please add new MIFlags also to the MI_FLAGS_STR in // llvm/utils/update_mir_test_checks.py. 
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 7ea2512..60d42e0 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -919,7 +919,7 @@ MachineFunction::getCallSiteInfo(const MachineInstr *MI) { assert(MI->isCandidateForAdditionalCallInfo() && "Call site info refers only to call (MI) candidates"); - if (!Target.Options.EmitCallSiteInfo) + if (!Target.Options.EmitCallSiteInfo && !Target.Options.EmitCallGraphSection) return CallSitesInfo.end(); return CallSitesInfo.find(MI); } diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index da3665b..79047f7 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -585,6 +585,8 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { MIFlags |= MachineInstr::MIFlag::NoUSWrap; if (GEP->hasNoUnsignedWrap()) MIFlags |= MachineInstr::MIFlag::NoUWrap; + if (GEP->isInBounds()) + MIFlags |= MachineInstr::MIFlag::InBounds; } // Copy the nonneg flag. @@ -1860,8 +1862,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "nneg "; if (getFlag(MachineInstr::Disjoint)) OS << "disjoint "; + if (getFlag(MachineInstr::NoUSWrap)) + OS << "nusw "; if (getFlag(MachineInstr::SameSign)) OS << "samesign "; + if (getFlag(MachineInstr::InBounds)) + OS << "inbounds "; // Print the opcode name. 
if (TII) diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 0d25169..c612f8de 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << ", !noalias "; AAInfo.NoAlias->printAsOperand(OS, MST); } + if (AAInfo.NoAliasAddrSpace) { + OS << ", !noalias.addrspace "; + AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST); + } if (getRanges()) { OS << ", !range "; getRanges()->printAsOperand(OS, MST); diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 0f742c4..21bf052 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -423,7 +423,7 @@ void ModuloScheduleExpander::generateExistingPhis( // potentially define two values. unsigned MaxPhis = PrologStage + 2; if (!InKernel && (int)PrologStage <= LoopValStage) - MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1); + MaxPhis = std::max((int)MaxPhis - LoopValStage, 1); unsigned NumPhis = std::min(NumStages, MaxPhis); Register NewReg; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 2d7987a..7ede564 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -306,7 +306,12 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { /// number if it is not zero. If DstReg is a physical register and the /// existing subregister number of the def / use being updated is not zero, /// make sure to set it to the correct physical subregister. - void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx); + /// + /// If \p SubregToRegSrcInst is not empty, we are coalescing a + /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an + /// implicit-def of DstReg on instructions that define SrcReg. 
+ void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx, + ArrayRef<MachineInstr *> SubregToRegSrcInst = {}); /// If the given machine operand reads only undefined lanes add an undef /// flag. @@ -1443,6 +1448,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // CopyMI may have implicit operands, save them so that we can transfer them // over to the newly materialized instruction after CopyMI is removed. + LaneBitmask NewMIImplicitOpsMask; SmallVector<MachineOperand, 4> ImplicitOps; ImplicitOps.reserve(CopyMI->getNumOperands() - CopyMI->getDesc().getNumOperands()); @@ -1457,6 +1463,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, (MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) && "unexpected implicit virtual register def"); ImplicitOps.push_back(MO); + if (MO.isDef() && MO.getReg().isVirtual() && + MRI->shouldTrackSubRegLiveness(DstReg)) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } @@ -1499,14 +1508,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } else { assert(MO.getReg() == NewMI.getOperand(0).getReg()); - // We're only expecting another def of the main output, so the range - // should get updated with the regular output range. - // - // FIXME: The range updating below probably needs updating to look at - // the super register if subranges are tracked. - assert(!MRI->shouldTrackSubRegLiveness(DstReg) && - "subrange update for implicit-def of super register may not be " - "properly handled"); + // If lanemasks need to be tracked, compile the lanemask of the NewMI + // implicit def operands to avoid subranges for the super-regs from + // being removed by code later on in this function. 
+ if (MRI->shouldTrackSubRegLiveness(MO.getReg())) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } } @@ -1606,7 +1612,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); for (LiveInterval::SubRange &SR : DstInt.subranges()) { - if ((SR.LaneMask & DstMask).none()) { + if ((SR.LaneMask & DstMask).none() && + (SR.LaneMask & NewMIImplicitOpsMask).none()) { LLVM_DEBUG(dbgs() << "Removing undefined SubRange " << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n"); @@ -1870,11 +1877,14 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, } } -void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, - unsigned SubIdx) { +void RegisterCoalescer::updateRegDefsUses( + Register SrcReg, Register DstReg, unsigned SubIdx, + ArrayRef<MachineInstr *> SubregToRegSrcInsts) { bool DstIsPhys = DstReg.isPhysical(); LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg); + // Coalescing a COPY may expose reads of 'undef' subregisters. + // If so, then explicitly propagate 'undef' to those operands. if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) { for (MachineOperand &MO : MRI->reg_operands(DstReg)) { if (MO.isUndef()) @@ -1891,6 +1901,15 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, } } + // If DstInt already has a subrange for the unused lanes, then we shouldn't + // create duplicate subranges when we update the interval for unused lanes. + LaneBitmask DstIntLaneMask; + if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) { + for (LiveInterval::SubRange &SR : DstInt->subranges()) + DstIntLaneMask |= SR.LaneMask; + } + + // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'. 
SmallPtrSet<MachineInstr *, 8> Visited; for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end(); @@ -1914,6 +1933,80 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); + bool RequiresImplicitRedef = false; + if (!SubregToRegSrcInsts.empty()) { + // We can only add an implicit-def and undef if the sub registers match, + // e.g. + // %0:gr32 = INSTX + // %0.sub8:gr32 = INSTY // top 24 bits of %0 still defined + // %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32 + // + // This cannot be transformed into: + // %1.sub32:gr64 = INSTX + // undef %1.sub8:gr64 = INSTY , implicit-def %1 + // + // Because that would thrash the top 24 bits of %1.sub32. + if (is_contained(SubregToRegSrcInsts, UseMI) && + all_of(UseMI->defs(), + [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool { + if (MO.getReg() != SrcReg || !MO.getSubReg() || MO.isUndef()) + return true; + return SubIdx == MO.getSubReg(); + })) { + // Add implicit-def of super-register to express that the whole + // register is defined by the instruction. + MachineInstrBuilder MIB(*MF, UseMI); + MIB.addReg(DstReg, RegState::ImplicitDefine); + RequiresImplicitRedef = true; + } + + // If the coalesed instruction doesn't fully define the register, we need + // to preserve the original super register liveness for SUBREG_TO_REG. + // + // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes, + // but it introduces liveness for other subregisters. Downstream users may + // have been relying on those bits, so we need to ensure their liveness is + // captured with a def of other lanes. + if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) { + // First check if there is sufficient granularity in terms of subranges. 
+ LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg()); + LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask UnusedLanes = DstMask & ~UsedLanes; + if ((UnusedLanes & ~DstIntLaneMask).any()) { + BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); + DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt); + DstIntLaneMask |= UnusedLanes; + } + + // After duplicating the live ranges for the low/hi bits, we + // need to update the subranges of the DstReg interval such that + // for a case like this: + // + // entry: + // 16B %1:gpr32 = INSTRUCTION (<=> UseMI) + // : + // if.then: + // 32B %1:gpr32 = MOVIMM32 .. + // 48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32 + // + // Only the MOVIMM32 require a def of the top lanes and any intervals + // for the top 32-bits of the def at 16B should be removed. + for (LiveInterval::SubRange &SR : DstInt->subranges()) { + if (!Writes || RequiresImplicitRedef || + (SR.LaneMask & UnusedLanes).none()) + continue; + + assert((SR.LaneMask & UnusedLanes) == SR.LaneMask && + "Unexpected lanemask. Subrange needs finer granularity"); + + SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(false); + auto SegmentI = SR.find(UseIdx); + if (SegmentI != SR.end()) + SR.removeSegment(SegmentI, true); + } + } + } + // Replace SrcReg with DstReg in all UseMI operands. for (unsigned Op : Ops) { MachineOperand &MO = UseMI->getOperand(Op); @@ -1922,7 +2015,7 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, // turn a full def into a read-modify-write sub-register def and vice // versa. if (SubIdx && MO.isDef()) - MO.setIsUndef(!Reads); + MO.setIsUndef(!Reads || RequiresImplicitRedef); // A subreg use of a partially undef (super) register may be a complete // undef use now and then has to be marked that way. 
@@ -2025,6 +2118,30 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } +/// For a given use of value \p Idx, it returns the def in the current block, +/// or otherwise all possible defs in preceding blocks. +static bool FindDefInBlock(SmallPtrSetImpl<MachineBasicBlock *> &VisitedBlocks, + SmallVector<MachineInstr *> &Instrs, + LiveIntervals *LIS, LiveInterval &SrcInt, + MachineBasicBlock *MBB, VNInfo *Idx) { + if (!Idx->isPHIDef()) { + MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def); + assert(Def && "Unable to find a def for SUBREG_TO_REG source operand"); + Instrs.push_back(Def); + return true; + } + + bool Any = false; + if (VisitedBlocks.count(MBB)) + return false; + VisitedBlocks.insert(MBB); + for (MachineBasicBlock *Pred : MBB->predecessors()) { + Any |= FindDefInBlock(VisitedBlocks, Instrs, LIS, SrcInt, Pred, + SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(Pred))); + } + return Any; +} + bool RegisterCoalescer::joinCopy( MachineInstr *CopyMI, bool &Again, SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) { @@ -2156,6 +2273,35 @@ bool RegisterCoalescer::joinCopy( }); } + SmallVector<MachineInstr *> SubregToRegSrcInsts; + if (CopyMI->isSubregToReg()) { + // For the case where the copy instruction is a SUBREG_TO_REG, e.g. + // + // %0:gpr32 = movimm32 .. + // %1:gpr64 = SUBREG_TO_REG 0, %0, sub32 + // ... + // %0:gpr32 = COPY <something> + // + // After joining liveranges, the original `movimm32` will need an + // implicit-def to make it explicit that the entire register is written, + // i.e. + // + // undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0 + // ... + // undef %0.sub32:gpr64 = COPY <something> // Note that this does not + // // require an implicit-def, + // // because it has nothing to + // // do with the SUBREG_TO_REG. + LiveInterval &SrcInt = + LIS->getInterval(CP.isFlipped() ? 
CP.getDstReg() : CP.getSrcReg()); + SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI); + SmallPtrSet<MachineBasicBlock *, 8> VisitedBlocks; + if (!FindDefInBlock(VisitedBlocks, SubregToRegSrcInsts, LIS, SrcInt, + CopyMI->getParent(), + SrcInt.Query(SubregToRegSlotIdx).valueIn())) + llvm_unreachable("SUBREG_TO_REG src requires a def"); + } + ShrinkMask = LaneBitmask::getNone(); ShrinkMainRange = false; @@ -2225,9 +2371,12 @@ bool RegisterCoalescer::joinCopy( // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. - if (CP.getDstIdx()) + if (CP.getDstIdx()) { + assert(SubregToRegSrcInsts.empty() && "can this happen?"); updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); - updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); + } + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(), + SubregToRegSrcInsts); // Shrink subregister ranges if necessary. if (ShrinkMask.any()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d3df434..a43020e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SDPatternMatch.h" @@ -15262,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { } } - // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller - // than X, and the And doesn't change the lower iX bits, we can move the - // AssertZext in front of the And and drop the AssertSext. 
if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND && - N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext && isa<ConstantSDNode>(N0.getOperand(1))) { - SDValue BigA = N0.getOperand(0); - EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); const APInt &Mask = N0.getConstantOperandAPInt(1); - if (AssertVT.bitsLT(BigA_AssertVT) && - Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { - SDLoc DL(N); - SDValue NewAssert = - DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); - return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, - N0.getOperand(1)); + + // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller + // than X, and the And doesn't change the lower iX bits, we can move the + // AssertZext in front of the And and drop the AssertSext. + if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) { + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); + if (AssertVT.bitsLT(BigA_AssertVT) && + Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + SDLoc DL(N); + SDValue NewAssert = + DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); + return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + N0.getOperand(1)); + } } + + // Remove AssertZext entirely if the mask guarantees the assertion cannot + // fail. + // TODO: Use KB countMinLeadingZeros to handle non-constant masks? + if (Mask.isIntN(AssertVT.getScalarSizeInBits())) + return N0; } return SDValue(); @@ -22778,8 +22787,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG); // If we store purely within object bounds just before its lifetime ends, // we can remove the store. 
- if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase, - StoreSize.getFixedValue() * 8)) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + if (LifetimeEndBase.contains( + DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8, + StoreBase, StoreSize.getFixedValue() * 8)) { LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump(); dbgs() << "\nwithin LIFETIME_END of : "; LifetimeEndBase.dump(); dbgs() << "\n"); @@ -28971,13 +28982,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, return SDValue(); } +static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, + const TargetLowering &TLI) { + // Match a pattern such as: + // (X | (X >> C0) | (X >> C1) | ...) & Mask + // This extracts contiguous parts of X and ORs them together before comparing. + // We can optimize this so that we directly check (X & SomeMask) instead, + // eliminating the shifts. + + EVT VT = Root.getValueType(); + + // TODO: Support vectors? 
+ if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N0 = Root.getOperand(0); + SDValue N1 = Root.getOperand(1); + + if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1)) + return SDValue(); + + APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal(); + + SDValue Src; + const auto IsSrc = [&](SDValue V) { + if (!Src) { + Src = V; + return true; + } + + return Src == V; + }; + + SmallVector<SDValue> Worklist = {N0}; + APInt PartsMask(VT.getSizeInBits(), 0); + while (!Worklist.empty()) { + SDValue V = Worklist.pop_back_val(); + if (!V.hasOneUse() && (Src && Src != V)) + return SDValue(); + + if (V.getOpcode() == ISD::OR) { + Worklist.push_back(V.getOperand(0)); + Worklist.push_back(V.getOperand(1)); + continue; + } + + if (V.getOpcode() == ISD::SRL) { + SDValue ShiftSrc = V.getOperand(0); + SDValue ShiftAmt = V.getOperand(1); + + if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt)) + return SDValue(); + + auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal(); + if (ShiftAmtVal > RootMask.getBitWidth()) + return SDValue(); + + PartsMask |= (RootMask << ShiftAmtVal); + continue; + } + + if (IsSrc(V)) { + PartsMask |= RootMask; + continue; + } + + return SDValue(); + } + + if (!Src) + return SDValue(); + + SDLoc DL(Root); + return DAG.getNode(ISD::AND, DL, VT, + {Src, DAG.getConstant(PartsMask, DL, VT)}); +} + /// This is a stub for TargetLowering::SimplifySetCC. 
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); - return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); + if (SDValue C = + TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL)) + return C; + + if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND && + isNullConstant(N1)) { + + if (SDValue Res = matchMergedBFX(N0, DAG, TLI)) + return DAG.getSetCC(DL, VT, Res, N1, Cond); + } + + return SDValue(); } /// Given an ISD::SDIV node expressing a divide by constant, return @@ -29415,7 +29513,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { MachineMemOperand *MMO; }; - auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics { + auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics { if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) { int64_t Offset = 0; if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset())) @@ -29428,13 +29526,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { LSN->getBasePtr(), Offset /*base offset*/, LocationSize::precise(Size), LSN->getMemOperand()}; } - if (const auto *LN = cast<LifetimeSDNode>(N)) + if (const auto *LN = cast<LifetimeSDNode>(N)) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), 0, - LocationSize::precise(LN->getSize()), + LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())), (MachineMemOperand *)nullptr}; + } // Default. 
return {false /*isvolatile*/, /*isAtomic*/ false, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 74172b2..ba0ab23 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; case ISD::FP_TO_FP16: LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); - if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { + if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); if ((SVT == MVT::f64 || SVT == MVT::f80) && diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 6a2e782..31e7855 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -888,7 +888,8 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { } if (MI->isCandidateForAdditionalCallInfo()) { - if (DAG->getTarget().Options.EmitCallSiteInfo) + if (DAG->getTarget().Options.EmitCallSiteInfo || + DAG->getTarget().Options.EmitCallGraphSection) MF.addCallSiteInfo(MI, DAG->getCallSiteInfo(Node)); if (auto CalledGlobal = DAG->getCalledGlobal(Node)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 773ff48..02d1100 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { case ISD::TargetFrameIndex: ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex()); break; - case ISD::LIFETIME_START: - case ISD::LIFETIME_END: - ID.AddInteger(cast<LifetimeSDNode>(N)->getSize()); - break; case ISD::PSEUDO_PROBE: ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid()); 
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex()); @@ -7847,20 +7843,43 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } } - // Perform trivial constant folding. - if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) - return SV; + if (N1.getOpcode() == ISD::POISON || N2.getOpcode() == ISD::POISON) { + switch (Opcode) { + case ISD::XOR: + case ISD::ADD: + case ISD::PTRADD: + case ISD::SUB: + case ISD::SIGN_EXTEND_INREG: + case ISD::UDIV: + case ISD::SDIV: + case ISD::UREM: + case ISD::SREM: + case ISD::MUL: + case ISD::AND: + case ISD::SSUBSAT: + case ISD::USUBSAT: + case ISD::UMIN: + case ISD::OR: + case ISD::SADDSAT: + case ISD::UADDSAT: + case ISD::UMAX: + case ISD::SMAX: + case ISD::SMIN: + // fold op(arg1, poison) -> poison, fold op(poison, arg2) -> poison. + return N2.getOpcode() == ISD::POISON ? N2 : N1; + } + } // Canonicalize an UNDEF to the RHS, even over a constant. - if (N1.isUndef()) { + if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() != ISD::UNDEF) { if (TLI->isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { case ISD::PTRADD: case ISD::SUB: - // fold op(undef, arg2) -> undef, fold op(poison, arg2) ->poison. - return N1.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT); + // fold op(undef, non_undef_arg2) -> undef. + return N1; case ISD::SIGN_EXTEND_INREG: case ISD::UDIV: case ISD::SDIV: @@ -7868,18 +7887,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::SREM: case ISD::SSUBSAT: case ISD::USUBSAT: - // fold op(undef, arg2) -> 0, fold op(poison, arg2) -> poison. - return N1.getOpcode() == ISD::POISON ? getPOISON(VT) - : getConstant(0, DL, VT); + // fold op(undef, non_undef_arg2) -> 0. + return getConstant(0, DL, VT); } } } // Fold a bunch of operators when the RHS is undef. 
- if (N2.isUndef()) { + if (N2.getOpcode() == ISD::UNDEF) { switch (Opcode) { case ISD::XOR: - if (N1.isUndef()) + if (N1.getOpcode() == ISD::UNDEF) // Handle undef ^ undef -> 0 special case. This is a common // idiom (misuse). return getConstant(0, DL, VT); @@ -7887,29 +7905,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::ADD: case ISD::PTRADD: case ISD::SUB: + // fold op(arg1, undef) -> undef. + return N2; case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: - // fold op(arg1, undef) -> undef, fold op(arg1, poison) -> poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT); + // fold op(arg1, undef) -> poison. + return getPOISON(VT); case ISD::MUL: case ISD::AND: case ISD::SSUBSAT: case ISD::USUBSAT: - // fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) - : getConstant(0, DL, VT); + case ISD::UMIN: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> 0. + return N1.getOpcode() == ISD::UNDEF ? N2 : getConstant(0, DL, VT); case ISD::OR: case ISD::SADDSAT: case ISD::UADDSAT: - // fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) -> - // poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) - : getAllOnesConstant(DL, VT); + case ISD::UMAX: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> -1. + return N1.getOpcode() == ISD::UNDEF ? N2 : getAllOnesConstant(DL, VT); + case ISD::SMAX: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MAX_INT. + return N1.getOpcode() == ISD::UNDEF + ? N2 + : getConstant( + APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL, + VT); + case ISD::SMIN: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MIN_INT. + return N1.getOpcode() == ISD::UNDEF + ? N2 + : getConstant( + APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL, + VT); } } + // Perform trivial constant folding. 
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) + return SV; + // Memoize this node if possible. SDNode *N; SDVTList VTs = getVTList(VT); @@ -9360,8 +9397,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, } SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, - SDValue Chain, int FrameIndex, - int64_t Size) { + SDValue Chain, int FrameIndex) { const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END; const auto VTs = getVTList(MVT::Other); SDValue Ops[2] = { @@ -9373,13 +9409,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); ID.AddInteger(FrameIndex); - ID.AddInteger(Size); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); - LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), - dl.getDebugLoc(), VTs, Size); + LifetimeSDNode *N = + newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1636465..306e068 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); SDLoc dl = getCurSDLoc(); + SDNodeFlags Flags; + if (auto *TruncInst = dyn_cast<FPMathOperator>(&I)) + Flags.copyFMF(*TruncInst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N, DAG.getTargetConstant( - 0, dl, TLI.getPointerTy(DAG.getDataLayout())))); + 0, dl, TLI.getPointerTy(DAG.getDataLayout())), + Flags)); 
} void SelectionDAGBuilder::visitFPExt(const User &I) { @@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (TM.getOptLevel() == CodeGenOptLevel::None) return; - const int64_t ObjectSize = - cast<ConstantInt>(I.getArgOperand(0))->getSExtValue(); const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1)); // First check that the Alloca is static, otherwise it won't have a @@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; const int FrameIndex = SI->second; - Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize); + Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex); DAG.setRoot(Res); return; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 9474587..900da76 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { << " -> " << ASC->getDestAddressSpace() << ']'; - } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) { - OS << "<0 to " << LN->getSize() << ">"; } else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) { OS << '<' << AA->getAlign().value() << '>'; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1764910..48d6b99 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9471,7 +9471,7 @@ SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, DAG.getNode(ISD::AND, DL, VT, Op, Neg), DAG.getConstant(DeBruijn, DL, VT)), - DAG.getConstant(ShiftAmt, DL, VT)); + DAG.getShiftAmountConstant(ShiftAmt, VT, DL)); Lookup = DAG.getSExtOrTrunc(Lookup, DL, getPointerTy(TD)); 
SmallVector<uint8_t> Table(BitWidth, 0); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 68b8a00..3c91b0e 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2062,7 +2062,7 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const { // FreeBSD has "__stack_chk_guard" defined externally on libc.so if (M.getDirectAccessExternalData() && - !TM.getTargetTriple().isWindowsGNUEnvironment() && + !TM.getTargetTriple().isOSCygMing() && !(TM.getTargetTriple().isPPC64() && TM.getTargetTriple().isOSFreeBSD()) && (!TM.getTargetTriple().isOSDarwin() || diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 725e951..e9172f4 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1060,27 +1060,27 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant( auto &Context = getContext(); if (Kind.isMergeableConst4() && MergeableConst4Section) - return Context.getELFSection(".rodata.cst4." + SectionSuffix, + return Context.getELFSection(".rodata.cst4." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 4); if (Kind.isMergeableConst8() && MergeableConst8Section) - return Context.getELFSection(".rodata.cst8." + SectionSuffix, + return Context.getELFSection(".rodata.cst8." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 8); if (Kind.isMergeableConst16() && MergeableConst16Section) - return Context.getELFSection(".rodata.cst16." + SectionSuffix, + return Context.getELFSection(".rodata.cst16." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 16); if (Kind.isMergeableConst32() && MergeableConst32Section) - return Context.getELFSection(".rodata.cst32." + SectionSuffix, + return Context.getELFSection(".rodata.cst32." 
+ SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 32); if (Kind.isReadOnly()) - return Context.getELFSection(".rodata." + SectionSuffix, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC); + return Context.getELFSection(".rodata." + SectionSuffix + ".", + ELF::SHT_PROGBITS, ELF::SHF_ALLOC); assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); - return Context.getELFSection(".data.rel.ro." + SectionSuffix, + return Context.getELFSection(".data.rel.ro." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); } diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp index 6267207..fd54190 100644 --- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp +++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp @@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable( AddrOfOldGV, Twine("__ref_").concat(GV->getName()), nullptr, GlobalVariable::NotThreadLocal); + // RefGV is created with isConstant = false, but we want to place RefGV into + // .rdata, not .data. It is important that the GlobalVariable be mutable + // from the compiler's point of view, so that the optimizer does not remove + // the global variable entirely and replace all references to it with its + // initial value. + // + // When the Windows hot-patch loader applies a hot-patch, it maps the + // pages of .rdata as read/write so that it can set each __ref_* variable + // to point to the original variable in the base image. Afterward, pages in + // .rdata are remapped as read-only. This protects the __ref_* variables from + // being overwritten during execution. + RefGV->setSection(".rdata"); + // Create debug info for the replacement global variable. 
DataLayout Layout = M->getDataLayout(); DIType *DebugType = DebugInfo.createPointerType( diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h index bd0d72f..0e95369 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h @@ -157,8 +157,7 @@ private: processSubtractRelocation(unsigned SectionID, relocation_iterator RelI, const MachOObjectFile &BaseObj, ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile &Obj = - static_cast<const MachOObjectFile&>(BaseObj); + const MachOObjectFile &Obj = BaseObj; MachO::any_relocation_info RE = Obj.getRelocation(RelI->getRawDataRefImpl()); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7928772..3aa4f7a 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1161,7 +1161,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel( Builder.restoreIP(AllocaIP); auto *KernelArgsPtr = Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args"); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) { llvm::Value *Arg = @@ -1189,7 +1189,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch( if (!updateToLocation(Loc)) return Loc.IP; - Builder.restoreIP(Loc.IP); // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. 
The host // pointer is used by the runtime library to identify the current target @@ -5955,7 +5954,7 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, Builder.restoreIP(AllocaIP); AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name); ArgsBase->setAlignment(Align(8)); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); // Store the index value with offset in depend vector. for (unsigned I = 0; I < NumLoops; ++I) { @@ -8081,7 +8080,7 @@ void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc, ".offload_ptrs"); AllocaInst *ArgSizes = Builder.CreateAlloca( ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes"); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); MapperAllocas.ArgsBase = ArgsBase; MapperAllocas.Args = Args; MapperAllocas.ArgSizes = ArgSizes; diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 28037d7..49c6dc7 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V, return CreateShuffleVector(V, Zeros, Name + ".splat"); } -Value *IRBuilderBase::CreatePreserveArrayAccessIndex( - Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex, - MDNode *DbgInfo) { +Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops, + const Twine &Name) { + assert(Ops.size() >= 2 && Ops.size() <= 8 && + "Unexpected number of operands to interleave"); + + // Make sure all operands are the same type. 
+ assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type"); + +#ifndef NDEBUG + for (unsigned I = 1; I < Ops.size(); I++) { + assert(Ops[I]->getType() == Ops[0]->getType() && + "Vector interleave expects matching operand types!"); + } +#endif + + unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size()); + auto *SubvecTy = cast<VectorType>(Ops[0]->getType()); + Type *DestTy = VectorType::get(SubvecTy->getElementType(), + SubvecTy->getElementCount() * Ops.size()); + return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name); +} + +Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base, + unsigned Dimension, + unsigned LastIndex, + MDNode *DbgInfo) { auto *BaseType = Base->getType(); assert(isa<PointerType>(BaseType) && "Invalid Base ptr type for preserve.array.access.index."); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 6c35ade..58a1f74 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) { "Shouldn't change the signature"); return NewDecl; } + +struct InterleaveIntrinsic { + Intrinsic::ID Interleave, Deinterleave; +}; + +static InterleaveIntrinsic InterleaveIntrinsics[] = { + {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2}, + {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3}, + {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4}, + {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5}, + {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6}, + {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7}, + {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8}, +}; + +Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + return InterleaveIntrinsics[Factor - 2].Interleave; +} + +Intrinsic::ID 
Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + return InterleaveIntrinsics[Factor - 2].Deinterleave; +} diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 0dbd07f..1157cbe 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const { Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct); Result.Scope = Info.lookup(LLVMContext::MD_alias_scope); Result.NoAlias = Info.lookup(LLVMContext::MD_noalias); + Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace); } return Result; } @@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) { setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct); setMetadata(LLVMContext::MD_alias_scope, N.Scope); setMetadata(LLVMContext::MD_noalias, N.NoAlias); + setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace); } void Instruction::setNoSanitizeMetadata() { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 73e79c0..0323b4d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/LTO/LTO.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StableHashing.h" @@ -742,18 +743,19 @@ Error LTO::add(std::unique_ptr<InputFile> Input, Conf.VisibilityScheme = Config::ELF; } - const SymbolResolution *ResI = Res.begin(); - for (unsigned I = 0; I != Input->Mods.size(); ++I) - if (Error Err = addModule(*Input, I, ResI, Res.end())) + ArrayRef<SymbolResolution> InputRes = Res; + for (unsigned I = 0; I != Input->Mods.size(); ++I) { + if (auto Err = addModule(*Input, InputRes, I, Res).moveInto(Res)) return Err; + } - assert(ResI == Res.end()); + assert(Res.empty()); return Error::success(); } -Error LTO::addModule(InputFile &Input, unsigned 
ModI, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { +Expected<ArrayRef<SymbolResolution>> +LTO::addModule(InputFile &Input, ArrayRef<SymbolResolution> InputRes, + unsigned ModI, ArrayRef<SymbolResolution> Res) { Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo(); if (!LTOInfo) return LTOInfo.takeError(); @@ -782,28 +784,32 @@ Error LTO::addModule(InputFile &Input, unsigned ModI, bool IsThinLTO = LTOInfo->IsThinLTO && (LTOMode != LTOK_UnifiedRegular); auto ModSyms = Input.module_symbols(ModI); - addModuleToGlobalRes(ModSyms, {ResI, ResE}, + addModuleToGlobalRes(ModSyms, Res, IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0, LTOInfo->HasSummary); if (IsThinLTO) - return addThinLTO(BM, ModSyms, ResI, ResE); + return addThinLTO(BM, ModSyms, Res); RegularLTO.EmptyCombinedModule = false; - Expected<RegularLTOState::AddedModule> ModOrErr = - addRegularLTO(BM, ModSyms, ResI, ResE); + auto ModOrErr = addRegularLTO(Input, InputRes, BM, ModSyms, Res); if (!ModOrErr) return ModOrErr.takeError(); + Res = ModOrErr->second; - if (!LTOInfo->HasSummary) - return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false); + if (!LTOInfo->HasSummary) { + if (Error Err = linkRegularLTO(std::move(ModOrErr->first), + /*LivenessFromIndex=*/false)) + return Err; + return Res; + } // Regular LTO module summaries are added to a dummy module that represents // the combined regular LTO module. if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, "")) return Err; - RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr)); - return Error::success(); + RegularLTO.ModsWithSummaries.push_back(std::move(ModOrErr->first)); + return Res; } // Checks whether the given global value is in a non-prevailing comdat @@ -839,10 +845,11 @@ handleNonPrevailingComdat(GlobalValue &GV, // Add a regular LTO object to the link. // The resulting module needs to be linked into the combined LTO module with // linkRegularLTO. 
-Expected<LTO::RegularLTOState::AddedModule> -LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { +Expected< + std::pair<LTO::RegularLTOState::AddedModule, ArrayRef<SymbolResolution>>> +LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes, + BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, + ArrayRef<SymbolResolution> Res) { RegularLTOState::AddedModule Mod; Expected<std::unique_ptr<Module>> MOrErr = BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true, @@ -855,13 +862,34 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, if (Error Err = M.materializeMetadata()) return std::move(Err); - // If cfi.functions is present and we are in regular LTO mode, LowerTypeTests - // will rename local functions in the merged module as "<function name>.1". - // This causes linking errors, since other parts of the module expect the - // original function name. - if (LTOMode == LTOK_UnifiedRegular) + if (LTOMode == LTOK_UnifiedRegular) { + // cfi.functions metadata is intended to be used with ThinLTO and may + // trigger invalid IR transformations if they are present when doing regular + // LTO, so delete it. if (NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions")) M.eraseNamedMetadata(CfiFunctionsMD); + } else if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) { + // Delete aliases entries for non-prevailing symbols on the ThinLTO side of + // this input file. 
+ DenseSet<StringRef> Prevailing; + for (auto [I, R] : zip(Input.symbols(), InputRes)) + if (R.Prevailing && !I.getIRName().empty()) + Prevailing.insert(I.getIRName()); + std::vector<MDNode *> AliasGroups; + for (MDNode *AliasGroup : AliasesMD->operands()) { + std::vector<Metadata *> Aliases; + for (Metadata *Alias : AliasGroup->operands()) { + if (isa<MDString>(Alias) && + Prevailing.count(cast<MDString>(Alias)->getString())) + Aliases.push_back(Alias); + } + if (Aliases.size() > 1) + AliasGroups.push_back(MDTuple::get(RegularLTO.Ctx, Aliases)); + } + AliasesMD->clearOperands(); + for (MDNode *G : AliasGroups) + AliasesMD->addOperand(G); + } UpgradeDebugInfo(M); @@ -899,22 +927,22 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, std::set<const Comdat *> NonPrevailingComdats; SmallSet<StringRef, 2> NonPrevailingAsmSymbols; for (const InputFile::Symbol &Sym : Syms) { - assert(ResI != ResE); - SymbolResolution Res = *ResI++; + assert(!Res.empty()); + const SymbolResolution &R = Res.consume_front(); assert(MsymI != MsymE); ModuleSymbolTable::Symbol Msym = *MsymI++; Skip(); if (GlobalValue *GV = dyn_cast_if_present<GlobalValue *>(Msym)) { - if (Res.Prevailing) { + if (R.Prevailing) { if (Sym.isUndefined()) continue; Mod.Keep.push_back(GV); // For symbols re-defined with linker -wrap and -defsym options, // set the linkage to weak to inhibit IPO. The linkage will be // restored by the linker. - if (Res.LinkerRedefined) + if (R.LinkerRedefined) GV->setLinkage(GlobalValue::WeakAnyLinkage); GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage(); @@ -938,7 +966,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } // Set the 'local' flag based on the linker resolution for this symbol. 
- if (Res.FinalDefinitionInLinkageUnit) { + if (R.FinalDefinitionInLinkageUnit) { GV->setDSOLocal(true); if (GV->hasDLLImportStorageClass()) GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes:: @@ -947,7 +975,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } else if (auto *AS = dyn_cast_if_present<ModuleSymbolTable::AsmSymbol *>(Msym)) { // Collect non-prevailing symbols. - if (!Res.Prevailing) + if (!R.Prevailing) NonPrevailingAsmSymbols.insert(AS->first); } else { llvm_unreachable("unknown symbol type"); @@ -965,7 +993,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, CommonRes.Alignment = std::max(Align(SymAlignValue), CommonRes.Alignment); } - CommonRes.Prevailing |= Res.Prevailing; + CommonRes.Prevailing |= R.Prevailing; } } @@ -991,7 +1019,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } assert(MsymI == MsymE); - return std::move(Mod); + return std::make_pair(std::move(Mod), Res); } Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, @@ -1032,19 +1060,19 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, } // Add a ThinLTO module to the link. 
-Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { - const SymbolResolution *ResITmp = ResI; +Expected<ArrayRef<SymbolResolution>> +LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, + ArrayRef<SymbolResolution> Res) { + ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { - assert(ResITmp != ResE); - SymbolResolution Res = *ResITmp++; + assert(!ResTmp.empty()); + const SymbolResolution &R = ResTmp.consume_front(); if (!Sym.getIRName().empty()) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) + if (R.Prevailing) ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); } } @@ -1059,14 +1087,14 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); for (const InputFile::Symbol &Sym : Syms) { - assert(ResI != ResE); - SymbolResolution Res = *ResI++; + assert(!Res.empty()); + const SymbolResolution &R = Res.consume_front(); if (!Sym.getIRName().empty()) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) { + if (R.Prevailing) { assert(ThinLTO.PrevailingModuleForGUID[GUID] == BM.getModuleIdentifier()); @@ -1074,7 +1102,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. 
- if (Res.LinkerRedefined) + if (R.LinkerRedefined) if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); @@ -1082,7 +1110,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. - if (Res.FinalDefinitionInLinkageUnit) { + if (R.FinalDefinitionInLinkageUnit) { if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( GUID, BM.getModuleIdentifier())) { S->setDSOLocal(true); @@ -1110,7 +1138,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } } - return Error::success(); + return Res; } unsigned LTO::getMaxTasks() const { diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 1074669..a214513 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -484,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() { // For each entry, reserve space for 2 32-bit indices and a 64-bit count. size_t SectionBytes = W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t)); - (*CGProfileSection->begin()).appendContents(SectionBytes, 0); + (*CGProfileSection->begin()) + .setVarContents(std::vector<char>(SectionBytes, 0)); } MCStreamer *llvm::createMachOStreamer(MCContext &Context, @@ -520,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() { // (instead of emitting a zero-sized section) so these relocations are // technically valid, even though we don't expect these relocations to // actually be applied by the linker. 
- Frag->appendContents(8, 0); + constexpr char zero[8] = {}; + Frag->setVarContents(zero); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 9c7b05b..e277143 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -57,6 +57,10 @@ void MCObjectStreamer::insert(MCFragment *F) { newFragment(); } +void MCObjectStreamer::appendContents(ArrayRef<char> Contents) { + CurFrag->appendContents(Contents); +} + void MCObjectStreamer::appendContents(size_t Num, char Elt) { CurFrag->appendContents(Num, Elt); } @@ -538,8 +542,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { void MCObjectStreamer::emitBytes(StringRef Data) { MCDwarfLineEntry::make(this, getCurrentSectionOnly()); - MCFragment *DF = getCurrentFragment(); - DF->appendContents(ArrayRef(Data.data(), Data.size())); + appendContents(ArrayRef(Data.data(), Data.size())); } void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill, diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 898ac5d..26f45ce 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -103,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility( void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { // Add a Fixup here to later record a relocation of type R_REF to prevent the // ref symbol from being garbage collected (by the binder). 
- MCFragment *DF = getCurrentFragment(); - const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); - std::optional<MCFixupKind> MaybeKind = - getAssembler().getBackend().getFixupKind("R_REF"); - if (!MaybeKind) - report_fatal_error("failed to get fixup kind for R_REF relocation"); - - MCFixupKind Kind = *MaybeKind; - MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind); - DF->addFixup(Fixup); + addFixup(MCSymbolRefExpr::create(Symbol, getContext()), + XCOFF::RelocationType::R_REF); } void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 7b5c3c0..e87696a 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -806,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() { } MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data()); + llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data()); } unsigned NumSections = Asm.end() - Asm.begin(); diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp index 62a71d4..9b55f76 100644 --- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp @@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const { // it is, find the target section unique id. 
const coff_aux_section_definition *SD = SymRef.getSectionDefinition(); const coff_aux_weak_external *WE = SymRef.getWeakExternal(); - if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) { + if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) { int32_t Index = SD->getNumber(IsBigObj); if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size()) return createStringError(object_error::parse_failed, diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp index 2d74d1d..5863490 100644 --- a/llvm/lib/Object/SFrameParser.cpp +++ b/llvm/lib/Object/SFrameParser.cpp @@ -10,27 +10,41 @@ #include "llvm/BinaryFormat/SFrame.h" #include "llvm/Object/Error.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; using namespace llvm::object; -template <typename T> -static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, - uint64_t Offset) { - static_assert(std::is_trivial_v<T>); - if (Data.size() < Offset + sizeof(T)) { +static Expected<ArrayRef<uint8_t>> +getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) { + uint64_t End = SaturatingAdd(Offset, Size); + // Data.size() cannot be UINT64_MAX, as it would occupy the whole address + // space. 
+ if (End > Data.size()) { return createStringError( formatv("unexpected end of data at offset {0:x} while reading [{1:x}, " "{2:x})", - Data.size(), Offset, Offset + sizeof(T)) + Data.size(), Offset, End) .str(), object_error::unexpected_eof); } - return *reinterpret_cast<const T *>(Data.data() + Offset); + return Data.slice(Offset, Size); +} + +template <typename T> +static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, + uint64_t Offset) { + static_assert(std::is_trivial_v<T>); + Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T)); + if (!Slice) + return Slice.takeError(); + + return *reinterpret_cast<const T *>(Slice->data()); } template <endianness E> -Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { +Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents, + uint64_t SectionAddress) { Expected<const sframe::Preamble<E> &> Preamble = getDataSliceAs<sframe::Preamble<E>>(Contents, 0); if (!Preamble) @@ -48,8 +62,44 @@ Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { getDataSliceAs<sframe::Header<E>>(Contents, 0); if (!Header) return Header.takeError(); - return SFrameParser(Contents, *Header); + return SFrameParser(Contents, SectionAddress, *Header); +} + +template <endianness E> +Expected<ArrayRef<uint8_t>> SFrameParser<E>::getAuxHeader() const { + return getDataSlice(Data, sizeof(Header), Header.AuxHdrLen); +} + +template <endianness E> +Expected<ArrayRef<sframe::FuncDescEntry<E>>> SFrameParser<E>::fdes() const { + Expected<ArrayRef<uint8_t>> Slice = getDataSlice( + Data, getFDEBase(), Header.NumFDEs * sizeof(sframe::FuncDescEntry<E>)); + if (!Slice) + return Slice.takeError(); + return ArrayRef( + reinterpret_cast<const sframe::FuncDescEntry<E> *>(Slice->data()), + Header.NumFDEs); +} + +template <endianness E> +uint64_t SFrameParser<E>::getAbsoluteStartAddress( + typename FDERange::iterator FDE) const { + uint64_t Result = SectionAddress + 
FDE->StartAddress; + + if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) == + sframe::Flags::FDEFuncStartPCRel) { + uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); + uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + + assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && + "Iterator does not belong to this object!"); + + Result += FDEPtr - DataPtr; + } + + return Result; } -template class llvm::object::SFrameParser<endianness::big>; -template class llvm::object::SFrameParser<endianness::little>; +template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>; +template class LLVM_EXPORT_TEMPLATE + llvm::object::SFrameParser<endianness::little>; diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index fd89583..1b111dc 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -84,6 +84,7 @@ MODULE_PASS("global-merge-func", GlobalMergeFuncPass()) MODULE_PASS("globalopt", GlobalOptPass()) MODULE_PASS("globalsplit", GlobalSplitPass()) MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass()) +MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass()) MODULE_PASS("hipstdpar-select-accelerator-code", HipStdParAcceleratorCodeSelectionPass()) MODULE_PASS("hotcoldsplit", HotColdSplittingPass()) diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 277247e..cc02cae 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -1190,7 +1190,7 @@ Expected<size_t> readNativeFile(file_t FD, MutableArrayRef<char> Buf) { size_t Size = Buf.size(); #endif ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Size); - if (ssize_t(NumRead) == -1) + if (NumRead == -1) return errorCodeToError(errnoAsErrorCode()); // The underlying operation on these platforms allow opening directories // for reading in more cases than other platforms. 
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc index d862dbd..8dd7c88 100644 --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -106,7 +106,67 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); } +namespace llvm::sys::windows { +HMODULE loadSystemModuleSecure(LPCWSTR lpModuleName) { + // Ensure we load indeed a module from system32 path. + // As per GetModuleHandle documentation: + // "If lpModuleName does not include a path and there is more than one loaded + // module with the same base name and extension, you cannot predict which + // module handle will be returned.". This mitigates + // https://learn.microsoft.com/en-us/security-updates/securityadvisories/2010/2269637 + SmallVector<wchar_t, MAX_PATH> Buf; + size_t Size = MAX_PATH; + do { + Buf.resize_for_overwrite(Size); + SetLastError(NO_ERROR); + Size = ::GetSystemDirectoryW(Buf.data(), Buf.size()); + if (Size == 0) + return NULL; + + // Try again with larger buffer. + } while (Size > Buf.size()); + + Buf.truncate(Size); + Buf.push_back(L'\\'); + Buf.append(lpModuleName, lpModuleName + std::wcslen(lpModuleName)); + Buf.push_back(0); + + return ::GetModuleHandleW(Buf.data()); +} +} // namespace llvm::sys::windows + SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { + HMODULE kernelM = llvm::sys::windows::loadSystemModuleSecure(L"kernel32.dll"); + if (kernelM) { + // SetThreadInformation is only available on Windows 8 and later. Since we + // still support compilation on Windows 7, we load the function dynamically. 
+ typedef BOOL(WINAPI * SetThreadInformation_t)( + HANDLE hThread, THREAD_INFORMATION_CLASS ThreadInformationClass, + _In_reads_bytes_(ThreadInformationSize) PVOID ThreadInformation, + ULONG ThreadInformationSize); + static const auto pfnSetThreadInformation = + (SetThreadInformation_t)::GetProcAddress(kernelM, + "SetThreadInformation"); + if (pfnSetThreadInformation) { + auto setThreadInformation = [](ULONG ControlMaskAndStateMask) { + THREAD_POWER_THROTTLING_STATE state{}; + state.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; + state.ControlMask = ControlMaskAndStateMask; + state.StateMask = ControlMaskAndStateMask; + return pfnSetThreadInformation( + ::GetCurrentThread(), ThreadPowerThrottling, &state, sizeof(state)); + }; + + // Use EcoQoS for ThreadPriority::Background available (running on most + // efficent cores at the most efficient cpu frequency): + // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadinformation + // https://learn.microsoft.com/en-us/windows/win32/procthread/quality-of-service + setThreadInformation(Priority == ThreadPriority::Background + ? THREAD_POWER_THROTTLING_EXECUTION_SPEED + : 0); + } + } + // https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority // Begin background processing mode. 
The system lowers the resource scheduling // priorities of the thread so that it can perform background work without diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 1f3e5dc..3f318e2 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { } break; + case GETDAGOPNAME: + if (const auto *Dag = dyn_cast<DagInit>(LHS)) { + return Dag->getName(); + } + break; + case LOG2: if (const auto *LHSi = dyn_cast_or_null<IntInit>( LHS->convertInitializerTo(IntRecTy::get(RK)))) { @@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const { case SIZE: Result = "!size"; break; case EMPTY: Result = "!empty"; break; case GETDAGOP: Result = "!getdagop"; break; + case GETDAGOPNAME: + Result = "!getdagopname"; + break; case LOG2 : Result = "!logtwo"; break; case LISTFLATTEN: Result = "!listflatten"; @@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { SmallVector<std::pair<const Init *, const StringInit *>, 8> Args; llvm::append_range(Args, LHSs->getArgAndNames()); llvm::append_range(Args, RHSs->getArgAndNames()); - return DagInit::get(Op, Args); + // Use the name of the LHS DAG if it's set, otherwise the name of the RHS. 
+ const auto *NameInit = LHSs->getName(); + if (!NameInit) + NameInit = RHSs->getName(); + return DagInit::get(Op, NameInit, Args); } break; } @@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames()); break; } + case SETDAGOPNAME: { + const auto *Dag = dyn_cast<DagInit>(LHS); + const auto *Op = dyn_cast<StringInit>(RHS); + if (Dag && Op) + return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(), + Dag->getArgNames()); + break; + } case ADD: case SUB: case MUL: @@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const { case STRCONCAT: Result = "!strconcat"; break; case INTERLEAVE: Result = "!interleave"; break; case SETDAGOP: Result = "!setdagop"; break; + case SETDAGOPNAME: + Result = "!setdagopname"; + break; case GETDAGARG: Result = "!getdagarg<" + getType()->getAsString() + ">"; break; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index aea1bb0..c369916 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("find", tgtok::XFind) .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. 
+ .Case("setdagopname", tgtok::XSetDagOpName) + .Case("getdagopname", tgtok::XGetDagOpName) .Case("getdagarg", tgtok::XGetDagArg) .Case("getdagname", tgtok::XGetDagName) .Case("setdagarg", tgtok::XSetDagArg) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index ed7d8f3..5725e39 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -150,6 +150,8 @@ enum TokKind { XGt, XSetDagOp, XGetDagOp, + XSetDagOpName, + XGetDagOpName, XExists, XListRemove, XToLower, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 62c5355..81b61b1 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "TGParser.h" +#include "TGLexer.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XCast: case tgtok::XRepr: case tgtok::XGetDagOp: + case tgtok::XGetDagOpName: case tgtok::XInitialized: { // Value ::= !unop '(' Value ')' UnOpInit::UnaryOp Code; const RecTy *Type = nullptr; @@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } Code = UnOpInit::GETDAGOP; break; + case tgtok::XGetDagOpName: + Lex.Lex(); // eat the operation + Type = StringRecTy::get(Records); + Code = UnOpInit::GETDAGOPNAME; + break; case tgtok::XInitialized: Lex.Lex(); // eat the operation Code = UnOpInit::INITIALIZED; @@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XInterleave: case tgtok::XGetDagArg: case tgtok::XGetDagName: - case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')' + case tgtok::XSetDagOp: + case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')' tgtok::TokKind OpTok = Lex.getCode(); SMLoc OpLoc = 
Lex.getLoc(); Lex.Lex(); // eat the operation @@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break; case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break; case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break; + case tgtok::XSetDagOpName: + Code = BinOpInit::SETDAGOPNAME; + break; case tgtok::XGetDagArg: Code = BinOpInit::GETDAGARG; break; @@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } ArgType = DagRecTy::get(Records); break; + case tgtok::XSetDagOpName: + Type = DagRecTy::get(Records); + ArgType = DagRecTy::get(Records); + break; case tgtok::XGetDagName: Type = StringRecTy::get(Records); ArgType = DagRecTy::get(Records); @@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { // Deal with BinOps whose arguments have different types, by // rewriting ArgType in between them. switch (Code) { - case BinOpInit::SETDAGOP: - // After parsing the first dag argument, switch to expecting - // a record, with no restriction on its superclasses. - ArgType = RecordRecTy::get(Records, {}); - break; - case BinOpInit::GETDAGARG: - // After parsing the first dag argument, expect an index integer or a - // name string. - ArgType = nullptr; - break; - case BinOpInit::GETDAGNAME: - // After parsing the first dag argument, expect an index integer. - ArgType = IntRecTy::get(Records); - break; - default: - break; + case BinOpInit::SETDAGOPNAME: + // After parsing the first dag argument, expect a string. + ArgType = StringRecTy::get(Records); + break; + case BinOpInit::SETDAGOP: + // After parsing the first dag argument, switch to expecting + // a record, with no restriction on its superclasses. + ArgType = RecordRecTy::get(Records, {}); + break; + case BinOpInit::GETDAGARG: + // After parsing the first dag argument, expect an index integer or a + // name string. 
+ ArgType = nullptr; + break; + case BinOpInit::GETDAGNAME: + // After parsing the first dag argument, expect an index integer. + ArgType = IntRecTy::get(Records); + break; + default: + break; } if (!consume(tgtok::comma)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index eca7ca5..ad42f4b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case 
Intrinsic::aarch64_sve_ld1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM_PSEUDO, AArch64::LDNT1B_2Z_PSEUDO); @@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 1, 
AArch64::LDNT1H_2Z_IMM_PSEUDO, AArch64::LDNT1H_2Z_PSEUDO); @@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM_PSEUDO, AArch64::LDNT1W_2Z_PSEUDO); @@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM_PSEUDO, AArch64::LDNT1D_2Z_PSEUDO); @@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM_PSEUDO, AArch64::LDNT1B_4Z_PSEUDO); @@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM_PSEUDO, AArch64::LDNT1H_4Z_PSEUDO); @@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM_PSEUDO, AArch64::LDNT1W_4Z_PSEUDO); @@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 3, 
AArch64::LDNT1D_4Z_IMM_PSEUDO, AArch64::LDNT1D_4Z_PSEUDO); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b49754..4f6e3dd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8952,6 +8952,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID &CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFunction::CallSiteInfo CSInfo; @@ -8991,6 +8992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Check callee args/returns for SVE registers and set calling convention // accordingly. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { @@ -11325,7 +11330,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue AArch64TargetLowering::LowerSELECT_CC( ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, - iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs, + iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. 
@@ -11386,6 +11391,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Canonicalise absolute difference patterns: + // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> + // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc + // + // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc -> + // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc + // The second forms can be matched into subs+cneg. + if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) { + if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS && + FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) + FVal = DAG.getNegative(TVal, DL, TVal.getValueType()); + else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && + FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) + TVal = DAG.getNegative(FVal, DL, FVal.getValueType()); + } + unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in @@ -11523,7 +11544,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return true; } })) { - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs; + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs(); SDValue VectorCmp = emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG); if (VectorCmp) @@ -11537,7 +11558,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - if (DAG.getTarget().Options.UnsafeFPMath) { + if (Flags.hasNoSignedZeros()) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. 
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); @@ -11616,10 +11637,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); - bool HasNoNans = Op->getFlags().hasNoNaNs(); + SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, - DAG); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, @@ -11627,7 +11647,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); - bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); EVT Ty = Op.getValueType(); @@ -11694,8 +11713,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, DAG.getUNDEF(MVT::f32), FVal); } - SDValue Res = - LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG); + SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), + Op->getFlags(), DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); @@ -12292,7 +12311,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags = SDNodeFlags::AllowReassociation; + // Ensure nodes can be recognized by isAssociativeAndCommutative. 
+ SDNodeFlags Flags = + SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros; // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) @@ -16674,7 +16695,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath)); + I->getFastMathFlags().allowContract())); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -24112,6 +24133,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, Store->getMemOperand()); } +// Combine store (fp_to_int X) to use vector semantics around the conversion +// when NEON is available. This allows us to store the in-vector result directly +// without transferring the result into a GPR in the process. +static SDValue combineStoreValueFPToInt(StoreSDNode *ST, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + // Limit to post-legalization in order to avoid peeling truncating stores. + if (DCI.isBeforeLegalize()) + return SDValue(); + if (!Subtarget->isNeonAvailable()) + return SDValue(); + // Source operand is already a vector. + SDValue Value = ST->getValue(); + if (Value.getValueType().isVector()) + return SDValue(); + + // Look through potential assertions. 
+ while (Value->isAssert()) + Value = Value.getOperand(0); + + if (Value.getOpcode() != ISD::FP_TO_SINT && + Value.getOpcode() != ISD::FP_TO_UINT) + return SDValue(); + if (!Value->hasOneUse()) + return SDValue(); + + SDValue FPSrc = Value.getOperand(0); + EVT SrcVT = FPSrc.getValueType(); + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return SDValue(); + + // No support for assignments such as i64 = fp_to_sint i32 + EVT VT = Value.getSimpleValueType(); + if (VT != SrcVT.changeTypeToInteger()) + return SDValue(); + + // Create a 128-bit element vector to avoid widening. The floating point + // conversion is transformed into a single element conversion via a pattern. + unsigned NumElements = 128 / SrcVT.getFixedSizeInBits(); + EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements); + EVT VecDstVT = VecSrcVT.changeTypeToInteger(); + SDLoc DL(ST); + SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc); + SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP); + + SDValue Zero = DAG.getVectorIdxConstant(0, DL); + SDValue Extracted = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero); + + DCI.CombineTo(ST->getValue().getNode(), Extracted); + return SDValue(ST, 0); +} + bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || @@ -24194,6 +24269,9 @@ static SDValue performSTORECombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(ST); + if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget)) + return Res; + auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; @@ -26926,6 +27004,23 @@ static SDValue performSHLCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); } +static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { + unsigned 
IntrinsicID = N->getConstantOperandVal(1); + auto Register = + (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR + : AArch64SysReg::RNDRRS); + SDLoc DL(N); + SDValue A = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), + N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); + SDValue B = DAG.getNode( + AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); + return DAG.getMergeValues( + {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -27241,22 +27336,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: - case Intrinsic::aarch64_rndrrs: { - unsigned IntrinsicID = N->getConstantOperandVal(1); - auto Register = - (IntrinsicID == Intrinsic::aarch64_rndr ? 
AArch64SysReg::RNDR - : AArch64SysReg::RNDRRS); - SDLoc DL(N); - SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), - N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); - SDValue B = DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); - return DAG.getMergeValues( - {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); - } + case Intrinsic::aarch64_rndrrs: + return performRNDRCombine(N, DAG); case Intrinsic::aarch64_sme_ldr_zt: return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), DAG.getVTList(MVT::Other), N->getOperand(0), diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 95d0e3b..ea63edd8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -662,7 +662,7 @@ private: SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, iterator_range<SDNode::user_iterator> Users, - bool HasNoNans, const SDLoc &dl, + SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8685d7a0..59d4fd2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by // the target options or if FADD/FSUB has the contract fast-math flag. 
- return Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast || + return Options.AllowFPOpFusion == FPOpFusion::Fast || Inst.getFlag(MachineInstr::FmContract); - return true; } return false; } @@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, case AArch64::FMUL_ZZZ_H: case AArch64::FMUL_ZZZ_S: case AArch64::FMUL_ZZZ_D: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || - (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && - Inst.getFlag(MachineInstr::MIFlag::FmNsz)); + return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && + Inst.getFlag(MachineInstr::MIFlag::FmNsz); // == Integer types == // -- Base instructions -- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 07cacfa..251fd44 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } +def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>; +def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>; + // int -> float conversion of value in lane 0 of simd vector should use // correct cvtf variant to avoid costly fpr <-> gpr register transfers. 
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index abcd550..b97d622 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -12,7 +12,7 @@ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi -// MOVi64imm + ADDXrr ==> ANDXri + ANDXri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri // // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi // MOVi64imm + SUBXrr ==> SUBXri + SUBXri @@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { template <typename T> bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + // Strategy used to split logical immediate bitmasks. + enum class SplitStrategy { + Intersect, + }; template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); + bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); - if (AArch64_AM::isLogicalImmediate(UImm, RegSize)) - return false; - - // If this immediate can be handled by one instruction, do not split it. - SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; - AArch64_IMM::expandMOVImm(UImm, RegSize, Insn); - if (Insn.size() == 1) - return false; // The bitmask immediate consists of consecutive ones. 
Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, - unsigned OtherOpc) { +bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, + unsigned OtherOpc) { // Try below transformation. // // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri @@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, return splitTwoPartImm<T>( MI, - [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { - if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) + [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { + // If this immediate is already a suitable bitmask, don't split it. + // TODO: Should we just combine the two instructions in this case? + if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return std::nullopt; + + // If this immediate can be handled by one instruction, don't split it. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return std::nullopt; + + bool SplitSucc = false; + switch (Strategy) { + case SplitStrategy::Intersect: + SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; + } + if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? 
Opc : OtherOpc); return std::nullopt; }, @@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= visitINSERT(MI); break; case AArch64::ANDWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDSWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + Changed |= trySplitLogicalImm<uint32_t>( + AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri); break; case AArch64::ANDSXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + Changed |= trySplitLogicalImm<uint64_t>( + AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; case AArch64::ORRWrs: Changed |= visitORR(MI); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bb0f667b..e0e1af7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; }; + auto LowerTriOp = [&MI, &MIB](unsigned Opcode) { + MIB.buildInstr(Opcode, {MI.getOperand(0)}, + {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)}); + MI.eraseFromParent(); + return true; + }; Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { @@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_USUBSAT); break; } + case Intrinsic::aarch64_neon_udot: + return LowerTriOp(AArch64::G_UDOT); + case Intrinsic::aarch64_neon_sdot: + return LowerTriOp(AArch64::G_SDOT); case 
Intrinsic::vector_reverse: // TODO: Add support for vector_reverse diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8b8fc8b..8a0c4ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; +def FeatureCUStores : SubtargetFeature<"cu-stores", + "HasCUStores", + "true", + "Whether SCOPE_CU stores can be used on GFX12.5" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -1383,6 +1389,9 @@ def FeatureAddSubU64Insts : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", "Has v_add_u64 and v_sub_u64 instructions">; +def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", + "true", "Has v_mad_u32 instruction">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1988,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet< def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, + FeatureCUStores, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, @@ -2042,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureVmemPrefInsts, FeatureLshlAddU64Inst, FeatureAddSubU64Insts, + FeatureMadU32Inst, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2422,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts : def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, - AssemblerPredicate<(any_of FeatureGFX90AInsts)>; + AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; @@ -2565,6 +2576,10 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasAddMinMaxInsts 
: + Predicate<"Subtarget->hasAddMinMaxInsts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasPkAddMinMaxInsts : Predicate<"Subtarget->hasPkAddMinMaxInsts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; @@ -2832,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; +def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, + AssemblerPredicate<(all_of FeatureMadU32Inst)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 4b3dc37..6681393 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + if (ST.isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } + if (isGFX1250(ST) && ST.hasCUStores()) { + KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; + } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index c01e5d3..992572f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -143,6 +143,9 @@ def gi_global_saddr_cpol : def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; +def gi_global_saddr_no_ioffset : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, + GIComplexPatternEquiv<GlobalSAddrNoIOffset>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index dfaa145..39b4200 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -2049,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 5636d89..983f1aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -174,6 +174,8 @@ private: bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f25ce87..6118933 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4846,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. 
- if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). 
- if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 266dee1..b0d3b12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; if (Subtarget->hasMADIntraFwdBug()) Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else if (UseNoCarry) + Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64 + : AMDGPU::V_MAD_NC_I64_I32_e64; else Opc = IsUnsigned ? 
AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + + if (UseNoCarry) + I.removeOperand(1); + I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); @@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { } unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; + if (!IsB32 && STI.hasTrue16BitInsts()) + Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64 + : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64; unsigned CBL = STI.getConstantBusLimit(Opc); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -5789,6 +5801,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; @@ -6971,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? 
(int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6986,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)(SISrcMods::OP_SEL_0) : 0); } @@ -7021,8 +7044,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + ? 
(int64_t)SISrcMods::DST_OP_SEL + : 0); } void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index fe9743d0a..140e753 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -264,6 +264,8 @@ private: selectGlobalSAddrCPol(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fedfa3f..50da8fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); if (ST.hasVOP3PInsts()) { - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElements(0, S16, 2) - .minScalar(0, S16) - .widenScalarToNextPow2(0) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder(G_ABS) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + if (ST.hasIntMinMax64()) { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, S64, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } } else { 
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16}) @@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); - if (ST.hasGFX90AInsts()) { + if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations // TODO: Move atomic expansion into legalizer @@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::scalar(32), commonAlignment(Align(64), Offset)); // Pointer address - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), StructOffset)); - B.buildPtrAdd(LoadAddr, QueuePtr, - B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); + B.buildObjectPtrOffset( + LoadAddr, QueuePtr, + B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, llvm_unreachable("failed to find kernarg segment ptr"); auto COffset = B.buildConstant(LLT::scalar(64), Offset); - // TODO: Should get nuw - return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); + return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0); } /// Legalize a value that's loaded from kernel arguments. 
This is only used by @@ -5676,8 +5693,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; - // FIXME: This should be nuw - B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + B.buildObjectPtrOffset(DstReg, KernargPtrReg, + B.buildConstant(IdxTy, Offset).getReg(0)); return true; } @@ -7019,8 +7036,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( // Pointer address Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); B.buildCopy(SGPR01, Temp); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index f471881..b45627d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, BasePlusOffset = Base; } else { auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); - BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); + BasePlusOffset = + B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0); } auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c5a1d9e..6bca2fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case 
AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_SMIN: - case AMDGPU::G_SMAX: - case AMDGPU::G_UMIN: - case AMDGPU::G_UMAX: case AMDGPU::G_ABS: case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: @@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: + if (isSALUMapping(MI)) { + // There are no scalar 64-bit min and max, use vector instruction instead. + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 && + Subtarget.hasIntMinMax64()) + return getDefaultMappingVOP(MI); + return getDefaultMappingSOP(MI); + } + return getDefaultMappingVOP(MI); case AMDGPU::G_FADD: case AMDGPU::G_FSUB: case AMDGPU::G_FMUL: @@ -4566,8 +4574,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pknorm_u16: case Intrinsic::amdgcn_cvt_pk_i16: case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_cvt_pk_fp8_f16: + case Intrinsic::amdgcn_cvt_pk_bf8_f16: + case Intrinsic::amdgcn_cvt_sr_fp8_f16: + case Intrinsic::amdgcn_cvt_sr_bf8_f16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -5364,6 +5377,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case 
Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..c1f1703 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,7 +104,9 @@ #include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" @@ -836,8 +838,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +920,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". 
// For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. @@ -2062,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // TODO: May want to move later or split into an early and late one. addPass(AMDGPUCodeGenPreparePass(TM)); - // TODO: LICM + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. + if (TM.getOptLevel() > CodeGenOptLevel::Less) { + addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true)); + } } Base::addIRPasses(addPass); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 421fc42..a4ea8cf 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -689,6 +689,8 @@ public: bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); } + bool isVISrcB32() const { return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); } @@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: @@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo setImmKindConst(); return; } + 
[[fallthrough]]; + + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: Inst.addOperand(MCOperand::createImm(Lo_32(Val))); setImmKindLiteral(); @@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16) return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm()); + if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16) + return false; + llvm_unreachable("invalid operand type"); } default: @@ -6066,6 +6076,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; + } else if (ID == ".amdhsa_uses_cu_stores") { + if (!isGFX1250()) + return Error(IDRange.Start, "directive requires gfx12.5", IDRange); + + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) @@ -9415,7 +9431,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) { + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 || + Opc == 
AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) { Inst.addOperand(Inst.getOperand(0)); } diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index f99e716..1956a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { } //===----------------------------------------------------------------------===// -// MUBUF - GFX11, GFX12. +// MUBUF - GFX11, GFX12, GFX1250. //===----------------------------------------------------------------------===// // gfx11 instruction that accept both old and new assembler name. @@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op, def : Mnem_gfx12<gfx11_name, gfx12_name>; } +multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>, + MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> { + def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>; +} + defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>; defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>; @@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; +defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>; +defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">; +defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">; + //===----------------------------------------------------------------------===// // MUBUF - GFX10. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 42edec0..c466f9c 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen Instrumentation MC MIRParser + ObjCARC Passes Scalar SelectionDAG diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 319cc9d..3ff675d 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; +defm DS_ADD_F64 : DS_Real_gfx12<0x054>; +defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>; + let AssemblerPredicate = HasLdsBarrierArriveAtomic in { defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>; defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 5c1989b..ffe6b06 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + if (isGFX1250()) + PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", + KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7207c25..d5d1074 100644 --- 
a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,6 +11,7 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>; def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; + def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; @@ -369,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { } } -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< +// Async loads, introduced in gfx1250, will store directly +// to a DS address in vdst (they will not use M0 for DS address). +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, 
[ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -1156,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : 
FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -1315,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + +class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $vaddr, $dsaddr, $offset, $cpol) +>; + +class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $saddr, $voffset, $dsaddr, $offset, $cpol) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) @@ -1525,6 +1592,26 @@ class 
ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + +multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatStoreLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat <inst, node, vt> { let AddedComplexity = 10; @@ -2091,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in { defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; } // End SubtargetPredicate = isGFX125xOnly +let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>; + + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = 
[HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -3374,12 +3473,29 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; +defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; + +defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">; +defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">; + def True16D16Table : GenericTable { let FilterClass = "True16D16Table"; let CppTypeName = "True16D16Info"; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
index 785ede3..bdd900d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -248,6 +248,7 @@ protected: bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; + bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -272,6 +273,7 @@ protected: bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; + bool HasMadU32Inst = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -714,7 +716,9 @@ public: bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } // DS_ADD_F64/DS_ADD_RTN_F64 - bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } + bool hasLdsAtomicAddF64() const { + return hasGFX90AInsts() || hasGFX1250Insts(); + } bool hasMultiDwordFlatScratchAddressing() const { return getGeneration() >= GFX9; @@ -998,6 +1002,8 @@ public: bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + bool hasCUStores() const { return HasCUStores; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1516,9 +1522,22 @@ public: // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + // \returns true if the target has V_MAD_U32 instruction. + bool hasMadU32Inst() const { return HasMadU32Inst; } + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 + // instructions. + bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + + // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. 
+ bool hasIntMinMax64() const { return GFX1250Insts; } + + // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. + bool hasAddMinMaxInsts() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 11b072e..15088ac 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; default: llvm_unreachable("bad operand type"); } @@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index c49ad79..f358084 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return 255; + case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: case AMDGPU::OPERAND_KIMM64: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 10f6d33..43ca548 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); + if (isGFX1250(STI)) + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, + ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 40b8bcd..c564145 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -208,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_V2BF16, OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, OPERAND_REG_IMM_V2FP32, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index b77da4d..e934152 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d51ec6..f4d7408 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); } + if (Subtarget->hasIntMinMax64()) + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64, + Legal); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, @@ -1256,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1527,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case 
Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; @@ -1623,10 +1666,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: @@ -4241,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = BaseAddr.getValue(1); Align StackAlign = TFL->getStackAlign(); if (Alignment > StackAlign) { - uint64_t ScaledAlignment = (uint64_t)Alignment.value() + uint64_t ScaledAlignment = Alignment.value() << Subtarget->getWavefrontSizeLog2(); uint64_t StackAlignMask = ScaledAlignment - 1; SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, @@ -15896,6 +15947,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to 
fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. 
+ if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15944,6 +16067,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff 
--git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 520c321..4b48fc4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1380,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the callee eventually calls createNewWaitcnt on it. We test this + // possibility in an artificial MIR test since such a situation cannot be + // recreated by running the memory legalizer. + II.eraseFromParent(); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1551,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet. 
+ II.eraseFromParent(); + continue; } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2415,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2aa6b4e..044a681 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4438,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { @@ -9281,6 +9283,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { default: if (MI.isMetaInstruction()) return 0; + + // If D16 Pseudo inst, get correct MC code size + const auto *D16Info = AMDGPU::getT16D16Helper(Opc); + if (D16Info) { + // Assume d16_lo/hi inst are always in same size + unsigned LoInstOpcode = D16Info->LoOp; + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode); + DescSize = Desc.getSize(); + } + return DescSize; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 83b0490..efcc88e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2859,6 +2859,7 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, 
untyped]>; +def VOP_I16_V2F16 : VOPProfile<[i16, v2f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; @@ -2926,6 +2927,7 @@ def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>; +def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index f1262e11..53f554e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (Pos == Position::AFTER) --MI; @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). 
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) @@ -2564,7 +2584,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) + // We also require SCOPE_SE minimum if we do not have the "cu-stores" feature. + if (Scope == CPol::SCOPE_CU && + (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) return setScope(MI, CPol::SCOPE_SE); return false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 218841d..36d1a3b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1218,6 +1218,8 @@ def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; + //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc..8303410 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a 
wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. + +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..5827f18 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. + return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { @@ -2652,6 +2659,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: @@ -3016,6 +3024,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return isInlinableLiteralV2BF16(Literal); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; default: llvm_unreachable("bad packed operand type"); 
} diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index c09a9d6..74d59f4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1636,6 +1636,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: return 2; default: diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 550ec9d..9de7d6d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1344,6 +1344,8 @@ def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; } // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] let SubtargetPredicate = HasPkFmacF16Inst in { +// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection. +// If this changes, ensure the DPP variant is not used for GFX11+. 
defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; } // End SubtargetPredicate = HasPkFmacF16Inst @@ -1904,7 +1906,7 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName, VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> : - VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>; + VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>; multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> : VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index b6f9568..2d3caec 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let HasExtDPP = 0; } -let HasExt64BitDPP = 1 in { -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; +def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +let HasExt64BitDPP = 1 in { +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; + class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { let HasExtVOP3DPP = 0; let HasExtDPP = 0; @@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>; def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>; - -def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { +def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> { let HasExtVOP3DPP = 0; - let HasExtDPP = 0; + let 
HasExt64BitDPP = 1; +} +def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>; +def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> { + let HasClamp = 1; } } // End HasExt64BitDPP = 1; @@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32 defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SchedRW = [WriteIntMul] in { + let SubtargetPredicate = HasMadU32Inst in + defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; + let SubtargetPredicate = isGFX1250Plus in { + defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; + defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; + } +} + let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; @@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f } // End SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { +defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>; +defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>; +defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>; +defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>; +} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] + } // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { @@ -722,6 +746,13 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, 
ReadsModeReg = 0 +let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; +} + defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; @@ -848,6 +879,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in + def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>; + def : GCNPat< (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; @@ -858,6 +892,13 @@ def : GCNPat< (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +let SubtargetPredicate = HasAddMinMaxInsts in { +def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +} + def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>; @@ -972,10 +1013,12 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim unsigned Val = N->getZExtValue(); unsigned New = 0; if (}] # modifier_idx # [{ == 0) { - New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) - : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); - } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) { - New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? 
(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) + : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); + } else if (}] # modifier_idx # [{== 1) { + New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + } if (}] # modifier_idx # [{== 2) { + New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; } return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1427,34 +1470,72 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = HasBitOp3Insts in { +let HasClamp = 0, HasModifiers = 1 in { +def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>; +def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>; +def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>; +} + +let OtherPredicates = [HasBitOp3Insts] in { let isReMaterializable = 1 in { - defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", - VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>; + let SubtargetPredicate = isGFX940Plus in + defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>; + let SubtargetPredicate = isGFX1250Plus in + defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile, + BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>, VOPD_Component<0x12, "v_bitop2_b32">; } + def : GCNPat< (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; def : GCNPat< - (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; - - def : GCNPat< (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 
VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; - def : GCNPat< - (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; -} // End SubtargetPredicate = HasBitOp3Insts + let SubtargetPredicate = isGFX940Plus in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } // End SubtargetPredicate = isGFX940Plus + + let SubtargetPredicate = isGFX1250Plus in { + let True16Predicate = UseFakeTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } + let True16Predicate = UseRealTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + } + } // End SubtargetPredicate = isGFX1250Plus + +} // End OtherPredicates = [HasBitOp3Insts] class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), @@ -1531,6 
+1612,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; let SubtargetPredicate = HasBF16ConversionInsts in { let ReadsModeReg = 0 in { defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>; + defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>; } def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)), (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>; @@ -1541,6 +1623,53 @@ let SubtargetPredicate = HasBF16ConversionInsts in { (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>; } +let Src0RC64 = VSrc_NoInline_v2f16 in { +def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; +def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; +def VOP3_CVT_PK_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_PK_F8_F16_Profile>; +} + +let ReadsModeReg = 0, IsPacked = 0, SubtargetPredicate = isGFX125xOnly in { + defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_fp8_f16>; + defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_bf8_f16>; +} + +let HasClamp = 0, HasOpSel = 1 in { +def VOP3_CVT_SR_F8_F16_Profile : VOP3_CVT_SR_F8_ByteSel_Profile<f16>; +def VOP3_CVT_SR_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_SR_F8_F16_Profile>; +def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_Profile>; +} + +let SubtargetPredicate = isGFX1250Plus in { + let ReadsModeReg = 0 in { + // These instructions have non-standard use of op_sel. 
They are using bits 2 and 3 of opsel + // to select a byte in the vdst. Bits 0 and 1 are unused. + let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + } + } // End ReadsModeReg = 0 + + let True16Predicate = UseRealTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; + } + let True16Predicate = UseFakeTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_fake16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_fake16_e64, f16>; + } +} // End SubtargetPredicate = isGFX1250Plus + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -1746,6 +1875,21 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">; +defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>; + +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; +defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; +defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>; +defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>; +defm V_MIN_I64 : 
VOP3Only_Realtriple_gfx1250<0x31a>; +defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>; +defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; +defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; +defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; +defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; + defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">; defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">; defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >; @@ -1918,7 +2062,14 @@ let AssemblerPredicate = isGFX11Plus in { // These instructions differ from GFX12 variant by supporting DPP: defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; +defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; +defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; +defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; +defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">; +defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">; +defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>; +defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>; //===----------------------------------------------------------------------===// // GFX10. 
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index c21e2d3..a029376 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -401,6 +401,19 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { let Inst{49-41} = src0; } +class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0); +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; @@ -1506,6 +1519,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1525,6 +1539,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1540,6 +1555,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1723,6 +1739,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> let Inst{14 - 8} = sdst; } +class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName> + : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> { 
+ bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + +class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName> + : Base_VOP3_DPP8_t16<op, p, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : Base_VOP3_DPP8<op, ps, opName> { bits<8> sdst; @@ -1943,6 +1987,29 @@ multiclass VOP3be_Realtriple< multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> : VOP3be_Realtriple<Gen, op, 1>; +multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> { + def _e64_dpp#Gen.Suffix : + VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>; +} + +multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64"); + def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> { + let DecoderNamespace = + Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16"); + let AssemblerPredicate = Gen.AssemblerPredicate; + } +} + +multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in { + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3a_BITOP3_gfx12<op, ps.Pfl>; + } +} + 
//===----------------------------------------------------------------------===// // VOP3 GFX11 //===----------------------------------------------------------------------===// @@ -2004,6 +2071,15 @@ multiclass VOP3Only_Real_Base_gfx1250<bits<10> op> : multiclass VOP3Only_Realtriple_gfx1250<bits<10> op, bit isSingle = 0> : VOP3_Realtriple<GFX1250Gen, op, isSingle>; +multiclass VOP3Only_Realtriple_with_name_gfx1250<bits<10> op, string opName, + string asmName, string pseudo_mnemonic = "", + bit isSingle = 0> : + VOP3_Realtriple_with_name<GFX1250Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + +multiclass VOP3Only_Realtriple_t16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3Only_Realtriple_with_name_gfx1250<op, opName, asmName, pseudo_mnemonic, isSingle>; + multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; @@ -2024,6 +2100,13 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>; } +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250<bits<10> op, + string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = ""> { + defm _t16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic>; + defm _fake16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic>; +} + multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName, string asmName, bit isSingle = 0> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); @@ -2046,6 +2129,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, 
asmName>, VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>; +multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : + VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>; + +multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> { + defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>; + defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>; +} + multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0, string opName = NAME> : VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 066b392..bd4b75f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2423,6 +2423,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool doesNotRet = CLI.DoesNotReturn; bool isVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2446,6 +2447,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, !Subtarget->noBTIAtReturnTwice()) GuardWithBTI = AFI->branchTargetEnforcement(); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Determine whether this is a non-secure function call. 
if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) isCmseNSCall = true; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 868556b..6dfe846 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1284,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } // Add the R_ARM_NONE fixup at the same position void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name); + visitUsedSymbol(*PersonalitySym); const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); - - visitUsedExpr(*PersonalityRef); - MCFragment *DF = getCurrentFragment(); - DF->addFixup( - MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4)); + addFixup(PersonalityRef, FK_Data_4); } void ARMELFStreamer::FlushPendingOffset() { diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a87b9a2..bed6bc9 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { return; } - // MapDef type may be a struct type or a non-pointer derived type - const DIType *OrigTy = Ty; - while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { - auto Tag = DTy->getTag(); - if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && - Tag != dwarf::DW_TAG_volatile_type && - Tag != dwarf::DW_TAG_restrict_type) - break; - Ty = DTy->getBaseType(); - } - - const auto *CTy = dyn_cast<DICompositeType>(Ty); - if (!CTy) - return; - - auto Tag = CTy->getTag(); - if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl()) - return; - - // Visit all struct members to ensure their types are visited. 
- const DINodeArray Elements = CTy->getElements(); - for (const auto *Element : Elements) { - const auto *MemberType = cast<DIDerivedType>(Element); - const DIType *MemberBaseType = MemberType->getBaseType(); - - // If the member is a composite type, that may indicate the currently - // visited composite type is a wrapper, and the member represents the - // actual map definition. - // In that case, visit the member with `visitMapDefType` instead of - // `visitTypeEntry`, treating it specifically as a map definition rather - // than as a regular composite type. - const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); - if (MemberCTy) { - visitMapDefType(MemberBaseType, TypeId); - } else { - visitTypeEntry(MemberBaseType); + uint32_t TmpId; + switch (Ty->getTag()) { + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_volatile_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_pointer_type: + visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_array_type: + // Visit nested map array and jump to the element type + visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_structure_type: { + // Visit all struct members to ensure their types are visited. + const auto *CTy = cast<DICompositeType>(Ty); + const DINodeArray Elements = CTy->getElements(); + for (const auto *Element : Elements) { + const auto *MemberType = cast<DIDerivedType>(Element); + const DIType *MemberBaseType = MemberType->getBaseType(); + // If the member is a composite type, that may indicate the currently + // visited composite type is a wrapper, and the member represents the + // actual map definition. + // In that case, visit the member with `visitMapDefType` instead of + // `visitTypeEntry`, treating it specifically as a map definition rather + // than as a regular composite type. 
+ const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); + if (MemberCTy) { + visitMapDefType(MemberBaseType, TmpId); + } else { + visitTypeEntry(MemberBaseType); + } } + break; + } + default: + break; } // Visit this type, struct or a const/typedef/volatile/restrict type - visitTypeEntry(OrigTy, TypeId, false, false); + visitTypeEntry(Ty, TypeId, false, false); } /// Read file contents from the actual file or from the source diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index d96136c..a5bf0e5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2621,9 +2621,38 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - if (isa<ConstantSDNode>(Op->getOperand(2))) + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = EltVT.getScalarSizeInBits(); + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + + if (isa<ConstantSDNode>(Op2)) return Op; - return SDValue(); + + MVT IdxTy = MVT::getIntegerVT(EltSizeInBits); + MVT IdxVTy = MVT::getVectorVT(IdxTy, NumElts); + + if (!isTypeLegal(VT) || !isTypeLegal(IdxVTy)) + return SDValue(); + + SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1); + SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2); + + SmallVector<SDValue, 32> RawIndices; + for (unsigned i = 0; i < NumElts; ++i) + RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices); + + // insert vec, elt, idx + // => + // select (splatidx == {0,1,2...}) ? 
splatelt : vec + SDValue SelectCC = + DAG.getSetCC(DL, IdxVTy, SplatIdx, Indices, ISD::CondCode::SETEQ); + return DAG.getNode(ISD::VSELECT, DL, VT, SelectCC, SplatElt, Op0); } SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op, diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index ec6b382..881ba8e 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3341,6 +3341,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -3397,8 +3398,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned StackSize = CCInfo.getStackSize(); - // Call site info for function parameters tracking. + // Call site info for function parameters tracking and call base type info. MachineFunction::CallSiteInfo CSInfo; + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); // Check if it's really possible to do a tail call. Restrict it to functions // that are part of this compilation unit. 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 65e7c56..95abcde 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm) { - doMulWide = (OptLevel > CodeGenOptLevel::None); -} + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<NVPTXSubtarget>(); @@ -145,18 +143,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStoreVector(N)) return; break; - case NVPTXISD::LoadParam: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadParamV4: - if (tryLoadParam(N)) - return; - break; - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - if (tryStoreParam(N)) - return; - break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; @@ -1462,267 +1448,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return true; } -bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Offset = Node->getOperand(2); - SDValue Glue = Node->getOperand(3); - SDLoc DL(Node); - MemSDNode *Mem = cast<MemSDNode>(Node); - - unsigned VecSize; - switch (Node->getOpcode()) { - default: - return false; - case NVPTXISD::LoadParam: - VecSize = 1; - break; - case NVPTXISD::LoadParamV2: - VecSize = 2; - break; - case NVPTXISD::LoadParamV4: - VecSize = 4; - break; - } - - EVT EltVT = Node->getValueType(0); - EVT MemVT = Mem->getMemoryVT(); - - std::optional<unsigned> Opcode; - - switch (VecSize) { - default: - return false; - case 1: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, - NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64); - 
break; - case 2: - Opcode = - pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, - NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, - NVPTX::LoadParamMemV2I64); - break; - case 4: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, - NVPTX::LoadParamMemV4I32, {/* no v4i64 */}); - break; - } - if (!Opcode) - return false; - - SDVTList VTs; - if (VecSize == 1) { - VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); - } else if (VecSize == 2) { - VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); - } else { - EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(EVTs); - } - - unsigned OffsetVal = Offset->getAsZExtVal(); - - SmallVector<SDValue, 2> Ops( - {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); - return true; -} - -// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri) -#define getOpcV2H(ty, opKind0, opKind1) \ - NVPTX::StoreParamV2##ty##_##opKind0##opKind1 - -#define getOpcV2H1(ty, opKind0, isImm1) \ - (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r) - -#define getOpcodeForVectorStParamV2(ty, isimm) \ - (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1]) - -#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \ - NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3 - -#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \ - (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \ - : getOpcV4H(ty, opKind0, opKind1, opKind2, r) - -#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \ - (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \ - : getOpcV4H3(ty, opKind0, opKind1, r, isImm3) - -#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \ - (isImm1) ? 
getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \ - : getOpcV4H2(ty, opKind0, r, isImm2, isImm3) - -#define getOpcodeForVectorStParamV4(ty, isimm) \ - (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \ - : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3]) - -#define getOpcodeForVectorStParam(n, ty, isimm) \ - (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \ - : getOpcodeForVectorStParamV4(ty, isimm) - -static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, - unsigned NumElts, - MVT::SimpleValueType MemTy, - SelectionDAG *CurDAG, SDLoc DL) { - // Determine which inputs are registers and immediates make new operators - // with constant values - SmallVector<bool, 4> IsImm(NumElts, false); - for (unsigned i = 0; i < NumElts; i++) { - IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i])); - if (IsImm[i]) { - SDValue Imm = Ops[i]; - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[i] = Imm; - } - } - - // Get opcode for MemTy, size, and register/immediate operand ordering - switch (MemTy) { - case MVT::i8: - return getOpcodeForVectorStParam(NumElts, I8, IsImm); - case MVT::i16: - return getOpcodeForVectorStParam(NumElts, I16, IsImm); - case MVT::i32: - return getOpcodeForVectorStParam(NumElts, I32, IsImm); - case MVT::i64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(I64, IsImm); - case MVT::f32: - return getOpcodeForVectorStParam(NumElts, F32, IsImm); - case MVT::f64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(F64, IsImm); - - // 
These cases don't support immediates, just use the all register version - // and generate moves. - case MVT::i1: - return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr - : NVPTX::StoreParamV4I8_rrrr; - case MVT::f16: - case MVT::bf16: - return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr - : NVPTX::StoreParamV4I16_rrrr; - case MVT::v2f16: - case MVT::v2bf16: - case MVT::v2i16: - case MVT::v4i8: - return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr - : NVPTX::StoreParamV4I32_rrrr; - default: - llvm_unreachable("Cannot select st.param for unknown MemTy"); - } -} - -bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - SDValue Param = N->getOperand(1); - unsigned ParamVal = Param->getAsZExtVal(); - SDValue Offset = N->getOperand(2); - unsigned OffsetVal = Offset->getAsZExtVal(); - MemSDNode *Mem = cast<MemSDNode>(N); - SDValue Glue = N->getOperand(N->getNumOperands() - 1); - - // How many elements do we have? - unsigned NumElts; - switch (N->getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case NVPTXISD::StoreParam: - NumElts = 1; - break; - case NVPTXISD::StoreParamV2: - NumElts = 2; - break; - case NVPTXISD::StoreParamV4: - NumElts = 4; - break; - } - - // Build vector of operands - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(N->getOperand(i + 3)); - Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), - CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - // Determine target opcode - // If we have an i1, use an 8-bit store. The lowering code in - // NVPTXISelLowering will have already emitted an upcast. 
- std::optional<unsigned> Opcode; - switch (NumElts) { - default: - llvm_unreachable("Unexpected NumElts"); - case 1: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - SDValue Imm = Ops[0]; - if (MemTy != MVT::f16 && MemTy != MVT::bf16 && - (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) { - // Convert immediate to target constant - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[0] = Imm; - // Use immediate version of store param - Opcode = - pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i, - NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i); - } else - Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, - NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r, - NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r); - if (Opcode == NVPTX::StoreParamI8_r) { - // Fine tune the opcode depending on the size of the operand. - // This helps to avoid creating redundant COPY instructions in - // InstrEmitter::AddRegisterOperand(). 
- switch (Ops[0].getSimpleValueType().SimpleTy) { - default: - break; - case MVT::i32: - Opcode = NVPTX::StoreParamI8TruncI32_r; - break; - case MVT::i64: - Opcode = NVPTX::StoreParamI8TruncI64_r; - break; - } - } - break; - } - case 2: - case 4: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL); - break; - } - } - - SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); - MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); - - ReplaceNode(N, Ret); - return true; -} - /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b99b4ef..9e0f88e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -40,9 +40,6 @@ private: class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { const NVPTXTargetMachine &TM; - // If true, generate mul.wide from sext and mul - bool doMulWide; - NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const; bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; @@ -78,8 +75,6 @@ private: bool tryLDG(MemSDNode *N); bool tryStore(SDNode *N); bool tryStoreVector(SDNode *N); - bool tryLoadParam(SDNode *N); - bool tryStoreParam(SDNode *N); bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryBFE(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ddcecc00..4fd3623 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -843,7 +843,7 @@ 
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT, ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, - ISD::STORE}); + ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) MAKE_CASE(NVPTXISD::CALL) - MAKE_CASE(NVPTXISD::LoadParam) - MAKE_CASE(NVPTXISD::LoadParamV2) - MAKE_CASE(NVPTXISD::LoadParamV4) - MAKE_CASE(NVPTXISD::StoreParam) - MAKE_CASE(NVPTXISD::StoreParamV2) - MAKE_CASE(NVPTXISD::StoreParamV4) MAKE_CASE(NVPTXISD::MoveParam) MAKE_CASE(NVPTXISD::UNPACK_VECTOR) MAKE_CASE(NVPTXISD::BUILD_VECTOR) @@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } -static bool adjustElementType(EVT &ElementType) { - switch (ElementType.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::f16: - case MVT::bf16: - ElementType = MVT::i16; - return true; - case MVT::f32: - case MVT::v2f16: - case MVT::v2bf16: - ElementType = MVT::i32; - return true; - case MVT::f64: - ElementType = MVT::i64; - return true; - } -} - -// Use byte-store when the param address of the argument value is unaligned. -// This may happen when the return value is a field of a packed structure. -// -// This is called in LowerCall() when passing the param values. 
-static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, - uint64_t Offset, EVT ElementType, - SDValue StVal, SDValue &InGlue, - unsigned ArgID, const SDLoc &dl) { - // Bit logic only works on integer types - if (adjustElementType(ElementType)) - StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); - - // Store each byte - SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - // Shift the byte to the last byte position - SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, - DAG.getConstant(i * 8, dl, MVT::i32)); - SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - ShiftVal, InGlue}; - // Trunc store only the last byte by using - // st.param.b8 - // The register type can be larger than b8. - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, - MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); - } - return Chain; -} - -// Use byte-load when the param adress of the returned value is unaligned. -// This may happen when the returned value is a field of a packed structure. -static SDValue -LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, - EVT ElementType, SDValue &InGlue, - SmallVectorImpl<SDValue> &TempProxyRegOps, - const SDLoc &dl) { - // Bit logic only works on integer types - EVT MergedType = ElementType; - adjustElementType(MergedType); - - // Load each byte and construct the whole value. 
Initial value to 0 - SDValue RetVal = DAG.getConstant(0, dl, MergedType); - // LoadParamMemI8 loads into i16 register only - SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - InGlue}; - // This will be selected to LoadParamMemI8 - SDValue LdVal = - DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, - MVT::i8, MachinePointerInfo(), Align(1)); - SDValue TmpLdVal = LdVal.getValue(0); - Chain = LdVal.getValue(1); - InGlue = LdVal.getValue(2); - - TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, - TmpLdVal.getSimpleValueType(), TmpLdVal); - TempProxyRegOps.push_back(TmpLdVal); - - SDValue CMask = DAG.getConstant(255, dl, MergedType); - SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); - // Need to extend the i16 register to the whole width. - TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); - // Mask off the high bits. Leave only the lower 8bits. - // Do this because we are using loadparam.b8. 
- TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); - // Shift and merge - TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); - RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); - } - if (ElementType != MergedType) - RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); - - return RetVal; -} - static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func) { if (!Func) @@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc dl = CLI.DL; - SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; + const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Callee = CLI.Callee; - bool &isTailCall = CLI.IsTailCall; ArgListTy &Args = CLI.getArgs(); Type *RetTy = CLI.RetTy; const CallBase *CB = CLI.CB; @@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return DAG.getConstant(I, dl, MVT::i32); }; + const unsigned UniqueCallSite = GlobalUniqueCallSite++; + const SDValue CallChain = CLI.Chain; + const SDValue StartChain = + DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl); + SDValue DeclareGlue = StartChain.getValue(1); + + SmallVector<SDValue, 16> CallPrereqs{StartChain}; + + const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) { + // PTX ABI requires integral types to be at least 32 bits in size. FP16 is + // loaded/stored using i16, so it's handled here as well. 
+ const unsigned SizeBits = promoteScalarArgumentSize(Size * 8); + SDValue Declare = + DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(SizeBits), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + + const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align, + unsigned Size) { + SDValue Declare = DAG.getNode( + NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + // Variadic arguments. // // Normally, for each argument, we declare a param scalar or a param @@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // After all vararg is processed, 'VAOffset' holds the size of the // vararg byte array. + assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) && + "Non-VarArg function with extra arguments"); - SDValue VADeclareParam; // vararg byte array const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic - unsigned VAOffset = 0; // current offset in the param array + unsigned VAOffset = 0; // current offset in the param array - const unsigned UniqueCallSite = GlobalUniqueCallSite++; - SDValue TempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); - SDValue InGlue = Chain.getValue(1); + const SDValue VADeclareParam = + CLI.Args.size() > FirstVAArg + ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32), + Align(STI.getMaxRequiredAlignment()), 0) + : SDValue(); // Args.size() and Outs.size() need not match. 
// Outs.size() will be larger @@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); - const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> { - if (IsVAArg) { - if (ArgI == FirstVAArg) { - VADeclareParam = DAG.getNode( - NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), - GetI32(0), InGlue}); - return VADeclareParam; - } - return std::nullopt; - } - if (IsByVal || shouldPassAsArray(Arg.Ty)) { - // declare .param .align <align> .b8 .param<n>[<size>]; - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(ArgAlign.value()), - GetI32(TypeSize), InGlue}); - } + const SDValue ArgDeclare = [&]() { + if (IsVAArg) + return VADeclareParam; + + if (IsByVal || shouldPassAsArray(Arg.Ty)) + return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize); + assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); - // declare .param .b<size> .param<n>; - - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - const unsigned PromotedSize = - (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) - ? 
promoteScalarArgumentSize(TypeSize * 8) - : TypeSize * 8; - - return DAG.getNode(NVPTXISD::DeclareScalarParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); + assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) && + "Only int and float types are supported as non-array arguments"); + + return MakeDeclareScalarParam(ParamSymbol, TypeSize); }(); - if (ArgDeclare) { - Chain = ArgDeclare->getValue(0); - InGlue = ArgDeclare->getValue(1); - } // PTX Interoperability Guide 3.3(A): [Integer] Values shorter // than 32-bits are sign extended or zero extended, depending on @@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; const auto GetStoredValue = [&](const unsigned I, EVT EltVT, - const Align PartAlign) { - SDValue StVal; + const MaybeAlign PartAlign) { if (IsByVal) { SDValue Ptr = ArgOutVals[0]; auto MPI = refinePtrAS(Ptr, DAG, DL, *this); SDValue SrcAddr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I])); - StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign); - } else { - StVal = ArgOutVals[I]; - - auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType()); - if (PromotedVT != StVal.getValueType()) { - StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT, - StVal); - } + return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign); } + SDValue StVal = ArgOutVals[I]; + assert(promoteScalarIntegerPTX(StVal.getValueType()) == + StVal.getValueType() && + "OutVal type should always be legal"); - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = - DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. 
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - return StVal; + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT StoreVT = + ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); + + return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); }; const auto VectorInfo = @@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned J = 0; for (const unsigned NumElts : VectorInfo) { const int CurOffset = Offsets[J]; - EVT EltVT = promoteScalarIntegerPTX(VTs[J]); - const Align PartAlign = commonAlignment(ArgAlign, CurOffset); - - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar store. In such cases, fall back to byte stores. - if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) { - - SDValue StVal = GetStoredValue(J, EltVT, PartAlign); - Chain = LowerUnalignedStoreParam(DAG, Chain, - CurOffset + (IsByVal ? VAOffset : 0), - EltVT, StVal, InGlue, ArgI, dl); - - // LowerUnalignedStoreParam took care of inserting the necessary nodes - // into the SDAG, so just move on to the next element. - J++; - continue; - } + const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); if (IsVAArg && !IsByVal) // Align each part of the variadic argument to their type. @@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((IsVAArg || VAOffset == 0) && "VAOffset must be 0 for non-VA args"); - SmallVector<SDValue, 6> StoreOperands{ - Chain, GetI32(IsVAArg ? FirstVAArg : ArgI), - GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))}; - // Record the values to store. - for (const unsigned K : llvm::seq(NumElts)) - StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign)); - StoreOperands.push_back(InGlue); + const unsigned Offset = + (VAOffset + ((IsVAArg && !IsByVal) ? 
0 : CurOffset)); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); + const MaybeAlign CurrentAlign = ExtendIntegerParam + ? MaybeAlign(std::nullopt) + : commonAlignment(ArgAlign, Offset); + + SDValue Val; + if (NumElts == 1) { + Val = GetStoredValue(J, EltVT, CurrentAlign); + } else { + SmallVector<SDValue, 8> StoreVals; + for (const unsigned K : llvm::seq(NumElts)) { + SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign); + if (ValJ.getValueType().isVector()) + DAG.ExtractVectorElements(ValJ, StoreVals); + else + StoreVals.push_back(ValJ); + } + + EVT VT = EVT::getVectorVT( + *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size()); + Val = DAG.getBuildVector(VT, dl, StoreVals); } - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), PartAlign, - MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, Val, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); + CallPrereqs.push_back(StoreParam); // TODO: We may need to support vector types that can be passed // as scalars in variadic arguments. if (IsVAArg && !IsByVal) { assert(NumElts == 1 && "Vectorization is expected to be disabled for variadics."); + const EVT TheStoreType = ExtendIntegerParam ? 
MVT::i32 : EltVT; VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext())); } @@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VAOffset += TypeSize; } - GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); - // Handle Result if (!Ins.empty()) { - const SDValue RetDeclare = [&]() { - const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); - const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); - if (shouldPassAsArray(RetTy)) { - const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(RetAlign.value()), - GetI32(ResultSize / 8), InGlue}); - } - const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize); - return DAG.getNode( - NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); - }(); - Chain = RetDeclare.getValue(0); - InGlue = RetDeclare.getValue(1); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const unsigned ResultSize = DL.getTypeAllocSize(RetTy); + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize); + } else { + MakeDeclareScalarParam(RetSymbol, ResultSize); + } } - const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); // Set the size of the vararg param byte array if the callee is a variadic // function and the variadic part is not empty. 
- if (HasVAArgs) { + if (VADeclareParam) { SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), VADeclareParam.getOperand(2), GetI32(VAOffset), @@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VADeclareParam->getVTList(), DeclareParamOps); } + const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); // If the type of the callsite does not match that of the function, convert // the callsite to an indirect call. const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func); @@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // instruction. // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. + const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); std::string Proto = getPrototype(DL, RetTy, Args, CLI.Outs, HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); - Chain = DAG.getNode( - NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, - {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue}); - InGlue = Chain.getValue(1); + const SDValue PrototypeDeclare = DAG.getNode( + NVPTXISD::CallPrototype, dl, MVT::Other, + {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)}); + CallPrereqs.push_back(PrototypeDeclare); } if (ConvertToIndirectCall) { @@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const unsigned NumArgs = std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size()); /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) - Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), - GetI32(Ins.empty() ? 
0 : 1), GetI32(NumArgs), Callee, - GetI32(Proto), InGlue}); - InGlue = Chain.getValue(1); - + /// NumParams, Callee, Proto) + const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs); + const SDValue Call = DAG.getNode( + NVPTXISD::CALL, dl, MVT::Other, + {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), + GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)}); + + SmallVector<SDValue, 16> LoadChains{Call}; SmallVector<SDValue, 16> ProxyRegOps; - // An item of the vector is filled if the element does not need a ProxyReg - // operation on it and should be added to InVals as is. ProxyRegOps and - // ProxyRegTruncates contain empty/none items at the same index. - SmallVector<SDValue, 16> RetElts; - // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` - // to use the values of `LoadParam`s and to be replaced later then - // `CALLSEQ_END` is added. - SmallVector<SDValue, 16> TempProxyRegOps; - - // Generate loads from param memory/moves from registers for result if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; @@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned VectorizedSize : VectorInfo) { - EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]); - EVT EltType = Ins[I].VT; - const Align EltAlign = commonAlignment(RetAlign, Offsets[I]); - - if (TheLoadType != VTs[I]) - EltType = TheLoadType; - - if (ExtendIntegerRetVal) { - TheLoadType = MVT::i32; - EltType = MVT::i32; - } else if (TheLoadType.getSizeInBits() < 16) { - EltType = MVT::i16; - } + for (const unsigned NumElts : VectorInfo) { + const MaybeAlign CurrentAlign = + ExtendIntegerRetVal ? MaybeAlign(std::nullopt) + : commonAlignment(RetAlign, Offsets[I]); - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar load. 
In such cases, fall back to byte loads. - if (VectorizedSize == 1 && RetTy->isAggregateType() && - EltAlign < DAG.getEVTAlign(TheLoadType)) { - SDValue Ret = LowerUnalignedLoadRetParam( - DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl); - ProxyRegOps.push_back(SDValue()); - RetElts.resize(I); - RetElts.push_back(Ret); - - I++; - continue; - } + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT LoadVT = + ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); - SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType); - LoadVTs.append({MVT::Other, MVT::Glue}); + const unsigned PackingAmt = + LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - NVPTXISD::NodeType Op; - switch (VectorizedSize) { - case 1: - Op = NVPTXISD::LoadParam; - break; - case 2: - Op = NVPTXISD::LoadParamV2; - break; - case 4: - Op = NVPTXISD::LoadParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + const EVT VecVT = NumElts == 1 ? LoadVT + : EVT::getVectorVT(*DAG.getContext(), + LoadVT.getScalarType(), + NumElts * PackingAmt); - SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue}; - SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); - for (const unsigned J : llvm::seq(VectorizedSize)) { - ProxyRegOps.push_back(RetVal.getValue(J)); - } + SDValue R = + DAG.getLoad(VecVT, dl, Call, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); - Chain = RetVal.getValue(VectorizedSize); - InGlue = RetVal.getValue(VectorizedSize + 1); + LoadChains.push_back(R.getValue(1)); - I += VectorizedSize; + if (NumElts == 1) + ProxyRegOps.push_back(R); + else + for (const unsigned J : llvm::seq(NumElts)) { + SDValue Elt = DAG.getNode( + LoadVT.isVector() 
? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT, + dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl)); + ProxyRegOps.push_back(Elt); + } + I += NumElts; } } - Chain = - DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); - InGlue = Chain.getValue(1); + const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains); + const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite, + UniqueCallSite + 1, SDValue(), dl); // Append ProxyReg instructions to the chain to make sure that `callseq_end` // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. - for (const unsigned I : llvm::seq(ProxyRegOps.size())) { - if (I < RetElts.size() && RetElts[I]) { - InVals.push_back(RetElts[I]); - continue; - } - - SDValue Ret = - DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(), - {Chain, ProxyRegOps[I]}); - - const EVT ExpectedVT = Ins[I].VT; - if (!Ret.getValueType().bitsEq(ExpectedVT)) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret); - } + for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) { + SDValue Proxy = + DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg}); + SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl); InVals.push_back(Ret); } - for (SDValue &T : TempProxyRegOps) { - SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(), - {Chain, T.getOperand(0)}); - DAG.ReplaceAllUsesWith(T, Repl); - DAG.RemoveDeadNode(T.getNode()); - } - - // set isTailCall to false for now, until we figure out how to express + // set IsTailCall to false for now, until we figure out how to express // tail call optimization in PTX - isTailCall = false; - return Chain; + CLI.IsTailCall = false; + return CallEnd; } SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, @@ -5117,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Operands.push_back(DCI.DAG.getIntPtrConstant( 
cast<LoadSDNode>(LD)->getExtensionType(), DL)); break; - case NVPTXISD::LoadParamV2: - OldNumOutputs = 2; - Opcode = NVPTXISD::LoadParamV4; - break; case NVPTXISD::LoadV2: OldNumOutputs = 2; Opcode = NVPTXISD::LoadV4; @@ -5201,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV2; break; - case NVPTXISD::StoreParam: - Opcode = NVPTXISD::StoreParamV2; - break; - case NVPTXISD::StoreParamV2: - Opcode = NVPTXISD::StoreParamV4; - break; case NVPTXISD::StoreV2: MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV4; @@ -5218,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, return SDValue(); Opcode = NVPTXISD::StoreV8; break; - case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands return SDValue(); @@ -5263,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT, ST->getMemOperand()); } -static SDValue PerformStoreCombineHelper(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - unsigned Front, unsigned Back) { - if (all_of(N->ops().drop_front(Front).drop_back(Back), - [](const SDUse &U) { return U.get()->isUndef(); })) - // Operand 0 is the previous value in the chain. Cannot return EntryToken - // as the previous value will become unused and eliminated later. - return N->getOperand(0); - - return combinePackingMovIntoStore(N, DCI, Front, Back); -} - static SDValue PerformStoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return combinePackingMovIntoStore(N, DCI, 1, 2); } -static SDValue PerformStoreParamCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // Operands from the 3rd to the 2nd last one are the values to be stored. - // {Chain, ArgID, Offset, Val, Glue} - return PerformStoreCombineHelper(N, DCI, 3, 1); -} - /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
/// static SDValue PerformADDCombine(SDNode *N, @@ -5432,6 +5219,42 @@ static SDValue PerformREMCombine(SDNode *N, return SDValue(); } +// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y) +static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.hasOneUse()) + return SDValue(); + EVT ToVT = N->getValueType(0); + EVT FromVT = Op.getValueType(); + if (!((ToVT == MVT::i32 && FromVT == MVT::i16) || + (ToVT == MVT::i64 && FromVT == MVT::i32))) + return SDValue(); + if (!(Op.getOpcode() == ISD::MUL || + (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1))))) + return SDValue(); + + SDLoc DL(N); + unsigned ExtOpcode = N->getOpcode(); + unsigned Opcode = 0; + if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_SIGNED; + else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_UNSIGNED; + else + return SDValue(); + SDValue RHS = Op.getOperand(1); + if (Op.getOpcode() == ISD::SHL) { + const auto ShiftAmt = Op.getConstantOperandVal(1); + const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, DL, ToVT); + } + return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -5942,6 +5765,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, N->getConstantOperandAPInt(2), N->getConstantOperandVal(3)), SDLoc(N), N->getValueType(0)); + return SDValue(); +} + +// During call lowering we wrap the return values in a ProxyReg node which +// depend on the chain value produced by the completed call. This ensures that +// the full call is emitted in cases where libcalls are used to legalize +// operations. 
To improve the functioning of other DAG combines we pull all +// operations we can through one of these nodes, ensuring that the ProxyReg +// directly wraps a load. That is: +// +// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0))) +// +static SDValue sinkProxyReg(SDValue R, SDValue Chain, + TargetLowering::DAGCombinerInfo &DCI) { + switch (R.getOpcode()) { + case ISD::TRUNCATE: + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::BITCAST: { + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V); + return SDValue(); + } + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + case ISD::OR: { + if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI)) + if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B); + return SDValue(); + } + case ISD::Constant: + return R; + case ISD::LOAD: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: { + return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(), + {Chain, R}); + } + case ISD::BUILD_VECTOR: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SmallVector<SDValue, 16> Ops; + for (auto &Op : R->ops()) { + SDValue V = sinkProxyReg(Op, Chain, DCI); + if (!V) + return SDValue(); + Ops.push_back(V); + } + return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops); + } + case ISD::EXTRACT_VECTOR_ELT: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R), + R.getValueType(), V, R.getOperand(1)); + return SDValue(); + } + default: + return SDValue(); + } +} + +static SDValue combineProxyReg(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + // If the ProxyReg is not wrapping a load, try to pull the 
operations through + // the ProxyReg. + if (Reg.getOpcode() != ISD::LOAD) { + if (SDValue V = sinkProxyReg(Reg, Chain, DCI)) + return V; + } return SDValue(); } @@ -5958,6 +5861,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return combineADDRSPACECAST(N, DCI); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return combineMulWide(N, DCI, OptLevel); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: @@ -5965,7 +5871,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); case ISD::LOAD: - case NVPTXISD::LoadParamV2: case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: return combineUnpackingMovIntoLoad(N, DCI); @@ -5973,6 +5878,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformMULCombine(N, DCI, OptLevel); case NVPTXISD::PRMT: return combinePRMT(N, DCI, OptLevel); + case NVPTXISD::ProxyReg: + return combineProxyReg(N, DCI); case ISD::SETCC: return PerformSETCCCombine(N, DCI, STI.getSmVersion()); case ISD::SHL: @@ -5980,10 +5887,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SREM: case ISD::UREM: return PerformREMCombine(N, DCI, OptLevel); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); case ISD::STORE: case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -6332,6 +6235,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, Results.push_back(NewValue.getValue(3)); } +static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType()); + + SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT); + SDValue NewProxy = + DAG.getNode(NVPTXISD::ProxyReg, 
SDLoc(N), VT, {Chain, NewReg}); + SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0)); + + Results.push_back(Res); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6349,6 +6268,9 @@ void NVPTXTargetLowering::ReplaceNodeResults( case ISD::CopyFromReg: ReplaceCopyFromReg_128(N, DAG, Results); return; + case NVPTXISD::ProxyReg: + replaceProxyReg(N, DAG, *this, Results); + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 228e2aa..cf72a1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -38,7 +38,7 @@ enum NodeType : unsigned { /// This node represents a PTX call instruction. It's operands are as follows: /// /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) CALL, MoveParam, @@ -84,13 +84,7 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LoadParam, - LoadParamV2, - LoadParamV4, - StoreParam, - StoreParamV2, - StoreParamV4, - LAST_MEMORY_OPCODE = StoreParamV4, + LAST_MEMORY_OPCODE = StoreV8, }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 442b900..6000b40 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -125,8 +125,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; def doRsqrtOpt : Predicate<"doRsqrtOpt()">; -def doMulWide : Predicate<"doMulWide">; - def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; @@ -836,36 +834,28 @@ def MULWIDES64 : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">; def MULWIDES64Imm : BasicNVPTXInst<(outs 
B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">; -def MULWIDES64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">; def MULWIDEU64 : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">; def MULWIDEU64Imm : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">; -def MULWIDEU64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">; def MULWIDES32 : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">; def MULWIDES32Imm : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">; -def MULWIDES32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">; def MULWIDEU32 : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">; def MULWIDEU32Imm : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">; -def MULWIDEU32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">; -def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; -def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; -def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; +def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; +def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>; +def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>; // Matchers for signed, unsigned mul.wide ISD nodes. 
-let Predicates = [doMulWide] in { +let Predicates = [hasOptEnabled] in { def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>; def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>; def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>; @@ -877,85 +867,6 @@ let Predicates = [doMulWide] in { def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>; } -// Predicates used for converting some patterns to mul.wide. -def SInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(32); -}]>; - -def UInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(32); -}]>; - -def SInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(16); -}]>; - -def UInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(16); -}]>; - -def IntConst_0_30 : PatLeaf<(imm), [{ - // Check if 0 <= v < 31; only then will the result of (x << v) be an int32. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(31); -}]>; - -def IntConst_0_14 : PatLeaf<(imm), [{ - // Check if 0 <= v < 15; only then will the result of (x << v) be an int16. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(15); -}]>; - -def SHL2MUL32 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(32, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32); -}]>; - -def SHL2MUL16 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(16, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); -}]>; - -// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. 
-let Predicates = [doMulWide] in { - def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDES64Imm $a, (SHL2MUL32 $b))>; - def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDEU64Imm $a, (SHL2MUL32 $b))>; - - def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDES32Imm $a, (SHL2MUL16 $b))>; - def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDEU32Imm $a, (SHL2MUL16 $b))>; - - // Convert "sign/zero-extend then multiply" to mul.wide. - def : Pat<(mul (sext i32:$a), (sext i32:$b)), - (MULWIDES64 $a, $b)>; - def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>; - - def : Pat<(mul (zext i32:$a), (zext i32:$b)), - (MULWIDEU64 $a, $b)>; - def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>; - - def : Pat<(mul (sext i16:$a), (sext i16:$b)), - (MULWIDES32 $a, $b)>; - def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>; - - def : Pat<(mul (zext i16:$a), (zext i16:$b)), - (MULWIDEU32 $a, $b)>; - def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>; -} - // // Integer multiply-add // @@ -991,6 +902,39 @@ defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>; defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>; } +multiclass MAD_WIDE<string PtxSuffix, OneUse2 Op, RegTyInfo BigT, RegTyInfo SmallT> { + def rrr: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.RC:$b, BigT.RC:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), BigT.Ty:$c))]>; + def rri: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.RC:$b, BigT.Imm:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), imm:$c))]>; + def rir: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.RC:$c), + "mad.wide." 
# PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), BigT.Ty:$c))]>; + def rii: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.Imm:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), imm:$c))]>; +} + +def mul_wide_unsigned_oneuse : OneUse2<mul_wide_unsigned>; +def mul_wide_signed_oneuse : OneUse2<mul_wide_signed>; + +let Predicates = [hasOptEnabled] in { +defm MAD_WIDE_U16 : MAD_WIDE<"u16", mul_wide_unsigned_oneuse, I32RT, I16RT>; +defm MAD_WIDE_S16 : MAD_WIDE<"s16", mul_wide_signed_oneuse, I32RT, I16RT>; +defm MAD_WIDE_U32 : MAD_WIDE<"u32", mul_wide_unsigned_oneuse, I64RT, I32RT>; +defm MAD_WIDE_S32 : MAD_WIDE<"s32", mul_wide_signed_oneuse, I64RT, I32RT>; +} + foreach t = [I16RT, I32RT, I64RT] in { def NEG_S # t.Size : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), @@ -1516,20 +1460,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN // Byte extraction via shift/trunc/sext -def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), - (CVT_s8_s32 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), +def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>; + +def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>; + +def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), - (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), + (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; + def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))), (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), - (BFE_S64rii $s, imm:$o, 
8)>; -def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), - (CVT_s8_s64 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), - (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; //----------------------------------- // Comparison instructions (setp, set) @@ -1713,56 +1656,39 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; //----------------------------------- // Comparison and Selection //----------------------------------- +// TODO: These patterns seem very specific and brittle. We should try to find +// a more general solution. def cond_signed : PatLeaf<(cond), [{ return isSignedIntSetCC(N->get()); }]>; -def cond_not_signed : PatLeaf<(cond), [{ - return !isSignedIntSetCC(N->get()); -}]>; +// A 16-bit signed comparison of sign-extended byte extracts can be converted +// to 32-bit comparison if we change the PRMT to sign-extend the extracted +// bytes. +def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), + (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), + cond_signed:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE), + (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE), + (cond2cc $cc))>; + +// A 16-bit comparison of truncated byte extracts can be converted to 32-bit +// comparison because we know that the truncate is just truncating off zeros +// and that the most-significant byte is also zeros so the meaning of signed and +// unsigned comparisons will not be changed. 
+def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), + (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), + cond:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -// comparisons of i8 extracted with PRMT as i32 -// It's faster to do comparison directly on i32 extracted by PRMT, -// instead of the long conversion and sign extending. -def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)), - (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), - (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_not_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, 
SDTCisVT<2, i32>]>; def SDTDeclareScalarParam : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; -def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -1774,104 +1700,20 @@ def declare_array_param : def declare_scalar_param : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -def LoadParam : - SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : - SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : - SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def StoreParam : - SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : - SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : - SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; def proxy_reg : SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, 
Proto, InGlue) + /// NumParams, Callee, Proto) def SDTCallProfile : SDTypeProfile<0, 6, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<5, i32>]>; -def call : - SDNode<"NVPTXISD::CALL", SDTCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -let mayLoad = true in { - class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"), - []>; - - class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b), - !strconcat("ld.param.v2", opstr, - " \t{{$dst, $dst2}}, [retval0$b];"), []>; - - class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins Offseti32imm:$b), - !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"), - []>; -} - -let mayStore = true in { - - multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> { - foreach op = [IMMType, regclass] in - if !or(support_imm, !isa<NVPTXRegClass>(op)) then - def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") - : NVPTXInst<(outs), - (ins op:$val, i32imm:$a, Offseti32imm:$b), - "st.param" # opstr # " \t[param$a$b], $val;", - []>; - } - - multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, - i32imm:$a, Offseti32imm:$b), - "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};", - []>; - } - - multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - 
foreach op3 = [IMMType, regclass] in - foreach op4 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - # !if(!isa<NVPTXRegClass>(op3), "r", "i") - # !if(!isa<NVPTXRegClass>(op4), "r", "i") - - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, - i32imm:$a, Offseti32imm:$b), - "st.param.v4" # opstr # - " \t[param$a$b], {{$val1, $val2, $val3, $val4}};", - []>; - } -} +def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, -/// NumParams, Callee, Proto, InGlue) +/// NumParams, Callee, Proto) def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; } @@ -1908,43 +1750,6 @@ foreach is_convergent = [0, 1] in { (call_uni_inst $addr, imm:$rets, imm:$params)>; } -def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">; -def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">; -def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">; -def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">; -def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">; -def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">; -def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">; -def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">; -def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">; -def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">; -def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">; - -defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">; -defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">; -defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">; -defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">; - -defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>; -defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>; - -defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">; -defm StoreParamV2I32 : 
StoreParamV2Inst<B32, i32imm, ".b32">; -defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">; -defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">; - -defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">; -defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">; -defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">; - -defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">; -defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">; - -defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">; -defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">; - -defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">; - def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), ".param .align $align .b8 \t$a[$size];", []>; @@ -1957,6 +1762,18 @@ def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size), def : Pat<(declare_scalar_param externalsym:$a, imm:$size), (DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>; +// Call prototype wrapper, this is a dummy instruction that just prints its +// operand which is a string defining the prototype. 
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def CallPrototype : + SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def ProtoIdent : Operand<i32> { let PrintMethod = "printProtoIdent"; } +def CALL_PROTOTYPE : + NVPTXInst<(outs), (ins ProtoIdent:$ident), + "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; + + foreach t = [I32RT, I64RT] in { defvar inst_name = "MOV" # t.Size # "_PARAM"; def inst_name : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), "mov.b" # t.Size>; @@ -1976,6 +1793,32 @@ defm ProxyRegB16 : ProxyRegInst<"b16", B16>; defm ProxyRegB32 : ProxyRegInst<"b32", B32>; defm ProxyRegB64 : ProxyRegInst<"b64", B64>; + +// Callseq start and end + +// Note: these nodes are marked as SDNPMayStore and SDNPMayLoad because +// they define the scope in which the declared params may be used. Therefore +// we add these flags to ensure ld.param and st.param are not sunk or hoisted +// out of that scope. + +def callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>; + +def Callseq_Start : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\{ // callseq $amt1, $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def Callseq_End : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\} // callseq $amt1", + [(callseq_end timm:$amt1, timm:$amt2)]>; + // // Load / Store Handling // @@ -2519,26 +2362,6 @@ def : Pat<(brcond i32:$a, bb:$target), def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target), (CBranchOther $a, bb:$target)>; -// Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, - SDTCisVT<1, i32>]>; -def SDT_NVPTXCallSeqEnd : 
SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPSideEffect]>; - -def Callseq_Start : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\{ // callseq $amt1, $amt2", - [(callseq_start timm:$amt1, timm:$amt2)]>; -def Callseq_End : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\} // callseq $amt1", - [(callseq_end timm:$amt1, timm:$amt2)]>; - // trap instruction def trapinst : BasicNVPTXInst<(outs), (ins), "trap", [(trap)]>, Requires<[noPTXASUnreachableBug]>; // Emit an `exit` as well to convey to ptxas that `trap` exits the CFG. @@ -2547,18 +2370,6 @@ def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[ // brkpt instruction def debugtrapinst : BasicNVPTXInst<(outs), (ins), "brkpt", [(debugtrap)]>; -// Call prototype wrapper -def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def CallPrototype : - SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def ProtoIdent : Operand<i32> { - let PrintMethod = "printProtoIdent"; -} -def CALL_PROTOTYPE : - NVPTXInst<(outs), (ins ProtoIdent:$ident), - "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; - def SDTDynAllocaOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 5779d4e..0e8828f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -243,8 +243,6 @@ public: createObjectTargetWriter() const override { return createPPCXCOFFObjectWriter(TT.isArch64Bit()); } - - std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; 
}; } // end anonymous namespace @@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const { return std::nullopt; } -std::optional<MCFixupKind> -XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const { - return StringSwitch<std::optional<MCFixupKind>>(Name) - .Case("R_REF", PPC::fixup_ppc_nofixup) - .Default(std::nullopt); -} - MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 9e8ee9f..df0c666 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -48,8 +48,7 @@ enum Fixups { /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer - /// register number. It can also be used to tie the ref symbol to prevent it - /// from being garbage collected on AIX. + /// register number. 
fixup_ppc_nofixup, /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index f75ab62..a04f404 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( switch ((unsigned)Fixup.getKind()) { default: report_fatal_error("Unimplemented fixup kind."); + case XCOFF::RelocationType::R_REF: + return {XCOFF::RelocationType::R_REF, 0}; case PPC::fixup_ppc_half16: { const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15; switch (Specifier) { @@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25}; case PPC::fixup_ppc_br24abs: return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25}; - case PPC::fixup_ppc_nofixup: { - if (Specifier == PPC::S_None) - return {XCOFF::RelocationType::R_REF, 0}; - else - llvm_unreachable("Unsupported Modifier"); - } break; case FK_Data_4: case FK_Data_8: const uint8_t SignAndSizeForFKData = diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index d295f35..1dc485d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2159,8 +2159,115 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; } -class XXEvalPattern <dag pattern, bits<8> imm> : - Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} +// ============================================================================= +// XXEVAL Instruction Pattern Definitions +// ============================================================================= +// +// XXEVAL instruction performs 256 different 
logical operations on three vector +// operands using an 8-bit immediate value to select the operation. +// Format: xxeval XT, XA, XB, XC, IMM +// For example: +// Equivalent function A?xor(B,C):and(B,C) is performed by +// xxeval XT, XA, XB, XC, 22 +// +// REGISTER CLASS CONSTRAINTS: +// - XXEVAL natively supports: VSRC register class [v4i32, v4f32, v2f64, v2i64] +// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC +// ============================================================================= + +class XXEvalPattern<dag pattern, bits<8> imm> + : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} + +class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm> + : Pat<(Vt InputPattern), + !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)), + // VSRC path: direct XXEVAL for v4i32 and v2i64 + (XXEVAL $vA, $vB, $vC, Imm), + // VRRC path: wrap with COPY_TO_REGCLASS for other types + (COPY_TO_REGCLASS(XXEVAL(COPY_TO_REGCLASS Vt:$vA, VSRC), + (COPY_TO_REGCLASS Vt:$vB, VSRC), + (COPY_TO_REGCLASS Vt:$vC, VSRC), Imm), + VRRC))> {} + +// ============================================================================= +// PatFrags for Bitcast-Aware Vector bitwise Operations +// +// Each PatFrags defines TWO alternatives for pattern matcher to choose: +// - Direct operation (for v4i32) +// - Bitcast operation (for other types: v2i64, v16i8, v8i16) +// ============================================================================= + +// Basic Binary Operations +def VAnd + : PatFrags<(ops node:$a, node:$b), [(and node:$a, node:$b), + (bitconvert(and + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VXor + : PatFrags<(ops node:$a, node:$b), [(xor node:$a, node:$b), + (bitconvert(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VOr : PatFrags<(ops node:$a, node:$b), [(or node:$a, node:$b), + (bitconvert(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VNot + : PatFrags<(ops node:$a), 
[(vnot node:$a), + (bitconvert(vnot(v4i32(bitconvert node:$a))))]>; + +// Derived bitwise operations +// Vector NOR operation (not(or)) +def VNor + : PatFrags<(ops node:$a, node:$b), [(vnot(or node:$a, node:$b)), + (bitconvert(vnot(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// Vector EQV operation (not(xor)) +def VEqv + : PatFrags<(ops node:$a, node:$b), [(vnot(xor node:$a, node:$b)), + (bitconvert(vnot(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// ============================================================================= +// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd +// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C) +// and emit the corresponding xxeval instruction with the imm value. +// +// The patterns implement xxeval vector select operations where: +// - A is the selector vector +// - f(B,C) is the "true" case op on vectors B and C (XOR, NOR, EQV, or NOT) +// - AND(B,C) is the "false" case op on vectors B and C +// ============================================================================= +multiclass XXEvalTernarySelectAnd<ValueType Vt> { + // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 22>; + + // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 24>; + + // Pattern: A ? EQV(B,C) : AND(B,C) XXEVAL immediate value: 25 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 25>; + + // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>; + + // Pattern: A ? 
NOT(B) : AND(B,C) XXEVAL immediate value: 28 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>; +} let Predicates = [PrefixInstrs, HasP10Vector] in { let AddedComplexity = 400 in { @@ -2270,6 +2377,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // (xor A, (or B, C)) def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + // XXEval Patterns for ternary Operations. + foreach Ty = [v4i32, v2i64, v8i16, v16i8] in { + defm : XXEvalTernarySelectAnd<Ty>; + } + // Anonymous patterns to select prefixed VSX loads and stores. // Load / Store f128 def : Pat<(f128 (load PDForm:$src)), diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 5e54b82..67cc01e 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -534,16 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, + const MCDisassembler *Decoder) { + if (Imm < RISCVZC::RA_S0) + return MCDisassembler::Fail; + return decodeZcmpRlist(Inst, Imm, Address, Decoder); +} + +static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, + uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus 
decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, @@ -592,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, return S; } -static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); - if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Imm < RISCVZC::RA_S0) - return MCDisassembler::Fail; - return decodeZcmpRlist(Inst, Imm, Address, Decoder); -} - // Add implied SP operand for C.*SP compressed instructions. The SP operand // isn't explicitly encoded in the instruction. void RISCVDisassembler::addSPOperands(MCInst &MI) const { diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index 4c303a9..da6b95d 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -95,3 +95,7 @@ def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_V_Interrupt, // Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31. 
def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt, (sequence "X%u", 16, 31))>; + +def CSR_RT_MostRegs : CalleeSavedRegs<(sub CSR_Interrupt, X6, X7, X28)>; +def CSR_RT_MostRegs_RVE : CalleeSavedRegs<(sub CSR_RT_MostRegs, + (sequence "X%u", 16, 31))>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 34910b7..f223fdbe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -634,7 +634,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 < C2 // -> (SignedBitfieldExtract X, msb, lsb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -750,7 +750,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 > C2 // -> (NDS.BFOS X, lsb, msb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -1191,7 +1191,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) // where C2 has 32 leading zeros and C3 trailing zeros. 
SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1210,7 +1210,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // - without Zba a tablegen pattern applies the very same // transform as we would have done here SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, VT, N0->getOperand(0), + RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LeadingZeros, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1239,7 +1239,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned TrailingZeros = llvm::countr_zero(Mask); if (LeadingZeros == 32 && TrailingZeros > ShAmt) { SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1266,7 +1266,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (TrailingOnes == 32) { SDNode *SRLI = CurDAG->getMachineNode( Subtarget->is64Bit() ? RISCV::SRLIW : RISCV::SRLI, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -1279,19 +1279,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (HasBitTest && ShAmt + 1 == TrailingOnes) { SDNode *BEXTI = CurDAG->getMachineNode( Subtarget->hasStdExtZbs() ? 
RISCV::BEXTI : RISCV::TH_TST, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, BEXTI); return; } const unsigned Msb = TrailingOnes - 1; const unsigned Lsb = ShAmt; - if (tryUnsignedBitfieldExtract(Node, DL, VT, N0->getOperand(0), Msb, Lsb)) + if (tryUnsignedBitfieldExtract(Node, DL, VT, N0.getOperand(0), Msb, Lsb)) return; unsigned LShAmt = Subtarget->getXLen() - TrailingOnes; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1328,7 +1328,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; unsigned LShAmt = Subtarget->getXLen() - ExtSize; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRAI = CurDAG->getMachineNode( RISCV::SRAI, DL, VT, SDValue(SLLI, 0), @@ -2942,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. + if (SelectAddrFrameIndex(Addr, Base, Offset)) + return true; SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2953,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. 
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base)) + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 607edd3..c0ada51 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2739,27 +2739,6 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { } } -bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV( - EVT ScalarTy) const { - if (!ScalarTy.isSimple()) - return false; - switch (ScalarTy.getSimpleVT().SimpleTy) { - case MVT::iPTR: - return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true; - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::f16: - case MVT::bf16: - case MVT::f32: - return true; - case MVT::i64: - case MVT::f64: - return Subtarget.hasVInstructionsI64(); - default: - return false; - } -} unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const { return NumRepeatedDivisors; @@ -20751,6 +20730,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getAllOnesConstant(DL, VT); return DAG.getConstant(0, DL, VT); } + case Intrinsic::riscv_vsseg2_mask: + case Intrinsic::riscv_vsseg3_mask: + case Intrinsic::riscv_vsseg4_mask: + case Intrinsic::riscv_vsseg5_mask: + case Intrinsic::riscv_vsseg6_mask: + case Intrinsic::riscv_vsseg7_mask: + case Intrinsic::riscv_vsseg8_mask: { + SDValue Tuple = N->getOperand(2); + unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + + if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() || + Tuple.getOpcode() != RISCVISD::TUPLE_INSERT || + !Tuple.getOperand(0).isUndef()) + return SDValue(); + + SDValue Val = Tuple.getOperand(1); + unsigned Idx = Tuple.getConstantOperandVal(2); + + unsigned SEW = Val.getValueType().getScalarSizeInBits(); + assert(Log2_64(SEW) == 
N->getConstantOperandVal(6) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/N->getOperand(0), + /*IntID=*/ + DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT), + /*StoredVal=*/Val, + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/N->getOperand(4), + /*VL=*/N->getOperand(5)}; + + auto *OldMemSD = cast<MemIntrinsicSDNode>(N); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = OldMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList(MVT::Other); + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT, + MMO); + } } } case ISD::EXPERIMENTAL_VP_REVERSE: @@ -20843,6 +20869,68 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } + case RISCVISD::TUPLE_EXTRACT: { + EVT VT = N->getValueType(0); + SDValue Tuple = N->getOperand(0); + unsigned Idx = N->getConstantOperandVal(1); + if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN) + break; + + unsigned NF = 0; + switch (Tuple.getConstantOperandVal(1)) { + default: + break; + case Intrinsic::riscv_vlseg2_mask: + case Intrinsic::riscv_vlseg3_mask: + case Intrinsic::riscv_vlseg4_mask: + case Intrinsic::riscv_vlseg5_mask: + case Intrinsic::riscv_vlseg6_mask: + case Intrinsic::riscv_vlseg7_mask: + case Intrinsic::riscv_vlseg8_mask: + NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + break; + } + + if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF)) + break; + + unsigned SEW = VT.getScalarSizeInBits(); + assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned 
Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/Tuple.getOperand(0), + /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT), + /*Passthru=*/Tuple.getOperand(2), + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/Tuple.getOperand(4), + /*VL=*/Tuple.getOperand(5), + /*Policy=*/Tuple.getOperand(6)}; + + auto *TupleMemSD = cast<MemIntrinsicSDNode>(Tuple); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = TupleMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList({VT, MVT::Other}); + SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, + Ops, MemVT, MMO); + DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1)); + return Result.getValue(0); + } + case RISCVISD::TUPLE_INSERT: { + // tuple_insert tuple, undef, idx -> tuple + if (N->getOperand(1).isUndef()) + return N->getOperand(0); + break; + } } return SDValue(); @@ -22367,6 +22455,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( case CallingConv::C: case CallingConv::Fast: case CallingConv::SPIR_KERNEL: + case CallingConv::PreserveMost: case CallingConv::GRAAL: case CallingConv::RISCV_VectorCall: #define CC_VLS_CASE(ABI_VLEN) case CallingConv::RISCV_VLSCall_##ABI_VLEN: @@ -22636,8 +22725,14 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; + + // Set type id for call site info. 
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -22895,6 +22990,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.CFIType) Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); + if (MF.getTarget().Options.EmitCallGraphSection && CB && + CB->isIndirectCall()) + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } @@ -22902,6 +23000,10 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); if (CLI.CFIType) Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); + + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); Glue = Chain.getValue(1); @@ -24260,7 +24362,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalLoadStoreElementTypeForRVV(ScalarType)) + if (!isLegalElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a788c0b7..ca70c46 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -384,7 +384,6 @@ public: bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override; bool isLegalElementTypeForRVV(EVT ScalarTy) const; - bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const; bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index dd365cf..8297d50 100644 --- 
a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { +let IsSignExtendingOpW = 1 in def CLS : Unary_r<0b011000000011, 0b001, "cls">; def ABS : Unary_r<0b011000000111, 0b001, "abs">; } // Predicates = [HasStdExtP] @@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in { def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; +let IsSignExtendingOpW = 1 in { def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; +} } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index f391300..5265613 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1120,27 +1120,11 @@ let Predicates = [HasVendorXqcisync, IsRV32] in { def QC_C_SYNCWF : QCIRVInst16CBSYNC<0b100, "qc.c.syncwf">; def QC_C_SYNCWL : QCIRVInst16CBSYNC<0b101, "qc.c.syncwl">; - let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in - def QC_C_DELAY : RVInst16CI<0b000, 0b10, (outs), - (ins uimm5nonzero:$imm), - "qc.c.delay", "$imm"> { - let Inst{12} = 0; - let Inst{11-7} = 0; - let Inst{6-2} = imm{4-0}; - } + // qc.c.delay implemented as an alias, below } // Predicates = [HasVendorXqcisync, IsRV32] let Predicates = [HasVendorXqcisim, IsRV32] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { - def QC_PSYSCALLI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm10:$imm10), - "qc.psyscalli", "$imm10"> { - bits<10> imm10; - - let rs1 = 0; - let rd = 0; - let imm12 = {0b00, imm10}; - } - def QC_PPUTCI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm8:$imm8), "qc.pputci", "$imm8"> { bits<8> imm8; @@ -1150,18 
+1134,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { let imm12 = {0b0100, imm8}; } - def QC_PCOREDUMP : QCISim_NONE<0b0110, "qc.pcoredump">; - def QC_PPREGS : QCISim_NONE<0b0111, "qc.ppregs">; - def QC_PPREG : QCISim_RS1<0b1000, "qc.ppreg">; - def QC_PPUTC : QCISim_RS1<0b1001, "qc.pputc">; - def QC_PPUTS : QCISim_RS1<0b1010, "qc.pputs">; - def QC_PEXIT : QCISim_RS1<0b1011, "qc.pexit">; - def QC_PSYSCALL : QCISim_RS1<0b1100, "qc.psyscall">; - - def QC_C_PTRACE : RVInst16CI<0b000, 0b10, (outs), (ins), "qc.c.ptrace", ""> { - let rd = 0; - let imm = 0; - } + // The other instructions are all implemented as aliases, below } // mayLoad = 0, mayStore = 0, hasSideEffects = 1 } // Predicates = [HasVendorXqcisim, IsRV32] @@ -1218,6 +1191,27 @@ let EmitPriority = 0 in { } // EmitPriority = 0 } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasVendorXqcisim, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.ptrace", (C_SLLI X0, 0)>; + + def : InstAlias<"qc.psyscalli $imm", (SLTI X0, X0, uimm10:$imm)>; + def : InstAlias<"qc.pcoredump", (SLTI X0, X0, 1536)>; + def : InstAlias<"qc.ppregs", (SLTI X0, X0, 1792)>; + def : InstAlias<"qc.ppreg $rs1", (SLTI X0, GPR:$rs1, -2048)>; + def : InstAlias<"qc.pputc $rs1", (SLTI X0, GPR:$rs1, -1792)>; + def : InstAlias<"qc.pputs $rs1", (SLTI X0, GPR:$rs1, -1536)>; + def : InstAlias<"qc.pexit $rs1", (SLTI X0, GPR:$rs1, -1280)>; + def : InstAlias<"qc.psyscall $rs1", (SLTI X0, GPR:$rs1, -1024)>; +} // EmitPriority = 1 +} // Predicates = [HasVendorXqcisim, IsRV32] + +let Predicates = [HasVendorXqcisync, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.delay $imm", (C_SLLI X0, uimm5nonzero:$imm)>; +} +} // Predicates = [HasVendorXqcisync, IsRV32] + //===----------------------------------------------------------------------===// // Pseudo-instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp 
b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 3cbe668..726920e 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) || + if (!isLegalElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; @@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - // If the segment load is going to be performed segment at a time anyways - // and there's only one element used, use a strided load instead. This - // will be equally fast, and create less vector register pressure. - if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) { - unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // For rv64, need to truncate i64 to i32 to match signature. As VL is at most - // the number of active lanes (which is bounded by i32) this is safe. 
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, - {VTy, BasePtr->getType(), Stride->getType()}, - {BasePtr, Stride, Mask, VL}); - Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes); - CI->addParamAttr(0, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - Shuffles[0]->replaceAllUsesWith(CI); - return true; - }; - CallInst *VlsegN = Builder.CreateIntrinsic( FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); @@ -289,33 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - unsigned Index; - // If the segment store only has one active lane (i.e. the interleave is - // just a spread shuffle), we can use a strided store instead. This will - // be equally fast, and create less vector register pressure. - if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && - isSpreadMask(Mask, Factor, Index)) { - unsigned ScalarSizeInBytes = - DL.getTypeStoreSize(ShuffleVTy->getElementType()); - Value *Data = SVI->getOperand(0); - Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0)); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // For rv64, need to truncate i64 to i32 to match signature. As VL is at - // most the number of active lanes (which is bounded by i32) this is safe. 
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, - {VTy, BasePtr->getType(), Stride->getType()}, - {Data, BasePtr, Stride, LaneMask, VL}); - Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes); - CI->addParamAttr(1, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - return true; - } - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 5404123..7e58b6f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -68,6 +68,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { auto &Subtarget = MF->getSubtarget<RISCVSubtarget>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) return CSR_NoRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return Subtarget.hasStdExtE() ? CSR_RT_MostRegs_RVE_SaveList + : CSR_RT_MostRegs_SaveList; if (MF->getFunction().hasFnAttribute("interrupt")) { if (Subtarget.hasVInstructions()) { if (Subtarget.hasStdExtD()) @@ -573,6 +576,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int64_t Val = Offset.getFixed(); int64_t Lo12 = SignExtend64<12>(Val); unsigned Opc = MI.getOpcode(); + if (Opc == RISCV::ADDI && !isInt<12>(Val)) { // We chose to emit the canonical immediate sequence rather than folding // the offset into the using add under the theory that doing so doesn't @@ -585,6 +589,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. 
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); + } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + // MIPS Prefetch instructions require the offset to be 9 bits encoded. + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || Opc == RISCV::PseudoRV32ZdinxSD) && Lo12 >= 2044) { @@ -811,7 +818,13 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF, if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; - switch (Subtarget.getTargetABI()) { + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (CC == CallingConv::PreserveMost) { + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return CSR_RT_MostRegs_RVE_RegMask; + return CSR_RT_MostRegs_RegMask; + } + switch (ABI) { default: llvm_unreachable("Unrecognized ABI"); case RISCVABI::ABI_ILP32E: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fd634b5..61dbd06 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1191,9 +1191,6 @@ static const CostTblEntry VectorIntrinsicCostTable[]{ {Intrinsic::roundeven, MVT::f64, 9}, {Intrinsic::rint, MVT::f32, 7}, {Intrinsic::rint, MVT::f64, 7}, - {Intrinsic::lrint, MVT::i32, 1}, - {Intrinsic::lrint, MVT::i64, 1}, - {Intrinsic::llrint, MVT::i64, 1}, {Intrinsic::nearbyint, MVT::f32, 9}, {Intrinsic::nearbyint, MVT::f64, 9}, {Intrinsic::bswap, MVT::i16, 3}, @@ -1251,11 +1248,48 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, switch (ICA.getID()) { case Intrinsic::lrint: case Intrinsic::llrint: - // We can't currently lower half or bfloat vector lrint/llrint. 
- if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]); - VecTy && VecTy->getElementType()->is16bitFPTy()) - return InstructionCost::getInvalid(); - [[fallthrough]]; + case Intrinsic::lround: + case Intrinsic::llround: { + auto LT = getTypeLegalizationCost(RetTy); + Type *SrcTy = ICA.getArgTypes().front(); + auto SrcLT = getTypeLegalizationCost(SrcTy); + if (ST->hasVInstructions() && LT.second.isVector()) { + SmallVector<unsigned, 2> Ops; + unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType()); + unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType()); + if (LT.second.getVectorElementType() == MVT::bf16) { + if (!ST->hasVInstructionsBF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V}; + } else if (LT.second.getVectorElementType() == MVT::f16 && + !ST->hasVInstructionsF16()) { + if (!ST->hasVInstructionsF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V}; + + } else if (SrcEltSz > DstEltSz) { + Ops = {RISCV::VFNCVT_X_F_W}; + } else if (SrcEltSz < DstEltSz) { + Ops = {RISCV::VFWCVT_X_F_V}; + } else { + Ops = {RISCV::VFCVT_X_F_V}; + } + + // We need to use the source LMUL in the case of a narrowing op, and the + // destination LMUL otherwise. 
+ if (SrcEltSz > DstEltSz) + return SrcLT.first * + getRISCVInstructionCost(Ops, SrcLT.second, CostKind); + return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind); + } + break; + } case Intrinsic::ceil: case Intrinsic::floor: case Intrinsic::trunc: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index f0510ec..d62d99c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -265,7 +265,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment, @@ -297,7 +297,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) const override { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 5cda6a0..7505507 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -74,17 +74,20 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - // We expect the codegen to avoid doing implicit bitcast from a load. 
- assert(TargetType->getElementType() == SourceType->getElementType()); - assert(TargetType->getNumElements() < SourceType->getNumElements()); - + assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); + Value *AssignValue = NewLoad; + if (TargetType->getElementType() != SourceType->getElementType()) { + AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, + {TargetType, SourceType}, {NewLoad}); + buildAssignType(B, TargetType, AssignValue); + } SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; - Value *Output = B.CreateShuffleVector(NewLoad, NewLoad, Mask); + Value *Output = B.CreateShuffleVector(AssignValue, AssignValue, Mask); buildAssignType(B, TargetType, Output); return Output; } @@ -135,8 +138,9 @@ class SPIRVLegalizePointerCast : public FunctionPass { Output = loadFirstValueFromAggregate(B, SVT->getElementType(), OriginalOperand, LI); } - // Destination is a smaller vector than source. + // Destination is a smaller vector than source or different vector type. // - float3 v3 = vector4; + // - float4 v2 = int4; else if (SVT && DVT) Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand); // Destination is the scalar type stored at the start of an aggregate. 
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 721f64a..1995e0f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -335,6 +335,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal(); } + getActionDefinitionsBuilder(G_IS_FPCLASS).custom(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -355,9 +357,14 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpvType, bool SPIRVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { - auto Opc = MI.getOpcode(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - if (Opc == TargetOpcode::G_ICMP) { + switch (MI.getOpcode()) { + default: + // TODO: implement legalization for other opcodes. + return true; + case TargetOpcode::G_IS_FPCLASS: + return legalizeIsFPClass(Helper, MI, LocObserver); + case TargetOpcode::G_ICMP: { assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg())); auto &Op0 = MI.getOperand(2); auto &Op1 = MI.getOperand(3); @@ -378,6 +385,238 @@ bool SPIRVLegalizerInfo::legalizeCustom( } return true; } - // TODO: implement legalization for other opcodes. + } +} + +// Note this code was copied from LegalizerHelper::lowerISFPCLASS and adjusted +// to ensure that all instructions created during the lowering have SPIR-V types +// assigned to them. 
+bool SPIRVLegalizerInfo::legalizeIsFPClass( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm()); + + auto &MIRBuilder = Helper.MIRBuilder; + auto &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Type *LLVMDstTy = + IntegerType::get(MIRBuilder.getContext(), DstTy.getScalarSizeInBits()); + if (DstTy.isVector()) + LLVMDstTy = VectorType::get(LLVMDstTy, DstTy.getElementCount()); + SPIRVType *SPIRVDstTy = GR->getOrCreateSPIRVType( + LLVMDstTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + unsigned BitSize = SrcTy.getScalarSizeInBits(); + const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType()); + + LLT IntTy = LLT::scalar(BitSize); + Type *LLVMIntTy = IntegerType::get(MIRBuilder.getContext(), BitSize); + if (SrcTy.isVector()) { + IntTy = LLT::vector(SrcTy.getElementCount(), IntTy); + LLVMIntTy = VectorType::get(LLVMIntTy, SrcTy.getElementCount()); + } + SPIRVType *SPIRVIntTy = GR->getOrCreateSPIRVType( + LLVMIntTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + // Clang doesn't support capture of structured bindings: + LLT DstTyCopy = DstTy; + const auto assignSPIRVTy = [&](MachineInstrBuilder &&MI) { + // Assign this MI's (assumed only) destination to one of the two types we + // expect: either the G_IS_FPCLASS's destination type, or the integer type + // bitcast from the source type. + LLT MITy = MRI.getType(MI.getReg(0)); + assert((MITy == IntTy || MITy == DstTyCopy) && + "Unexpected LLT type while lowering G_IS_FPCLASS"); + auto *SPVTy = MITy == IntTy ? 
SPIRVIntTy : SPIRVDstTy; + GR->assignSPIRVTypeToVReg(SPVTy, MI.getReg(0), MF); + return MI; + }; + + // Helper to build and assign a constant in one go + const auto buildSPIRVConstant = [&](LLT Ty, auto &&C) -> MachineInstrBuilder { + if (!Ty.isFixedVector()) + return assignSPIRVTy(MIRBuilder.buildConstant(Ty, C)); + auto ScalarC = MIRBuilder.buildConstant(Ty.getScalarType(), C); + assert((Ty == IntTy || Ty == DstTyCopy) && + "Unexpected LLT type while lowering constant for G_IS_FPCLASS"); + SPIRVType *VecEltTy = GR->getOrCreateSPIRVType( + (Ty == IntTy ? LLVMIntTy : LLVMDstTy)->getScalarType(), MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + GR->assignSPIRVTypeToVReg(VecEltTy, ScalarC.getReg(0), MF); + return assignSPIRVTy(MIRBuilder.buildSplatBuildVector(Ty, ScalarC)); + }; + + if (Mask == fcNone) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 0)); + MI.eraseFromParent(); + return true; + } + if (Mask == fcAllFlags) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 1)); + MI.eraseFromParent(); + return true; + } + + // Note that rather than creating a COPY here (between a floating-point and + // integer type of the same size) we create a SPIR-V bitcast immediately. We + // can't create a G_BITCAST because the LLTs are the same, and we can't seem + // to correctly lower COPYs to SPIR-V bitcasts at this moment. + Register ResVReg = MRI.createGenericVirtualRegister(IntTy); + MRI.setRegClass(ResVReg, GR->getRegClass(SPIRVIntTy)); + GR->assignSPIRVTypeToVReg(SPIRVIntTy, ResVReg, Helper.MIRBuilder.getMF()); + auto AsInt = MIRBuilder.buildInstr(SPIRV::OpBitcast) + .addDef(ResVReg) + .addUse(GR->getSPIRVTypeID(SPIRVIntTy)) + .addUse(SrcReg); + AsInt = assignSPIRVTy(std::move(AsInt)); + + // Various masks. + APInt SignBit = APInt::getSignMask(BitSize); + APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign. + APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit. 
+ APInt ExpMask = Inf; + APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; + APInt QNaNBitMask = + APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); + APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); + + auto SignBitC = buildSPIRVConstant(IntTy, SignBit); + auto ValueMaskC = buildSPIRVConstant(IntTy, ValueMask); + auto InfC = buildSPIRVConstant(IntTy, Inf); + auto ExpMaskC = buildSPIRVConstant(IntTy, ExpMask); + auto ZeroC = buildSPIRVConstant(IntTy, 0); + + auto Abs = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC)); + auto Sign = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs)); + + auto Res = buildSPIRVConstant(DstTy, 0); + + const auto appendToRes = [&](MachineInstrBuilder &&ToAppend) { + Res = assignSPIRVTy( + MIRBuilder.buildOr(DstTyCopy, Res, assignSPIRVTy(std::move(ToAppend)))); + }; + + // Tests that involve more than one class should be processed first. + if ((Mask & fcFinite) == fcFinite) { + // finite(V) ==> abs(V) u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs, + ExpMaskC)); + Mask &= ~fcFinite; + } else if ((Mask & fcFinite) == fcPosFinite) { + // finite(V) && V > 0 ==> V u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt, + ExpMaskC)); + Mask &= ~fcPosFinite; + } else if ((Mask & fcFinite) == fcNegFinite) { + // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1 + auto Cmp = assignSPIRVTy(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, + DstTy, Abs, ExpMaskC)); + appendToRes(MIRBuilder.buildAnd(DstTy, Cmp, Sign)); + Mask &= ~fcNegFinite; + } + + if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) { + // fcZero | fcSubnormal => test all exponent bits are 0 + // TODO: Handle sign bit specific cases + // TODO: Handle inverted case + if (PartialCheck == (fcZero | fcSubnormal)) { + auto ExpBits = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, 
ExpMaskC)); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + ExpBits, ZeroC)); + Mask &= ~PartialCheck; + } + } + + // Check for individual classes. + if (FPClassTest PartialCheck = Mask & fcZero) { + if (PartialCheck == fcPosZero) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, ZeroC)); + else if (PartialCheck == fcZero) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC)); + else // fcNegZero + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, SignBitC)); + } + + if (FPClassTest PartialCheck = Mask & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) + auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs; + auto OneC = buildSPIRVConstant(IntTy, 1); + auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); + auto SubnormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, + buildSPIRVConstant(IntTy, AllOneMantissa))); + if (PartialCheck == fcNegSubnormal) + SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); + appendToRes(std::move(SubnormalRes)); + } + + if (FPClassTest PartialCheck = Mask & fcInf) { + if (PartialCheck == fcPosInf) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, InfC)); + else if (PartialCheck == fcInf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC)); + else { // fcNegInf + APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt(); + auto NegInfC = buildSPIRVConstant(IntTy, NegInf); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, NegInfC)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNan) { + auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask); + if (PartialCheck == fcNan) { + // isnan(V) ==> abs(V) u> int(inf) + appendToRes( + 
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + } else if (PartialCheck == fcQNan) { + // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs, + InfWithQnanBitC)); + } else { // fcSNan + // issignaling(V) ==> abs(V) u> unsigned(Inf) && + // abs(V) u< (unsigned(Inf) | quiet_bit) + auto IsNan = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + auto IsNotQnan = assignSPIRVTy(MIRBuilder.buildICmp( + CmpInst::Predicate::ICMP_ULT, DstTy, Abs, InfWithQnanBitC)); + appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNormal) { + // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u< + // (max_exp-1)) + APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); + auto ExpMinusOne = assignSPIRVTy( + MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB))); + APInt MaxExpMinusOne = ExpMask - ExpLSB; + auto NormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne, + buildSPIRVConstant(IntTy, MaxExpMinusOne))); + if (PartialCheck == fcNegNormal) + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign); + else if (PartialCheck == fcPosNormal) { + auto PosSign = assignSPIRVTy(MIRBuilder.buildXor( + DstTy, Sign, buildSPIRVConstant(DstTy, InversionMask))); + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign); + } + appendToRes(std::move(NormalRes)); + } + + MIRBuilder.buildCopy(DstReg, Res); + MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 6335f21..eeefa42 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -30,6 +30,10 @@ public: bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override; SPIRVLegalizerInfo(const 
SPIRVSubtarget &ST); + +private: + bool legalizeIsFPClass(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const; }; } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index a606209..089be5f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -49,6 +49,8 @@ def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable FP16 instructions">; +def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">; + def FeatureMultiMemory : SubtargetFeature<"multimemory", "HasMultiMemory", "true", "Enable multiple memories">; @@ -71,7 +73,6 @@ def FeatureReferenceTypes : SubtargetFeature<"reference-types", "HasReferenceTypes", "true", "Enable reference types">; -def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">; def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD", "Enable relaxed-simd instructions">; @@ -139,10 +140,10 @@ def : ProcessorModel<"lime1", NoSchedModel, def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, FeatureCallIndirectOverlong, FeatureExceptionHandling, - FeatureExtendedConst, FeatureFP16, FeatureMultiMemory, - FeatureMultivalue, FeatureMutableGlobals, + FeatureExtendedConst, FeatureFP16, FeatureGC, + FeatureMultiMemory, FeatureMultivalue, FeatureMutableGlobals, FeatureNontrappingFPToInt, FeatureRelaxedSIMD, - FeatureReferenceTypes, FeatureGC, FeatureSIMD128, + FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt, FeatureTailCall]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index cd434f7..3f80b2a 100644 --- 
a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N, return SDValue(); } -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::MUL); +static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (VT != MVT::v8i32 && VT != MVT::v16i32) return SDValue(); @@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performMulCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N->getOpcode() == ISD::MUL); + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + if (auto Res = TryWideExtMulCombine(N, DCI.DAG)) + return Res; + + // We don't natively support v16i8 mul, but we do support v8i16 so split the + // inputs and extend them to v8i16. Only do this before legalization in case + // a narrow vector is widened and may be simplified later. + if (!DCI.isBeforeLegalize() || VT != MVT::v16i8) + return SDValue(); + + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue LowLHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS); + SDValue HighLHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS); + SDValue LowRHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS); + SDValue HighRHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS); + + SDValue MulLow = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS)); + SDValue MulHigh = DAG.getBitcast( + VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS)); + + // Take the low byte of each lane. 
+ return DAG.getVectorShuffle( + VT, DL, MulLow, MulHigh, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performLowerPartialReduction(N, DCI.DAG); } case ISD::MUL: - return performMulCombine(N, DCI.DAG); + return performMulCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 2b632fd..13d048a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -50,6 +50,9 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<(all_of FeatureFP16), "fp16">; +def HasGC : Predicate<"Subtarget->hasGC()">, + AssemblerPredicate<(all_of FeatureGC), "gc">; + def HasMultiMemory : Predicate<"Subtarget->hasMultiMemory()">, AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">; @@ -76,9 +79,6 @@ def HasReferenceTypes : Predicate<"Subtarget->hasReferenceTypes()">, AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">; -def HasGC : Predicate<"Subtarget->hasGC()">, - AssemblerPredicate<(all_of FeatureGC), "gc">; - def HasRelaxedSIMD : Predicate<"Subtarget->hasRelaxedSIMD()">, AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d13862f..143298b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : 
Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; } defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 28f6599..c3990d1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { for (Instruction &I : BB) { if (I.getType()->isVoidTy()) continue; + + if (isa<AllocaInst>(&I)) { + // If the alloca has any lifetime marker that is no longer dominated + // by the alloca, remove all lifetime markers. Lifetime markers must + // always work directly on the alloca, and this is no longer possible. + bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) { + auto *UserI = cast<Instruction>(U); + return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI); + }); + if (HasNonDominatedLifetimeMarker) { + for (User *U : make_early_inc_range(I.users())) { + auto *UserI = cast<Instruction>(U); + if (UserI->isLifetimeStartOrEnd()) + UserI->eraseFromParent(); + } + } + } + unsigned VarID = SSA.AddVariable(I.getName(), I.getType()); // If a value is defined by an invoke instruction, it is only available in // its normal destination and not in its unwind destination. 
@@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // Setjmp preparation + SmallVector<AllocaInst *> StaticAllocas; + for (Instruction &I : F.getEntryBlock()) + if (auto *AI = dyn_cast<AllocaInst>(&I)) + if (AI->isStaticAlloca()) + StaticAllocas.push_back(AI); + BasicBlock *Entry = &F.getEntryBlock(); DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram()); SplitBlock(Entry, &*Entry->getFirstInsertionPt()); + // Move static allocas back into the entry block, so they stay static. + for (AllocaInst *AI : StaticAllocas) + AI->moveBefore(Entry->getTerminator()->getIterator()); + IRB.SetInsertPoint(Entry->getTerminator()->getIterator()); // This alloca'ed pointer is used by the runtime to identify function // invocations. It's just for pointer comparisons. It will never be diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index f814274..2f88bbb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -46,12 +46,12 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasExceptionHandling = false; bool HasExtendedConst = false; bool HasFP16 = false; + bool HasGC = false; bool HasMultiMemory = false; bool HasMultivalue = false; bool HasMutableGlobals = false; bool HasNontrappingFPToInt = false; bool HasReferenceTypes = false; - bool HasGC = false; bool HasSignExt = false; bool HasTailCall = false; bool HasWideArithmetic = false; diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 1bf9f8b..f9bd233 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -104,6 +104,7 @@ add_llvm_target(X86CodeGen ${sources} IRPrinter Instrumentation MC + ObjCARC ProfileData Scalar SelectionDAG diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 
0ff7f23..067bd43 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3673,6 +3673,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CLI.NumResultRegs = RVLocs.size(); CLI.Call = MIB; + // Add call site info for call graph section. + if (TM.Options.EmitCallGraphSection && CB && CB->isIndirectCall()) { + MachineFunction::CallSiteInfo CSInfo(*CB); + MF->addCallSiteInfo(CLI.Call, std::move(CSInfo)); + } + return true; } @@ -4042,6 +4048,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, MO.setReg(IndexReg); } + if (MI->isCall()) + FuncInfo.MF->moveAdditionalCallInfo(MI, Result); Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); Result->cloneInstrSymbols(*FuncInfo.MF, *MI); MachineBasicBlock::iterator I(MI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 11ab8dc..7244a6d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58074,11 +58074,9 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { // res, flags2 = sub 0, (and X, Y) // cload/cstore ..., cond_ne, flag2 // -> - // res, flags2 = and X, Y + // res, flags2 = cmp (and X, Y), 0 // cload/cstore ..., cond_ne, flag2 - Ops[4] = DAG.getNode(X86ISD::AND, DL, Sub->getVTList(), Op1.getOperand(0), - Op1.getOperand(1)) - .getValue(1); + Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Op1, Sub.getOperand(0)); } else { return SDValue(); } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b4639ac..5862c7e 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2060,6 +2060,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); + // Set type id for call site info. 
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + if (IsIndirectCall && !IsWin64 && M->getModuleFlag("import-call-optimization")) errorUnsupported(DAG, dl, diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index e5c896f..126be71 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -446,6 +446,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["tanh-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; + Features["bf16-cvt-insts"] = true; Features["fp8-conversion-insts"] = true; Features["fp8e5m3-insts"] = true; Features["permlane16-swap"] = true; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index ee6651c..6acb0bc 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -277,6 +277,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case PC: return "pc"; case SCEI: return "scei"; case SUSE: return "suse"; + case Meta: + return "meta"; } llvm_unreachable("Invalid VendorType!"); @@ -390,6 +392,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { case OpenHOS: return "ohos"; case PAuthTest: return "pauthtest"; + case MTIA: + return "mtia"; case LLVM: return "llvm"; case Mlibc: @@ -677,6 +681,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("suse", Triple::SUSE) .Case("oe", Triple::OpenEmbedded) .Case("intel", Triple::Intel) + .Case("meta", Triple::Meta) .Default(Triple::UnknownVendor); } @@ -780,6 +785,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("pauthtest", Triple::PAuthTest) .StartsWith("llvm", Triple::LLVM) .StartsWith("mlibc", Triple::Mlibc) + .StartsWith("mtia", Triple::MTIA) .Default(Triple::UnknownEnvironment); } diff --git 
a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 64b33e4..ab906f9 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1568,7 +1568,7 @@ private: if (DebugLoc SuspendLoc = S->getDebugLoc()) { std::string LabelName = ("__coro_resume_" + Twine(SuspendIndex)).str(); - DILocation &DILoc = *SuspendLoc.get(); + DILocation &DILoc = *SuspendLoc; DILabel *ResumeLabel = DBuilder.createLabel(DIS, LabelName, DILoc.getFile(), SuspendLoc.getLine(), SuspendLoc.getCol(), diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index b3910c4..d895cd7 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -37,6 +37,16 @@ // memory that ends up in one of the runtime equivalents, since this can // happen if e.g. a library that was compiled without interposition returns // an allocation that can be validly passed to `free`. +// +// 3. MathFixup (required): Some accelerators might have an incomplete +// implementation for the intrinsics used to implement some of the math +// functions in <cmath> / their corresponding libcall lowerings. Since this +// can vary quite significantly between accelerators, we replace calls to a +// set of intrinsics / lib functions known to be problematic with calls to a +// HIPSTDPAR specific forwarding layer, which gives a uniform interface for +// accelerators to implement in their own runtime components. This pass +// should run before AcceleratorCodeSelection so as to prevent the spurious +// removal of the HIPSTDPAR specific forwarding functions. 
//===----------------------------------------------------------------------===// #include "llvm/Transforms/HipStdPar/HipStdPar.h" @@ -49,6 +59,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -519,3 +530,110 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { return PreservedAnalyses::none(); } + +static constexpr std::pair<StringLiteral, StringLiteral> MathLibToHipStdPar[]{ + {"acosh", "__hipstdpar_acosh_f64"}, + {"acoshf", "__hipstdpar_acosh_f32"}, + {"asinh", "__hipstdpar_asinh_f64"}, + {"asinhf", "__hipstdpar_asinh_f32"}, + {"atanh", "__hipstdpar_atanh_f64"}, + {"atanhf", "__hipstdpar_atanh_f32"}, + {"cbrt", "__hipstdpar_cbrt_f64"}, + {"cbrtf", "__hipstdpar_cbrt_f32"}, + {"erf", "__hipstdpar_erf_f64"}, + {"erff", "__hipstdpar_erf_f32"}, + {"erfc", "__hipstdpar_erfc_f64"}, + {"erfcf", "__hipstdpar_erfc_f32"}, + {"fdim", "__hipstdpar_fdim_f64"}, + {"fdimf", "__hipstdpar_fdim_f32"}, + {"expm1", "__hipstdpar_expm1_f64"}, + {"expm1f", "__hipstdpar_expm1_f32"}, + {"hypot", "__hipstdpar_hypot_f64"}, + {"hypotf", "__hipstdpar_hypot_f32"}, + {"ilogb", "__hipstdpar_ilogb_f64"}, + {"ilogbf", "__hipstdpar_ilogb_f32"}, + {"lgamma", "__hipstdpar_lgamma_f64"}, + {"lgammaf", "__hipstdpar_lgamma_f32"}, + {"log1p", "__hipstdpar_log1p_f64"}, + {"log1pf", "__hipstdpar_log1p_f32"}, + {"logb", "__hipstdpar_logb_f64"}, + {"logbf", "__hipstdpar_logb_f32"}, + {"nextafter", "__hipstdpar_nextafter_f64"}, + {"nextafterf", "__hipstdpar_nextafter_f32"}, + {"nexttoward", "__hipstdpar_nexttoward_f64"}, + {"nexttowardf", "__hipstdpar_nexttoward_f32"}, + {"remainder", "__hipstdpar_remainder_f64"}, + {"remainderf", "__hipstdpar_remainder_f32"}, + {"remquo", "__hipstdpar_remquo_f64"}, + {"remquof", "__hipstdpar_remquo_f32"}, + {"scalbln", "__hipstdpar_scalbln_f64"}, + {"scalblnf", "__hipstdpar_scalbln_f32"}, + 
{"scalbn", "__hipstdpar_scalbn_f64"}, + {"scalbnf", "__hipstdpar_scalbn_f32"}, + {"tgamma", "__hipstdpar_tgamma_f64"}, + {"tgammaf", "__hipstdpar_tgamma_f32"}}; + +PreservedAnalyses HipStdParMathFixupPass::run(Module &M, + ModuleAnalysisManager &) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector<std::pair<Function *, std::string>> ToReplace; + for (auto &&F : M) { + if (!F.hasName()) + continue; + + StringRef N = F.getName(); + Intrinsic::ID ID = F.getIntrinsicID(); + + switch (ID) { + case Intrinsic::not_intrinsic: { + auto It = + find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; }); + if (It == std::cend(MathLibToHipStdPar)) + continue; + ToReplace.emplace_back(&F, It->second); + break; + } + case Intrinsic::acos: + case Intrinsic::asin: + case Intrinsic::atan: + case Intrinsic::atan2: + case Intrinsic::cosh: + case Intrinsic::modf: + case Intrinsic::sinh: + case Intrinsic::tan: + case Intrinsic::tanh: + break; + default: { + if (F.getReturnType()->isDoubleTy()) { + switch (ID) { + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::pow: + case Intrinsic::sin: + break; + default: + continue; + } + break; + } + continue; + } + } + + ToReplace.emplace_back(&F, N); + llvm::replace(ToReplace.back().second, '.', '_'); + StringRef Prefix = "llvm"; + ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar"); + } + for (auto &&[F, NewF] : ToReplace) + F->replaceAllUsesWith( + M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee()); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index f43202e..8262c8c 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes, struct SCCNodesResult { SCCNodeSet SCCNodes; - 
bool HasUnknownCall; }; } // end anonymous namespace @@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes, static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { SCCNodesResult Res; - Res.HasUnknownCall = false; for (Function *F : Functions) { if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) || F->isPresplitCoroutine()) { - // Treat any function we're trying not to optimize as if it were an - // indirect call and omit it from the node set used below. - Res.HasUnknownCall = true; + // Omit any functions we're trying not to optimize from the set. continue; } - // Track whether any functions in this SCC have an unknown call edge. - // Note: if this is ever a performance hit, we can common it with - // subsequent routines which also do scans over the instructions of the - // function. - if (!Res.HasUnknownCall) { - for (Instruction &I : instructions(*F)) { - if (auto *CB = dyn_cast<CallBase>(&I)) { - if (!CB->getCalledFunction()) { - Res.HasUnknownCall = true; - break; - } - } - } - } + Res.SCCNodes.insert(F); } return Res; @@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, addColdAttrs(Nodes.SCCNodes, Changed); addWillReturn(Nodes.SCCNodes, Changed); addNoUndefAttrs(Nodes.SCCNodes, Changed); - - // If we have no external nodes participating in the SCC, we can deduce some - // more precise attributes as well. - if (!Nodes.HasUnknownCall) { - addNoAliasAttrs(Nodes.SCCNodes, Changed); - addNonNullAttrs(Nodes.SCCNodes, Changed); - inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); - addNoRecurseAttrs(Nodes.SCCNodes, Changed); - } + addNoAliasAttrs(Nodes.SCCNodes, Changed); + addNonNullAttrs(Nodes.SCCNodes, Changed); + inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); + addNoRecurseAttrs(Nodes.SCCNodes, Changed); // Finally, infer the maximal set of attributes from the ones we've inferred // above. 
This is handling the cases where one attribute on a signature diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 486205c..57844a1 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -502,8 +502,7 @@ class LowerTypeTestsModule { uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); void importTypeTest(CallInst *CI); - void importFunction(Function *F, bool isJumpTableCanonical, - std::vector<GlobalAlias *> &AliasesToErase); + void importFunction(Function *F, bool isJumpTableCanonical); BitSetInfo buildBitSet(Metadata *TypeId, @@ -1103,9 +1102,8 @@ void LowerTypeTestsModule::maybeReplaceComdat(Function *F, // ThinLTO backend: the function F has a jump table entry; update this module // accordingly. isJumpTableCanonical describes the type of the jump table entry. -void LowerTypeTestsModule::importFunction( - Function *F, bool isJumpTableCanonical, - std::vector<GlobalAlias *> &AliasesToErase) { +void LowerTypeTestsModule::importFunction(Function *F, + bool isJumpTableCanonical) { assert(F->getType()->getAddressSpace() == 0); GlobalValue::VisibilityTypes Visibility = F->getVisibility(); @@ -1135,23 +1133,23 @@ void LowerTypeTestsModule::importFunction( } else { F->setName(Name + ".cfi"); maybeReplaceComdat(F, Name); - F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), Name, &M); FDecl->setVisibility(Visibility); Visibility = GlobalValue::HiddenVisibility; - // Delete aliases pointing to this function, they'll be re-created in the - // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed - // will want to reset the aliasees first. 
+ // Update aliases pointing to this function to also include the ".cfi" suffix. + // We expect the jump table entry to either point to the real function or an + // alias. Redirect all other users to the jump table entry. for (auto &U : F->uses()) { if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) { + std::string AliasName = A->getName().str() + ".cfi"; Function *AliasDecl = Function::Create( F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), "", &M); AliasDecl->takeName(A); A->replaceAllUsesWith(AliasDecl); - AliasesToErase.push_back(A); + A->setName(AliasName); } } } @@ -2077,16 +2075,13 @@ bool LowerTypeTestsModule::lower() { Decls.push_back(&F); } - std::vector<GlobalAlias *> AliasesToErase; { ScopedSaveAliaseesAndUsed S(M); for (auto *F : Defs) - importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase); + importFunction(F, /*isJumpTableCanonical*/ true); for (auto *F : Decls) - importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase); + importFunction(F, /*isJumpTableCanonical*/ false); } - for (GlobalAlias *GA : AliasesToErase) - GA->eraseFromParent(); return true; } @@ -2137,6 +2132,18 @@ bool LowerTypeTestsModule::lower() { if (auto Alias = dyn_cast<AliasSummary>(RefGVS.get())) AddressTaken.insert(Alias->getAliaseeGUID()); } + auto IsAddressTaken = [&](GlobalValue::GUID GUID) { + if (AddressTaken.count(GUID)) + return true; + auto VI = ExportSummary->getValueInfo(GUID); + if (!VI) + return false; + for (auto &I : VI.getSummaryList()) + if (auto Alias = dyn_cast<AliasSummary>(I.get())) + if (AddressTaken.count(Alias->getAliaseeGUID())) + return true; + return false; + }; for (auto *FuncMD : CfiFunctionsMD->operands()) { assert(FuncMD->getNumOperands() >= 2); StringRef FunctionName = @@ -2153,7 +2160,7 @@ bool LowerTypeTestsModule::lower() { // have no live references (and are not exported with cross-DSO CFI.) 
if (!ExportSummary->isGUIDLive(GUID)) continue; - if (!AddressTaken.count(GUID)) { + if (!IsAddressTaken(GUID)) { if (!CrossDsoCfi || Linkage != CFL_Definition) continue; @@ -2227,6 +2234,43 @@ bool LowerTypeTestsModule::lower() { } } + struct AliasToCreate { + Function *Alias; + std::string TargetName; + }; + std::vector<AliasToCreate> AliasesToCreate; + + // Parse alias data to replace stand-in function declarations for aliases + // with an alias to the intended target. + if (ExportSummary) { + if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) { + for (auto *AliasMD : AliasesMD->operands()) { + SmallVector<Function *> Aliases; + for (Metadata *MD : AliasMD->operands()) { + auto *MDS = dyn_cast<MDString>(MD); + if (!MDS) + continue; + StringRef AliasName = MDS->getString(); + if (!ExportedFunctions.count(AliasName)) + continue; + auto *AliasF = M.getFunction(AliasName); + if (AliasF) + Aliases.push_back(AliasF); + } + + if (Aliases.empty()) + continue; + + for (unsigned I = 1; I != Aliases.size(); ++I) { + auto *AliasF = Aliases[I]; + ExportedFunctions.erase(AliasF->getName()); + AliasesToCreate.push_back( + {AliasF, std::string(Aliases[0]->getName())}); + } + } + } + } + DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers; for (GlobalObject &GO : M.global_objects()) { if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker()) @@ -2414,47 +2458,16 @@ bool LowerTypeTestsModule::lower() { allocateByteArrays(); - // Parse alias data to replace stand-in function declarations for aliases - // with an alias to the intended target. 
- if (ExportSummary) { - if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) { - for (auto *AliasMD : AliasesMD->operands()) { - assert(AliasMD->getNumOperands() >= 4); - StringRef AliasName = - cast<MDString>(AliasMD->getOperand(0))->getString(); - StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString(); - - if (auto It = ExportedFunctions.find(Aliasee); - It == ExportedFunctions.end() || - It->second.Linkage != CFL_Definition || !M.getNamedAlias(Aliasee)) - continue; - - GlobalValue::VisibilityTypes Visibility = - static_cast<GlobalValue::VisibilityTypes>( - cast<ConstantAsMetadata>(AliasMD->getOperand(2)) - ->getValue() - ->getUniqueInteger() - .getZExtValue()); - bool Weak = - static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3)) - ->getValue() - ->getUniqueInteger() - .getZExtValue()); - - auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee)); - Alias->setVisibility(Visibility); - if (Weak) - Alias->setLinkage(GlobalValue::WeakAnyLinkage); - - if (auto *F = M.getFunction(AliasName)) { - Alias->takeName(F); - F->replaceAllUsesWith(Alias); - F->eraseFromParent(); - } else { - Alias->setName(AliasName); - } - } - } + for (auto A : AliasesToCreate) { + auto *Target = M.getNamedValue(A.TargetName); + if (!isa<GlobalAlias>(Target)) + continue; + auto *AliasGA = GlobalAlias::create("", Target); + AliasGA->setVisibility(A.Alias->getVisibility()); + AliasGA->setLinkage(A.Alias->getLinkage()); + AliasGA->takeName(A.Alias); + A.Alias->replaceAllUsesWith(AliasGA); + A.Alias->eraseFromParent(); } // Emit .symver directives for exported functions, if they exist. 
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 0164fcd..c009c1e 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); +STATISTIC(MismatchedCloneAssignments, + "Number of callsites assigned to call multiple non-matching clones"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -730,7 +732,7 @@ private: /// of the functions tracked calls to their new versions in the CallMap. /// Assigns new clones to clone number CloneNo. FuncInfo cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); @@ -897,7 +899,7 @@ private: CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map<CallInfo, CallInfo> &CallMap, + DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, @@ -989,7 +991,7 @@ private: CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map<CallInfo, CallInfo> &CallMap, + DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall 
&Call, @@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) { return F.getName().contains(MemProfCloneSuffix); } +// Return the clone number of the given function by extracting it from the +// memprof suffix. Assumes the caller has already confirmed it is a memprof +// clone. +static unsigned getMemProfCloneNum(const Function &F) { + assert(isMemProfClone(F)); + auto Pos = F.getName().find_last_of('.'); + assert(Pos > 0); + unsigned CloneNo; + bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + assert(!Err); + (void)Err; + return CloneNo; +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - if (CalleeFunc.cloneNo() > 0) + auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + if (isMemProfClone(*CurF)) { + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. 
+ auto CurCalleeCloneNo = getMemProfCloneNum(*CurF); + if (CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + } + if (NewCalleeCloneNo > 0) cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); OREGetter(CallerCall.call()->getFunction()) .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) @@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, assert(CI && "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); - CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + CurCalleeCloneNo = NewCalleeCloneNo; } // Update the debug information attached to NewFunc to use the clone Name. 
Note @@ -4019,7 +4062,7 @@ static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) { CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo ModuleCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { // Use existing LLVM facilities for cloning and obtaining Call in clone ValueToValueMapTy VMap; @@ -4042,7 +4085,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite( CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo IndexCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { // Check how many clones we have of Call (and therefore function). // The next clone number is the current size of versions array. @@ -4457,14 +4500,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; }; + // Information for a single clone of this Func. + struct FuncCloneInfo { + // The function clone. + FuncInfo FuncClone; + // Remappings of each call of interest (from original uncloned call to the + // corresponding cloned call in this function clone). + DenseMap<CallInfo, CallInfo> CallMap; + }; + // Walk all functions for which we saw calls with memprof metadata, and handle // cloning for each of its calls. for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { FuncInfo OrigFunc(Func); - // Map from each clone of OrigFunc to a map of remappings of each call of - // interest (from original uncloned call to the corresponding cloned call in - // that function clone). 
- std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; + // Map from each clone number of OrigFunc to information about that function + // clone (the function clone FuncInfo and call remappings). The index into + // the vector is the clone number, as function clones are created and + // numbered sequentially. + std::vector<FuncCloneInfo> FuncCloneInfos; for (auto &Call : CallsWithMetadata) { ContextNode *Node = getNodeForInst(Call); // Skip call if we do not have a node for it (all uses of its stack ids @@ -4488,8 +4541,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // Record the clone of callsite node assigned to this function clone. FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; - assert(FuncClonesToCallMap.count(FuncClone)); - std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; + assert(FuncCloneInfos.size() > FuncClone.cloneNo()); + DenseMap<CallInfo, CallInfo> &CallMap = + FuncCloneInfos[FuncClone.cloneNo()].CallMap; CallInfo CallClone(Call); if (auto It = CallMap.find(Call); It != CallMap.end()) CallClone = It->second; @@ -4528,10 +4582,10 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // than existing function clones, which would have been assigned to an // earlier clone in the list (we assign callsite clones to function // clones greedily). - if (FuncClonesToCallMap.size() < NodeCloneCount) { + if (FuncCloneInfos.size() < NodeCloneCount) { // If this is the first callsite copy, assign to original function. if (NodeCloneCount == 1) { - // Since FuncClonesToCallMap is empty in this case, no clones have + // Since FuncCloneInfos is empty in this case, no clones have // been created for this function yet, and no callers should have // been assigned a function clone for this callee node yet. 
assert(llvm::none_of( @@ -4540,7 +4594,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { })); // Initialize with empty call map, assign Clone to original function // and its callers, and skip to the next clone. - FuncClonesToCallMap[OrigFunc] = {}; + FuncCloneInfos.push_back( + {OrigFunc, DenseMap<CallInfo, CallInfo>()}); AssignCallsiteCloneToFuncClone( OrigFunc, Call, Clone, AllocationCallToContextNodeMap.count(Call)); @@ -4572,14 +4627,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { } // Clone function and save it along with the CallInfo map created - // during cloning in the FuncClonesToCallMap. - std::map<CallInfo, CallInfo> NewCallMap; - unsigned CloneNo = FuncClonesToCallMap.size(); + // during cloning in the FuncCloneInfos. + DenseMap<CallInfo, CallInfo> NewCallMap; + unsigned CloneNo = FuncCloneInfos.size(); assert(CloneNo > 0 && "Clone 0 is the original function, which " "should already exist in the map"); FuncInfo NewFuncClone = cloneFunctionForCallsite( OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); - FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); + FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)}); FunctionClonesAnalysis++; Changed = true; @@ -4680,8 +4735,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // CallMap is set up as indexed by original Call at clone 0. CallInfo OrigCall(Callee->getOrigNode()->Call); OrigCall.setCloneNo(0); - std::map<CallInfo, CallInfo> &CallMap = - FuncClonesToCallMap[NewFuncClone]; + DenseMap<CallInfo, CallInfo> &CallMap = + FuncCloneInfos[NewFuncClone.cloneNo()].CallMap; assert(CallMap.count(OrigCall)); CallInfo NewCall(CallMap[OrigCall]); assert(NewCall); @@ -4703,6 +4758,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // where the callers were assigned to different clones of a function. 
} + auto FindFirstAvailFuncClone = [&]() { + // Find first function in FuncCloneInfos without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncCloneInfos size was smaller than the clone number. + for (auto &CF : FuncCloneInfos) { + if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone)) + return CF.FuncClone; + } + llvm_unreachable( + "Expected an available func clone for this callsite clone"); + }; + // See if we can use existing function clone. Walk through // all caller edges to see if any have already been assigned to // a clone of this callsite's function. If we can use it, do so. If not, @@ -4819,16 +4887,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // clone of OrigFunc for another caller during this iteration over // its caller edges. if (!FuncCloneAssignedToCurCallsiteClone) { - // Find first function in FuncClonesToCallMap without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncClonesToCallMap size was smaller than the clone number. - for (auto &CF : FuncClonesToCallMap) { - if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { - FuncCloneAssignedToCurCallsiteClone = CF.first; - break; - } - } + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); assert(FuncCloneAssignedToCurCallsiteClone); // Assign Clone to FuncCloneAssignedToCurCallsiteClone AssignCallsiteCloneToFuncClone( @@ -4842,6 +4901,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncCloneAssignedToCurCallsiteClone); } } + // If we didn't assign a function clone to this callsite clone yet, e.g. + // none of its callers has a non-null call, do the assignment here. + // We want to ensure that every callsite clone is assigned to some + // function clone, so that the call updates below work as expected. 
+ // In particular if this is the original callsite, we want to ensure it + // is assigned to the original function, otherwise the original function + // will appear available for assignment to other callsite clones, + // leading to unintended effects. For one, the unknown and not updated + // callers will call into cloned paths leading to the wrong hints, + // because they still call the original function (clone 0). Also, + // because all callsites start out as being clone 0 by default, we can't + // easily distinguish between callsites explicitly assigned to clone 0 + // vs those never assigned, which can lead to multiple updates of the + // calls when invoking updateCall below, with mismatched clone values. + // TODO: Add a flag to the callsite nodes or some other mechanism to + // better distinguish and identify callsite clones that are not getting + // assigned to function clones as expected. + if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); + assert(FuncCloneAssignedToCurCallsiteClone && + "No available func clone for this callsite clone"); + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call)); + } } if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(Node); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index e276376..4387c38 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -384,6 +384,10 @@ void splitAndWriteThinLTOBitcode( for (auto &F : M) if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F)) CfiFunctions.insert(&F); + for (auto &A : M.aliases()) + if (auto *F = dyn_cast<Function>(A.getAliasee())) + if (HasTypeMetadata(F)) + CfiFunctions.insert(&A); // Remove all globals with type metadata, globals with comdats that live in // MergedM, and 
aliases pointing to such globals from the thin LTO module. @@ -403,12 +407,12 @@ void splitAndWriteThinLTOBitcode( auto &Ctx = MergedM->getContext(); SmallVector<MDNode *, 8> CfiFunctionMDs; for (auto *V : CfiFunctions) { - Function &F = *cast<Function>(V); + Function &F = *cast<Function>(V->getAliaseeObject()); SmallVector<MDNode *, 2> Types; F.getMetadata(LLVMContext::MD_type, Types); SmallVector<Metadata *, 4> Elts; - Elts.push_back(MDString::get(Ctx, F.getName())); + Elts.push_back(MDString::get(Ctx, V->getName())); CfiFunctionLinkage Linkage; if (lowertypetests::isJumpTableCanonical(&F)) Linkage = CFL_Definition; @@ -428,29 +432,24 @@ void splitAndWriteThinLTOBitcode( NMD->addOperand(MD); } - SmallVector<MDNode *, 8> FunctionAliases; + MapVector<Function *, std::vector<GlobalAlias *>> FunctionAliases; for (auto &A : M.aliases()) { if (!isa<Function>(A.getAliasee())) continue; auto *F = cast<Function>(A.getAliasee()); - - Metadata *Elts[] = { - MDString::get(Ctx, A.getName()), - MDString::get(Ctx, F->getName()), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())), - }; - - FunctionAliases.push_back(MDTuple::get(Ctx, Elts)); + FunctionAliases[F].push_back(&A); } if (!FunctionAliases.empty()) { NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases"); - for (auto *MD : FunctionAliases) - NMD->addOperand(MD); + for (auto &Alias : FunctionAliases) { + SmallVector<Metadata *> Elts; + Elts.push_back(MDString::get(Ctx, Alias.first->getName())); + for (auto *A : Alias.second) + Elts.push_back(MDString::get(Ctx, A->getName())); + NMD->addOperand(MDTuple::get(Ctx, Elts)); + } } SmallVector<MDNode *, 8> Symvers; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b231c04..d7971e8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -11,10 +11,13 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/FloatingPointPredicateUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" @@ -3589,6 +3592,154 @@ static Value *foldOrOfInversions(BinaryOperator &I, return nullptr; } +/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl" +/// patterns, which extract vector elements and pack them in the same relative +/// positions. +/// +/// \p Vec is the underlying vector being extracted from. +/// \p Mask is a bitmask identifying which packed elements are obtained from the +/// vector. +/// \p VecOffset is the vector element corresponding to index 0 of the +/// mask. 
+static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec, + int64_t &VecOffset, + SmallBitVector &Mask, + const DataLayout &DL) { + static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) { + ShlAmt = 0; + return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base); + }; + + // First try to match extractelement -> zext -> shl + uint64_t VecIdx, ShlAmt; + if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt( + m_Value(Vec), m_ConstantInt(VecIdx))), + ShlAmt))) { + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + auto *EltTy = dyn_cast<IntegerType>(VecTy->getElementType()); + if (!EltTy) + return false; + + const unsigned EltBitWidth = EltTy->getBitWidth(); + const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth(); + if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0) + return false; + const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth; + const unsigned ShlEltAmt = ShlAmt / EltBitWidth; + + const unsigned MaskIdx = + DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1; + + VecOffset = static_cast<int64_t>(VecIdx) - static_cast<int64_t>(MaskIdx); + Mask.resize(TargetEltWidth); + Mask.set(MaskIdx); + return true; + } + + // Now try to match a bitcasted subvector. + Instruction *SrcVecI; + if (!match(V, m_BitCast(m_Instruction(SrcVecI)))) + return false; + + auto *SrcTy = dyn_cast<FixedVectorType>(SrcVecI->getType()); + if (!SrcTy) + return false; + + Mask.resize(SrcTy->getNumElements()); + + // First check for a subvector obtained from a shufflevector. 
+ if (isa<ShuffleVectorInst>(SrcVecI)) { + Constant *ConstVec; + ArrayRef<int> ShuffleMask; + if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec), + m_Mask(ShuffleMask)))) + return false; + + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + + const unsigned NumVecElts = VecTy->getNumElements(); + bool FoundVecOffset = false; + for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) { + if (ShuffleMask[Idx] == PoisonMaskElem) + return false; + const unsigned ShuffleIdx = ShuffleMask[Idx]; + if (ShuffleIdx >= NumVecElts) { + const unsigned ConstIdx = ShuffleIdx - NumVecElts; + auto *ConstElt = + dyn_cast<ConstantInt>(ConstVec->getAggregateElement(ConstIdx)); + if (!ConstElt || !ConstElt->isNullValue()) + return false; + continue; + } + + if (FoundVecOffset) { + if (VecOffset + Idx != ShuffleIdx) + return false; + } else { + if (ShuffleIdx < Idx) + return false; + VecOffset = ShuffleIdx - Idx; + FoundVecOffset = true; + } + Mask.set(Idx); + } + return FoundVecOffset; + } + + // Check for a subvector obtained as an (insertelement V, 0, idx) + uint64_t InsertIdx; + if (!match(SrcVecI, + m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx)))) + return false; + + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + VecOffset = 0; + bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx); + Mask.set(); + if (!AlreadyInsertedMaskedElt) + Mask.reset(InsertIdx); + return true; +} + +/// Try to fold the join of two scalar integers whose contents are packed +/// elements of the same vector. 
+static Instruction *foldIntegerPackFromVector(Instruction &I, + InstCombiner::BuilderTy &Builder, + const DataLayout &DL) { + assert(I.getOpcode() == Instruction::Or); + Value *LhsVec, *RhsVec; + int64_t LhsVecOffset, RhsVecOffset; + SmallBitVector Mask; + if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset, + Mask, DL)) + return nullptr; + if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset, + Mask, DL)) + return nullptr; + if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset) + return nullptr; + + // Convert into shufflevector -> bitcast; + const unsigned ZeroVecIdx = + cast<FixedVectorType>(LhsVec->getType())->getNumElements(); + SmallVector<int> ShuffleMask(Mask.size(), ZeroVecIdx); + for (unsigned Idx : Mask.set_bits()) { + assert(LhsVecOffset + Idx >= 0); + ShuffleMask[Idx] = LhsVecOffset + Idx; + } + + Value *MaskedVec = Builder.CreateShuffleVector( + LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask, + I.getName() + ".v"); + return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType()); +} + // A decomposition of ((X & Mask) * Factor). The NUW / NSW bools // track these properities for preservation. Note that we can decompose // equivalent select form of this expression (e.g. (!(X & Mask) ? 
0 : Mask * @@ -3766,6 +3917,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *X = foldComplexAndOrPatterns(I, Builder)) return X; + if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL)) + return X; + // (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D // (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C if (Value *V = foldOrOfInversions(I, Builder)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index da9b126..b268fea 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -163,6 +163,11 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( LaterIndices.push_back(IdxVal); } + Value *Idx = GEP->getOperand(2); + // If the index type is non-canonical, wait for it to be canonicalized. + if (Idx->getType() != DL.getIndexType(GEP->getType())) + return nullptr; + enum { Overdefined = -3, Undefined = -2 }; // Variables for our state machines. @@ -290,17 +295,6 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Now that we've scanned the entire array, emit our new comparison(s). We // order the state machines in complexity of the generated code. - Value *Idx = GEP->getOperand(2); - - // If the index is larger than the pointer offset size of the target, truncate - // the index down like the GEP would do implicitly. We don't have to do this - // for an inbounds GEP because the index can't be out of range. - if (!GEP->isInBounds()) { - Type *PtrIdxTy = DL.getIndexType(GEP->getType()); - unsigned OffsetSize = PtrIdxTy->getIntegerBitWidth(); - if (Idx->getType()->getPrimitiveSizeInBits().getFixedValue() > OffsetSize) - Idx = Builder.CreateTrunc(Idx, PtrIdxTy); - } // If inbounds keyword is not present, Idx * ElementSize can overflow. // Let's assume that ElementSize is 2 and the wanted value is at offset 0. 
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index e2a9255..9e33320 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2777,6 +2777,12 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, Indices.append(GEP.idx_begin()+1, GEP.idx_end()); } + // Don't create GEPs with more than one variable index. + unsigned NumVarIndices = + count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); }); + if (NumVarIndices > 1) + return nullptr; + if (!Indices.empty()) return replaceInstUsesWith( GEP, Builder.CreateGEP( @@ -3199,6 +3205,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, NewGEP); } + // Strip trailing zero indices. + auto *LastIdx = dyn_cast<Constant>(Indices.back()); + if (LastIdx && LastIdx->isNullValue() && !LastIdx->getType()->isVectorTy()) { + return replaceInstUsesWith( + GEP, Builder.CreateGEP(GEP.getSourceElementType(), PtrOp, + drop_end(Indices), "", GEP.getNoWrapFlags())); + } + // Scalarize vector operands; prefer splat-of-gep.as canonical form. // Note that this looses information about undef lanes; we run it after // demanded bits to partially mitigate that loss. @@ -3225,6 +3239,30 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, Res); } + bool SeenVarIndex = false; + for (auto [IdxNum, Idx] : enumerate(Indices)) { + if (isa<Constant>(Idx)) + continue; + + if (!SeenVarIndex) { + SeenVarIndex = true; + continue; + } + + // GEP has multiple variable indices: Split it. 
+ ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum); + Value *FrontGEP = + Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices, + GEP.getName() + ".split", GEP.getNoWrapFlags()); + + SmallVector<Value *> BackIndices; + BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy)); + append_range(BackIndices, drop_begin(Indices, IdxNum)); + return GetElementPtrInst::Create( + GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP, + BackIndices, GEP.getNoWrapFlags()); + } + // Check to see if the inputs to the PHI node are getelementptr instructions. if (auto *PN = dyn_cast<PHINode>(PtrOp)) { if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder)) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index df31f07..54d9a83 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + // Approximately handle AVX Galois Field Affine Transformation + // + // e.g., + // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) + // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) + // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8) + // Out A x b + // where A and x are packed matrices, b is a vector, + // Out = A * x + b in GF(2) + // + // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix + // computation also includes a parity calculation. 
+ // + // For the bitwise AND of bits V1 and V2, the exact shadow is: + // Out_Shadow = (V1_Shadow & V2_Shadow) + // | (V1 & V2_Shadow) + // | (V1_Shadow & V2 ) + // + // We approximate the shadow of gf2p8affineqb using: + // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0) + // | gf2p8affineqb(x, A_shadow, 0) + // | gf2p8affineqb(x_Shadow, A, 0) + // | set1_epi8(b_Shadow) + // + // This approximation has false negatives: if an intermediate dot-product + // contains an even number of 1's, the parity is 0. + // It has no false positives. + void handleAVXGF2P8Affine(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + assert(I.arg_size() == 3); + Value *A = I.getOperand(0); + Value *X = I.getOperand(1); + Value *B = I.getOperand(2); + + assert(isFixedIntVector(A)); + assert(cast<VectorType>(A->getType()) + ->getElementType() + ->getScalarSizeInBits() == 8); + + assert(A->getType() == X->getType()); + + assert(B->getType()->isIntegerTy()); + assert(B->getType()->getScalarSizeInBits() == 8); + + assert(I.getType() == A->getType()); + + Value *AShadow = getShadow(A); + Value *XShadow = getShadow(X); + Value *BZeroShadow = getCleanShadow(B); + + CallInst *AShadowXShadow = IRB.CreateIntrinsic( + I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow}); + CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {X, AShadow, BZeroShadow}); + CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {XShadow, A, BZeroShadow}); + + unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements(); + Value *BShadow = getShadow(B); + Value *BBroadcastShadow = getCleanShadow(AShadow); + // There is no LLVM IR intrinsic for _mm512_set1_epi8. + // This loop generates a lot of LLVM IR, which we expect that CodeGen will + // lower appropriately (e.g., VPBROADCASTB). + // Besides, b is often a constant, in which case it is fully initialized. 
+ for (unsigned i = 0; i < NumElements; i++) + BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i); + + setShadow(&I, IRB.CreateOr( + {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow})); + setOriginForNaryOp(I); + } + // Handle Arm NEON vector load intrinsics (vld*). // // The WithLane instructions (ld[234]lane) are similar to: @@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + // AVX Galois Field New Instructions + case Intrinsic::x86_vgf2p8affineqb_128: + case Intrinsic::x86_vgf2p8affineqb_256: + case Intrinsic::x86_vgf2p8affineqb_512: + handleAVXGF2P8Affine(I); + break; + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 0f63ed0..9b87180 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1360,13 +1360,10 @@ struct DSEState { /// indicating whether \p I is a free-like call. 
std::optional<std::pair<MemoryLocation, bool>> getLocForTerminator(Instruction *I) const { - uint64_t Len; - Value *Ptr; - if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len), - m_Value(Ptr)))) - return {std::make_pair(MemoryLocation(Ptr, Len), false)}; - if (auto *CB = dyn_cast<CallBase>(I)) { + if (CB->getIntrinsicID() == Intrinsic::lifetime_end) + return { + std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)}; if (Value *FreedOp = getFreedOperand(CB, &TLI)) return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)}; } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 6a3f656..1a52af1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -651,7 +651,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; - mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred; + mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -840,7 +840,7 @@ private: // Ranking unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; - bool shouldSwapOperandsForIntrinsic(const Value *, const Value *, + bool shouldSwapOperandsForPredicate(const Value *, const Value *, const IntrinsicInst *I) const; // Reachability handling. @@ -1624,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { Value *AdditionallyUsedValue = CmpOp0; // Sort the ops. 
- if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) { + if (shouldSwapOperandsForPredicate(FirstOp, SecondOp, I)) { std::swap(FirstOp, SecondOp); Predicate = CmpInst::getSwappedPredicate(Predicate); AdditionallyUsedValue = CmpOp1; @@ -3024,7 +3024,7 @@ void NewGVN::cleanupTables() { PredicateToUsers.clear(); MemoryToUsers.clear(); RevisitOnReachabilityChange.clear(); - IntrinsicInstPred.clear(); + PredicateSwapChoice.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -4250,20 +4250,18 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } -bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B, +bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B, const IntrinsicInst *I) const { - auto LookupResult = IntrinsicInstPred.find(I); if (shouldSwapOperands(A, B)) { - if (LookupResult == IntrinsicInstPred.end()) - IntrinsicInstPred.insert({I, B}); - else - LookupResult->second = B; + PredicateSwapChoice[I] = B; return true; } - if (LookupResult != IntrinsicInstPred.end()) { + auto LookupResult = PredicateSwapChoice.find(I); + if (LookupResult != PredicateSwapChoice.end()) { auto *SeenPredicate = LookupResult->second; if (SeenPredicate) { + // We previously decided to swap B to the left. Keep that choice. if (SeenPredicate == B) return true; else diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b9292af..b78c702 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -703,6 +703,7 @@ private: // Add U as additional user of V. 
void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); } + void handlePredicate(Instruction *I, Value *CopyOf, const PredicateBase *PI); void handleCallOverdefined(CallBase &CB); void handleCallResult(CallBase &CB); void handleCallArguments(CallBase &CB); @@ -1927,6 +1928,75 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) { } } +void SCCPInstVisitor::handlePredicate(Instruction *I, Value *CopyOf, + const PredicateBase *PI) { + ValueLatticeElement CopyOfVal = getValueState(CopyOf); + const std::optional<PredicateConstraint> &Constraint = PI->getConstraint(); + if (!Constraint) { + mergeInValue(ValueState[I], I, CopyOfVal); + return; + } + + CmpInst::Predicate Pred = Constraint->Predicate; + Value *OtherOp = Constraint->OtherOp; + + // Wait until OtherOp is resolved. + if (getValueState(OtherOp).isUnknown()) { + addAdditionalUser(OtherOp, I); + return; + } + + ValueLatticeElement CondVal = getValueState(OtherOp); + ValueLatticeElement &IV = ValueState[I]; + if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { + auto ImposedCR = + ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType())); + + // Get the range imposed by the condition. + if (CondVal.isConstantRange()) + ImposedCR = ConstantRange::makeAllowedICmpRegion( + Pred, CondVal.getConstantRange()); + + // Combine range info for the original value with the new range from the + // condition. + auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(), + /*UndefAllowed=*/true); + // Treat an unresolved input like a full range. + if (CopyOfCR.isEmptySet()) + CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth()); + auto NewCR = ImposedCR.intersectWith(CopyOfCR); + // If the existing information is != x, do not use the information from + // a chained predicate, as the != x information is more likely to be + // helpful in practice. 
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) + NewCR = CopyOfCR; + + // The new range is based on a branch condition. That guarantees that + // neither of the compare operands can be undef in the branch targets, + // unless we have conditions that are always true/false (e.g. icmp ule + // i32, %a, i32_max). For the latter overdefined/empty range will be + // inferred, but the branch will get folded accordingly anyways. + addAdditionalUser(OtherOp, I); + mergeInValue( + IV, I, ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); + return; + } else if (Pred == CmpInst::ICMP_EQ && + (CondVal.isConstant() || CondVal.isNotConstant())) { + // For non-integer values or integer constant expressions, only + // propagate equal constants or not-constants. + addAdditionalUser(OtherOp, I); + mergeInValue(IV, I, CondVal); + return; + } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { + // Propagate inequalities. + addAdditionalUser(OtherOp, I); + mergeInValue(IV, I, ValueLatticeElement::getNot(CondVal.getConstant())); + return; + } + + return (void)mergeInValue(IV, I, CopyOfVal); +} + void SCCPInstVisitor::handleCallResult(CallBase &CB) { Function *F = CB.getCalledFunction(); @@ -1936,77 +2006,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return; Value *CopyOf = CB.getOperand(0); - ValueLatticeElement CopyOfVal = getValueState(CopyOf); - const auto *PI = getPredicateInfoFor(&CB); + const PredicateBase *PI = getPredicateInfoFor(&CB); assert(PI && "Missing predicate info for ssa.copy"); - - const std::optional<PredicateConstraint> &Constraint = - PI->getConstraint(); - if (!Constraint) { - mergeInValue(ValueState[&CB], &CB, CopyOfVal); - return; - } - - CmpInst::Predicate Pred = Constraint->Predicate; - Value *OtherOp = Constraint->OtherOp; - - // Wait until OtherOp is resolved. 
- if (getValueState(OtherOp).isUnknown()) { - addAdditionalUser(OtherOp, &CB); - return; - } - - ValueLatticeElement CondVal = getValueState(OtherOp); - ValueLatticeElement &IV = ValueState[&CB]; - if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { - auto ImposedCR = - ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType())); - - // Get the range imposed by the condition. - if (CondVal.isConstantRange()) - ImposedCR = ConstantRange::makeAllowedICmpRegion( - Pred, CondVal.getConstantRange()); - - // Combine range info for the original value with the new range from the - // condition. - auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(), - /*UndefAllowed=*/true); - // Treat an unresolved input like a full range. - if (CopyOfCR.isEmptySet()) - CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth()); - auto NewCR = ImposedCR.intersectWith(CopyOfCR); - // If the existing information is != x, do not use the information from - // a chained predicate, as the != x information is more likely to be - // helpful in practice. - if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) - NewCR = CopyOfCR; - - // The new range is based on a branch condition. That guarantees that - // neither of the compare operands can be undef in the branch targets, - // unless we have conditions that are always true/false (e.g. icmp ule - // i32, %a, i32_max). For the latter overdefined/empty range will be - // inferred, but the branch will get folded accordingly anyways. - addAdditionalUser(OtherOp, &CB); - mergeInValue( - IV, &CB, - ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); - return; - } else if (Pred == CmpInst::ICMP_EQ && - (CondVal.isConstant() || CondVal.isNotConstant())) { - // For non-integer values or integer constant expressions, only - // propagate equal constants or not-constants. 
- addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, CondVal); - return; - } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { - // Propagate inequalities. - addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, - ValueLatticeElement::getNot(CondVal.getConstant())); - return; - } - - return (void)mergeInValue(IV, &CB, CopyOfVal); + handlePredicate(&CB, CopyOf, PI); + return; } if (II->getIntrinsicID() == Intrinsic::vscale) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7b7efb8..fe93fcd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -93,6 +93,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -155,6 +156,7 @@ #include <utility> using namespace llvm; +using namespace SCEVPatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// ElementCount to include loops whose trip count is a function of vscale. 
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L) { - return ElementCount::getFixed(SE->getSmallConstantTripCount(L)); + if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L)) + return ElementCount::getFixed(ExpectedTC); + + const SCEV *BTC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BTC)) + return ElementCount::getFixed(0); + + const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L); + if (isa<SCEVVScale>(ExitCount)) + return ElementCount::getScalable(1); + + const APInt *Scale; + if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale()))) + if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap()) + if (Scale->getActiveBits() <= 32) + return ElementCount::getScalable(Scale->getZExtValue()); + + return ElementCount::getFixed(0); } /// Returns "best known" trip count, which is either a valid positive trip count @@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) { } } -/// This function attempts to return a value that represents the vectorization -/// factor at runtime. For fixed-width VFs we know this precisely at compile +/// This function attempts to return a value that represents the ElementCount +/// at runtime. For fixed-width VFs we know this precisely at compile /// time, but for scalable VFs we calculate it based on an estimate of the /// vscale value. -static unsigned getEstimatedRuntimeVF(ElementCount VF, - std::optional<unsigned> VScale) { +static unsigned estimateElementCount(ElementCount VF, + std::optional<unsigned> VScale) { unsigned EstimatedVF = VF.getKnownMinValue(); if (VF.isScalable()) if (VScale) @@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // use the value of vscale used for tuning. 
Loop *VectorLoop = LI->getLoopFor(HeaderBB); unsigned EstimatedVFxUF = - getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning()); + estimateElementCount(VF * UF, Cost->getVScaleForTuning()); setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); } @@ -3003,7 +3022,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // is correct. The easiest form of the later is to require that all values // stored are the same. return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && - Legal->isInvariant(cast<StoreInst>(I)->getValueOperand())); + TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); } case Instruction::UDiv: case Instruction::SDiv: @@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); unsigned Width = - getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); + estimateElementCount(Candidate.Width, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF << " costs: " << (Candidate.Cost / Width)); if (VF.isScalable()) @@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 ? EpilogueVectorizationMinVF : TTI.getEpilogueVectorizationMinVF(); - return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >= + return estimateElementCount(VF * Multiplier, VScaleForTuning) >= MinVFThreshold; } @@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. 
ElementCount EstimatedRuntimeVF = ElementCount::getFixed( - getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning())); + estimateElementCount(MainLoopVF, CM.getVScaleForTuning())); ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); @@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning); - // Try to get the exact trip count, or an estimate based on profiling data or // ConstantMax from PSE, failing that. - if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) { + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); + + // For fixed length VFs treat a scalable trip count as unknown. + if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) { + // Re-evaluate trip counts and VFs to be in the same numerical space. + unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning); + unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning); + // At least one iteration must be scalar when this constraint holds. So the // maximum available iterations for interleaving is one less. - unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) - ? BestKnownTC->getFixedValue() - 1 - : BestKnownTC->getFixedValue(); + if (requiresScalarEpilogue(VF.isVector())) + --AvailableTC; unsigned InterleaveCountLB = bit_floor(std::max( 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); @@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, // Now compute and add the VPlan-based cost. 
Cost += Plan.cost(VF, CostCtx); #ifndef NDEBUG - unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); + unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << " (Estimated cost per lane: "); if (Cost.isValid()) { @@ -7292,6 +7315,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // Regions are dissolved after optimizing for VF and UF, which completely // removes unneeded loop regions first. VPlanTransforms::dissolveLoopRegions(BestVPlan); + // Canonicalize EVL loops after regions are dissolved. + VPlanTransforms::canonicalizeEVLLoops(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, OrigLoop->getParentLoop(), @@ -9611,7 +9636,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); + unsigned IntVF = estimateElementCount(VF.Width, VScale); uint64_t RtC = TotalCost.getValue(); uint64_t Div = ScalarC * IntVF - VF.Cost.getValue(); uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 225658b..68e7c20 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3391,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // must use intrinsics to interleave. 
if (VecTy->isScalableTy()) { assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors"); - VectorType *InterleaveTy = - VectorType::get(VecTy->getElementType(), - VecTy->getElementCount().multiplyCoefficientBy(Factor)); - return Builder.CreateIntrinsic(InterleaveTy, - getInterleaveIntrinsicID(Factor), Vals, - /*FMFSource=*/nullptr, Name); + return Builder.CreateVectorInterleave(Vals, Name); } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -3503,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(InterleaveFactor <= 8 && "Unsupported deinterleave factor for scalable vectors"); NewLoad = State.Builder.CreateIntrinsic( - getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), - NewLoad, + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, /*FMFSource=*/nullptr, "strided.vec"); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 935a4e4..47a9ff0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1094,6 +1094,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0)))) + return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) + : R.getOperand(0)); + if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) return Def->replaceAllUsesWith(A); @@ -2240,6 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { // Try to optimize header mask recipes away to their EVL variants. for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { + // TODO: Split optimizeMaskToEVL out and move into + // VPlanTransforms::optimize. 
transformRecipestoEVLRecipes should be run in + // tryToBuildVPlanWithVPRecipes beforehand. for (VPUser *U : collectUsersRecursively(HeaderMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = @@ -2261,6 +2268,20 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { } ToErase.push_back(CurRecipe); } + + // Replace header masks with a mask equivalent to predicating by EVL: + // + // icmp ule widen-canonical-iv backedge-taken-count + // -> + // icmp ult step-vector, EVL + VPRecipeBase *EVLR = EVL.getDefiningRecipe(); + VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); + Type *EVLType = TypeInfo.inferScalarType(&EVL); + VPValue *EVLMask = Builder.createICmp( + CmpInst::ICMP_ULT, + Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); + HeaderMask->replaceAllUsesWith(EVLMask); + ToErase.push_back(HeaderMask->getDefiningRecipe()); } for (VPRecipeBase *R : reverse(ToErase)) { @@ -2369,6 +2390,66 @@ bool VPlanTransforms::tryAddExplicitVectorLength( return true; } +void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { + using namespace llvm::VPlanPatternMatch; + // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe. + // There should be only one EVL PHI in the entire plan. + VPEVLBasedIVPHIRecipe *EVLPhi = nullptr; + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(Plan.getEntry()))) + for (VPRecipeBase &R : VPBB->phis()) + if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) { + assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected"); + EVLPhi = PhiR; + } + + // Early return if no EVL PHI is found. + if (!EVLPhi) + return; + + VPBasicBlock *HeaderVPBB = EVLPhi->getParent(); + VPValue *EVLIncrement = EVLPhi->getBackedgeValue(); + + // Convert EVLPhi to concrete recipe. 
+ auto *ScalarR = + VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement}, + EVLPhi->getDebugLoc(), "evl.based.iv"); + EVLPhi->replaceAllUsesWith(ScalarR); + EVLPhi->eraseFromParent(); + + // Replace CanonicalIVInc with EVL-PHI increment. + auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin()); + VPValue *Backedge = CanonicalIV->getIncomingValue(1); + assert(match(Backedge, + m_c_Binary<Instruction::Add>(m_Specific(CanonicalIV), + m_Specific(&Plan.getVFxUF()))) && + "Unexpected canonical iv"); + Backedge->replaceAllUsesWith(EVLIncrement); + + // Remove unused phi and increment. + VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe(); + CanonicalIVIncrement->eraseFromParent(); + CanonicalIV->eraseFromParent(); + + // Replace the use of VectorTripCount in the latch-exiting block. + // Before: (branch-on-count EVLIVInc, VectorTripCount) + // After: (branch-on-count EVLIVInc, TripCount) + + VPBasicBlock *LatchExiting = + HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock(); + auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator()); + // Skip single-iteration loop region + if (match(LatchExitingBr, m_BranchOnCond(m_True()))) + return; + assert(LatchExitingBr && + match(LatchExitingBr, + m_BranchOnCount(m_VPValue(EVLIncrement), + m_Specific(&Plan.getVectorTripCount()))) && + "Unexpected terminator in EVL loop"); + LatchExitingBr->setOperand(1, Plan.getTripCount()); +} + void VPlanTransforms::dropPoisonGeneratingRecipes( VPlan &Plan, const std::function<bool(BasicBlock *)> &BlockNeedsPredication) { @@ -2700,15 +2781,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) { - auto *ScalarR = VPBuilder(PhiR).createScalarPhi( - {PhiR->getStartValue(), PhiR->getBackedgeValue()}, - PhiR->getDebugLoc(), 
"evl.based.iv"); - PhiR->replaceAllUsesWith(ScalarR); - ToRemove.push_back(PhiR); - continue; - } - if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) { expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo); ToRemove.push_back(WidenIVR); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index d5af6cd..880159f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -209,6 +209,18 @@ struct VPlanTransforms { /// Replace loop regions with explicit CFG. static void dissolveLoopRegions(VPlan &Plan); + /// Transform EVL loops to use variable-length stepping after region + /// dissolution. + /// + /// Once loop regions are replaced with explicit CFG, EVL loops can step with + /// variable vector lengths instead of fixed lengths. This transformation: + /// * Makes EVL-Phi concrete. + /// * Removes CanonicalIV and increment. + /// * Replaces fixed-length stepping (branch-on-count CanonicalIVInc, + /// VectorTripCount) with variable-length stepping (branch-on-count + /// EVLIVInc, TripCount). + static void canonicalizeEVLLoops(VPlan &Plan); + /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. 
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 38ada33..57d01cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -17,6 +17,7 @@ #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "VPlanHelpers.h" +#include "VPlanPatternMatch.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" @@ -171,7 +172,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case<VPInstructionWithType>( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) .Case<VPInstruction>([&](const VPInstruction *I) { - if (I->getOpcode() == Instruction::PHI) + if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::ICmp) return VerifyEVLUse(*I, 1); switch (I->getOpcode()) { case Instruction::Add: @@ -192,7 +194,13 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { errs() << "EVL used by unexpected VPInstruction\n"; return false; } - if (I->getNumUsers() != 1) { + // EVLIVIncrement is only used by EVLIV & BranchOnCount. + // Having more than two users is unexpected. + if ((I->getNumUsers() != 1) && + (I->getNumUsers() != 2 || none_of(I->users(), [&I](VPUser *U) { + using namespace llvm::VPlanPatternMatch; + return match(U, m_BranchOnCount(m_Specific(I), m_VPValue())); + }))) { errs() << "EVL is used in VPInstruction with multiple users\n"; return false; } |