Diffstat (limited to 'llvm/lib')
125 files changed, 3021 insertions, 849 deletions
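Several of the hunks below (ComplexDeinterleavingPass.cpp, IRBuilder.cpp, Intrinsics.cpp) replace hand-built llvm.vector.interleave2 calls with the new IRBuilderBase::CreateVectorInterleave helper, which derives the destination vector type and the interleave intrinsic ID (via Intrinsic::getInterleaveIntrinsicID) from the number of operands. A minimal usage sketch, assuming an existing IRBuilder<> named Builder and two values Real and Imag of type <4 x float> (these names are illustrative, not taken from the patch):

    // Interleaves two <4 x float> operands into one <8 x float> value by
    // emitting llvm.vector.interleave2; with N operands (2 <= N <= 8) the
    // helper selects llvm.vector.interleave<N> and widens the result type
    // to N times the element count of the operand type.
    Value *Interleaved =
        Builder.CreateVectorInterleave({Real, Imag}, "interleaved");
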
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 759c553..2d52f34 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1373,7 +1373,7 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst, if (ConstantFP *CFP = dyn_cast<ConstantFP>(Operand)) return flushDenormalConstantFP(CFP, Inst, IsOutput); - if (isa<ConstantAggregateZero, UndefValue, ConstantExpr>(Operand)) + if (isa<ConstantAggregateZero, UndefValue>(Operand)) return Operand; Type *Ty = Operand->getType(); @@ -1389,6 +1389,9 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst, Ty = VecTy->getElementType(); } + if (isa<ConstantExpr>(Operand)) + return Operand; + if (const auto *CV = dyn_cast<ConstantVector>(Operand)) { SmallVector<Constant *, 16> NewElts; for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { @@ -2628,14 +2631,14 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_ceil_d: return ConstantFoldFP( ceil, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_fabs_ftz: case Intrinsic::nvvm_fabs: return ConstantFoldFP( fabs, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_floor_ftz_f: @@ -2643,7 +2646,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_floor_d: return ConstantFoldFP( floor, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_rcp_rm_ftz_f: @@ -2705,7 +2708,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return nullptr; return ConstantFoldFP( sqrt, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); // AMDGCN Intrinsics: diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index c871070..7025b83 100644 --- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const { Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } @@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const { Result.TBAA = Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp index 15107c2..2e4063f 100644 --- a/llvm/lib/Analysis/UniformityAnalysis.cpp +++ b/llvm/lib/Analysis/UniformityAnalysis.cpp @@ -178,6 +178,7 @@ bool UniformityInfoWrapperPass::runOnFunction(Function &F) { void UniformityInfoWrapperPass::print(raw_ostream &OS, const Module *) const { OS << "UniformityInfo for function '" << m_function->getName() << "':\n"; + m_uniformityInfo.print(OS); } void UniformityInfoWrapperPass::releaseMemory() { diff --git a/llvm/lib/Analysis/VectorUtils.cpp 
b/llvm/lib/Analysis/VectorUtils.cpp index 1b3da59..e9cf2ee 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; @@ -240,30 +246,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI, return Intrinsic::not_intrinsic; } -struct InterleaveIntrinsic { - Intrinsic::ID Interleave, Deinterleave; -}; - -static InterleaveIntrinsic InterleaveIntrinsics[] = { - {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2}, - {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3}, - {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4}, - {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5}, - {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6}, - {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7}, - {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8}, -}; - -Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - return InterleaveIntrinsics[Factor - 2].Interleave; -} - -Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - return InterleaveIntrinsics[Factor - 2].Deinterleave; -} - unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) { switch (ID) { case Intrinsic::vector_interleave2: diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp index 3b436af..f1765d7 100644 --- a/llvm/lib/BinaryFormat/SFrame.cpp +++ b/llvm/lib/BinaryFormat/SFrame.cpp @@ -35,3 +35,36 @@ ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() { }; return ArrayRef(ABIs); } + +ArrayRef<EnumEntry<sframe::FREType>> sframe::getFRETypes() { + static constexpr EnumEntry<sframe::FREType> FRETypes[] = { +#define HANDLE_SFRAME_FRE_TYPE(CODE, NAME) {#NAME, sframe::FREType::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FRETypes); +} + +ArrayRef<EnumEntry<sframe::FDEType>> sframe::getFDETypes() { + static constexpr EnumEntry<sframe::FDEType> FDETypes[] = { +#define HANDLE_SFRAME_FDE_TYPE(CODE, NAME) {#NAME, sframe::FDEType::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FDETypes); +} + +ArrayRef<EnumEntry<sframe::AArch64PAuthKey>> sframe::getAArch64PAuthKeys() { + static constexpr EnumEntry<sframe::AArch64PAuthKey> AArch64PAuthKeys[] = { +#define HANDLE_SFRAME_AARCH64_PAUTH_KEY(CODE, NAME) \ + {#NAME, sframe::AArch64PAuthKey::NAME}, +#include 
"llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(AArch64PAuthKeys); +} + +ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() { + static constexpr EnumEntry<sframe::FREOffset> FREOffsets[] = { +#define HANDLE_SFRAME_FRE_OFFSET(CODE, NAME) {#NAME, sframe::FREOffset::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FREOffsets); +} diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index a7c99b1..dcfd9aa 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2103,8 +2103,9 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { DI->eraseFromParent(); return; } - - DI->setDebugValueUndef(); + // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef. + if (!DI->isDebugLabel()) + DI->setDebugValueUndef(); DI->moveBefore(&*Loc); }; diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 8855740f..9b2851e 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, llvm_unreachable("Deinterleave node should already have ReplacementNode"); break; case ComplexDeinterleavingOperation::Splat: { - auto *NewTy = VectorType::getDoubleElementsVectorType( - cast<VectorType>(Node->Real->getType())); auto *R = dyn_cast<Instruction>(Node->Real); auto *I = dyn_cast<Instruction>(Node->Imag); if (R && I) { // Splats that are not constant are interleaved where they are located Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode(); IRBuilder<> IRB(InsertPoint); - ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2, - NewTy, {Node->Real, Node->Imag}); + ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag}); } else { - ReplacementNode = Builder.CreateIntrinsic( - Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag}); + ReplacementNode = + Builder.CreateVectorInterleave({Node->Real, Node->Imag}); } break; } @@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0); auto *A = replaceNode(Builder, Node->Operands[0]); auto *B = replaceNode(Builder, Node->Operands[1]); - auto *NewMaskTy = VectorType::getDoubleElementsVectorType( - cast<VectorType>(MaskReal->getType())); - auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, - NewMaskTy, {MaskReal, MaskImag}); + auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag}); ReplacementNode = Builder.CreateSelect(NewMask, A, B); break; } @@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle( } if (!NewInit) - NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {Init, Constant::getNullValue(VTy)}); + NewInit = + Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); @@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation( auto *OldPHIImag = ReductionInfo[Imag].first; auto *NewPHI = OldToNewPHI[OldPHIReal]; - auto *VTy = cast<VectorType>(Real->getType()); - auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - // We have to interleave initial origin values coming from IncomingBlock Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming); Value 
*InitImag = OldPHIImag->getIncomingValueForBlock(Incoming); IRBuilder<> Builder(Incoming->getTerminator()); - auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {InitReal, InitImag}); + auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 012d873..9ba1782 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1009,7 +1009,8 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, for (unsigned I = 0; I < NumValues; ++I) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]); + MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy, + Offsets[I]); auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, MRI.getType(VRegs[I]), commonAlignment(BaseAlign, Offsets[I])); @@ -1039,7 +1040,8 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, for (unsigned I = 0; I < NumValues; ++I) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]); + MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy, + Offsets[I]); auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, MRI.getType(VRegs[I]), commonAlignment(BaseAlign, Offsets[I])); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8f513a..e84ba91 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, const TargetOptions &Options = MF->getTarget().Options; LLT DstType = MRI.getType(MI.getOperand(0).getReg()); - if (CanReassociate && - !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc))) + if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc)) return false; // Floating-point multiply-add with intermediate rounding. @@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, if (!HasFMAD && !HasFMA) return false; - AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD; + AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD; // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) return false; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index dc5dfab..fd38c30 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1409,7 +1409,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { Regs.size() == 1 ? 
LI.getMetadata(LLVMContext::MD_range) : nullptr; for (unsigned i = 0; i < Regs.size(); ++i) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8); Align BaseAlign = getMemOpAlign(LI); @@ -1448,7 +1448,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { for (unsigned i = 0; i < Vals.size(); ++i) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8); Align BaseAlign = getMemOpAlign(SI); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index ed7b07f..d9d3569 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4170,7 +4170,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); - auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst); + auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrAddReg, PtrReg, OffsetCst); auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy, SmallPtr, *SmallMMO); @@ -4277,8 +4277,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) { LLT PtrTy = MRI.getType(PtrReg); auto OffsetCst = MIRBuilder.buildConstant( LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); - auto SmallPtr = - MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); + auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst); MachineMemOperand *LargeMMO = MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); @@ -5349,7 +5348,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, unsigned ByteOffset = Offset / 8; Register NewAddrReg; - MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset); + MIRBuilder.materializeObjectPtrOffset(NewAddrReg, AddrReg, OffsetTy, + ByteOffset); MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, ByteOffset, PartTy); @@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. 
return UnableToLegalize; - if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) { + if (MI.getFlag(MachineInstr::FmAfn)) { unsigned Flags = MI.getFlags(); auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags); MIRBuilder.buildFPTrunc(Dst, Src32, Flags); @@ -9822,7 +9822,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val, if (DstOff != 0) { auto Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); - Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + Ptr = MIB.buildObjectPtrOffset(PtrTy, Dst, Offset).getReg(0); } MIB.buildStore(Value, Ptr, *StoreMMO); @@ -9962,7 +9962,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, LLT SrcTy = MRI.getType(Src); Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset) .getReg(0); - LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0); } auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); @@ -9970,7 +9970,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, Register StorePtr = Dst; if (CurrOffset != 0) { LLT DstTy = MRI.getType(Dst); - StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LdVal, StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); @@ -10060,7 +10060,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, LLT SrcTy = MRI.getType(Src); auto Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset); - LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0); } LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); CurrOffset += CopyTy.getSizeInBytes(); @@ -10078,7 +10078,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, LLT DstTy = MRI.getType(Dst); auto Offset = MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset); - StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 121d7e8..27df7e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -208,11 +208,20 @@ MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0, return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1}, Flags); } +MachineInstrBuilder MachineIRBuilder::buildObjectPtrOffset(const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { + return buildPtrAdd(Res, Op0, Op1, + MachineInstr::MIFlag::NoUWrap | + MachineInstr::MIFlag::InBounds); +} + std::optional<MachineInstrBuilder> MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0, - const LLT ValueTy, uint64_t Value) { + const LLT ValueTy, uint64_t Value, + std::optional<unsigned> Flags) { assert(Res == 0 && "Res is a result argument"); - assert(ValueTy.isScalar() && "invalid offset type"); + assert(ValueTy.isScalar() && "invalid offset type"); if (Value == 0) { Res = Op0; @@ -221,7 +230,14 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0, Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0)); auto Cst = buildConstant(ValueTy, Value); - return buildPtrAdd(Res, Op0, 
Cst.getReg(0)); + return buildPtrAdd(Res, Op0, Cst.getReg(0), Flags); +} + +std::optional<MachineInstrBuilder> MachineIRBuilder::materializeObjectPtrOffset( + Register &Res, Register Op0, const LLT ValueTy, uint64_t Value) { + return materializePtrAdd(Res, Op0, ValueTy, Value, + MachineInstr::MIFlag::NoUWrap | + MachineInstr::MIFlag::InBounds); } MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res, diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 7153902..8b72c29 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -217,6 +217,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("nneg", MIToken::kw_nneg) .Case("disjoint", MIToken::kw_disjoint) .Case("samesign", MIToken::kw_samesign) + .Case("inbounds", MIToken::kw_inbounds) .Case("nofpexcept", MIToken::kw_nofpexcept) .Case("unpredictable", MIToken::kw_unpredictable) .Case("debug-location", MIToken::kw_debug_location) @@ -616,6 +617,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { .Case("!range", MIToken::md_range) .Case("!DIExpression", MIToken::md_diexpr) .Case("!DILocation", MIToken::md_dilocation) + .Case("!noalias.addrspace", MIToken::md_noalias_addrspace) .Default(MIToken::Error); } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index d7cd067..0627f17 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -78,6 +78,7 @@ struct MIToken { kw_nneg, kw_disjoint, kw_samesign, + kw_inbounds, kw_debug_location, kw_debug_instr_number, kw_dbg_instr_ref, @@ -151,6 +152,7 @@ struct MIToken { md_tbaa, md_alias_scope, md_noalias, + md_noalias_addrspace, md_range, md_diexpr, md_dilocation, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 3a364d5..6a464d9 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1477,7 +1477,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Token.is(MIToken::kw_nneg) || Token.is(MIToken::kw_disjoint) || Token.is(MIToken::kw_nusw) || - Token.is(MIToken::kw_samesign)) { + Token.is(MIToken::kw_samesign) || + Token.is(MIToken::kw_inbounds)) { // clang-format on // Mine frame and fast math flags if (Token.is(MIToken::kw_frame_setup)) @@ -1518,6 +1519,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Flags |= MachineInstr::NoUSWrap; if (Token.is(MIToken::kw_samesign)) Flags |= MachineInstr::SameSign; + if (Token.is(MIToken::kw_inbounds)) + Flags |= MachineInstr::InBounds; lex(); } @@ -3482,6 +3485,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (parseMDNode(AAInfo.NoAlias)) return true; break; + case MIToken::md_noalias_addrspace: + lex(); + if (parseMDNode(AAInfo.NoAliasAddrSpace)) + return true; + break; case MIToken::md_range: lex(); if (parseMDNode(Range)) @@ -3490,7 +3498,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { // TODO: Report an error on duplicate metadata nodes. 
default: return error("expected 'align' or '!tbaa' or '!alias.scope' or " - "'!noalias' or '!range'"); + "'!noalias' or '!range' or '!noalias.addrspace'"); } } if (expectAndConsume(MIToken::rparen)) diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index ad7835a..ce1834a 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -820,6 +820,8 @@ static void printMI(raw_ostream &OS, MFPrintState &State, OS << "nusw "; if (MI.getFlag(MachineInstr::SameSign)) OS << "samesign "; + if (MI.getFlag(MachineInstr::InBounds)) + OS << "inbounds "; // NOTE: Please add new MIFlags also to the MI_FLAGS_STR in // llvm/utils/update_mir_test_checks.py. diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index da3665b..79047f7 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -585,6 +585,8 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { MIFlags |= MachineInstr::MIFlag::NoUSWrap; if (GEP->hasNoUnsignedWrap()) MIFlags |= MachineInstr::MIFlag::NoUWrap; + if (GEP->isInBounds()) + MIFlags |= MachineInstr::MIFlag::InBounds; } // Copy the nonneg flag. @@ -1860,8 +1862,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "nneg "; if (getFlag(MachineInstr::Disjoint)) OS << "disjoint "; + if (getFlag(MachineInstr::NoUSWrap)) + OS << "nusw "; if (getFlag(MachineInstr::SameSign)) OS << "samesign "; + if (getFlag(MachineInstr::InBounds)) + OS << "inbounds "; // Print the opcode name. if (TII) diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 0d25169..c612f8de 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << ", !noalias "; AAInfo.NoAlias->printAsOperand(OS, MST); } + if (AAInfo.NoAliasAddrSpace) { + OS << ", !noalias.addrspace "; + AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST); + } if (getRanges()) { OS << ", !range "; getRanges()->printAsOperand(OS, MST); diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 0f742c4..21bf052 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -423,7 +423,7 @@ void ModuloScheduleExpander::generateExistingPhis( // potentially define two values. unsigned MaxPhis = PrologStage + 2; if (!InKernel && (int)PrologStage <= LoopValStage) - MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1); + MaxPhis = std::max((int)MaxPhis - LoopValStage, 1); unsigned NumPhis = std::min(NumStages, MaxPhis); Register NewReg; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 2d7987a..7ede564 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -306,7 +306,12 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { /// number if it is not zero. If DstReg is a physical register and the /// existing subregister number of the def / use being updated is not zero, /// make sure to set it to the correct physical subregister. - void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx); + /// + /// If \p SubregToRegSrcInst is not empty, we are coalescing a + /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an + /// implicit-def of DstReg on instructions that define SrcReg. 
+ void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx, + ArrayRef<MachineInstr *> SubregToRegSrcInst = {}); /// If the given machine operand reads only undefined lanes add an undef /// flag. @@ -1443,6 +1448,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // CopyMI may have implicit operands, save them so that we can transfer them // over to the newly materialized instruction after CopyMI is removed. + LaneBitmask NewMIImplicitOpsMask; SmallVector<MachineOperand, 4> ImplicitOps; ImplicitOps.reserve(CopyMI->getNumOperands() - CopyMI->getDesc().getNumOperands()); @@ -1457,6 +1463,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, (MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) && "unexpected implicit virtual register def"); ImplicitOps.push_back(MO); + if (MO.isDef() && MO.getReg().isVirtual() && + MRI->shouldTrackSubRegLiveness(DstReg)) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } @@ -1499,14 +1508,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } else { assert(MO.getReg() == NewMI.getOperand(0).getReg()); - // We're only expecting another def of the main output, so the range - // should get updated with the regular output range. - // - // FIXME: The range updating below probably needs updating to look at - // the super register if subranges are tracked. - assert(!MRI->shouldTrackSubRegLiveness(DstReg) && - "subrange update for implicit-def of super register may not be " - "properly handled"); + // If lanemasks need to be tracked, compile the lanemask of the NewMI + // implicit def operands to avoid subranges for the super-regs from + // being removed by code later on in this function. + if (MRI->shouldTrackSubRegLiveness(MO.getReg())) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } } @@ -1606,7 +1612,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); for (LiveInterval::SubRange &SR : DstInt.subranges()) { - if ((SR.LaneMask & DstMask).none()) { + if ((SR.LaneMask & DstMask).none() && + (SR.LaneMask & NewMIImplicitOpsMask).none()) { LLVM_DEBUG(dbgs() << "Removing undefined SubRange " << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n"); @@ -1870,11 +1877,14 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, } } -void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, - unsigned SubIdx) { +void RegisterCoalescer::updateRegDefsUses( + Register SrcReg, Register DstReg, unsigned SubIdx, + ArrayRef<MachineInstr *> SubregToRegSrcInsts) { bool DstIsPhys = DstReg.isPhysical(); LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg); + // Coalescing a COPY may expose reads of 'undef' subregisters. + // If so, then explicitly propagate 'undef' to those operands. if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) { for (MachineOperand &MO : MRI->reg_operands(DstReg)) { if (MO.isUndef()) @@ -1891,6 +1901,15 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, } } + // If DstInt already has a subrange for the unused lanes, then we shouldn't + // create duplicate subranges when we update the interval for unused lanes. 
+ LaneBitmask DstIntLaneMask; + if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) { + for (LiveInterval::SubRange &SR : DstInt->subranges()) + DstIntLaneMask |= SR.LaneMask; + } + + // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'. SmallPtrSet<MachineInstr *, 8> Visited; for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end(); @@ -1914,6 +1933,80 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); + bool RequiresImplicitRedef = false; + if (!SubregToRegSrcInsts.empty()) { + // We can only add an implicit-def and undef if the sub registers match, + // e.g. + // %0:gr32 = INSTX + // %0.sub8:gr32 = INSTY // top 24 bits of %0 still defined + // %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32 + // + // This cannot be transformed into: + // %1.sub32:gr64 = INSTX + // undef %1.sub8:gr64 = INSTY , implicit-def %1 + // + // Because that would thrash the top 24 bits of %1.sub32. + if (is_contained(SubregToRegSrcInsts, UseMI) && + all_of(UseMI->defs(), + [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool { + if (MO.getReg() != SrcReg || !MO.getSubReg() || MO.isUndef()) + return true; + return SubIdx == MO.getSubReg(); + })) { + // Add implicit-def of super-register to express that the whole + // register is defined by the instruction. + MachineInstrBuilder MIB(*MF, UseMI); + MIB.addReg(DstReg, RegState::ImplicitDefine); + RequiresImplicitRedef = true; + } + + // If the coalesed instruction doesn't fully define the register, we need + // to preserve the original super register liveness for SUBREG_TO_REG. + // + // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes, + // but it introduces liveness for other subregisters. Downstream users may + // have been relying on those bits, so we need to ensure their liveness is + // captured with a def of other lanes. + if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) { + // First check if there is sufficient granularity in terms of subranges. + LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg()); + LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask UnusedLanes = DstMask & ~UsedLanes; + if ((UnusedLanes & ~DstIntLaneMask).any()) { + BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); + DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt); + DstIntLaneMask |= UnusedLanes; + } + + // After duplicating the live ranges for the low/hi bits, we + // need to update the subranges of the DstReg interval such that + // for a case like this: + // + // entry: + // 16B %1:gpr32 = INSTRUCTION (<=> UseMI) + // : + // if.then: + // 32B %1:gpr32 = MOVIMM32 .. + // 48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32 + // + // Only the MOVIMM32 require a def of the top lanes and any intervals + // for the top 32-bits of the def at 16B should be removed. + for (LiveInterval::SubRange &SR : DstInt->subranges()) { + if (!Writes || RequiresImplicitRedef || + (SR.LaneMask & UnusedLanes).none()) + continue; + + assert((SR.LaneMask & UnusedLanes) == SR.LaneMask && + "Unexpected lanemask. Subrange needs finer granularity"); + + SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(false); + auto SegmentI = SR.find(UseIdx); + if (SegmentI != SR.end()) + SR.removeSegment(SegmentI, true); + } + } + } + // Replace SrcReg with DstReg in all UseMI operands. 
for (unsigned Op : Ops) { MachineOperand &MO = UseMI->getOperand(Op); @@ -1922,7 +2015,7 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, // turn a full def into a read-modify-write sub-register def and vice // versa. if (SubIdx && MO.isDef()) - MO.setIsUndef(!Reads); + MO.setIsUndef(!Reads || RequiresImplicitRedef); // A subreg use of a partially undef (super) register may be a complete // undef use now and then has to be marked that way. @@ -2025,6 +2118,30 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } +/// For a given use of value \p Idx, it returns the def in the current block, +/// or otherwise all possible defs in preceding blocks. +static bool FindDefInBlock(SmallPtrSetImpl<MachineBasicBlock *> &VisitedBlocks, + SmallVector<MachineInstr *> &Instrs, + LiveIntervals *LIS, LiveInterval &SrcInt, + MachineBasicBlock *MBB, VNInfo *Idx) { + if (!Idx->isPHIDef()) { + MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def); + assert(Def && "Unable to find a def for SUBREG_TO_REG source operand"); + Instrs.push_back(Def); + return true; + } + + bool Any = false; + if (VisitedBlocks.count(MBB)) + return false; + VisitedBlocks.insert(MBB); + for (MachineBasicBlock *Pred : MBB->predecessors()) { + Any |= FindDefInBlock(VisitedBlocks, Instrs, LIS, SrcInt, Pred, + SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(Pred))); + } + return Any; +} + bool RegisterCoalescer::joinCopy( MachineInstr *CopyMI, bool &Again, SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) { @@ -2156,6 +2273,35 @@ bool RegisterCoalescer::joinCopy( }); } + SmallVector<MachineInstr *> SubregToRegSrcInsts; + if (CopyMI->isSubregToReg()) { + // For the case where the copy instruction is a SUBREG_TO_REG, e.g. + // + // %0:gpr32 = movimm32 .. + // %1:gpr64 = SUBREG_TO_REG 0, %0, sub32 + // ... + // %0:gpr32 = COPY <something> + // + // After joining liveranges, the original `movimm32` will need an + // implicit-def to make it explicit that the entire register is written, + // i.e. + // + // undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0 + // ... + // undef %0.sub32:gpr64 = COPY <something> // Note that this does not + // // require an implicit-def, + // // because it has nothing to + // // do with the SUBREG_TO_REG. + LiveInterval &SrcInt = + LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI); + SmallPtrSet<MachineBasicBlock *, 8> VisitedBlocks; + if (!FindDefInBlock(VisitedBlocks, SubregToRegSrcInsts, LIS, SrcInt, + CopyMI->getParent(), + SrcInt.Query(SubregToRegSlotIdx).valueIn())) + llvm_unreachable("SUBREG_TO_REG src requires a def"); + } + ShrinkMask = LaneBitmask::getNone(); ShrinkMainRange = false; @@ -2225,9 +2371,12 @@ bool RegisterCoalescer::joinCopy( // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. - if (CP.getDstIdx()) + if (CP.getDstIdx()) { + assert(SubregToRegSrcInsts.empty() && "can this happen?"); updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); - updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); + } + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(), + SubregToRegSrcInsts); // Shrink subregister ranges if necessary. 
if (ShrinkMask.any()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d3df434..a43020e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SDPatternMatch.h" @@ -15262,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { } } - // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller - // than X, and the And doesn't change the lower iX bits, we can move the - // AssertZext in front of the And and drop the AssertSext. if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND && - N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext && isa<ConstantSDNode>(N0.getOperand(1))) { - SDValue BigA = N0.getOperand(0); - EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); const APInt &Mask = N0.getConstantOperandAPInt(1); - if (AssertVT.bitsLT(BigA_AssertVT) && - Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { - SDLoc DL(N); - SDValue NewAssert = - DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); - return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, - N0.getOperand(1)); + + // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller + // than X, and the And doesn't change the lower iX bits, we can move the + // AssertZext in front of the And and drop the AssertSext. + if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) { + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); + if (AssertVT.bitsLT(BigA_AssertVT) && + Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + SDLoc DL(N); + SDValue NewAssert = + DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); + return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + N0.getOperand(1)); + } } + + // Remove AssertZext entirely if the mask guarantees the assertion cannot + // fail. + // TODO: Use KB countMinLeadingZeros to handle non-constant masks? + if (Mask.isIntN(AssertVT.getScalarSizeInBits())) + return N0; } return SDValue(); @@ -22778,8 +22787,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG); // If we store purely within object bounds just before its lifetime ends, // we can remove the store. - if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase, - StoreSize.getFixedValue() * 8)) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + if (LifetimeEndBase.contains( + DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8, + StoreBase, StoreSize.getFixedValue() * 8)) { LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump(); dbgs() << "\nwithin LIFETIME_END of : "; LifetimeEndBase.dump(); dbgs() << "\n"); @@ -28971,13 +28982,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, return SDValue(); } +static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, + const TargetLowering &TLI) { + // Match a pattern such as: + // (X | (X >> C0) | (X >> C1) | ...) & Mask + // This extracts contiguous parts of X and ORs them together before comparing. 
+ // We can optimize this so that we directly check (X & SomeMask) instead, + // eliminating the shifts. + + EVT VT = Root.getValueType(); + + // TODO: Support vectors? + if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N0 = Root.getOperand(0); + SDValue N1 = Root.getOperand(1); + + if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1)) + return SDValue(); + + APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal(); + + SDValue Src; + const auto IsSrc = [&](SDValue V) { + if (!Src) { + Src = V; + return true; + } + + return Src == V; + }; + + SmallVector<SDValue> Worklist = {N0}; + APInt PartsMask(VT.getSizeInBits(), 0); + while (!Worklist.empty()) { + SDValue V = Worklist.pop_back_val(); + if (!V.hasOneUse() && (Src && Src != V)) + return SDValue(); + + if (V.getOpcode() == ISD::OR) { + Worklist.push_back(V.getOperand(0)); + Worklist.push_back(V.getOperand(1)); + continue; + } + + if (V.getOpcode() == ISD::SRL) { + SDValue ShiftSrc = V.getOperand(0); + SDValue ShiftAmt = V.getOperand(1); + + if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt)) + return SDValue(); + + auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal(); + if (ShiftAmtVal > RootMask.getBitWidth()) + return SDValue(); + + PartsMask |= (RootMask << ShiftAmtVal); + continue; + } + + if (IsSrc(V)) { + PartsMask |= RootMask; + continue; + } + + return SDValue(); + } + + if (!Src) + return SDValue(); + + SDLoc DL(Root); + return DAG.getNode(ISD::AND, DL, VT, + {Src, DAG.getConstant(PartsMask, DL, VT)}); +} + /// This is a stub for TargetLowering::SimplifySetCC. SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); - return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); + if (SDValue C = + TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL)) + return C; + + if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND && + isNullConstant(N1)) { + + if (SDValue Res = matchMergedBFX(N0, DAG, TLI)) + return DAG.getSetCC(DL, VT, Res, N1, Cond); + } + + return SDValue(); } /// Given an ISD::SDIV node expressing a divide by constant, return @@ -29415,7 +29513,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { MachineMemOperand *MMO; }; - auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics { + auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics { if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) { int64_t Offset = 0; if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset())) @@ -29428,13 +29526,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { LSN->getBasePtr(), Offset /*base offset*/, LocationSize::precise(Size), LSN->getMemOperand()}; } - if (const auto *LN = cast<LifetimeSDNode>(N)) + if (const auto *LN = cast<LifetimeSDNode>(N)) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), 0, - LocationSize::precise(LN->getSize()), + LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())), (MachineMemOperand *)nullptr}; + } // Default. 
return {false /*isvolatile*/, /*isAtomic*/ false, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 74172b2..ba0ab23 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; case ISD::FP_TO_FP16: LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); - if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { + if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); if ((SVT == MVT::f64 || SVT == MVT::f80) && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 773ff48..02d1100 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { case ISD::TargetFrameIndex: ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex()); break; - case ISD::LIFETIME_START: - case ISD::LIFETIME_END: - ID.AddInteger(cast<LifetimeSDNode>(N)->getSize()); - break; case ISD::PSEUDO_PROBE: ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid()); ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex()); @@ -7847,20 +7843,43 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } } - // Perform trivial constant folding. - if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) - return SV; + if (N1.getOpcode() == ISD::POISON || N2.getOpcode() == ISD::POISON) { + switch (Opcode) { + case ISD::XOR: + case ISD::ADD: + case ISD::PTRADD: + case ISD::SUB: + case ISD::SIGN_EXTEND_INREG: + case ISD::UDIV: + case ISD::SDIV: + case ISD::UREM: + case ISD::SREM: + case ISD::MUL: + case ISD::AND: + case ISD::SSUBSAT: + case ISD::USUBSAT: + case ISD::UMIN: + case ISD::OR: + case ISD::SADDSAT: + case ISD::UADDSAT: + case ISD::UMAX: + case ISD::SMAX: + case ISD::SMIN: + // fold op(arg1, poison) -> poison, fold op(poison, arg2) -> poison. + return N2.getOpcode() == ISD::POISON ? N2 : N1; + } + } // Canonicalize an UNDEF to the RHS, even over a constant. - if (N1.isUndef()) { + if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() != ISD::UNDEF) { if (TLI->isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { case ISD::PTRADD: case ISD::SUB: - // fold op(undef, arg2) -> undef, fold op(poison, arg2) ->poison. - return N1.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT); + // fold op(undef, non_undef_arg2) -> undef. + return N1; case ISD::SIGN_EXTEND_INREG: case ISD::UDIV: case ISD::SDIV: @@ -7868,18 +7887,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::SREM: case ISD::SSUBSAT: case ISD::USUBSAT: - // fold op(undef, arg2) -> 0, fold op(poison, arg2) -> poison. - return N1.getOpcode() == ISD::POISON ? getPOISON(VT) - : getConstant(0, DL, VT); + // fold op(undef, non_undef_arg2) -> 0. + return getConstant(0, DL, VT); } } } // Fold a bunch of operators when the RHS is undef. - if (N2.isUndef()) { + if (N2.getOpcode() == ISD::UNDEF) { switch (Opcode) { case ISD::XOR: - if (N1.isUndef()) + if (N1.getOpcode() == ISD::UNDEF) // Handle undef ^ undef -> 0 special case. This is a common // idiom (misuse). 
return getConstant(0, DL, VT); @@ -7887,29 +7905,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::ADD: case ISD::PTRADD: case ISD::SUB: + // fold op(arg1, undef) -> undef. + return N2; case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: - // fold op(arg1, undef) -> undef, fold op(arg1, poison) -> poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT); + // fold op(arg1, undef) -> poison. + return getPOISON(VT); case ISD::MUL: case ISD::AND: case ISD::SSUBSAT: case ISD::USUBSAT: - // fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) - : getConstant(0, DL, VT); + case ISD::UMIN: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> 0. + return N1.getOpcode() == ISD::UNDEF ? N2 : getConstant(0, DL, VT); case ISD::OR: case ISD::SADDSAT: case ISD::UADDSAT: - // fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) -> - // poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) - : getAllOnesConstant(DL, VT); + case ISD::UMAX: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> -1. + return N1.getOpcode() == ISD::UNDEF ? N2 : getAllOnesConstant(DL, VT); + case ISD::SMAX: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MAX_INT. + return N1.getOpcode() == ISD::UNDEF + ? N2 + : getConstant( + APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL, + VT); + case ISD::SMIN: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MIN_INT. + return N1.getOpcode() == ISD::UNDEF + ? N2 + : getConstant( + APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL, + VT); } } + // Perform trivial constant folding. + if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) + return SV; + // Memoize this node if possible. SDNode *N; SDVTList VTs = getVTList(VT); @@ -9360,8 +9397,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, } SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, - SDValue Chain, int FrameIndex, - int64_t Size) { + SDValue Chain, int FrameIndex) { const unsigned Opcode = IsStart ? 
ISD::LIFETIME_START : ISD::LIFETIME_END; const auto VTs = getVTList(MVT::Other); SDValue Ops[2] = { @@ -9373,13 +9409,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); ID.AddInteger(FrameIndex); - ID.AddInteger(Size); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); - LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), - dl.getDebugLoc(), VTs, Size); + LifetimeSDNode *N = + newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1636465..306e068 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); SDLoc dl = getCurSDLoc(); + SDNodeFlags Flags; + if (auto *TruncInst = dyn_cast<FPMathOperator>(&I)) + Flags.copyFMF(*TruncInst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N, DAG.getTargetConstant( - 0, dl, TLI.getPointerTy(DAG.getDataLayout())))); + 0, dl, TLI.getPointerTy(DAG.getDataLayout())), + Flags)); } void SelectionDAGBuilder::visitFPExt(const User &I) { @@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (TM.getOptLevel() == CodeGenOptLevel::None) return; - const int64_t ObjectSize = - cast<ConstantInt>(I.getArgOperand(0))->getSExtValue(); const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1)); // First check that the Alloca is static, otherwise it won't have a @@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; const int FrameIndex = SI->second; - Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize); + Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex); DAG.setRoot(Res); return; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 9474587..900da76 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { << " -> " << ASC->getDestAddressSpace() << ']'; - } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) { - OS << "<0 to " << LN->getSize() << ">"; } else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) { OS << '<' << AA->getAlign().value() << '>'; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1764910..48d6b99 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9471,7 +9471,7 @@ SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, DAG.getNode(ISD::AND, DL, VT, Op, Neg), DAG.getConstant(DeBruijn, DL, VT)), - DAG.getConstant(ShiftAmt, DL, VT)); + DAG.getShiftAmountConstant(ShiftAmt, VT, DL)); Lookup = DAG.getSExtOrTrunc(Lookup, DL, getPointerTy(TD)); SmallVector<uint8_t> 
Table(BitWidth, 0); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 68b8a00..3c91b0e 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2062,7 +2062,7 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const { // FreeBSD has "__stack_chk_guard" defined externally on libc.so if (M.getDirectAccessExternalData() && - !TM.getTargetTriple().isWindowsGNUEnvironment() && + !TM.getTargetTriple().isOSCygMing() && !(TM.getTargetTriple().isPPC64() && TM.getTargetTriple().isOSFreeBSD()) && (!TM.getTargetTriple().isOSDarwin() || diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 725e951..e9172f4 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1060,27 +1060,27 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant( auto &Context = getContext(); if (Kind.isMergeableConst4() && MergeableConst4Section) - return Context.getELFSection(".rodata.cst4." + SectionSuffix, + return Context.getELFSection(".rodata.cst4." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 4); if (Kind.isMergeableConst8() && MergeableConst8Section) - return Context.getELFSection(".rodata.cst8." + SectionSuffix, + return Context.getELFSection(".rodata.cst8." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 8); if (Kind.isMergeableConst16() && MergeableConst16Section) - return Context.getELFSection(".rodata.cst16." + SectionSuffix, + return Context.getELFSection(".rodata.cst16." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 16); if (Kind.isMergeableConst32() && MergeableConst32Section) - return Context.getELFSection(".rodata.cst32." + SectionSuffix, + return Context.getELFSection(".rodata.cst32." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 32); if (Kind.isReadOnly()) - return Context.getELFSection(".rodata." + SectionSuffix, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC); + return Context.getELFSection(".rodata." + SectionSuffix + ".", + ELF::SHT_PROGBITS, ELF::SHF_ALLOC); assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); - return Context.getELFSection(".data.rel.ro." + SectionSuffix, + return Context.getELFSection(".data.rel.ro." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); } diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp index 6267207..fd54190 100644 --- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp +++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp @@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable( AddrOfOldGV, Twine("__ref_").concat(GV->getName()), nullptr, GlobalVariable::NotThreadLocal); + // RefGV is created with isConstant = false, but we want to place RefGV into + // .rdata, not .data. It is important that the GlobalVariable be mutable + // from the compiler's point of view, so that the optimizer does not remove + // the global variable entirely and replace all references to it with its + // initial value. + // + // When the Windows hot-patch loader applies a hot-patch, it maps the + // pages of .rdata as read/write so that it can set each __ref_* variable + // to point to the original variable in the base image. Afterward, pages in + // .rdata are remapped as read-only. 
This protects the __ref_* variables from + // being overwritten during execution. + RefGV->setSection(".rdata"); + // Create debug info for the replacement global variable. DataLayout Layout = M->getDataLayout(); DIType *DebugType = DebugInfo.createPointerType( diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h index bd0d72f..0e95369 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h @@ -157,8 +157,7 @@ private: processSubtractRelocation(unsigned SectionID, relocation_iterator RelI, const MachOObjectFile &BaseObj, ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile &Obj = - static_cast<const MachOObjectFile&>(BaseObj); + const MachOObjectFile &Obj = BaseObj; MachO::any_relocation_info RE = Obj.getRelocation(RelI->getRawDataRefImpl()); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7928772..3aa4f7a 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1161,7 +1161,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel( Builder.restoreIP(AllocaIP); auto *KernelArgsPtr = Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args"); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) { llvm::Value *Arg = @@ -1189,7 +1189,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch( if (!updateToLocation(Loc)) return Loc.IP; - Builder.restoreIP(Loc.IP); // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. The host // pointer is used by the runtime library to identify the current target @@ -5955,7 +5954,7 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, Builder.restoreIP(AllocaIP); AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name); ArgsBase->setAlignment(Align(8)); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); // Store the index value with offset in depend vector. for (unsigned I = 0; I < NumLoops; ++I) { @@ -8081,7 +8080,7 @@ void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc, ".offload_ptrs"); AllocaInst *ArgSizes = Builder.CreateAlloca( ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes"); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); MapperAllocas.ArgsBase = ArgsBase; MapperAllocas.Args = Args; MapperAllocas.ArgSizes = ArgSizes; diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 28037d7..49c6dc7 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V, return CreateShuffleVector(V, Zeros, Name + ".splat"); } -Value *IRBuilderBase::CreatePreserveArrayAccessIndex( - Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex, - MDNode *DbgInfo) { +Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops, + const Twine &Name) { + assert(Ops.size() >= 2 && Ops.size() <= 8 && + "Unexpected number of operands to interleave"); + + // Make sure all operands are the same type. 
+ assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type"); + +#ifndef NDEBUG + for (unsigned I = 1; I < Ops.size(); I++) { + assert(Ops[I]->getType() == Ops[0]->getType() && + "Vector interleave expects matching operand types!"); + } +#endif + + unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size()); + auto *SubvecTy = cast<VectorType>(Ops[0]->getType()); + Type *DestTy = VectorType::get(SubvecTy->getElementType(), + SubvecTy->getElementCount() * Ops.size()); + return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name); +} + +Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base, + unsigned Dimension, + unsigned LastIndex, + MDNode *DbgInfo) { auto *BaseType = Base->getType(); assert(isa<PointerType>(BaseType) && "Invalid Base ptr type for preserve.array.access.index."); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 6c35ade..58a1f74 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) { "Shouldn't change the signature"); return NewDecl; } + +struct InterleaveIntrinsic { + Intrinsic::ID Interleave, Deinterleave; +}; + +static InterleaveIntrinsic InterleaveIntrinsics[] = { + {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2}, + {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3}, + {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4}, + {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5}, + {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6}, + {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7}, + {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8}, +}; + +Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + return InterleaveIntrinsics[Factor - 2].Interleave; +} + +Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + return InterleaveIntrinsics[Factor - 2].Deinterleave; +} diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 0dbd07f..1157cbe 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const { Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct); Result.Scope = Info.lookup(LLVMContext::MD_alias_scope); Result.NoAlias = Info.lookup(LLVMContext::MD_noalias); + Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace); } return Result; } @@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) { setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct); setMetadata(LLVMContext::MD_alias_scope, N.Scope); setMetadata(LLVMContext::MD_noalias, N.NoAlias); + setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace); } void Instruction::setNoSanitizeMetadata() { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 73e79c0..7f07568 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/LTO/LTO.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StableHashing.h" @@ -742,18 +743,18 @@ Error LTO::add(std::unique_ptr<InputFile> Input, Conf.VisibilityScheme = Config::ELF; } - const SymbolResolution *ResI = Res.begin(); - for (unsigned I = 0; I 
!= Input->Mods.size(); ++I) - if (Error Err = addModule(*Input, I, ResI, Res.end())) + for (unsigned I = 0; I != Input->Mods.size(); ++I) { + if (auto Err = addModule(*Input, I, Res).moveInto(Res)) return Err; + } - assert(ResI == Res.end()); + assert(Res.empty()); return Error::success(); } -Error LTO::addModule(InputFile &Input, unsigned ModI, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { +Expected<ArrayRef<SymbolResolution>> +LTO::addModule(InputFile &Input, unsigned ModI, + ArrayRef<SymbolResolution> Res) { Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo(); if (!LTOInfo) return LTOInfo.takeError(); @@ -782,28 +783,32 @@ Error LTO::addModule(InputFile &Input, unsigned ModI, bool IsThinLTO = LTOInfo->IsThinLTO && (LTOMode != LTOK_UnifiedRegular); auto ModSyms = Input.module_symbols(ModI); - addModuleToGlobalRes(ModSyms, {ResI, ResE}, + addModuleToGlobalRes(ModSyms, Res, IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0, LTOInfo->HasSummary); if (IsThinLTO) - return addThinLTO(BM, ModSyms, ResI, ResE); + return addThinLTO(BM, ModSyms, Res); RegularLTO.EmptyCombinedModule = false; - Expected<RegularLTOState::AddedModule> ModOrErr = - addRegularLTO(BM, ModSyms, ResI, ResE); + auto ModOrErr = addRegularLTO(BM, ModSyms, Res); if (!ModOrErr) return ModOrErr.takeError(); + Res = ModOrErr->second; - if (!LTOInfo->HasSummary) - return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false); + if (!LTOInfo->HasSummary) { + if (Error Err = linkRegularLTO(std::move(ModOrErr->first), + /*LivenessFromIndex=*/false)) + return Err; + return Res; + } // Regular LTO module summaries are added to a dummy module that represents // the combined regular LTO module. if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, "")) return Err; - RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr)); - return Error::success(); + RegularLTO.ModsWithSummaries.push_back(std::move(ModOrErr->first)); + return Res; } // Checks whether the given global value is in a non-prevailing comdat @@ -839,10 +844,10 @@ handleNonPrevailingComdat(GlobalValue &GV, // Add a regular LTO object to the link. // The resulting module needs to be linked into the combined LTO module with // linkRegularLTO. -Expected<LTO::RegularLTOState::AddedModule> +Expected< + std::pair<LTO::RegularLTOState::AddedModule, ArrayRef<SymbolResolution>>> LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { + ArrayRef<SymbolResolution> Res) { RegularLTOState::AddedModule Mod; Expected<std::unique_ptr<Module>> MOrErr = BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true, @@ -899,22 +904,22 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, std::set<const Comdat *> NonPrevailingComdats; SmallSet<StringRef, 2> NonPrevailingAsmSymbols; for (const InputFile::Symbol &Sym : Syms) { - assert(ResI != ResE); - SymbolResolution Res = *ResI++; + assert(!Res.empty()); + const SymbolResolution &R = Res.consume_front(); assert(MsymI != MsymE); ModuleSymbolTable::Symbol Msym = *MsymI++; Skip(); if (GlobalValue *GV = dyn_cast_if_present<GlobalValue *>(Msym)) { - if (Res.Prevailing) { + if (R.Prevailing) { if (Sym.isUndefined()) continue; Mod.Keep.push_back(GV); // For symbols re-defined with linker -wrap and -defsym options, // set the linkage to weak to inhibit IPO. The linkage will be // restored by the linker. 
- if (Res.LinkerRedefined) + if (R.LinkerRedefined) GV->setLinkage(GlobalValue::WeakAnyLinkage); GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage(); @@ -938,7 +943,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } // Set the 'local' flag based on the linker resolution for this symbol. - if (Res.FinalDefinitionInLinkageUnit) { + if (R.FinalDefinitionInLinkageUnit) { GV->setDSOLocal(true); if (GV->hasDLLImportStorageClass()) GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes:: @@ -947,7 +952,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } else if (auto *AS = dyn_cast_if_present<ModuleSymbolTable::AsmSymbol *>(Msym)) { // Collect non-prevailing symbols. - if (!Res.Prevailing) + if (!R.Prevailing) NonPrevailingAsmSymbols.insert(AS->first); } else { llvm_unreachable("unknown symbol type"); @@ -965,7 +970,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, CommonRes.Alignment = std::max(Align(SymAlignValue), CommonRes.Alignment); } - CommonRes.Prevailing |= Res.Prevailing; + CommonRes.Prevailing |= R.Prevailing; } } @@ -991,7 +996,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } assert(MsymI == MsymE); - return std::move(Mod); + return std::make_pair(std::move(Mod), Res); } Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, @@ -1032,19 +1037,19 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, } // Add a ThinLTO module to the link. -Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { - const SymbolResolution *ResITmp = ResI; +Expected<ArrayRef<SymbolResolution>> +LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, + ArrayRef<SymbolResolution> Res) { + ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { - assert(ResITmp != ResE); - SymbolResolution Res = *ResITmp++; + assert(!ResTmp.empty()); + const SymbolResolution &R = ResTmp.consume_front(); if (!Sym.getIRName().empty()) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) + if (R.Prevailing) ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); } } @@ -1059,14 +1064,14 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); for (const InputFile::Symbol &Sym : Syms) { - assert(ResI != ResE); - SymbolResolution Res = *ResI++; + assert(!Res.empty()); + const SymbolResolution &R = Res.consume_front(); if (!Sym.getIRName().empty()) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) { + if (R.Prevailing) { assert(ThinLTO.PrevailingModuleForGUID[GUID] == BM.getModuleIdentifier()); @@ -1074,7 +1079,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. 
- if (Res.LinkerRedefined) + if (R.LinkerRedefined) if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); @@ -1082,7 +1087,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. - if (Res.FinalDefinitionInLinkageUnit) { + if (R.FinalDefinitionInLinkageUnit) { if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( GUID, BM.getModuleIdentifier())) { S->setDSOLocal(true); @@ -1110,7 +1115,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } } - return Error::success(); + return Res; } unsigned LTO::getMaxTasks() const { diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 1074669..a214513 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -484,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() { // For each entry, reserve space for 2 32-bit indices and a 64-bit count. size_t SectionBytes = W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t)); - (*CGProfileSection->begin()).appendContents(SectionBytes, 0); + (*CGProfileSection->begin()) + .setVarContents(std::vector<char>(SectionBytes, 0)); } MCStreamer *llvm::createMachOStreamer(MCContext &Context, @@ -520,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() { // (instead of emitting a zero-sized section) so these relocations are // technically valid, even though we don't expect these relocations to // actually be applied by the linker. - Frag->appendContents(8, 0); + constexpr char zero[8] = {}; + Frag->setVarContents(zero); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 9c7b05b..e82393a 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -46,23 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() { return nullptr; } +constexpr size_t FragBlockSize = 16384; +// Ensure the new fragment can at least store a few bytes. 
+constexpr size_t NewFragHeadroom = 8; + +static_assert(NewFragHeadroom >= alignof(MCFragment)); +static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom); + +MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) { + auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom); + FragSpace = Size - sizeof(MCFragment); + auto Chunk = std::unique_ptr<char[]>(new char[Size]); + auto *F = reinterpret_cast<MCFragment *>(Chunk.get()); + FragStorage.push_back(std::move(Chunk)); + return F; +} + void MCObjectStreamer::newFragment() { - addFragment(getContext().allocFragment<MCFragment>()); + MCFragment *F; + if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) { + auto End = reinterpret_cast<size_t>(getCurFragEnd()); + F = reinterpret_cast<MCFragment *>( + alignToPowerOf2(End, alignof(MCFragment))); + FragSpace -= size_t(F) - End + sizeof(MCFragment); + } else { + F = allocFragSpace(0); + } + new (F) MCFragment(); + addFragment(F); +} + +void MCObjectStreamer::ensureHeadroom(size_t Headroom) { + if (Headroom <= FragSpace) + return; + auto *F = allocFragSpace(Headroom); + new (F) MCFragment(); + addFragment(F); } -void MCObjectStreamer::insert(MCFragment *F) { - assert(F->getKind() != MCFragment::FT_Data && +void MCObjectStreamer::insert(MCFragment *Frag) { + assert(Frag->getKind() != MCFragment::FT_Data && "F should have a variable-size tail"); + // Frag is not connected to FragSpace. Before modifying CurFrag with + // addFragment(Frag), allocate an empty fragment to maintain FragSpace + // connectivity, potentially reusing CurFrag's associated space. + MCFragment *F; + if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) { + auto End = reinterpret_cast<size_t>(getCurFragEnd()); + F = reinterpret_cast<MCFragment *>( + alignToPowerOf2(End, alignof(MCFragment))); + FragSpace -= size_t(F) - End + sizeof(MCFragment); + } else { + F = allocFragSpace(0); + } + new (F) MCFragment(); + + addFragment(Frag); addFragment(F); - newFragment(); +} + +void MCObjectStreamer::appendContents(ArrayRef<char> Contents) { + ensureHeadroom(Contents.size()); + assert(FragSpace >= Contents.size()); + llvm::copy(Contents, getCurFragEnd()); + CurFrag->FixedSize += Contents.size(); + FragSpace -= Contents.size(); } void MCObjectStreamer::appendContents(size_t Num, char Elt) { - CurFrag->appendContents(Num, Elt); + ensureHeadroom(Num); + MutableArrayRef<char> Data(getCurFragEnd(), Num); + llvm::fill(Data, Elt); + CurFrag->FixedSize += Num; + FragSpace -= Num; } void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) { - CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind)); + CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind)); } // As a compile-time optimization, avoid allocating and evaluating an MCExpr @@ -111,6 +171,8 @@ void MCObjectStreamer::reset() { } EmitEHFrame = true; EmitDebugFrame = false; + FragStorage.clear(); + FragSpace = 0; MCStreamer::reset(); } @@ -139,7 +201,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) { void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) { MCStreamer::emitValueImpl(Value, Size, Loc); - MCFragment *DF = getCurrentFragment(); MCDwarfLineEntry::make(this, getCurrentSectionOnly()); @@ -154,9 +215,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, emitIntValue(AbsValue, Size); return; } - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - MCFixup::getDataKindForSize(Size))); - 
DF->appendContents(Size, 0); + ensureHeadroom(Size); + addFixup(Value, MCFixup::getDataKindForSize(Size)); + appendContents(Size, 0); } MCSymbol *MCObjectStreamer::emitCFILabel() { @@ -190,7 +251,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // section. MCFragment *F = CurFrag; Symbol->setFragment(F); - Symbol->setOffset(F->getContents().size()); + Symbol->setOffset(F->getFixedSize()); emitPendingAssignments(Symbol); } @@ -256,6 +317,21 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { F0 = CurFrag; } + // To maintain connectivity between CurFrag and FragSpace when CurFrag is + // modified, allocate an empty fragment and append it to the fragment list. + // (Subsections[I].second.Tail is not connected to FragSpace.) + MCFragment *F; + if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) { + auto End = reinterpret_cast<size_t>(getCurFragEnd()); + F = reinterpret_cast<MCFragment *>( + alignToPowerOf2(End, alignof(MCFragment))); + FragSpace -= size_t(F) - End + sizeof(MCFragment); + } else { + F = allocFragSpace(0); + } + new (F) MCFragment(); + F->setParent(Section); + auto &Subsections = Section->Subsections; size_t I = 0, E = Subsections.size(); while (I != E && Subsections[I].first < Subsection) @@ -263,13 +339,16 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { // If the subsection number is not in the sorted Subsections list, create a // new fragment list. if (I == E || Subsections[I].first != Subsection) { - auto *F = getContext().allocFragment<MCFragment>(); - F->setParent(Section); Subsections.insert(Subsections.begin() + I, {Subsection, MCSection::FragList{F, F}}); + Section->CurFragList = &Subsections[I].second; + CurFrag = F; + } else { + Section->CurFragList = &Subsections[I].second; + CurFrag = Subsections[I].second.Tail; + // Ensure CurFrag is associated with FragSpace. + addFragment(F); } - Section->CurFragList = &Subsections[I].second; - CurFrag = Section->CurFragList->Tail; // Define the section symbol at subsection 0's initial fragment if required. if (!NewSec) @@ -340,11 +419,15 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, MCFragment *F = getCurrentFragment(); // Append the instruction to the data fragment. - size_t CodeOffset = F->getContents().size(); + size_t CodeOffset = getCurFragSize(); + SmallString<16> Content; SmallVector<MCFixup, 1> Fixups; - getAssembler().getEmitter().encodeInstruction( - Inst, F->getContentsForAppending(), Fixups, STI); - F->doneAppending(); + getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI); + appendContents(Content); + if (CurFrag != F) { + F = CurFrag; + CodeOffset = 0; + } F->setHasInstructions(STI); if (Fixups.empty()) @@ -538,8 +621,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { void MCObjectStreamer::emitBytes(StringRef Data) { MCDwarfLineEntry::make(this, getCurrentSectionOnly()); - MCFragment *DF = getCurrentFragment(); - DF->appendContents(ArrayRef(Data.data(), Data.size())); + appendContents(ArrayRef(Data.data(), Data.size())); } void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill, diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index 72a8dd7..a87648a 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Emit the epilog instructions. 
if (EnableUnwindV2) { + // Ensure the fixups and appended content apply to the same fragment. + OS->ensureHeadroom(info->EpilogMap.size() * 2); + bool IsLast = true; for (const auto &Epilog : llvm::reverse(info->EpilogMap)) { if (IsLast) { diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 1ffe25c..8be5054 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -280,6 +280,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) { void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) { visitUsedSymbol(*Symbol); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); + ensureHeadroom(2); addFixup(SRE, FK_SecRel_2); appendContents(2, 0); } @@ -293,6 +294,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol, if (Offset) MCE = MCBinaryExpr::createAdd( MCE, MCConstantExpr::create(Offset, getContext()), getContext()); + ensureHeadroom(4); addFixup(MCE, FK_SecRel_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); @@ -308,6 +310,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, if (Offset) MCE = MCBinaryExpr::createAdd( MCE, MCConstantExpr::create(Offset, getContext()), getContext()); + ensureHeadroom(4); addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); @@ -318,6 +321,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { // Create Symbol for section number. const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( *Symbol, this->getWriter(), getContext()); + ensureHeadroom(4); addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); @@ -328,6 +332,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { // Create Symbol for section offset. const MCExpr *MCE = MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); + ensureHeadroom(4); addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 898ac5d..26f45ce 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -103,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility( void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { // Add a Fixup here to later record a relocation of type R_REF to prevent the // ref symbol from being garbage collected (by the binder). 
- MCFragment *DF = getCurrentFragment(); - const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); - std::optional<MCFixupKind> MaybeKind = - getAssembler().getBackend().getFixupKind("R_REF"); - if (!MaybeKind) - report_fatal_error("failed to get fixup kind for R_REF relocation"); - - MCFixupKind Kind = *MaybeKind; - MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind); - DF->addFixup(Fixup); + addFixup(MCSymbolRefExpr::create(Symbol, getContext()), + XCOFF::RelocationType::R_REF); } void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 7b5c3c0..e87696a 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -806,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() { } MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data()); + llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data()); } unsigned NumSections = Asm.end() - Asm.begin(); diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp index 62a71d4..9b55f76 100644 --- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp @@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const { // it is, find the target section unique id. const coff_aux_section_definition *SD = SymRef.getSectionDefinition(); const coff_aux_weak_external *WE = SymRef.getWeakExternal(); - if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) { + if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) { int32_t Index = SD->getNumber(IsBigObj); if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size()) return createStringError(object_error::parse_failed, diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp index 2d74d1d..5863490 100644 --- a/llvm/lib/Object/SFrameParser.cpp +++ b/llvm/lib/Object/SFrameParser.cpp @@ -10,27 +10,41 @@ #include "llvm/BinaryFormat/SFrame.h" #include "llvm/Object/Error.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; using namespace llvm::object; -template <typename T> -static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, - uint64_t Offset) { - static_assert(std::is_trivial_v<T>); - if (Data.size() < Offset + sizeof(T)) { +static Expected<ArrayRef<uint8_t>> +getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) { + uint64_t End = SaturatingAdd(Offset, Size); + // Data.size() cannot be UINT64_MAX, as it would occupy the whole address + // space. 
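  // (Editor's aside, illustrative only.) With Offset = 0xffffffffffffff00 and
  // Size = 0x200, a plain addition would wrap End around to 0x100 and could
  // sneak past the bounds check; SaturatingAdd clamps End to UINT64_MAX, so
  // the End > Data.size() test below still rejects the slice.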
+ if (End > Data.size()) { return createStringError( formatv("unexpected end of data at offset {0:x} while reading [{1:x}, " "{2:x})", - Data.size(), Offset, Offset + sizeof(T)) + Data.size(), Offset, End) .str(), object_error::unexpected_eof); } - return *reinterpret_cast<const T *>(Data.data() + Offset); + return Data.slice(Offset, Size); +} + +template <typename T> +static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, + uint64_t Offset) { + static_assert(std::is_trivial_v<T>); + Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T)); + if (!Slice) + return Slice.takeError(); + + return *reinterpret_cast<const T *>(Slice->data()); } template <endianness E> -Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { +Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents, + uint64_t SectionAddress) { Expected<const sframe::Preamble<E> &> Preamble = getDataSliceAs<sframe::Preamble<E>>(Contents, 0); if (!Preamble) @@ -48,8 +62,44 @@ Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { getDataSliceAs<sframe::Header<E>>(Contents, 0); if (!Header) return Header.takeError(); - return SFrameParser(Contents, *Header); + return SFrameParser(Contents, SectionAddress, *Header); +} + +template <endianness E> +Expected<ArrayRef<uint8_t>> SFrameParser<E>::getAuxHeader() const { + return getDataSlice(Data, sizeof(Header), Header.AuxHdrLen); +} + +template <endianness E> +Expected<ArrayRef<sframe::FuncDescEntry<E>>> SFrameParser<E>::fdes() const { + Expected<ArrayRef<uint8_t>> Slice = getDataSlice( + Data, getFDEBase(), Header.NumFDEs * sizeof(sframe::FuncDescEntry<E>)); + if (!Slice) + return Slice.takeError(); + return ArrayRef( + reinterpret_cast<const sframe::FuncDescEntry<E> *>(Slice->data()), + Header.NumFDEs); +} + +template <endianness E> +uint64_t SFrameParser<E>::getAbsoluteStartAddress( + typename FDERange::iterator FDE) const { + uint64_t Result = SectionAddress + FDE->StartAddress; + + if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) == + sframe::Flags::FDEFuncStartPCRel) { + uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); + uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + + assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && + "Iterator does not belong to this object!"); + + Result += FDEPtr - DataPtr; + } + + return Result; } -template class llvm::object::SFrameParser<endianness::big>; -template class llvm::object::SFrameParser<endianness::little>; +template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>; +template class LLVM_EXPORT_TEMPLATE + llvm::object::SFrameParser<endianness::little>; diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 277247e..cc02cae 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -1190,7 +1190,7 @@ Expected<size_t> readNativeFile(file_t FD, MutableArrayRef<char> Buf) { size_t Size = Buf.size(); #endif ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Size); - if (ssize_t(NumRead) == -1) + if (NumRead == -1) return errorCodeToError(errnoAsErrorCode()); // The underlying operation on these platforms allow opening directories // for reading in more cases than other platforms. 
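An editorial aside on the SFrameParser changes above: the sketch below shows how a client might drive the new interface end to end. It assumes FDERange is the range type returned by fdes(); the function name dumpSFrameFDEs and the SecContents/SecAddr/OS parameters are placeholders, not part of the patch, and the usual SFrameParser.h, FormatVariadic.h, Endian.h and Error.h includes are assumed.

// Minimal sketch: parse an .sframe section and print each FDE's absolute
// start address, which now accounts for SectionAddress and (when the
// FDEFuncStartPCRel flag is set) the position of the FDE within the section.
static llvm::Error dumpSFrameFDEs(llvm::ArrayRef<uint8_t> SecContents,
                                  uint64_t SecAddr, llvm::raw_ostream &OS) {
  using namespace llvm;
  auto Parser =
      object::SFrameParser<endianness::little>::create(SecContents, SecAddr);
  if (!Parser)
    return Parser.takeError();
  auto FDEs = Parser->fdes();
  if (!FDEs)
    return FDEs.takeError();
  for (auto It = FDEs->begin(), End = FDEs->end(); It != End; ++It)
    OS << formatv("FDE starts at {0:x}\n", Parser->getAbsoluteStartAddress(It));
  return Error::success();
}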
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc index d862dbd..8dd7c88 100644 --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -106,7 +106,67 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); } +namespace llvm::sys::windows { +HMODULE loadSystemModuleSecure(LPCWSTR lpModuleName) { + // Ensure we indeed load a module from the system32 path. + // As per GetModuleHandle documentation: + // "If lpModuleName does not include a path and there is more than one loaded + // module with the same base name and extension, you cannot predict which + // module handle will be returned.". This mitigates + // https://learn.microsoft.com/en-us/security-updates/securityadvisories/2010/2269637 + SmallVector<wchar_t, MAX_PATH> Buf; + size_t Size = MAX_PATH; + do { + Buf.resize_for_overwrite(Size); + SetLastError(NO_ERROR); + Size = ::GetSystemDirectoryW(Buf.data(), Buf.size()); + if (Size == 0) + return NULL; + + // Try again with a larger buffer. + } while (Size > Buf.size()); + + Buf.truncate(Size); + Buf.push_back(L'\\'); + Buf.append(lpModuleName, lpModuleName + std::wcslen(lpModuleName)); + Buf.push_back(0); + + return ::GetModuleHandleW(Buf.data()); +} +} // namespace llvm::sys::windows + SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { + HMODULE kernelM = llvm::sys::windows::loadSystemModuleSecure(L"kernel32.dll"); + if (kernelM) { + // SetThreadInformation is only available on Windows 8 and later. Since we + // still support compilation on Windows 7, we load the function dynamically. + typedef BOOL(WINAPI * SetThreadInformation_t)( + HANDLE hThread, THREAD_INFORMATION_CLASS ThreadInformationClass, + _In_reads_bytes_(ThreadInformationSize) PVOID ThreadInformation, + ULONG ThreadInformationSize); + static const auto pfnSetThreadInformation = + (SetThreadInformation_t)::GetProcAddress(kernelM, + "SetThreadInformation"); + if (pfnSetThreadInformation) { + auto setThreadInformation = [](ULONG ControlMaskAndStateMask) { + THREAD_POWER_THROTTLING_STATE state{}; + state.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; + state.ControlMask = ControlMaskAndStateMask; + state.StateMask = ControlMaskAndStateMask; + return pfnSetThreadInformation( + ::GetCurrentThread(), ThreadPowerThrottling, &state, sizeof(state)); + }; + + // Use EcoQoS for ThreadPriority::Background when available (running on the + // most efficient cores at the most efficient CPU frequency): + // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadinformation + // https://learn.microsoft.com/en-us/windows/win32/procthread/quality-of-service + setThreadInformation(Priority == ThreadPriority::Background + ? THREAD_POWER_THROTTLING_EXECUTION_SPEED + : 0); + } + } + // https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority // Begin background processing mode. 
The system lowers the resource scheduling // priorities of the thread so that it can perform background work without diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 1f3e5dc..3f318e2 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { } break; + case GETDAGOPNAME: + if (const auto *Dag = dyn_cast<DagInit>(LHS)) { + return Dag->getName(); + } + break; + case LOG2: if (const auto *LHSi = dyn_cast_or_null<IntInit>( LHS->convertInitializerTo(IntRecTy::get(RK)))) { @@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const { case SIZE: Result = "!size"; break; case EMPTY: Result = "!empty"; break; case GETDAGOP: Result = "!getdagop"; break; + case GETDAGOPNAME: + Result = "!getdagopname"; + break; case LOG2 : Result = "!logtwo"; break; case LISTFLATTEN: Result = "!listflatten"; @@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { SmallVector<std::pair<const Init *, const StringInit *>, 8> Args; llvm::append_range(Args, LHSs->getArgAndNames()); llvm::append_range(Args, RHSs->getArgAndNames()); - return DagInit::get(Op, Args); + // Use the name of the LHS DAG if it's set, otherwise the name of the RHS. + const auto *NameInit = LHSs->getName(); + if (!NameInit) + NameInit = RHSs->getName(); + return DagInit::get(Op, NameInit, Args); } break; } @@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames()); break; } + case SETDAGOPNAME: { + const auto *Dag = dyn_cast<DagInit>(LHS); + const auto *Op = dyn_cast<StringInit>(RHS); + if (Dag && Op) + return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(), + Dag->getArgNames()); + break; + } case ADD: case SUB: case MUL: @@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const { case STRCONCAT: Result = "!strconcat"; break; case INTERLEAVE: Result = "!interleave"; break; case SETDAGOP: Result = "!setdagop"; break; + case SETDAGOPNAME: + Result = "!setdagopname"; + break; case GETDAGARG: Result = "!getdagarg<" + getType()->getAsString() + ">"; break; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index aea1bb0..c369916 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("find", tgtok::XFind) .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. 
+ .Case("setdagopname", tgtok::XSetDagOpName) + .Case("getdagopname", tgtok::XGetDagOpName) .Case("getdagarg", tgtok::XGetDagArg) .Case("getdagname", tgtok::XGetDagName) .Case("setdagarg", tgtok::XSetDagArg) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index ed7d8f3..5725e39 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -150,6 +150,8 @@ enum TokKind { XGt, XSetDagOp, XGetDagOp, + XSetDagOpName, + XGetDagOpName, XExists, XListRemove, XToLower, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 62c5355..81b61b1 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "TGParser.h" +#include "TGLexer.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XCast: case tgtok::XRepr: case tgtok::XGetDagOp: + case tgtok::XGetDagOpName: case tgtok::XInitialized: { // Value ::= !unop '(' Value ')' UnOpInit::UnaryOp Code; const RecTy *Type = nullptr; @@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } Code = UnOpInit::GETDAGOP; break; + case tgtok::XGetDagOpName: + Lex.Lex(); // eat the operation + Type = StringRecTy::get(Records); + Code = UnOpInit::GETDAGOPNAME; + break; case tgtok::XInitialized: Lex.Lex(); // eat the operation Code = UnOpInit::INITIALIZED; @@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XInterleave: case tgtok::XGetDagArg: case tgtok::XGetDagName: - case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')' + case tgtok::XSetDagOp: + case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')' tgtok::TokKind OpTok = Lex.getCode(); SMLoc OpLoc = Lex.getLoc(); Lex.Lex(); // eat the operation @@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break; case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break; case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break; + case tgtok::XSetDagOpName: + Code = BinOpInit::SETDAGOPNAME; + break; case tgtok::XGetDagArg: Code = BinOpInit::GETDAGARG; break; @@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } ArgType = DagRecTy::get(Records); break; + case tgtok::XSetDagOpName: + Type = DagRecTy::get(Records); + ArgType = DagRecTy::get(Records); + break; case tgtok::XGetDagName: Type = StringRecTy::get(Records); ArgType = DagRecTy::get(Records); @@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { // Deal with BinOps whose arguments have different types, by // rewriting ArgType in between them. switch (Code) { - case BinOpInit::SETDAGOP: - // After parsing the first dag argument, switch to expecting - // a record, with no restriction on its superclasses. - ArgType = RecordRecTy::get(Records, {}); - break; - case BinOpInit::GETDAGARG: - // After parsing the first dag argument, expect an index integer or a - // name string. - ArgType = nullptr; - break; - case BinOpInit::GETDAGNAME: - // After parsing the first dag argument, expect an index integer. 
- ArgType = IntRecTy::get(Records); - break; - default: - break; + case BinOpInit::SETDAGOPNAME: + // After parsing the first dag argument, expect a string. + ArgType = StringRecTy::get(Records); + break; + case BinOpInit::SETDAGOP: + // After parsing the first dag argument, switch to expecting + // a record, with no restriction on its superclasses. + ArgType = RecordRecTy::get(Records, {}); + break; + case BinOpInit::GETDAGARG: + // After parsing the first dag argument, expect an index integer or a + // name string. + ArgType = nullptr; + break; + case BinOpInit::GETDAGNAME: + // After parsing the first dag argument, expect an index integer. + ArgType = IntRecTy::get(Records); + break; + default: + break; } if (!consume(tgtok::comma)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index eca7ca5..ad42f4b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5372,7 +5372,7 @@ 
void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM_PSEUDO, AArch64::LDNT1B_2Z_PSEUDO); @@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM_PSEUDO, AArch64::LDNT1H_2Z_PSEUDO); @@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM_PSEUDO, AArch64::LDNT1W_2Z_PSEUDO); @@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM_PSEUDO, AArch64::LDNT1D_2Z_PSEUDO); @@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM_PSEUDO, AArch64::LDNT1B_4Z_PSEUDO); @@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM_PSEUDO, AArch64::LDNT1H_4Z_PSEUDO); @@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM_PSEUDO, AArch64::LDNT1W_4Z_PSEUDO); @@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM_PSEUDO, AArch64::LDNT1D_4Z_PSEUDO); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b49754..4fef93c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11325,7 +11325,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue AArch64TargetLowering::LowerSELECT_CC( ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, - iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs, + 
iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. @@ -11386,6 +11386,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Canonicalise absolute difference patterns: + // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> + // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc + // + // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc -> + // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc + // The second forms can be matched into subs+cneg. + if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) { + if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS && + FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) + FVal = DAG.getNegative(TVal, DL, TVal.getValueType()); + else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && + FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) + TVal = DAG.getNegative(FVal, DL, FVal.getValueType()); + } + unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in @@ -11523,7 +11539,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return true; } })) { - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs; + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs(); SDValue VectorCmp = emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG); if (VectorCmp) @@ -11537,7 +11553,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - if (DAG.getTarget().Options.UnsafeFPMath) { + if (Flags.hasNoSignedZeros()) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); @@ -11616,10 +11632,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); - bool HasNoNans = Op->getFlags().hasNoNaNs(); + SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, - DAG); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, @@ -11627,7 +11642,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); - bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); EVT Ty = Op.getValueType(); @@ -11694,8 +11708,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, DAG.getUNDEF(MVT::f32), FVal); } - SDValue Res = - LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG); + SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), + Op->getFlags(), DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); @@ -12292,7 +12306,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags = SDNodeFlags::AllowReassociation; + // Ensure nodes can be recognized by isAssociativeAndCommutative. 
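  // (Editor's aside.) This pairs with the AArch64InstrInfo.cpp change further
  // down: isAssociativeAndCommutative() no longer falls back to the global
  // UnsafeFPMath setting and requires both FmReassoc and FmNsz on the
  // MachineInstr, so the Newton-iteration nodes built here need nsz in
  // addition to reassoc to stay eligible for MachineCombiner reassociation.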
+ SDNodeFlags Flags = + SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros; // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) @@ -16674,7 +16690,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath)); + I->getFastMathFlags().allowContract())); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -24112,6 +24128,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, Store->getMemOperand()); } +// Combine store (fp_to_int X) to use vector semantics around the conversion +// when NEON is available. This allows us to store the in-vector result directly +// without transferring the result into a GPR in the process. +static SDValue combineStoreValueFPToInt(StoreSDNode *ST, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + // Limit to post-legalization in order to avoid peeling truncating stores. + if (DCI.isBeforeLegalize()) + return SDValue(); + if (!Subtarget->isNeonAvailable()) + return SDValue(); + // Source operand is already a vector. + SDValue Value = ST->getValue(); + if (Value.getValueType().isVector()) + return SDValue(); + + // Look through potential assertions. + while (Value->isAssert()) + Value = Value.getOperand(0); + + if (Value.getOpcode() != ISD::FP_TO_SINT && + Value.getOpcode() != ISD::FP_TO_UINT) + return SDValue(); + if (!Value->hasOneUse()) + return SDValue(); + + SDValue FPSrc = Value.getOperand(0); + EVT SrcVT = FPSrc.getValueType(); + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return SDValue(); + + // No support for assignments such as i64 = fp_to_sint i32 + EVT VT = Value.getSimpleValueType(); + if (VT != SrcVT.changeTypeToInteger()) + return SDValue(); + + // Create a 128-bit element vector to avoid widening. The floating point + // conversion is transformed into a single element conversion via a pattern. 
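  // (Editor's illustration, not part of the patch.) For a store of
  //   (i32 (fp_to_sint (f32 %x)))
  // the combine builds roughly:
  //   %v = scalar_to_vector %x           ; v4f32, only lane 0 is defined
  //   %c = fp_to_sint %v                 ; v4i32, matched by the new
  //                                      ; INSERT_SUBREG/FCVTZS patterns
  //   %r = extract_vector_elt %c, 0      ; i32 value kept in a SIMD/FP reg
  // so the store consumes the FPR result directly instead of bouncing it
  // through a GPR.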
+ unsigned NumElements = 128 / SrcVT.getFixedSizeInBits(); + EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements); + EVT VecDstVT = VecSrcVT.changeTypeToInteger(); + SDLoc DL(ST); + SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc); + SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP); + + SDValue Zero = DAG.getVectorIdxConstant(0, DL); + SDValue Extracted = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero); + + DCI.CombineTo(ST->getValue().getNode(), Extracted); + return SDValue(ST, 0); +} + bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || @@ -24194,6 +24264,9 @@ static SDValue performSTORECombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(ST); + if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget)) + return Res; + auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; @@ -26926,6 +26999,23 @@ static SDValue performSHLCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); } +static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntrinsicID = N->getConstantOperandVal(1); + auto Register = + (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR + : AArch64SysReg::RNDRRS); + SDLoc DL(N); + SDValue A = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), + N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); + SDValue B = DAG.getNode( + AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); + return DAG.getMergeValues( + {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -27241,22 +27331,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: - case Intrinsic::aarch64_rndrrs: { - unsigned IntrinsicID = N->getConstantOperandVal(1); - auto Register = - (IntrinsicID == Intrinsic::aarch64_rndr ? 
AArch64SysReg::RNDR - : AArch64SysReg::RNDRRS); - SDLoc DL(N); - SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), - N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); - SDValue B = DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); - return DAG.getMergeValues( - {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); - } + case Intrinsic::aarch64_rndrrs: + return performRNDRCombine(N, DAG); case Intrinsic::aarch64_sme_ldr_zt: return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), DAG.getVTList(MVT::Other), N->getOperand(0), diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 95d0e3b..ea63edd8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -662,7 +662,7 @@ private: SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, iterator_range<SDNode::user_iterator> Users, - bool HasNoNans, const SDLoc &dl, + SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8685d7a0..59d4fd2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by // the target options or if FADD/FSUB has the contract fast-math flag. 
- return Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast || + return Options.AllowFPOpFusion == FPOpFusion::Fast || Inst.getFlag(MachineInstr::FmContract); - return true; } return false; } @@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, case AArch64::FMUL_ZZZ_H: case AArch64::FMUL_ZZZ_S: case AArch64::FMUL_ZZZ_D: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || - (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && - Inst.getFlag(MachineInstr::MIFlag::FmNsz)); + return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && + Inst.getFlag(MachineInstr::MIFlag::FmNsz); // == Integer types == // -- Base instructions -- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 07cacfa..251fd44 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } +def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>; +def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>; + // int -> float conversion of value in lane 0 of simd vector should use // correct cvtf variant to avoid costly fpr <-> gpr register transfers. def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index abcd550..b97d622 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -12,7 +12,7 @@ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi -// MOVi64imm + ADDXrr ==> ANDXri + ANDXri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri // // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi // MOVi64imm + SUBXrr ==> SUBXri + SUBXri @@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { template <typename T> bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + // Strategy used to split logical immediate bitmasks. + enum class SplitStrategy { + Intersect, + }; template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); + bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); - if (AArch64_AM::isLogicalImmediate(UImm, RegSize)) - return false; - - // If this immediate can be handled by one instruction, do not split it. 
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; - AArch64_IMM::expandMOVImm(UImm, RegSize, Insn); - if (Insn.size() == 1) - return false; // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, - unsigned OtherOpc) { +bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, + unsigned OtherOpc) { // Try below transformation. // // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri @@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, return splitTwoPartImm<T>( MI, - [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { - if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) + [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { + // If this immediate is already a suitable bitmask, don't split it. + // TODO: Should we just combine the two instructions in this case? + if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return std::nullopt; + + // If this immediate can be handled by one instruction, don't split it. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return std::nullopt; + + bool SplitSucc = false; + switch (Strategy) { + case SplitStrategy::Intersect: + SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; + } + if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc); return std::nullopt; }, @@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= visitINSERT(MI); break; case AArch64::ANDWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDSWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + Changed |= trySplitLogicalImm<uint32_t>( + AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri); break; case AArch64::ANDSXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + Changed |= trySplitLogicalImm<uint64_t>( + AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; case AArch64::ORRWrs: Changed |= visitORR(MI); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bb0f667b..e0e1af7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; }; + auto LowerTriOp = [&MI, &MIB](unsigned Opcode) { + MIB.buildInstr(Opcode, {MI.getOperand(0)}, + {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)}); + MI.eraseFromParent(); + return true; + }; Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { @@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_USUBSAT); break; 
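The splitBitmaskImm/SplitStrategy::Intersect hunks above rely on the two split masks intersecting back to the original immediate. A minimal standalone check of the constants quoted in that comment (plain C++, not LLVM code; names are illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  // Immediate from the comment above: its set bits do not form one
  // (rotated) run of consecutive ones, so it is not an AArch64 logical
  // immediate and cannot be used directly in ANDWri.
  uint32_t Imm  = 0b00000000001000000000010000000000;
  // The two bitmask immediates it is split into; each is a single run of
  // consecutive ones (the second one wraps around), so each is encodable.
  uint32_t Imm1 = 0b00000000001111111111110000000000;
  uint32_t Imm2 = 0b11111111111000000000011111111111;
  // Imm == (Imm1 & Imm2), hence (x & Imm) == ((x & Imm1) & Imm2) for any x,
  // which is what the SplitStrategy::Intersect rewrite exploits.
  assert((Imm1 & Imm2) == Imm);
  return 0;
}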
} + case Intrinsic::aarch64_neon_udot: + return LowerTriOp(AArch64::G_UDOT); + case Intrinsic::aarch64_neon_sdot: + return LowerTriOp(AArch64::G_SDOT); case Intrinsic::vector_reverse: // TODO: Add support for vector_reverse diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8b8fc8b..071c940 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; +def FeatureCUStores : SubtargetFeature<"cu-stores", + "HasCUStores", + "true", + "Whether SCOPE_CU stores can be used on GFX12.5" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -1383,6 +1389,9 @@ def FeatureAddSubU64Insts : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", "Has v_add_u64 and v_sub_u64 instructions">; +def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", + "true", "Has v_mad_u32 instruction">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1988,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet< def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, + FeatureCUStores, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, @@ -2042,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureVmemPrefInsts, FeatureLshlAddU64Inst, FeatureAddSubU64Insts, + FeatureMadU32Inst, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2422,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts : def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, - AssemblerPredicate<(any_of FeatureGFX90AInsts)>; + AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; @@ -2832,6 +2843,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; +def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, + AssemblerPredicate<(all_of FeatureMadU32Inst)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 4b3dc37..6681393 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + if (ST.isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } + if (isGFX1250(ST) && ST.hasCUStores()) { + KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; + } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // 
un-evaluatable at this point so it cannot be conditionally checked here. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index c01e5d3..992572f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -143,6 +143,9 @@ def gi_global_saddr_cpol : def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; +def gi_global_saddr_no_ioffset : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, + GIComplexPatternEquiv<GlobalSAddrNoIOffset>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index dfaa145..39b4200 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -2049,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 5636d89..983f1aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -174,6 +174,8 @@ private: bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 266dee1..fab99f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; if (Subtarget->hasMADIntraFwdBug()) Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else if (UseNoCarry) + Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64 + : AMDGPU::V_MAD_NC_I64_I32_e64; else Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + + if (UseNoCarry) + I.removeOperand(1); + I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); @@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { } unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; + if (!IsB32 && STI.hasTrue16BitInsts()) + Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64 + : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64; unsigned CBL = STI.getConstantBusLimit(Opc); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -5789,6 +5801,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; @@ -6971,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? 
(int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6986,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)(SISrcMods::OP_SEL_0) : 0); } @@ -7015,14 +7038,15 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + ? 
(int64_t)SISrcMods::DST_OP_SEL + : 0); } void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index fe9743d0a..140e753 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -264,6 +264,8 @@ private: selectGlobalSAddrCPol(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fedfa3f..50da8fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); if (ST.hasVOP3PInsts()) { - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElements(0, S16, 2) - .minScalar(0, S16) - .widenScalarToNextPow2(0) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder(G_ABS) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + if (ST.hasIntMinMax64()) { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, S64, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } } else { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16}) @@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); - if (ST.hasGFX90AInsts()) { + if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations // TODO: Move atomic expansion into legalizer @@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::scalar(32), commonAlignment(Align(64), Offset)); // Pointer address - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), StructOffset)); - B.buildPtrAdd(LoadAddr, QueuePtr, - B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); + B.buildObjectPtrOffset( + LoadAddr, QueuePtr, + B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, llvm_unreachable("failed to find kernarg segment ptr"); auto COffset = B.buildConstant(LLT::scalar(64), Offset); - // TODO: Should get nuw - return 
B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); + return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0); } /// Legalize a value that's loaded from kernel arguments. This is only used by @@ -5676,8 +5693,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; - // FIXME: This should be nuw - B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + B.buildObjectPtrOffset(DstReg, KernargPtrReg, + B.buildConstant(IdxTy, Offset).getReg(0)); return true; } @@ -7019,8 +7036,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( // Pointer address Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); B.buildCopy(SGPR01, Temp); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index f471881..b45627d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, BasePlusOffset = Base; } else { auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); - BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); + BasePlusOffset = + B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0); } auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c5a1d9e..6c40eb5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_SMIN: - case AMDGPU::G_SMAX: - case AMDGPU::G_UMIN: - case AMDGPU::G_UMAX: case AMDGPU::G_ABS: case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: @@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: + if (isSALUMapping(MI)) { + // There are no scalar 64-bit min and max, use vector instruction instead. 
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 && + Subtarget.hasIntMinMax64()) + return getDefaultMappingVOP(MI); + return getDefaultMappingSOP(MI); + } + return getDefaultMappingVOP(MI); case AMDGPU::G_FADD: case AMDGPU::G_FSUB: case AMDGPU::G_FMUL: @@ -5364,6 +5372,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 38f9ee5..c1f1703 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,7 +104,9 @@ #include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" @@ -2066,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // TODO: May want to move later or split into an early and late one. addPass(AMDGPUCodeGenPreparePass(TM)); - // TODO: LICM + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. + if (TM.getOptLevel() > CodeGenOptLevel::Less) { + addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true)); + } } Base::addIRPasses(addPass); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 421fc42..44e65b3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6066,6 +6066,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; + } else if (ID == ".amdhsa_uses_cu_stores") { + if (!isGFX1250()) + return Error(IDRange.Start, "directive requires gfx12.5", IDRange); + + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index f99e716..1956a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { } //===----------------------------------------------------------------------===// -// MUBUF - GFX11, GFX12. +// MUBUF - GFX11, GFX12, GFX1250. 
//===----------------------------------------------------------------------===// // gfx11 instruction that accept both old and new assembler name. @@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op, def : Mnem_gfx12<gfx11_name, gfx12_name>; } +multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>, + MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> { + def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>; +} + defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>; defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>; @@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; +defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>; +defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">; +defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">; + //===----------------------------------------------------------------------===// // MUBUF - GFX10. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 42edec0..c466f9c 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen Instrumentation MC MIRParser + ObjCARC Passes Scalar SelectionDAG diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 319cc9d..3ff675d 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; +defm DS_ADD_F64 : DS_Real_gfx12<0x054>; +defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>; + let AssemblerPredicate = HasLdsBarrierArriveAtomic in { defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>; defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 5c1989b..ffe6b06 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + if (isGFX1250()) + PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", + KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0f172e0d..d5d1074 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,6 +11,7 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>; def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; + def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, 
"SelectGlobalSAddrNoIOffset", [], [], -3>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; @@ -1361,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + +class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $vaddr, $dsaddr, $offset, $cpol) +>; + +class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $saddr, $voffset, $dsaddr, $offset, $cpol) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) @@ -1571,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + +multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatStoreLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat <inst, node, vt> { let AddedComplexity = 10; @@ -2137,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in { defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; } // End SubtargetPredicate = isGFX125xOnly +let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>; + + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>; + defm : GlobalStoreLDSPats 
<GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -3435,6 +3488,14 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; +defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; + +defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">; +defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">; + def True16D16Table : GenericTable { let FilterClass = "True16D16Table"; let CppTypeName = "True16D16Info"; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 785ede3..fba1c5a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -248,6 +248,7 @@ protected: bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; + bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -272,6 +273,7 @@ protected: bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; + bool HasMadU32Inst = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -714,7 +716,9 @@ public: bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } // DS_ADD_F64/DS_ADD_RTN_F64 - bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } + bool hasLdsAtomicAddF64() const { + return hasGFX90AInsts() || hasGFX1250Insts(); + } bool hasMultiDwordFlatScratchAddressing() const { return getGeneration() >= GFX9; @@ -998,6 +1002,8 @@ public: bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + bool hasCUStores() const { return HasCUStores; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1516,9 +1522,19 @@ public: // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + // \returns true if the target has V_MAD_U32 instruction. + bool hasMadU32Inst() const { return HasMadU32Inst; } + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 + // instructions. + bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + + // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. + bool hasIntMinMax64() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. 
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 10f6d33..43ca548 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); + if (isGFX1250(STI)) + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, + ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9017f4f..f4d7408 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); } + if (Subtarget->hasIntMinMax64()) + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64, + Legal); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, @@ -1256,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1527,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case 
Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID;
@@ -1623,10 +1666,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default:
@@ -4241,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = BaseAddr.getValue(1); Align StackAlign = TFL->getStackAlign(); if (Alignment > StackAlign) { - uint64_t ScaledAlignment = (uint64_t)Alignment.value() + uint64_t ScaledAlignment = Alignment.value() << Subtarget->getWavefrontSizeLog2(); uint64_t StackAlignMask = ScaledAlignment - 1; SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 520c321..4b48fc4 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1380,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the callee eventually calls createNewWaitcnt on it. We test this + // possibility in an artificial MIR test since such a situation cannot be + // recreated by running the memory legalizer. + II.eraseFromParent(); + } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1551,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet.
+ II.eraseFromParent(); + continue; + } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value());
@@ -2415,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2aa6b4e..c2da937 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9281,6 +9281,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { default: if (MI.isMetaInstruction()) return 0; + + // If D16 Pseudo inst, get correct MC code size + const auto *D16Info = AMDGPU::getT16D16Helper(Opc); + if (D16Info) { + // Assume d16_lo/hi inst are always the same size + unsigned LoInstOpcode = D16Info->LoOp; + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode); + DescSize = Desc.getSize(); + } + return DescSize; } }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f1262e11..53f554e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (Pos == Position::AFTER) --MI;
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2564,7 +2584,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) + // We also require SCOPE_SE minimum if we do not have the "cu-stores" feature.
+ if (Scope == CPol::SCOPE_CU && + (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) return setScope(MI, CPol::SCOPE_SE); return false; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc..8303410 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. + +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index b6f9568..9f1e53d 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let HasExtDPP = 0; } -let HasExt64BitDPP = 1 in { -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; +def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +let HasExt64BitDPP = 1 in { +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; + class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { let HasExtVOP3DPP = 0; let HasExtDPP = 0; @@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>; def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>; - -def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { +def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> { let HasExtVOP3DPP = 0; - let HasExtDPP = 0; + let HasExt64BitDPP = 1; +} +def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>; +def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> { + let HasClamp = 1; } } // End HasExt64BitDPP = 1; @@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32 defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SchedRW = [WriteIntMul] in { + let SubtargetPredicate = HasMadU32Inst in + defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; + let SubtargetPredicate = isGFX1250Plus in { + defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; + defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; + } +} + let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; @@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f } // End 
SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { +defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>; +defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>; +defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>; +defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>; +} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] + } // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { @@ -848,6 +872,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in + def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>; + def : GCNPat< (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; @@ -972,10 +999,10 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim unsigned Val = N->getZExtValue(); unsigned New = 0; if (}] # modifier_idx # [{ == 0) { - New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) - : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); + New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) + : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) { - New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; } return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1019,7 +1046,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">, def DstSelToOpSel3XForm : SDNodeXForm<timm, [{ uint32_t V = N->getZExtValue(); return CurDAG->getTargetConstant( - (V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE, + (V & 0x1) ? 
SISrcMods::DST_OP_SEL : SISrcMods::NONE, SDLoc(N), MVT::i32); }]>; def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">, @@ -1427,34 +1454,72 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = HasBitOp3Insts in { +let HasClamp = 0, HasModifiers = 1 in { +def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>; +def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>; +def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>; +} + +let OtherPredicates = [HasBitOp3Insts] in { let isReMaterializable = 1 in { - defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", - VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>; + let SubtargetPredicate = isGFX940Plus in + defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>; + let SubtargetPredicate = isGFX1250Plus in + defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile, + BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>, VOPD_Component<0x12, "v_bitop2_b32">; } + def : GCNPat< (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; def : GCNPat< - (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; - - def : GCNPat< (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; - def : GCNPat< - (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; -} // End SubtargetPredicate = HasBitOp3Insts + let SubtargetPredicate = isGFX940Plus in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } // End SubtargetPredicate = isGFX940Plus + + let SubtargetPredicate = isGFX1250Plus in { + let True16Predicate = UseFakeTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } + let True16Predicate = UseRealTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + } + } // End SubtargetPredicate = isGFX1250Plus + +} 
// End OtherPredicates = [HasBitOp3Insts] class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), @@ -1746,6 +1811,17 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">; +defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>; + +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; +defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; +defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>; +defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>; +defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>; +defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>; + defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">; defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">; defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index c21e2d3..badbba9 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -401,6 +401,19 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { let Inst{49-41} = src0; } +class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0); +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; @@ -1506,6 +1519,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1525,6 +1539,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1540,6 +1555,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1723,6 +1739,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> let Inst{14 - 8} = sdst; } +class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName> + : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let 
Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + +class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName> + : Base_VOP3_DPP8_t16<op, p, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : Base_VOP3_DPP8<op, ps, opName> { bits<8> sdst; @@ -1943,6 +1987,29 @@ multiclass VOP3be_Realtriple< multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> : VOP3be_Realtriple<Gen, op, 1>; +multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> { + def _e64_dpp#Gen.Suffix : + VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>; +} + +multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64"); + def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> { + let DecoderNamespace = + Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16"); + let AssemblerPredicate = Gen.AssemblerPredicate; + } +} + +multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in { + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3a_BITOP3_gfx12<op, ps.Pfl>; + } +} + //===----------------------------------------------------------------------===// // VOP3 GFX11 //===----------------------------------------------------------------------===// @@ -2046,6 +2113,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>, VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>; +multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : + VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>; + +multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> { + defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>; + defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>; +} + multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0, string opName = NAME> : VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 868556b..6dfe846 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1284,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } // Add the R_ARM_NONE fixup at the same position void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name); + visitUsedSymbol(*PersonalitySym); const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); - - visitUsedExpr(*PersonalityRef); - MCFragment *DF = getCurrentFragment(); - DF->addFixup( - MCFixup::create(DF->getContents().size(), 
PersonalityRef, FK_Data_4));
+  addFixup(PersonalityRef, FK_Data_4);
}
void ARMELFStreamer::FlushPendingOffset() {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index d96136c..a5bf0e5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2621,9 +2621,38 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                         SelectionDAG &DAG) const {
-  if (isa<ConstantSDNode>(Op->getOperand(2)))
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
+  SDLoc DL(Op);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+
+  if (isa<ConstantSDNode>(Op2))
    return Op;
-  return SDValue();
+
+  MVT IdxTy = MVT::getIntegerVT(EltSizeInBits);
+  MVT IdxVTy = MVT::getVectorVT(IdxTy, NumElts);
+
+  if (!isTypeLegal(VT) || !isTypeLegal(IdxVTy))
+    return SDValue();
+
+  SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1);
+  SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2);
+
+  SmallVector<SDValue, 32> RawIndices;
+  for (unsigned i = 0; i < NumElts; ++i)
+    RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+  SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices);
+
+  // insert vec, elt, idx
+  // =>
+  // select (splatidx == {0,1,2...}) ? splatelt : vec
+  SDValue SelectCC =
+      DAG.getSetCC(DL, IdxVTy, SplatIdx, Indices, ISD::CondCode::SETEQ);
+  return DAG.getNode(ISD::VSELECT, DL, VT, SelectCC, SplatElt, Op0);
}
SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index d9680c7..7a8395a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
  auto &S = getStreamer();
+  S.ensureHeadroom(4);
  S.addFixup(Value, Mips::fixup_Mips_GPREL32);
  S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
  auto &S = getStreamer();
+  S.ensureHeadroom(8);
  // fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32); S.appendContents(8, 0); @@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_DTPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_DTPREL64); S.appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_TPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_TPREL64); S.appendContents(8, 0); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 96f52275..95abcde 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm) { - doMulWide = (OptLevel > CodeGenOptLevel::None); -} + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<NVPTXSubtarget>(); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index e504a8f..9e0f88e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -40,9 +40,6 @@ private: class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { const NVPTXTargetMachine &TM; - // If true, generate mul.wide from sext and mul - bool doMulWide; - NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const; bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f79b862..4fd3623 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -843,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT, ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, - ISD::STORE}); + ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. 
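An illustrative sketch (not taken from the patch): the kind of C++ source whose IR carries the (sign_extend (mul nsw x, y)) shape that the combineMulWide hook added in the next hunk rewrites to an NVPTX mul.wide node; the function name is invented for this example.

// Illustrative only: a 32x32 -> 64-bit widening multiply. Clang emits
// `mul nsw i32` followed by `sext ... to i64` for this; with optimization
// enabled, the new sign/zero-extend combine can select it as mul.wide.s32
// rather than a full 64-bit multiply.
long long widen_mul(int a, int b) {
  return static_cast<long long>(a * b);
}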
@@ -5219,6 +5219,42 @@ static SDValue PerformREMCombine(SDNode *N, return SDValue(); } +// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y) +static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.hasOneUse()) + return SDValue(); + EVT ToVT = N->getValueType(0); + EVT FromVT = Op.getValueType(); + if (!((ToVT == MVT::i32 && FromVT == MVT::i16) || + (ToVT == MVT::i64 && FromVT == MVT::i32))) + return SDValue(); + if (!(Op.getOpcode() == ISD::MUL || + (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1))))) + return SDValue(); + + SDLoc DL(N); + unsigned ExtOpcode = N->getOpcode(); + unsigned Opcode = 0; + if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_SIGNED; + else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_UNSIGNED; + else + return SDValue(); + SDValue RHS = Op.getOperand(1); + if (Op.getOpcode() == ISD::SHL) { + const auto ShiftAmt = Op.getConstantOperandVal(1); + const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, DL, ToVT); + } + return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -5825,6 +5861,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return combineADDRSPACECAST(N, DCI); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return combineMulWide(N, DCI, OptLevel); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 86d6f7c..41bfe7e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -125,8 +125,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; def doRsqrtOpt : Predicate<"doRsqrtOpt()">; -def doMulWide : Predicate<"doMulWide">; - def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; @@ -836,36 +834,28 @@ def MULWIDES64 : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">; def MULWIDES64Imm : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">; -def MULWIDES64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">; def MULWIDEU64 : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">; def MULWIDEU64Imm : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">; -def MULWIDEU64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">; def MULWIDES32 : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">; def MULWIDES32Imm : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">; -def MULWIDES32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">; def MULWIDEU32 : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">; def MULWIDEU32Imm : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">; -def MULWIDEU32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">; -def SDTMulWide : SDTypeProfile<1, 2, 
[SDTCisSameAs<1, 2>]>; -def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; -def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; +def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; +def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>; +def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>; // Matchers for signed, unsigned mul.wide ISD nodes. -let Predicates = [doMulWide] in { +let Predicates = [hasOptEnabled] in { def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>; def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>; def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>; @@ -877,85 +867,6 @@ let Predicates = [doMulWide] in { def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>; } -// Predicates used for converting some patterns to mul.wide. -def SInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(32); -}]>; - -def UInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(32); -}]>; - -def SInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(16); -}]>; - -def UInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(16); -}]>; - -def IntConst_0_30 : PatLeaf<(imm), [{ - // Check if 0 <= v < 31; only then will the result of (x << v) be an int32. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(31); -}]>; - -def IntConst_0_14 : PatLeaf<(imm), [{ - // Check if 0 <= v < 15; only then will the result of (x << v) be an int16. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(15); -}]>; - -def SHL2MUL32 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(32, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32); -}]>; - -def SHL2MUL16 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(16, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); -}]>; - -// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. -let Predicates = [doMulWide] in { - def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDES64Imm $a, (SHL2MUL32 $b))>; - def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDEU64Imm $a, (SHL2MUL32 $b))>; - - def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDES32Imm $a, (SHL2MUL16 $b))>; - def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDEU32Imm $a, (SHL2MUL16 $b))>; - - // Convert "sign/zero-extend then multiply" to mul.wide. 
- def : Pat<(mul (sext i32:$a), (sext i32:$b)), - (MULWIDES64 $a, $b)>; - def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>; - - def : Pat<(mul (zext i32:$a), (zext i32:$b)), - (MULWIDEU64 $a, $b)>; - def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>; - - def : Pat<(mul (sext i16:$a), (sext i16:$b)), - (MULWIDES32 $a, $b)>; - def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>; - - def : Pat<(mul (zext i16:$a), (zext i16:$b)), - (MULWIDEU32 $a, $b)>; - def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>; -} - // // Integer multiply-add // @@ -991,6 +902,39 @@ defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>; defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>; } +multiclass MAD_WIDE<string PtxSuffix, OneUse2 Op, RegTyInfo BigT, RegTyInfo SmallT> { + def rrr: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.RC:$b, BigT.RC:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), BigT.Ty:$c))]>; + def rri: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.RC:$b, BigT.Imm:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), imm:$c))]>; + def rir: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.RC:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), BigT.Ty:$c))]>; + def rii: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.Imm:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), imm:$c))]>; +} + +def mul_wide_unsigned_oneuse : OneUse2<mul_wide_unsigned>; +def mul_wide_signed_oneuse : OneUse2<mul_wide_signed>; + +let Predicates = [hasOptEnabled] in { +defm MAD_WIDE_U16 : MAD_WIDE<"u16", mul_wide_unsigned_oneuse, I32RT, I16RT>; +defm MAD_WIDE_S16 : MAD_WIDE<"s16", mul_wide_signed_oneuse, I32RT, I16RT>; +defm MAD_WIDE_U32 : MAD_WIDE<"u32", mul_wide_unsigned_oneuse, I64RT, I32RT>; +defm MAD_WIDE_S32 : MAD_WIDE<"s32", mul_wide_signed_oneuse, I64RT, I32RT>; +} + foreach t = [I16RT, I32RT, I64RT] in { def NEG_S # t.Size : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 5779d4e..0e8828f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -243,8 +243,6 @@ public: createObjectTargetWriter() const override { return createPPCXCOFFObjectWriter(TT.isArch64Bit()); } - - std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; }; } // end anonymous namespace @@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const { return std::nullopt; } -std::optional<MCFixupKind> -XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const { - return StringSwitch<std::optional<MCFixupKind>>(Name) - .Case("R_REF", PPC::fixup_ppc_nofixup) - .Default(std::nullopt); -} - MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 9e8ee9f..df0c666 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -48,8 +48,7 @@ enum Fixups { 
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer - /// register number. It can also be used to tie the ref symbol to prevent it - /// from being garbage collected on AIX. + /// register number. fixup_ppc_nofixup, /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index f75ab62..a04f404 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( switch ((unsigned)Fixup.getKind()) { default: report_fatal_error("Unimplemented fixup kind."); + case XCOFF::RelocationType::R_REF: + return {XCOFF::RelocationType::R_REF, 0}; case PPC::fixup_ppc_half16: { const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15; switch (Specifier) { @@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25}; case PPC::fixup_ppc_br24abs: return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25}; - case PPC::fixup_ppc_nofixup: { - if (Specifier == PPC::S_None) - return {XCOFF::RelocationType::R_REF, 0}; - else - llvm_unreachable("Unsupported Modifier"); - } break; case FK_Data_4: case FK_Data_8: const uint8_t SignAndSizeForFKData = diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index d295f35..1dc485d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2159,8 +2159,115 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; } -class XXEvalPattern <dag pattern, bits<8> imm> : - Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} +// ============================================================================= +// XXEVAL Instruction Pattern Definitions +// ============================================================================= +// +// XXEVAL instruction performs 256 different logical operations on three vector +// operands using an 8-bit immediate value to select the operation. 
+// Format: xxeval XT, XA, XB, XC, IMM +// For example: +// Equivalent function A?xor(B,C):and(B,C) is performed by +// xxeval XT, XA, XB, XC, 22 +// +// REGISTER CLASS CONSTRAINTS: +// - XXEVAL natively supports: VSRC register class [v4i32, v4f32, v2f64, v2i64] +// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC +// ============================================================================= + +class XXEvalPattern<dag pattern, bits<8> imm> + : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} + +class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm> + : Pat<(Vt InputPattern), + !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)), + // VSRC path: direct XXEVAL for v4i32 and v2i64 + (XXEVAL $vA, $vB, $vC, Imm), + // VRRC path: wrap with COPY_TO_REGCLASS for other types + (COPY_TO_REGCLASS(XXEVAL(COPY_TO_REGCLASS Vt:$vA, VSRC), + (COPY_TO_REGCLASS Vt:$vB, VSRC), + (COPY_TO_REGCLASS Vt:$vC, VSRC), Imm), + VRRC))> {} + +// ============================================================================= +// PatFrags for Bitcast-Aware Vector bitwise Operations +// +// Each PatFrags defines TWO alternatives for pattern matcher to choose: +// - Direct operation (for v4i32) +// - Bitcast operation (for other types: v2i64, v16i8, v8i16) +// ============================================================================= + +// Basic Binary Operations +def VAnd + : PatFrags<(ops node:$a, node:$b), [(and node:$a, node:$b), + (bitconvert(and + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VXor + : PatFrags<(ops node:$a, node:$b), [(xor node:$a, node:$b), + (bitconvert(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VOr : PatFrags<(ops node:$a, node:$b), [(or node:$a, node:$b), + (bitconvert(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VNot + : PatFrags<(ops node:$a), [(vnot node:$a), + (bitconvert(vnot(v4i32(bitconvert node:$a))))]>; + +// Derived bitwise operations +// Vector NOR operation (not(or)) +def VNor + : PatFrags<(ops node:$a, node:$b), [(vnot(or node:$a, node:$b)), + (bitconvert(vnot(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// Vector EQV operation (not(xor)) +def VEqv + : PatFrags<(ops node:$a, node:$b), [(vnot(xor node:$a, node:$b)), + (bitconvert(vnot(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// ============================================================================= +// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd +// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C) +// and emit the corresponding xxeval instruction with the imm value. +// +// The patterns implement xxeval vector select operations where: +// - A is the selector vector +// - f(B,C) is the "true" case op on vectors B and C (XOR, NOR, EQV, or NOT) +// - AND(B,C) is the "false" case op on vectors B and C +// ============================================================================= +multiclass XXEvalTernarySelectAnd<ValueType Vt> { + // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 22>; + + // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 24>; + + // Pattern: A ? 
EQV(B,C) : AND(B,C) XXEVAL immediate value: 25 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 25>; + + // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>; + + // Pattern: A ? NOT(B) : AND(B,C) XXEVAL immediate value: 28 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>; +} let Predicates = [PrefixInstrs, HasP10Vector] in { let AddedComplexity = 400 in { @@ -2270,6 +2377,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // (xor A, (or B, C)) def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + // XXEval Patterns for ternary Operations. + foreach Ty = [v4i32, v2i64, v8i16, v16i8] in { + defm : XXEvalTernarySelectAnd<Ty>; + } + // Anonymous patterns to select prefixed VSX loads and stores. // Load / Store f128 def : Pat<(f128 (load PDForm:$src)), diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index 4c303a9..da6b95d 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -95,3 +95,7 @@ def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_V_Interrupt, // Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31. def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt, (sequence "X%u", 16, 31))>; + +def CSR_RT_MostRegs : CalleeSavedRegs<(sub CSR_Interrupt, X6, X7, X28)>; +def CSR_RT_MostRegs_RVE : CalleeSavedRegs<(sub CSR_RT_MostRegs, + (sequence "X%u", 16, 31))>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 34910b7..f223fdbe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -634,7 +634,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 < C2 // -> (SignedBitfieldExtract X, msb, lsb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -750,7 +750,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 > C2 // -> (NDS.BFOS X, lsb, msb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -1191,7 +1191,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) // where C2 has 32 leading zeros and C3 trailing zeros. 
SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1210,7 +1210,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // - without Zba a tablegen pattern applies the very same // transform as we would have done here SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, VT, N0->getOperand(0), + RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LeadingZeros, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1239,7 +1239,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned TrailingZeros = llvm::countr_zero(Mask); if (LeadingZeros == 32 && TrailingZeros > ShAmt) { SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1266,7 +1266,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (TrailingOnes == 32) { SDNode *SRLI = CurDAG->getMachineNode( Subtarget->is64Bit() ? RISCV::SRLIW : RISCV::SRLI, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -1279,19 +1279,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (HasBitTest && ShAmt + 1 == TrailingOnes) { SDNode *BEXTI = CurDAG->getMachineNode( Subtarget->hasStdExtZbs() ? RISCV::BEXTI : RISCV::TH_TST, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, BEXTI); return; } const unsigned Msb = TrailingOnes - 1; const unsigned Lsb = ShAmt; - if (tryUnsignedBitfieldExtract(Node, DL, VT, N0->getOperand(0), Msb, Lsb)) + if (tryUnsignedBitfieldExtract(Node, DL, VT, N0.getOperand(0), Msb, Lsb)) return; unsigned LShAmt = Subtarget->getXLen() - TrailingOnes; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1328,7 +1328,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; unsigned LShAmt = Subtarget->getXLen() - ExtSize; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRAI = CurDAG->getMachineNode( RISCV::SRAI, DL, VT, SDValue(SLLI, 0), @@ -2942,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. + if (SelectAddrFrameIndex(Addr, Base, Offset)) + return true; SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2953,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. 
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base)) + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 607edd3..b47d89b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2739,27 +2739,6 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { } } -bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV( - EVT ScalarTy) const { - if (!ScalarTy.isSimple()) - return false; - switch (ScalarTy.getSimpleVT().SimpleTy) { - case MVT::iPTR: - return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true; - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::f16: - case MVT::bf16: - case MVT::f32: - return true; - case MVT::i64: - case MVT::f64: - return Subtarget.hasVInstructionsI64(); - default: - return false; - } -} unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const { return NumRepeatedDivisors; @@ -20751,6 +20730,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getAllOnesConstant(DL, VT); return DAG.getConstant(0, DL, VT); } + case Intrinsic::riscv_vsseg2_mask: + case Intrinsic::riscv_vsseg3_mask: + case Intrinsic::riscv_vsseg4_mask: + case Intrinsic::riscv_vsseg5_mask: + case Intrinsic::riscv_vsseg6_mask: + case Intrinsic::riscv_vsseg7_mask: + case Intrinsic::riscv_vsseg8_mask: { + SDValue Tuple = N->getOperand(2); + unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + + if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() || + Tuple.getOpcode() != RISCVISD::TUPLE_INSERT || + !Tuple.getOperand(0).isUndef()) + return SDValue(); + + SDValue Val = Tuple.getOperand(1); + unsigned Idx = Tuple.getConstantOperandVal(2); + + unsigned SEW = Val.getValueType().getScalarSizeInBits(); + assert(Log2_64(SEW) == N->getConstantOperandVal(6) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/N->getOperand(0), + /*IntID=*/ + DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT), + /*StoredVal=*/Val, + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/N->getOperand(4), + /*VL=*/N->getOperand(5)}; + + auto *OldMemSD = cast<MemIntrinsicSDNode>(N); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = OldMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList(MVT::Other); + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT, + MMO); + } } } case ISD::EXPERIMENTAL_VP_REVERSE: @@ -20843,6 +20869,68 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } + case RISCVISD::TUPLE_EXTRACT: { + EVT VT = N->getValueType(0); + SDValue Tuple = N->getOperand(0); + unsigned Idx = N->getConstantOperandVal(1); + if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN) + break; + + unsigned NF = 0; + switch (Tuple.getConstantOperandVal(1)) { + default: + break; + case Intrinsic::riscv_vlseg2_mask: + case Intrinsic::riscv_vlseg3_mask: + case Intrinsic::riscv_vlseg4_mask: + case Intrinsic::riscv_vlseg5_mask: + case 
Intrinsic::riscv_vlseg6_mask: + case Intrinsic::riscv_vlseg7_mask: + case Intrinsic::riscv_vlseg8_mask: + NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + break; + } + + if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF)) + break; + + unsigned SEW = VT.getScalarSizeInBits(); + assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/Tuple.getOperand(0), + /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT), + /*Passthru=*/Tuple.getOperand(2), + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/Tuple.getOperand(4), + /*VL=*/Tuple.getOperand(5), + /*Policy=*/Tuple.getOperand(6)}; + + auto *TupleMemSD = cast<MemIntrinsicSDNode>(Tuple); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = TupleMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList({VT, MVT::Other}); + SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, + Ops, MemVT, MMO); + DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1)); + return Result.getValue(0); + } + case RISCVISD::TUPLE_INSERT: { + // tuple_insert tuple, undef, idx -> tuple + if (N->getOperand(1).isUndef()) + return N->getOperand(0); + break; + } } return SDValue(); @@ -22367,6 +22455,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( case CallingConv::C: case CallingConv::Fast: case CallingConv::SPIR_KERNEL: + case CallingConv::PreserveMost: case CallingConv::GRAAL: case CallingConv::RISCV_VectorCall: #define CC_VLS_CASE(ABI_VLEN) case CallingConv::RISCV_VLSCall_##ABI_VLEN: @@ -24260,7 +24349,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalLoadStoreElementTypeForRVV(ScalarType)) + if (!isLegalElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a788c0b7..ca70c46 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -384,7 +384,6 @@ public: bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override; bool isLegalElementTypeForRVV(EVT ScalarTy) const; - bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const; bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index dd365cf..8297d50 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { +let IsSignExtendingOpW = 1 in def CLS : Unary_r<0b011000000011, 0b001, "cls">; def ABS : Unary_r<0b011000000111, 0b001, "abs">; } // Predicates = [HasStdExtP] @@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in { def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; +let IsSignExtendingOpW = 
1 in { def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; +} } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index f391300..5265613 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1120,27 +1120,11 @@ let Predicates = [HasVendorXqcisync, IsRV32] in { def QC_C_SYNCWF : QCIRVInst16CBSYNC<0b100, "qc.c.syncwf">; def QC_C_SYNCWL : QCIRVInst16CBSYNC<0b101, "qc.c.syncwl">; - let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in - def QC_C_DELAY : RVInst16CI<0b000, 0b10, (outs), - (ins uimm5nonzero:$imm), - "qc.c.delay", "$imm"> { - let Inst{12} = 0; - let Inst{11-7} = 0; - let Inst{6-2} = imm{4-0}; - } + // qc.c.delay implemented as an alias, below } // Predicates = [HasVendorXqcisync, IsRV32] let Predicates = [HasVendorXqcisim, IsRV32] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { - def QC_PSYSCALLI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm10:$imm10), - "qc.psyscalli", "$imm10"> { - bits<10> imm10; - - let rs1 = 0; - let rd = 0; - let imm12 = {0b00, imm10}; - } - def QC_PPUTCI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm8:$imm8), "qc.pputci", "$imm8"> { bits<8> imm8; @@ -1150,18 +1134,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { let imm12 = {0b0100, imm8}; } - def QC_PCOREDUMP : QCISim_NONE<0b0110, "qc.pcoredump">; - def QC_PPREGS : QCISim_NONE<0b0111, "qc.ppregs">; - def QC_PPREG : QCISim_RS1<0b1000, "qc.ppreg">; - def QC_PPUTC : QCISim_RS1<0b1001, "qc.pputc">; - def QC_PPUTS : QCISim_RS1<0b1010, "qc.pputs">; - def QC_PEXIT : QCISim_RS1<0b1011, "qc.pexit">; - def QC_PSYSCALL : QCISim_RS1<0b1100, "qc.psyscall">; - - def QC_C_PTRACE : RVInst16CI<0b000, 0b10, (outs), (ins), "qc.c.ptrace", ""> { - let rd = 0; - let imm = 0; - } + // The other instructions are all implemented as aliases, below } // mayLoad = 0, mayStore = 0, hasSideEffects = 1 } // Predicates = [HasVendorXqcisim, IsRV32] @@ -1218,6 +1191,27 @@ let EmitPriority = 0 in { } // EmitPriority = 0 } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasVendorXqcisim, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.ptrace", (C_SLLI X0, 0)>; + + def : InstAlias<"qc.psyscalli $imm", (SLTI X0, X0, uimm10:$imm)>; + def : InstAlias<"qc.pcoredump", (SLTI X0, X0, 1536)>; + def : InstAlias<"qc.ppregs", (SLTI X0, X0, 1792)>; + def : InstAlias<"qc.ppreg $rs1", (SLTI X0, GPR:$rs1, -2048)>; + def : InstAlias<"qc.pputc $rs1", (SLTI X0, GPR:$rs1, -1792)>; + def : InstAlias<"qc.pputs $rs1", (SLTI X0, GPR:$rs1, -1536)>; + def : InstAlias<"qc.pexit $rs1", (SLTI X0, GPR:$rs1, -1280)>; + def : InstAlias<"qc.psyscall $rs1", (SLTI X0, GPR:$rs1, -1024)>; +} // EmitPriority = 1 +} // Predicates = [HasVendorXqcisim, IsRV32] + +let Predicates = [HasVendorXqcisync, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.delay $imm", (C_SLLI X0, uimm5nonzero:$imm)>; +} +} // Predicates = [HasVendorXqcisync, IsRV32] + //===----------------------------------------------------------------------===// // Pseudo-instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 3cbe668..726920e 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -32,7 
+32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) || + if (!isLegalElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; @@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - // If the segment load is going to be performed segment at a time anyways - // and there's only one element used, use a strided load instead. This - // will be equally fast, and create less vector register pressure. - if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) { - unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // For rv64, need to truncate i64 to i32 to match signature. As VL is at most - // the number of active lanes (which is bounded by i32) this is safe. - VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, - {VTy, BasePtr->getType(), Stride->getType()}, - {BasePtr, Stride, Mask, VL}); - Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes); - CI->addParamAttr(0, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - Shuffles[0]->replaceAllUsesWith(CI); - return true; - }; - CallInst *VlsegN = Builder.CreateIntrinsic( FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); @@ -289,33 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - unsigned Index; - // If the segment store only has one active lane (i.e. the interleave is - // just a spread shuffle), we can use a strided store instead. This will - // be equally fast, and create less vector register pressure. - if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && - isSpreadMask(Mask, Factor, Index)) { - unsigned ScalarSizeInBytes = - DL.getTypeStoreSize(ShuffleVTy->getElementType()); - Value *Data = SVI->getOperand(0); - Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0)); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // For rv64, need to truncate i64 to i32 to match signature. As VL is at - // most the number of active lanes (which is bounded by i32) this is safe. 
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, - {VTy, BasePtr->getType(), Stride->getType()}, - {Data, BasePtr, Stride, LaneMask, VL}); - Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes); - CI->addParamAttr(1, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - return true; - } - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 5404123..7e58b6f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -68,6 +68,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { auto &Subtarget = MF->getSubtarget<RISCVSubtarget>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) return CSR_NoRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return Subtarget.hasStdExtE() ? CSR_RT_MostRegs_RVE_SaveList + : CSR_RT_MostRegs_SaveList; if (MF->getFunction().hasFnAttribute("interrupt")) { if (Subtarget.hasVInstructions()) { if (Subtarget.hasStdExtD()) @@ -573,6 +576,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int64_t Val = Offset.getFixed(); int64_t Lo12 = SignExtend64<12>(Val); unsigned Opc = MI.getOpcode(); + if (Opc == RISCV::ADDI && !isInt<12>(Val)) { // We chose to emit the canonical immediate sequence rather than folding // the offset into the using add under the theory that doing so doesn't @@ -585,6 +589,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); + } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + // MIPS Prefetch instructions require the offset to be 9 bits encoded. 
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || Opc == RISCV::PseudoRV32ZdinxSD) && Lo12 >= 2044) { @@ -811,7 +818,13 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF, if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; - switch (Subtarget.getTargetABI()) { + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (CC == CallingConv::PreserveMost) { + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return CSR_RT_MostRegs_RVE_RegMask; + return CSR_RT_MostRegs_RegMask; + } + switch (ABI) { default: llvm_unreachable("Unrecognized ABI"); case RISCVABI::ABI_ILP32E: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fd634b5..61dbd06 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1191,9 +1191,6 @@ static const CostTblEntry VectorIntrinsicCostTable[]{ {Intrinsic::roundeven, MVT::f64, 9}, {Intrinsic::rint, MVT::f32, 7}, {Intrinsic::rint, MVT::f64, 7}, - {Intrinsic::lrint, MVT::i32, 1}, - {Intrinsic::lrint, MVT::i64, 1}, - {Intrinsic::llrint, MVT::i64, 1}, {Intrinsic::nearbyint, MVT::f32, 9}, {Intrinsic::nearbyint, MVT::f64, 9}, {Intrinsic::bswap, MVT::i16, 3}, @@ -1251,11 +1248,48 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, switch (ICA.getID()) { case Intrinsic::lrint: case Intrinsic::llrint: - // We can't currently lower half or bfloat vector lrint/llrint. - if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]); - VecTy && VecTy->getElementType()->is16bitFPTy()) - return InstructionCost::getInvalid(); - [[fallthrough]]; + case Intrinsic::lround: + case Intrinsic::llround: { + auto LT = getTypeLegalizationCost(RetTy); + Type *SrcTy = ICA.getArgTypes().front(); + auto SrcLT = getTypeLegalizationCost(SrcTy); + if (ST->hasVInstructions() && LT.second.isVector()) { + SmallVector<unsigned, 2> Ops; + unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType()); + unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType()); + if (LT.second.getVectorElementType() == MVT::bf16) { + if (!ST->hasVInstructionsBF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V}; + } else if (LT.second.getVectorElementType() == MVT::f16 && + !ST->hasVInstructionsF16()) { + if (!ST->hasVInstructionsF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V}; + + } else if (SrcEltSz > DstEltSz) { + Ops = {RISCV::VFNCVT_X_F_W}; + } else if (SrcEltSz < DstEltSz) { + Ops = {RISCV::VFWCVT_X_F_V}; + } else { + Ops = {RISCV::VFCVT_X_F_V}; + } + + // We need to use the source LMUL in the case of a narrowing op, and the + // destination LMUL otherwise. 
+ if (SrcEltSz > DstEltSz) + return SrcLT.first * + getRISCVInstructionCost(Ops, SrcLT.second, CostKind); + return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind); + } + break; + } case Intrinsic::ceil: case Intrinsic::floor: case Intrinsic::trunc: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index f0510ec..d62d99c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -265,7 +265,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment, @@ -297,7 +297,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) const override { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 5cda6a0..7505507 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -74,17 +74,20 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - // We expect the codegen to avoid doing implicit bitcast from a load. - assert(TargetType->getElementType() == SourceType->getElementType()); - assert(TargetType->getNumElements() < SourceType->getNumElements()); - + assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); + Value *AssignValue = NewLoad; + if (TargetType->getElementType() != SourceType->getElementType()) { + AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, + {TargetType, SourceType}, {NewLoad}); + buildAssignType(B, TargetType, AssignValue); + } SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; - Value *Output = B.CreateShuffleVector(NewLoad, NewLoad, Mask); + Value *Output = B.CreateShuffleVector(AssignValue, AssignValue, Mask); buildAssignType(B, TargetType, Output); return Output; } @@ -135,8 +138,9 @@ class SPIRVLegalizePointerCast : public FunctionPass { Output = loadFirstValueFromAggregate(B, SVT->getElementType(), OriginalOperand, LI); } - // Destination is a smaller vector than source. + // Destination is a smaller vector than source or different vector type. // - float3 v3 = vector4; + // - float4 v2 = int4; else if (SVT && DVT) Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand); // Destination is the scalar type stored at the start of an aggregate. 
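For context, an illustrative sketch (not taken from the patch) of how a front end reaches the new G_IS_FPCLASS handling in the SPIRVLegalizerInfo change below: clang's __builtin_isfpclass lowers to the llvm.is.fpclass intrinsic, which GlobalISel translates to G_IS_FPCLASS. The helper name and the literal mask value (assumed to be fcSNan | fcQNan) are illustrative only.

// Illustrative only: an is-NaN test expressed through llvm.is.fpclass. The
// custom legalization added below expands the resulting G_IS_FPCLASS into
// integer compares (abs(V) u> inf for the fcNan case) with SPIR-V types
// attached to every generated instruction.
bool is_any_nan(float x) {
  return __builtin_isfpclass(x, 3 /* assumed mask: fcSNan | fcQNan */);
}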
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 721f64a..1995e0f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -335,6 +335,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal(); } + getActionDefinitionsBuilder(G_IS_FPCLASS).custom(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -355,9 +357,14 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpvType, bool SPIRVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { - auto Opc = MI.getOpcode(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - if (Opc == TargetOpcode::G_ICMP) { + switch (MI.getOpcode()) { + default: + // TODO: implement legalization for other opcodes. + return true; + case TargetOpcode::G_IS_FPCLASS: + return legalizeIsFPClass(Helper, MI, LocObserver); + case TargetOpcode::G_ICMP: { assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg())); auto &Op0 = MI.getOperand(2); auto &Op1 = MI.getOperand(3); @@ -378,6 +385,238 @@ bool SPIRVLegalizerInfo::legalizeCustom( } return true; } - // TODO: implement legalization for other opcodes. + } +} + +// Note this code was copied from LegalizerHelper::lowerISFPCLASS and adjusted +// to ensure that all instructions created during the lowering have SPIR-V types +// assigned to them. +bool SPIRVLegalizerInfo::legalizeIsFPClass( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm()); + + auto &MIRBuilder = Helper.MIRBuilder; + auto &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Type *LLVMDstTy = + IntegerType::get(MIRBuilder.getContext(), DstTy.getScalarSizeInBits()); + if (DstTy.isVector()) + LLVMDstTy = VectorType::get(LLVMDstTy, DstTy.getElementCount()); + SPIRVType *SPIRVDstTy = GR->getOrCreateSPIRVType( + LLVMDstTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + unsigned BitSize = SrcTy.getScalarSizeInBits(); + const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType()); + + LLT IntTy = LLT::scalar(BitSize); + Type *LLVMIntTy = IntegerType::get(MIRBuilder.getContext(), BitSize); + if (SrcTy.isVector()) { + IntTy = LLT::vector(SrcTy.getElementCount(), IntTy); + LLVMIntTy = VectorType::get(LLVMIntTy, SrcTy.getElementCount()); + } + SPIRVType *SPIRVIntTy = GR->getOrCreateSPIRVType( + LLVMIntTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + // Clang doesn't support capture of structured bindings: + LLT DstTyCopy = DstTy; + const auto assignSPIRVTy = [&](MachineInstrBuilder &&MI) { + // Assign this MI's (assumed only) destination to one of the two types we + // expect: either the G_IS_FPCLASS's destination type, or the integer type + // bitcast from the source type. + LLT MITy = MRI.getType(MI.getReg(0)); + assert((MITy == IntTy || MITy == DstTyCopy) && + "Unexpected LLT type while lowering G_IS_FPCLASS"); + auto *SPVTy = MITy == IntTy ? 
SPIRVIntTy : SPIRVDstTy; + GR->assignSPIRVTypeToVReg(SPVTy, MI.getReg(0), MF); + return MI; + }; + + // Helper to build and assign a constant in one go + const auto buildSPIRVConstant = [&](LLT Ty, auto &&C) -> MachineInstrBuilder { + if (!Ty.isFixedVector()) + return assignSPIRVTy(MIRBuilder.buildConstant(Ty, C)); + auto ScalarC = MIRBuilder.buildConstant(Ty.getScalarType(), C); + assert((Ty == IntTy || Ty == DstTyCopy) && + "Unexpected LLT type while lowering constant for G_IS_FPCLASS"); + SPIRVType *VecEltTy = GR->getOrCreateSPIRVType( + (Ty == IntTy ? LLVMIntTy : LLVMDstTy)->getScalarType(), MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + GR->assignSPIRVTypeToVReg(VecEltTy, ScalarC.getReg(0), MF); + return assignSPIRVTy(MIRBuilder.buildSplatBuildVector(Ty, ScalarC)); + }; + + if (Mask == fcNone) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 0)); + MI.eraseFromParent(); + return true; + } + if (Mask == fcAllFlags) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 1)); + MI.eraseFromParent(); + return true; + } + + // Note that rather than creating a COPY here (between a floating-point and + // integer type of the same size) we create a SPIR-V bitcast immediately. We + // can't create a G_BITCAST because the LLTs are the same, and we can't seem + // to correctly lower COPYs to SPIR-V bitcasts at this moment. + Register ResVReg = MRI.createGenericVirtualRegister(IntTy); + MRI.setRegClass(ResVReg, GR->getRegClass(SPIRVIntTy)); + GR->assignSPIRVTypeToVReg(SPIRVIntTy, ResVReg, Helper.MIRBuilder.getMF()); + auto AsInt = MIRBuilder.buildInstr(SPIRV::OpBitcast) + .addDef(ResVReg) + .addUse(GR->getSPIRVTypeID(SPIRVIntTy)) + .addUse(SrcReg); + AsInt = assignSPIRVTy(std::move(AsInt)); + + // Various masks. + APInt SignBit = APInt::getSignMask(BitSize); + APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign. + APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit. + APInt ExpMask = Inf; + APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; + APInt QNaNBitMask = + APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); + APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); + + auto SignBitC = buildSPIRVConstant(IntTy, SignBit); + auto ValueMaskC = buildSPIRVConstant(IntTy, ValueMask); + auto InfC = buildSPIRVConstant(IntTy, Inf); + auto ExpMaskC = buildSPIRVConstant(IntTy, ExpMask); + auto ZeroC = buildSPIRVConstant(IntTy, 0); + + auto Abs = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC)); + auto Sign = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs)); + + auto Res = buildSPIRVConstant(DstTy, 0); + + const auto appendToRes = [&](MachineInstrBuilder &&ToAppend) { + Res = assignSPIRVTy( + MIRBuilder.buildOr(DstTyCopy, Res, assignSPIRVTy(std::move(ToAppend)))); + }; + + // Tests that involve more than one class should be processed first. 
+ if ((Mask & fcFinite) == fcFinite) { + // finite(V) ==> abs(V) u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs, + ExpMaskC)); + Mask &= ~fcFinite; + } else if ((Mask & fcFinite) == fcPosFinite) { + // finite(V) && V > 0 ==> V u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt, + ExpMaskC)); + Mask &= ~fcPosFinite; + } else if ((Mask & fcFinite) == fcNegFinite) { + // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1 + auto Cmp = assignSPIRVTy(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, + DstTy, Abs, ExpMaskC)); + appendToRes(MIRBuilder.buildAnd(DstTy, Cmp, Sign)); + Mask &= ~fcNegFinite; + } + + if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) { + // fcZero | fcSubnormal => test all exponent bits are 0 + // TODO: Handle sign bit specific cases + // TODO: Handle inverted case + if (PartialCheck == (fcZero | fcSubnormal)) { + auto ExpBits = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC)); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + ExpBits, ZeroC)); + Mask &= ~PartialCheck; + } + } + + // Check for individual classes. + if (FPClassTest PartialCheck = Mask & fcZero) { + if (PartialCheck == fcPosZero) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, ZeroC)); + else if (PartialCheck == fcZero) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC)); + else // fcNegZero + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, SignBitC)); + } + + if (FPClassTest PartialCheck = Mask & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) + auto V = (PartialCheck == fcPosSubnormal) ? 
AsInt : Abs; + auto OneC = buildSPIRVConstant(IntTy, 1); + auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); + auto SubnormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, + buildSPIRVConstant(IntTy, AllOneMantissa))); + if (PartialCheck == fcNegSubnormal) + SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); + appendToRes(std::move(SubnormalRes)); + } + + if (FPClassTest PartialCheck = Mask & fcInf) { + if (PartialCheck == fcPosInf) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, InfC)); + else if (PartialCheck == fcInf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC)); + else { // fcNegInf + APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt(); + auto NegInfC = buildSPIRVConstant(IntTy, NegInf); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, NegInfC)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNan) { + auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask); + if (PartialCheck == fcNan) { + // isnan(V) ==> abs(V) u> int(inf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + } else if (PartialCheck == fcQNan) { + // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs, + InfWithQnanBitC)); + } else { // fcSNan + // issignaling(V) ==> abs(V) u> unsigned(Inf) && + // abs(V) u< (unsigned(Inf) | quiet_bit) + auto IsNan = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + auto IsNotQnan = assignSPIRVTy(MIRBuilder.buildICmp( + CmpInst::Predicate::ICMP_ULT, DstTy, Abs, InfWithQnanBitC)); + appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNormal) { + // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u< + // (max_exp-1)) + APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); + auto ExpMinusOne = assignSPIRVTy( + MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB))); + APInt MaxExpMinusOne = ExpMask - ExpLSB; + auto NormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne, + buildSPIRVConstant(IntTy, MaxExpMinusOne))); + if (PartialCheck == fcNegNormal) + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign); + else if (PartialCheck == fcPosNormal) { + auto PosSign = assignSPIRVTy(MIRBuilder.buildXor( + DstTy, Sign, buildSPIRVConstant(DstTy, InversionMask))); + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign); + } + appendToRes(std::move(NormalRes)); + } + + MIRBuilder.buildCopy(DstReg, Res); + MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 6335f21..eeefa42 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -30,6 +30,10 @@ public: bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override; SPIRVLegalizerInfo(const SPIRVSubtarget &ST); + +private: + bool legalizeIsFPClass(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const; }; } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index cd434f7..3f80b2a 100644 --- 
a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N, return SDValue(); } -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::MUL); +static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (VT != MVT::v8i32 && VT != MVT::v16i32) return SDValue(); @@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performMulCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N->getOpcode() == ISD::MUL); + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + if (auto Res = TryWideExtMulCombine(N, DCI.DAG)) + return Res; + + // We don't natively support v16i8 mul, but we do support v8i16 so split the + // inputs and extend them to v8i16. Only do this before legalization in case + // a narrow vector is widened and may be simplified later. + if (!DCI.isBeforeLegalize() || VT != MVT::v16i8) + return SDValue(); + + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue LowLHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS); + SDValue HighLHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS); + SDValue LowRHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS); + SDValue HighRHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS); + + SDValue MulLow = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS)); + SDValue MulHigh = DAG.getBitcast( + VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS)); + + // Take the low byte of each lane. + return DAG.getVectorShuffle( + VT, DL, MulLow, MulHigh, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performLowerPartialReduction(N, DCI.DAG); } case ISD::MUL: - return performMulCombine(N, DCI.DAG); + return performMulCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 28f6599..c3990d1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { for (Instruction &I : BB) { if (I.getType()->isVoidTy()) continue; + + if (isa<AllocaInst>(&I)) { + // If the alloca has any lifetime marker that is no longer dominated + // by the alloca, remove all lifetime markers. Lifetime markers must + // always work directly on the alloca, and this is no longer possible. 
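Looping back to the v16i8 multiply split added to performMulCombine in the WebAssemblyISelLowering hunk above: the unsigned extends are sufficient because only the low byte of each widened product is kept. A scalar sketch (function name is made up):

#include <cstdint>

void mulV16I8(const uint8_t A[16], const uint8_t B[16], uint8_t Out[16]) {
  for (int I = 0; I < 16; ++I) {
    // Widen to 16 bits, multiply, truncate. The low 8 bits of the product do
    // not depend on whether the inputs were sign- or zero-extended.
    uint16_t P = uint16_t(A[I]) * uint16_t(B[I]);
    Out[I] = uint8_t(P);
  }
}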
+ bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) { + auto *UserI = cast<Instruction>(U); + return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI); + }); + if (HasNonDominatedLifetimeMarker) { + for (User *U : make_early_inc_range(I.users())) { + auto *UserI = cast<Instruction>(U); + if (UserI->isLifetimeStartOrEnd()) + UserI->eraseFromParent(); + } + } + } + unsigned VarID = SSA.AddVariable(I.getName(), I.getType()); // If a value is defined by an invoke instruction, it is only available in // its normal destination and not in its unwind destination. @@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // Setjmp preparation + SmallVector<AllocaInst *> StaticAllocas; + for (Instruction &I : F.getEntryBlock()) + if (auto *AI = dyn_cast<AllocaInst>(&I)) + if (AI->isStaticAlloca()) + StaticAllocas.push_back(AI); + BasicBlock *Entry = &F.getEntryBlock(); DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram()); SplitBlock(Entry, &*Entry->getFirstInsertionPt()); + // Move static allocas back into the entry block, so they stay static. + for (AllocaInst *AI : StaticAllocas) + AI->moveBefore(Entry->getTerminator()->getIterator()); + IRB.SetInsertPoint(Entry->getTerminator()->getIterator()); // This alloca'ed pointer is used by the runtime to identify function // invocations. It's just for pointer comparisons. It will never be diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 1bf9f8b..f9bd233 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -104,6 +104,7 @@ add_llvm_target(X86CodeGen ${sources} IRPrinter Instrumentation MC + ObjCARC ProfileData Scalar SelectionDAG diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 11ab8dc..7244a6d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58074,11 +58074,9 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { // res, flags2 = sub 0, (and X, Y) // cload/cstore ..., cond_ne, flag2 // -> - // res, flags2 = and X, Y + // res, flags2 = cmp (and X, Y), 0 // cload/cstore ..., cond_ne, flag2 - Ops[4] = DAG.getNode(X86ISD::AND, DL, Sub->getVTList(), Op1.getOperand(0), - Op1.getOperand(1)) - .getValue(1); + Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Op1, Sub.getOperand(0)); } else { return SDValue(); } diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 64b33e4..ab906f9 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1568,7 +1568,7 @@ private: if (DebugLoc SuspendLoc = S->getDebugLoc()) { std::string LabelName = ("__coro_resume_" + Twine(SuspendIndex)).str(); - DILocation &DILoc = *SuspendLoc.get(); + DILocation &DILoc = *SuspendLoc; DILabel *ResumeLabel = DBuilder.createLabel(DIS, LabelName, DILoc.getFile(), SuspendLoc.getLine(), SuspendLoc.getCol(), diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index f43202e..8262c8c 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes, struct SCCNodesResult { SCCNodeSet SCCNodes; - bool HasUnknownCall; }; } // end anonymous namespace @@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes, static SCCNodesResult 
createSCCNodeSet(ArrayRef<Function *> Functions) { SCCNodesResult Res; - Res.HasUnknownCall = false; for (Function *F : Functions) { if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) || F->isPresplitCoroutine()) { - // Treat any function we're trying not to optimize as if it were an - // indirect call and omit it from the node set used below. - Res.HasUnknownCall = true; + // Omit any functions we're trying not to optimize from the set. continue; } - // Track whether any functions in this SCC have an unknown call edge. - // Note: if this is ever a performance hit, we can common it with - // subsequent routines which also do scans over the instructions of the - // function. - if (!Res.HasUnknownCall) { - for (Instruction &I : instructions(*F)) { - if (auto *CB = dyn_cast<CallBase>(&I)) { - if (!CB->getCalledFunction()) { - Res.HasUnknownCall = true; - break; - } - } - } - } + Res.SCCNodes.insert(F); } return Res; @@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, addColdAttrs(Nodes.SCCNodes, Changed); addWillReturn(Nodes.SCCNodes, Changed); addNoUndefAttrs(Nodes.SCCNodes, Changed); - - // If we have no external nodes participating in the SCC, we can deduce some - // more precise attributes as well. - if (!Nodes.HasUnknownCall) { - addNoAliasAttrs(Nodes.SCCNodes, Changed); - addNonNullAttrs(Nodes.SCCNodes, Changed); - inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); - addNoRecurseAttrs(Nodes.SCCNodes, Changed); - } + addNoAliasAttrs(Nodes.SCCNodes, Changed); + addNonNullAttrs(Nodes.SCCNodes, Changed); + inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); + addNoRecurseAttrs(Nodes.SCCNodes, Changed); // Finally, infer the maximal set of attributes from the ones we've inferred // above. This is handling the cases where one attribute on a signature diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index f00796f..c009c1e 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); +STATISTIC(MismatchedCloneAssignments, + "Number of callsites assigned to call multiple non-matching clones"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -730,7 +732,7 @@ private: /// of the functions tracked calls to their new versions in the CallMap. /// Assigns new clones to clone number CloneNo. 
FuncInfo cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); @@ -897,7 +899,7 @@ private: CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map<CallInfo, CallInfo> &CallMap, + DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, @@ -989,7 +991,7 @@ private: CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map<CallInfo, CallInfo> &CallMap, + DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, @@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) { return F.getName().contains(MemProfCloneSuffix); } +// Return the clone number of the given function by extracting it from the +// memprof suffix. Assumes the caller has already confirmed it is a memprof +// clone. +static unsigned getMemProfCloneNum(const Function &F) { + assert(isMemProfClone(F)); + auto Pos = F.getName().find_last_of('.'); + assert(Pos > 0); + unsigned CloneNo; + bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + assert(!Err); + (void)Err; + return CloneNo; +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - if (CalleeFunc.cloneNo() > 0) + auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + if (isMemProfClone(*CurF)) { + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + auto CurCalleeCloneNo = getMemProfCloneNum(*CurF); + if (CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + } + if (NewCalleeCloneNo > 0) cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); OREGetter(CallerCall.call()->getFunction()) .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) @@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, assert(CI && "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); - CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; + // If we already assigned this callsite to call a specific non-default + // clone (i.e. 
not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + CurCalleeCloneNo = NewCalleeCloneNo; } // Update the debug information attached to NewFunc to use the clone Name. Note @@ -4019,7 +4062,7 @@ static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) { CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo ModuleCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { // Use existing LLVM facilities for cloning and obtaining Call in clone ValueToValueMapTy VMap; @@ -4042,7 +4085,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite( CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo IndexCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { // Check how many clones we have of Call (and therefore function). // The next clone number is the current size of versions array. @@ -4463,7 +4506,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncInfo FuncClone; // Remappings of each call of interest (from original uncloned call to the // corresponding cloned call in this function clone). - std::map<CallInfo, CallInfo> CallMap; + DenseMap<CallInfo, CallInfo> CallMap; }; // Walk all functions for which we saw calls with memprof metadata, and handle @@ -4499,7 +4542,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; assert(FuncCloneInfos.size() > FuncClone.cloneNo()); - std::map<CallInfo, CallInfo> &CallMap = + DenseMap<CallInfo, CallInfo> &CallMap = FuncCloneInfos[FuncClone.cloneNo()].CallMap; CallInfo CallClone(Call); if (auto It = CallMap.find(Call); It != CallMap.end()) @@ -4551,7 +4594,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { })); // Initialize with empty call map, assign Clone to original function // and its callers, and skip to the next clone. - FuncCloneInfos.push_back({OrigFunc, {}}); + FuncCloneInfos.push_back( + {OrigFunc, DenseMap<CallInfo, CallInfo>()}); AssignCallsiteCloneToFuncClone( OrigFunc, Call, Clone, AllocationCallToContextNodeMap.count(Call)); @@ -4584,7 +4628,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // Clone function and save it along with the CallInfo map created // during cloning in the FuncCloneInfos. - std::map<CallInfo, CallInfo> NewCallMap; + DenseMap<CallInfo, CallInfo> NewCallMap; unsigned CloneNo = FuncCloneInfos.size(); assert(CloneNo > 0 && "Clone 0 is the original function, which " "should already exist in the map"); @@ -4691,7 +4735,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // CallMap is set up as indexed by original Call at clone 0. 
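For reference, the clone-number extraction these mismatch checks rely on amounts to parsing the integer after the last '.' in the clone name. A sketch with plain std::string, assuming the "<original>.memprof.<N>" naming implied by MemProfCloneSuffix (the helper name here is hypothetical):

#include <string>

unsigned cloneNumFromName(const std::string &Name) {
  // e.g. "foo.memprof.3" -> 3; callers are expected to have verified that a
  // memprof clone suffix is present before calling this.
  auto Pos = Name.find_last_of('.');
  return static_cast<unsigned>(std::stoul(Name.substr(Pos + 1)));
}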
CallInfo OrigCall(Callee->getOrigNode()->Call); OrigCall.setCloneNo(0); - std::map<CallInfo, CallInfo> &CallMap = + DenseMap<CallInfo, CallInfo> &CallMap = FuncCloneInfos[NewFuncClone.cloneNo()].CallMap; assert(CallMap.count(OrigCall)); CallInfo NewCall(CallMap[OrigCall]); @@ -4714,6 +4758,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // where the callers were assigned to different clones of a function. } + auto FindFirstAvailFuncClone = [&]() { + // Find first function in FuncCloneInfos without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncCloneInfos size was smaller than the clone number. + for (auto &CF : FuncCloneInfos) { + if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone)) + return CF.FuncClone; + } + llvm_unreachable( + "Expected an available func clone for this callsite clone"); + }; + // See if we can use existing function clone. Walk through // all caller edges to see if any have already been assigned to // a clone of this callsite's function. If we can use it, do so. If not, @@ -4830,16 +4887,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // clone of OrigFunc for another caller during this iteration over // its caller edges. if (!FuncCloneAssignedToCurCallsiteClone) { - // Find first function in FuncCloneInfos without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncCloneInfos size was smaller than the clone number. - for (auto &CF : FuncCloneInfos) { - if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone)) { - FuncCloneAssignedToCurCallsiteClone = CF.FuncClone; - break; - } - } + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); assert(FuncCloneAssignedToCurCallsiteClone); // Assign Clone to FuncCloneAssignedToCurCallsiteClone AssignCallsiteCloneToFuncClone( @@ -4853,6 +4901,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncCloneAssignedToCurCallsiteClone); } } + // If we didn't assign a function clone to this callsite clone yet, e.g. + // none of its callers has a non-null call, do the assignment here. + // We want to ensure that every callsite clone is assigned to some + // function clone, so that the call updates below work as expected. + // In particular if this is the original callsite, we want to ensure it + // is assigned to the original function, otherwise the original function + // will appear available for assignment to other callsite clones, + // leading to unintended effects. For one, the unknown and not updated + // callers will call into cloned paths leading to the wrong hints, + // because they still call the original function (clone 0). Also, + // because all callsites start out as being clone 0 by default, we can't + // easily distinguish between callsites explicitly assigned to clone 0 + // vs those never assigned, which can lead to multiple updates of the + // calls when invoking updateCall below, with mismatched clone values. + // TODO: Add a flag to the callsite nodes or some other mechanism to + // better distinguish and identify callsite clones that are not getting + // assigned to function clones as expected. 
+ if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); + assert(FuncCloneAssignedToCurCallsiteClone && + "No available func clone for this callsite clone"); + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call)); + } } if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(Node); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b231c04..d7971e8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -11,10 +11,13 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/FloatingPointPredicateUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" @@ -3589,6 +3592,154 @@ static Value *foldOrOfInversions(BinaryOperator &I, return nullptr; } +/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl" +/// patterns, which extract vector elements and pack them in the same relative +/// positions. +/// +/// \p Vec is the underlying vector being extracted from. +/// \p Mask is a bitmask identifying which packed elements are obtained from the +/// vector. +/// \p VecOffset is the vector element corresponding to index 0 of the +/// mask. +static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec, + int64_t &VecOffset, + SmallBitVector &Mask, + const DataLayout &DL) { + static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) { + ShlAmt = 0; + return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base); + }; + + // First try to match extractelement -> zext -> shl + uint64_t VecIdx, ShlAmt; + if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt( + m_Value(Vec), m_ConstantInt(VecIdx))), + ShlAmt))) { + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + auto *EltTy = dyn_cast<IntegerType>(VecTy->getElementType()); + if (!EltTy) + return false; + + const unsigned EltBitWidth = EltTy->getBitWidth(); + const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth(); + if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0) + return false; + const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth; + const unsigned ShlEltAmt = ShlAmt / EltBitWidth; + + const unsigned MaskIdx = + DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1; + + VecOffset = static_cast<int64_t>(VecIdx) - static_cast<int64_t>(MaskIdx); + Mask.resize(TargetEltWidth); + Mask.set(MaskIdx); + return true; + } + + // Now try to match a bitcasted subvector. + Instruction *SrcVecI; + if (!match(V, m_BitCast(m_Instruction(SrcVecI)))) + return false; + + auto *SrcTy = dyn_cast<FixedVectorType>(SrcVecI->getType()); + if (!SrcTy) + return false; + + Mask.resize(SrcTy->getNumElements()); + + // First check for a subvector obtained from a shufflevector. 
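To see why the shift amount determines the mask index in the extractelement -> zext -> shl case matched above, here is the packing pattern written out for a little-endian target (a standalone sketch, not InstCombine code):

#include <cstdint>
#include <cstring>

uint32_t packBytesShift(const uint8_t Elts[4]) {
  uint32_t Packed = 0;
  for (int I = 0; I < 4; ++I)
    Packed |= uint32_t(Elts[I]) << (8 * I);   // shl amount / 8 selects the slot
  return Packed;
}

// On little-endian the result is bit-identical to reinterpreting the four
// bytes as one 32-bit integer, which is what the bitcast form expresses.
uint32_t packBytesBitcast(const uint8_t Elts[4]) {
  uint32_t Packed;
  std::memcpy(&Packed, Elts, sizeof Packed);
  return Packed;
}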
+ if (isa<ShuffleVectorInst>(SrcVecI)) { + Constant *ConstVec; + ArrayRef<int> ShuffleMask; + if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec), + m_Mask(ShuffleMask)))) + return false; + + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + + const unsigned NumVecElts = VecTy->getNumElements(); + bool FoundVecOffset = false; + for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) { + if (ShuffleMask[Idx] == PoisonMaskElem) + return false; + const unsigned ShuffleIdx = ShuffleMask[Idx]; + if (ShuffleIdx >= NumVecElts) { + const unsigned ConstIdx = ShuffleIdx - NumVecElts; + auto *ConstElt = + dyn_cast<ConstantInt>(ConstVec->getAggregateElement(ConstIdx)); + if (!ConstElt || !ConstElt->isNullValue()) + return false; + continue; + } + + if (FoundVecOffset) { + if (VecOffset + Idx != ShuffleIdx) + return false; + } else { + if (ShuffleIdx < Idx) + return false; + VecOffset = ShuffleIdx - Idx; + FoundVecOffset = true; + } + Mask.set(Idx); + } + return FoundVecOffset; + } + + // Check for a subvector obtained as an (insertelement V, 0, idx) + uint64_t InsertIdx; + if (!match(SrcVecI, + m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx)))) + return false; + + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + VecOffset = 0; + bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx); + Mask.set(); + if (!AlreadyInsertedMaskedElt) + Mask.reset(InsertIdx); + return true; +} + +/// Try to fold the join of two scalar integers whose contents are packed +/// elements of the same vector. +static Instruction *foldIntegerPackFromVector(Instruction &I, + InstCombiner::BuilderTy &Builder, + const DataLayout &DL) { + assert(I.getOpcode() == Instruction::Or); + Value *LhsVec, *RhsVec; + int64_t LhsVecOffset, RhsVecOffset; + SmallBitVector Mask; + if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset, + Mask, DL)) + return nullptr; + if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset, + Mask, DL)) + return nullptr; + if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset) + return nullptr; + + // Convert into shufflevector -> bitcast; + const unsigned ZeroVecIdx = + cast<FixedVectorType>(LhsVec->getType())->getNumElements(); + SmallVector<int> ShuffleMask(Mask.size(), ZeroVecIdx); + for (unsigned Idx : Mask.set_bits()) { + assert(LhsVecOffset + Idx >= 0); + ShuffleMask[Idx] = LhsVecOffset + Idx; + } + + Value *MaskedVec = Builder.CreateShuffleVector( + LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask, + I.getName() + ".v"); + return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType()); +} + // A decomposition of ((X & Mask) * Factor). The NUW / NSW bools // track these properities for preservation. Note that we can decompose // equivalent select form of this expression (e.g. (!(X & Mask) ? 
0 : Mask * @@ -3766,6 +3917,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *X = foldComplexAndOrPatterns(I, Builder)) return X; + if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL)) + return X; + // (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D // (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C if (Value *V = foldOrOfInversions(I, Builder)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index da9b126..b268fea 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -163,6 +163,11 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( LaterIndices.push_back(IdxVal); } + Value *Idx = GEP->getOperand(2); + // If the index type is non-canonical, wait for it to be canonicalized. + if (Idx->getType() != DL.getIndexType(GEP->getType())) + return nullptr; + enum { Overdefined = -3, Undefined = -2 }; // Variables for our state machines. @@ -290,17 +295,6 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Now that we've scanned the entire array, emit our new comparison(s). We // order the state machines in complexity of the generated code. - Value *Idx = GEP->getOperand(2); - - // If the index is larger than the pointer offset size of the target, truncate - // the index down like the GEP would do implicitly. We don't have to do this - // for an inbounds GEP because the index can't be out of range. - if (!GEP->isInBounds()) { - Type *PtrIdxTy = DL.getIndexType(GEP->getType()); - unsigned OffsetSize = PtrIdxTy->getIntegerBitWidth(); - if (Idx->getType()->getPrimitiveSizeInBits().getFixedValue() > OffsetSize) - Idx = Builder.CreateTrunc(Idx, PtrIdxTy); - } // If inbounds keyword is not present, Idx * ElementSize can overflow. // Let's assume that ElementSize is 2 and the wanted value is at offset 0. diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index e2a9255..9e33320 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2777,6 +2777,12 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, Indices.append(GEP.idx_begin()+1, GEP.idx_end()); } + // Don't create GEPs with more than one variable index. + unsigned NumVarIndices = + count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); }); + if (NumVarIndices > 1) + return nullptr; + if (!Indices.empty()) return replaceInstUsesWith( GEP, Builder.CreateGEP( @@ -3199,6 +3205,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, NewGEP); } + // Strip trailing zero indices. + auto *LastIdx = dyn_cast<Constant>(Indices.back()); + if (LastIdx && LastIdx->isNullValue() && !LastIdx->getType()->isVectorTy()) { + return replaceInstUsesWith( + GEP, Builder.CreateGEP(GEP.getSourceElementType(), PtrOp, + drop_end(Indices), "", GEP.getNoWrapFlags())); + } + // Scalarize vector operands; prefer splat-of-gep.as canonical form. // Note that this looses information about undef lanes; we run it after // demanded bits to partially mitigate that loss. 
@@ -3225,6 +3239,30 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, Res); } + bool SeenVarIndex = false; + for (auto [IdxNum, Idx] : enumerate(Indices)) { + if (isa<Constant>(Idx)) + continue; + + if (!SeenVarIndex) { + SeenVarIndex = true; + continue; + } + + // GEP has multiple variable indices: Split it. + ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum); + Value *FrontGEP = + Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices, + GEP.getName() + ".split", GEP.getNoWrapFlags()); + + SmallVector<Value *> BackIndices; + BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy)); + append_range(BackIndices, drop_begin(Indices, IdxNum)); + return GetElementPtrInst::Create( + GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP, + BackIndices, GEP.getNoWrapFlags()); + } + // Check to see if the inputs to the PHI node are getelementptr instructions. if (auto *PN = dyn_cast<PHINode>(PtrOp)) { if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder)) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index df31f07..54d9a83 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + // Approximately handle AVX Galois Field Affine Transformation + // + // e.g., + // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) + // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) + // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8) + // Out A x b + // where A and x are packed matrices, b is a vector, + // Out = A * x + b in GF(2) + // + // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix + // computation also includes a parity calculation. + // + // For the bitwise AND of bits V1 and V2, the exact shadow is: + // Out_Shadow = (V1_Shadow & V2_Shadow) + // | (V1 & V2_Shadow) + // | (V1_Shadow & V2 ) + // + // We approximate the shadow of gf2p8affineqb using: + // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0) + // | gf2p8affineqb(x, A_shadow, 0) + // | gf2p8affineqb(x_Shadow, A, 0) + // | set1_epi8(b_Shadow) + // + // This approximation has false negatives: if an intermediate dot-product + // contains an even number of 1's, the parity is 0. + // It has no false positives. 
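The approximation above starts from the exact shadow rule for a bitwise AND quoted in the comment. As a standalone sketch, with a set shadow bit meaning "uninitialized":

#include <cstdint>

uint8_t andShadow(uint8_t V1, uint8_t S1, uint8_t V2, uint8_t S2) {
  // A result bit is clean only when both operand bits are clean, or when one
  // operand bit is a *known* zero (value 0, shadow 0) that forces the result.
  return (S1 & S2) | (V1 & S2) | (S1 & V2);
}

The gf2p8affineqb-based formula applies this per dot-product but loses the parity information, which is where the false negatives mentioned above come from.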
+ void handleAVXGF2P8Affine(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + assert(I.arg_size() == 3); + Value *A = I.getOperand(0); + Value *X = I.getOperand(1); + Value *B = I.getOperand(2); + + assert(isFixedIntVector(A)); + assert(cast<VectorType>(A->getType()) + ->getElementType() + ->getScalarSizeInBits() == 8); + + assert(A->getType() == X->getType()); + + assert(B->getType()->isIntegerTy()); + assert(B->getType()->getScalarSizeInBits() == 8); + + assert(I.getType() == A->getType()); + + Value *AShadow = getShadow(A); + Value *XShadow = getShadow(X); + Value *BZeroShadow = getCleanShadow(B); + + CallInst *AShadowXShadow = IRB.CreateIntrinsic( + I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow}); + CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {X, AShadow, BZeroShadow}); + CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {XShadow, A, BZeroShadow}); + + unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements(); + Value *BShadow = getShadow(B); + Value *BBroadcastShadow = getCleanShadow(AShadow); + // There is no LLVM IR intrinsic for _mm512_set1_epi8. + // This loop generates a lot of LLVM IR, which we expect that CodeGen will + // lower appropriately (e.g., VPBROADCASTB). + // Besides, b is often a constant, in which case it is fully initialized. + for (unsigned i = 0; i < NumElements; i++) + BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i); + + setShadow(&I, IRB.CreateOr( + {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow})); + setOriginForNaryOp(I); + } + // Handle Arm NEON vector load intrinsics (vld*). // // The WithLane instructions (ld[234]lane) are similar to: @@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + // AVX Galois Field New Instructions + case Intrinsic::x86_vgf2p8affineqb_128: + case Intrinsic::x86_vgf2p8affineqb_256: + case Intrinsic::x86_vgf2p8affineqb_512: + handleAVXGF2P8Affine(I); + break; + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 0f63ed0..9b87180 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1360,13 +1360,10 @@ struct DSEState { /// indicating whether \p I is a free-like call. 
std::optional<std::pair<MemoryLocation, bool>> getLocForTerminator(Instruction *I) const { - uint64_t Len; - Value *Ptr; - if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len), - m_Value(Ptr)))) - return {std::make_pair(MemoryLocation(Ptr, Len), false)}; - if (auto *CB = dyn_cast<CallBase>(I)) { + if (CB->getIntrinsicID() == Intrinsic::lifetime_end) + return { + std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)}; if (Value *FreedOp = getFreedOperand(CB, &TLI)) return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)}; } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 6a3f656..1a52af1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -651,7 +651,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; - mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred; + mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -840,7 +840,7 @@ private: // Ranking unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; - bool shouldSwapOperandsForIntrinsic(const Value *, const Value *, + bool shouldSwapOperandsForPredicate(const Value *, const Value *, const IntrinsicInst *I) const; // Reachability handling. @@ -1624,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { Value *AdditionallyUsedValue = CmpOp0; // Sort the ops. - if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) { + if (shouldSwapOperandsForPredicate(FirstOp, SecondOp, I)) { std::swap(FirstOp, SecondOp); Predicate = CmpInst::getSwappedPredicate(Predicate); AdditionallyUsedValue = CmpOp1; @@ -3024,7 +3024,7 @@ void NewGVN::cleanupTables() { PredicateToUsers.clear(); MemoryToUsers.clear(); RevisitOnReachabilityChange.clear(); - IntrinsicInstPred.clear(); + PredicateSwapChoice.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -4250,20 +4250,18 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } -bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B, +bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B, const IntrinsicInst *I) const { - auto LookupResult = IntrinsicInstPred.find(I); if (shouldSwapOperands(A, B)) { - if (LookupResult == IntrinsicInstPred.end()) - IntrinsicInstPred.insert({I, B}); - else - LookupResult->second = B; + PredicateSwapChoice[I] = B; return true; } - if (LookupResult != IntrinsicInstPred.end()) { + auto LookupResult = PredicateSwapChoice.find(I); + if (LookupResult != PredicateSwapChoice.end()) { auto *SeenPredicate = LookupResult->second; if (SeenPredicate) { + // We previously decided to swap B to the left. Keep that choice. if (SeenPredicate == B) return true; else diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b9292af..b78c702 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -703,6 +703,7 @@ private: // Add U as additional user of V. 
void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); } + void handlePredicate(Instruction *I, Value *CopyOf, const PredicateBase *PI); void handleCallOverdefined(CallBase &CB); void handleCallResult(CallBase &CB); void handleCallArguments(CallBase &CB); @@ -1927,6 +1928,75 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) { } } +void SCCPInstVisitor::handlePredicate(Instruction *I, Value *CopyOf, + const PredicateBase *PI) { + ValueLatticeElement CopyOfVal = getValueState(CopyOf); + const std::optional<PredicateConstraint> &Constraint = PI->getConstraint(); + if (!Constraint) { + mergeInValue(ValueState[I], I, CopyOfVal); + return; + } + + CmpInst::Predicate Pred = Constraint->Predicate; + Value *OtherOp = Constraint->OtherOp; + + // Wait until OtherOp is resolved. + if (getValueState(OtherOp).isUnknown()) { + addAdditionalUser(OtherOp, I); + return; + } + + ValueLatticeElement CondVal = getValueState(OtherOp); + ValueLatticeElement &IV = ValueState[I]; + if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { + auto ImposedCR = + ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType())); + + // Get the range imposed by the condition. + if (CondVal.isConstantRange()) + ImposedCR = ConstantRange::makeAllowedICmpRegion( + Pred, CondVal.getConstantRange()); + + // Combine range info for the original value with the new range from the + // condition. + auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(), + /*UndefAllowed=*/true); + // Treat an unresolved input like a full range. + if (CopyOfCR.isEmptySet()) + CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth()); + auto NewCR = ImposedCR.intersectWith(CopyOfCR); + // If the existing information is != x, do not use the information from + // a chained predicate, as the != x information is more likely to be + // helpful in practice. + if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) + NewCR = CopyOfCR; + + // The new range is based on a branch condition. That guarantees that + // neither of the compare operands can be undef in the branch targets, + // unless we have conditions that are always true/false (e.g. icmp ule + // i32, %a, i32_max). For the latter overdefined/empty range will be + // inferred, but the branch will get folded accordingly anyways. + addAdditionalUser(OtherOp, I); + mergeInValue( + IV, I, ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); + return; + } else if (Pred == CmpInst::ICMP_EQ && + (CondVal.isConstant() || CondVal.isNotConstant())) { + // For non-integer values or integer constant expressions, only + // propagate equal constants or not-constants. + addAdditionalUser(OtherOp, I); + mergeInValue(IV, I, CondVal); + return; + } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { + // Propagate inequalities. 
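A compact model of the range refinement performed above for unsigned compares against a constant, using a plain half-open interval in place of ConstantRange (types and names are illustrative):

#include <algorithm>
#include <cstdint>

struct URange { uint64_t Lo, Hi; };             // half-open [Lo, Hi)

// %x is known to lie in Known, and a dominating condition established x u< C;
// the refined state is the intersection of the two ranges (Lo >= Hi would
// mean the guarded block is unreachable).
URange refineByULT(URange Known, uint64_t C) {
  URange Imposed{0, C};                         // the "allowed icmp region"
  return {std::max(Known.Lo, Imposed.Lo), std::min(Known.Hi, Imposed.Hi)};
}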
+ addAdditionalUser(OtherOp, I); + mergeInValue(IV, I, ValueLatticeElement::getNot(CondVal.getConstant())); + return; + } + + return (void)mergeInValue(IV, I, CopyOfVal); +} + void SCCPInstVisitor::handleCallResult(CallBase &CB) { Function *F = CB.getCalledFunction(); @@ -1936,77 +2006,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return; Value *CopyOf = CB.getOperand(0); - ValueLatticeElement CopyOfVal = getValueState(CopyOf); - const auto *PI = getPredicateInfoFor(&CB); + const PredicateBase *PI = getPredicateInfoFor(&CB); assert(PI && "Missing predicate info for ssa.copy"); - - const std::optional<PredicateConstraint> &Constraint = - PI->getConstraint(); - if (!Constraint) { - mergeInValue(ValueState[&CB], &CB, CopyOfVal); - return; - } - - CmpInst::Predicate Pred = Constraint->Predicate; - Value *OtherOp = Constraint->OtherOp; - - // Wait until OtherOp is resolved. - if (getValueState(OtherOp).isUnknown()) { - addAdditionalUser(OtherOp, &CB); - return; - } - - ValueLatticeElement CondVal = getValueState(OtherOp); - ValueLatticeElement &IV = ValueState[&CB]; - if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { - auto ImposedCR = - ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType())); - - // Get the range imposed by the condition. - if (CondVal.isConstantRange()) - ImposedCR = ConstantRange::makeAllowedICmpRegion( - Pred, CondVal.getConstantRange()); - - // Combine range info for the original value with the new range from the - // condition. - auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(), - /*UndefAllowed=*/true); - // Treat an unresolved input like a full range. - if (CopyOfCR.isEmptySet()) - CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth()); - auto NewCR = ImposedCR.intersectWith(CopyOfCR); - // If the existing information is != x, do not use the information from - // a chained predicate, as the != x information is more likely to be - // helpful in practice. - if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) - NewCR = CopyOfCR; - - // The new range is based on a branch condition. That guarantees that - // neither of the compare operands can be undef in the branch targets, - // unless we have conditions that are always true/false (e.g. icmp ule - // i32, %a, i32_max). For the latter overdefined/empty range will be - // inferred, but the branch will get folded accordingly anyways. - addAdditionalUser(OtherOp, &CB); - mergeInValue( - IV, &CB, - ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); - return; - } else if (Pred == CmpInst::ICMP_EQ && - (CondVal.isConstant() || CondVal.isNotConstant())) { - // For non-integer values or integer constant expressions, only - // propagate equal constants or not-constants. - addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, CondVal); - return; - } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { - // Propagate inequalities. 
- addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, - ValueLatticeElement::getNot(CondVal.getConstant())); - return; - } - - return (void)mergeInValue(IV, &CB, CopyOfVal); + handlePredicate(&CB, CopyOf, PI); + return; } if (II->getIntrinsicID() == Intrinsic::vscale) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7b7efb8..fe93fcd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -93,6 +93,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -155,6 +156,7 @@ #include <utility> using namespace llvm; +using namespace SCEVPatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// ElementCount to include loops whose trip count is a function of vscale. static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L) { - return ElementCount::getFixed(SE->getSmallConstantTripCount(L)); + if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L)) + return ElementCount::getFixed(ExpectedTC); + + const SCEV *BTC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BTC)) + return ElementCount::getFixed(0); + + const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L); + if (isa<SCEVVScale>(ExitCount)) + return ElementCount::getScalable(1); + + const APInt *Scale; + if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale()))) + if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap()) + if (Scale->getActiveBits() <= 32) + return ElementCount::getScalable(Scale->getZExtValue()); + + return ElementCount::getFixed(0); } /// Returns "best known" trip count, which is either a valid positive trip count @@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) { } } -/// This function attempts to return a value that represents the vectorization -/// factor at runtime. For fixed-width VFs we know this precisely at compile +/// This function attempts to return a value that represents the ElementCount +/// at runtime. For fixed-width VFs we know this precisely at compile /// time, but for scalable VFs we calculate it based on an estimate of the /// vscale value. -static unsigned getEstimatedRuntimeVF(ElementCount VF, - std::optional<unsigned> VScale) { +static unsigned estimateElementCount(ElementCount VF, + std::optional<unsigned> VScale) { unsigned EstimatedVF = VF.getKnownMinValue(); if (VF.isScalable()) if (VScale) @@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // use the value of vscale used for tuning. Loop *VectorLoop = LI->getLoopFor(HeaderBB); unsigned EstimatedVFxUF = - getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning()); + estimateElementCount(VF * UF, Cost->getVScaleForTuning()); setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); } @@ -3003,7 +3022,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // is correct. The easiest form of the later is to require that all values // stored are the same. 
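Elsewhere in these hunks, estimateElementCount (the renamed getEstimatedRuntimeVF) reduces to a small calculation; a sketch with ElementCount flattened into a known-min value plus a scalable flag (an assumption for illustration only):

#include <optional>

unsigned estimateElementCount(unsigned KnownMin, bool Scalable,
                              std::optional<unsigned> VScaleForTuning) {
  // e.g. a scalable VF of <vscale x 4 x i32> with an assumed vscale of 2 is
  // treated as 8 lanes for costing and trip-count comparisons.
  if (Scalable && VScaleForTuning)
    return KnownMin * *VScaleForTuning;
  return KnownMin;
}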
return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && - Legal->isInvariant(cast<StoreInst>(I)->getValueOperand())); + TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); } case Instruction::UDiv: case Instruction::SDiv: @@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); unsigned Width = - getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); + estimateElementCount(Candidate.Width, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF << " costs: " << (Candidate.Cost / Width)); if (VF.isScalable()) @@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 ? EpilogueVectorizationMinVF : TTI.getEpilogueVectorizationMinVF(); - return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >= + return estimateElementCount(VF * Multiplier, VScaleForTuning) >= MinVFThreshold; } @@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. ElementCount EstimatedRuntimeVF = ElementCount::getFixed( - getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning())); + estimateElementCount(MainLoopVF, CM.getVScaleForTuning())); ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); @@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning); - // Try to get the exact trip count, or an estimate based on profiling data or // ConstantMax from PSE, failing that. - if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) { + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); + + // For fixed length VFs treat a scalable trip count as unknown. + if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) { + // Re-evaluate trip counts and VFs to be in the same numerical space. + unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning); + unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning); + // At least one iteration must be scalar when this constraint holds. So the // maximum available iterations for interleaving is one less. - unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) - ? BestKnownTC->getFixedValue() - 1 - : BestKnownTC->getFixedValue(); + if (requiresScalarEpilogue(VF.isVector())) + --AvailableTC; unsigned InterleaveCountLB = bit_floor(std::max( 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); @@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, // Now compute and add the VPlan-based cost. Cost += Plan.cost(VF, CostCtx); #ifndef NDEBUG - unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); + unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << " (Estimated cost per lane: "); if (Cost.isValid()) { @@ -7292,6 +7315,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // Regions are dissolved after optimizing for VF and UF, which completely // removes unneeded loop regions first. 
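The interleave-count lower bound computed in selectInterleaveCount above can be read as: as many copies as fit in the available iterations at twice the estimated VF, capped at the target maximum and rounded down to a power of two. A sketch using C++20 <bit>:

#include <algorithm>
#include <bit>

unsigned interleaveCountLB(unsigned AvailableTC, unsigned EstimatedVF,
                           unsigned MaxInterleaveCount) {
  unsigned IC = std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount);
  return std::bit_floor(std::max(1u, IC));      // always at least 1
}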
VPlanTransforms::dissolveLoopRegions(BestVPlan); + // Canonicalize EVL loops after regions are dissolved. + VPlanTransforms::canonicalizeEVLLoops(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, OrigLoop->getParentLoop(), @@ -9611,7 +9636,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); + unsigned IntVF = estimateElementCount(VF.Width, VScale); uint64_t RtC = TotalCost.getValue(); uint64_t Div = ScalarC * IntVF - VF.Cost.getValue(); uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 225658b..68e7c20 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3391,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // must use intrinsics to interleave. if (VecTy->isScalableTy()) { assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors"); - VectorType *InterleaveTy = - VectorType::get(VecTy->getElementType(), - VecTy->getElementCount().multiplyCoefficientBy(Factor)); - return Builder.CreateIntrinsic(InterleaveTy, - getInterleaveIntrinsicID(Factor), Vals, - /*FMFSource=*/nullptr, Name); + return Builder.CreateVectorInterleave(Vals, Name); } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -3503,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(InterleaveFactor <= 8 && "Unsupported deinterleave factor for scalable vectors"); NewLoad = State.Builder.CreateIntrinsic( - getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), - NewLoad, + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, /*FMFSource=*/nullptr, "strided.vec"); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8de05c1..47a9ff0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2244,6 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { // Try to optimize header mask recipes away to their EVL variants. for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { + // TODO: Split optimizeMaskToEVL out and move into + // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in + // tryToBuildVPlanWithVPRecipes beforehand. 
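The header-mask handling in this function (spelled out in the comment a few lines below) relies on a simple per-lane equivalence. A scalar sketch, assuming EVL = min(VF, TripCount - IV) and TripCount = BackedgeTakenCount + 1:

#include <cstdint>

// Old header mask: the lane is active while the widened IV stays within the
// backedge-taken count.
bool activeOld(uint64_t IV, unsigned Lane, uint64_t BTC) { return IV + Lane <= BTC; }

// EVL-based mask: the lane is active while its index is below the explicit
// vector length for this iteration.
bool activeEVL(unsigned Lane, uint64_t EVL) { return Lane < EVL; }

// For IV < TripCount and Lane < VF the two agree:
//   IV + Lane <= BTC  <=>  Lane < TripCount - IV  <=>  Lane < min(VF, TripCount - IV).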
     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
       auto *CurRecipe = cast<VPRecipeBase>(U);
       VPRecipeBase *EVLRecipe =
@@ -2265,6 +2268,20 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
       }
       ToErase.push_back(CurRecipe);
     }
+
+    // Replace header masks with a mask equivalent to predicating by EVL:
+    //
+    // icmp ule widen-canonical-iv backedge-taken-count
+    // ->
+    // icmp ult step-vector, EVL
+    VPRecipeBase *EVLR = EVL.getDefiningRecipe();
+    VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
+    Type *EVLType = TypeInfo.inferScalarType(&EVL);
+    VPValue *EVLMask = Builder.createICmp(
+        CmpInst::ICMP_ULT,
+        Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
+    HeaderMask->replaceAllUsesWith(EVLMask);
+    ToErase.push_back(HeaderMask->getDefiningRecipe());
   }
 
   for (VPRecipeBase *R : reverse(ToErase)) {
@@ -2373,6 +2390,66 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
   return true;
 }
 
+void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
+  using namespace llvm::VPlanPatternMatch;
+  // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
+  // There should be only one EVL PHI in the entire plan.
+  VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getEntry())))
+    for (VPRecipeBase &R : VPBB->phis())
+      if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
+        assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
+        EVLPhi = PhiR;
+      }
+
+  // Early return if no EVL PHI is found.
+  if (!EVLPhi)
+    return;
+
+  VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
+  VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+
+  // Convert EVLPhi to concrete recipe.
+  auto *ScalarR =
+      VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
+                                        EVLPhi->getDebugLoc(), "evl.based.iv");
+  EVLPhi->replaceAllUsesWith(ScalarR);
+  EVLPhi->eraseFromParent();
+
+  // Replace CanonicalIVInc with EVL-PHI increment.
+  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+  VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+  assert(match(Backedge,
+               m_c_Binary<Instruction::Add>(m_Specific(CanonicalIV),
+                                            m_Specific(&Plan.getVFxUF()))) &&
+         "Unexpected canonical iv");
+  Backedge->replaceAllUsesWith(EVLIncrement);
+
+  // Remove unused phi and increment.
+  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+  CanonicalIVIncrement->eraseFromParent();
+  CanonicalIV->eraseFromParent();
+
+  // Replace the use of VectorTripCount in the latch-exiting block.
+  // Before: (branch-on-count EVLIVInc, VectorTripCount)
+  // After: (branch-on-count EVLIVInc, TripCount)
+
+  VPBasicBlock *LatchExiting =
+      HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
+  auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+  // Skip the single-iteration loop region.
+  if (match(LatchExitingBr, m_BranchOnCond(m_True())))
+    return;
+  assert(LatchExitingBr &&
+         match(LatchExitingBr,
+               m_BranchOnCount(m_VPValue(EVLIncrement),
+                               m_Specific(&Plan.getVectorTripCount()))) &&
+         "Unexpected terminator in EVL loop");
+  LatchExitingBr->setOperand(1, Plan.getTripCount());
+}
+
 void VPlanTransforms::dropPoisonGeneratingRecipes(
     VPlan &Plan,
     const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
@@ -2704,15 +2781,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
-        auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
-            {PhiR->getStartValue(), PhiR->getBackedgeValue()},
-            PhiR->getDebugLoc(), "evl.based.iv");
-        PhiR->replaceAllUsesWith(ScalarR);
-        ToRemove.push_back(PhiR);
-        continue;
-      }
-
       if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
         expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
         ToRemove.push_back(WidenIVR);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index d5af6cd..880159f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -209,6 +209,18 @@ struct VPlanTransforms {
   /// Replace loop regions with explicit CFG.
   static void dissolveLoopRegions(VPlan &Plan);
 
+  /// Transform EVL loops to use variable-length stepping after region
+  /// dissolution.
+  ///
+  /// Once loop regions are replaced with explicit CFG, EVL loops can step with
+  /// variable vector lengths instead of fixed lengths. This transformation:
+  /// * Makes EVL-Phi concrete.
+  /// * Removes CanonicalIV and increment.
+  /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
+  ///   VectorTripCount) with variable-length stepping (branch-on-cond
+  ///   EVLIVInc, TripCount).
+  static void canonicalizeEVLLoops(VPlan &Plan);
+
   /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
   /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
   static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 38ada33..57d01cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -17,6 +17,7 @@
 #include "VPlanCFG.h"
 #include "VPlanDominatorTree.h"
 #include "VPlanHelpers.h"
+#include "VPlanPatternMatch.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -171,7 +172,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           .Case<VPInstructionWithType>(
               [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
           .Case<VPInstruction>([&](const VPInstruction *I) {
-            if (I->getOpcode() == Instruction::PHI)
+            if (I->getOpcode() == Instruction::PHI ||
+                I->getOpcode() == Instruction::ICmp)
               return VerifyEVLUse(*I, 1);
             switch (I->getOpcode()) {
             case Instruction::Add:
@@ -192,7 +194,13 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
              errs() << "EVL used by unexpected VPInstruction\n";
              return false;
            }
-            if (I->getNumUsers() != 1) {
+            // EVLIVIncrement is only used by EVLIV & BranchOnCount.
+            // Having more than two users is unexpected.
+            if ((I->getNumUsers() != 1) &&
+                (I->getNumUsers() != 2 || none_of(I->users(), [&I](VPUser *U) {
+                   using namespace llvm::VPlanPatternMatch;
+                   return match(U, m_BranchOnCount(m_Specific(I), m_VPValue()));
+                 }))) {
              errs() << "EVL is used in VPInstruction with multiple users\n";
              return false;
            }
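The selectInterleaveCount hunk earlier in this patch relies on re-expressing both the trip count and the VF as plain element counts before clamping the interleave factor. A minimal standalone sketch of that arithmetic, assuming a simplified stand-in for LLVM's ElementCount (the ECSketch struct and both function names below are illustrative, not part of the patch):

  #include <algorithm>
  #include <bit>
  #include <optional>

  // Illustrative stand-in for an element count: a known minimum number of
  // elements plus a flag saying whether the real count scales with vscale.
  struct ECSketch {
    unsigned MinVal;
    bool Scalable;
  };

  // What estimateElementCount conceptually returns: a plain element count,
  // multiplying by a vscale-for-tuning guess when the quantity is scalable.
  unsigned estimateElements(ECSketch EC, std::optional<unsigned> VScaleForTuning) {
    return EC.Scalable ? EC.MinVal * VScaleForTuning.value_or(1) : EC.MinVal;
  }

  // The interleave-count lower bound from the hunk, computed once the trip
  // count and the VF live in the same numerical space (assumes EstimatedVF > 0).
  unsigned interleaveLowerBound(ECSketch TC, ECSketch VF, bool NeedsScalarEpilogue,
                                unsigned MaxInterleaveCount, unsigned VScale) {
    unsigned AvailableTC = estimateElements(TC, VScale);
    unsigned EstimatedVF = estimateElements(VF, VScale);
    // One iteration stays scalar when a scalar epilogue is required.
    if (NeedsScalarEpilogue)
      --AvailableTC;
    return std::bit_floor(std::max(
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
  }

For example, with VF = vscale x 4, a vscale-for-tuning of 2 and a fixed trip count of 64 (no scalar epilogue), EstimatedVF is 8 and the bound is bit_floor(min(64 / 16, MaxInterleaveCount)), i.e. at most 4.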
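The header-mask rewrite in transformRecipestoEVLRecipes (icmp ult step-vector, EVL) and the latch change in canonicalizeEVLLoops have a simple scalar model: lane L of the header mask is active exactly when L is below the explicit vector length of the current iteration, and the loop then counts processed elements against the scalar trip count rather than the vector trip count. A hedged sketch of that semantics in plain C++ (evlHeaderMask is a made-up name for illustration, not VPlan code):

  #include <vector>

  // Scalar model of the EVL-based header mask: lane L is active iff L < EVL,
  // i.e. "icmp ult step-vector, EVL", instead of comparing a widened
  // canonical IV against the backedge-taken count.
  std::vector<bool> evlHeaderMask(unsigned VF, unsigned EVL) {
    std::vector<bool> Mask(VF, false);
    for (unsigned Lane = 0; Lane < VF && Lane < EVL; ++Lane)
      Mask[Lane] = true; // step-vector element Lane compared against EVL
    return Mask;
  }

With VF = 8 and EVL = 5 the mask is 1 1 1 1 1 0 0 0; the canonicalized loop advances its EVL-based IV by 5 for that iteration and branches on the full TripCount instead of VectorTripCount, which is the operand swap performed at the end of canonicalizeEVLLoops.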